def basicinfo_execute(self):
    url = ('http://%s/aiccips/GSpublicity/GSpublicityList'
           '.html?service=entInfo') % self.host
    response = url_requests.post(url=url, data=self.data,
                                 headers=self.headers, proxies=proxies)
    soup = BeautifulSoup(response.text, 'lxml')
    # Keep the soup on the instance so the business licence, key members and
    # branch sections can be parsed from it later.
    self.soup = soup
    body = soup.find('div', {'style': 'padding-left: 14px'})
    info = {}
    labels = body.find_all('span', class_='label')
    for label in labels:
        key = label.text.replace(':', '').strip()
        value = label.next_sibling.text.strip()
        info[key] = value
    info = common.basicinfo_dict(info, '广东省')
    self.qyxx_basicinfo.append(info)
def verifyCode(checkCode, token):
    """Check whether the captcha was recognised correctly.

    :param checkCode: recognised captcha text
    :param token: session token issued together with the captcha
    :return: response body of the verification request
    """
    url = 'http://gsxt.ynaic.gov.cn/notice/security/verify_captcha'
    data = {'captcha': checkCode, 'session.token': token}
    headers = {
        'Host': 'gsxt.ynaic.gov.cn',
        'User-Agent': ('Mozilla/5.0 (Windows NT 6.1; WOW64) '
                       'AppleWebKit/537.36 (KHTML, like Gecko) '
                       'Chrome/54.0.2840.71 Safari/537.36'),
        'Referer': 'http://gsxt.ynaic.gov.cn/notice/'
    }
    response = url_requests.post(url, data=data, headers=headers)
    return response.text
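# Hedged usage sketch for verifyCode (not part of the crawler flow). The token
# and captcha text below are hypothetical placeholders; in practice the token
# is issued with the captcha image and the text comes from the local recogniser.
#
#     token = 'abcdef0123456789'      # hypothetical session token
#     checkCode = '1234'              # hypothetical recognised captcha text
#     body = verifyCode(checkCode, token)
#     # Retry with a fresh captcha if the response body does not signal success.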
def black_info_execute(self):
    url = ('http://www.jsgsj.gov.cn:58888/ecipplatform'
           '/publicInfoQueryServlet.json?queryYzwf=true')
    resp = url_requests.post(url=url, data=self.post_data,
                             headers=self.headers, proxies=proxies)
    data = json.loads(resp.content)
    data = data.get('data')  # list of serious-violation (blacklist) records
    key_list = [
        'reason_in', 'date_in', 'authority_in',
        'reason_out', 'date_out', 'authority'
    ]
    # Fields: reason listed, date listed, listing authority,
    # reason removed, date removed, removing authority.
    for each_data in data:
        info = {}
        info['reason_in'] = each_data.get('FACT_REASON', '')
        info['date_in'] = each_data.get('MARK_DATE', '')
        info['authority_in'] = each_data.get('MARK_ORG', '')
        # 'REMOVE_REASON' is assumed here by analogy with abnormal_execute.
        info['reason_out'] = each_data.get('REMOVE_REASON', '')
        info['date_out'] = each_data.get('REMOVE_DATE', '')
        info['authority'] = each_data.get('REMOVE_ORG', '')
        self.qyxx_black_info.append(info)
def stock_freeze_execute(self):
    url = ('http://www.jsgsj.gov.cn:58888/ecipplatform'
           '/publicInfoQueryServlet.json?querySfxz=true')
    resp = url_requests.post(url=url, data=self.post_data,
                             headers=self.headers, proxies=proxies)
    data = json.loads(resp.content)
    data = data.get('data')  # list of judicial equity-freeze records
    key_list = [
        'person', 'stock', 'court', 'notice_number', 'status', 'details'
    ]
    # Fields: person subject to enforcement, amount of equity, enforcing court,
    # assistance-publication notice number, status, details.
    for each_data in data:
        info = {}
        info['person'] = each_data.get('ASSIST_NAME', '')
        info['stock'] = each_data.get('FREEZE_AMOUNT', '')
        info['court'] = each_data.get('EXECUTE_COURT', '')
        info['notice_number'] = each_data.get('NOTICE_NO', '')
        info['status'] = each_data.get('FREEZE_STATUS', '')
        # Keeps the parameters needed to request the detail page later.
        info['details'] = each_data
        self.qyxx_stock_freeze.append(info)
def mortgage_basic_execute(self):
    url = ('http://www.jsgsj.gov.cn:58888/ecipplatform'
           '/publicInfoQueryServlet.json?queryDcdy=true')
    resp = url_requests.post(url=url, data=self.post_data,
                             headers=self.headers, proxies=proxies)
    data = json.loads(resp.content)
    data = data.get('data')  # list of chattel mortgage registrations
    key_list = [
        'mortgage_reg_num', 'date_reg', 'authority', 'amount', 'status',
        'details'
    ]
    # Fields: registration number, registration date, registration authority,
    # amount of secured debt, status, details.
    for each_data in data:
        info = {}
        info['mortgage_reg_num'] = each_data.get('GUARANTY_REG_NO', '')
        info['date_reg'] = each_data.get('START_DATE', '')
        info['authority'] = each_data.get('CREATE_ORG', '')
        info['amount'] = each_data.get('ASSURE_CAPI', '')
        info['status'] = each_data.get('STATUS', '')
        info['details'] = each_data
        self.qyxx_mortgage_basic.append(info)
def abnormal_execute(self):
    url = ('http://www.jsgsj.gov.cn:58888/ecipplatform'
           '/publicInfoQueryServlet.json?queryJyyc=true')
    resp = url_requests.post(url=url, data=self.post_data,
                             headers=self.headers, proxies=proxies)
    data = json.loads(resp.content)
    data = data.get('data')  # list of abnormal-operation records
    key_list = [
        'xuhao', 'reason', 'date_occurred', 'juedinglierujiguan',
        'reason_out', 'date_out', 'authority'
    ]
    # Fields: serial number, reason listed as abnormal, date listed,
    # authority that decided the listing, reason removed, date removed,
    # authority that decided the removal.
    for each_data in data:
        info = {}
        info['xuhao'] = each_data.get('RN', '')
        info['reason'] = each_data.get('FACT_REASON', '')
        info['date_occurred'] = each_data.get('MARK_DATE', '')
        info['juedinglierujiguan'] = each_data.get('CREATE_ORG', '')
        info['reason_out'] = each_data.get('REMOVE_REASON', '')
        info['date_out'] = each_data.get('CREATE_DATE', '')
        info['authority'] = each_data.get('YICHU_ORG', '')
        self.qyxx_abnormal.append(info)
def verify(cookies, geetest_data):
    """Submit the three geetest parameters produced by the slider captcha for validation."""
    url = 'http://www.hebscztxyxx.gov.cn/notice/pc-geetest/validate'
    headers = {
        'Host': 'www.hebscztxyxx.gov.cn',
        'Referer': 'http://www.hebscztxyxx.gov.cn/notice/',
        'User-Agent': ('Mozilla/5.0 (Windows NT 6.1; WOW64) '
                       'AppleWebKit/537.36 (KHTML, like Gecko) '
                       'Chrome/54.0.2840.99 Safari/537.36')
    }
    resp = url_requests.post(url=url, data=geetest_data, headers=headers,
                             cookies=cookies)
    try:
        js = json.loads(resp.content)
        status = js.get('status')
        return status
    except (ValueError, AttributeError):  # body not valid JSON, or not a dict
        return None
def get_result(url_result, company, cookies, geetest_data):
    '''
    Query the search page and return the list of company detail links.

    The main values passed are:
    params: urlflag, challenge
    data: urlflag, nowNum, keyword, clear
    :return: search result (list of detail-page links)
    '''
    # Only these headers are actually required by the site:
    # Host, User-Agent, Accept, Referer, Cookie and Content-Length.
    headers = {
        'Host': 'gx.gsxt.gov.cn',
        'Origin': 'http://gx.gsxt.gov.cn',
        'Referer': 'http://gx.gsxt.gov.cn/sydq/loginSydqAction!sydq.dhtml',
        'Cookie': cookies,
        'Cache-Control': 'max-age=0',
        'Upgrade-Insecure-Requests': '1',
        'Connection': 'keep-alive',
        'Accept': ('text/html,application/xhtml+xml,application/xml;'
                   'q=0.9,image/webp,*/*;q=0.8'),
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'User-Agent': ('Mozilla/5.0 (Windows NT 6.1; WOW64) '
                       'AppleWebKit/537.36 (KHTML, like Gecko) '
                       'Chrome/54.0.2840.99 Safari/537.36'),
        'Content-Length': '250',
        'Content-Type': 'application/x-www-form-urlencoded'
    }
    params = dict(urlflag=0, challenge=geetest_data["geetest_challenge"])
    data = dict(urlflag='', nowNum='', keyword=company, clear='true')
    print cookies
    # Errors have also shown up in this log, so the cookie string should be
    # validated before splitting it, e.g.:
    # Cookie = dict(insert_cookie=cookies.split(";")[0].split("=")[1],
    #               JSESSIONID=cookies.split(";")[1].split("=")[1])
    resp = url_requests.post(url=url_result, params=params, data=data,
                             headers=headers, proxies=proxies)
    html = resp.content
    soup = BeautifulSoup(html, 'lxml')
    if not soup:
        raise Exception('Query failed. Please try it again.')
    elif soup.find('p', {'class': 'p01'}) is not None:
        # The page shows the "contact technical staff" error notice.
        raise Exception('Too frequent. Please try it again.')
    else:
        if soup.find('div', {'class': 'search-result'}):
            result_num = soup.select('div.search-result > p > span')
            if len(result_num) == 0:
                raise Exception('query failed, please try it again.')
            if int(result_num[0].text) == 0:
                print soup.select('div.contentA1 > p')[0].text
                return []
            links = soup.select('div.search-result > ul > li > h3')
            if not links:
                raise Exception('query failed, please try it again.')
            result_list = []
            for each_link in links:
                link = each_link['onclick'].split('"')[1]
                result_list.append(link)
            return result_list
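# Hedged sketch of how get_result is chained in after the geetest slider has
# been solved. The geetest_data keys shown are assumptions drawn from the usual
# geetest flow; the only key the function itself relies on is 'geetest_challenge'.
#
#     url_result = ...  # search endpoint supplied by the calling module
#     geetest_data = {'geetest_challenge': challenge,
#                     'geetest_validate': validate,
#                     'geetest_seccode': seccode}
#     links = get_result(url_result, company, cookies, geetest_data)
#     for detail_link in links:
#         print detail_link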
def main(**kwargs):
    """Entry point called by the dispatcher; the name must not be changed.

    :param kwargs: name=company name
    :return: list of detail-page links
    """
    company_name = kwargs.get('name')
    # Cookies obtained while requesting the captcha.
    cookies = get_cookie()
    # Load the training set (relative path converted to an absolute path).
    path_a = os.path.dirname(__file__)
    path_a = os.path.abspath(os.path.join(path_a, '..'))
    abs_path = os.path.join(path_a,
                            'train_operator/d1_guangdong_train_operator.csv')
    trainX, trainY = loadTrainSet(abs_path)
    image = get_checkCode(cookies)
    code = main_check(image, trainX, trainY)
    if code is None:
        # A digit was recognised as an operator; call main again and start over.
        return main(name=company_name)
    # First request this page to obtain the textfield token.
    url = 'http://gsxt.gdgs.gov.cn/aiccips/CheckEntContext/checkCode.html'
    data = {'textfield': company_name, 'code': code}
    headers = {
        'Host': 'gsxt.gdgs.gov.cn',
        'User-Agent': ('Mozilla/5.0 (Windows NT 6.1; WOW64) '
                       'AppleWebKit/537.36 (KHTML, like Gecko) '
                       'Chrome/47.0.2526.73 Safari/537.36'),
        'Referer': 'http://gsxt.gdgs.gov.cn/'
    }
    response = url_requests.post(url, headers=headers, data=data,
                                 cookies=cookies, timeout=10)
    if response is not None:
        if 'textfield' in response.text:
            textfield = json.loads(response.text)['textfield']
        else:
            print 'Captcha was recognised incorrectly'
            return main(name=company_name)
    else:
        # No response for the checkCode request; retry from the top so that
        # textfield is never used before being assigned.
        return main(name=company_name)
    # Then use the textfield token to request the search results
    # (the list of matching companies).
    url = 'http://gsxt.gdgs.gov.cn/aiccips/CheckEntContext/showInfo.html'
    data = {'textfield': textfield, 'code': str(code)}
    response = url_requests.post(url, headers=headers, data=data,
                                 cookies=cookies, timeout=10)
    if response is not None:
        html = response.content
        soup = BeautifulSoup(html, 'lxml')
        items = soup.find_all('li', class_='font16')
        link_list = []
        for each in items:
            # Decoded so all links are consistently unicode.
            link = each.a['href'].decode('utf-8')
            if '..' in link:
                link = 'http://gsxt.gdgs.gov.cn/aiccips' + link[2:]
            link_list.append(link)
        return link_list
    else:
        return main(name=company_name)
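# Minimal local-run sketch, assuming this module is executed directly; in
# production main() is invoked by the dispatcher with name=<company name>.
# The company name below is only a placeholder.
if __name__ == '__main__':
    for detail_link in main(name=u'广州市某某有限公司'):
        print detail_link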
def guangdong(**kwargs):
    link = kwargs.get('id_tag')
    # First request the URL returned by the query script to obtain the three
    # parameters entNo, entType and regOrg; together they form the data posted
    # to every module.
    headers = {
        'Host': 'gsxt.gdgs.gov.cn',
        'User-Agent': ('Mozilla/5.0 (Windows NT 6.1; WOW64) '
                       'AppleWebKit/537.36 (KHTML, like Gecko) '
                       'Chrome/53.0.2785.143 Safari/537.36'),
        'Referer': 'http://gsxt.gdgs.gov.cn/aiccips/main/initJyycInfoList.html'
    }
    response = url_requests.get(link, headers=headers, timeout=10)
    soup = BeautifulSoup(response.text, 'lxml')
    req_data = {}
    req_data['entNo'] = soup.find('input', {'name': 'entNo'})['value']
    req_data['entType'] = soup.find('input', {'name': 'entType'})['value']
    req_data['regOrg'] = soup.find('input', {'name': 'regOrg'})['value']
    # Then pass req_data to the URL of each module to collect its information.
    executeA = [
        'basicinfo_execute', 'abnormal_execute', 'black_info_execute',
        'adm_punishment_execute', 'b_c_execute', 'branch_execute',
        'member_execute', 'mortgage_basic_execute', 'pledge_execute',
        's_h_execute', 'spot_check_execute', 'stock_freeze_execute',
        'stockholder_change_execute'
    ]
    execute_d = ['c_mortgage_execute', 's_creditor_execute',
                 'mortgage_execute']
    businessInfo = BusinessInfo()
    loop = True    # check only once whether the licence has been revoked
    active = True  # assume it has not been revoked
    for each in executeA:
        print "%r %r %r" % ("*" * 20, each, "*" * 20)
        getattr(businessInfo, each)(req_data)
        # businessInfo.pledge_execute(req_data)
        if businessInfo.qyxx_basicinfo:
            while loop:
                loop = False
                if '已吊销' in businessInfo.qyxx_basicinfo[0]['check_type']:
                    active = False
        if not active:  # licence revoked: skip the remaining modules
            break
    # No chattel mortgage records have been seen for this site so far.
    # When there are multiple chattel mortgage items, the results of the three
    # detail tables are collected into their respective lists.
    if businessInfo.qyxx_mortgage_basic:
        L = businessInfo.qyxx_mortgage_basic
        headers = {
            'Host': 'gsxt.gdgs.gov.cn',
            'Referer': link,
            'User-Agent': ('Mozilla/5.0 (Windows NT 6.1; WOW64) '
                           'AppleWebKit/537.36 (KHTML, like Gecko) '
                           'Chrome/54.0.2840.71 Safari/537.36')
        }
        for record in L:
            url = ('http://gsxt.gdgs.gov.cn/aiccips/GSpublicity'
                   '/GSpublicityList.html')
            pleNo = record['detail'].split("'")[1]
            data = {
                'service': 'pleInfoData',
                'pleNo': pleNo,
                'entNo': req_data.get('entNo'),
                'entType': req_data.get('entType'),
                'regOrg': req_data.get('regOrg')
            }
            html = url_requests.post(url, data=data, headers=headers,
                                     cookies=response.cookies).text
            pageSoup = BeautifulSoup(html, 'lxml')
            for c in execute_d:
                print "%s %s %s" % ("*" * 20, c, "*" * 20)
                getattr(businessInfo, c)(pageSoup)
    results = businessInfo.returnData()
    return results
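# Hedged usage sketch for guangdong(): id_tag is one of the detail-page links
# produced by the query step; the link below is only a placeholder, and the
# real wiring between the query and detail modules is done by the dispatcher.
#
#     detail_link = 'http://gsxt.gdgs.gov.cn/aiccips/...'  # placeholder link
#     results = guangdong(id_tag=detail_link)
#     print results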