def get_contributive_info_detail(self, session, data, text):
    """Request the detail record behind each ``lookAjaxInfo(...)`` link in ``text``.

    ``text`` is parsed as HTML. Every ``<a>`` element whose ``onclick``
    attribute contains a ``lookAjaxInfo('a','b','c')`` call triggers one POST
    to the ``gdczDetail`` endpoint, and the response body is recorded on
    ``data`` through ``self.append_model``.

    :param session: HTTP session; its ``post`` is handed to ``self.task_request``.
    :param data: model container passed through to ``self.append_model``.
    :param text: HTML text to scan for detail links.
    """
    url = 'http://{host}/gjjbjTab/gjjTabQueryCreditAction!gdczDetail.dhtml'.format(host=self.host)
    # Raw string: the backslashes are regex escapes, not string escapes
    # (a non-raw '\(' is an invalid escape sequence in modern Python).
    regex = re.compile(r"lookAjaxInfo\('(.*?)','(.*?)','(.*?)'\)")

    for a_item in PyQuery(text, parser='html').find('a').items():
        onclick = a_item.attr('onclick')
        if onclick is None or onclick == '':
            continue

        # Only the first call found in the attribute is used.
        match = regex.search(onclick)
        if match is None:
            continue
        ent_id, chr_id, date_flag = match.groups()

        post_data = {
            'ent_id': ent_id,
            'chr_id': chr_id,
            'ajax': True,
            'time': util.get_time_stamp(),
            'dateflag': date_flag
        }
        r = self.task_request(session, session.post, url, data=post_data)
        # A failed request is still recorded, with an empty body; no failure
        # status is passed to append_model here.
        body = '' if r is None else r.text
        self.append_model(data, Model.contributive_info, url, body,
                          post_data=post_data, classify=Model.type_detail)
def get_annual_info(self, session, i_d, data):
    """Fetch the annual-report list for ``i_d`` and crawl each report's details.

    The ``PubAnnualInfo/Annuals`` endpoint is requested and parsed as JSON.
    For every entry under its ``data`` key that has both ``ancheId`` and
    ``year``, the first run of digits in ``year`` is taken as the report year
    and ``self.get_annual_detail_info`` is called. The method returns ``None``
    silently when the request fails, the body does not parse, or there is no
    ``data`` key.

    :param session: HTTP session; its ``get`` is handed to ``self.task_request``.
    :param i_d: identifier interpolated into the request URL.
    :param data: model container forwarded to ``get_annual_detail_info``.
    """
    url = 'http://{host}/api/PubAnnualInfo/Annuals/{id}?_={rand}'.format(
        host=self.host, id=i_d, rand=util.get_time_stamp())
    r = self.task_request(session, session.get, url)
    if r is None:
        return

    json_data = util.json_loads(r.text)
    if json_data is None:
        return

    data_list = json_data.get('data', None)
    if data_list is None:
        return

    for item in data_list:
        anche_id = item.get('ancheId', None)
        year_info = item.get('year', None)
        if anche_id is None or year_info is None:
            continue

        # Raw string so that \d is a regex escape, not an invalid string escape.
        year_match = re.search(r'\d+', year_info)
        if year_match is None:
            continue

        # Fetch the detailed annual-report information.
        self.get_annual_detail_info(session, i_d, anche_id, year_match.group(0), data)
def get_base_info(self, session, i_d):
    """Request the ``PubBaseInfo/Business`` record for ``i_d``.

    :param session: HTTP session; its ``get`` is handed to ``self.task_request``.
    :param i_d: identifier interpolated into the request URL.
    :return: ``(url, response_text)`` on success, ``None`` when the request
        yields no response.
    """
    template = 'http://{host}/api/PubBaseInfo/Business/{id}?_={rand}'
    url = template.format(host=self.host, id=i_d, rand=util.get_time_stamp())
    response = self.task_request(session, session.get, url)
    if response is not None:
        return url, response.text
    return None
def get_contributive_info_detail(self, session, text, data):
    """Fetch the per-investor detail pages listed in a contributive-info JSON body.

    ``text`` is parsed as JSON; the ``list`` entry of its first element is
    walked, and for every item carrying an ``invid`` three endpoints are
    requested in order. Each successful response is stored on ``data`` as a
    detail record; failed requests are skipped. Any exception is logged via
    ``self.log.exception`` and swallowed.

    :param session: HTTP session; its ``get`` is handed to ``self.task_request``.
    :param text: JSON text of the contributive-info listing.
    :param data: model container passed through to ``self.append_model``.
    """
    detail_templates = (
        'http://{host}/gsxt/api/einv/gdxx/{invid}?currentpage=1&pagesize=5&t={rand}',
        'http://{host}/gsxt/api/einvpaidin/queryList/{invid}?currentpage=1&pagesize=5&t={rand}',
        'http://{host}/gsxt/api/efactcontribution/queryList/{invid}?currentpage=1&pagesize=5&t={rand}',
    )
    try:
        parsed = util.json_loads(text)
        if parsed is None:
            return

        investors = parsed[0].get('list', None)
        if investors is None:
            return

        for investor in investors:
            invid = investor.get('invid', None)
            if invid is None:
                continue

            for template in detail_templates:
                # The timestamp is taken right before each request.
                url = template.format(host=self.host, invid=invid, rand=util.get_time_stamp())
                response = self.task_request(session, session.get, url)
                if response is None:
                    continue
                self.append_model(data, Model.contributive_info, url, response.text,
                                  classify=Model.type_detail)
    except Exception as e:
        self.log.exception(e)
def get_search_list_html(self, keyword, session):
    """Search for ``keyword`` and collect the parameters of every result item.

    The search page is driven through ``self.get_captcha_geetest``; each
    ``.result_item`` element whose ``onclick`` holds an
    ``openView('pripid','enttype','zt','type')`` call and whose ``#mySpan``
    title is non-empty yields one parameter dict.

    :param keyword: search keyword passed to ``self.get_captcha_geetest``.
    :param session: passed to ``self.get_captcha_geetest`` as ``origin_session``.
    :return: ``(param_list, status)`` where ``status`` is
        ``self.SEARCH_SUCCESS`` when at least one item was collected,
        ``self.SEARCH_NOTHING_FIND`` when the page reports no results, and
        ``self.SEARCH_ERROR`` otherwise (no content, an exception, or an
        empty result list).
    """
    param_list = []
    try:
        url = 'http://{host}/ztxy.do?method=index&random={rand}'.format(
            host=self.host, rand=util.get_time_stamp())
        content = self.get_captcha_geetest(url, '#entname', '#popup-submit', keyword,
                                           'p.result_desc', origin_session=session)
        if content is None:
            return param_list, self.SEARCH_ERROR

        jq = PyQuery(content, parser='html')
        if '您搜索的条件无查询结果' in jq.find('p.result_desc').text():
            return param_list, self.SEARCH_NOTHING_FIND

        # Captures: pripid, enttype, zt, type
        # e.g. openView('6100000000020342','11','K','1')
        # Raw string so the backslashes are regex escapes only.
        regex = re.compile(r"openView\('(.*?)','(.*?)','(.*?)','(.*?)'\)")

        for item in jq.find('.result_item').items():
            onclick = item.attr('onclick')
            if onclick is None or onclick == '':
                continue

            match = regex.search(onclick)
            if match is None:
                continue
            pripid, enttype, zt, view_type = match.groups()

            company = item.find('#mySpan').attr('title')
            if company is None or company == '':
                continue

            search_name = company.replace(' ', '')
            if search_name == '':
                continue

            # Prefer the "diaoxiao" status label; fall back to "cunxu".
            status = item.find('.status.diaoxiao').text()
            if status is None or status == '':
                status = item.find('.status.cunxu').text()

            data = {
                'pripid': pripid,
                'enttype': enttype,
                'zt': zt,
                'type': view_type,
                'search_name': search_name,
            }
            if status is not None and status != '':
                data['status'] = status

            param_list.append(data)
    except Exception as e:
        self.log.exception(e)
        return param_list, self.SEARCH_ERROR

    return param_list, self.SEARCH_SUCCESS if len(param_list) > 0 else self.SEARCH_ERROR
def get_branch_info(self, session, pri_pid, data):
    """Request the branch listing for ``pri_pid`` and record it on ``data``.

    A missing response is recorded with an empty body and
    ``self.STATUS_FAIL``; otherwise the response text is stored.

    :param session: HTTP session; its ``get`` is handed to ``self.task_request``.
    :param pri_pid: identifier interpolated into the request URL.
    :param data: model container passed through to ``self.append_model``.
    """
    template = 'http://{host}/ztxy.do?method=showAllfzjg&maent.pripid={pripid}&random={rand}'
    url = template.format(host=self.host, pripid=pri_pid, rand=util.get_time_stamp())
    response = self.task_request(session, session.get, url)
    if response is not None:
        self.append_model(data, Model.branch_info, url, response.text)
    else:
        self.append_model(data, Model.branch_info, url, '', status=self.STATUS_FAIL)
def get_annual_detail_info(self, session, i_d, anche_id, year, data):
    """Crawl every section of one annual report.

    Eight ``PubAnnualInfo`` endpoints are visited in a fixed order, each one
    delegated to ``self.__get_annual_detail_info`` together with ``year`` and
    ``data``.

    :param session: HTTP session forwarded to the private helper.
    :param i_d: identifier interpolated into every URL.
    :param anche_id: annual-report identifier interpolated into every URL.
    :param year: report year forwarded to the private helper.
    :param data: model container forwarded to the private helper.
    """
    # (URL template, whether the URL carries a ``_={rand}`` timestamp parameter)
    sections = (
        ('http://{host}/api/PubAnnualInfo/Annual/{id}/{anche_id}', False),
        ('http://{host}/api/PubAnnualInfo/AnWebSites/{id}/{anche_id}', False),
        ('http://{host}/api/PubAnnualInfo/AnForInvestments/{id}/{anche_id}', False),
        ('http://{host}/api/PubAnnualInfo/AnAsset/{id}/{anche_id}', False),
        ('http://{host}/api/PubAnnualInfo/AnUpdates/{id}/{anche_id}', False),
        ('http://{host}/api/PubAnnualInfo/AnSubCapitals/{id}/{anche_id}?_={rand}', True),
        ('http://{host}/api/PubAnnualInfo/AnForGuarantees/{id}/{anche_id}?_={rand}', True),
        ('http://{host}/api/PubAnnualInfo/AnAlterStocks/{id}/{anche_id}?_={rand}', True),
    )
    for template, stamped in sections:
        fields = {'host': self.host, 'id': i_d, 'anche_id': anche_id}
        if stamped:
            # Timestamp is taken right before the section is requested.
            fields['rand'] = util.get_time_stamp()
        self.__get_annual_detail_info(session, template.format(**fields), year, data)
def get_shareholder_info(self, session, i_d, data):
    """Request ``PubSelfPubInfo/InvDetails`` for ``i_d`` and record the result.

    A missing response is recorded with an empty body and
    ``self.STATUS_FAIL``; otherwise the response text is stored.

    :param session: HTTP session; its ``get`` is handed to ``self.task_request``.
    :param i_d: identifier interpolated into the request URL.
    :param data: model container passed through to ``self.append_model``.
    """
    template = 'http://{host}/api/PubSelfPubInfo/InvDetails/{id}?_={rand}'
    url = template.format(host=self.host, id=i_d, rand=util.get_time_stamp())
    response = self.task_request(session, session.get, url)
    if response is not None:
        # Store the fetched payload.
        self.append_model(data, Model.shareholder_info, url, response.text)
    else:
        self.append_model(data, Model.shareholder_info, url, '', status=self.STATUS_FAIL)
def get_change_info(self, session, i_d, data):
    """Request ``PubBaseInfo/BaseInfoAlters`` for ``i_d`` and record the result.

    A missing response is recorded with an empty body and
    ``self.STATUS_FAIL``; otherwise the response text is stored.

    :param session: HTTP session; its ``get`` is handed to ``self.task_request``.
    :param i_d: identifier interpolated into the request URL.
    :param data: model container passed through to ``self.append_model``.
    """
    template = 'http://{host}/api/PubBaseInfo/BaseInfoAlters/{id}?_={rand}'
    url = template.format(host=self.host, id=i_d, rand=util.get_time_stamp())
    response = self.task_request(session, session.get, url)
    if response is not None:
        # Store the fetched payload.
        self.append_model(data, Model.change_info, url, response.text)
    else:
        self.append_model(data, Model.change_info, url, '', status=self.STATUS_FAIL)
def get_contributive_info(self, session, i_d, data):
    """Record the investor listing for ``i_d`` plus one detail page per investor.

    The ``PubBaseInfo/Invs`` listing is stored first. It is then parsed as
    JSON, and for every entry under ``data`` that has an ``invId`` the
    ``PubBaseInfo/InvDetail`` page is requested and stored as a detail
    record. Failed requests are recorded with an empty body and
    ``self.STATUS_FAIL``. If the listing body does not parse, it is recorded
    a second time with ``self.STATUS_FAIL``.

    :param session: HTTP session; its ``get`` is handed to ``self.task_request``.
    :param i_d: identifier interpolated into the request URLs.
    :param data: model container passed through to ``self.append_model``.
    """
    list_url = 'http://{host}/api/PubBaseInfo/Invs/{id}?_={rand}'.format(
        host=self.host, id=i_d, rand=util.get_time_stamp())
    listing = self.task_request(session, session.get, list_url)
    if listing is None:
        self.append_model(data, Model.contributive_info, list_url, '', status=self.STATUS_FAIL)
        return

    # Store the raw listing before attempting to parse it.
    self.append_model(data, Model.contributive_info, list_url, listing.text)

    parsed = util.json_loads(listing.text)
    if parsed is None:
        self.append_model(data, Model.contributive_info, list_url, listing.text,
                          status=self.STATUS_FAIL)
        return

    investors = parsed.get('data', None)
    if investors is None:
        return

    for investor in investors:
        inv_id = investor.get('invId', None)
        if inv_id is None:
            continue

        detail_url = 'http://{host}/api/PubBaseInfo/InvDetail/{id}/{invid}'.format(
            host=self.host, id=i_d, invid=inv_id)
        detail = self.task_request(session, session.get, detail_url)
        if detail is not None:
            self.append_model(data, Model.contributive_info, detail_url, detail.text,
                              classify=Model.type_detail)
        else:
            self.append_model(data, Model.contributive_info, detail_url, '',
                              status=self.STATUS_FAIL, classify=Model.type_detail)
def get_annual_info(self, session, text, data):
    """Fetch the annual-report page behind every ``showNbDetail(...)`` call in ``text``.

    Each ``showNbDetail('<pripid>','<year>');`` occurrence triggers one GET
    to the ``qyinfo_nnbxx`` endpoint. The response is stored on ``data`` as a
    detail record tagged with that year; a failed request is stored with an
    empty body and ``self.STATUS_FAIL``. The method does nothing when ``text``
    contains no such call.

    :param session: HTTP session; its ``get`` is handed to ``self.task_request``.
    :param text: page text to scan for ``showNbDetail`` calls.
    :param data: model container passed through to ``self.append_model``.
    """
    # Raw string: the backslashes are regex escapes, not string escapes.
    pattern = r"showNbDetail\('(.*?)','(.*?)'\);"
    search_list = re.findall(pattern, text)
    if not search_list:
        return

    for pripid, year in search_list:
        url = 'http://{host}/ztxy.do?method=qyinfo_nnbxx&pripid={pripid}&nd={year}&random={rand}'.format(
            host=self.host, pripid=pripid, year=year, rand=util.get_time_stamp())
        r = self.task_request(session, session.get, url)
        if r is None:
            self.append_model(data, Model.annual_info, url, '', status=self.STATUS_FAIL,
                              year=year, classify=Model.type_detail)
            continue
        self.append_model(data, Model.annual_info, url, r.text,
                          year=year, classify=Model.type_detail)
def get_contributive_info(self, session, base_text, data):
    """Fetch the detail page behind every ``showRyxx(...)`` call in ``base_text``.

    Each ``showRyxx('<xh>','<pripid>','<isck>')`` occurrence triggers one GET
    to the ``frInfoDetail`` endpoint. The response is stored on ``data`` as a
    detail record; a failed request is stored with an empty body and
    ``self.STATUS_FAIL``. The method does nothing when ``base_text`` contains
    no such call.

    :param session: HTTP session; its ``get`` is handed to ``self.task_request``.
    :param base_text: page text to scan for ``showRyxx`` calls.
    :param data: model container passed through to ``self.append_model``.
    """
    # Raw string: the backslashes are regex escapes, not string escapes.
    pattern = r"showRyxx\('(.*?)','(.*?)','(.*?)'\)"
    search_list = re.findall(pattern, base_text)
    if not search_list:
        return

    for xh, pripid, isck in search_list:
        url = 'http://{host}/ztxy.do?method=frInfoDetail&maent.xh={xh}&maent.pripid={pripid}&isck={issck}&random={rand}'.format(
            host=self.host, xh=xh, pripid=pripid, issck=isck, rand=util.get_time_stamp())
        r = self.task_request(session, session.get, url)
        if r is None:
            self.append_model(data, Model.contributive_info, url, '', status=self.STATUS_FAIL,
                              classify=Model.type_detail)
            continue
        self.append_model(data, Model.contributive_info, url, r.text, classify=Model.type_detail)
def get_detail_html_list(self, seed, session, param_list):
    """Crawl every enterprise described in ``param_list`` and ship the results.

    For each item the base-info page is fetched, then contributive,
    key-person, branch, change, shareholder and annual-report data are
    collected in that order with a 0.5 s pause between steps. An item is
    dropped as soon as one step reports failure. Exceptions raised while
    handling an item are logged via ``self.log.exception`` and do not stop
    the loop.

    :param seed: forwarded to ``self.get_model``.
    :param session: HTTP session; its ``headers`` are replaced for every item.
    :param param_list: iterable of dicts with ``href``, ``Referer`` and
        ``search_name`` keys.
    :return: whatever ``self.sent_to_target`` returns for the collected models.
    """
    data_list = []
    for item in param_list:
        try:
            href = item.get('href', None)
            referer = item.get('Referer', None)
            if href is None or referer is None:
                self.log.error('参数存储异常: item = {item}'.format(item=item))
                continue

            url = 'http://{host}/client/entsearch/{href}'.format(host=self.host, href=href)

            search_name = item.get('search_name', None)
            if search_name is None:
                self.log.error('参数错误: item = {item}'.format(item=item))
                continue

            session.headers = {
                "Host": self.host,
                "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:50.0) Gecko/20100101 Firefox/50.0",
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
                "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
                "Accept-Encoding": "gzip, deflate",
                "Connection": "keep-alive",
                "Referer": referer,
            }

            # Base information
            base_text = self.get_base_info(session, url)
            if base_text is None:
                continue
            if base_text.strip() == '':
                continue

            # A page without #priPID is not a valid detail page.
            pri_pid = PyQuery(base_text, parser='html').find('#priPID').attr('value')
            if pri_pid is None:
                continue

            # Extract the company name.
            company = self.__get_company_name(base_text)
            if company is None or company == '':
                self.log.error('公司名称解析失败..item = {item} {text}'.format(
                    text=base_text, item=item))
                continue

            # Build the data model.
            data = self.get_model(company, seed, search_name, self.province)

            contributive_url = 'http://{host}/midinv/list.json?_t={rand}'.format(
                host=self.host, rand=util.get_time_stamp())
            contributive_data = {
                'params[priPID]': pri_pid,
                'start': '0',
                'length': '1000'
            }

            member_url = 'http://{host}/midmember/list.json?_t={rand}'.format(
                host=self.host, rand=util.get_time_stamp())
            member_data = {
                'priPID': pri_pid,
            }

            branch_url = 'http://{host}/midbranch/list.json?_t={rand}'.format(
                host=self.host, rand=util.get_time_stamp())
            branch_data = {
                'priPID': pri_pid,
            }

            change_url = 'http://{host}/midaltitem/list.json?_t={rand}'.format(
                host=self.host, rand=util.get_time_stamp())
            change_data = {
                'params[priPID]': pri_pid,
                'start': '0',
                'length': '1000'
            }

            # '&params%5BpageNum%5D' is the URL-encoded 'params[pageNum]'
            # parameter; the literal previously held a mis-encoded
            # pilcrow ('¶ms') in place of '&params'.
            shareholder_url = 'http://{host}/im/pub/investalter/investmentListJSON?_t={rand}&pageNum=0&' \
                              'priPID={priPID}&length={length}&params%5BpageNum%5D=0'. \
                format(host=self.host, rand=util.get_time_stamp(), priPID=pri_pid, length='1000')

            annual_url = 'http://{host}/entinfo/list.json?_t={rand}'.format(
                host=self.host, rand=util.get_time_stamp())
            annual_data = {
                'params[priPID]': pri_pid,
                'start': '0',
                'length': '10'
            }

            # Store the base-info page.
            self.append_model(data, Model.base_info, url, base_text)

            time.sleep(0.5)
            # Contributive (capital contribution) information
            if not self.get_contributive_info(session, contributive_url, contributive_data, data):
                self.log.warn('出资信息抓取失败....pripid = {pripid}'.format(pripid=pri_pid))
                continue

            time.sleep(0.5)
            # Key personnel
            if not self.get_key_person_info(session, member_url, member_data, data):
                self.log.warn('主要人员抓取失败....pripid = {pripid}'.format(pripid=pri_pid))
                continue

            time.sleep(0.5)
            # Branches
            if not self.get_branch_info(session, branch_url, branch_data, data):
                self.log.warn('分支机构抓取失败....pripid = {pripid}'.format(pripid=pri_pid))
                continue

            time.sleep(0.5)
            # Change records
            if not self.get_change_info(session, change_url, change_data, data):
                self.log.warn('变更信息抓取失败....pripid = {pripid}'.format(pripid=pri_pid))
                continue

            time.sleep(0.5)
            # Shareholder information
            if not self.get_shareholder_info(session, shareholder_url, data):
                self.log.warn('股东信息抓取失败....pripid = {pripid}'.format(pripid=pri_pid))
                continue

            time.sleep(0.5)
            # Annual reports
            if not self.get_annual_info(session, href, annual_url, annual_data, data):
                self.log.warn('年报信息抓取失败....pripid = {pripid}'.format(pripid=pri_pid))
                continue

            data_list.append(data)
        except Exception as e:
            self.log.exception(e)

    return self.sent_to_target(data_list)
def get_annual_info(self, session, href, annual_url, data, total_data):
    """Collect every annual report listed at ``annual_url``.

    The report list is POSTed for with ``data``. For each entry that has both
    ``year`` and ``anCheID``, the base-info page is fetched with GET and six
    further sections are fetched with POST. Every response is stored on
    ``total_data`` as a detail record tagged with the year. The first failed
    request is recorded with ``self.STATUS_FAIL`` and aborts the whole
    method.

    :param session: HTTP session handed to ``self.filter_request``.
    :param href: link text from which the value between ``docId=`` and
        ``&classFlag`` is extracted.
    :param annual_url: URL of the annual-report list.
    :param data: form payload for the list request.
    :param total_data: model container passed through to ``self.append_model``.
    :return: ``True`` when everything was fetched, ``False`` on any failure.
    """
    encry_pri_pid = util.get_match_value('docId=', '&classFlag', href)
    if encry_pri_pid is None:
        return False

    listing = self.filter_request(session, session.post, url=annual_url, data=data)
    if listing is None:
        return False

    parsed = util.json_loads(listing.text)
    if parsed is None:
        return False

    reports = parsed.get('data')
    if reports is None:
        return False

    for report in reports:
        year = report.get('year')
        year_id = report.get('anCheID')
        if year is None or year_id is None:
            continue

        id_payload = {'anCheID': year_id}
        paged_payload = {'start': '0', 'length': '100', 'params[anCheID]': year_id}

        # Base information (asset status is part of this page).
        base_info_url = 'http://{host}/entinfo/yrinfo?year={year}&encryPriPID={encry_pri_pid}&classFlag=1'.format(
            host=self.host, year=year, encry_pri_pid=encry_pri_pid)
        response = self.filter_request(session, session.get, base_info_url)
        if response is None:
            self.append_model(total_data, Model.annual_info, base_info_url, '',
                              status=self.STATUS_FAIL, year=year, classify=Model.type_detail)
            return False
        self.append_model(total_data, Model.annual_info, base_info_url, response.text,
                          year=year, classify=Model.type_detail)

        # Remaining sections, in request order: website, shareholder,
        # outbound investment, guarantee, equity change, amendment records.
        post_sections = (
            ('http://{host}/pub/WebsiteInfo/publist.json?_t={rand}', id_payload),
            ('http://{host}/pub/subcapitalInfo/publist.json?_t={rand}', paged_payload),
            ('http://{host}/pub/forinvestMentInfo/publist.json?_t={rand}', id_payload),
            ('http://{host}/pub/GuaranteeInfo/publist.json?_t={rand}', paged_payload),
            ('http://{host}/pub/alterStockInfo/publist.json?_t={rand}', paged_payload),
            ('http://{host}/pub/updateinfo/publist.json?_t={rand}', paged_payload),
        )
        for template, payload in post_sections:
            section_url = template.format(host=self.host, rand=util.get_time_stamp())
            response = self.filter_request(session, session.post, section_url, data=payload)
            if response is None:
                self.append_model(total_data, Model.annual_info, section_url, '',
                                  post_data=payload, status=self.STATUS_FAIL, year=year,
                                  classify=Model.type_detail)
                return False
            self.append_model(total_data, Model.annual_info, section_url, response.text,
                              post_data=payload, year=year, classify=Model.type_detail)

    return True
def get_annual_info(self, session, pripid, pritype, data):
    """Collect every section of every annual report for ``pripid``/``pritype``.

    The ``anbaseindex`` list is fetched and parsed as JSON. For each report
    entry that has both ``ancheid`` and ``ancheyear``, nine section
    endpoints are requested in a fixed order. Each response is stored on
    ``data`` as a detail record tagged with the report year; a failed section
    is stored with an empty body and ``self.STATUS_FAIL``, and the crawl
    continues.

    :param session: HTTP session; its ``get`` is handed to ``self.task_request``.
    :param pripid: identifier interpolated into the list and website URLs.
    :param pritype: type code interpolated into the list and website URLs.
    :param data: model container passed through to ``self.append_model``.
    :return: always ``None``.
    """
    # Section templates in request order: base info, website, shareholder,
    # outbound investment, asset status (same endpoint as base info),
    # guarantee, social security, equity change, amendment records.
    section_templates = (
        'http://{host}/gsxt/api/anbaseinfo/queryForm/{ancheid}?currentpage=1&pagesize=100&t={rand}',
        'http://{host}/gsxt/api/anwebsiteinfo/queryList/{ancheid}/{pripid}/{pritype}?currentpage=1&pagesize=100&t={rand}',
        'http://{host}/gsxt/api/ansubcapital/queryList/{ancheid}?currentpage=1&pagesize=100&t={rand}',
        'http://{host}/gsxt/api/anforinvestment/queryList/{ancheid}?currentpage=1&pagesize=100&t={rand}',
        'http://{host}/gsxt/api/anbaseinfo/queryForm/{ancheid}?currentpage=1&pagesize=100&t={rand}',
        'http://{host}/gsxt/api/anforguaranteeinfo/queryList/{ancheid}?currentpage=1&pagesize=100&t={rand}',
        'http://{host}/gsxt/api/ansocialinsuinfo/queryForm/{ancheid}?currentpage=1&pagesize=100&t={rand}',
        'http://{host}/gsxt/api/analterstockinfo/queryList/{ancheid}?currentpage=1&pagesize=100&t={rand}',
        'http://{host}/gsxt/api/anupdateinfo/queryList/{ancheid}?currentpage=1&pagesize=100&t={rand}',
    )

    # Annual-report index
    annual_info_url = 'http://{host}/gsxt/api/anbaseindex/queryList/{pripid}/{pritype}?currentpage=1&pagesize=100&t={rand}'.format(
        host=self.host, pripid=pripid, pritype=pritype, rand=util.get_time_stamp())
    index_response = self.task_request(session, session.get, annual_info_url)
    if index_response is None:
        return None

    groups = util.json_loads(index_response.text)
    if groups is None:
        return None

    for group in groups:
        reports = group.get('list', None)
        if reports is None:
            continue

        for report in reports:
            anche_id = report.get('ancheid', None)
            anche_year = report.get('ancheyear', None)
            if anche_id is None or anche_year is None:
                continue

            for template in section_templates:
                # str.format ignores keyword arguments a template does not use.
                section_url = template.format(
                    host=self.host, ancheid=anche_id, pripid=pripid, pritype=pritype,
                    rand=util.get_time_stamp())
                response = self.task_request(session, session.get, section_url)
                if response is None:
                    self.append_model(data, Model.annual_info, section_url, '',
                                      status=self.STATUS_FAIL, year=anche_year,
                                      classify=Model.type_detail)
                else:
                    self.append_model(data, Model.annual_info, section_url, response.text,
                                      year=anche_year, classify=Model.type_detail)
def get_detail_html_list(self, seed, session, param_list):
    """Crawl every enterprise described in ``param_list`` and ship the results.

    For each item the base-info endpoint is requested, falling back to the
    ``ebaseindex`` endpoint when the first body is 15 characters or shorter.
    Liquidation, change, shareholder, contributive (with per-investor
    details), key-person, branch and annual-report data are then collected.
    A failed section is recorded with ``self.STATUS_FAIL`` and does not drop
    the item. Exceptions raised while handling an item are logged via
    ``self.log.exception`` and do not stop the loop.

    :param seed: forwarded to ``self.get_model``.
    :param session: HTTP session; its ``headers`` are replaced once up front.
    :param param_list: iterable of dicts with ``pripid``, ``pritype`` and
        ``search_name`` keys.
    :return: whatever ``self.sent_to_target`` returns for the collected models.
    """
    data_list = []
    session.headers = {
        'Host': self.host,
        'Connection': 'keep-alive',
        'Pragma': 'no-cache',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/55.0.2883.95 Safari/537.36',
        'Accept': 'application/json, text/plain, */*',
        'Cache-Control': 'no-cache',
        'X-Requested-With': 'XMLHttpRequest',
        'appkey': '8dc7959eeee2792ac2eebb490e60deed',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
    }

    for item in param_list:
        try:
            pri_pid = item.get('pripid', None)
            pri_type = item.get('pritype', None)
            if pri_pid is None or pri_type is None:
                self.log.error('参数信息错误...item = {item}'.format(item=item))
                continue

            search_name = item.get('search_name', None)
            if search_name is None:
                self.log.error('参数错误: item = {item}'.format(item=item))
                continue

            base_info_url = "http://{host}/gsxt/api/ebaseinfo/queryForm/{pripid}/{pritype}?currentpage=1&pagesize=5&t={rand}".format(
                host=self.host, pripid=pri_pid, pritype=pri_type, rand=util.get_time_stamp())

            # Base information
            base_info = self.task_request(session, session.get, base_info_url)
            if base_info is None:
                continue

            if len(base_info.text) <= 15:
                # Body too short to use: retry through the alternative endpoint.
                base_info_url = 'http://{host}/gsxt/api/ebaseindex/queryForm/{pripid}/{pritype}?currentpage=1&pagesize=5&t={rand}'.format(
                    host=self.host, pripid=pri_pid, pritype=pri_type, rand=util.get_time_stamp())
                base_info = self.task_request(session, session.get, base_info_url)
                if base_info is None:
                    # There is no response here, so there is no text to log;
                    # reading base_info.text would raise AttributeError.
                    self.log.info('基本信息抓取失败: pripid = {pripid} text = {text}'.format(
                        pripid=pri_pid, text=''))
                    continue

            company = self.get_copmany_name(base_info.text)
            if company == '' or company is None:
                self.log.error('公司名称信息解析错误..pripid = {pripid} {text}'.format(
                    pripid=pri_pid, text=base_info.text))
                continue

            # Build the data model.
            data = self.get_model(company, seed, search_name, self.province)

            # Change records
            change_info_url = 'http://{host}/gsxt/api/ealterrecoder/queryList/{pripid}/{pritype}?currentpage=1&pagesize=100&t={rand}'.format(
                host=self.host, pripid=pri_pid, pritype=pri_type, rand=util.get_time_stamp())

            # Contributive (capital contribution) information
            contributive_info_url = 'http://{host}/gsxt/api/einv/gdjczxxList/{pripid}/{pritype}?currentpage=1&pagesize=100&t={rand}'.format(
                host=self.host, pripid=pri_pid, pritype=pri_type, rand=util.get_time_stamp())

            # Key personnel
            key_person_info_url = 'http://{host}/gsxt/api/epriperson/queryList/{pripid}/{pritype}?currentpage=1&pagesize=100&t={rand}'.format(
                host=self.host, pripid=pri_pid, pritype=pri_type, rand=util.get_time_stamp())

            # Branches
            branch_info_url = 'http://{host}/gsxt/api/ebrchinfo/queryList/{pripid}/{pritype}?currentpage=1&pagesize=100&t={rand}'.format(
                host=self.host, pripid=pri_pid, pritype=pri_type, rand=util.get_time_stamp())

            # Liquidation information
            liquidation_info_url = 'http://{host}/gsxt/api/eliqmbrn/queryList/{pripid}/{pritype}?currentpage=1&pagesize=100&t={rand}'.format(
                host=self.host, pripid=pri_pid, pritype=pri_type, rand=util.get_time_stamp())

            # Shareholder information
            shareholder_info_url = 'http://{host}/gsxt/api/eiminvupdate/queryList/{pripid}/{pritype}?currentpage=1&pagesize=100&t={rand}'.format(
                host=self.host, pripid=pri_pid, pritype=pri_type, rand=util.get_time_stamp())

            # Store the base-info payload.
            self.append_model(data, Model.base_info, base_info_url, base_info.text)

            # Liquidation information
            liquidation_info = self.task_request(session, session.get, liquidation_info_url)
            if liquidation_info is not None:
                self.append_model(data, Model.liquidation_info, liquidation_info_url, liquidation_info.text)
            else:
                self.append_model(data, Model.liquidation_info, liquidation_info_url, '',
                                  status=self.STATUS_FAIL)

            # Change records
            change_info = self.task_request(session, session.get, change_info_url)
            if change_info is not None:
                self.append_model(data, Model.change_info, change_info_url, change_info.text)
            else:
                self.append_model(data, Model.change_info, change_info_url, '', status=self.STATUS_FAIL)

            # Shareholder information
            shareholder_info = self.task_request(session, session.get, shareholder_info_url)
            if shareholder_info is not None:
                self.append_model(data, Model.shareholder_info, shareholder_info_url, shareholder_info.text)
            else:
                self.append_model(data, Model.shareholder_info, shareholder_info_url, '',
                                  status=self.STATUS_FAIL)

            # Contributive information, plus one detail crawl per investor
            contributive_info = self.task_request(session, session.get, contributive_info_url)
            if contributive_info is not None:
                self.append_model(data, Model.contributive_info, contributive_info_url, contributive_info.text)
                self.get_contributive_info_detail(session, contributive_info.text, data)
            else:
                self.append_model(data, Model.contributive_info, contributive_info_url, '',
                                  status=self.STATUS_FAIL)

            # Key personnel
            key_person_info = self.task_request(session, session.get, key_person_info_url)
            if key_person_info is not None:
                self.append_model(data, Model.key_person_info, key_person_info_url, key_person_info.text)
            else:
                self.append_model(data, Model.key_person_info, key_person_info_url, '',
                                  status=self.STATUS_FAIL)

            # Branches
            branch_info = self.task_request(session, session.get, branch_info_url)
            if branch_info is not None:
                self.append_model(data, Model.branch_info, branch_info_url, branch_info.text)
            else:
                self.append_model(data, Model.branch_info, branch_info_url, '', status=self.STATUS_FAIL)

            # Annual reports
            self.get_annual_info(session, pri_pid, pri_type, data)

            data_list.append(data)
        except Exception as e:
            self.log.exception(e)

    return self.sent_to_target(data_list)