def get_contributive_info(self, session, param_dict, data):
    """Fetch every page of the gdczInfo (contribution) listing and store it.

    The first page is requested without a page argument; its ``#countPage``
    element supplies the number of pages.  Each fetched page is appended to
    ``data`` under ``Model.contributive_info`` and then handed to
    ``get_contributive_info_detail``.

    :param session: HTTP session whose ``get`` is passed to ``task_request``.
    :param param_dict: mapping that must provide ``'nbxh'`` and ``'qylx'``.
    :param data: container that ``append_model`` writes into.
    :return: None.  Any exception is logged and swallowed.
    """
    try:
        first_url = 'http://{host}/gsbaseInfoAction_gdczInfo.action?randomNum={rand}&nbxh={nbxh}&qylx={qylx}&menustring=1'.format(
            host=self.host,
            rand=util.get_random_num(),
            nbxh=param_dict['nbxh'],
            qylx=param_dict['qylx'])
        first_resp = self.task_request(session, session.get, first_url)
        if first_resp is None:
            self.append_model(data, Model.contributive_info, first_url, '',
                              status=self.STATUS_FAIL)
            return

        # The page count lives in the hidden #countPage field; when it
        # cannot be read, assume a single page.
        try:
            count_field = PyQuery(first_resp.text, parser='html').find('#countPage')
            total_pages = int(count_field.attr('value'))
        except Exception as err:
            self.log.exception(err)
            total_pages = 1

        if total_pages == 0:
            self.append_model(data, Model.contributive_info, first_url,
                              first_resp.text, status=self.STATUS_NOT_EXIST)
            return

        self.append_model(data, Model.contributive_info, first_url, first_resp.text)
        # Fetch the contribution details referenced by this page.
        self.get_contributive_info_detail(session, first_resp.text, data)

        page_no = 2
        while page_no <= total_pages:
            page_url = 'http://{host}/gsbaseInfoAction_gdczInfo.action?randomNum={rand}&nbxh={nbxh}&qylx={qylx}&menustring=1&currPage={page}'.format(
                host=self.host,
                rand=util.get_random_num(),
                nbxh=param_dict['nbxh'],
                qylx=param_dict['qylx'],
                page=page_no)
            page_resp = self.task_request(session, session.get, page_url)
            if page_resp is None:
                # A failed page aborts the remaining pages.
                self.append_model(data, Model.contributive_info, page_url, '',
                                  status=self.STATUS_FAIL)
                return

            self.append_model(data, Model.contributive_info, page_url, page_resp.text)
            # Fetch the contribution details referenced by this page.
            self.get_contributive_info_detail(session, page_resp.text, data)
            page_no += 1
    except Exception as err:
        self.log.exception(err)
def get_shareholder_info(self, session, pri_pid, data):
    """Fetch all pages of the shareholder listing for ``pri_pid``.

    Pages are requested one at a time; the ``page.totalPage`` field of each
    JSON response decides how many pages to request.  Every successfully
    parsed page is appended to ``data`` under ``Model.shareholder_info``.
    A failed request or a response without a usable ``totalPage`` is
    recorded with ``STATUS_FAIL`` and stops the crawl.

    :param session: HTTP session whose ``get`` is passed to ``task_request``.
    :param pri_pid: value sent as the ``pripid`` query argument.
    :param data: container that ``append_model`` writes into.
    :return: None.
    """
    page = 1
    total_page = 1
    while page <= total_page:
        # NOTE: the page argument must be spelled "&currentPage"; the text
        # "&curren" is easily mangled into the currency sign by HTML-entity
        # decoding, which silently drops the parameter.
        url = ('http://{host}/ansubcapital/queryAnsubcapitaltrue.do'
               '?pripid={pripid}&randommath={randommath}&currentPage={page}'
               ).format(host=self.host, pripid=pri_pid,
                        randommath=util.get_random_num(), page=page)
        r = self.task_request(session, session.get, url)
        if r is None:
            self.append_model(data, Model.shareholder_info, url, '',
                              status=self.STATUS_FAIL)
            return

        json_data = util.json_loads(r.text)
        page_info = json_data.get('page', None) if json_data is not None else None
        total_raw = page_info.get('totalPage', None) if page_info is not None else None
        try:
            # int(None) raises TypeError, so this also covers a missing
            # body, a missing "page" object and a missing "totalPage".
            total_page = int(total_raw)
        except (TypeError, ValueError):
            self.append_model(data, Model.shareholder_info, url, r.text,
                              status=self.STATUS_FAIL)
            return

        # A reported total of 0 still means the page just fetched exists.
        if total_page == 0:
            total_page = 1

        self.append_model(data, Model.shareholder_info, url, r.text)
        page += 1
def get_key_person_info(self, session, pri_pid, data):
    """Fetch the key-person page for ``pri_pid`` and store the response.

    The response body is appended to ``data`` under
    ``Model.key_person_info``; a failed request is stored with an empty
    body and ``STATUS_FAIL``.

    :param session: HTTP session whose ``get`` is passed to ``task_request``.
    :param pri_pid: value sent as the ``pripid`` query argument.
    :param data: container that ``append_model`` writes into.
    :return: None.
    """
    nonce = util.get_random_num()
    target = ('http://{host}/epriperson/queryPerson.do?'
              'pripid={pripid}&randommath={randommath}'
              ).format(host=self.host, pripid=pri_pid, randommath=nonce)

    response = self.task_request(session, session.get, target)
    if response is not None:
        self.append_model(data, Model.key_person_info, target, response.text)
        return

    self.append_model(data, Model.key_person_info, target, '',
                      status=self.STATUS_FAIL)
def get_annual_base_info(self, session, pri_pid, data, year):
    """Fetch the annual-report base information for one year.

    The response is appended to ``data`` under ``Model.annual_info`` with
    ``classify=Model.type_detail`` and the given ``year``; a failed request
    is stored with an empty body and ``STATUS_FAIL``.

    :param session: HTTP session whose ``get`` is passed to ``task_request``.
    :param pri_pid: value sent as the ``pripid`` query argument.
    :param data: container that ``append_model`` writes into.
    :param year: report year, sent as the ``year`` query argument.
    :return: None.
    """
    nonce = util.get_random_num()
    target = ('http://{host}/anbaseinfo/getquerbaseinfo.do'
              '?pripid={pripid}&year={year}&randommath={randommath}'
              ).format(host=self.host, pripid=pri_pid, randommath=nonce,
                       year=year)

    response = self.task_request(session, session.get, target)
    if response is not None:
        self.append_model(data, Model.annual_info, target, response.text,
                          year=year, classify=Model.type_detail)
        return

    self.append_model(data, Model.annual_info, target, '',
                      status=self.STATUS_FAIL, year=year,
                      classify=Model.type_detail)
def get_key_person_info(self, session, param_dict, data):
    """Fetch the zzryMoreInfo (key-person) page and store the response.

    The URL only carries ``nbxh``; the previously supplied ``rand`` and
    ``qylx`` format arguments were never referenced by the template, so
    they are no longer computed or looked up.

    :param session: HTTP session whose ``get`` is passed to ``task_request``.
    :param param_dict: mapping that must provide ``'nbxh'``.
    :param data: container that ``append_model`` writes into.
    :return: None.
    """
    url = 'http://{host}/gsbaseInfoAction_zzryMoreInfo.action?nbxh={nbxh}'.format(
        host=self.host, nbxh=param_dict['nbxh'])
    r = self.task_request(session, session.get, url)
    if r is None:
        self.append_model(data, Model.key_person_info, url, '',
                          status=self.STATUS_FAIL)
        return

    self.append_model(data, Model.key_person_info, url, r.text)
def get_shareholder_info(self, session, param_dict, data):
    """Fetch the gdczGtInfo (shareholder) page and store the response.

    The response body is appended to ``data`` under
    ``Model.shareholder_info``; a failed request is stored with an empty
    body and ``STATUS_FAIL``.

    :param session: HTTP session whose ``get`` is passed to ``task_request``.
    :param param_dict: mapping that must provide ``'nbxh'`` and ``'qylx'``.
    :param data: container that ``append_model`` writes into.
    :return: None.
    """
    target = 'http://{host}/gsbaseInfoAction_gdczGtInfo.action?randomNum={rand}&nbxh={nbxh}&qylx={qylx}&menustring=1'.format(
        host=self.host,
        rand=util.get_random_num(),
        nbxh=param_dict['nbxh'],
        qylx=param_dict['qylx'])

    response = self.task_request(session, session.get, target)
    if response is not None:
        self.append_model(data, Model.shareholder_info, target, response.text)
        return

    self.append_model(data, Model.shareholder_info, target, '',
                      status=self.STATUS_FAIL)
def get_annual_shareholder_info(self, session, pri_pid, data, year):
    """Fetch the annual-report shareholder listing for one year.

    The response is appended to ``data`` under ``Model.annual_info`` with
    ``classify=Model.type_detail`` and the given ``year``.

    :param session: HTTP session whose ``get`` is passed to ``task_request``.
    :param pri_pid: value sent as the ``pripid`` query argument.
    :param data: container that ``append_model`` writes into.
    :param year: report year, sent as the ``year`` query argument.
    :return: ``(url, response_text)`` on success, ``(None, None)`` when the
        request failed (the failure is still recorded with ``STATUS_FAIL``).
    """
    nonce = util.get_random_num()
    target = ('http://{host}/ansubcapital/queryAnsubcapital.do'
              '?pripid={pripid}&year={year}&randommath={randommath}&showCount=100'
              ).format(host=self.host, pripid=pri_pid, randommath=nonce,
                       year=year)

    response = self.task_request(session, session.get, target)
    if response is not None:
        self.append_model(data, Model.annual_info, target, response.text,
                          year=year, classify=Model.type_detail)
        return target, response.text

    self.append_model(data, Model.annual_info, target, '',
                      status=self.STATUS_FAIL, year=year,
                      classify=Model.type_detail)
    return None, None
def get_annual_info(self, session, pri_pid, data):
    """Walk the annual-report index and fetch every section for each year.

    The index is requested page by page.  ``page.totalPage`` of the first
    response decides how many index pages are read; when it is missing or
    not numeric, only the first page is used.  For every entry in the
    ``data`` list that carries an ``ANCHEYEAR`` value, all per-year section
    fetchers are invoked.

    Previously the index was fetched once and its entries were re-processed
    ``totalPage`` times, which duplicated requests and never read the later
    index pages.

    :param session: HTTP session whose ``get`` is passed to ``task_request``.
    :param pri_pid: value sent as the ``pripid`` query argument.
    :param data: container the per-year fetchers write into.
    :return: None.  A failed or unparsable index page ends the walk silently.
    """
    page = 1
    total_page = 1
    while page <= total_page:
        # NOTE: the page argument must be spelled "&currentPage"; the text
        # "&curren" is easily mangled into the currency sign by HTML-entity
        # decoding, which silently drops the parameter.
        url = ('http://{host}/anbaseinfo/queryBaseinfoReport.do'
               '?pripid={pripid}&randommath={randommath}&currentPage={page}'
               ).format(host=self.host, pripid=pri_pid,
                        randommath=util.get_random_num(), page=page)
        r = self.task_request(session, session.get, url)
        if r is None:
            return

        result = util.json_loads(r.text)
        if result is None:
            return

        if page == 1:
            page_info = result.get('page', None) or {}
            try:
                total_page = int(page_info.get('totalPage', None))
            except (TypeError, ValueError):
                # No usable page count: treat the index as a single page.
                total_page = 1

        for item in result.get('data', None) or []:
            try:
                year = item['ANCHEYEAR']
            except (KeyError, IndexError, TypeError):
                # Entry without a report year: nothing to fetch for it.
                continue

            # Annual report: base information
            self.get_annual_base_info(session, pri_pid, data, year)
            # Annual report: website / online-shop information
            self.get_annual_website_info(session, pri_pid, data, year)
            # Annual report: shareholder information
            self.get_annual_shareholder_info(session, pri_pid, data, year)
            # Annual report: outbound investment information
            self.get_annual_investment_info(session, pri_pid, data, year)
            # Annual report: guarantees provided to third parties
            self.get_annual_assurance_info(session, pri_pid, data, year)
            # Annual report: equity change information
            self.get_annual_change_info(session, pri_pid, data, year)
            # Annual report: amendment history
            self.get_annual_amendant_info(session, pri_pid, data, year)
            # Annual report: general business status
            self.get_annual_status_info(session, pri_pid, data, year)

        page += 1
def get_annual_info(self, session, param_dict, data):
    """Fetch every annual report listed on the qynbInfo page.

    The listing page is scanned for ``qynbBase('nbxh','year','qylx')``
    calls.  For each hit the report's base page is fetched and stored, then
    every ``<iframe>`` on that page is fetched and stored as well.  When an
    iframe page exposes a ``#countPage`` value above 1, its remaining pages
    are requested with ``&currPage=N``.

    All results go to ``data`` under ``Model.annual_info`` with
    ``classify=Model.type_detail`` and the report year; failed requests are
    stored with an empty body and ``STATUS_FAIL``.

    :param session: HTTP session whose ``get`` is passed to ``task_request``.
    :param param_dict: mapping that must provide ``'nbxh'`` and ``'qylx'``.
    :param data: container that ``append_model`` writes into.
    :return: None.
    """
    index_url = 'http://{host}/gsbaseInfoAction_qynbInfo.action?randomNum={rand}&nbxh={nbxh}&qylx={qylx}&menustring=4'.format(
        host=self.host,
        rand=util.get_random_num(),
        nbxh=param_dict['nbxh'],
        qylx=param_dict['qylx'])
    index_resp = self.task_request(session, session.get, index_url)
    if index_resp is None:
        return

    # Raw string: same pattern as before, without relying on Python
    # passing unknown escapes such as "\(" through unchanged.
    pattern = r"qynbBase\('(.*?)','(.*?)','(.*?)'\)"
    for nbxh, year, qylx in re.findall(pattern, index_resp.text):
        base_url = 'http://{host}/gsQynbAction_qynbBaseInfo.action?nbxh={nbxh}&anCheYear={year}&qylxFlag=2&qylx={qylx}'.format(
            host=self.host, nbxh=nbxh, year=year, qylx=qylx)
        base_resp = self.task_request(session, session.get, base_url)
        if base_resp is None:
            self.append_model(data, Model.annual_info, base_url, '',
                              status=self.STATUS_FAIL, year=year,
                              classify=Model.type_detail)
            continue

        # Base information of the report.
        self.append_model(data, Model.annual_info, base_url, base_resp.text,
                          year=year, classify=Model.type_detail)

        # The remaining report sections are embedded as iframes.
        for frame in PyQuery(base_resp.text, parser='html').find('iframe').items():
            src = frame.attr('src')
            if not src:
                continue

            frame_url = 'http://{host}{src}'.format(host=self.host, src=src)
            frame_resp = self.task_request(session, session.get, frame_url)
            if frame_resp is None:
                self.append_model(data, Model.annual_info, frame_url, '',
                                  status=self.STATUS_FAIL, year=year,
                                  classify=Model.type_detail)
                continue

            self.append_model(data, Model.annual_info, frame_url,
                              frame_resp.text, year=year,
                              classify=Model.type_detail)

            # Server-side pagination: #countPage holds the page total as
            # text, so convert before comparing.  A missing or non-numeric
            # value means there is nothing further to fetch.
            count_value = PyQuery(frame_resp.text, parser='html').find('#countPage').attr('value')
            try:
                page_total = int(count_value)
            except (TypeError, ValueError):
                page_total = 1

            for page_no in range(2, page_total + 1):
                page_url = 'http://{host}{src}&currPage={pagenum}'.format(
                    host=self.host, src=src, pagenum=page_no)
                page_resp = self.task_request(session, session.get, page_url)
                if page_resp is None:
                    self.append_model(data, Model.annual_info, page_url, '',
                                      status=self.STATUS_FAIL, year=year,
                                      classify=Model.type_detail)
                    # Skip to the next page; there is no body to store.
                    continue

                self.append_model(data, Model.annual_info, page_url,
                                  page_resp.text, year=year,
                                  classify=Model.type_detail)
def get_contributive_info(self, session, pri_pid, data):
    """Fetch all pages of the investor listing plus each investor's detail.

    Listing pages are requested one at a time; ``page.totalPage`` of each
    JSON response decides how many pages to request.  Each listing page is
    stored under ``Model.contributive_info`` with
    ``classify=Model.type_list``.  For every entry in the page's ``data``
    list that has an ``INVID``, the detail page is fetched and stored with
    ``classify=Model.type_detail``.

    A failed listing request, or a listing response without a usable
    ``totalPage``, is recorded with ``STATUS_FAIL`` and stops the crawl.
    A listing page without ``page.showCount`` is stored but ends the crawl
    before any detail is fetched.  A failed detail request is recorded and
    skipped.

    :param session: HTTP session whose ``get`` is passed to ``task_request``.
    :param pri_pid: value sent as the ``pripid`` query argument.
    :param data: container that ``append_model`` writes into.
    :return: None.
    """
    page = 1
    total_page = 1
    while page <= total_page:
        # NOTE: the page argument must be spelled "&currentPage"; the text
        # "&curren" is easily mangled into the currency sign by HTML-entity
        # decoding, which silently drops the parameter.
        url = ('http://{host}/einvperson/getqueryeInvPersonService.do'
               '?pripid={pripid}&randommath={randommath}&currentPage={page}'
               ).format(host=self.host, pripid=pri_pid,
                        randommath=util.get_random_num(), page=page)
        r = self.task_request(session, session.get, url)
        if r is None:
            self.append_model(data, Model.contributive_info, url, '',
                              status=self.STATUS_FAIL,
                              classify=Model.type_list)
            return

        json_data = util.json_loads(r.text)
        page_info = json_data.get('page', None) if json_data is not None else None
        total_raw = page_info.get('totalPage', None) if page_info is not None else None
        try:
            # int(None) raises TypeError, so this also covers a missing
            # body, a missing "page" object and a missing "totalPage".
            total_page = int(total_raw)
        except (TypeError, ValueError):
            self.append_model(data, Model.contributive_info, url, r.text,
                              status=self.STATUS_FAIL,
                              classify=Model.type_list)
            return

        # A reported total of 0 still means the page just fetched exists.
        if total_page == 0:
            total_page = 1

        self.append_model(data, Model.contributive_info, url, r.text,
                          classify=Model.type_list)

        if page_info.get('showCount', None) is None:
            return

        # Fetch the detail page of every investor listed on this page.
        for item in json_data.get('data', None) or []:
            invid = item.get('INVID', None)
            if invid is None:
                continue

            detail_url = 'http://{host}/einvperson/queryInfo?invid={invid}&random={rand}'.format(
                host=self.host, invid=invid, rand=random.randint(10, 100))
            detail_resp = self.task_request(session, session.get, detail_url)
            if detail_resp is None:
                self.append_model(data, Model.contributive_info, detail_url, '',
                                  status=self.STATUS_FAIL,
                                  classify=Model.type_detail)
                continue

            self.append_model(data, Model.contributive_info, detail_url,
                              detail_resp.text, classify=Model.type_detail)

        page += 1