def get_page(self, type, tab):
    """Fetch a page. For simplicity, every parameter that might be needed
    is appended to the url; superfluous parameters do no harm.

    Args:
        type: key into self.urls selecting which page to fetch.
        tab: value appended to the url when visiting the page.
            1 = industrial & commercial publicity info,
            2 = enterprise publicity info,
            3 = publicity info from other departments.
    """
    url = CrawlerUtils.add_params_to_url(
        self.urls[type], {
            'entId': self.ent_id,
            'ent_id': self.ent_id,
            'entid': self.ent_id,
            'credit_ticket': self.credit_ticket,
            'entNo': self.ent_number,
            'entName': '',
            'timeStamp': self.generate_time_stamp(),
            'clear': 'true',
            'str': tab
        })
    settings.logger.info('get %s, url:\n%s\n' % (type, url))
    resp = self.reqst.get(url)
    if resp.status_code != 200:
        settings.logger.warn('get page failed by url %s' % url)
        return
    page = resp.content
    time.sleep(random.uniform(0.2, 1))
    if settings.save_html:
        CrawlerUtils.save_page_to_file(
            self.html_restore_path + type + '.html', page)
    return page
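# CrawlerUtils is defined elsewhere in the repo. As a rough sketch of what
# add_params_to_url presumably does (a hypothetical reimplementation under
# the assumption that it simply urlencodes the dict and appends it), not
# the project's actual helper:
try:
    from urllib.parse import urlencode  # Python 3
except ImportError:
    from urllib import urlencode  # Python 2, which this code style suggests

def add_params_to_url(url, params):
    # Append the encoded query string, keeping any query already present.
    sep = '&' if '?' in url else '?'
    return url + sep + urlencode(params)

# Usage: add_params_to_url('http://example.com/search', {'str': 1})
# -> 'http://example.com/search?str=1'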
def crawl_page_by_url(self, url):
    """Crawl a page directly by its url."""
    resp = self.reqst.get(url)
    if resp.status_code != 200:
        settings.logger.error('crawl page by url failed! url = %s' % url)
        return
    page = resp.content
    time.sleep(random.uniform(0.2, 1))
    if settings.save_html:
        CrawlerUtils.save_page_to_file(
            self.html_restore_path + 'detail.html', page)
    return page
def crawl_page_by_url(self, url):
    """Fetch a page directly by its url."""
    resp = self.reqst.get(url, verify=False)
    if resp.status_code != 200:
        settings.logger.error('failed to crawl page by url %s' % url)
        return
    page = resp.content
    time.sleep(random.uniform(0.2, 1))
    if settings.save_html:
        CrawlerUtils.save_page_to_file(
            self.html_restore_path + 'detail.html', page)
    return page
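# self.reqst is used like a requests.Session throughout (``.get(url)``,
# ``.get(url, verify=False)``, ``.status_code`` on the response). A minimal
# sketch of how such a session might be built in the crawler's __init__;
# the header values are illustrative assumptions, not taken from the repo:
import requests

def make_session():
    session = requests.Session()
    session.headers.update({
        'User-Agent': 'Mozilla/5.0',  # placeholder UA string
        'Accept-Encoding': 'gzip, deflate',
    })
    return session

# In __init__: self.reqst = make_session()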
def parse_ent_pub_annual_report_page(self, base_page, page_type):
    """Parse the enterprise annual report page, which needs separate handling."""
    def get_year_of_annual_report(page):
        soup = BeautifulSoup(page, 'html.parser')
        t = soup.body.find('table')
        return CrawlerUtils.get_raw_text_in_bstag(t.find('tr'))

    if settings.save_html:
        CrawlerUtils.save_page_to_file(
            self.crawler.html_restore_path + 'annual_report_base_info.html',
            base_page)

    page_data = {}
    soup = BeautifulSoup(base_page, 'html.parser')
    if soup.body.find('table'):
        base_table = soup.body.find('table')
        table_name = u'企业基本信息'
        page_data[table_name] = self.parse_table(base_table, table_name,
                                                 base_page)
        if len(soup.find_all('table')) > 1:
            ent_property_table = soup.body.find_all('table')[1]
            table_name = self.get_table_title(ent_property_table)
            page_data[table_name] = self.parse_table(
                ent_property_table, table_name, base_page)

    year = get_year_of_annual_report(base_page)

    # Map each iframe id on the annual report page to the section it holds.
    report_items = {
        'wzFrame': 'website_info',
        'gdczFrame': 'shareholder_contribute_info',
        'dwdbFrame': 'external_guarantee_info',
        'xgFrame': 'modify_record_info'
    }
    for frame_id, table_name in report_items.items():
        pat = re.compile(
            r'<iframe +id="%s" +src=\'(/entPub/entPubAction!.+)\'' % frame_id)
        m = pat.search(base_page)
        if not m:
            continue
        next_url = self.crawler.urls['host'] + m.group(1)
        settings.logger.info('get annual report, url:\n%s\n' % next_url)
        page = self.crawler.crawl_page_by_url(next_url)
        pages = self.crawler.get_all_pages_of_a_section(
            page, page_type, next_url)
        try:
            soup = BeautifulSoup(page, 'html.parser')
            table_name = self.get_table_title(soup.body.table)
        except Exception as e:
            settings.logger.error(
                'fail to get table name with exception %s' % e)
            raise
        try:
            if len(pages) == 1:
                table_data = self.parse_page(page, table_name)
            else:
                table_data = []
                for p in pages:
                    table_data += self.parse_page(p, table_name)
        except Exception as e:
            settings.logger.error(
                'fail to parse page with exception %s' % e)
            raise
        # Record the parsed section only once both steps above succeeded.
        page_data[table_name] = table_data
    return page_data
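# A self-contained check of the iframe regex used above, run against a
# fabricated HTML fragment shaped like the annual-report page (the src
# path is made up for illustration):
import re

sample = ('<iframe id="gdczFrame" '
          "src='/entPub/entPubAction!queryGdcz.dhtml?entId=1'></iframe>")
pat = re.compile(
    r'<iframe +id="%s" +src=\'(/entPub/entPubAction!.+)\'' % 'gdczFrame')
m = pat.search(sample)
print(m.group(1))  # /entPub/entPubAction!queryGdcz.dhtml?entId=1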