def info(self):
    information = {
        "title": [],
        "article_url": [],
        "type": [],
        "publish_time": [],
        "institution": [],
        "author": [],
        "content": []
    }
    for i in range(1, self._get_pages() + 1):
        bsObj = s_utils.conn_get(cons.SIAN_REPORT_URL + self.yesterday +
                                 "&p=" + str(i))
        # Skip the two header rows of the listing table.
        contents = bsObj.find("div", {"class": "main"}) \
            .find("table").find_all("tr")[2:]
        for content in contents:
            article = content.find_all("td")
            article_info = article[1].find("a")
            article_url = article_info.attrs["href"]
            # The page is GB2312 but was decoded as Latin-1 upstream;
            # re-encoding and decoding recovers the Chinese text.
            information["title"].append(
                article_info.attrs["title"].encode('latin1').decode(
                    'gb2312', 'ignore'))
            information["article_url"].append(article_url)
            information["type"].append(
                article[2].text.encode('latin1').decode('gb2312', 'ignore'))
            information["publish_time"].append(self.yesterday)
            information["institution"].append(
                article[4].find("a").find("div").find("span").text.encode(
                    'latin1').decode('gb2312', 'ignore'))
            information["author"].append(
                article[5].find("div").find("span").text.encode(
                    'latin1').decode('gb2312', 'ignore'))
            try:
                content_bs = s_utils.conn_get(article_url)
                content_text = content_bs.find(
                    "div", {"class": "blk_container"}).find("p").text
                content_text = content_text.encode('latin1').decode(
                    'gb2312', 'ignore')
                content_text = re.sub(r"\n+", "\n", content_text)
                content_text = re.sub(r" +", " ", content_text)
                information["content"].append(content_text)
            except Exception as e:
                # Keep the columns aligned even when an article body fails.
                self.log.info("\n{}".format(e))
                information["content"].append("")
        self.log.info("page {} scraped successfully..".format(i))
        time.sleep(0.01)
    df = pd.DataFrame(information,
                      columns=[
                          "title", "article_url", "type", "publish_time",
                          "institution", "author", "content"
                      ])
    return df
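# A minimal sketch (assumption) of why the encode('latin1').decode('gb2312')
# round-trip above recovers readable Chinese: the site serves GB2312 bytes,
# and if the fetch layer falls back to Latin-1 (requests' default when no
# charset header is present), re-encoding as Latin-1 restores the original
# bytes, which can then be decoded as GB2312. Hypothetical demo, not used
# by the scraper:
def _demo_recover_gb2312():
    raw = "新浪财经".encode("gb2312")      # bytes as served by the site
    mojibake = raw.decode("latin1")        # what a Latin-1 fetch layer returns
    assert mojibake.encode("latin1").decode("gb2312", "ignore") == "新浪财经"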
def _url_for_pdf(self, url, retry=10):
    # Re-fetch on every attempt so transient fetch/parse failures are
    # actually retried (fetching once outside the loop would retry parsing
    # the same broken tree).
    for i in range(retry):
        try:
            bs_obj = s_utils.conn_get(url)
            pdf_url = bs_obj.find("div", {"class": "detail-header"}) \
                .find("h1").find("span").find("a").attrs["href"]
            return pdf_url
        except Exception as e:
            self.log.info(e)
            time.sleep(0.01)
    return None
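# A minimal sketch of what s_utils.conn_get is assumed to do (its real
# implementation is not shown in this section): fetch a URL and return a
# parsed BeautifulSoup tree. Name and behavior here are assumptions for
# illustration only.
def _sketch_conn_get(url):
    import requests
    from bs4 import BeautifulSoup
    resp = requests.get(url, timeout=10)
    # resp.text uses requests' guessed encoding, which is why callers above
    # re-encode via Latin-1 before decoding as GB2312.
    return BeautifulSoup(resp.text, "html.parser")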
def _get_pages(self, retry=3):
    for i in range(retry):
        try:
            bsObj = s_utils.conn_get(cons.SIAN_REPORT_URL + self.yesterday)
            # The total page count is embedded in the onclick handler of the
            # pager's last "next" button.
            page_num = bsObj.find("div", {"class": "page"}) \
                .find("tr").find("td") \
                .find("div", {"class": "pagebox"}) \
                .find_all("span", {"class": "pagebox_next"})[-1] \
                .find("a").attrs["onclick"]
            page_num = re.search(r"(\d+)", page_num).group()
            return int(page_num)
        except Exception as e:
            if i != retry - 1:
                self.log.info(e)
            else:
                self.log.info(
                    "==========>No data day:{}<==========".format(
                        self.yesterday))
                sys.exit()
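# Hypothetical illustration of the page-count extraction in _get_pages: the
# pager's "next" link carries the page number in its onclick handler, and
# re.search pulls out the first run of digits. The onclick string below is
# an assumed example of that shape, not the site's actual markup.
def _demo_parse_page_count():
    onclick = "set_page_num('14')"
    assert int(re.search(r"(\d+)", onclick).group()) == 14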
def info(self):
    information = {
        "title": [],
        "article_url": [],
        "type": [],
        "publish_time": [],
        "institution": [],
        "author": [],
        "content": []
    }
    # URL of the most recently stored report; scraping stops once it is seen.
    first_record = s_utils.get_first_info(cons.get_first_sina_report,
                                          cons.research_report_table_name,
                                          column_name='article_url')
    page = 1
    reached_first_record = False
    while True:
        bsObj = s_utils.conn_get(cons.SIAN_REPORT_URL + self.today +
                                 "&p=" + str(page))
        contents = bsObj.find("div", {"class": "main"}) \
            .find("table").find_all("tr")[2:]
        for content in contents:
            article = content.find_all("td")
            article_info = article[1].find("a")
            article_url = article_info.attrs["href"]
            if article_url == first_record:
                reached_first_record = True
                break
            information["title"].append(
                article_info.attrs["title"].encode('latin1').decode(
                    'gb2312', 'ignore'))
            information["article_url"].append(article_url)
            information["type"].append(
                article[2].text.encode('latin1').decode('gb2312', 'ignore'))
            information["publish_time"].append(self.today)
            information["institution"].append(
                article[4].find("a").find("div").find("span").text.encode(
                    'latin1').decode('gb2312', 'ignore'))
            information["author"].append(
                article[5].find("div").find("span").text.encode(
                    'latin1').decode('gb2312', 'ignore'))
            try:
                content_bs = s_utils.conn_get(article_url)
                content_text = content_bs.find(
                    "div", {"class": "blk_container"}).find("p").text
                content_text = content_text.encode('latin1').decode(
                    'gb2312', 'ignore')
                content_text = re.sub(r"\n+", "\n", content_text)
                content_text = re.sub(r" +", " ", content_text)
                information["content"].append(content_text)
            except Exception as e:
                self.log.info("\n{}".format(e))
                information["content"].append("")
        time.sleep(0.01)
        # Each listing page holds 40 reports: a full page of new rows means
        # there may be more pages. Stop once first_record is reached or the
        # page was short (the bare modulo test alone would loop forever when
        # zero new rows were collected, since 0 % 40 == 0).
        if not reached_first_record and information['article_url'] and \
                len(information['article_url']) % 40 == 0:
            page += 1
        else:
            break
    if len(information['article_url']) > 0:
        df = pd.DataFrame(information,
                          columns=[
                              "title", "article_url", "type", "publish_time",
                              "institution", "author", "content"
                          ])
        return df
    else:
        self.log.info("No data now.")
        sys.exit()
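# A small standalone demo of the whitespace normalization applied to article
# bodies in both info() methods: collapse runs of newlines and runs of
# spaces. Illustration only, not used by the scraper.
def _demo_normalize_whitespace():
    text = "para1\n\n\n  para2   end"
    text = re.sub(r"\n+", "\n", text)
    text = re.sub(r" +", " ", text)
    assert text == "para1\n para2 end"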