def get_newsinfo(self, url, parser_item_fuc):
    """Fetch one news detail page and parse it with the supplied parser."""
    t_sleep()
    log('Visiting URL', url)
    html = self.get_html(url)
    if html == 'timeout':
        return 'error'
    log('Visiting URL', url, html.status_code)
    if html.status_code != 200:
        log('Error fetching URL', url)
        return 'error'
    response = etree.HTML(html.text)
    title, date, content = parser_item_fuc(response)
    news = News(title=title, date=date, content=content, url=url)
    news.spider_name = 'pbc'
    return news
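# The spiders in this excerpt all lean on a few shared helpers -- t_sleep(),
# log()/log_line(), and the News model -- that are defined elsewhere in the
# repo. A minimal sketch of plausible implementations (names match the call
# sites; the bodies are assumptions, not the project's actual code):
import random
import time


def t_sleep():
    # Random pause between requests so the target sites are not hammered.
    time.sleep(random.uniform(1, 3))


def log(*args):
    # Stand-in for the project's logger.
    print(*args)


def log_line(msg):
    # Visually separated log line, used for errors.
    print('-' * 20, msg, '-' * 20)


class News:
    # Plain data holder; the real model may be an ORM-backed class.
    def __init__(self, title=None, date=None, content=None, url=None):
        self.title = title
        self.date = date
        self.content = content
        self.url = url
        self.spider_name = None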
def get_newsinfo(self, url):
    """Fetch one news detail page.

    :param url: news detail URL
    :return: a News instance, or 'timeout'/'error' on failure
    """
    t_sleep()
    log('Visiting URL', url)
    try:
        html = requests.get(url, headers=self.get_news_header(), timeout=3)
        html.encoding = 'utf-8'
    except Exception as e:
        log_line('Request failed')
        print(e)
        self.__class__.retry = 1
        return 'timeout'
    if html.status_code != 200:
        log_line('Response status is not 200')
        return 'error'
    response = etree.HTML(html.text)
    title, date, content = self.parse_item(response)
    news = News(title=title, date=date, content=content, url=url)
    news.spider_name = 'bjjrj'
    return news
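# get_news_header() is called by every spider but is not part of this
# excerpt; it presumably builds the request headers. A hedged sketch of that
# method (the User-Agent string below is an assumption):
def get_news_header(self):
    return {
        'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                       'AppleWebKit/537.36 (KHTML, like Gecko) '
                       'Chrome/70.0.3538.77 Safari/537.36'),
    }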
def get_iteminfo(self, url):
    """Fetch one news detail page.

    :param url: news detail URL
    :return: a News instance, or 'timeout'/'error' on failure
    """
    t_sleep()
    log('Visiting URL', url)
    try:
        html = requests.get(url, headers=self.get_news_header(), timeout=3)
        html.encoding = 'utf-8'
    except Exception as e:
        log_line('Request failed')
        print(e)
        return 'timeout'
    if html.status_code != 200:
        log('Error fetching URL', url)
        return 'error'
    response = etree.HTML(html.text)
    title, date, content = self.parse_item(response)
    news = News(title=title, date=date, content=content, url=url)
    news.spider_name = 'xinhua'
    return news
def parse_item(self, response, url):
    # Each field falls back to '未知' ("unknown") when its XPath fails.
    try:
        title = response.xpath('//h2[@class="titl"]/text()')[0].strip()
    except Exception:
        title = '未知'
    try:
        # e.g. '2018-05-02 10:00' -> '2018-05-02'
        date = response.xpath('//p[@class="Wh"]/span[1]/text()')[0].strip().split()[0]
        date = str(arrow.get(date)).split('T')[0]
    except Exception:
        date = '未知'
    try:
        con_list = response.xpath('//div[@class="detailCont"]/p')
        content = self.pasre_content(con_list)
    except Exception:
        content = '未知'
    item = News()
    item.title = title
    item.date = date
    item.content = content
    item.url = url
    item.spider_name = 'jingji'
    return item
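# pasre_content() (spelled as in the repo) is not shown in this excerpt;
# judging by its call site it flattens the <div class="detailCont"> paragraph
# nodes into one string. A plausible sketch under that assumption:
def pasre_content(self, con_list):
    paragraphs = []
    for p in con_list:
        # itertext() collects text from nested inline tags inside each <p>.
        text = ''.join(p.itertext()).strip()
        if text:
            paragraphs.append(text)
    return '\n'.join(paragraphs)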
def parser_item(self, item):
    news = News()
    news.spider_name = 'amac'
    news.url = self.parser_url(item.xpath('./@href')[0], 'http://www.amac.org.cn')
    news.title = item.xpath('./text()')[0]
    self.newslist.append(news)
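# parser_url() resolves a (possibly relative) href against the site root for
# the amac, csrc, and circ spiders. A minimal sketch, assuming it is a thin
# wrapper over urljoin:
from urllib.parse import urljoin


def parser_url(self, url, base):
    # Absolute URLs pass through unchanged; relative ones get the base prefix.
    return urljoin(base, url)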
def parser_item(self, item):
    news = News()
    news.spider_name = 'mohurd'
    news.url = item.xpath('./@href')[0]
    news.title = item.xpath('./text()')[0]
    # The date sits in the sibling of the link's parent, wrapped in brackets
    # and dotted, e.g. '(2018.05.02)' -> '2018-05-02'.
    news.date = item.getparent().getnext().xpath('./text()')[0][1:-1].replace('.', '-').strip()
    self.newslist.append(news)
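# Several spiders (mohurd, circ, cbrc) walk the lxml tree with
# getparent()/getnext() to reach the date element next to the link. A small
# self-contained demonstration of that navigation:
from lxml import etree

frag = etree.HTML('<ul><li><a href="/n1.html">title</a></li><li>[2018-05-02]</li></ul>')
link = frag.xpath('//a')[0]
date = link.getparent().getnext().xpath('./text()')[0][1:-1]
print(date)  # -> 2018-05-02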
def parser_item(self, item):
    url = item.xpath('./li[@class="mc"]/div/a/@href')[0]
    date = item.xpath('./li[@class="fbrq"]/text()')[0]
    news = News()
    news.spider_name = 'csrc'
    news.url = self.parser_url(url, 'http://www.csrc.gov.cn/pub/zjhpublic')
    news.title = item.xpath('./li[@class="mc"]/div/a/text()')[0]
    news.date = arrow.get(date).format('YYYY-MM-DD')
    # log(news.url, news.title, news.date)
    self.newslist.append(news)
def parser_item(self, item):
    url = item.xpath('./a/@href')[0]
    date = item.xpath('./span/text()')[0]
    news = News()
    news.spider_name = 'circ'
    news.url = self.parser_url(url, 'http://www.gov.cn')
    news.title = item.xpath('./a/text()')[0]
    news.date = date
    # log(news.url, news.title, news.date)
    self.newslist.append(news)
def parser_item(self, item):
    url = item.xpath('./a/@href')[0]
    # Skip search links that are not news items.
    if 'search' in url:
        return
    # The date lives in the next sibling element, wrapped in brackets.
    date = item.getnext().xpath('./text()')[0][1:-1]
    news = News()
    news.spider_name = 'circ'
    news.url = self.parser_url(url, 'http://www.circ.gov.cn')
    news.title = item.xpath('./a/text()')[0]
    news.date = date
    # log(news.url, news.title, news.date)
    self.newslist.append(news)
def get_html(self, url):
    html = requests.get(url, headers=self.get_news_header())
    html.encoding = 'utf-8'
    html = etree.HTML(html.text)
    items = html.xpath('//a[@class="STYLE8"]')
    for item in items:
        news = News()
        news.spider_name = 'cbrc'
        news.url = item.xpath('./@href')[0]
        news.title = item.xpath('./@title')[0]
        # Publication date is in the sibling of the link's parent cell.
        news.date = item.getparent().getnext().xpath('./text()')[0].strip()
        self.newslist.append(news)
    return self.parser_url(self.newslist)
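# Example driver, assuming a spider instance exposing the single-argument
# get_newsinfo() shown above; the sentinel strings 'timeout'/'error' are
# skipped rather than raised:
def crawl(spider, urls):
    results = []
    for url in urls:
        news = spider.get_newsinfo(url)
        if news in ('timeout', 'error'):
            continue
        results.append(news)
    return results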