def parse_item(self, response, url):
    """Parse a jingji news detail page into a News item.

    Each field is extracted best-effort: any extraction failure falls
    back to the placeholder '未知' rather than aborting the item.
    """
    try:
        title = response.xpath('//h2[@class="titl"]/text()')[0].strip()
    except Exception:
        title = '未知'
    try:
        raw_date = response.xpath(
            '//p[@class="Wh"]/span[1]/text()')[0].strip().split()[0]
        # Normalize via arrow, keeping only the date part of the ISO string.
        date = str(arrow.get(raw_date)).split('T')[0]
    except Exception:
        date = '未知'
    try:
        paragraphs = response.xpath('//div[@class="detailCont"]/p')
        content = self.pasre_content(paragraphs)
    except Exception:
        content = '未知'
    item = News()
    item.title = title
    item.date = date
    item.content = content
    item.url = url
    item.spider_name = 'jingji'
    return item
def parser_item(self, item):
    """Turn one amac anchor element into a News record and queue it."""
    href = item.xpath('./@href')[0]
    news = News()
    news.spider_name = 'amac'
    news.url = self.parser_url(href, 'http://www.amac.org.cn')
    news.title = item.xpath('./text()')[0]
    self.newslist.append(news)
def parser_item(self, item):
    """Turn one mohurd anchor element into a News record and queue it."""
    news = News()
    news.spider_name = 'mohurd'
    news.url = item.xpath('./@href')[0]
    news.title = item.xpath('./text()')[0]
    # Date text sits in the element after the link's parent; [1:-1]
    # drops the surrounding delimiter chars (presumably brackets —
    # confirm against the page markup), dots become dashes.
    raw_date = item.getparent().getnext().xpath('./text()')[0]
    news.date = raw_date[1:-1].replace('.', '-').strip()
    self.newslist.append(news)
def parser_item(self, item):
    """Turn one csrc result row into a News record and queue it."""
    href = item.xpath('./li[@class="mc"]/div/a/@href')[0]
    raw_date = item.xpath('./li[@class="fbrq"]/text()')[0]
    news = News()
    news.spider_name = 'csrc'
    news.url = self.parser_url(href, 'http://www.csrc.gov.cn/pub/zjhpublic')
    news.title = item.xpath('./li[@class="mc"]/div/a/text()')[0]
    # Normalize the site's date string to YYYY-MM-DD.
    news.date = arrow.get(raw_date).format('YYYY-MM-DD')
    self.newslist.append(news)
def parser_item(self, item):
    """Turn one gov.cn list entry into a News record and queue it."""
    news = News()
    news.spider_name = 'circ'
    news.url = self.parser_url(item.xpath('./a/@href')[0],
                               'http://www.gov.cn')
    news.title = item.xpath('./a/text()')[0]
    # Date is published verbatim in the entry's <span>.
    news.date = item.xpath('./span/text()')[0]
    self.newslist.append(news)
def parser_item(self, item):
    """Turn one circ.gov.cn entry into a News record and queue it.

    Links whose href contains 'search' are not articles and are skipped.
    """
    href = item.xpath('./a/@href')[0]
    if 'search' in href:
        return
    news = News()
    news.spider_name = 'circ'
    news.url = self.parser_url(href, 'http://www.circ.gov.cn')
    news.title = item.xpath('./a/text()')[0]
    # Date text sits in the next sibling; [1:-1] drops the surrounding
    # delimiter chars (presumably brackets — confirm against the markup).
    news.date = item.getnext().xpath('./text()')[0][1:-1]
    self.newslist.append(news)
def get_html(self, url):
    """Fetch a cbrc listing page and append one News record per link found.

    Parameters:
        url: listing-page URL to fetch.

    Returns:
        Whatever ``self.parser_url(self.newslist)`` returns.
        NOTE(review): every other visible call site of ``parser_url``
        passes ``(url, base_url)`` — passing the whole ``newslist`` here
        looks suspicious; confirm the intended return value (possibly
        ``self.newslist`` itself was meant).
    """
    html = requests.get(url, headers=self.get_news_header())
    # Force UTF-8 decoding before reading .text.
    html.encoding = 'utf-8'
    html = etree.HTML(html.text)
    items = html.xpath('//a[@class="STYLE8"]')
    for item in items:
        news = News()
        news.spider_name = 'cbrc'
        news.url = item.xpath('./@href')[0]
        news.title = item.xpath('./@title')[0]
        # Date lives in the element following the link's parent.
        news.date = item.getparent().getnext().xpath('./text()')[0].strip()
        self.newslist.append(news)
    return self.parser_url(self.newslist)