コード例 #1
0
ファイル: jingji.py プロジェクト: NickLeeCoder/ttj
    def parse_item(self, response, url):

        try:
            title = (response.xpath('//h2[@class="titl"]/text()'))[0].strip()
        except Exception as e:
            title = '未知'

        try:
            date = (response.xpath('//p[@class="Wh"]/span[1]/text()')
                    )[0].strip().split()[0]
            date = str(arrow.get(date)).split('T')[0]
        except Exception as e:
            date = '未知'

        try:
            con_list = response.xpath('//div[@class="detailCont"]/p')
            content = self.pasre_content(con_list)
        except Exception as e:
            content = '未知'

        item = News()
        item.title = title
        item.date = date
        item.content = content
        item.url = url
        item.spider_name = 'jingji'

        return item
コード例 #2
0
ファイル: mohurd.py プロジェクト: NickLeeCoder/ttj
 def parser_item(self, item):
     news = News()
     news.spider_name = 'mohurd'
     news.url = item.xpath('./@href')[0]
     news.title = item.xpath('./text()')[0]
     news.date = item.getparent().getnext().xpath(
         './text()')[0][1:-1].replace('.', '-').strip()
     self.newslist.append(news)
コード例 #3
0
ファイル: amac2.py プロジェクト: NickLeeCoder/ttj
    def parser_item(self, item):
        news = News()
        news.spider_name = 'amac'
        news.url = self.parser_url(
            item.xpath('./a/@href')[0], 'http://www.amac.org.cn')
        news.title = item.xpath('./a/text()')[0]
        news.date = item.getnext().xpath('./text()')[0]

        # log(news.url, news.title, news.date)
        self.newslist.append(news)
コード例 #4
0
ファイル: csrc.py プロジェクト: NickLeeCoder/ttj
    def parser_item(self, item):
        url = item.xpath('./li[@class="mc"]/div/a/@href')[0]
        date = item.xpath('./li[@class="fbrq"]/text()')[0]

        news = News()
        news.spider_name = 'csrc'
        news.url = self.parser_url(url, 'http://www.csrc.gov.cn/pub/zjhpublic')
        news.title = item.xpath('./li[@class="mc"]/div/a/text()')[0]
        news.date = arrow.get(date).format('YYYY-MM-DD')

        # log(news.url, news.title, news.date)
        self.newslist.append(news)
コード例 #5
0
    def parser_item(self, item):
        url = item.xpath('./a/@href')[0]
        date = item.xpath('./span/text()')[0]

        news = News()
        news.spider_name = 'circ'
        news.url = self.parser_url(url, 'http://www.gov.cn')
        news.title = item.xpath('./a/text()')[0]
        news.date = date

        # log(news.url, news.title, news.date)

        self.newslist.append(news)
コード例 #6
0
ファイル: circ.py プロジェクト: NickLeeCoder/ttj
    def parser_item(self, item):
        url = item.xpath('./a/@href')[0]
        if 'search' in url:
            return

        date = item.getnext().xpath('./text()')[0][1:-1]

        news = News()
        news.spider_name = 'circ'
        news.url = self.parser_url(url, 'http://www.circ.gov.cn')
        news.title = item.xpath('./a/text()')[0]
        news.date = date

        # log(news.url, news.title, news.date)

        self.newslist.append(news)
コード例 #7
0
    def get_html(self, url):
        html = requests.get(url, headers=self.get_news_header())
        html.encoding = 'utf-8'
        html = etree.HTML(html.text)
        items = html.xpath('//a[@class="STYLE8"]')

        for item in items:
            news = News()
            news.spider_name = 'cbrc'
            news.url = item.xpath('./@href')[0]
            news.title = item.xpath('./@title')[0]
            news.date = item.getparent().getnext().xpath('./text()')[0].strip()

            self.newslist.append(news)

        return self.parser_url(self.newslist)