Example #1
 def parse_item(self, response):
     l = ItemLoader(item=SpiderItem(), response=response)
     try:
         l.add_value('title',
                     response.xpath('//title/text()').extract_first() or '')
         l.add_value(
             'date',
             response.xpath('//span[@class="pubTime"]/text()').
             extract_first() or '1970-01-01 00:00:00')
         l.add_value(
             'source',
             response.xpath('//a[@rel="nofollow"]/text()').extract_first()
             or self.website)
         l.add_value(
             'content', ''.join(
                 response.xpath(
                     '//div[@id="article_text"]/descendant-or-self::p/text()'
                 ).extract()))
     except Exception as e:
         self.logger.error('error url: %s error msg: %s' %
                           (response.url, e))
         l = ItemLoader(item=SpiderItem(), response=response)
         l.add_value('title', '')
         l.add_value('date', '1970-01-01 00:00:00')
         l.add_value('source', '')
         l.add_value('content', '')
     finally:
         l.add_value('url', response.url)
         l.add_value('collection_name', self.name)
         l.add_value('website', self.website)
         return l.load_item()
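Every example below repeats the same scaffolding: fill an ItemLoader inside try, rebuild it with empty defaults in except, and attach url/collection_name/website before load_item(). A minimal sketch of how that boilerplate could be factored into a shared helper; the helper name build_item is hypothetical, and SpiderItem is assumed to come from the project's items module as in the examples:

    from scrapy.loader import ItemLoader

    # Fallback defaults used by all the examples below.
    DEFAULTS = {
        'title': '',
        'date': '1970-01-01 00:00:00',
        'source': '',
        'content': '',
    }

    def build_item(spider, response, fill):
        # Run fill(loader); on any error, start over with empty defaults.
        l = ItemLoader(item=SpiderItem(), response=response)  # SpiderItem: project item class
        try:
            fill(l)
        except Exception as e:
            spider.logger.error('error url: %s error msg: %s' % (response.url, e))
            l = ItemLoader(item=SpiderItem(), response=response)
            for field, default in DEFAULTS.items():
                l.add_value(field, default)
        # Common fields are attached unconditionally, as in the examples.
        l.add_value('url', response.url)
        l.add_value('collection_name', spider.name)
        l.add_value('website', spider.website)
        return l.load_item()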
Example #2
 def parse_item(self, response):
     l = ItemLoader(item=SpiderItem(), response=response)
     try:
         date_source_author = response.xpath(
             '//div[@class="info"]/span/text()').extract()
         l.add_value('title',
                     response.xpath('//title/text()').extract_first() or '')
         l.add_value(
             'date', date_source_author[0]
             if len(date_source_author) > 0 else '1970-01-01 00:00:00')
         l.add_value(
             'source',
             date_source_author[1] if len(date_source_author) > 1 else '')
         l.add_value(
             'content', ''.join(
                 response.xpath(
                     '//div[@class="TRS_Editor"]/descendant-or-self::text()'
                 ).extract()))
     except Exception as e:
         self.logger.error('error url: %s error msg: %s' %
                           (response.url, e))
         l = ItemLoader(item=SpiderItem(), response=response)
         l.add_value('title', '')
         l.add_value('date', '1970-01-01 00:00:00')
         l.add_value('source', '')
         l.add_value('content', '')
     finally:
         l.add_value('url', response.url)
         l.add_value('collection_name', self.name)
         l.add_value('website', self.website)
         return l.load_item()
Example #3
 def parse_item(self, response):
     l = ItemLoader(item=SpiderItem(), response=response)
     try:
         l.add_value('title',
                     response.xpath('//title/text()').extract_first())
         date = response.xpath('//td[@align="center"]/text()').re(
             u'\d+年\d+月\d+日')[0]
         date = date.replace(u'年', '-').replace(u'月', '-').replace(
             u'日', ' ') + '00:00:00'
         l.add_value('date', date)
         l.add_value('source', self.website)
         l.add_value(
             'content', ''.join(
                 response.xpath(
                     '//td[@style="line-height: 30px;font-size:16px; padding-top:10px;"]/descendant-or-self::text()'
                 ).extract()))
     except Exception as e:
         self.logger.error('error url: %s error msg: %s' %
                           (response.url, e))
         l = ItemLoader(item=SpiderItem(), response=response)
         l.add_value('title', '')
         l.add_value('date', '1970-01-01 00:00:00')
         l.add_value('source', '')
         l.add_value('content', '')
     finally:
         l.add_value('url', response.url)
         l.add_value('collection_name', self.name)
         l.add_value('website', self.website)
         return l.load_item()
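Example #3 rewrites a Chinese date such as u'2016年5月3日' into the 'YYYY-MM-DD HH:MM:SS' shape used everywhere else via chained replace() calls. A hedged sketch of the same normalization with zero-padded fields; the function name normalize_cn_date is hypothetical:

    import re

    def normalize_cn_date(text, default='1970-01-01 00:00:00'):
        # Pull a date like u'2016年5月3日' out of arbitrary text and zero-pad it.
        m = re.search(u'(\d+)年(\d+)月(\d+)日', text or u'')
        if not m:
            return default
        return '%04d-%02d-%02d 00:00:00' % tuple(int(g) for g in m.groups())

    # normalize_cn_date(u'2016年5月3日')  ->  '2016-05-03 00:00:00'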
Example #4
 def parse_item(self, response):
     l = ItemLoader(item=SpiderItem(), response=response)
     try:
         l.add_value('title',
                     response.xpath('//title/text()').extract_first() or '')
         l.add_value(
             'date',
             response.xpath('//div[@class="titleInfo"]/span[1]/text()').
             extract_first() or '')
         l.add_value('source', self.website)
         classname = ['artibody']
         content = ''
         for c in classname:
             content += ''.join(
                 response.xpath(
                     '//div[@id="%s"]/descendant-or-self::text()' %
                     c).extract())
         # if content == None or content.strip() == '':
         #     self.logger.info(response.url)
         l.add_value('content', content)
     except Exception as e:
         self.logger.error('error url: %s error msg: %s' %
                           (response.url, e))
         l = ItemLoader(item=SpiderItem(), response=response)
         l.add_value('title', '')
         l.add_value('date', '1970-01-01 00:00:00')
         l.add_value('source', '')
         l.add_value('content', '')
     finally:
         l.add_value('url', response.url)
         l.add_value('collection_name', self.name)
         l.add_value('website', self.website)
         return l.load_item()
Example #5
    def get_news(self, response):
        try:
            l = ItemLoader(item=SpiderItem(), response=response)
            l.add_value('title', response.xpath('//span[@id="thread_subject"]/text()').extract())

            l.add_value('date', response.xpath('//div[@class="authi"]/em/text()').extract())

            r1 = r"\d{4}\-\d{1,2}\-\d{1,2}\s\d{2}\:\d{2}"
            date0 = re.compile(r1)
            date = ''.join(l.get_collected_values('date'))
            date1 = date0.findall(date)
            l.replace_value('date', date1[0])
            l.add_value('content', response.xpath('//div[@class="t_fsz"]/table/tr/td/div/text()').extract())
            l.add_value('content', response.xpath('//div[@class="t_fsz"]/table/tr/td/br/text()').extract())
            l.add_value('content', response.xpath('//div[@class="t_fsz"]/table/tr/td/text()').extract())
            l.add_value('content', response.xpath('//div[@class="t_fsz"]/table/tr/td/div/font/font/text()').extract())
            l.add_value('content', response.xpath('//div[@class="t_fsz"]/table/tr/td/font/font/text()').extract())
            l.add_value('content', response.xpath('//div[@class="t_fsz"]/table/tr/td/font/font/font/text()').extract())
            l.add_value('content', response.xpath('//div[@class="t_fsz"]/table/tr/td/p/font/font/text()').extract())
            l.add_value('content', response.xpath('//div[@class="t_fsz"]/table/tr/td/div/div/div/font/font/strong/text()').extract())

            l.add_value('url', response.url)
            l.add_value('collection_name', self.name)
            return l.load_item()
        except Exception as e:
            self.logger.error('error url: %s error msg: %s' % (response.url, e))
            l = ItemLoader(item=SpiderItem(), response=response)
            l.add_value('title', '')
            l.add_value('date', '1970-01-01 00:00:00')
            l.add_value('source', '')
            l.add_value('content', '')
            l.add_value('url', response.url)
            l.add_value('collection_name', self.name)
            l.add_value('website', self.website)
            return l.load_item()
Example #6
    def get_news(self, response):
        try:
            l = ItemLoader(item=SpiderItem(), response=response)
            l.add_value('title', response.xpath('//div[@class="l_tit"]/text()').extract())

            l.add_value('date', response.xpath('//span[@id="pubtime_baidu"]/text()').extract())

            date = ''.join(l.get_collected_values('date'))
            #date = time.strptime(date.split()[0], '%Y-%m-%d')
            #l.replace_value('date', time.strftime('%Y-%m-%d %H:%M:%S', date))

            l.add_value('content', response.xpath('//div[@class="TRS_Editor"]/p/text()').extract())
            l.add_value('content', response.xpath('//div[@class="TRS_Editor"]/p/strong/text()').extract())
            l.add_value('content', response.xpath('//div[@class="article"]/div/p/text()').extract())

            l.add_value('url', response.url)
            l.add_value('collection_name', self.name)
            return l.load_item()
        except Exception as e:
            self.logger.error('error url: %s error msg: %s' % (response.url, e))
            l = ItemLoader(item=SpiderItem(), response=response)
            l.add_value('title', '')
            l.add_value('date', '1970-01-01 00:00:00')
            l.add_value('source', '')
            l.add_value('content', '')
            l.add_value('url', response.url)
            l.add_value('collection_name', self.name)
            l.add_value('website', self.website)
            return l.load_item()
Example #7
    def parse_item(self, response):
        l = ItemLoader(item=SpiderItem(), response=response)
        try:
            if re.search(r'/404.h', response.body) is not None:
                raise Exception('this item may be deleted')
            if response.status != 200:
                raise Exception('response status %s' % response.status)
            l.add_value('title', response.xpath('//title/text()').extract_first() or '')
            date = response.xpath('//td[@width="25%"]/text()').re(u"\d+年\d+月\d+日")[0]
            date = date.replace(u'年', '-').replace(u'月', '-').replace(u'日', ' ')+'00:00:00'
            l.add_value('date', date)
            l.add_value('source', self.website)
            l.add_value('content', ''.join(response.xpath('//td[@class="font02"]/text()').extract()))
            l.add_value('content', ''.join(response.xpath('//div[@id="ozoom"]/descendant-or-self::p/text()').extract()) or '')

        except Exception as e:
            self.logger.error('error url: %s error msg: %s' % (response.url, e))
            l = ItemLoader(item=SpiderItem(), response=response)
            l.add_value('title', '')
            l.add_value('date', '1970-01-01 00:00:00')
            l.add_value('source', '')
            l.add_value('content', '')
        finally:
            l.add_value('url', response.url)
            l.add_value('collection_name', self.name)
            l.add_value('website', self.website)
            return l.load_item()
Example #8
    def get_news(self, response):
        try:
            l = ItemLoader(item=SpiderItem(), response=response)
            l.add_value('title', response.xpath('//div[@id="biaoti"]/text()').extract())
            l.add_value('title', response.xpath('//h1[@id="biaoti"]/text()').extract())

            l.add_value('date', response.xpath('//span[@id="pubtime_baidu"]/text()').extract())
            l.add_value('date', response.xpath('//div[@class="center lh32 grey12a"]/text()').extract())
            l.add_value('date', response.xpath('//div[@id="left"]/h2/text()').extract())

            l.add_value('content', response.xpath('//div[@id="zw"]/p/text()').extract())
            l.add_value('content', response.xpath('//div[@id="zw"]/strong/p/text()').extract())

            l.add_value('url', response.url)
            l.add_value('collection_name', self.name)
            url = response.url
            if url[11:17] == "shzfzz":
                date = ''.join(l.get_collected_values('date'))
                date = time.strptime(date.split()[0], u'%Y年%m月%d日')
                l.replace_value('date', time.strftime('%Y-%m-%d %H:%M:%S', date))
            return l.load_item()
        except Exception as e:
            self.logger.error('error url: %s error msg: %s' % (response.url, e))
            l = ItemLoader(item=SpiderItem(), response=response)
            l.add_value('title', '')
            l.add_value('date', '1970-01-01 00:00:00')
            l.add_value('source', '')
            l.add_value('content', '')
            l.add_value('url', response.url)
            l.add_value('collection_name', self.name)
            l.add_value('website', self.website)
            return l.load_item()
Example #9
 def get_news(self, response):
     try:
         item = SpiderItem()
         item['title'] = ''.join(
             response.xpath('//div[@class="STYLE67"]/text()').extract())
         item['content'] = ''.join(response.xpath('//p/text()').extract())
         item['collection_name'] = self.name
         item['url'] = response.url
         date0 = response.xpath(
             '//td[@class="style5"]/div[2]/text()').extract()
         date1 = ''.join(date0)
         item['date'] = date1[0:10] + ' 00:00:00'
         item['website'] = self.website
         yield item
     except Exception as e:
         self.logger.error('error url: %s error msg: %s' %
                           (response.url, e))
         l = ItemLoader(item=SpiderItem(), response=response)
         l.add_value('title', '')
         l.add_value('date', '1970-01-01 00:00:00')
         l.add_value('source', '')
         l.add_value('content', '')
         l.add_value('url', response.url)
         l.add_value('collection_name', self.name)
         l.add_value('website', self.website)
         yield l.load_item()
Example #10
 def parse_item(self, response):
     try:
         l = ItemLoader(item=SpiderItem(), response=response)
         l.add_value('title',
                     response.xpath('//title/text()').extract_first())
         da = response.xpath('//div[@class="conpage"]/span/text()'
                             ).extract_first().split(u'\xa0\xa0\xa0\xa0')
         da[1] = search(r'\d+-\d+-\d+\s+(\d+){0,1}(:\d+){0,2}',
                        da[1]).group()
         if da[1].count(':') == 1:
             da[1] += ':00'
         l.add_value('date', da[1])
         l.add_value('source', da[0])
         l.add_value(
             'content', ''.join(
                 response.xpath(
                     '//div[@class="conpage"][2]/descendant-or-self::text()'
                 ).extract()))
         l.add_value('url', response.url)
         l.add_value('collection_name', self.name)
         l.add_value('website', self.website)
         return l.load_item()
     except Exception as e:
         self.logger.error('error url: %s error msg: %s' %
                           (response.url, e))
         l = ItemLoader(item=SpiderItem(), response=response)
         l.add_value('title', '')
         l.add_value('date', '1970-01-01 00:00:00')
         l.add_value('source', '')
         l.add_value('content', '')
         l.add_value('url', response.url)
         l.add_value('collection_name', self.name)
         l.add_value('website', self.website)
         return l.load_item()
Example #11
    def parse_item(self, response):
        l = ItemLoader(item=SpiderItem(), response=response)
        try:
            for attr in ['title', 'date', 'content']:
                function = getattr(self, 'get' + attr, None)
                if function:
                    l.add_value(attr, function(response))
                else:
                    self.logger.error('no method for %s' % attr)

        except Exception as e:
            # self.logger.info((sys.exc_traceback))
            self.logger.error('error url: %s error msg: %s' %
                              (response.url, e))
            l = ItemLoader(item=SpiderItem(), response=response)
            l.add_value('title', '')
            l.add_value('date', '1970-01-01 00:00:00')
            l.add_value('source', '')
            l.add_value('content', '')
        finally:
            l.add_value('url', response.url)
            l.add_value('collection_name', self.name)
            l.add_value('website', self.website)
            return l.load_item()
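Example #11 dispatches to per-field getters via getattr(self, 'get' + attr). A hedged sketch of the getter methods that convention expects on the spider class; the XPaths here are placeholders, not taken from the original:

    def gettitle(self, response):
        return response.xpath('//title/text()').extract_first() or ''

    def getdate(self, response):
        return (response.xpath('//span[@class="time"]/text()').extract_first()
                or '1970-01-01 00:00:00')

    def getcontent(self, response):
        return ''.join(response.xpath(
            '//div[@id="content"]/descendant-or-self::text()').extract())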
Example #12
 def parse_item(self, response):
     sel = Selector(response)
     sites = sel.xpath('//record')
     urls = []
     for site in sites:
         item = SpiderItem()
         pattern = re.compile(".*?title='(.*?)'", re.S)
         item['title'] = re.findall(pattern, response.body)
         pattern1 = re.compile(".*?href='(.*?)'", re.S)
         urls = re.findall(pattern1, response.body)
     for url in urls:
         url = 'http://gzgy.lss.gov.cn' + url
         if self.filter.url_exist(url):
             break
         yield Request(url, callback=self.get_news, dont_filter=True)
Example #13
 def parse_item(self, response):
     l = ItemLoader(item=SpiderItem(), response=response)
     try:
         l.add_value('title',
                     response.xpath('//title/text()').extract_first() or '')
         date = response.xpath(
             '//span[@class="padding-left cccc"]/text()').extract_first()
         if date and date.find(u'年') != -1:
             date = time.strftime(
                 '%Y-%m-%d %H:%M:%S',
                 time.localtime(
                     time.mktime(time.strptime(date,
                                               u'%Y年%m月%d日 %H:%M:%S'))))
         l.add_value('date', date)
         date = response.xpath(
             '//span[@class="time"]/text()').extract_first()
         if date and date.find(u'年') != -1:
             date = time.strftime(
                 '%Y-%m-%d %H:%M:%S',
                 time.localtime(
                     time.mktime(time.strptime(date,
                                               u'%Y年%m月%d日 %H:%M:%S'))))
         l.add_value('date', date)
         l.add_value('source', self.website)
         classname = ['newstext']
         content = ''
         for c in classname:
             content += ''.join(
                 response.xpath(
                     '//div[@class="%s"]/descendant-or-self::text()' %
                     c).extract())
         # if content == None or content.strip() == '':
         #     self.logger.info(response.url)
         l.add_value('content', content)
     except Exception as e:
         self.logger.error('error url: %s error msg: %s' %
                           (response.url, e))
         l = ItemLoader(item=SpiderItem(), response=response)
         l.add_value('title', '')
         l.add_value('date', '1970-01-01 00:00:00')
         l.add_value('source', '')
         l.add_value('content', '')
     finally:
         l.add_value('url', response.url)
         l.add_value('collection_name', self.name)
         l.add_value('website', self.website)
         return l.load_item()
Example #14
    def get_news(self, response):
        l = ItemLoader(item=SpiderItem(), response=response)
        l.add_value('title', response.xpath('//h3/text()').extract())

        l.add_value(
            'date',
            response.xpath(
                '//table[3]/tr[2]/td/table/tr/td/div/text()').extract())

        #r1 = r"\d{4}\-\d{1,2}\-\d{1,2}"
        #date0 = re.compile(r1)
        #date = ''.join(l.get_collected_values('date'))
        #date1 = date0.findall(date)
        # l.replace_value('date', date1[0]+" "+"00:00:00")

        l.add_value(
            'content',
            response.xpath(
                '//table[3]/tr[2]/td/table/tr[3]/td/div/p/text()').extract())
        l.add_value(
            'content',
            response.xpath(
                '//table[3]/tr[2]/td/table/tr[3]/td/div/div/p/text()').extract(
                ))

        l.add_value('url', response.url)
        l.add_value('collection_name', self.name)

        return l.load_item()
Example #15
    def get_news(self, response):
        loader = ItemLoader(item=SpiderItem(), response=response)
        try:
            loader.add_value("title", response.xpath('//h1[@id="title"]/text()').extract_first())
            loader.add_value("title", response.xpath('//span[@id="title"]/text()').extract_first())

            loader.add_value("date", response.xpath('//span[@class="time"]/text()').extract_first())
            loader.add_value("date", response.xpath('//span[@id="pubtime"]/text()').extract_first())
            date = ''.join(loader.get_collected_values("date")).strip()
            date = time.strptime(date, '%Y年%m月%d日 %H:%M:%S')
            loader.replace_value("date", time.strftime("%Y-%m-%d %H:%M:%S", date))

            loader.add_value("content",
                             ''.join(response.xpath('//div[@id="content"]/descendant-or-self::text()').extract()))
            loader.add_value("content",
                             ''.join(response.xpath('//div[@class="article"]/descendant-or-self::text()').extract()))
        except Exception as e:
            self.logger.error('error url: %s error msg: %s' % (response.url, e))
            loader.add_value('title', '')
            loader.add_value('date', '1970-01-01 00:00:00')
            loader.add_value('content', '')

        loader.add_value('url', response.url)
        loader.add_value('collection_name', self.name)
        loader.add_value('website', self.website)

        return loader.load_item()
Example #16
 def get_news(self, response):
     try:
         item = response.meta['item']
         content_type = chardet.detect(response.body)
         if content_type['encoding'] != 'utf-8':
             html_content = response.body.decode(content_type['encoding'], 'ignore')
         else:
             html_content = response.body
         pattern = re.compile('align=center bgcolor="#FFFFFF">(.*?) ', re.S)
         date = re.findall(pattern, html_content)[0]
         response = Selector(text=html_content)
         view_num = response.xpath('//font[@color="red"]/text()').extract_first()
         data = response.xpath('//td[@id="fontzoom"]')
         content = data.xpath('string(.)').extract_first().replace(u'\xa0', '')
         date = time.strptime(date.split(u':')[1].strip(), u'%Y年%m月%d日')
         item['date'] = time.strftime('%Y-%m-%d %H:%M:%S', date)
         item['content'] = content
         item['view_num'] = view_num
         item['collection_name'] = self.name
         item['website'] = self.website
         yield item
     except Exception as e:
         self.logger.error('error url: %s error msg: %s' % (response.url, e))
         l = ItemLoader(item=SpiderItem(), response=response)
         l.add_value('title', '')
         l.add_value('date', '1970-01-01 00:00:00')
         l.add_value('source', '')
         l.add_value('content', '')
         l.add_value('url', response.url)
         l.add_value('collection_name', self.name)
         l.add_value('website', self.website)
         yield l.load_item()
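Example #16 (and Example #27 below) detect the page encoding with chardet before parsing. A minimal sketch of that step pulled into a helper, with the fallback made explicit so the decoded text is always defined; the helper name decode_body is hypothetical:

    import chardet

    def decode_body(body):
        # Detect the encoding; fall back to utf-8 when detection fails.
        detected = chardet.detect(body)
        encoding = detected['encoding'] or 'utf-8'
        return body.decode(encoding, 'ignore')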
Example #17
    def get_news(self, response):
        loader = ItemLoader(item=SpiderItem(), response=response)
        try:
            loader.add_value(
                "title",
                response.xpath(
                    '//td[@style=" padding-bottom:5px; padding-top:5px"]//tr[2]/td/text()'
                ).extract_first())
            date = response.xpath(
                '//td[@style=" padding-bottom:5px; padding-top:5px"]//td[@style="font-size:12px"]/text()'
            ).extract_first()
            loader.add_value("date", date[5:15])
            loader.add_value(
                "content", ''.join(
                    response.xpath(
                        '//td[@style=" padding-bottom:5px; padding-top:5px"]//div/descendant-or-self::text()'
                    ).extract()))
            loader.add_value(
                "content", ''.join(
                    response.xpath(
                        '//td[@style=" padding-bottom:5px; padding-top:5px"]//p/descendant-or-self::text()'
                    ).extract()))
        except Exception as e:
            self.logger.error('error url: %s error msg: %s' %
                              (response.url, e))
            loader.add_value('title', '')
            loader.add_value('date', '1970-01-01 00:00:00')
            loader.add_value('content', '')

        loader.add_value('url', response.url)
        loader.add_value('collection_name', self.name)
        loader.add_value('website', self.website)

        return loader.load_item()
Example #18
    def get_news(self, response):
        loader = ItemLoader(item=SpiderItem(), response=response)
        try:
            loader.add_value(
                'title',
                response.xpath('//p[@class="title1"]/text()').extract_first())

            date = response.xpath(
                '//pre[@class="f_title"]/text()').extract_first()
            loader.replace_value('date', date[date.find(u"日期:") + 3:][0:10])

            loader.add_value(
                'content', ''.join(
                    response.xpath(
                        '//div[@class="contents"]/descendant-or-self::text()').
                    extract()))

        except Exception as e:
            self.logger.error('error url: %s error msg: %s' %
                              (response.url, e))
            loader.add_value('title', '')
            loader.add_value('date', '1970-01-01')
            loader.add_value('content', '')

        loader.add_value('url', response.url)
        loader.add_value('collection_name', self.name)
        loader.add_value('website', self.website)

        return loader.load_item()
Example #19
    def get_news(self, response):
        loader = ItemLoader(item=SpiderItem(), response=response)
        try:
            loader.add_value(
                "title",
                response.xpath(
                    '//h1[@id="artibodyTitle"]/text()').extract_first())
            loader.add_value(
                "title",
                response.xpath(
                    '//div[@class="article-header clearfix"]/h1/text()').
                extract_first())
            loader.add_value(
                "title",
                response.xpath('//h2[@id="titleText"]/text()').extract_first())
            loader.add_value(
                "title",
                response.xpath(
                    '//h1[@id="main_title"]/text()').extract_first())
            loader.add_value(
                "title",
                response.xpath(
                    '//span[@class="location"]/h1/text()').extract_first())

            url = response.url
            loader.add_value(
                "date",
                url[url.rfind('/', 0, url.rfind('/')) + 1:url.rfind('/')])

            loader.add_value(
                "content", ''.join(
                    response.xpath(
                        '//div[@id="artibody"]//p/descendant-or-self::text()').
                    extract()))
            loader.add_value(
                "content", ''.join(
                    response.xpath(
                        '//div[@class="article-body main-body"]//p/descendant-or-self::text()'
                    ).extract()))
            loader.add_value(
                "content", ''.join(
                    response.xpath(
                        '//div[@id="mainContent"]//p/descendant-or-self::text()'
                    ).extract()))
        except Exception as e:
            self.logger.error('error url: %s error msg: %s' %
                              (response.url, e))
            loader.add_value('title', '')
            loader.add_value('date', '1970-01-01 00:00:00')
            loader.add_value('content', '')

        loader.add_value('url', response.url)
        loader.add_value('collection_name', self.name)
        loader.add_value('website', self.website)

        return loader.load_item()
Example #20
    def get_news(self, response):
        loader = ItemLoader(item=SpiderItem(), response=response)
        try:
            loader.add_value("title", response.xpath('//div[@class="articleTitle"]/h2/text()').extract_first())
            loader.add_value("title", response.xpath('//h1[@class=" BSHARE_POP"]/text()').extract_first())

            loader.add_value("date", response.xpath('//span[@class="yearMsg"]/text()').extract_first())
            loader.add_value("date", response.xpath('//span[@id="pubtime_baidu"]/text()').extract_first())

            loader.add_value("source", response.xpath('//span[@class="sourceMsg"]/text()').extract_first())

            loader.add_value("content", ''.join(
                response.xpath('//div[@class="articleBox mb20 cfix"]/descendant-or-self::text()').extract()))
            loader.add_value("content", ''.join(
                response.xpath('//div[@id="pzoom"]/p/descendant::text()').extract()))
        except Exception as e:
            self.logger.error('error url: %s error msg: %s' % (response.url, e))
            loader.add_value('title', '')
            loader.add_value("source", '')
            loader.add_value('date', '1970-01-01 00:00:00')
            loader.add_value('content', '')

        loader.add_value('url', response.url)
        loader.add_value('collection_name', self.name)
        loader.add_value('website', self.website)

        yield loader.load_item()
Example #21
    def get_news(self, response):
        loader = ItemLoader(item=SpiderItem(), response=response)
        try:
            loader.add_value(
                "title",
                response.xpath(
                    '//h1[@id="articleTitle"]/text()').extract_first())

            date = response.xpath(
                '//div[@class="content-th-info"]/span/text()').extract_first()
            loader.add_value("date", date[-16:] + ":00")

            loader.add_value(
                "content", ''.join(
                    response.xpath(
                        '//div[@class="document"]/descendant-or-self::text()').
                    extract()))
        except Exception as e:
            self.logger.error('error url: %s error msg: %s' %
                              (response.url, e))
            loader.add_value('title', '')
            loader.add_value('date', '1970-01-01 00:00:00')
            loader.add_value('content', '')

        loader.add_value('url', response.url)
        loader.add_value('collection_name', self.name)
        loader.add_value('website', self.website)

        return loader.load_item()
Example #22
    def get_news_list(self, response):
        for sel in response.xpath('//ul[contains(@class, "main_right_box")]/li'):
            url = urljoin(response.url, sel.xpath('a/@href').extract_first())
            if self.filter.url_exist(url):
                return
            title = sel.xpath('a/@title').extract_first()
            date = sel.xpath('span/text()').extract_first()
            item = SpiderItem()
            item['url'] = url
            item['title'] = title
            date = time.strptime(date.strip(), '%Y-%m-%d %H:%M:%S')
            item['date'] = time.strftime('%Y-%m-%d %H:%M:%S', date)
            request = Request(url=url, callback=self.get_news)
            request.meta['item'] = item
            yield request

        next_page = response.xpath('//div[@id="AspNetPager1"]/a')[-2].xpath('@href')
        if next_page:
            next_one = int(response.xpath('//span[@style="margin-right:5px;font-weight:Bold;color:red;"]/text()').extract()[0]) + 1
            view_state = response.xpath('//input[@id="__VIEWSTATE"]/@value').extract_first()
            data = {
                '__EVENTTARGET': 'AspNetPager1',
                '__EVENTARGUMENT': str(next_one),
                '__VIEWSTATE': view_state
            }
            headers = {
                'User-Agent': "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36",
                'Host': "gyxfdj.gov.cn",
                'Referer': response.url
            }
            yield FormRequest(url=response.url, method="POST", formdata=data, headers=headers, callback=self.get_news_list)
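Example #22 pages through an ASP.NET WebForms list: the pager works as a postback, so the spider echoes the page's __VIEWSTATE back and names the pager control in __EVENTTARGET with the target page number in __EVENTARGUMENT. A hedged sketch of that request in isolation; the callback comes from the real spider and next_page_request is a hypothetical name:

    from scrapy import FormRequest

    def next_page_request(response, next_page, callback):
        # Re-post to the same URL with the WebForms postback fields.
        return FormRequest(
            url=response.url,
            formdata={
                '__EVENTTARGET': 'AspNetPager1',
                '__EVENTARGUMENT': str(next_page),
                '__VIEWSTATE': response.xpath(
                    '//input[@id="__VIEWSTATE"]/@value').extract_first() or '',
            },
            callback=callback,
        )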
Example #23
    def get_news_list(self, response):
        script = response.xpath('//ul[@class="list01"]').extract_first()
        pattern = re.compile('str_1 = "(.*?)".*?str_3 = "(.*?)".*?<span>(.*?)</span>', re.S)
        results = re.findall(pattern, ''.join(script))
        for result in results:
            url = urljoin(response.url, result[0])
            if self.filter.url_exist(url):
                return
            title = result[1]
            date = result[2]
            item = SpiderItem()
            item['url'] = url
            item['title'] = title
            item['date'] = date.strip() + " 00:00:00"
            request = Request(url=url, callback=self.get_news)
            request.meta['item'] = item
            yield request

        next_page = response.xpath('//div[@class="page"]/script').extract_first()
        pattern = re.compile('createPageHTML\((.*?), (.*?),', re.S)
        result = re.findall(pattern, ''.join(next_page))[0]
        all_page = int(result[0]) - 1
        current_page = int(result[1])
        if all_page != current_page:
            url = response.url
            url = url[0:url.find('index')]
            yield Request(url=url + 'index_' + str(current_page + 1) + ".html", callback=self.get_news_list)
Example #24
    def get_news(self, response):
        loader = ItemLoader(item=SpiderItem(), response=response)
        try:
            loader.add_value(
                "title",
                response.xpath(
                    '//div[@id="cont_1_1_2"]/h1[1]/text()').extract_first())
            loader.add_value(
                "date",
                response.xpath(
                    '//span[@id="pubtime_baidu"]/text()').extract_first())
            loader.add_value(
                "content", ''.join(
                    response.xpath(
                        '//div[@class="left_zw"]/descendant-or-self::text()').
                    extract()))
        except Exception as e:
            self.logger.error('error url: %s error msg: %s' %
                              (response.url, e))
            loader.add_value('title', '')
            loader.add_value('date', '1970-01-01 00:00:00')
            loader.add_value('content', '')

        loader.add_value('url', response.url)
        loader.add_value('collection_name', self.name)
        loader.add_value('website', self.website)

        return loader.load_item()
Example #25
    def get_news(self, response):
        loader = ItemLoader(item=SpiderItem(), response=response)
        try:
            loader.add_value(
                "title",
                response.xpath('//span[@class="jiatype16"]/strong[1]/text()').
                extract_first())

            date = time.strptime(
                response.xpath('//td[@class="news-bg"]/table//tr[2]/td/text()'
                               ).extract()[-1][-19:], "%Y年%m月%d日%H:%M:%S")
            loader.add_value("date", time.strftime("%Y-%m-%d %H:%M:%S", date))

            loader.add_value(
                "content", ''.join(
                    response.xpath(
                        '//td[@class="jiawenzhang-type"]/descendant-or-self::text()'
                    ).extract()))
        except Exception as e:
            self.logger.error('error url: %s error msg: %s' %
                              (response.url, e))
            loader.add_value('title', '')
            loader.add_value('date', '1970-01-01 00:00:00')
            loader.add_value('content', '')

        loader.add_value('url', response.url)
        loader.add_value('collection_name', self.name)
        loader.add_value('website', self.website)

        return loader.load_item()
Example #26
    def get_news(self, response):
        loader = ItemLoader(item=SpiderItem(), response=response)
        try:
            loader.add_value(
                "title",
                response.xpath(
                    '//div[@class="artz"]/h2[1]/text()').extract_first())

            loader.add_value(
                "date",
                response.xpath('//p[@class="fbsj"]/text()').extract_first()
                [3:19] + ":00")

            loader.add_value(
                "content", ''.join(
                    response.xpath(
                        '//div[@class="artz"]/div/descendant-or-self::text()').
                    extract()))
        except Exception as e:
            self.logger.error('error url: %s error msg: %s' %
                              (response.url, e))
            loader.add_value('title', '')
            loader.add_value('date', '1970-01-01 00:00:00')
            loader.add_value('content', '')

        loader.add_value('url', response.url)
        loader.add_value('collection_name', self.name)
        loader.add_value('website', self.website)

        yield loader.load_item()
Example #27
 def get_news_list(self, response):
     content_type = chardet.detect(response.body)
     if content_type['encoding'] != 'utf-8':
         html_content = response.body.decode(content_type['encoding'],
                                             'ignore')
     else:
         html_content = response.body
     selector = Selector(text=html_content)
     more = selector.xpath('//a[@class="class"]/@href')
     if more:
         url = urljoin(response.url, more.extract_first())
         yield Request(url=url, callback=self.get_news_list)
     else:
         follow = True
         for sel in response.xpath('//td[@width="62%"]'):
             item = SpiderItem()
             url = urljoin(response.url,
                           sel.xpath('a/@href').extract_first())
             if self.filter.url_exist(url):
                 follow = False
                 break
             title = sel.xpath('a/@title').extract_first()
             item['url'] = url
             item['title'] = title
             request = Request(url=item['url'], callback=self.get_news)
             request.meta['item'] = item
             yield request
         if follow:
             next_page = response.xpath('//a[@class="black"]')
             if next_page and next_page.xpath(
                     'text()').extract_first() == u'下一页':
                 next_one = next_page.xpath('@href').extract()[-1]
                 yield Request(url=next_one, callback=self.get_news_list)
Example #28
    def parse_item(self, response):
        loader = ItemLoader(item=SpiderItem(), response=response)
        contents = ''
        try:
            title = response.xpath(
                '//div[@class="bt font24_lan"]/text()').extract_first()

            # 1. Match the date against the body text; more complex, and it
            #    may not match on every page:
            # text = response.xpath('//div[@class="lysj"]/ul/li[3]/text()').extract_first()
            # match = re.search(r'([0-9]{4})[^\x00-\xff]([0-9]{2})[^\x00-\xff]([0-9]{2})[^\x00-\xff]', text)
            # if match:
            #     date = match.group(1) + match.group(2) + match.group(3)
            #     dateArray = time.strptime(date, "%Y%m%d")
            #     otherStyleDate = time.strftime("%Y-%m-%d", dateArray)
            #     print "date, ", otherStyleDate

            # 2. Extract the date from the URL: simple and fast
            match = re.search(r't(20[0-9]{6})', response.url)
            date = match.group(1)
            # convert the date format
            dateArray = time.strptime(date, "%Y%m%d")
            otherStyleDate = time.strftime("%Y-%m-%d", dateArray)

            # //text() here extracts the text of every descendant node,
            # unlike /text(), which only takes the direct children's text
            content_list = response.xpath(
                r'//div[@class="nr font14_lan"]/descendant-or-self::p//text()'
            ).extract()
            for content in content_list:
                contents = contents + content

            # ### print info
            # try:
            #     ### str.encode('GB18030') avoids errors when printing Chinese in cmd
            #     print 'title, ', title.encode('GB18030')
            #     print 'url, ', response.url
            #     print "date, ", otherStyleDate
            #     print "content, ", contents.encode('GB18030')
            # except Exception as e:
            #     print " error : ", e

            loader.add_value('title', title)
            loader.add_value('date', otherStyleDate)
        except Exception as e:
            # self.logger.error('error url: %s error msg: %s' % (response.url, e))
            loader.add_value('date', '1970-01-01')
            loader.add_value('title', '')
        finally:
            # self.logger.info('crawling url: %s' % response.url)
            loader.add_value('url', response.url)
            loader.add_value('collection_name', self.name)
            loader.add_value('website', self.website)
            if contents == '':
                self.logger.warning(' url: %s msg: %s' %
                                    (response.url, ' content is None'))
            loader.add_value('content', contents)
            return loader.load_item()
Example #29
 def parse_item(self, response):
     l = ItemLoader(item=SpiderItem(), response=response)
     try:
         l.add_value('title', ''.join(response.xpath('//h1/text()').extract()))
         l.add_value('date', ''.join(response.xpath('//div[@class="news_time_source"]/text()').re(r'\d+-\d+-\d+\W\d+:\d+:\d+')))
         l.add_value('source', self.website)
         l.add_value('content', ''.join(response.xpath('//div[@class="news_main"]/descendant-or-self::text()').extract()))
     except Exception as e:
         self.logger.error('error url: %s error msg: %s' % (response.url, e))
         l = ItemLoader(item=SpiderItem(), response=response)
         l.add_value('title', '')
         l.add_value('date', '1970-01-01 00:00:00')
         l.add_value('source', '')
         l.add_value('content', '')
     finally:
         l.add_value('url', response.url)
         l.add_value('collection_name', self.name)
         l.add_value('website', self.website)
         return l.load_item()
Example #30
    def get_news(self, response):
        try:
            l = ItemLoader(item=SpiderItem(), response=response)
            l.add_value('title', response.xpath('//div[@class="wh645 left"]/p[1]/text()').extract())
            l.add_value('title', response.xpath('//p[@class="f22 lh30 yahei"]/a/text()').extract())
            l.add_value('title', response.xpath('//p[@class="f22 lh40 fb"]/text()').extract())
            l.add_value('title', response.xpath('//div[@class="top"]/p/text()').extract())

            l.add_value('date', response.xpath('//p[@class="lh30 left f14 yahei"]/text()').extract())
            l.add_value('date', response.xpath('//span[@id="pubtime_baidu"]/text()').extract())

            date = ''.join(l.get_collected_values('date'))
            #date = time.strptime(date.split()[0], '%Y-%m-%d')
            #l.replace_value('date', time.strftime('%Y-%m-%d %H:%M:%S', date))

            l.add_value('content', response.xpath('//div[@class="TRS_Editor"]/p/text()').extract())
            l.add_value('content', response.xpath('//div[@class="TRS_Editor"]/div/p/text()').extract())
            l.add_value('content', response.xpath('//div[@class="sanji_left"]/p/text()').extract())
            l.add_value('content', response.xpath('//div[@class="news-con"]/div/p/text()').extract())
            l.add_value('content', response.xpath('//div[@class="news-con"]/div/div/div/font/text()').extract())
            l.add_value('content', response.xpath('//div[@id="news-con"]/div/p/text()').extract())
            l.add_value('content', response.xpath('//div[@id="news-con"]/div/font/font/p/text()').extract())

            l.add_value('url', response.url)
            l.add_value('collection_name', self.name)
            return l.load_item()
        except Exception as e:
            self.logger.error('error url: %s error msg: %s' % (response.url, e))
            l = ItemLoader(item=SpiderItem(), response=response)
            l.add_value('title', '')
            l.add_value('date', '1970-01-01 00:00:00')
            l.add_value('source', '')
            l.add_value('content', '')
            l.add_value('url', response.url)
            l.add_value('collection_name', self.name)
            l.add_value('website', self.website)
            return l.load_item()