Example 1
 def parse_item(self, response):
     logging.info(u"start crawl  --->  " + response.url)
     item = ItemLoader(item=NewsItem(), response=response)
     sel = Selector(response)
     item.add_xpath('keywords', "//head/meta[@name='keywords']/@content")
     item.add_xpath('title', '//div[@class="news-title"]/h1/text()')
     item.add_xpath('author', '//span[@class="writer"]/a/text()')
     item.add_value('source', u'搜狐网')
     item.add_value('original_link', response.url)
     item.add_value('category', CATEGORY.TECHNOLOGY)
     article_time = sel.xpath(
         '//span[@id="pubtime_baidu"]/text()').extract()
     # compare_time() validates the publish time; a falsy return means the
     # article should be skipped.
     date_time = compare_time(article_time)
     if not date_time:
         return
     item.add_value('date_time', article_time)
     elements = sel.xpath('//div[@id="contentText"]/p').extract()
     images, content = translate_content(elements)
     if images:
         item.add_value('image_url',
                        hashlib.sha1(images[0]).hexdigest() + ".jpg")
     item.add_value('image_urls', images)
     item.add_value('content', content)
     logging.info(u"finished crawl  --->  " + response.url)
     yield item.load_item()
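
All of these examples lean on project-level pieces that are never shown: the NewsItem definition, the compare_time() freshness check, the translate_content() body parser, and the CATEGORY constants. The sketch below reconstructs them purely from how they are called above; the field list, the 24-hour window, the default time format, and the CATEGORY value are assumptions, not the original project's code.

# Hypothetical reconstruction of the shared helpers used by every parse_item
# in these examples; behaviour is inferred from the call sites only.
from datetime import datetime, timedelta

import scrapy
from scrapy.selector import Selector


class NewsItem(scrapy.Item):
    # Only the fields the examples actually populate.
    keywords = scrapy.Field()
    title = scrapy.Field()
    author = scrapy.Field()
    source = scrapy.Field()
    original_link = scrapy.Field()
    category = scrapy.Field()
    date_time = scrapy.Field()
    image_url = scrapy.Field()
    image_urls = scrapy.Field()
    content = scrapy.Field()
    reading_number = scrapy.Field()


class CATEGORY(object):
    TECHNOLOGY = u"technology"   # assumed constant value


def compare_time(time_strings, fmt="%Y-%m-%d %H:%M:%S"):
    # Parse the first scraped time string and return it only if the article is
    # recent enough (a 24-hour window is assumed here); otherwise return None
    # so the caller can drop the page.
    if not time_strings:
        return None
    try:
        published = datetime.strptime(time_strings[0].strip(), fmt)
    except ValueError:
        return None
    if datetime.now() - published > timedelta(days=1):
        return None
    return published


def translate_content(fragments):
    # Simplified stand-in: collect <img src> URLs and plain text from the
    # extracted <p>/<h2> HTML fragments.
    images, text_parts = [], []
    for fragment in fragments:
        sel = Selector(text=fragment)
        images.extend(sel.xpath('//img/@src').extract())
        text_parts.extend(t.strip() for t in sel.xpath('//text()').extract() if t.strip())
    return images, u"\n".join(text_parts)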
Example 2
 def parse_item(self, response):
     logging.info(u"start crawl  --->  " + response.url)
     item = ItemLoader(item=NewsItem(), response=response)
     images = []
     sel = Selector(response)
     item.add_xpath('keywords', "//head/meta[@name='keywords']/@content")
     if sel.xpath('//div[@class="neirong-shouquan"]'):
         return
     item.add_xpath('title', '//div[@class="article-wrap"]/h1/text()')
     item.add_xpath('author', '//span[@class="author-name"]/text()')
     item.add_value('source', u'虎嗅网')
     item.add_value('original_link', response.url)
     item.add_value('category', CATEGORY.TECHNOLOGY)
     article_time = sel.xpath('//span[@class="article-time"]/text()').extract()
     date_time = compare_time(article_time, "%Y-%m-%d %H:%M")
     if not date_time:
         return
     item.add_value('date_time', article_time)
     # Header image; guard against pages that have no article-img-box.
     image_url = sel.xpath('//div[@class="article-img-box"]/img/@src').extract_first()
     if image_url:
         images.append(image_url)
     elements = sel.xpath('//div[@id="article_content"]/p').extract()
     # Append inline images from the body instead of reassigning `images`,
     # which would silently discard the header image collected above.
     body_images, content = translate_content(elements)
     images.extend(body_images)
     if images:
         item.add_value('image_url', hashlib.sha1(images[0]).hexdigest() + ".jpg")
     item.add_value('image_urls', images)
     item.add_value('content', content)
     logging.info(u"finished crawl  --->  " + response.url)
     yield item.load_item()
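
Note that add_xpath() and add_value() accumulate lists, so something has to collapse them before the item is stored. The examples instantiate the stock ItemLoader, which means the processors presumably live on the item fields or on a custom loader elsewhere in the project; the snippet below is a guess at a typical setup, not the project's actual loader.

# Assumed loader configuration; the real project's processors are not visible
# in these examples, so the choices below are illustrative.
from scrapy.loader import ItemLoader
from scrapy.loader.processors import Identity, Join, TakeFirst


class NewsLoader(ItemLoader):
    # Most fields are single values: keep only the first extracted result.
    default_output_processor = TakeFirst()
    # The article body arrives as one string built from translate_content().
    content_out = Join(u"\n")
    # image_urls must stay a list for an images pipeline to consume.
    image_urls_out = Identity()

With such a loader, the examples would read NewsLoader(item=NewsItem(), response=response) instead of the plain ItemLoader.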
Example 3
 def parse_item(self, response):
     logging.info(u"start crawl  --->  " + response.url)
     item = ItemLoader(item=NewsItem(), response=response)
     sel = Selector(response)
     # The pubTime XPath starts with //, so it already searches the whole
     # document; no need to pre-select the article body here.
     article_time = sel.xpath('//span[@class="pubTime"]/text()').extract()
     date_time = compare_time(article_time, u"%Y年%m月%d日%H:%M")
     if not date_time:
         return
     item.add_xpath('keywords', "//head/meta[@name='keywords']/@content")
     item.add_value('date_time', date_time)
     item.add_xpath('title', '//div[@class="hd"]/h1/text()')
     item.add_xpath('reading_number', '//em[@id="top_count"]/text()')
     item.add_xpath('author', '//span[@class="auth"]/text()')
     item.add_value('original_link', response.url)
     elements = sel.xpath('//div[@id="Cnt-Main-Article-QQ"]/p').extract()
     images, content = translate_content(elements)
     if images:
         item.add_value('image_url', hashlib.sha1(images[0]).hexdigest() + ".jpg")
     item.add_value('content', content)
     item.add_value('image_urls', images)
     item.add_value('source', u'腾讯科技')
     item.add_value('category', CATEGORY.TECHNOLOGY)
     logging.info(u"finished crawl  --->  " + response.url)
     yield item.load_item()
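
The image_url value built in these examples, hashlib.sha1(url).hexdigest() + ".jpg", matches the file name Scrapy's ImagesPipeline gives a downloaded image (full/<sha1 of url>.jpg), so the field most likely points at the file that pipeline will write from image_urls. The settings below show how that pipeline is typically enabled; the storage path is a placeholder, not the project's real configuration.

# Assumed settings.py wiring for the images pipeline; only the pipeline entry
# and the field name are standard Scrapy, the store path is a placeholder.
ITEM_PIPELINES = {
    "scrapy.pipelines.images.ImagesPipeline": 1,
}
IMAGES_STORE = "/data/news_images"   # placeholder storage directory
IMAGES_URLS_FIELD = "image_urls"     # the field these examples populate (also the default)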
Example 4
 def parse_item(self, response):
     logging.info(u"start crawl  --->  " + response.url)
     item = ItemLoader(item=NewsItem(), response=response)
     sel = Selector(response)
     item.add_xpath('keywords', "//head/meta[@name='keywords']/@content")
     item.add_xpath('title', '//div[@class="pageTop"]/h1/text()')
     item.add_xpath('author', '//div[@class="pi-author"]/a/text()')
     article_time = " ".join(
         sel.xpath('//div[@class="pi-author"]/span/text()').extract())
     date_time = compare_time([article_time], "%Y-%m-%d %H:%M")
     if not date_time:
         return
     item.add_value('date_time', date_time)
     # Note the trailing space in the class value: XPath's @class= is a literal
     # string comparison, so this only matches if the page's class attribute
     # ends with a space as well.
     content = sel.xpath("//div[@class='pageCont lph-article-comView ']")
     elements = content.xpath("h2|p").extract()
     images, content = translate_content(elements)
     if images:
         item.add_value('image_url',
                        hashlib.sha1(images[0]).hexdigest() + ".jpg")
     item.add_value('image_urls', images)
     item.add_value('content', content)
     item.add_value('original_link', response.url)
     item.add_value('category', CATEGORY.TECHNOLOGY)
     item.add_value('source', u'雷锋网')
     logging.info(u"finished crawl  --->  " + response.url)
     yield item.load_item()
Example 5
 def parse_item(self, response):
     logging.info(u"start crawl  --->  " + response.url)
     item = ItemLoader(item=NewsItem(), response=response)
     sel = Selector(response)
     article_time = sel.xpath('//span[@class="date"]/text()').extract()
     date_time = compare_time(article_time)
     if not date_time:
         return
     item.add_xpath('keywords', "//head/meta[@name='keywords']/@content")
     item.add_value('date_time', date_time)
     item.add_xpath('title', '//h2[@id="news_title"]/text()')
     item.add_value('original_link', response.url)
     elements = sel.xpath('//div[@class="content"]/p').extract()
     images, content = translate_content(elements)
     if images:
         item.add_value('image_url', hashlib.sha1(images[0]).hexdigest() + ".jpg")
     item.add_value('content', content)
     item.add_value('image_urls', images)
     item.add_value('source', u'cnBeta')
     item.add_value('category', CATEGORY.TECHNOLOGY)
     logging.info(u"finished crawl  --->  " + response.url)
     yield item.load_item()
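
Each parse_item is written as a rule callback on a CrawlSpider (note the self parameter and that it yields a loaded item). A minimal skeleton of how such a callback is usually wired up is sketched below; the spider name, allowed domain, start URL, and the link-extraction pattern are invented for illustration.

# Hypothetical spider skeleton showing where a parse_item like the ones above
# would plug in; name, domains, start URL, and the URL pattern are illustrative.
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class CnBetaSpider(CrawlSpider):
    name = "cnbeta_news"
    allowed_domains = ["cnbeta.com"]
    start_urls = ["https://www.cnbeta.com/"]
    rules = (
        # Follow article links and hand every matching response to parse_item.
        Rule(LinkExtractor(allow=r"/articles/.*\.htm"), callback="parse_item"),
    )

    def parse_item(self, response):
        """Body as in Example 5 above (the cnBeta version)."""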