def parse_item(self, response):
    """Parse a Sohu (搜狐网) tech-news article page into a NewsItem.

    Drops the article when compare_time() rejects its publish time
    (presumably a recency window — confirm against compare_time's
    definition). Yields the loaded item on success.
    """
    logging.info(u"start crawl ---> " + response.url)
    item = ItemLoader(item=NewsItem(), response=response)
    sel = Selector(response)
    item.add_xpath('keywords', "//head/meta[@name='keywords']/@content")
    item.add_xpath('title', '//div[@class="news-title"]/h1/text()')
    item.add_xpath('author', '//span[@class="writer"]/a/text()')
    item.add_value('source', u'搜狐网')
    item.add_value('original_link', response.url)
    item.add_value('category', CATEGORY.TECHNOLOGY)
    article_time = sel.xpath('//span[@id="pubtime_baidu"]/text()').extract()
    date_time = compare_time(article_time)
    if not date_time:
        # Publish time missing or rejected — skip this page.
        return
    # BUG FIX: store the normalized date_time, not the raw xpath extract;
    # the other spiders in this file all store date_time here.
    item.add_value('date_time', date_time)
    elements = sel.xpath('//div[@id="contentText"]/p').extract()
    images, content = translate_content(elements)
    if images:
        # Stable local filename derived from the first image URL.
        item.add_value('image_url', hashlib.sha1(images[0]).hexdigest() + ".jpg")
        item.add_value('image_urls', images)
    item.add_value('content', content)
    logging.info(u"finished crawl ---> " + response.url)
    yield item.load_item()
def parse_item(self, response):
    """Parse a Huxiu (虎嗅网) article page into a NewsItem.

    Pages carrying the "neirong-shouquan" (licensed-content) marker are
    skipped, as are articles whose publish time fails compare_time().
    """
    logging.info(u"start crawl ---> " + response.url)
    item = ItemLoader(item=NewsItem(), response=response)
    sel = Selector(response)
    item.add_xpath('keywords', "//head/meta[@name='keywords']/@content")
    if sel.xpath('//div[@class="neirong-shouquan"]'):
        # Licensed/authorized reprint page — must not be crawled.
        return
    item.add_xpath('title', '//div[@class="article-wrap"]/h1/text()')
    item.add_xpath('author', '//span[@class="author-name"]/text()')
    item.add_value('source', u'虎嗅网')
    item.add_value('original_link', response.url)
    item.add_value('category', CATEGORY.TECHNOLOGY)
    article_time = sel.xpath('//span[@class="article-time"]/text()').extract()
    date_time = compare_time(article_time, "%Y-%m-%d %H:%M")
    if not date_time:
        return
    # BUG FIX: store the normalized date_time, not the raw xpath extract.
    item.add_value('date_time', date_time)
    # BUG FIX: the original did extract()[0], which raised IndexError on
    # pages without a header image; take at most one safely.
    header_images = sel.xpath('//div[@class="article-img-box"]/img/@src').extract()
    elements = sel.xpath('//div[@id="article_content"]/p').extract()
    body_images, content = translate_content(elements)
    # BUG FIX: the original rebound `images` with translate_content's
    # result, silently discarding the header image it had just collected;
    # keep the header image first, then the in-body images.
    images = header_images[:1] + body_images
    if images:
        item.add_value('image_url', hashlib.sha1(images[0]).hexdigest() + ".jpg")
        item.add_value('image_urls', images)
    item.add_value('content', content)
    logging.info(u"finished crawl ---> " + response.url)
    yield item.load_item()
def parse_item(self, response):
    """Parse a Huxiu (虎嗅网) article page into a NewsItem.

    Pages carrying the "neirong-shouquan" (licensed-content) marker are
    skipped, as are articles whose publish time fails compare_time().
    """
    logging.info(u"start crawl ---> " + response.url)
    item = ItemLoader(item=NewsItem(), response=response)
    sel = Selector(response)
    item.add_xpath('keywords', "//head/meta[@name='keywords']/@content")
    if sel.xpath('//div[@class="neirong-shouquan"]'):
        # Licensed/authorized reprint page — must not be crawled.
        return
    item.add_xpath('title', '//div[@class="article-wrap"]/h1/text()')
    item.add_xpath('author', '//span[@class="author-name"]/text()')
    item.add_value('source', u'虎嗅网')
    item.add_value('original_link', response.url)
    item.add_value('category', CATEGORY.TECHNOLOGY)
    article_time = sel.xpath('//span[@class="article-time"]/text()').extract()
    date_time = compare_time(article_time, "%Y-%m-%d %H:%M")
    if not date_time:
        return
    # BUG FIX: store the normalized date_time, not the raw xpath extract.
    item.add_value('date_time', date_time)
    # BUG FIX: the original did extract()[0], which raised IndexError on
    # pages without a header image; take at most one safely.
    header_images = sel.xpath('//div[@class="article-img-box"]/img/@src').extract()
    elements = sel.xpath('//div[@id="article_content"]/p').extract()
    body_images, content = translate_content(elements)
    # BUG FIX: the original rebound `images` with translate_content's
    # result, silently discarding the header image it had just collected;
    # keep the header image first, then the in-body images.
    images = header_images[:1] + body_images
    if images:
        item.add_value('image_url', hashlib.sha1(images[0]).hexdigest() + ".jpg")
        item.add_value('image_urls', images)
    item.add_value('content', content)
    logging.info(u"finished crawl ---> " + response.url)
    yield item.load_item()
def parse_item(self, response):
    """Parse a Tencent Tech (腾讯科技) article page into a NewsItem.

    Drops the article when compare_time() rejects the publish time
    parsed with the Chinese date format. Yields the loaded item on
    success.
    """
    logging.info(u"start crawl ---> " + response.url)
    item = ItemLoader(item=NewsItem(), response=response)
    sel = Selector(response)
    # The pubTime xpath is absolute (starts with //), so querying the
    # document selector directly is equivalent to the original's query
    # through the article-body selector list.
    article_time = sel.xpath('//span[@class="pubTime"]/text()').extract()
    date_time = compare_time(article_time, u"%Y年%m月%d日%H:%M")
    if not date_time:
        return
    item.add_xpath('keywords', "//head/meta[@name='keywords']/@content")
    item.add_value('date_time', date_time)
    item.add_xpath('title', '//div[@class="hd"]/h1/text()')
    item.add_xpath('reading_number', '//em[@id="top_count"]/text()')
    item.add_xpath('author', '//span[@class="auth"]/text()')
    item.add_value('original_link', response.url)
    # FIX: the original ran this same body xpath twice and shadowed
    # `content` (first a selector list, then text); query it once.
    elements = sel.xpath('//div[@id="Cnt-Main-Article-QQ"]/p').extract()
    images, content = translate_content(elements)
    if images:
        # Stable local filename derived from the first image URL.
        item.add_value('image_url', hashlib.sha1(images[0]).hexdigest() + ".jpg")
    item.add_value('content', content)
    item.add_value('image_urls', images)
    item.add_value('source', u'腾讯科技')
    item.add_value('category', CATEGORY.TECHNOLOGY)
    logging.info(u"finished crawl ---> " + response.url)
    yield item.load_item()
def parse_item(self, response):
    """Parse a Leifeng (雷锋网) article page and yield a NewsItem.

    Skips the page when compare_time() rejects the joined publish-time
    fragments; otherwise loads metadata, body text and image URLs.
    """
    logging.info(u"start crawl ---> " + response.url)
    loader = ItemLoader(item=NewsItem(), response=response)
    selector = Selector(response)
    loader.add_xpath('keywords', "//head/meta[@name='keywords']/@content")
    loader.add_xpath('title', '//div[@class="pageTop"]/h1/text()')
    loader.add_xpath('author', '//div[@class="pi-author"]/a/text()')
    # The publish time is split across several <span>s; join before parsing.
    time_parts = selector.xpath('//div[@class="pi-author"]/span/text()').extract()
    article_time = " ".join(time_parts)
    date_time = compare_time([article_time], "%Y-%m-%d %H:%M")
    if not date_time:
        # Publish time rejected — drop the page.
        return
    loader.add_value('date_time', date_time)
    body = selector.xpath("//div[@class='pageCont lph-article-comView ']")
    paragraphs = body.xpath("h2|p").extract()
    images, content = translate_content(paragraphs)
    if images:
        first_image = images[0]
        loader.add_value('image_url', hashlib.sha1(first_image).hexdigest() + ".jpg")
        loader.add_value('image_urls', images)
    loader.add_value('content', content)
    loader.add_value('original_link', response.url)
    loader.add_value('category', CATEGORY.TECHNOLOGY)
    loader.add_value('source', u'雷锋网')
    logging.info(u"finished crawl ---> " + response.url)
    yield loader.load_item()
def parse_item(self, response):
    """Parse a cnBeta article page and yield a NewsItem.

    Skips the page when compare_time() rejects the publish time;
    otherwise loads metadata, body text and image URLs.
    """
    logging.info(u"start crawl ---> " + response.url)
    loader = ItemLoader(item=NewsItem(), response=response)
    selector = Selector(response)
    published = selector.xpath('//span[@class="date"]/text()').extract()
    date_time = compare_time(published)
    if not date_time:
        # Publish time rejected — drop the page.
        return
    loader.add_xpath('keywords', "//head/meta[@name='keywords']/@content")
    loader.add_value('date_time', date_time)
    loader.add_xpath('title', '//h2[@id="news_title"]/text()')
    loader.add_value('original_link', response.url)
    paragraphs = selector.xpath('//div[@class="content"]/p').extract()
    images, content = translate_content(paragraphs)
    if images:
        first_image = images[0]
        loader.add_value('image_url', hashlib.sha1(first_image).hexdigest() + ".jpg")
    loader.add_value('content', content)
    loader.add_value('image_urls', images)
    loader.add_value('source', u'cnBeta')
    loader.add_value('category', CATEGORY.TECHNOLOGY)
    logging.info(u"finished crawl ---> " + response.url)
    yield loader.load_item()
def parse_item(self, response):
    """Parse a Leifeng (雷锋网) article page and yield a NewsItem.

    Skips the page when compare_time() rejects the joined publish-time
    fragments; otherwise loads metadata, body text and image URLs.
    """
    logging.info(u"start crawl ---> " + response.url)
    loader = ItemLoader(item=NewsItem(), response=response)
    selector = Selector(response)
    loader.add_xpath('keywords', "//head/meta[@name='keywords']/@content")
    loader.add_xpath('title', '//div[@class="pageTop"]/h1/text()')
    loader.add_xpath('author', '//div[@class="pi-author"]/a/text()')
    # The publish time is split across several <span>s; join before parsing.
    fragments = selector.xpath('//div[@class="pi-author"]/span/text()').extract()
    article_time = " ".join(fragments)
    date_time = compare_time([article_time], "%Y-%m-%d %H:%M")
    if not date_time:
        # Publish time rejected — drop the page.
        return
    loader.add_value('date_time', date_time)
    article_body = selector.xpath("//div[@class='pageCont lph-article-comView ']")
    blocks = article_body.xpath("h2|p").extract()
    images, content = translate_content(blocks)
    if images:
        loader.add_value('image_url', hashlib.sha1(images[0]).hexdigest() + ".jpg")
        loader.add_value('image_urls', images)
    loader.add_value('content', content)
    loader.add_value('original_link', response.url)
    loader.add_value('category', CATEGORY.TECHNOLOGY)
    loader.add_value('source', u'雷锋网')
    logging.info(u"finished crawl ---> " + response.url)
    yield loader.load_item()