def parse_item(self, response):
    item = self.get_new_item(response)
    html = utils.decode(response.body, response.encoding)
    article = response.xpath("//div[contains(@class,'item-page')]")

    # Collect (caption, alt) pairs keyed by absolute image URL.
    caption_images = {}
    caption_imgs = article.xpath(".//p/span[contains(@class,'wf_caption')]")
    imgs = article.xpath(".//p/img")
    for caption_img in caption_imgs:
        if caption_img.xpath(".//img/@alt"):
            image_alt = caption_img.xpath(".//img/@alt").extract()[0].strip()
        elif caption_img.xpath(".//img/@title"):
            image_alt = caption_img.xpath(".//img/@title").extract()[0].strip()
        else:
            image_alt = None
        caption_images[urlparse.urljoin(
            response.url,
            caption_img.xpath(".//img/@src").extract()[0].strip())] = (
                "".join(caption_img.xpath(".//span").extract()), image_alt)
    for img in imgs:
        if img.xpath(".//@alt"):
            image_alt = img.xpath(".//@alt").extract()[0].strip()
        elif img.xpath(".//@title"):
            image_alt = img.xpath(".//@title").extract()[0].strip()
        else:
            image_alt = None
        caption_images[urlparse.urljoin(
            response.url, img.xpath(".//@src").extract()[0])] = (None, image_alt)

    # Gallery slides: full-size URLs are derived from the thumbnails by
    # rewriting the size segment (120-90-80-c -> 600-450-80).
    slid_imgs = []
    for main_images_wrapper in response.xpath(
            "//div[contains(@id,'main_images_wrapper')]"):
        desc = []
        image_info = []
        for main_des_container in main_images_wrapper.xpath(
                ".//div[contains(@id,'main_des_container')]"
                "/div[contains(@class,'des_div')]/p"):
            desc.append(main_des_container.xpath(".//text()").extract()[0].strip())
        for img_tag in main_images_wrapper.xpath(
                ".//div[contains(@id,'main_thumbs_arrow_wrapper')]"
                "/div[contains(@id,'main_thumb_container')]"
                "//img[contains(@class,'ig_thumb')]"):
            if img_tag.xpath(".//@src"):
                path = urlparse.urljoin(
                    response.url,
                    img_tag.xpath(".//@src").extract()[0].strip().replace(
                        "120-90-80-c", "600-450-80"))
                slid_imgs.append(path)
            else:
                path = ""
            alt = (img_tag.xpath(".//@alt").extract()[0].strip()
                   if img_tag.xpath(".//@alt") else None)
            image_info.append((path, alt))
        # Slide descriptions and thumbnails are parallel lists.
        for i in xrange(len(desc)):
            caption_images[image_info[i][0]] = (desc[i], image_info[i][1])

    item['image_urls'] = list(caption_images.keys())
    item["json"]["caption_images"] = caption_images

    # Byline looks like "By Author | date"; fall back to a bare date.
    createdby = response.xpath("//dd[@class='createdby']//text()").extract()[0]
    createdBySplit = createdby.split('|')
    if len(createdBySplit) >= 2:
        item['json']['author'] = createdBySplit[0].strip()[3:]  # drop "By "
        item['json']['date'] = createdBySplit[1].strip()
    else:
        item['json']['date'] = createdby.strip()
    item['json']['title'] = article.xpath(".//h2/text()").extract()[0].strip()
    item['json']['item_url'] = response.url

    # Rebuild the article body: strip title and byline, replace the gallery
    # widget with plain <img> tags, and unwrap captioned images.
    content_document = fromstring(article.extract()[0].strip())
    del_title = content_document.xpath(".//h2")[0]
    del_title.getparent().remove(del_title)
    del_author_date = content_document.xpath(
        ".//dl[contains(@class,'article-info')]")[0]
    del_author_date.getparent().remove(del_author_date)
    if content_document.xpath(".//div[contains(@id,'main_images_wrapper')]"):
        del_main_images_wrapper = content_document.xpath(
            ".//div[contains(@id,'main_images_wrapper')]")[0]
        for image_url in slid_imgs:
            img_doc = Element("img", src=image_url)
            del_main_images_wrapper.addprevious(img_doc)
        del_main_images_wrapper.getparent().remove(del_main_images_wrapper)
        del_igallery_clear_div = content_document.xpath(
            ".//div[contains(@class,'igallery_clear')]")[0]
        del_igallery_clear_div.getparent().remove(del_igallery_clear_div)
    for caption in content_document.xpath(
            ".//p/span[contains(@class,'wf_caption')]"):
        keep_img = caption.xpath(".//img")[0]
        caption.addnext(keep_img)
        caption.getparent().remove(caption)
    item["json"]["content"] = tostring(content_document, encoding="UTF-8")

    sourceurl = response.meta['source_url']
    item['json']['category'] = response.meta['category']
    item['html'] = html
    item["htmls_path"] = {sourceurl: html}
    item['source_url'] = sourceurl
    return item
def parse_item(self, response):
    self.log('this is an item page! %s' % response.url)
    if "jump" not in response.meta:
        # First page of the article: start a fresh item.
        html = utils.decode(response.body)
        response.meta["page_html"] = html
        item = self.get_new_item(response)
        item["htmls_path"] = {}
        item["htmls_path"][response.meta['source_url']] = html
        json = item["json"]
        json['item_url'] = response.url
        if 'category' in response.meta:
            json['category'] = response.meta['category']
        if response.xpath("//h1[contains(@class,'title')]"):
            json['title'] = response.xpath(
                "//h1[contains(@class,'title')]/text()").extract()[0].strip()
        else:
            json['title'] = response.meta['title']
        json['date'] = response.meta['date']
        json['excerpt'] = response.meta['excerpt']
        thumb_urls = []
        if 'thumb_urls' in response.meta:
            thumb_urls += response.meta['thumb_urls']
        item['thumb_urls'] = thumb_urls
        tags = []
        for item_tag in response.xpath(
                "//section[@class='tags']/span[@itemprop='keywords']"
                "/a[@rel='tag']"):
            tagTuple = (item_tag.xpath(".//text()").extract()[0].strip(),
                        item_tag.xpath(".//@href").extract()[0].strip())
            tags.append(tagTuple)
        json['tags'] = tags
        if response.xpath(
                "//div[contains(@class,'date-author-wrap')]"
                "/descendant::a[@rel='author']"):
            json['author'] = response.xpath(
                "//div[contains(@class,'date-author-wrap')]"
                "/descendant::a[@rel='author']/span/text()"
            ).extract()[0].strip()
    else:
        # Continuation page: reuse the item accumulated so far via meta.
        json = response.meta["json"]
        item = response.meta["item"]
        item["htmls_path"][response.meta["sourceurl"]] = utils.decode(
            response.body)

    summaryOriginal = "".join(
        response.xpath("//section[@class='body']").extract()).strip()
    lxmlTree = fromstring(summaryOriginal)
    image_urls = []
    caption_images = {}

    # Replace each blackout-gallery widget with plain <img> tags and record
    # the caption for every slide.
    blackouts = lxmlTree.xpath("//div[contains(@class,'blackout-gallery')]")
    for blackout in blackouts:
        image_divs = blackout.xpath(".//div[@class='image-data']")
        for image_div in image_divs:
            image_url = image_div.xpath(
                ".//div[@class='data-urls']/meta/@content")[0]
            image = urlparse.urljoin(response.url, image_url.encode("UTF-8"))
            width = image_div.xpath(
                ".//div[@class='data-urls']/meta/@data-width")[0].encode(
                    "UTF-8")
            height = image_div.xpath(
                ".//div[@class='data-urls']/meta/@data-height")[0].encode(
                    "UTF-8")
            image_urls.append(image)
            img = "<img src='%s' width='%s' height='%s' />" % (
                image, width, height)
            figcaption = "<figcaption>%s</figcaption>" % image_div.xpath(
                ".//meta[@itemprop='caption']/@content")[0].encode("UTF-8")
            caption_images[image] = (figcaption, None)
            blackout.addnext(fromstring(img))
        blackout.getparent().remove(blackout)

    # Bare inline images (no caption available).
    for url in lxmlTree.xpath("//section[@class='body']/p/img[@src]/@src"):
        image_url = url.encode("UTF-8")
        if image_url not in image_urls:
            image_urls += [image_url]
            caption_images[image_url] = (None, None)

    # <figure> blocks: keep the <img>, capture the figcaption as description.
    for figure in lxmlTree.xpath("//section[@class='body']/figure"):
        image_url = figure.xpath(".//img[@src]/@src")[0].encode("UTF-8")
        desc = tostring(
            figure.xpath(
                ".//figcaption[contains(@class,'wp-caption-text')]")[0],
            encoding="UTF-8")
        if image_url not in image_urls:
            image_urls += [image_url]
        caption_images[image_url] = (desc, None)
        keep_img = figure.xpath(".//img[@src]")[0]
        figure.addnext(keep_img)
        figure.getparent().remove(figure)

    # Strip the wrapping <section> so per-page contents can be concatenated;
    # the tags are re-added once the last page has been parsed.
    section_start = "<section class=\"body\" itemprop=\"articleBody\">"
    section_end = "</section>"
    content = tostring(lxmlTree, encoding="UTF-8").replace(
        section_start, "").replace(section_end, "")
    if "jump" not in response.meta:
        item['image_urls'] = image_urls
        json["caption_images"] = caption_images
        json['content'] = "".join([section_start, content])
    else:
        item['image_urls'] = response.meta["item"]["image_urls"] + image_urls
        json["caption_images"] = dict(
            response.meta["json"]["caption_images"], **caption_images)
        json['content'] = "".join([response.meta["json"]["content"], content])

    # Follow "Pages: 1 2 3 ..." pagination links until the last page.
    pages = response.xpath(
        "//section[@class='body']/following-sibling::p"
        "[starts-with(.,'Pages')]/a/text()")
    if pages:
        if "max_page" not in response.meta:
            max_page = int(pages.extract()[-1])
            jump = True
            next_page = 2
        else:
            max_page = response.meta["max_page"]
            jump = response.meta["jump"]
            next_page = response.meta["next_page"] + 1
        param = response.meta
        if next_page <= max_page:
            param["item"] = item
            param["json"] = json
            param["max_page"] = max_page
            param["next_page"] = next_page
            param["jump"] = jump
            next_url = response.meta['source_url'] + str(next_page) + "/"
            param["sourceurl"] = next_url
            yield scrapy.Request(next_url,
                                 callback=self.parse_item,
                                 dont_filter=True,
                                 meta=param)
        else:
            jump = False
    if not pages or not jump:
        # Last (or only) page: close the <section> and emit the item.
        json['content'] = "".join([json['content'], "\n</section>"])
        item['json'] = json
        try:
            item['html'] = html
        except UnboundLocalError:
            # On continuation pages `html` is never set locally; use the
            # first page's HTML stashed in meta.
            item['html'] = response.meta["page_html"]
        item['source_url'] = response.meta['source_url']
        yield item
def parse_item(self, response):
    html = utils.decode(response.body)
    item = self.get_new_item(response)
    json = item["json"]
    json['item_url'] = response.url
    if 'categories' in response.meta:
        json['categories'] = response.meta['categories']
    thumb_urls = []
    if 'thumb_urls' in response.meta:
        thumb_urls += response.meta['thumb_urls']
    item['thumb_urls'] = thumb_urls
    if response.xpath("//div[@class='main']/article[@id='article']"
                      "/h1[@itemprop='name']"):
        json['title'] = response.xpath(
            "//div[@class='main']/article[@id='article']"
            "/h1[@itemprop='name']/text()").extract()[0].strip()
    if response.xpath("//div[@class='main']/article[@id='article']"
                      "/p[@class='date']"):
        json['date'] = response.xpath(
            "//div[@class='main']/article[@id='article']"
            "/p[@class='date']/text()").extract()[0].strip()
    if response.xpath("//div[@class='main']/article[@id='article']"
                      "/h2[@class='standfirst']"):
        json['excerpt'] = response.xpath(
            "//div[@class='main']/article[@id='article']"
            "/h2[@class='standfirst']/text()").extract()[0].strip()

    image_urls = []
    caption_images = {}
    article = response.xpath(
        "//article[contains(@id,'article')]").extract()[0].strip()
    article_document = fromstring(article)

    # Strip chrome that duplicates the fields already captured above.
    for xpath in (".//div[contains(@class,'actions')]",
                  ".//p[contains(@itemprop,'datePublished')]",
                  ".//h1[contains(@itemprop,'name')]",
                  ".//h2[contains(@itemprop,'description')]",
                  ".//div[contains(@class,'article_tabs')]"):
        nodes = article_document.xpath(xpath)
        if nodes:
            nodes[0].getparent().remove(nodes[0])

    if article_document.xpath(".//img[contains(@class,'article_image')]"):
        article_body = article_document.xpath(
            ".//div[contains(@itemprop,'articleBody')]")[0]
        for article_image in article_document.xpath(
                ".//img[contains(@class,'article_image')]"):
            src = urlparse.urljoin(
                response.url,
                article_image.xpath(".//@src")[0].encode("UTF-8"))
            if article_image.xpath(".//@alt"):
                alt = article_image.xpath(".//@alt")[0].encode("UTF-8")
            elif article_image.xpath(".//@title"):
                alt = article_image.xpath(".//@title")[0].encode("UTF-8")
            else:
                alt = None
            # The caption, if any, lives in the title of the wrapping <a>.
            parent_links = article_image.xpath(".//parent::a")
            if parent_links and Selector(
                    text=tostring(parent_links[0], encoding="UTF-8")).xpath(
                        ".//@title"):
                caption = Selector(
                    text=tostring(parent_links[0], encoding="UTF-8")).xpath(
                        ".//@title").extract()[0].strip()
            else:
                caption = None
            caption_images[src] = (caption, alt)
            image_urls.append(src)
            # Move the featured image ahead of the article body.
            article_body.addprevious(article_image)
    for xpath in (".//div[contains(@class,'letterbox_image_wrapper')]",
                  ".//div[contains(@class,'article_right_col')]"):
        nodes = article_document.xpath(xpath)
        if nodes:
            nodes[0].getparent().remove(nodes[0])

    # Inline body images carry no captions, only alt/title text.
    image_infos = []
    for image in response.xpath(
            ".//div[contains(@itemprop,'articleBody')]"
            "/descendant::img[@src]"):
        image_src = urlparse.urljoin(
            response.url, image.xpath(".//@src").extract()[0].strip())
        if image.xpath(".//@alt"):
            image_alt = image.xpath(".//@alt").extract()[0].strip()
        elif image.xpath(".//@title"):
            image_alt = image.xpath(".//@title").extract()[0].strip()
        else:
            image_alt = None
        image_infos.append((image_src, image_alt))
    for image_url, image_alt in image_infos:
        caption_images[image_url] = (None, image_alt)
        image_urls.append(image_url)

    item['image_urls'] = image_urls
    json["caption_images"] = caption_images
    json['content'] = tostring(article_document, encoding="UTF-8")
    item['json'] = json
    item['html'] = html
    item["htmls_path"] = {response.meta['source_url']: response.body}
    item['source_url'] = response.meta['source_url']

    # If the article links to an author page, fetch it before emitting; the
    # item travels along in meta and is yielded by parse_authors.
    authors_url = ''
    if response.xpath("//div[@id='article_author']"
                      "/descendant::a[@class='author_link']"):
        authors_url = urlparse.urljoin(
            response.url,
            response.xpath(
                "//div[@id='article_author']"
                "/descendant::a[@class='author_link']/@href"
            ).extract()[0].strip())
    if authors_url:
        yield scrapy.Request(authors_url,
                             callback=self.parse_authors,
                             dont_filter=True,
                             meta={'item': item, "authors_url": authors_url})
    else:
        yield item
def parse_item(self, response):
    self.log('this is an item page! %s' % response.url)
    html = utils.decode(response.body)
    item = DataFetchersItem()
    json = {}
    json['item_url'] = response.url
    if "reparse_from" in response.meta:
        json["reparse_from"] = response.meta["reparse_from"]
    if self.is_reparse:
        # Reparse run: metadata comes from the previously stored item.
        if 'category' in response.meta["json"]:
            json['category'] = response.meta["json"]['category']
        json['title'] = response.meta["json"]['title']
        json['date'] = response.meta["json"]['date']
        json['excerpt'] = response.meta["json"]['excerpt']
        thumb_urls = []
        if 'thumb_urls' in response.meta["json"]:
            thumb_urls += response.meta["json"]['thumb_urls']
    else:
        if 'category' in response.meta:
            json['category'] = response.meta['category']
        json['title'] = response.meta['title']
        json['date'] = response.meta['date']
        json['excerpt'] = response.meta['excerpt']
        thumb_urls = []
        if 'thumb_urls' in response.meta:
            thumb_urls += [response.meta['thumb_urls']]
    item['thumb_urls'] = thumb_urls

    tags = []
    for item_tag in response.xpath(
            "//section[@class='tags']/span[@itemprop='keywords']"
            "/a[@rel='tag']"):
        tagTuple = (item_tag.xpath(".//text()").extract()[0].strip(),
                    item_tag.xpath(".//@href").extract()[0].strip())
        tags.append(tagTuple)
    json['tags'] = tags
    if response.xpath(
            "//div[contains(@class,'date-author-wrap')]"
            "/descendant::a[@rel='author']"):
        json['author'] = response.xpath(
            "//div[contains(@class,'date-author-wrap')]"
            "/descendant::a[@rel='author']/span/text()").extract()[0].strip()

    # Extract the article body with readability, then collect image URLs.
    res = utils.readability(html=html)
    summaryOriginal = res['content']
    lxmlTree = fromstring(summaryOriginal)
    image_urls = []
    for url in lxmlTree.xpath(
            "//section[@class='body']/descendant::img[@src]/@src"):
        image_urls += [url.encode("UTF-8")]
    item['image_urls'] = image_urls
    caption_images = {}
    for image_url in image_urls:
        # No per-image caption or alt text is available from the
        # readability output.
        caption_images[image_url] = (None, None)
    json["caption_images"] = caption_images
    json['content'] = tostring(
        lxmlTree.xpath("//section[@class='body']")[0], encoding="UTF-8")
    del lxmlTree

    item['json'] = json
    item['html'] = html
    item["htmls_path"] = {response.meta['source_url']: response.body}
    item['source_url'] = response.meta['source_url']
    item['source_crawler'] = self.source_crawler
    item['crawl_type'] = self.crawl_type
    item['created_time'] = datetime.datetime.now()
    yield item

    # Crawl each tag's listing page exactly once.
    for tagKey, tagValue in tags:
        if tagKey not in self.hasCrawledTags:
            self.hasCrawledTags.add(tagKey)
            yield scrapy.Request(tagValue,
                                 callback=self.parse_list,
                                 dont_filter=True)
    gc.collect()
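
# utils.readability above returns a dict whose 'content' key holds the
# cleaned article HTML. For comparison, a sketch of the same step using the
# readability-lxml package directly (an assumption about what the wrapper
# might do, not a description of utils.readability itself):
from readability import Document

def extract_main_content(html):
    """Return the readability-cleaned main-article HTML for a raw page."""
    return Document(html).summary()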
def parse_item(self, response):
    url = response.url
    self.log("this is an item page! %s" % url)
    article_section_div = response.xpath(
        CSSSelector("div.article-section").path)
    if article_section_div and article_section_div.xpath(
            ".//a/text()").extract()[0].strip() == "News":
        html = utils.decode(response.body)
        item = self.get_new_item(response)
        item["thumb_urls"] = []
        json = item["json"]

        tags = []
        for item_tag in response.xpath(
                CSSSelector("div.field-name-field-topics-the-packer a").path):
            tagTuple = (
                item_tag.xpath(".//text()").extract()[0].strip(),
                urlparse.urljoin(
                    url, item_tag.xpath(".//@href").extract()[0].strip()))
            tags.append(tagTuple)
        json['tags'] = tags

        if response.xpath("//h1[contains(@itemprop,'headline')]"):
            json["title"] = response.xpath(
                "//h1[contains(@itemprop,'headline')]/text()"
            ).extract()[0].strip()
        elif "title" in response.meta:
            json["title"] = response.meta["title"]
        json["item_url"] = url
        # Feed-level metadata passed along from the RSS listing, if present.
        for key in ("description", "pubDate", "channel"):
            if key in response.meta:
                json[key] = response.meta[key]
        sourceurl = response.meta["source_url"]

        # About-the-author box.
        about_author_div = response.xpath(
            CSSSelector("div.article-author").path)
        if about_author_div:
            author = {}
            if about_author_div.xpath(
                    ".//div[@class='author-image']/img/@src"):
                author["author_img"] = urlparse.urljoin(
                    url,
                    about_author_div.xpath(
                        ".//div[@class='author-image']/img/@src"
                    ).extract()[0].strip())
            if about_author_div.xpath(
                    ".//p[@class='author-name']"
                    "/strong[@itemprop='name']/text()"):
                author["author_name"] = about_author_div.xpath(
                    ".//p[@class='author-name']"
                    "/strong[@itemprop='name']/text()").extract()[0].strip()
            if about_author_div.xpath(
                    ".//p[@class='author-name']"
                    "/span[@itemprop='jobTitle']/text()"):
                author["author_jobtitle"] = about_author_div.xpath(
                    ".//p[@class='author-name']"
                    "/span[@itemprop='jobTitle']/text()").extract()[0].strip()
            if about_author_div.xpath(".//div[@class='author-bio']/text()"):
                author["author_biography"] = about_author_div.xpath(
                    ".//div[@class='author-bio']/text()").extract()[0].strip()
            json["author"] = author

        article = response.xpath(
            "//section[contains(@class,'article-content')]")
        caption_images = {}

        # Featured images.
        figures = article.xpath(
            ".//figure[contains(@class,'article-featured-image')]")
        for figure in figures:
            image_url = urlparse.urljoin(
                response.url,
                figure.xpath(
                    ".//div[contains(@class,'module-wrapper')]/img/@src"
                ).extract()[0].strip())
            if figure.xpath(
                    ".//div[contains(@class,'module-wrapper')]/img/@alt"):
                image_alt = figure.xpath(
                    ".//div[contains(@class,'module-wrapper')]/img/@alt"
                ).extract()[0].strip()
            elif figure.xpath(
                    ".//div[contains(@class,'module-wrapper')]/img/@title"):
                image_alt = figure.xpath(
                    ".//div[contains(@class,'module-wrapper')]/img/@title"
                ).extract()[0].strip()
            else:
                image_alt = None
            image_desc = figure.xpath(
                ".//div[contains(@class,'module-wrapper')]"
                "/figcaption[contains(@class,'photo-caption')]")
            caption_images[image_url] = (
                image_desc.extract()[0].strip() if image_desc else None,
                image_alt)

        # Slideshow slides.
        figure_items = article.xpath(".//figure[contains(@class,'item')]")
        for figure_item in figure_items:
            image_url = urlparse.urljoin(
                response.url,
                figure_item.xpath(
                    ".//descendant::div[contains(@class,'slide-image')]"
                    "/img/@src").extract()[0].strip())
            if figure_item.xpath(
                    ".//descendant::div[contains(@class,'slide-image')]"
                    "/img/@alt"):
                image_alt = figure_item.xpath(
                    ".//descendant::div[contains(@class,'slide-image')]"
                    "/img/@alt").extract()[0].strip()
            elif figure_item.xpath(
                    ".//descendant::div[contains(@class,'slide-image')]"
                    "/img/@title"):
                image_alt = figure_item.xpath(
                    ".//descendant::div[contains(@class,'slide-image')]"
                    "/img/@title").extract()[0].strip()
            else:
                image_alt = None
            image_desc = figure_item.xpath(
                ".//descendant::div[contains(@class,'credits-caption')]"
                "/figcaption[contains(@class,'photo-caption')]")
            caption_images[image_url] = (
                image_desc.extract()[0].strip() if image_desc else None,
                image_alt)

        # Inline media images; the caption, if any, is the sibling text node.
        media_items = article.xpath(".//img[@class='media-image']")
        for media_item in media_items:
            image_url = media_item.xpath("./@src").extract()[0]
            if len(media_item.xpath("../text()")) > 0:
                image_desc = media_item.xpath(
                    "../text()").extract()[0].strip()
            else:
                image_desc = None
            caption_images[image_url] = (image_desc, None)

        item['image_urls'] = list(caption_images.keys())
        json["caption_images"] = caption_images

        # Rebuild the article body: unwrap slideshows and featured figures
        # so only the bare <img> tags remain in the content.
        content_document = fromstring(article.extract()[0].strip())
        slideshows = content_document.xpath(".//div[@id='vance-slideshow']")
        for slideshow in slideshows:
            for slideshow_item in slideshow.xpath(
                    ".//figure[contains(@class,'item')]"):
                keep_img = slideshow_item.xpath(
                    ".//descendant::div[contains(@class,'slide-image')]"
                    "/img")[0]
                slideshow.addnext(keep_img)
            slideshow.getparent().remove(slideshow)
        figure_docs = content_document.xpath(
            ".//figure[contains(@class,'article-featured-image')]")
        for figure_doc in figure_docs:
            keep_img = figure_doc.xpath(
                ".//div[contains(@class,'module-wrapper')]/img")[0]
            figure_doc.addnext(keep_img)
            figure_doc.getparent().remove(figure_doc)
        json["content"] = tostring(content_document, encoding="UTF-8")

        item["json"] = json
        item["html"] = html
        item["htmls_path"] = {sourceurl: html}
        item["source_url"] = sourceurl
        yield item
    else:
        self.log('the item is invalid news! %s' % url)
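
# CSSSelector (lxml.cssselect) compiles a CSS selector to XPath; its .path
# attribute holds the compiled expression, which is what response.xpath()
# is given in the spider above. A standalone illustration:
from lxml.cssselect import CSSSelector

sel = CSSSelector("div.article-section")
# Prints something like:
# descendant-or-self::div[@class and contains(concat(' ',
#   normalize-space(@class), ' '), ' article-section ')]
print(sel.path)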