def parse_item(self, response):
    item = self.get_new_item(response)
    html = utils.decode(response.body, response.encoding)
    article = response.xpath("//div[contains(@class,'item-page')]")

    # Collect (caption, alt) pairs keyed by absolute image URL.
    caption_images = {}
    caption_imgs = article.xpath(".//p/span[contains(@class,'wf_caption')]")
    imgs = article.xpath(".//p/img")
    for caption_img in caption_imgs:
        if caption_img.xpath(".//img/@alt"):
            image_alt = caption_img.xpath(".//img/@alt").extract()[0].strip()
        elif caption_img.xpath(".//img/@title"):
            image_alt = caption_img.xpath(".//img/@title").extract()[0].strip()
        else:
            image_alt = None
        caption_images[urlparse.urljoin(
            response.url,
            caption_img.xpath(".//img/@src").extract()[0].strip())] = (
                "".join(caption_img.xpath(".//span").extract()), image_alt)
    for img in imgs:
        if img.xpath(".//@alt"):
            image_alt = img.xpath(".//@alt").extract()[0].strip()
        elif img.xpath(".//@title"):
            image_alt = img.xpath(".//@title").extract()[0].strip()
        else:
            image_alt = None
        caption_images[urlparse.urljoin(
            response.url, img.xpath(".//@src").extract()[0])] = (None, image_alt)

    # Gallery slides: full-size URLs are derived from the thumbnails by
    # rewriting the size segment (120-90-80-c -> 600-450-80).
    slid_imgs = []
    for main_images_wrapper in response.xpath(
            "//div[contains(@id,'main_images_wrapper')]"):
        desc = []
        image_info = []
        for main_des_container in main_images_wrapper.xpath(
                ".//div[contains(@id,'main_des_container')]"
                "/div[contains(@class,'des_div')]/p"):
            desc.append(main_des_container.xpath(".//text()").extract()[0].strip())
        for img_tag in main_images_wrapper.xpath(
                ".//div[contains(@id,'main_thumbs_arrow_wrapper')]"
                "/div[contains(@id,'main_thumb_container')]"
                "//img[contains(@class,'ig_thumb')]"):
            if img_tag.xpath(".//@src"):
                path = urlparse.urljoin(
                    response.url,
                    img_tag.xpath(".//@src").extract()[0].strip().replace(
                        "120-90-80-c", "600-450-80"))
                slid_imgs.append(path)
            else:
                path = ""
            alt = (img_tag.xpath(".//@alt").extract()[0].strip()
                   if img_tag.xpath(".//@alt") else None)
            image_info.append((path, alt))
        # Slide descriptions and thumbnails are parallel lists.
        for i in xrange(len(desc)):
            caption_images[image_info[i][0]] = (desc[i], image_info[i][1])

    item['image_urls'] = list(caption_images.keys())
    item["json"]["caption_images"] = caption_images

    # Byline looks like "By Author | date"; fall back to a bare date.
    createdby = response.xpath("//dd[@class='createdby']//text()").extract()[0]
    createdBySplit = createdby.split('|')
    if len(createdBySplit) >= 2:
        item['json']['author'] = createdBySplit[0].strip()[3:]  # drop "By "
        item['json']['date'] = createdBySplit[1].strip()
    else:
        item['json']['date'] = createdby.strip()
    item['json']['title'] = article.xpath(".//h2/text()").extract()[0].strip()
    item['json']['item_url'] = response.url

    # Rebuild the article body: strip title and byline, replace the gallery
    # widget with plain <img> tags, and unwrap captioned images.
    content_document = fromstring(article.extract()[0].strip())
    del_title = content_document.xpath(".//h2")[0]
    del_title.getparent().remove(del_title)
    del_author_date = content_document.xpath(
        ".//dl[contains(@class,'article-info')]")[0]
    del_author_date.getparent().remove(del_author_date)
    if content_document.xpath(".//div[contains(@id,'main_images_wrapper')]"):
        del_main_images_wrapper = content_document.xpath(
            ".//div[contains(@id,'main_images_wrapper')]")[0]
        for image_url in slid_imgs:
            img_doc = Element("img", src=image_url)
            del_main_images_wrapper.addprevious(img_doc)
        del_main_images_wrapper.getparent().remove(del_main_images_wrapper)
        del_igallery_clear_div = content_document.xpath(
            ".//div[contains(@class,'igallery_clear')]")[0]
        del_igallery_clear_div.getparent().remove(del_igallery_clear_div)
    for caption in content_document.xpath(
            ".//p/span[contains(@class,'wf_caption')]"):
        keep_img = caption.xpath(".//img")[0]
        caption.addnext(keep_img)
        caption.getparent().remove(caption)
    item["json"]["content"] = tostring(content_document, encoding="UTF-8")

    sourceurl = response.meta['source_url']
    item['json']['category'] = response.meta['category']
    item['html'] = html
    item["htmls_path"] = {sourceurl: html}
    item['source_url'] = sourceurl
    return item
def parse_item(self, response):
    self.log('this is an item page! %s' % response.url)
    if "jump" not in response.meta:
        # First page of the article: start a fresh item.
        html = utils.decode(response.body)
        response.meta["page_html"] = html
        item = self.get_new_item(response)
        item["htmls_path"] = {}
        item["htmls_path"][response.meta['source_url']] = html
        json = item["json"]
        json['item_url'] = response.url
        if 'category' in response.meta:
            json['category'] = response.meta['category']
        if response.xpath("//h1[contains(@class,'title')]"):
            json['title'] = response.xpath(
                "//h1[contains(@class,'title')]/text()").extract()[0].strip()
        else:
            json['title'] = response.meta['title']
        json['date'] = response.meta['date']
        json['excerpt'] = response.meta['excerpt']
        thumb_urls = []
        if 'thumb_urls' in response.meta:
            thumb_urls += response.meta['thumb_urls']
        item['thumb_urls'] = thumb_urls
        tags = []
        for item_tag in response.xpath(
                "//section[@class='tags']/span[@itemprop='keywords']"
                "/a[@rel='tag']"):
            tagTuple = (item_tag.xpath(".//text()").extract()[0].strip(),
                        item_tag.xpath(".//@href").extract()[0].strip())
            tags.append(tagTuple)
        json['tags'] = tags
        if response.xpath(
                "//div[contains(@class,'date-author-wrap')]"
                "/descendant::a[@rel='author']"):
            json['author'] = response.xpath(
                "//div[contains(@class,'date-author-wrap')]"
                "/descendant::a[@rel='author']/span/text()"
            ).extract()[0].strip()
    else:
        # Continuation page: reuse the item accumulated so far via meta.
        json = response.meta["json"]
        item = response.meta["item"]
        item["htmls_path"][response.meta["sourceurl"]] = utils.decode(
            response.body)

    summaryOriginal = "".join(
        response.xpath("//section[@class='body']").extract()).strip()
    lxmlTree = fromstring(summaryOriginal)
    image_urls = []
    caption_images = {}

    # Replace each blackout-gallery widget with plain <img> tags and record
    # the caption for every slide.
    blackouts = lxmlTree.xpath("//div[contains(@class,'blackout-gallery')]")
    for blackout in blackouts:
        image_divs = blackout.xpath(".//div[@class='image-data']")
        for image_div in image_divs:
            image_url = image_div.xpath(
                ".//div[@class='data-urls']/meta/@content")[0]
            image = urlparse.urljoin(response.url, image_url.encode("UTF-8"))
            width = image_div.xpath(
                ".//div[@class='data-urls']/meta/@data-width")[0].encode(
                    "UTF-8")
            height = image_div.xpath(
                ".//div[@class='data-urls']/meta/@data-height")[0].encode(
                    "UTF-8")
            image_urls.append(image)
            img = "<img src='%s' width='%s' height='%s' />" % (
                image, width, height)
            figcaption = "<figcaption>%s</figcaption>" % image_div.xpath(
                ".//meta[@itemprop='caption']/@content")[0].encode("UTF-8")
            caption_images[image] = (figcaption, None)
            blackout.addnext(fromstring(img))
        blackout.getparent().remove(blackout)

    # Bare inline images (no caption available).
    for url in lxmlTree.xpath("//section[@class='body']/p/img[@src]/@src"):
        image_url = url.encode("UTF-8")
        if image_url not in image_urls:
            image_urls += [image_url]
            caption_images[image_url] = (None, None)

    # <figure> blocks: keep the <img>, capture the figcaption as description.
    for figure in lxmlTree.xpath("//section[@class='body']/figure"):
        image_url = figure.xpath(".//img[@src]/@src")[0].encode("UTF-8")
        desc = tostring(
            figure.xpath(
                ".//figcaption[contains(@class,'wp-caption-text')]")[0],
            encoding="UTF-8")
        if image_url not in image_urls:
            image_urls += [image_url]
        caption_images[image_url] = (desc, None)
        keep_img = figure.xpath(".//img[@src]")[0]
        figure.addnext(keep_img)
        figure.getparent().remove(figure)

    # Strip the wrapping <section> so per-page contents can be concatenated;
    # the tags are re-added once the last page has been parsed.
    section_start = "<section class=\"body\" itemprop=\"articleBody\">"
    section_end = "</section>"
    content = tostring(lxmlTree, encoding="UTF-8").replace(
        section_start, "").replace(section_end, "")
    if "jump" not in response.meta:
        item['image_urls'] = image_urls
        json["caption_images"] = caption_images
        json['content'] = "".join([section_start, content])
    else:
        item['image_urls'] = response.meta["item"]["image_urls"] + image_urls
        json["caption_images"] = dict(
            response.meta["json"]["caption_images"], **caption_images)
        json['content'] = "".join([response.meta["json"]["content"], content])

    # Follow "Pages: 1 2 3 ..." pagination links until the last page.
    pages = response.xpath(
        "//section[@class='body']/following-sibling::p"
        "[starts-with(.,'Pages')]/a/text()")
    if pages:
        if "max_page" not in response.meta:
            max_page = int(pages.extract()[-1])
            jump = True
            next_page = 2
        else:
            max_page = response.meta["max_page"]
            jump = response.meta["jump"]
            next_page = response.meta["next_page"] + 1
        param = response.meta
        if next_page <= max_page:
            param["item"] = item
            param["json"] = json
            param["max_page"] = max_page
            param["next_page"] = next_page
            param["jump"] = jump
            next_url = response.meta['source_url'] + str(next_page) + "/"
            param["sourceurl"] = next_url
            yield scrapy.Request(next_url,
                                 callback=self.parse_item,
                                 dont_filter=True,
                                 meta=param)
        else:
            jump = False
    if not pages or not jump:
        # Last (or only) page: close the <section> and emit the item.
        json['content'] = "".join([json['content'], "\n</section>"])
        item['json'] = json
        try:
            item['html'] = html
        except UnboundLocalError:
            # On continuation pages `html` is never set locally; use the
            # first page's HTML stashed in meta.
            item['html'] = response.meta["page_html"]
        item['source_url'] = response.meta['source_url']
        yield item
def parse_item(self, response):
    html = utils.decode(response.body)
    item = self.get_new_item(response)
    json = item["json"]
    json['item_url'] = response.url
    if 'categories' in response.meta:
        json['categories'] = response.meta['categories']
    thumb_urls = []
    if 'thumb_urls' in response.meta:
        thumb_urls += response.meta['thumb_urls']
    item['thumb_urls'] = thumb_urls
    if response.xpath("//div[@class='main']/article[@id='article']"
                      "/h1[@itemprop='name']"):
        json['title'] = response.xpath(
            "//div[@class='main']/article[@id='article']"
            "/h1[@itemprop='name']/text()").extract()[0].strip()
    if response.xpath("//div[@class='main']/article[@id='article']"
                      "/p[@class='date']"):
        json['date'] = response.xpath(
            "//div[@class='main']/article[@id='article']"
            "/p[@class='date']/text()").extract()[0].strip()
    if response.xpath("//div[@class='main']/article[@id='article']"
                      "/h2[@class='standfirst']"):
        json['excerpt'] = response.xpath(
            "//div[@class='main']/article[@id='article']"
            "/h2[@class='standfirst']/text()").extract()[0].strip()

    image_urls = []
    caption_images = {}
    article = response.xpath(
        "//article[contains(@id,'article')]").extract()[0].strip()
    article_document = fromstring(article)

    # Strip chrome that duplicates the fields already captured above.
    for xpath in (".//div[contains(@class,'actions')]",
                  ".//p[contains(@itemprop,'datePublished')]",
                  ".//h1[contains(@itemprop,'name')]",
                  ".//h2[contains(@itemprop,'description')]",
                  ".//div[contains(@class,'article_tabs')]"):
        nodes = article_document.xpath(xpath)
        if nodes:
            nodes[0].getparent().remove(nodes[0])

    if article_document.xpath(".//img[contains(@class,'article_image')]"):
        article_body = article_document.xpath(
            ".//div[contains(@itemprop,'articleBody')]")[0]
        for article_image in article_document.xpath(
                ".//img[contains(@class,'article_image')]"):
            src = urlparse.urljoin(
                response.url,
                article_image.xpath(".//@src")[0].encode("UTF-8"))
            if article_image.xpath(".//@alt"):
                alt = article_image.xpath(".//@alt")[0].encode("UTF-8")
            elif article_image.xpath(".//@title"):
                alt = article_image.xpath(".//@title")[0].encode("UTF-8")
            else:
                alt = None
            # The caption, if any, lives in the title of the wrapping <a>.
            parent_links = article_image.xpath(".//parent::a")
            if parent_links and Selector(
                    text=tostring(parent_links[0], encoding="UTF-8")).xpath(
                        ".//@title"):
                caption = Selector(
                    text=tostring(parent_links[0], encoding="UTF-8")).xpath(
                        ".//@title").extract()[0].strip()
            else:
                caption = None
            caption_images[src] = (caption, alt)
            image_urls.append(src)
            # Move the featured image ahead of the article body.
            article_body.addprevious(article_image)
    for xpath in (".//div[contains(@class,'letterbox_image_wrapper')]",
                  ".//div[contains(@class,'article_right_col')]"):
        nodes = article_document.xpath(xpath)
        if nodes:
            nodes[0].getparent().remove(nodes[0])

    # Inline body images carry no captions, only alt/title text.
    image_infos = []
    for image in response.xpath(
            ".//div[contains(@itemprop,'articleBody')]"
            "/descendant::img[@src]"):
        image_src = urlparse.urljoin(
            response.url, image.xpath(".//@src").extract()[0].strip())
        if image.xpath(".//@alt"):
            image_alt = image.xpath(".//@alt").extract()[0].strip()
        elif image.xpath(".//@title"):
            image_alt = image.xpath(".//@title").extract()[0].strip()
        else:
            image_alt = None
        image_infos.append((image_src, image_alt))
    for image_url, image_alt in image_infos:
        caption_images[image_url] = (None, image_alt)
        image_urls.append(image_url)

    item['image_urls'] = image_urls
    json["caption_images"] = caption_images
    json['content'] = tostring(article_document, encoding="UTF-8")
    item['json'] = json
    item['html'] = html
    item["htmls_path"] = {response.meta['source_url']: response.body}
    item['source_url'] = response.meta['source_url']

    # If the article links to an author page, fetch it before emitting; the
    # item travels along in meta and is yielded by parse_authors.
    authors_url = ''
    if response.xpath("//div[@id='article_author']"
                      "/descendant::a[@class='author_link']"):
        authors_url = urlparse.urljoin(
            response.url,
            response.xpath(
                "//div[@id='article_author']"
                "/descendant::a[@class='author_link']/@href"
            ).extract()[0].strip())
    if authors_url:
        yield scrapy.Request(authors_url,
                             callback=self.parse_authors,
                             dont_filter=True,
                             meta={'item': item, "authors_url": authors_url})
    else:
        yield item
def parse_item(self, response):
    self.log('this is an item page! %s' % response.url)
    html = utils.decode(response.body)
    item = DataFetchersItem()
    json = {}
    json['item_url'] = response.url
    if "reparse_from" in response.meta:
        json["reparse_from"] = response.meta["reparse_from"]
    if self.is_reparse:
        # Reparse run: metadata comes from the previously stored item.
        if 'category' in response.meta["json"]:
            json['category'] = response.meta["json"]['category']
        json['title'] = response.meta["json"]['title']
        json['date'] = response.meta["json"]['date']
        json['excerpt'] = response.meta["json"]['excerpt']
        thumb_urls = []
        if 'thumb_urls' in response.meta["json"]:
            thumb_urls += response.meta["json"]['thumb_urls']
    else:
        if 'category' in response.meta:
            json['category'] = response.meta['category']
        json['title'] = response.meta['title']
        json['date'] = response.meta['date']
        json['excerpt'] = response.meta['excerpt']
        thumb_urls = []
        if 'thumb_urls' in response.meta:
            thumb_urls += [response.meta['thumb_urls']]
    item['thumb_urls'] = thumb_urls

    tags = []
    for item_tag in response.xpath(
            "//section[@class='tags']/span[@itemprop='keywords']"
            "/a[@rel='tag']"):
        tagTuple = (item_tag.xpath(".//text()").extract()[0].strip(),
                    item_tag.xpath(".//@href").extract()[0].strip())
        tags.append(tagTuple)
    json['tags'] = tags
    if response.xpath(
            "//div[contains(@class,'date-author-wrap')]"
            "/descendant::a[@rel='author']"):
        json['author'] = response.xpath(
            "//div[contains(@class,'date-author-wrap')]"
            "/descendant::a[@rel='author']/span/text()").extract()[0].strip()

    # Extract the article body with readability, then collect image URLs.
    res = utils.readability(html=html)
    summaryOriginal = res['content']
    lxmlTree = fromstring(summaryOriginal)
    image_urls = []
    for url in lxmlTree.xpath(
            "//section[@class='body']/descendant::img[@src]/@src"):
        image_urls += [url.encode("UTF-8")]
    item['image_urls'] = image_urls
    caption_images = {}
    for image_url in image_urls:
        # No per-image caption or alt text is available from the
        # readability output.
        caption_images[image_url] = (None, None)
    json["caption_images"] = caption_images
    json['content'] = tostring(
        lxmlTree.xpath("//section[@class='body']")[0], encoding="UTF-8")
    del lxmlTree

    item['json'] = json
    item['html'] = html
    item["htmls_path"] = {response.meta['source_url']: response.body}
    item['source_url'] = response.meta['source_url']
    item['source_crawler'] = self.source_crawler
    item['crawl_type'] = self.crawl_type
    item['created_time'] = datetime.datetime.now()
    yield item

    # Crawl each tag's listing page exactly once.
    for tagKey, tagValue in tags:
        if tagKey not in self.hasCrawledTags:
            self.hasCrawledTags.add(tagKey)
            yield scrapy.Request(tagValue,
                                 callback=self.parse_list,
                                 dont_filter=True)
    gc.collect()
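
# utils.readability above returns a dict whose 'content' key holds the
# cleaned article HTML. For comparison, a sketch of the same step using the
# readability-lxml package directly (an assumption about what the wrapper
# might do, not a description of utils.readability itself):
from readability import Document

def extract_main_content(html):
    """Return the readability-cleaned main-article HTML for a raw page."""
    return Document(html).summary()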
def parse_item(self, response):
    url = response.url
    self.log("this is an item page! %s" % url)
    article_section_div = response.xpath(
        CSSSelector("div.article-section").path)
    if article_section_div and article_section_div.xpath(
            ".//a/text()").extract()[0].strip() == "News":
        html = utils.decode(response.body)
        item = self.get_new_item(response)
        item["thumb_urls"] = []
        json = item["json"]

        tags = []
        for item_tag in response.xpath(
                CSSSelector("div.field-name-field-topics-the-packer a").path):
            tagTuple = (
                item_tag.xpath(".//text()").extract()[0].strip(),
                urlparse.urljoin(
                    url, item_tag.xpath(".//@href").extract()[0].strip()))
            tags.append(tagTuple)
        json['tags'] = tags

        if response.xpath("//h1[contains(@itemprop,'headline')]"):
            json["title"] = response.xpath(
                "//h1[contains(@itemprop,'headline')]/text()"
            ).extract()[0].strip()
        elif "title" in response.meta:
            json["title"] = response.meta["title"]
        json["item_url"] = url
        # Feed-level metadata passed along from the RSS listing, if present.
        for key in ("description", "pubDate", "channel"):
            if key in response.meta:
                json[key] = response.meta[key]
        sourceurl = response.meta["source_url"]

        # About-the-author box.
        about_author_div = response.xpath(
            CSSSelector("div.article-author").path)
        if about_author_div:
            author = {}
            if about_author_div.xpath(
                    ".//div[@class='author-image']/img/@src"):
                author["author_img"] = urlparse.urljoin(
                    url,
                    about_author_div.xpath(
                        ".//div[@class='author-image']/img/@src"
                    ).extract()[0].strip())
            if about_author_div.xpath(
                    ".//p[@class='author-name']"
                    "/strong[@itemprop='name']/text()"):
                author["author_name"] = about_author_div.xpath(
                    ".//p[@class='author-name']"
                    "/strong[@itemprop='name']/text()").extract()[0].strip()
            if about_author_div.xpath(
                    ".//p[@class='author-name']"
                    "/span[@itemprop='jobTitle']/text()"):
                author["author_jobtitle"] = about_author_div.xpath(
                    ".//p[@class='author-name']"
                    "/span[@itemprop='jobTitle']/text()").extract()[0].strip()
            if about_author_div.xpath(".//div[@class='author-bio']/text()"):
                author["author_biography"] = about_author_div.xpath(
                    ".//div[@class='author-bio']/text()").extract()[0].strip()
            json["author"] = author

        article = response.xpath(
            "//section[contains(@class,'article-content')]")
        caption_images = {}

        # Featured images.
        figures = article.xpath(
            ".//figure[contains(@class,'article-featured-image')]")
        for figure in figures:
            image_url = urlparse.urljoin(
                response.url,
                figure.xpath(
                    ".//div[contains(@class,'module-wrapper')]/img/@src"
                ).extract()[0].strip())
            if figure.xpath(
                    ".//div[contains(@class,'module-wrapper')]/img/@alt"):
                image_alt = figure.xpath(
                    ".//div[contains(@class,'module-wrapper')]/img/@alt"
                ).extract()[0].strip()
            elif figure.xpath(
                    ".//div[contains(@class,'module-wrapper')]/img/@title"):
                image_alt = figure.xpath(
                    ".//div[contains(@class,'module-wrapper')]/img/@title"
                ).extract()[0].strip()
            else:
                image_alt = None
            image_desc = figure.xpath(
                ".//div[contains(@class,'module-wrapper')]"
                "/figcaption[contains(@class,'photo-caption')]")
            caption_images[image_url] = (
                image_desc.extract()[0].strip() if image_desc else None,
                image_alt)

        # Slideshow slides.
        figure_items = article.xpath(".//figure[contains(@class,'item')]")
        for figure_item in figure_items:
            image_url = urlparse.urljoin(
                response.url,
                figure_item.xpath(
                    ".//descendant::div[contains(@class,'slide-image')]"
                    "/img/@src").extract()[0].strip())
            if figure_item.xpath(
                    ".//descendant::div[contains(@class,'slide-image')]"
                    "/img/@alt"):
                image_alt = figure_item.xpath(
                    ".//descendant::div[contains(@class,'slide-image')]"
                    "/img/@alt").extract()[0].strip()
            elif figure_item.xpath(
                    ".//descendant::div[contains(@class,'slide-image')]"
                    "/img/@title"):
                image_alt = figure_item.xpath(
                    ".//descendant::div[contains(@class,'slide-image')]"
                    "/img/@title").extract()[0].strip()
            else:
                image_alt = None
            image_desc = figure_item.xpath(
                ".//descendant::div[contains(@class,'credits-caption')]"
                "/figcaption[contains(@class,'photo-caption')]")
            caption_images[image_url] = (
                image_desc.extract()[0].strip() if image_desc else None,
                image_alt)

        # Inline media images; the caption, if any, is the sibling text node.
        media_items = article.xpath(".//img[@class='media-image']")
        for media_item in media_items:
            image_url = media_item.xpath("./@src").extract()[0]
            if len(media_item.xpath("../text()")) > 0:
                image_desc = media_item.xpath(
                    "../text()").extract()[0].strip()
            else:
                image_desc = None
            caption_images[image_url] = (image_desc, None)

        item['image_urls'] = list(caption_images.keys())
        json["caption_images"] = caption_images

        # Rebuild the article body: unwrap slideshows and featured figures
        # so only the bare <img> tags remain in the content.
        content_document = fromstring(article.extract()[0].strip())
        slideshows = content_document.xpath(".//div[@id='vance-slideshow']")
        for slideshow in slideshows:
            for slideshow_item in slideshow.xpath(
                    ".//figure[contains(@class,'item')]"):
                keep_img = slideshow_item.xpath(
                    ".//descendant::div[contains(@class,'slide-image')]"
                    "/img")[0]
                slideshow.addnext(keep_img)
            slideshow.getparent().remove(slideshow)
        figure_docs = content_document.xpath(
            ".//figure[contains(@class,'article-featured-image')]")
        for figure_doc in figure_docs:
            keep_img = figure_doc.xpath(
                ".//div[contains(@class,'module-wrapper')]/img")[0]
            figure_doc.addnext(keep_img)
            figure_doc.getparent().remove(figure_doc)
        json["content"] = tostring(content_document, encoding="UTF-8")

        item["json"] = json
        item["html"] = html
        item["htmls_path"] = {sourceurl: html}
        item["source_url"] = sourceurl
        yield item
    else:
        self.log('the item is invalid news! %s' % url)
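
# CSSSelector (lxml.cssselect) compiles a CSS selector to XPath; its .path
# attribute holds the compiled expression, which is what response.xpath()
# is given in the spider above. A standalone illustration:
from lxml.cssselect import CSSSelector

sel = CSSSelector("div.article-section")
# Prints something like:
# descendant-or-self::div[@class and contains(concat(' ',
#   normalize-space(@class), ' '), ' article-section ')]
print(sel.path)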