def parse_details(self, response):
        """Build one article item (title, summary, body) from an article page.

        Yields a single ``CnnMiddleEastArticles`` item with:

        * ``title`` - stripped text of the ``h1._2JPm2UuC56`` headline, or
          ``""`` when the headline node is missing.
        * ``summary`` - text of the first ``<strong>`` inside a body
          paragraph (falling back to a ``<span>`` nested in it), with
          newlines turned into spaces and everything up to and including a
          "(CNN)" marker removed.
        * ``article_content`` - one string per body paragraph matched by
          ``p:nth-child(n+2)``, with newline, carriage-return and
          non-breaking-space characters replaced by plain spaces.

        :param response: page response exposing ``css()``; selections must
            support ``extract_first()``, ``extract()`` and ``xpath()``.
        """
        var = CnnMiddleEastArticles()
        body = "div.clearfix.wysiwyg._2A-9LYJ7eK"

        # extract_first() returns None when nothing matches; default to ""
        # so a page without a headline does not raise AttributeError.
        var["title"] = response.css(
            "h1._2JPm2UuC56::text").extract_first(default="").strip()

        # Check for None *before* touching the string: previously .replace()
        # ran first, so the fallback selector below could never be reached.
        summary = response.css(body + " p strong::text").extract_first()
        if summary is None:
            summary = response.css(
                body + " p strong span::text").extract_first(default="")
        summary = summary.replace("\n", " ")
        if "(CNN)" in summary:
            # Keep only the text that follows the "(CNN)" marker.
            summary = summary.partition("(CNN)")[2]
        var["summary"] = summary

        # Clean each paragraph once, in a single pass, instead of re-cleaning
        # the whole accumulated list on every loop iteration.
        to_space = str.maketrans({"\n": " ", "\r": " ", "\xa0": " "})
        var["article_content"] = [
            "".join(
                paragraph.xpath('descendant-or-self::text()').extract()
            ).translate(to_space)
            for paragraph in response.css(body + " p:nth-child(n+2)")
        ]
        yield var
Exemple #2
0
 def parse_details(self, response):
     """Build one article item (title, body paragraphs, tags) from a page.

     Yields a ``CnnMiddleEastArticles`` item only when the page has at
     least one body paragraph and at least two tags; otherwise nothing is
     yielded.

     :param response: page response exposing ``css()``; selections must
         support ``extract_first()``, ``extract()`` and ``xpath()``.
     """
     middle_east = CnnMiddleEastArticles()
     # extract_first() returns None when nothing matches; default to "" so
     # a page without the heading node does not raise AttributeError.
     middle_east["title"] = response.css(
         "div.sna_content_head_cont h1.sna_content_heading::text"
     ).extract_first(default="").strip()

     # Both fields are assigned unconditionally. They used to be set only
     # inside the paragraph loop, so a page with no matching paragraph left
     # them unset and the check below read missing keys.
     middle_east["article_content"] = [
         "".join(paragraph.xpath('descendant-or-self::text()').extract())
         for paragraph in response.css(
             "div.article-body div#firstBodyDiv > p:nth-child(n+1)")
     ]
     # The tag query does not depend on the paragraphs, so run it once.
     middle_east["tags"] = response.css(
         "div.article-tags.noprint div a h2::text").extract()

     # Keep only articles that have content and at least two tags.
     if middle_east["article_content"] and len(middle_east["tags"]) > 1:
         yield middle_east
    def parse_details(self, response):
        """Build one article item, trying several known page layouts.

        The article body is looked up with a list of CSS selectors, in
        order; the first selector whose cleaned result is non-empty wins.
        An item is yielded only when both body content and tags were found.
        When no selector produces content a warning is logged and nothing
        is yielded.

        Previously every fallback branch repeated a condition that was
        already known to be false, so only the first selector was ever
        used; pages with another layout were dropped after a swallowed
        KeyError.

        :param response: page response exposing ``css()``; selections must
            support ``extract_first()`` and ``extract()``.
        """
        # Local import: the file's import block is not visible from here.
        import logging

        log = logging.getLogger(__name__)

        var = CnnMiddleEastArticles()
        # extract_first() returns None when nothing matches; default to "" so
        # a page without the headline node does not raise AttributeError.
        var["title"] = response.css("h1._2JPm2UuC56::text").extract_first(default="").strip()

        # Body-text selectors for each page structure seen so far, in the
        # order they should be tried.
        # NOTE(review): the first selector uses ":not(div.first-child)" while
        # the others use ":not(:first-child)" - possibly a typo; left as-is.
        content_selectors = (
            "div.wysiwyg p:not(div.first-child)::text",
            "div.wysiwyg p:not(:first-child) > strong > span > span > span"
            " > span > span > span > span > span::text",
            "div.wysiwyg p:not(:first-child) > span > span > span > span > span > span > span "
            "> span::text",
            "div.wysiwyg p:not(:first-child)> span > span > span > span > span > "
            "span:nth-child(3) > span > span::text",
            "div.wysiwyg p:not(:first-child)> span > span > span > span > span > "
            "span:nth-child(2) > span > span::text",
            "div.wysiwyg._2A-9LYJ7eK p::text",
        )

        try:
            content = None
            for selector in content_selectors:
                temp = [i.rstrip() for i in response.css(selector).extract()]
                # Rule kept from the original: a result with exactly one
                # entry is treated as "no content" (expected to be just "").
                if len(temp) == 1:
                    continue
                cleaned = self.clear_input(temp)
                if cleaned:
                    content = cleaned
                    break

            if not content:
                # Do not store a placeholder string as article content;
                # report the uncovered layout and skip the page instead.
                log.warning("parse_details: no known article layout matched; item skipped")
                return

            var["article_content"] = content
            var["tags"] = [i.strip() for i in response.css("ul.AsCeVPiOdE li a::text").extract()]
            if var["tags"]:  # do not save any article that has neither tag nor content!
                yield var

        except KeyError:
            # The original caught KeyError here; keep that best-effort
            # behaviour but record the traceback instead of printing the key.
            log.exception("parse_details: KeyError while building the article item")