Example #1
0
    def parse(self):
        """Scrape this site's home page, follow each article link, and return
        the collected articles as a dict (via ``NewsSite.to_dict()``).

        NOTE(review): TLS verification is deliberately disabled for this site
        (``verify=False``), so the corresponding urllib3 warning is silenced
        first — confirm this is still required for the target host.
        """
        requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
        site = NewsSite(self.url)
        home_raw = requests.get(self.url, headers=self.headers, verify=False)
        home = html.fromstring(home_raw.content)

        excerpts = home.xpath('//article')

        for excerpt in excerpts:
            title = excerpt.xpath('.//h2/a/text()')[0].strip()

            url = excerpt.xpath('.//h2/a/@href')[0]
            article_raw = requests.get(url, headers=self.headers)
            article = html.fromstring(article_raw.content)
            # The og:image meta tag may be absent; fall back to None.
            image_url = None
            image_raw = article.xpath('.//meta[@property="og:image"]/@content')
            if image_raw:
                image_url = image_raw[0]
            date_raw = article.xpath(
                '//div[contains(@class, "cb-entry-header")]//time/@datetime'
            )[0]
            # The datetime attribute begins with an ISO date ("YYYY-MM-DD...").
            date = datetime.strptime(date_raw.strip()[:10], "%Y-%m-%d")
            body_html_raw = article.xpath('//span[@class="cb-itemprop"]')[0]
            body_html = html.tostring(body_html_raw)
            body_text = body_html_raw.text_content().strip()

            site.add_article(title, url, image_url, date, body_html, body_text)

        return site.to_dict()
Example #2
0
    def parse(self):
        """Scrape the home page, fetch every linked article, and return the
        site's articles as a dict (via ``NewsSite.to_dict()``)."""
        site = NewsSite(self.url)
        home_raw = requests.get(self.url, headers=self.headers)
        home = html.fromstring(home_raw.content)

        excerpts = home.xpath('//article[contains(@class, "post")]')

        for excerpt in excerpts:
            title = excerpt.xpath('.//h2/a/text()')[0].strip()

            url = excerpt.xpath('.//h2/a/@href')[0]
            article_raw = requests.get(url, headers=self.headers)
            article = html.fromstring(article_raw.content)
            # Strip scripts, styles and page chrome so the extracted body
            # text contains only the article itself.
            for script in article.xpath('//script'):
                script.getparent().remove(script)
            for style in article.xpath('//style'):
                style.getparent().remove(style)
            for div in article.xpath('//div[@class="entry-meta-tags"]'):
                div.getparent().remove(div)
            for div in article.xpath('//div[@class="post-body-social"]'):
                div.getparent().remove(div)
            for div in article.xpath('//div[@class="entry-meta-author"]'):
                div.getparent().remove(div)
            # og:image may be missing on some posts; don't abort the whole
            # run with an IndexError (matches the sibling parsers' guard).
            image_url = None
            image_raw = article.xpath('.//meta[@property="og:image"]/@content')
            if image_raw:
                image_url = image_raw[0]
            date_raw = article.xpath('//time[@class="value-title"]/@datetime')[0]
            # The datetime attribute begins with an ISO date ("YYYY-MM-DD...").
            date = datetime.strptime(date_raw.strip()[:10], "%Y-%m-%d")
            body_html_raw = article.xpath('//div[contains(@class, "post-content")]')[0]
            body_html = html.tostring(body_html_raw)
            body_text = body_html_raw.text_content().strip()

            site.add_article(title, url, image_url, date, body_html, body_text)

        return site.to_dict()
Example #3
0
    def parse(self):
        """Scrape the home page, fetch every linked article, and return the
        site's articles as a dict (via ``NewsSite.to_dict()``).

        Articles whose body cannot be located under any known container
        xpath are skipped entirely (not added to the result).
        """
        site = NewsSite(self.url)
        home_raw = requests.get(self.url, headers=self.headers)
        home = html.fromstring(home_raw.content)

        excerpts = home.xpath('//a[@class="leafly-article"]')

        for excerpt in excerpts:
            title = excerpt.xpath('.//span[@class="leafly-title"]/text()')[0]
            url = excerpt.xpath('./@href')[0]
            article_raw = requests.get(url, headers=self.headers)
            article = html.fromstring(article_raw.content)
            for script in article.xpath('//script'):
                script.getparent().remove(script)
            # Publication date is optional; content begins with an ISO date.
            date = None
            date_raw = article.xpath(
                '//meta[@property="article:published_time"]/@content')
            if date_raw:
                date = datetime.strptime(date_raw[0].strip()[:10], "%Y-%m-%d")
            image_url = None
            image_url_raw = article.xpath(
                '//div[@class="leafly-standard-article-header"]//img/@src')
            if image_url_raw:
                image_url = image_url_raw[0]
            # Try the legacy article container first, then the newer layout.
            body_candidates = (
                '//div[@class="leafly-legacy-article"]',
                '//div[@class="post-content style-light"]/div[2]//div[@class="uncode_text_column"]',
            )
            body_element = None
            for body_xpath in body_candidates:
                matches = article.xpath(body_xpath)
                if matches:
                    body_element = matches[0]
                    break
            if body_element is not None:
                body_html = html.tostring(body_element)
                body_text = body_element.text_content().strip()
                site.add_article(title, url, image_url, date, body_html,
                                 body_text)

        return site.to_dict()
Example #4
0
    def parse(self):
        """Build a ``NewsSite`` result from the configured html_getter and
        xpath container, returning ``NewsSite.to_dict()``.

        Returns ``None`` when the home page cannot be fetched. Individual
        articles with no URL or an unfetchable page are skipped.
        """
        result = NewsSite(self.url)

        home_html = self.html_getter.get(self.url)
        if not home_html:
            return

        home_doc = html.fromstring(home_html)
        news_nodes = HtmlUtils.get_elements(
            home_doc, self.xpaths_container.get_news_xpath())

        for node in news_nodes:
            title = self._get_title(node)
            url = self._get_url(node)
            if not url:
                continue

            article_html = self.html_getter.get(url)
            if not article_html:
                continue

            article_doc = html.fromstring(article_html)

            # Image and date are extracted before any elements are removed.
            image_url = self._get_image_url(article_doc)
            date = self._get_date(article_doc)

            self.remove_elements(
                article_doc,
                self.xpaths_container.get_elements_to_remove_xpaths())

            text_html = self._get_text_html(article_doc)
            text_plain = self._get_text_plain(article_doc)

            result.add_article(
                title, url, image_url, date, text_html, text_plain)

        return result.to_dict()