def scrape_unit(self, unit: NewsdeskUnit):
    article_element, article = unit

    try:
        article_element.find_element_by_css_selector(".extract_full_link_button").click()
    except NoSuchElementException:
        pass
    else:
        self.wait(".extract_full_link_button", on=article_element)

    try:
        inner = self.wait(".extract_inner", on=article_element)
    except NotVisible:
        if article.get_property("wordcount_int") <= 1:
            article.text = "[NO TEXT]"
        else:
            raise
    else:
        article_html = inner.get_attribute("innerHTML")
        article.text = html2text(article_html)

        # Cut off data urls at #3; no article actually has that many. All instances so far
        # led to the same data (even though the url differed).
        data_urls = list(get_data_urls(inner))[:3]
        for i, data_url in enumerate(data_urls):
            article.set_property("data{}_url".format(i), data_url)

    # Be gentle with servers
    time.sleep(random.uniform(0.1, 0.3))

    return article

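# `html2text` is used throughout the scrapers in this module but is not defined in this
# excerpt. The sketch below is an assumption of what it does (convert an HTML string or a
# list of lxml elements to plain text); the real helper lives elsewhere and may differ.
def _html2text_sketch(html):
    """Hypothetical stand-in for the html2text helper used throughout this module."""
    import lxml.html

    if isinstance(html, (list, tuple)):
        # A list of lxml elements: join their text content, separated by blank lines.
        return "\n\n".join(el.text_content().strip() for el in html)
    if isinstance(html, str):
        html = lxml.html.fromstring(html)
    return html.text_content().strip()
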
def scrape_unit(self, url):
    reader_url = "about:reader?url={}".format(url)
    doc = self.get_html(reader_url, wait_for="div.content p")

    for tag in REMOVE_TAGS:
        for element in doc.cssselect(tag):
            element.getparent().remove(element)

    article = doc.cssselect("div.content")[0]
    article_html = lxml.html.tostring(article).decode()

    title = doc.cssselect("h1.reader-title")[0].text_content().strip()
    text = html2text(article_html)

    if self.__class__.get_date is not GenericScraper.get_date:
        # Get contents of un-firefox-read-ed article
        self.wait(".reader-toolbar .close-button").click()
        time.sleep(0.3)
        doc_html = self.wait("html").get_attribute("outerHTML")
        doc = lxml.html.fromstring(doc_html, base_url=url)

        try:
            date = self.get_date(doc)
        except NotImplementedError:
            date = self.now
        except Exception as e:
            log.warning("get_date() failed for {} with: {}".format(url, e))
            date = self.now
    else:
        date = self.now

    article = Article(date=date, title=title, text=text, url=url)

    return article

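# REMOVE_TAGS is referenced above but not defined in this excerpt. Its contents are an
# assumption: presumably the tags to strip from Firefox's reader view before extracting
# text, e.g.:
REMOVE_TAGS_EXAMPLE = ("script", "style", "iframe", "figure")  # hypothetical values
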
def scrape_unit(self, date_and_article_url):
    date, article_url = date_and_article_url
    log.info("Fetching {}".format(article_url))
    article_doc = self.session.get_html(article_url)

    article_el = article_doc.cssselect("#content > article")

    if not article_el:
        log.error("Could not find article on {article_url}".format(**locals()))
        return None

    title = article_el[0].cssselect("h1")[0].text
    text = html2text(article_el[0].cssselect("p"))
    text = text.strip() or "."

    try:
        footer = article_el[0].cssselect("footer")[0]
    except IndexError as e:
        # Contains <embed> tag which is not closed gracefully :-(
        log.exception(e)
        return None

    author = footer.text.rsplit("|", 1)[0].strip()
    timestamp = parse_date(article_el[0].cssselect("footer > time")[0].get("datetime"))

    if not title:
        return None

    children = self._get_comments(title, article_url, article_doc)

    article = Article(date=timestamp, title=title, text=text)
    article.set_property("author", author)
    article.set_property("url", article_url)
    article.set_property("medium", "GeenStijl")

    return ArticleTree(article, [ArticleTree(c, []) for c in children])

def _scrape_unit(self, article):
    doc = self.session.get_html(article['url'])
    article['headline'] = doc.cssselect("div.intro h1")[0].text
    article['text'] = html2text(
        doc.cssselect("div.intro h2") + doc.cssselect("div.text"))
    if doc.cssselect("span.author"):
        article['author'] = doc.cssselect("span.author")[0].text.split("Door:")[1]

def _scrape_unit(self, url):
    doc = self.session.get_html(url)
    article = {
        'url': doc.base_url,
        'text': tools.html2text(doc.cssselect("#article-content p")),
        'headline': doc.cssselect("#article h1")[0].text_content().strip(),
        'date': tools.read_date(doc.cssselect("abbr.page-last-modified")[0].text),
        'externalid': int(url.split("/")[-1].split("-")[0]),
    }
    return article

def get_article_section_text(self, url):
    article_doc = self.session.get_html(url)
    print(url)
    yield article_doc.cssselect("#main")[0].text_content().strip()
    #yield article_doc.cssselect(".breadcrumb li.active")[0].text_content().strip()
    #text = html2text(article_doc.cssselect(".article.body")[0]).strip()
    text = html2text(article_doc.cssselect("div.block.article-body")[0]).strip()
    print(f"{text}")

    # Bylines are sometimes included in the main body text. We won't put in
    # any effort to pluck out those cases though..
    byline = article_doc.cssselect(".article.header .item-excerpt")
    if byline:
        text += "\n\n"
        text += html2text(byline[0])

    yield text

def scrape_unit(self, article_info: ArticleTuple):
    date, page_num, url = article_info

    try:
        text_url = strip_query(self.session.get_redirected_url(url))
    except RedirectError as e:
        if e.status_code == 404:
            return None
        raise

    text_doc = self.session.get_html(text_url)

    for image in text_doc.cssselect(".image"):
        image.getparent().remove(image)

    date = datetime.datetime(date.year, date.month, date.day)

    try:
        title = text_doc.cssselect("article > h1")[0].text.strip()
    except (IndexError, AttributeError):
        # No <h1> present, or it has no text
        return None

    text = html2text(text_doc.cssselect("main > article > .body"))
    if not text.strip():
        return None

    article = Article(title=title, date=date, text=text, url=url)

    if text_doc.cssselect("article > header.themed"):
        # New header style
        author = text_doc.cssselect("article > header .author")[0].text
        section = text_doc.cssselect("article > header .title")[0].text
        article.set_property("author", author)
    else:
        # Old header style
        section = text_doc.cssselect("article > header > .title")
        section = section[0].text if section else "NOSECTION"

        author_a = text_doc.cssselect("article .author a")
        if author_a:
            author = author_a[0].text.strip()
            article.set_property("author", author)
            if author == section:
                section = "Opinie"

    download = text_doc.cssselect('form[name="download"]')
    if download:
        pdf_url = download[0].get("action")
        article.set_property("pdf_url", pdf_url)

    article.set_property("text_url", text_url)
    article.set_property("image_url", text_url + "?view=img")
    if section:
        article.set_property("section", section.strip())

    return article

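# `strip_query` (used above) is not defined in this excerpt. A minimal sketch, assuming it
# simply drops the query string and fragment from a URL:
from urllib import parse as _urlparse  # stdlib

def _strip_query_sketch(url: str) -> str:
    """Hypothetical stand-in: return the URL without its query string or fragment."""
    scheme, netloc, path, params, _query, _fragment = _urlparse.urlparse(url)
    return _urlparse.urlunparse((scheme, netloc, path, params, "", ""))

# e.g. _strip_query_sketch("https://example.org/a?view=img#x") -> "https://example.org/a"
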
def _parse_comment(self, comment, base_title, base_url):
    text = html2text(comment.cssselect("p"))
    article_id = comment.get("id")
    title = "{base_title}#{article_id}".format(**locals())
    url = "{base_url}#{article_id}".format(**locals())
    author, timestamp = _parse_comment_footer(comment.cssselect("footer")[0].text_content())

    article = Article(date=timestamp, title=title, text=text.strip() or ".", url=url)
    article.set_property("author", author.strip())
    article.set_property("medium", "GeenStijl Comments")

    return article

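# `_parse_comment_footer` (used above) is not included in this excerpt. Judging by the
# GeenStijl article scraper, a footer looks roughly like "<author> | <date>", so a rough,
# hypothetical sketch could be:
def _parse_comment_footer_sketch(footer_text):
    """Hypothetical stand-in: split 'author | date' and parse the date part."""
    from dateutil.parser import parse  # assumption; the real code may use its own date helper

    author, _, date_part = footer_text.partition("|")
    return author.strip(), parse(date_part.strip(), dayfirst=True)
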
def _get_deduplicate_units(self, date, edition=None):
    self.browser.get("https://digitalpublishing.telegraaf.nl/static/krant/")

    found = False
    for day_container in self.browser.find_elements_by_css_selector(".Day__date-container"):
        paper_date_string = " ".join(day_container.text.split()[1:3] + ["2018"])
        paper_date = dutch_strptime(paper_date_string, "%d %B %Y").date()
        print(paper_date_string, paper_date, date, paper_date == date)
        if date == paper_date:
            self.wait(".Day__button", on=day_container).click()
            found = True
            break

    if found:
        self.wait("#next-page-button")
        while self.next_button().is_displayed():
            for article in self.browser.find_elements_by_css_selector(".pages-swiper-slide-active .article-layer"):
                self.click(article)
                time.sleep(1.5)

                try:
                    self.browser.switch_to_frame(self.wait("iframe.article-contents", timeout=10))
                except NotVisible:
                    print("Warning: article skipped because frame was not visible")
                    continue

                article_html = self.wait("body").get_property("outerHTML")
                text = html2text(article_html)
                url = self.browser.current_url
                (scheme, netloc, path, params, query, fragment) = parse.urlparse(url)
                query += "&hash=" + hashlib.sha256(article_html.encode()).hexdigest()[:20]
                url = parse.urlunparse((scheme, netloc, path, params, query, fragment))
                page_range = fragment.split("/")[-1]

                try:
                    title = self.wait("body > .head", timeout=2).text.strip()
                except NoSuchElementException:
                    continue
                else:
                    yield TelegraafUnit(url, date, title, text, page_range)
                finally:
                    self.browser.switch_to_default_content()

                    # Close modal
                    self.wait(".article-modal-default-button").click()
                    time.sleep(0.5)

            self.next_button().click()
            time.sleep(0.5)

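# `dutch_strptime` (used above) is not shown in this excerpt. A minimal sketch, assuming it
# behaves like datetime.strptime but accepts Dutch month names (e.g. "12 maart 2018"):
import datetime as _dt

_DUTCH_MONTHS = {
    "januari": "January", "februari": "February", "maart": "March", "april": "April",
    "mei": "May", "juni": "June", "juli": "July", "augustus": "August",
    "september": "September", "oktober": "October", "november": "November",
    "december": "December",
}

def _dutch_strptime_sketch(value: str, fmt: str) -> _dt.datetime:
    """Hypothetical stand-in: translate Dutch month names, then defer to strptime."""
    for dutch, english in _DUTCH_MONTHS.items():
        value = value.replace(dutch, english)
    return _dt.datetime.strptime(value, fmt)

# e.g. _dutch_strptime_sketch("12 maart 2018", "%d %B %Y").date() == _dt.date(2018, 3, 12)
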
def _scrape_unit(self, url):
    doc = self.session.get_html(url)
    date_txt = doc.cssselect("div.dateplace-data")[0].text
    date = read_date(date_txt)
    if date.date() not in self.dates:
        return
    hours, minutes = map(int, date_txt.split()[-1].split(":"))
    date += timedelta(hours=hours, minutes=minutes)

    content = doc.cssselect("#leadarticle")[0]
    article = {
        'text': html2text(content.cssselect("div.content")),
        'headline': content.cssselect("h1")[0].text,
        'section': url.split("/")[3],
        'date': date,
        'url': url,
    }
    if content.cssselect("span.smallprint"):
        article['author'] = content.cssselect("span.smallprint")[-1].text.split(":")[1].strip()
    return article

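# `read_date` (used above, and as `tools.read_date` earlier) is not part of this excerpt.
# A minimal sketch, assuming it parses a free-form (possibly Dutch-formatted) date string
# into a datetime:
def _read_date_sketch(text):
    """Hypothetical stand-in: parse a day-first date string into a datetime."""
    from dateutil.parser import parse  # assumption; the real helper may differ

    return parse(text, dayfirst=True, fuzzy=True)
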
def scrape_unit(self, article_info: ArticleTuple):
    date, page_num, url = article_info

    try:
        text_url = strip_query(self.session.get_redirected_url(url))
    except RedirectError as e:
        if e.status_code == 404:
            return None
        raise

    try:
        text_doc = self.session.get_html(text_url)
    except HTTPError as e:
        if e.response.status_code == 404:
            logging.warning(f"{url} returned 404, skipping")
            return None
        else:
            raise

    for image in text_doc.cssselect(".image"):
        image.getparent().remove(image)

    date = datetime.datetime(date.year, date.month, date.day)

    try:
        title = text_doc.cssselect("article > h1")[0].text.strip()
    except (IndexError, AttributeError):
        # No <h1> present, or it has no text
        return None

    text = html2text(text_doc.cssselect("main > article > .body"))
    if not text.strip():
        return None

    article = Article(title=title, date=date, text=text, url=url)

    if text_doc.cssselect("article > header.themed"):
        # New header style
        author = text_doc.cssselect("article > header .author")[0].text
        section = text_doc.cssselect("article > header .title")[0].text
        article.set_property("author", author)
    else:
        # Old header style
        section = text_doc.cssselect("article > header > .title")
        section = section[0].text if section else "NOSECTION"

        author_a = text_doc.cssselect("article .author a")
        if author_a:
            author = author_a[0].text.strip()
            article.set_property("author", author)
            if author == section:
                section = "Opinie"

    download = text_doc.cssselect('form[name="download"]')
    if download:
        pdf_url = download[0].get("action")
        article.set_property("pdf_url", pdf_url)

    article.set_property("text_url", text_url)
    article.set_property("image_url", text_url + "?view=img")
    if section:
        article.set_property("section", section.strip())

    return article

def _get_deduplicate_units(self, date: datetime.datetime, edition=None):
    # Select edition
    logging.info(f"Selecting date {date}")
    self.accept_cookie(timeout=1)
    self.browser.get(self.login_url)
    if edition is not None:
        self.click(self.wait('//div[text() = "{}"]'.format(edition), by=By.XPATH))

    # accept cookies
    self.accept_cookie2()

    # Go to archive and select paper of this date
    self.wait("paper-button.showMoreButton").click()

    # make sure right header is not hidden
    header = self.wait('#rightHeader', visible=False)
    self.browser.execute_script('arguments[0].removeAttribute("hidden");', header)

    # click "Archief" button
    self.wait('archive-calendar-button').click()

    # find correct year
    while True:
        picked_year = int(self.wait("#yearSelection > paper-slider").get_attribute("value"))
        #picked_year = int(self.wait('input.style-scope.vl-date-picker').text)
        if picked_year == date.year:
            break
        year_buttons = self.browser.find_elements_by_css_selector("#yearSelection iron-icon")
        if picked_year > date.year:
            year_buttons[0].click()
        elif len(year_buttons) > 1:
            year_buttons[1].click()
        else:
            raise Exception(f"Only one year button, but {picked_year} < {date.year}")

    # find correct month
    self.wait("#monthSelection").find_element_by_xpath(
        f"//paper-button[@data-month={date.month-1}]").click()

    # find correct day -- wait 3 seconds to give date picker time to load
    time.sleep(3)
    day_button = self.wait("#daySelection").find_element_by_xpath(
        f"//paper-button[@data-current and @data-day={date.day}]")
    logging.info(
        f'{date}: aria-disabled: {day_button.get_attribute("aria-disabled")}, '
        f'active: {day_button.get_attribute("active")}')
    if day_button.get_attribute("aria-disabled") == "true":
        logging.warning(f"No newspaper for {date}, Sunday?")
        return
    if day_button.get_attribute("active") == "false":
        # Don't click if correct day was already selected
        day_button.click()
    self.wait("#selectButton").click()

    for archive_issue in self.browser.find_elements_by_css_selector("archive-issue"):
        try:
            archive_date = archive_issue.find_element_by_css_selector(".issueDate").text.strip()
        except NoSuchElementException:
            continue
        if not archive_date:
            continue
        if dutch_strptime(archive_date, "%d %B %Y").date() == date:
            archive_issue.click()
            break
    else:
        logging.warning(f"Could not find date {date}")
        return

    # Scrape unit
    self.browser.switch_to_frame(self.wait("iframe#issue"))

    seconds_forgone = 0
    start = datetime.datetime.now()
    while seconds_forgone < 30:
        seconds_forgone = (datetime.datetime.now() - start).total_seconds()
        try:
            self.wait("#articleMenuItem", timeout=10).click()
        except ElementClickInterceptedException:
            pass
        else:
            break

    article_list_buttons = self.browser.find_elements_by_css_selector("#articleListSectionsButtons > button")
    article_list_buttons = list(article_list_buttons) or [lambda: None]

    for article_list_button in article_list_buttons:
        if "selected" not in article_list_button.get_attribute("class"):
            article_list_button.click()

        articles = list(self.browser.find_elements_by_css_selector(".articleListItem"))

        for article in articles:
            page = int(article.get_attribute("data-page"))
            refid = article.get_attribute("data-refid")
            url = urljoin(self.browser.current_url + "/", refid)

            def collect_headers(els):
                for el in els:
                    el_text = el.get_property("textContent").strip()
                    if el_text:
                        yield (el, el_text)

            h1s = list(collect_headers(article.find_elements_by_css_selector(".articleListItem > h1")))
            h2s = list(collect_headers(article.find_elements_by_css_selector(".articleListItem > h2")))
            h3s = list(collect_headers(article.find_elements_by_css_selector(".articleListItem > h3")))

            if h1s:
                _, title = h1s.pop(0)
            elif h2s:
                _, title = h2s.pop(0)
            else:
                _, title = h3s.pop(0)

            try:
                content = article.find_element_by_css_selector("div.content").get_property("outerHTML")
            except NoSuchElementException:
                continue

            subtitles = [element.get_property("outerHTML") for element, _ in h1s + h2s + h3s]
            article_html = "".join(subtitles) + content
            text = html2text(article_html)

            #try:
            #    author = article.find_element_by_css_selector(".byline").get_property("textContent").strip()
            #except NoSuchElementException:
            #    pass
            #else:
            #    print(author)

            # Screenshot code:
            # article.click()
            # self.browser.switch_to_frame(self.wait("#articleViewContent > iframe"))
            # screenshot = self.wait("#page article").screenshot_as_base64
            # self.browser.switch_to_default_content()
            # self.browser.switch_to_frame(self.wait("#issue"))
            # self.wait("#articleNavigationBack").click()
            # time.sleep(0.5)
            screenshot = None

            yield EPagesUnit(url, date, title, page, screenshot, text)

def _get_deduplicate_units(self, date, edition=None):
    self.browser.get("https://digitalpublishing.telegraaf.nl/static/krant/")

    found = False
    for day_container in self.browser.find_elements_by_css_selector(".Day__date-container"):
        paper_date_string = " ".join(day_container.text.split()[1:3] + [str(date.year)])
        paper_date = dutch_strptime(paper_date_string, "%d %B %Y").date()
        print(f"paper date {paper_date_string}, {paper_date}, {date}, {paper_date == date}")
        if date == paper_date:
            self.wait(".Day__button", on=day_container).click()
            found = True
            break

    if found:
        self.wait("#next-page-button")
        while self.next_button().is_displayed():
            try:
                articles = self.wait_multiple(".pages-swiper-slide-active .article-layer")
            except NoSuchElementException:
                logging.warning(f"Could not find article layer in {self.browser.current_url}")
                articles = []

            for article in articles:
                self.click(article)
                time.sleep(1.5)

                try:
                    self.browser.switch_to_frame(self.wait("iframe.article-contents", timeout=10))
                except NotVisible:
                    print("Warning: article skipped because frame was not visible")
                    continue

                article_html = self.wait("body").get_property("outerHTML")
                text = html2text(article_html)
                url = self.browser.current_url
                (scheme, netloc, path, params, query, fragment) = parse.urlparse(url)
                query += "&hash=" + hashlib.sha256(article_html.encode()).hexdigest()[:20]
                url = parse.urlunparse((scheme, netloc, path, params, query, fragment))
                page_range = fragment.split("/")[-1]

                try:
                    title = self.wait("body > .head", timeout=2).text.strip()
                except NoSuchElementException:
                    try:
                        title = self.wait("body > .head1", timeout=2).text.strip()
                    except NoSuchElementException:
                        logging.warning(f"No title found: {url}")
                        title = "-"

                if not title:
                    logging.warning(f"Empty title for {url}")
                    title = "-"

                yield TelegraafUnit(url, date, title, text, page_range)

                self.browser.switch_to_default_content()

                # Close modal
                self.wait(".article-modal-default-button").click()
                time.sleep(0.5)

            self.next_button().click()
            time.sleep(0.5)

def _get_deduplicate_units(self, date, edition=None):
    # Select edition
    self.browser.get(self.login_url)
    if edition is not None:
        self.click(self.wait('//div[text() = "{}"]'.format(edition), by=By.XPATH))

    # Go to archive and select paper of this date
    self.wait("paper-button.showMoreButton").click()
    for archive_issue in self.browser.find_elements_by_css_selector("archive-issue"):
        try:
            archive_date = archive_issue.find_element_by_css_selector(".issueDate").text.strip()
        except NoSuchElementException:
            continue
        if not archive_date:
            continue
        if dutch_strptime(archive_date, "%d %B %Y").date() == date:
            archive_issue.click()
            break
    else:
        return

    # Scrape unit
    self.browser.switch_to_frame(self.wait("#issue"))

    seconds_forgone = 0
    start = datetime.datetime.now()
    while seconds_forgone < 30:
        seconds_forgone = (datetime.datetime.now() - start).total_seconds()
        try:
            self.wait("#articleMenuItem", timeout=60).click()
        except ElementClickInterceptedException:
            pass
        else:
            break

    article_list_buttons = self.browser.find_elements_by_css_selector("#articleListSectionsButtons > button")
    article_list_buttons = list(article_list_buttons) or [lambda: None]

    time.sleep(2)
    for article_list_button in article_list_buttons:
        article_list_button.click()
        time.sleep(2)

        articles = list(self.browser.find_elements_by_css_selector(".articleListItem"))

        for article in articles:
            page = int(article.get_attribute("data-page"))
            refid = article.get_attribute("data-refid")
            url = urljoin(self.browser.current_url + "/", refid)

            def collect_headers(els):
                for el in els:
                    el_text = el.get_property("textContent").strip()
                    if el_text:
                        yield (el, el_text)

            h1s = list(collect_headers(article.find_elements_by_css_selector(".articleListItem > h1")))
            h2s = list(collect_headers(article.find_elements_by_css_selector(".articleListItem > h2")))
            h3s = list(collect_headers(article.find_elements_by_css_selector(".articleListItem > h3")))

            if h1s:
                _, title = h1s.pop(0)
            elif h2s:
                _, title = h2s.pop(0)
            else:
                _, title = h3s.pop(0)

            try:
                content = article.find_element_by_css_selector("div.content").get_property("outerHTML")
            except NoSuchElementException:
                continue

            subtitles = [element.get_property("outerHTML") for element, _ in h1s + h2s + h3s]
            article_html = "".join(subtitles) + content
            text = html2text(article_html)

            #try:
            #    author = article.find_element_by_css_selector(".byline").get_property("textContent").strip()
            #except NoSuchElementException:
            #    pass
            #else:
            #    print(author)

            # Screenshot code:
            # article.click()
            # self.browser.switch_to_frame(self.wait("#articleViewContent > iframe"))
            # screenshot = self.wait("#page article").screenshot_as_base64
            # self.browser.switch_to_default_content()
            # self.browser.switch_to_frame(self.wait("#issue"))
            # self.wait("#articleNavigationBack").click()
            # time.sleep(0.5)
            screenshot = None

            yield EPagesUnit(url, date, title, page, screenshot, text)