Exemple #1
0
 def scrape_unit(self, unit):
     """Convert one raw unit dict (Dutch field names) into an Article.

     Sets "author" and "publisher" properties; publisher is derived from
     the last two labels of the URL's hostname.
     """
     # "datum" holds an ISO-8601 timestamp; keep it naive (no default tz).
     parsed_date = iso8601.iso8601.parse_date(unit["datum"], default_timezone=None)
     host = urlparse(unit["url"]).hostname
     # e.g. "www.example.com" -> "example.com"
     publisher_name = ".".join(host.split(".")[-2:])
     headline = unit["titel"].strip()
     if not headline:
         headline = "[No title]"
     result = Article(title=headline, text=unit["bericht tekst"], url=unit["url"], date=parsed_date)
     result.set_property("author", unit["auteur"])
     result.set_property("publisher", publisher_name)
     return result
    def _parse_comment(self, comment, base_title, base_url):
        """Build an Article for a single comment element, anchored to its post.

        The comment's DOM id is appended to the parent title/url so every
        comment gets a unique title and URL fragment.
        """
        body = html2text(comment.cssselect("p"))
        comment_id = comment.get("id")
        comment_title = f"{base_title}#{comment_id}"
        comment_url = f"{base_url}#{comment_id}"
        footer_text = comment.cssselect("footer")[0].text_content()
        author, timestamp = _parse_comment_footer(footer_text)

        result = Article(date=timestamp, title=comment_title, text=body.strip() or ".", url=comment_url)
        result.set_property("author", author.strip())
        result.set_property("medium", "GeenStijl Comments")
        return result
Exemple #3
0
 def scrape_unit(self, unit):
     """Turn one scraped unit (dict with Dutch keys) into an Article."""
     url = unit["url"]
     # Publisher = registrable domain: the final two labels of the hostname.
     publisher = ".".join(urlparse(url).hostname.split(".")[-2:])
     article = Article(
         title=unit["titel"].strip() or "[No title]",
         text=unit["bericht tekst"],
         url=url,
         # Parse the ISO-8601 "datum" field without forcing a timezone.
         date=iso8601.iso8601.parse_date(unit["datum"], default_timezone=None),
     )
     article.set_property("author", unit["auteur"])
     article.set_property("publisher", publisher)
     return article
Exemple #4
0
    def scrape_unit_meta(self, article_element):
        """Extract metadata from one search-result element into a NewsdeskUnit.

        Parses the element's outer HTML with lxml and reads url, title,
        publisher, publication date and optional byline properties
        (author, word count, country) from it.

        @param article_element: a browser element exposing get_attribute().
        @return: NewsdeskUnit(article_element, article)
        @raise ValueError: when the mandatory "source" byline is missing.
        """
        CONTEXT['unit'] = article_element

        article_html = article_element.get_attribute("outerHTML")
        article_doc = lxml.html.fromstring(article_html, base_url=SEARCH_URL)
        # Bugfix: store the parsed document under 'doc', not the raw element.
        CONTEXT['doc'] = article_doc

        def get_byline_prop(prop):
            # Return the first non-empty ".nd-article__<prop>" text; raise if absent.
            for meta_element in article_doc.cssselect(f".nd-article__{prop}"):
                prop_value = meta_element.text_content().strip()
                if prop_value:
                    return prop_value
            raise ValueError("Article {} has no property '{}'.".format(title, prop))

        text_url = article_doc.cssselect("a.nd-article__headline-text")[0].get("href")
        url = "newsdesk://{}".format(get_newsdesk_article_id(text_url))
        title = article_doc.cssselect("a.nd-article__headline-text")[0].text_content().strip()
        publisher = get_byline_prop("source")
        date_text = article_doc.cssselect(".nd-article__date")[0].get("title")
        # The title attribute contains the publication date after a
        # "Publicatiedatum:" label; its trailing punctuation character is
        # stripped before parsing. (A never-used "load date" parse of the
        # same split was removed as dead code.)
        pub_date = date_text.split("Publicatiedatum:")[-1]
        pub_date = dutch_strptime(pub_date.strip()[:-1], "%d %b %Y %H:%M")

        article = Article(url=url, title=title, date=pub_date)
        article.set_property("publisher", publisher)
        article.set_property("text_url", text_url)

        # Crashes AmCAT API:
        #article.set_property("pubdate_date", pub_date)

        # The remaining byline properties are optional: a missing one raises
        # ValueError in get_byline_prop and is deliberately skipped.
        try:
            author = get_byline_prop("author")
            article.set_property("author", author)
        except ValueError:
            pass
        try:
            article.set_property("wordcount_int", int(get_byline_prop("word-count").split()[0].replace(",", "")))
        except ValueError:
            logging.warning("could not find word count")
        try:
            article.set_property("country", get_byline_prop("source_country"))
        except ValueError:
            pass
        return NewsdeskUnit(article_element, article)
Exemple #5
0
    def scrape_unit_meta(self, article_element):
        """Parse one search-result element into a NewsdeskUnit with metadata.

        Reads url, title, publisher and harvest/publication dates from the
        element's HTML; author, word count and country are optional.
        """
        article_html = article_element.get_attribute("outerHTML")
        article_doc = lxml.html.fromstring(article_html, base_url=SEARCH_URL)

        def get_byline_prop(prop):
            # First non-empty byline element wins; no match is an error.
            selector = ".article_byline__element.{}".format(prop)
            for candidate in article_doc.cssselect(selector):
                value = candidate.text_content().strip()
                if value:
                    return value
            raise ValueError("Article {} has no property '{}'.".format(title, prop))

        headline = article_doc.cssselect("a.article_headline")[0]
        text_url = headline.get("href")
        url = "newsdesk://{}".format(get_newsdesk_article_id(text_url))
        title = headline.text_content().strip()
        publisher = get_byline_prop("source")

        # Byline looks like "<harvest date> (gepubliceerd: <publish date>)".
        raw_date = get_byline_prop("harvest_date")
        harvest_part, published_part = raw_date.split("(gepubliceerd: ")
        date = dutch_strptime(harvest_part.strip(), "%d %b %Y %H:%M")
        # Drop the trailing ")" before parsing the published part.
        pub_date = dutch_strptime(published_part.strip()[:-1], "%d %b %Y %H:%M")

        article = Article(url=url, title=title, date=date)
        article.set_property("publisher", publisher)
        article.set_property("text_url", text_url)

        # Crashes AmCAT API:
        #article.set_property("pubdate_date", pub_date)

        # Optional byline properties: absence raises ValueError and is skipped.
        try:
            article.set_property("author", get_byline_prop("author"))
        except ValueError:
            pass

        try:
            word_count = int(get_byline_prop("word_count").split()[0])
        except ValueError:
            pass
        else:
            article.set_property("wordcount_int", word_count)

        try:
            article.set_property("country", get_byline_prop("source_country"))
        except ValueError:
            pass

        return NewsdeskUnit(article_element, article)
    def scrape_unit(self, date_and_article_url):
        """Scrape one GeenStijl article plus its comments.

        @param date_and_article_url: (date, url) tuple.
        @return: ArticleTree with the article as root and one leaf per
            comment, or None when the page cannot be parsed.
        """
        date, article_url = date_and_article_url
        log.info("Fetching {}".format(article_url))
        article_doc = self.session.get_html(article_url)

        matches = article_doc.cssselect("#content > article")
        if not matches:
            log.error("Could not find article on {article_url}".format(**locals()))
            return None
        article = matches[0]

        title = article.cssselect("h1")[0].text
        text = html2text(article.cssselect("p")).strip() or "."

        try:
            footer = article.cssselect("footer")[0]
        except IndexError as e:
            # Contains <embed> tag which is not closed gracefully :-(
            log.exception(e)
            return None

        # Footer text is "author | ..."; keep everything before the last pipe.
        author = footer.text.rsplit("|", 1)[0].strip()
        timestamp = parse_date(article.cssselect("footer > time")[0].get("datetime"))
        if not title:
            return None

        children = self._get_comments(title, article_url, article_doc)

        root = Article(date=timestamp, title=title, text=text)
        root.set_property("author", author)
        root.set_property("url", article_url)
        root.set_property("medium", "GeenStijl")

        return ArticleTree(root, [ArticleTree(c, []) for c in children])
Exemple #7
0
    def scrape_unit(self, article_info: ArticleTuple):
        """Scrape a single article given its (date, page_num, url) tuple.

        @return: the populated Article, or None when the article redirects
            to a 404, has no parseable title, or has an empty body.
        """
        date, page_num, url = article_info

        try:
            text_url = strip_query(self.session.get_redirected_url(url))
        except RedirectError as e:
            if e.status_code == 404:
                return None
            raise

        try:
            text_doc = self.session.get_html(text_url)
        except HTTPError as e:
            if e.response.status_code == 404:
                logging.warning(f"{url} returned 404 skipping")
                return None
            else:
                raise

        # Strip inline images; only the textual body is wanted.
        for image in text_doc.cssselect(".image"):
            image.getparent().remove(image)

        # Normalize to a midnight datetime (the source only provides a date).
        date = datetime.datetime(date.year, date.month, date.day)
        try:
            title = text_doc.cssselect("article > h1")[0].text.strip()
        except (IndexError, AttributeError):
            # Bugfix: was a bare except. Only a missing <h1> (IndexError) or an
            # empty text node (AttributeError on None.strip) should skip.
            return None

        text = html2text(text_doc.cssselect("main > article > .body"))
        if not text.strip():
            return None

        article = Article(title=title, date=date, text=text, url=url)

        if text_doc.cssselect("article > header.themed"):
            # New headers style
            author = text_doc.cssselect("article > header .author")[0].text
            section = text_doc.cssselect("article > header .title")[0].text
            article.set_property("author", author)
        else:
            # Old header style
            section = text_doc.cssselect("article > header > .title")
            section = section[0].text if section else "NOSECTION"
            author_a = text_doc.cssselect("article .author a")
            if author_a:
                author = author_a[0].text.strip()
                article.set_property("author", author)
                # Author pieces double as the section name on opinion pages.
                if author == section:
                    section = "Opinie"

        download = text_doc.cssselect('form[name="download"]')
        if download:
            pdf_url = download[0].get("action")
            article.set_property("pdf_url", pdf_url)

        article.set_property("text_url", text_url)
        article.set_property("image_url", text_url + "?view=img")

        if section:
            article.set_property("section", section.strip())

        return article
Exemple #8
0
    def scrape_unit(self, article_info: ArticleTuple):
        """Scrape a single article given its (date, page_num, url) tuple.

        @return: the populated Article, or None when the article redirects
            to a 404, has no parseable title, or has an empty body.
        """
        date, page_num, url = article_info

        try:
            text_url = strip_query(self.session.get_redirected_url(url))
        except RedirectError as e:
            if e.status_code == 404:
                return None
            raise

        text_doc = self.session.get_html(text_url)

        # Strip inline images; only the textual body is wanted.
        for image in text_doc.cssselect(".image"):
            image.getparent().remove(image)

        # Normalize to a midnight datetime (the source only provides a date).
        date = datetime.datetime(date.year, date.month, date.day)
        try:
            title = text_doc.cssselect("article > h1")[0].text.strip()
        except (IndexError, AttributeError):
            # Bugfix: was a bare except. Only a missing <h1> (IndexError) or an
            # empty text node (AttributeError on None.strip) should skip.
            return None

        text = html2text(text_doc.cssselect("main > article > .body"))
        if not text.strip():
            return None

        article = Article(title=title, date=date, text=text, url=url)

        if text_doc.cssselect("article > header.themed"):
            # New headers style
            author = text_doc.cssselect("article > header .author")[0].text
            section = text_doc.cssselect("article > header .title")[0].text
            article.set_property("author", author)
        else:
            # Old header style
            section = text_doc.cssselect("article > header > .title")
            section = section[0].text if section else "NOSECTION"
            author_a = text_doc.cssselect("article .author a")
            if author_a:
                author = author_a[0].text.strip()
                article.set_property("author", author)
                # Author pieces double as the section name on opinion pages.
                if author == section:
                    section = "Opinie"

        download = text_doc.cssselect('form[name="download"]')
        if download:
            pdf_url = download[0].get("action")
            article.set_property("pdf_url", pdf_url)

        article.set_property("text_url", text_url)
        article.set_property("image_url", text_url + "?view=img")

        if section:
            article.set_property("section", section.strip())

        return article
Exemple #9
0
    def scrape_unit(self, entry):
        """Build an Article from one feed entry.

        @param entry: feed entry mapping with "id", "title", "published"
            and "link" keys.
        @return: the Article, or None when section/text extraction fails
            (IndexError from an unexpected page layout).
        """
        article = Article()
        try:
            section, text = self.get_article_section_text(entry["link"])
        except IndexError:
            return None
        # (Removed leftover debug print of section/text.)

        article.set_property("nuid", entry["id"])
        article.set_property("title", entry["title"])
        article.set_property("date", self.parse_date(str(entry["published"])))
        article.set_property("url", entry["link"])
        article.set_property("section", section)
        article.set_property("text", text)
        return article