def get_article(url: str, text="") -> Article:
    """
    Build a parsed Article from raw text or from a webpage url.

    The method accepts text or a webpage url for processing.
    If both are passed, the text is given priority and the url is ignored.
    If only url is passed, the page is downloaded and parsed into an Article.
    If both parameters are null or empty, a ValueError is raised.

    :param url: string with the url
    :param text: string with the article's text; defaults to the empty string
    :return: Article object from newspaper library, parsed by Downloader
    :raises ValueError: if both url and text are empty
    """
    if text and text.strip():
        text = text.strip()
        # Article requires a url argument; a placeholder is used because the
        # real content is injected via input_html / set_text below.
        article = Article("text_is_passed_so_no_url")
        article.download(input_html=text)
        article.set_text(text)
    elif url:
        # newspaper needs an absolute url with a scheme.
        url = f"http://{url}" if not str(url).startswith("http") else url
        article = Downloader.download_article(url)
    else:
        message = "Parameters are empty!"
        # logger.warn() is deprecated; warning() is the supported spelling.
        Downloader.logger.warning(message)
        # Originally a commented-out NliServiceException; without raising
        # here, the return line below failed with UnboundLocalError on
        # `article`. Raise explicitly, as the docstring promises.
        raise ValueError(message)
    return Downloader.parse_article(article)
# Example #2
def parse(url=None, html=None, text=None, title=None,
          sentences_count=5,
          options=None,
          summarize_algo="luhn",
          date_timezone="America/New_York"):
    """
    Parse an article to extract its relevant data.

    Content can be supplied directly (text + title), as raw html, or as a
    url to download. Exactly one of these paths is taken, in that order of
    priority.

    :param url: url of the page to download (used when text/title absent)
    :param html: raw html to parse (skips the download step)
    :param text: the article's body text (used together with title)
    :param title: the article's title (used together with text)
    :param sentences_count: number of sentences in the generated summary
    :param options: optional dict of CSS selectors:
        "title_selector", "image_selector", "content_selector"
    :param summarize_algo: name of the summarization algorithm
    :param date_timezone: the timezone to convert the date to
    :return: dict with url, title, summary, text, html, media and metadata
    :raises Exception: if the url request fails or no html is available
    """
    # `options=None` avoids the shared mutable-default-argument pitfall of
    # the original `options={}`.
    options = options or {}

    article = Article("")

    if text and title:
        # Content supplied directly: skip download/parse entirely.
        article.is_parsed = True
        article.is_downloaded = True
        article.set_title(title)
        article.set_text(text)
    else:
        if url:
            # A timeout keeps the request from hanging indefinitely.
            r = requests.get(url.strip(), timeout=30)
            if r.status_code != 200:
                raise Exception("Paper request failed '%s'" % url)
            html = r.content

        if html:
            soup = get_soup(html)
        else:
            raise Exception("Paper missing HTML content")

        article.set_html(remove_social_embeds(html))
        article.parse()
        article.nlp()

        if options.get("title_selector"):
            matches = soup.select(options.get("title_selector"))
            if matches:
                title = matches[0].text
                article.set_title(title)

        if options.get("image_selector"):
            img = soup.select(options.get("image_selector"))
            if img:
                img = img[0].text
                article.set_top_img_no_check(img)

        if options.get("content_selector"):
            # Use a dedicated name here: the original reassigned `html` to a
            # BeautifulSoup ResultSet, which then leaked into
            # extract_publish_date / extract_video_iframes below.
            content = soup.select(options.get("content_selector"))
            if content:
                article.set_text(content[0].text)

    summary = summarize(text=article.text,
                        title=article.title,
                        algo=summarize_algo,
                        sentences_count=sentences_count)
    publish_date = article.publish_date
    if not publish_date and html:
        publish_date = extract_publish_date(html)
    if not publish_date:
        publish_date = datetime.datetime.now()

    # NOTE(review): on the text+title path, `html` may still be None here;
    # extract_video_iframes / extract_social_media_content are assumed to
    # tolerate None — confirm against their implementations.
    return {
        "url": article.canonical_link,
        "title": article.title,
        "summary": summary,
        "summaries": summary.split("\n\n"),
        "text": article.text,
        "html": article.html,
        "top_image": article.top_image,
        "images": article.images,
        "videos": list(set(article.movies + extract_video_iframes(html))),
        "social_media_content": extract_social_media_content(html),
        "keywords": article.keywords,
        "tags": article.tags,
        "authors": article.authors,
        "published_date": datetime_to_local_timezone(publish_date),
        "md_text": ""
    }