Exemple #1
0
def parse(url=None, html=None, text=None, title=None,
          sentences_count=5,
          options=None,
          summarize_algo="luhn",
          date_timezone="America/New_York"):
    """
    Parse an article and return its relevant data as a dict.

    Input precedence:

    * If both ``text`` and ``title`` are given, they are used as-is and no
      download or HTML parsing takes place.
    * Otherwise, if ``url`` is given, it is fetched and the response body
      replaces any ``html`` argument.
    * Otherwise ``html`` is parsed directly.

    :param url: Address of the article to download.
    :param html: Raw HTML of the article, used when no ``url`` is given.
    :param text: Already extracted article text (requires ``title``).
    :param title: Already extracted article title (requires ``text``).
    :param sentences_count: Forwarded to ``summarize``.
    :param options: Optional dict of CSS selectors that override the
        automatically extracted values: ``title_selector``,
        ``image_selector`` and ``content_selector``. Defaults to no
        overrides.
    :param summarize_algo: Forwarded to ``summarize`` as ``algo``.
    :param date_timezone: The timezone to convert the date to.
        NOTE(review): this value is accepted but is not currently passed to
        ``datetime_to_local_timezone``; confirm that helper's signature
        before wiring it through.
    :return: dict with the keys ``url``, ``title``, ``summary``,
        ``summaries``, ``text``, ``html``, ``top_image``, ``images``,
        ``videos``, ``social_media_content``, ``keywords``, ``tags``,
        ``authors``, ``published_date`` and ``md_text``.
    :raises Exception: If the URL request does not return status 200, or if
        no HTML content is available to parse.
    """
    # Avoid a shared mutable default; None (or any falsy value) means
    # "no selector overrides".
    options = options or {}

    article = Article("")

    if text and title:
        # Caller supplied the content: mark the article as ready so no
        # download/parse step is needed.
        article.is_parsed = True
        article.is_downloaded = True
        article.set_title(title)
        article.set_text(text)
    else:
        if url:
            r = requests.get(url.strip())
            if r.status_code != 200:
                raise Exception("Paper request failed '%s'" % url)
            html = r.content

        if not html:
            raise Exception("Paper missing HTML content")
        soup = get_soup(html)

        article.set_html(remove_social_embeds(html))
        article.parse()
        article.nlp()

        title_selector = options.get("title_selector")
        if title_selector:
            title_nodes = soup.select(title_selector)
            if title_nodes:
                article.set_title(title_nodes[0].text)

        image_selector = options.get("image_selector")
        if image_selector:
            image_nodes = soup.select(image_selector)
            if image_nodes:
                node = image_nodes[0]
                # An <img> element carries its URL in the ``src`` attribute
                # and has no text, so prefer ``src`` and keep the element
                # text as a fallback for non-<img> matches.
                # Assumes ``soup`` yields BeautifulSoup-style tags.
                article.set_top_img_no_check(node.get("src") or node.text)

        content_selector = options.get("content_selector")
        if content_selector:
            # Use a dedicated name here: ``html`` must keep holding the raw
            # document because it is passed to the extract_* helpers below.
            content_nodes = soup.select(content_selector)
            if content_nodes:
                article.set_text(content_nodes[0].text)

    summary = summarize(text=article.text,
                        title=article.title,
                        algo=summarize_algo,
                        sentences_count=sentences_count)

    # Fall back from the parsed date, to a date extracted from the raw HTML,
    # to the current local time.
    publish_date = article.publish_date
    if not publish_date and html:
        publish_date = extract_publish_date(html)
    if not publish_date:
        publish_date = datetime.datetime.now()

    # NOTE(review): in the text+title path ``html`` may be None when it
    # reaches extract_video_iframes / extract_social_media_content; confirm
    # those helpers accept None.
    return {
        "url": article.canonical_link,
        "title": article.title,
        "summary": summary,
        "summaries": summary.split("\n\n"),
        "text": article.text,
        "html": article.html,
        "top_image": article.top_image,
        "images": article.images,
        "videos": list(set(article.movies + extract_video_iframes(html))),
        "social_media_content": extract_social_media_content(html),
        "keywords": article.keywords,
        "tags": article.tags,
        "authors": article.authors,
        "published_date": datetime_to_local_timezone(publish_date),
        "md_text": ""
    }