import datetime

import requests
from newspaper import Article

# get_soup, remove_social_embeds, summarize, extract_publish_date,
# extract_video_iframes, extract_social_media_content and
# datetime_to_local_timezone are helpers defined elsewhere in this module.


def parse(url=None, html=None, text=None, title=None, sentences_count=5,
          options=None, summarize_algo="luhn", date_timezone="America/New_York"):
    """
    Parse an article and extract its relevant data.

    :param url: URL of the article to download and parse
    :param html: raw HTML to parse instead of downloading from `url`
    :param text: pre-extracted article body; with `title`, skips download/parse
    :param title: article title, used together with `text`
    :param sentences_count: number of sentences to keep in the summary
    :param options: dict of CSS selector overrides:
                    `title_selector`, `image_selector`, `content_selector`
    :param summarize_algo: summarization algorithm to use (default: "luhn")
    :param date_timezone: the timezone to convert the publish date to
    :return: dict of the extracted article data
    """
    # Avoid a mutable default argument for `options`.
    options = options or {}

    article = Article("")
    if text and title:
        # Text and title were supplied directly; skip download and parsing.
        article.is_parsed = True
        article.is_downloaded = True
        article.set_title(title)
        article.set_text(text)
    else:
        if url:
            r = requests.get(url.strip())
            if r.status_code != 200:
                raise Exception("Paper request failed '%s'" % url)
            html = r.content
        if html:
            soup = get_soup(html)
        else:
            raise Exception("Paper missing HTML content")

        article.set_html(remove_social_embeds(html))
        article.parse()
        article.nlp()

        # Optional CSS-selector overrides for title, top image and content.
        title_selector = options.get("title_selector")
        if title_selector:
            matches = soup.select(title_selector)
            if matches:
                article.set_title(matches[0].text)

        image_selector = options.get("image_selector")
        if image_selector:
            matches = soup.select(image_selector)
            if matches:
                # Use the tag's `src` attribute; `.text` on an <img> is empty.
                src = matches[0].get("src")
                if src:
                    article.set_top_img_no_check(src)

        content_selector = options.get("content_selector")
        if content_selector:
            # Keep the result in its own variable so `html` still holds the
            # raw markup for the publish-date and embed extraction below.
            matches = soup.select(content_selector)
            if matches:
                article.set_text(matches[0].text)

    summary = summarize(text=article.text,
                        title=article.title,
                        algo=summarize_algo,
                        sentences_count=sentences_count)

    publish_date = article.publish_date
    if not publish_date and html:
        publish_date = extract_publish_date(html)
    if not publish_date:
        publish_date = datetime.datetime.now()

    # When text/title were supplied directly there is no HTML to mine for
    # embedded videos or social content.
    if html:
        videos = list(set(article.movies + extract_video_iframes(html)))
        social_media_content = extract_social_media_content(html)
    else:
        videos = list(article.movies)
        social_media_content = []

    return {
        "url": article.canonical_link,
        "title": article.title,
        "summary": summary,
        "summaries": summary.split("\n\n"),
        "text": article.text,
        "html": article.html,
        "top_image": article.top_image,
        "images": article.images,
        "videos": videos,
        "social_media_content": social_media_content,
        "keywords": article.keywords,
        "tags": article.tags,
        "authors": article.authors,
        "published_date": datetime_to_local_timezone(publish_date),
        "md_text": ""
    }
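
# A minimal usage sketch of the two ways to call parse(). Assumptions: the URL
# below is a placeholder for a real article page, and the URL form needs
# network access; the text/title form runs without any download.
if __name__ == "__main__":
    data = parse(url="https://example.com/some-article", sentences_count=3)
    print(data["title"])
    print(data["summary"])

    # Supplying text and title directly skips the download/parse step, so no
    # HTML-derived fields (videos, social content) are populated.
    data = parse(text="Article body text goes here...", title="Article title")
    print(data["summaries"])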