def get_article(url: str, text="") -> Article:
    """Build and parse an Article from either raw text or a URL.

    If both ``text`` and ``url`` are passed, ``text`` takes priority and the
    url is ignored.  If only ``url`` is passed, the page is downloaded and
    parsed into an Article.  If both are empty, an exception is raised.

    :param url: string with the url; an ``http://`` scheme is prepended
        when the value does not already start with "http"
    :param text: string with the article's text. By default it equals
        empty string.
    :return: Article object from newspaper library
    :raises ValueError: when both parameters are null or empty
    """
    if text and text.strip():
        # Text takes priority: wrap it in a placeholder Article so the
        # downstream parser has something to work with.
        text = text.strip()
        article = Article("text_is_passed_so_no_url")
        article.download(input_html=text)
        article.set_text(text)
    elif url:
        # Make sure the URL carries a scheme before handing it off.
        url = f"http://{url}" if not str(url).startswith("http") else url
        article = Downloader.download_article(url)
    else:
        message = "Parameters are empty!"
        # BUG FIX: previously the raise was commented out, so control fell
        # through with `article` unbound and the return line crashed with
        # UnboundLocalError.  Also `logger.warn` is deprecated; use
        # `logger.warning`.
        Downloader.logger.warning(message)
        raise ValueError(message)
    return Downloader.parse_article(article)
def parse(url=None, html=None, text=None, title=None, sentences_count=5,
          options=None, summarize_algo="luhn", date_timezone="America/New_York"):
    """Parse an article and return its relevant data as a dict.

    The content can be supplied directly (``text`` + ``title``), as raw
    ``html``, or fetched from ``url`` (checked in that order of priority).

    :param url: page URL to fetch when neither text nor html is supplied
    :param html: raw HTML of the page
    :param text: article body text (used together with ``title``)
    :param title: article title (used together with ``text``)
    :param sentences_count: number of sentences for the summary
    :param options: optional dict of CSS-selector overrides:
        ``title_selector``, ``image_selector``, ``content_selector``
    :param summarize_algo: name of the summarization algorithm
    :param date_timezone: The timezone to convert the date to
    :return: dict with the extracted fields (title, summary, text, media, ...)
    :raises Exception: when the URL request fails or no HTML is available
    """
    # BUG FIX: the signature previously used a mutable default (`options={}`)
    # which is shared across all calls; normalize a None default instead.
    options = {} if options is None else options
    article = Article("")
    if text and title:
        # Caller supplied the content directly; skip download and parsing.
        article.is_parsed = True
        article.is_downloaded = True
        article.set_title(title)
        article.set_text(text)
    else:
        if url:
            r = requests.get(url.strip())
            if r.status_code != 200:
                raise Exception("Paper request failed '%s'" % url)
            html = r.content
        if html:
            soup = get_soup(html)
        else:
            raise Exception("Paper missing HTML content")
        article.set_html(remove_social_embeds(html))
        article.parse()
        article.nlp()
        # Optional CSS-selector overrides for title / image / body text.
        if options.get("title_selector"):
            title = soup.select(options.get("title_selector"))
            if title:
                title = title[0].text
                article.set_title(title)
        if options.get("image_selector"):
            img = soup.select(options.get("image_selector"))
            if img:
                img = img[0].text
                article.set_top_img_no_check(img)
        if options.get("content_selector"):
            # BUG FIX: this branch used to overwrite `html` with a list of
            # bs4 tags, which was then passed to extract_publish_date /
            # extract_video_iframes / extract_social_media_content below —
            # functions that otherwise receive raw HTML.  Keep the selector
            # result in its own local so `html` stays raw markup.
            content_nodes = soup.select(options.get("content_selector"))
            if content_nodes:
                article.set_text(content_nodes[0].text)
    summary = summarize(text=article.text, title=article.title,
                        algo=summarize_algo, sentences_count=sentences_count)
    publish_date = article.publish_date
    if not publish_date and html:
        publish_date = extract_publish_date(html)
    if not publish_date:
        publish_date = datetime.datetime.now()
    # NOTE(review): `date_timezone` is accepted but never forwarded —
    # presumably datetime_to_local_timezone should receive it; confirm
    # against its signature.
    return {
        "url": article.canonical_link,
        "title": article.title,
        "summary": summary,
        "summaries": summary.split("\n\n"),
        "text": article.text,
        "html": article.html,
        "top_image": article.top_image,
        "images": article.images,
        "videos": list(set(article.movies + extract_video_iframes(html))),
        "social_media_content": extract_social_media_content(html),
        "keywords": article.keywords,
        "tags": article.tags,
        "authors": article.authors,
        "published_date": datetime_to_local_timezone(publish_date),
        "md_text": ""
    }