def run_dateguesser(htmlstring):
    """Run date_guesser on an HTML string and return the reformatted date.

    The guess is made against a fixed placeholder URL, so only the HTML
    content contributes page-specific information.

    Returns None when date_guesser finds no date; otherwise returns the
    result of convert_date called with '%Y-%m-%d %H:%M:%S' and '%Y-%m-%d'
    (presumably the input and output formats; confirm against
    convert_date).
    """
    result = guess_date(html=htmlstring, url='https://www.example.org/test/')
    found = result.date
    if found is not None:
        return convert_date(found, '%Y-%m-%d %H:%M:%S', '%Y-%m-%d')
    return None
def test_guess_date():
    """Smoke test: a URL and markup without any date yield an empty guess."""
    page_url = 'https://www.nytimes.com/opinion/catalonia-spain-puigdemont.html'
    markup = '<could be anything></could>'

    result = guess_date(page_url, markup)

    # Nothing date-like in either input, so every field reports "nothing".
    assert result.date is None
    assert result.accuracy is Accuracy.NONE
    assert result.method is NO_METHOD
async def get_article(session, url):
    """Fetch *url*, analyse the article and append the result to finalJSON.

    The page is downloaded through *session*, handed to newspaper for
    parsing and NLP, dated with date_guesser (falling back to newspaper's
    own publish date), scored for sentiment, and language-detected.

    Args:
        session: client session used to issue the GET request.
        url: address of the article to process.

    Returns:
        The tuple ('url parameter is required', 400) when *url* is None.
        Otherwise None: the outcome is appended to the module-level
        ``finalJSON`` list as a dict with ``status`` 'ok' or 'error'.
    """
    if url is None:
        return 'url parameter is required', 400
    print(f'Processing URL: {url}')

    # NOTE(review): the semaphore is created on every call, so it is never
    # shared between concurrent calls and does not actually cap concurrency.
    # A shared, module-level semaphore is probably what was intended.
    sem = asyncio.Semaphore(10)
    async with sem:
        # NOTE(review): total=0 presumably disables the overall timeout;
        # confirm against the aiohttp ClientTimeout documentation.
        async with session.get(url, timeout=ClientTimeout(total=0)) as response:
            content = await response.read()
            article = newspaper.Article(url)
            article.set_html(content)
            last_updated = datetime.now().strftime("%d/%m/%Y %H:%M:%S")

            if response.status != 200:
                # The error record deliberately carries no timestamp.
                finalJSON.append({
                    'status': 'error',
                    'article': 'An error occurred parsing this article',
                })
                return None

            article.parse()
            details = {'source_url': article.source_url}

            # Publication date: prefer date_guesser, fall back to newspaper.
            # `except Exception` (not a bare except) so task cancellation
            # and interpreter shutdown are never swallowed here.
            try:
                guess = guess_date(url=url, html=article.html)
                accuracy_labels = {
                    Accuracy.PARTIAL: 'partial',
                    Accuracy.DATE: 'date',
                    Accuracy.DATETIME: 'datetime',
                }
                details['published'] = guess.date
                details['published_method_found'] = guess.method
                # Accuracy.NONE (or anything unknown) maps to None.
                details['published_guess_accuracy'] = accuracy_labels.get(
                    guess.accuracy)
            except Exception:
                details['published'] = article.publish_date
                details['published_method_found'] = None
                details['published_guess_accuracy'] = None

            article.nlp()
            details['title'] = article.title
            details['summary'] = article.summary
            details['keywords'] = list(article.keywords)
            details['authors'] = list(article.authors)

            # Sentiment is scored on the title and summary concatenated.
            analyzer = SentimentIntensityAnalyzer()
            scores = analyzer.polarity_scores(article.title + article.summary)

            # Language detection is best-effort; failure leaves None.
            try:
                title_lang = detect(article.title)
            except Exception:
                title_lang = None
            try:
                text_lang = detect(article.text)
            except Exception:
                text_lang = None

            details['images'] = list(article.images)
            details['top_image'] = article.top_image
            details['meta_image'] = article.meta_img
            details['movies'] = list(article.movies)
            details['meta_keywords'] = list(article.meta_keywords)
            details['tags'] = list(article.tags)
            details['meta_description'] = article.meta_description
            details['meta_lang'] = article.meta_lang
            # str() is kept for compatibility: an undetected language is
            # reported as the string 'None'.
            details['title_lang'] = str(title_lang)
            details['text_lang'] = str(text_lang)
            details['meta_favicon'] = article.meta_favicon
            details['url'] = str(url)
            details['positive'] = scores['pos']
            details['neutral'] = scores['neu']
            details['negative'] = scores['neg']

            finalJSON.append({
                'last_updated': last_updated,
                'status': 'ok',
                'article': details,
            })
def get_article(url: str = Query(
        ...,
        title="Article URL",
        description="The URL of the requested article.")):
    """Download and parse the article at *url* and return it as ArticleOut.

    Args:
        url: required query parameter holding the article address.

    Returns:
        An ArticleOut built from the article's text, authors, publication
        date guess, detected languages, metadata and images.

    Raises:
        HTTPException: status 404 when the download did not succeed.
    """
    article = Article(url)
    article.download()
    # NOTE(review): 2 is presumably newspaper's "download succeeded" state
    # (ArticleDownloadState.SUCCESS); confirm against the library.
    if article.download_state != 2:
        raise HTTPException(status_code=404, detail="Article was not found")

    article.parse()

    # Article
    article_response = {
        'source_url': article.url,
        'title': article.title,
        'text': article.text,
        'authors': list(article.authors),
    }

    # Date detection: prefer date_guesser, fall back to newspaper's value.
    try:
        guess = guess_date(url=url, html=article.html)
        accuracy_labels = {
            Accuracy.PARTIAL: 'partial',
            Accuracy.DATE: 'date',
            Accuracy.DATETIME: 'datetime',
        }
        article_response['published_date'] = guess.date
        article_response['published_date_method'] = guess.method
        # Accuracy.NONE (or anything unknown) maps to None.
        article_response['published_date_accuracy'] = accuracy_labels.get(
            guess.accuracy)
    except Exception:
        # The newspaper attribute is `publish_date`; reading
        # `published_date` here would fail inside the fallback itself.
        article_response['published_date'] = article.publish_date
        article_response['published_date_method'] = None
        article_response['published_date_accuracy'] = None

    # Language detection is best-effort; failure leaves None.
    try:
        article_response['title_lang'] = detect(article.title)
    except Exception:
        article_response['title_lang'] = None
    try:
        article_response['text_lang'] = detect(article.text)
    except Exception:
        article_response['text_lang'] = None

    # Meta / other
    article_response['meta_description'] = article.meta_description
    article_response['meta_lang'] = article.meta_lang
    article_response['meta_favicon'] = article.meta_favicon
    article_response['meta_keywords'] = list(article.meta_keywords)
    article_response['tags'] = list(article.tags)

    # Images
    article_response['images'] = list(article.images)
    article_response['meta_image'] = article.meta_img
    article_response['top_image'] = article.top_image

    return ArticleOut(**article_response)
def dateGuesser(urlParam):
    """Download the page at *urlParam* and return date_guesser's guess.

    The URL itself is passed along with the page markup, so date
    information in the URL slug can be used when available. The Guess
    object is returned unchanged.
    """
    page_markup = requests.get(urlParam).text
    return guess_date(url=urlParam, html=page_markup)
def get_article():
    """Download the article named by the ``url`` query argument as JSON.

    Returns:
        * ('url parameter is required', 400) when no ``url`` was supplied.
        * A JSON response with ``status`` 'error' and the download
          exception message when the download did not succeed.
        * Otherwise a JSON response with ``status`` 'ok' and an
          ``article`` object holding text, authors, the publication date
          guess, detected languages, metadata and images.
    """
    url = request.args.get('url', type=str)
    if url is None:
        return 'url parameter is required', 400

    article = Article(url)
    article.download()
    # NOTE(review): 2 is presumably newspaper's "download succeeded" state
    # (ArticleDownloadState.SUCCESS); confirm against the library.
    if article.download_state != 2:
        return jsonify({
            'status': 'error',
            'article': article.download_exception_msg,
        })

    article.parse()
    details = {'source_url': article.source_url}

    # Publication date: prefer date_guesser, fall back to newspaper.
    # `except Exception` (not a bare except) so KeyboardInterrupt and
    # SystemExit are never swallowed.
    try:
        guess = guess_date(url=url, html=article.html)
        accuracy_labels = {
            Accuracy.PARTIAL: 'partial',
            Accuracy.DATE: 'date',
            Accuracy.DATETIME: 'datetime',
        }
        details['published'] = guess.date
        details['published_method_found'] = guess.method
        # Accuracy.NONE (or anything unknown) maps to None.
        details['published_guess_accuracy'] = accuracy_labels.get(
            guess.accuracy)
    except Exception:
        details['published'] = article.publish_date
        details['published_method_found'] = None
        details['published_guess_accuracy'] = None

    details['title'] = article.title
    details['text'] = article.text
    details['authors'] = list(article.authors)

    # Language detection is best-effort; failure leaves None.
    try:
        title_lang = detect(article.title)
    except Exception:
        title_lang = None
    try:
        text_lang = detect(article.text)
    except Exception:
        text_lang = None

    details['images'] = list(article.images)
    details['top_image'] = article.top_image
    details['meta_image'] = article.meta_img
    details['movies'] = list(article.movies)
    details['meta_keywords'] = list(article.meta_keywords)
    details['tags'] = list(article.tags)
    details['meta_description'] = article.meta_description
    details['meta_lang'] = article.meta_lang
    # str() is kept for compatibility: an undetected language is reported
    # as the string 'None'.
    details['title_lang'] = str(title_lang)
    details['text_lang'] = str(text_lang)
    details['meta_favicon'] = article.meta_favicon

    return jsonify({'status': 'ok', 'article': details})
def get_dates(article_html: str, url: str) -> tuple:
    """Return the publication and modification dates of an article.

    Given the html and the url of an article, return a tuple
    ``(date_published, date_modified)``, e.g.

        ("2020-05-27T21:59:25+01:00", "2020-05-28T18:34:13+01:00")

    A date that cannot be found is an empty string in the tuple, e.g.
    when the modification date is missing:

        ("2020-05-27T21:59:25+01:00", "")

    How the publication date is chosen:
        1) Use ``datePublished`` from the website's JSON.
        2) If that is empty, take the date found in the HTML, then let a
           date embedded in the URL replace it (giving a plain
           ``YYYY-MM-DD`` string).
        3) If the result is still empty, use date_guesser: a full ISO
           string for DATE/DATETIME accuracy, ``YYYY-MM`` for PARTIAL.

    The modification date comes only from ``dateModified`` in the JSON.
    """
    # First try the JSON method.
    datedict = datefind_json(article_html)
    pubtime = datedict.get("datePublished", "")
    modtime = datedict.get("dateModified", "")

    if pubtime == "":
        # Some news sources only expose the date in the HTML.
        pubtime = datefind_html(article_html, url)

        # NOTE(review): the URL lookup is placed inside this branch so it
        # never replaces a JSON date, matching the documented priority
        # (JSON, then URL, then HTML); confirm against the intended layout.
        url_date = re.search(
            r"(19|20)\d{2}[/\-_]"  # year
            r"[0-1]?\d[/\-_]"  # month
            r"[0-3]?\d",  # day
            url)
        if url_date is not None:
            year, month, day = (
                int(part) for part in re.split(r"[/\-_]", url_date.group()))
            try:
                pubtime = datetime.date(year, month, day).isoformat()
            except ValueError:
                # The pattern also matches digit runs that are not real
                # calendar dates (e.g. month 19 or day 39); keep whatever
                # the HTML step produced instead of crashing.
                pass

    # Last resort: date_guesser.
    if pubtime == "":
        guess = guess_date(url, article_html)
        if guess.accuracy in (Accuracy.DATE, Accuracy.DATETIME):
            pubtime = guess.date.isoformat()
        elif guess.accuracy == Accuracy.PARTIAL:
            # Only year and month are trustworthy: keep "YYYY-MM".
            pubtime = guess.date.isoformat()[:7]

    return pubtime, modtime