def news_from_link(ref_link, news_from_globo): row = { 'titulos': [], 'links': [], 'noticia': [], 'image': [], 'abstract': [], 'date': [] } article = NewsPlease.from_url(ref_link) if (article is not None): # Data returned by the NewsPlease row['titulos'].append(article.title) row['noticia'].append(article.text) row['abstract'].append(article.text) row['links'].append(article.url) if (news_from_globo): # we need to get the date from the original url, the date returned by the NewsPlease is wrong page_time = urllib.request.urlopen(article.url) soup_date = BeautifulSoup(page_time, 'html.parser') time_tag = soup_date.find_all('time', attrs={'itemprop': 'datePublished'}) public_date = time_tag[0].text formated_date = format_globo_date(public_date) row['date'].append(formated_date) else: formated_date = str(article.date_publish) row['date'].append(formated_date) path_image = article.image_url if path_image == '' or path_image == None: row['image'].append(0) else: row['image'].append(download_and_move_image(article.image_url)) news = News(row['abstract'], row['noticia'], row['date'], row['links'], row['titulos'], row['image']) try: print(row['titulos']) news_in_db = seguranca_table.check_news(news) print('news_in_db: ' + str(news_in_db)) if (not news_in_db): row = pd.DataFrame(row) df, categories = seguranca_lexical.lexical_corpus_and_title( row) print(categories) # DB categories and image if (categories != [set()]): news.set_categories(categories) seguranca_table.save_news(news) seguranca_post.post_news(df) except: print('Empty News')
def news_from_link(ref_link): row = { 'titulos': [], 'links': [], 'noticia': [], 'image': [], 'abstract': [], 'date': [] } article = NewsPlease.from_url(ref_link) if (article is not None): row['titulos'].append(article.title) row['noticia'].append(article.text) row['links'].append(article.url) row['abstract'].append(article.text) # if(article.date_publish == None): if ((article.date_publish == None) or (article.date_publish > datetime.datetime.now())): row['date'].append( datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")) else: row['date'].append(article.date_publish) path_image = article.image_url print(path_image) if path_image == '' or path_image == None: row['image'].append(0) else: row['image'].append(download_and_move_image(article.image_url)) news = News(row['abstract'], row['noticia'], row['date'], row['links'], row['titulos'], row['image']) try: print(row['titulos']) news_in_db = pessoas_table.check_news(news) print('news_in_db: ' + str(news_in_db)) if (not news_in_db): row = pd.DataFrame(row) df, categories = pessoas_lexical.lexical_corpus_and_title(row) # DB categories if (categories != [set()]): news.set_categories(categories) pessoas_table.save_news(news) pessoas_post.post_news(df) except: print('Empty News')
row['abstract'].append(article.text) row['date'].append(article.date_publish) path_image = article.image_url print(path_image) if path_image == '' or path_image == None: row['image'].append(0) else: row['image'].append(download_and_move_image(article.image_url)) news = News(row['abstract'], row['noticia'], row['date'], row['links'], row['titulos'], row['image']) try: print(row['titulos']) news_in_db = midia_table.check_news(news) print('news_in_db: ' + str(news_in_db)) if (not news_in_db): row = pd.DataFrame(row) df, categories = midia_lexical.lexical_corpus_and_title(row) # DB categories if (categories != [set()]): news.set_categories(categories) midia_table.save_news(news) midia_post.post_news(df) except: print('Empty News') # In[ ]: noticias[0]