def news_from_link(ref_link, news_from_globo):
    """Scrape the article at *ref_link* and persist it if it is new.

    Uses NewsPlease to extract title/text/url, resolves the publication
    date (re-scraped from the page for Globo links, because the date
    NewsPlease returns is wrong there), downloads the header image, and
    — when the article is not yet in the DB and matches at least one
    lexical category — saves and posts it.

    Parameters:
        ref_link: URL of the article to scrape.
        news_from_globo: True when the link is a Globo article, which
            needs its publication date re-scraped from the page itself.
    """
    row = {
        'titulos': [],
        'links': [],
        'noticia': [],
        'image': [],
        'abstract': [],
        'date': [],
    }
    article = NewsPlease.from_url(ref_link)
    if article is None:
        return  # nothing could be extracted from the link

    # Data returned by NewsPlease; the abstract reuses the full text.
    row['titulos'].append(article.title)
    row['noticia'].append(article.text)
    row['abstract'].append(article.text)
    row['links'].append(article.url)

    if news_from_globo:
        # The date returned by NewsPlease is wrong for Globo pages, so
        # re-scrape it from the original URL.  Context manager closes
        # the HTTP response (the original leaked it).
        with urllib.request.urlopen(article.url) as page_time:
            soup_date = BeautifulSoup(page_time, 'html.parser')
        time_tags = soup_date.find_all('time', attrs={'itemprop': 'datePublished'})
        formated_date = format_globo_date(time_tags[0].text)
    else:
        formated_date = str(article.date_publish)
    row['date'].append(formated_date)

    # 0 is the sentinel for "no image available".
    path_image = article.image_url
    if not path_image:  # covers both '' and None
        row['image'].append(0)
    else:
        row['image'].append(download_and_move_image(path_image))

    news = News(row['abstract'], row['noticia'], row['date'],
                row['links'], row['titulos'], row['image'])
    try:
        print(row['titulos'])
        news_in_db = seguranca_table.check_news(news)
        print('news_in_db: ' + str(news_in_db))
        if not news_in_db:
            df, categories = seguranca_lexical.lexical_corpus_and_title(
                pd.DataFrame(row))
            print(categories)
            # DB categories and image: only persist categorized news.
            if categories != [set()]:
                news.set_categories(categories)
                seguranca_table.save_news(news)
                seguranca_post.post_news(df)
    except Exception:  # narrowed from bare except: don't swallow SystemExit/KeyboardInterrupt
        print('Empty News')
def social_news_from_link(ref_link):
    """Scrape the article at *ref_link*, enrich it with Facebook share
    metrics, and persist it if it is new.

    Extracts title/text/url/date via NewsPlease, fetches Facebook
    comment/share/reaction counts through ``util.get_sharedcount_info``,
    downloads the header image, and — when the article is not yet in the
    DB and matches at least one lexical category — saves and posts it.

    Parameters:
        ref_link: URL of the article to scrape.
    """
    row = {
        'titulos': [],
        'links': [],
        'noticia': [],
        'image': [],
        'abstract': [],
        'date': [],
        'fb_comment': [],
        'fb_share': [],
        'fb_reaction': [],
        'fb_total': [],
    }
    article = NewsPlease.from_url(ref_link)
    if article is None:
        return  # nothing could be extracted from the link

    # Data returned by NewsPlease; the abstract reuses the full text.
    row['titulos'].append(article.title)
    row['noticia'].append(article.text)
    row['links'].append(article.url)
    row['abstract'].append(article.text)
    row['date'].append(str(article.date_publish))

    # 0 is the sentinel for "no image available".
    path_image = article.image_url
    if not path_image:  # covers both '' and None
        row['image'].append(0)
    else:
        row['image'].append(download_and_move_image(path_image))

    # Facebook engagement metrics for the article URL.
    fb_comment, fb_share, fb_reaction, fb_total = util.get_sharedcount_info(
        article.url)
    row['fb_comment'].append(fb_comment)
    row['fb_share'].append(fb_share)
    row['fb_reaction'].append(fb_reaction)
    row['fb_total'].append(fb_total)

    social_news = Social_News(row['abstract'], row['noticia'], row['date'],
                              row['links'], row['titulos'], row['image'],
                              row['fb_comment'], row['fb_share'],
                              row['fb_reaction'], row['fb_total'])
    try:
        print(row['titulos'])
        news_in_db = midia_table.check_news(social_news)
        print('news_in_db: ' + str(news_in_db))
        if not news_in_db:
            df, categories = midia_lexical.lexical_corpus_and_title(
                pd.DataFrame(row))
            # DB categories and image: only persist categorized news.
            if categories != [set()]:
                social_news.set_categories(categories)
                midia_table.save_news(social_news)
                midia_post.post_news(df)
    except Exception:  # narrowed from bare except: don't swallow SystemExit/KeyboardInterrupt
        print('Empty News')
def news_from_link(ref_link):
    """Scrape the article at *ref_link* and persist it if it is new.

    NOTE(review): this redefines ``news_from_link`` with a different
    arity than the two-argument version elsewhere in the project —
    presumably these live in different modules; confirm.

    Extracts title/text/url via NewsPlease; falls back to the current
    timestamp when the publication date is missing or lies in the
    future (a frequent NewsPlease extraction error).  Downloads the
    header image and — when the article is not yet in the DB and
    matches at least one lexical category — saves and posts it.

    Parameters:
        ref_link: URL of the article to scrape.
    """
    row = {
        'titulos': [],
        'links': [],
        'noticia': [],
        'image': [],
        'abstract': [],
        'date': [],
    }
    article = NewsPlease.from_url(ref_link)
    if article is None:
        return  # nothing could be extracted from the link

    row['titulos'].append(article.title)
    row['noticia'].append(article.text)
    row['links'].append(article.url)
    row['abstract'].append(article.text)

    publish_date = article.date_publish
    try:
        # Treat a missing or future-dated timestamp as unusable.
        date_invalid = (publish_date is None
                        or publish_date > datetime.datetime.now())
    except TypeError:
        # tz-aware datetime cannot be compared to naive now(); the
        # original code crashed here — fall back to "now" instead.
        date_invalid = True
    if date_invalid:
        row['date'].append(
            datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
    else:
        row['date'].append(publish_date)

    # 0 is the sentinel for "no image available".
    path_image = article.image_url
    print(path_image)
    if not path_image:  # covers both '' and None
        row['image'].append(0)
    else:
        row['image'].append(download_and_move_image(path_image))

    news = News(row['abstract'], row['noticia'], row['date'],
                row['links'], row['titulos'], row['image'])
    try:
        print(row['titulos'])
        news_in_db = pessoas_table.check_news(news)
        print('news_in_db: ' + str(news_in_db))
        if not news_in_db:
            df, categories = pessoas_lexical.lexical_corpus_and_title(
                pd.DataFrame(row))
            # DB categories: only persist categorized news.
            if categories != [set()]:
                news.set_categories(categories)
                pessoas_table.save_news(news)
                pessoas_post.post_news(df)
    except Exception:  # narrowed from bare except: don't swallow SystemExit/KeyboardInterrupt
        print('Empty News')
# Walk every article teaser in `noticias` (BeautifulSoup tags scraped
# earlier), extract each one's first link, scrape the full article with
# NewsPlease, and save/post it when it is new and categorized.
for noticia in noticias:
    # Hoist the link lookup: the original called find_all twice.
    link = noticia.find_all('a', href=True)[0]['href']
    print(link)
    article = NewsPlease.from_url(link)
    row = {
        'titulos': [],
        'links': [],
        'noticia': [],
        'image': [],
        'abstract': [],
        'date': [],
    }
    if article is None:
        continue  # nothing could be extracted from this teaser

    row['titulos'].append(article.title)
    row['noticia'].append(article.text)
    row['links'].append(article.url)
    row['abstract'].append(article.text)
    row['date'].append(article.date_publish)

    # 0 is the sentinel for "no image available".
    path_image = article.image_url
    print(path_image)
    if not path_image:  # covers both '' and None
        row['image'].append(0)
    else:
        row['image'].append(download_and_move_image(path_image))

    news = News(row['abstract'], row['noticia'], row['date'],
                row['links'], row['titulos'], row['image'])
    try:
        print(row['titulos'])
        news_in_db = midia_table.check_news(news)
        print('news_in_db: ' + str(news_in_db))
        if not news_in_db:
            df, categories = midia_lexical.lexical_corpus_and_title(
                pd.DataFrame(row))
            # DB categories: only persist categorized news.
            if categories != [set()]:
                news.set_categories(categories)
                midia_table.save_news(news)
                midia_post.post_news(df)
    except Exception:  # narrowed from bare except: don't swallow SystemExit/KeyboardInterrupt
        print('Empty News')
# we need to get the date from the original url, the date returned by the NewsPlease is wrong page_time = urllib.request.urlopen(news_url) soup_date = BeautifulSoup(page_time, 'html.parser') time_tag = soup_date.find_all('time', attrs={'itemprop': 'datePublished'}) public_date = time_tag[0].text formated_date = format_date(public_date) row['titulos'].append(titulo) row['links'].append(news_url) row['date'].append(formated_date) row['noticia'].append(noticia) row['abstract'].append(noticia) if(image_url is not None): path_image = image_url row['image'].append(download_and_move_image(path_image)) else: row['image'].append(0) news = News(row['abstract'], row['noticia'], row['date'], row['links'], row['titulos'], row['image']) try: print(row['titulos']) news_in_db = check_news(news) print('news_in_db: ' + str(news_in_db)) if(not news_in_db): row = pd.DataFrame(row) df, categories = lexical_soup_globo(row) # DB categories if(categories != [set()]): news.set_categories(categories)