import logging
import time

from newspaper import Article


def crawl_link_article(url):
    result_json = None
    try:
        if 'http' not in url:
            if url[0] == '/':
                url = url[1:]
            try:
                article = Article('http://' + url)
                article.download()
                time.sleep(2)
                article.parse()
                flag = True
            except Exception:
                logging.exception("Exception in getting data from url {}".format(url))
                flag = False

            if not flag:
                try:
                    article = Article('https://' + url)
                    article.download()
                    time.sleep(2)
                    article.parse()
                    flag = True
                except Exception:
                    logging.exception("Exception in getting data from url {}".format(url))
                    flag = False

            if not flag:
                return None
        else:
            try:
                article = Article(url)
                article.download()
                time.sleep(2)
                article.parse()
            except Exception:
                logging.exception("Exception in getting data from url {}".format(url))
                return None

        if not article.is_parsed:
            return None

        visible_text = article.text
        top_image = article.top_image
        images = article.images
        keywords = article.keywords
        authors = article.authors
        canonical_link = article.canonical_link
        title = article.title
        meta_data = article.meta_data
        movies = article.movies
        publish_date = article.publish_date
        source = article.source_url
        summary = article.summary

        images = create_string(images)
        authors = create_string(authors)
        movies = create_string(movies)
        keywords = create_string(keywords)

        result_json = {
            'url': url,
            'text': visible_text,
            'images': images,
            'top_img': top_image,
            'keywords': keywords,
            'authors': authors,
            'canonical_link': canonical_link,
            'title': title,
            'movies': movies,
            'publish_date': get_epoch_time(publish_date),
            'source': source,
            'summary': summary,
        }
    except Exception:
        logging.exception("Exception in fetching article from URL: {}".format(url))

    return result_json
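# crawl_link_article() relies on create_string() and get_epoch_time(), which are not
# defined in this snippet. A minimal sketch of what they might look like (assumptions,
# not the original helpers): flatten an iterable into a comma-separated string, and
# convert a datetime to a Unix timestamp.
from datetime import datetime


def create_string(items):
    # Join an iterable (e.g. image URLs or author names) into a single string.
    return ','.join(str(item) for item in items) if items else ''


def get_epoch_time(publish_date):
    # Convert a datetime (or None) to seconds since the Unix epoch.
    if isinstance(publish_date, datetime):
        return publish_date.timestamp()
    return None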
# `rr` is assumed to be a requests response for a Reuters listing page, and
# `corpus_link_endings` a list initialised earlier in the script.
links = re.findall(r'<a href="(/article/.+?)"', rr.text)
for j in range(10):  # to remove extraneous links (note 10 articles per page)
    corpus_link_endings.append(links[1 + 2 * j])

corpus_links = []
for end in corpus_link_endings:
    corpus_links.append('https://uk.reuters.com' + end)

# Save links to file
f = open('reuters-corpus-10k-links.txt', 'w+')
for link in corpus_links:
    f.write(link + '\n')
f.close()

# Extract contents of each article from its url using newspaper3k
corpus_texts = []
from newspaper import Article

for link in corpus_links:
    a = Article(link)
    a.download()
    a.parse()
    corpus_texts.append(a.text)

# i = 0
# for text in corpus_texts:
#     print(text, i)
#     i += 1

# Write corpus texts to a file
f = open('reuters-corpus-10k.txt', 'w+')
for text in corpus_texts:
    f.write(text + '\n\n\n\n\n\n\n')
f.close()
    doc = ' '.join(filtered_tokens)
    return doc


end_r_speeches = []
# for year in range(9):
#     for speech in agg[9]['R']:
#         news = normalize_document(speech)
#         end_r_speeches.append(news)

from newspaper import Article
import newspaper

rep_sauce = newspaper.build('http://breitbart.com', memoize_articles=False)
for article in rep_sauce.articles:
    urll = article.url
    art = Article(urll)
    art.download()
    art.parse()
    text = normalize_document(art.text)
    end_r_speeches.append(text)

wpt = nltk.WordPunctTokenizer()
tokenized_corpus = [wpt.tokenize(document) for document in end_r_speeches]

# Set values for various parameters
feature_size = 100    # Word vector dimensionality
window_context = 30   # Context window size
min_word_count = 1    # Minimum word count
sample = 1e-3         # Downsample setting for frequent words

r_w2v_model = word2vec.Word2Vec(tokenized_corpus,
"link": value['link'], "articles": [] } for entry in d.entries: # Check if publish date is provided, if no the article is skipped. # This is done to keep consistency in the data and to keep the script from crashing. if hasattr(entry, 'published'): if count > param.num_of_articles: break article = {} article['link'] = entry.link date = entry.published_parsed article['published'] = datetime.fromtimestamp( mktime(date)).isoformat() try: content = Article(entry.link) content.download() content.parse() except Exception as e: # If the download for some reason fails (ex. 404) the script will continue downloading # the next article. print(e) print("proceeding...") continue article['title'] = content.title article['text'] = content.text newsPaper['articles'].append(article) if param._comments: print(count, "articles downloaded from", company, ", url: ", entry.link) count = count + 1
# import the libraries
from newspaper import Article
import random
import string
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import warnings

warnings.filterwarnings('ignore')

# download the punkt package
nltk.download('punkt', quiet=True)

# get the article
article = Article('https://www.mayoclinic.org/diseases-conditions/chronic-kidney-disease/symptoms-causes/syc-20354521')
article.download()
article.parse()
article.nlp()
corpus = article.text

# print the article's text
print(corpus)

# tokenisation
text = corpus
sentence_list = nltk.sent_tokenize(text)
print(sentence_list)


# a function to return a random greeting response to a user's greeting
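# The greeting function itself is not shown above; a minimal sketch of what it might
# look like (the greeting word lists here are assumptions, not from the original):
def greeting_response(text):
    text = text.lower()
    # bot's greeting replies and recognised user greetings
    bot_greetings = ['howdy', 'hi', 'hey', 'hello', 'hola']
    user_greetings = ['hi', 'hey', 'hello', 'hola', 'greetings', 'wassup']
    for word in text.split():
        if word in user_greetings:
            return random.choice(bot_greetings)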
def test_pre_download_nlp(self):
    """Test running NLP algos before even downloading the article
    """
    self.setup_stage('initial')
    new_article = Article(self.article.url)
    self.assertRaises(ArticleException, new_article.nlp)
def process_article(url):
    a = Article(url, request_timeout=3)
    a.download()
    a.parse()
    a.nlp()
    return article_to_dict(a)
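# process_article() calls article_to_dict(), which is not defined here. A minimal
# sketch of what such a converter might look like (an assumption, not the original
# helper), pulling the common fields newspaper exposes after parse() and nlp():
def article_to_dict(a):
    return {
        'url': a.url,
        'title': a.title,
        'authors': a.authors,
        'publish_date': str(a.publish_date) if a.publish_date else None,
        'text': a.text,
        'top_image': a.top_image,
        'keywords': a.keywords,
        'summary': a.summary,
    }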
def parseurl(url):
    article = Article(url)
    article.download()
    article.parse()
    return article
def build(url):
    article = Article(url)
    article.download()
    article.parse()
    article.nlp()
    return article
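# A minimal usage sketch for the helper above, using a URL that appears elsewhere in
# this collection. Because build() also runs nlp(), keywords and summary are populated
# (this assumes NLTK's punkt tokenizer has been downloaded).
art = build('https://www.bbc.com/news/technology-51851292')
print(art.title)
print(art.keywords)
print(art.summary)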
def read_article(url):
    art = Article(url, language='vi')
    # print(fulltext(url, language='vi'))
    art.download()
    art.parse()
    return art.title + '.\n' + art.text + '.\n'
from summarizer import summarize
import newspaper
from newspaper import Article
from pyteaser import SummarizeUrl
# split_sentences, split_words, keywords and score are assumed to come from pyteaser as well
from pyteaser import split_sentences, split_words, keywords, score
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

url = 'http://www.politifact.com/truth-o-meter/statements/2017/jun/22/antinews/its-fake-news-chinese-lunar-rover-found-no-evidenc/'
a = Article(url)
a.download()
a.parse()
TXT = a.text
TITLE = a.title


def Summarize(title, text):
    summaries = []
    sentences = split_sentences(text)
    keys = keywords(text)
    titleWords = split_words(title)
    if len(sentences) <= 1:
        return sentences
    # score sentences, and use the top-ranked sentence
    ranks = score(sentences, titleWords, keys).most_common(1)
    for rank in ranks:
        summaries.append(rank[0])
def get_article_from_url(self, url):
    article = Article(url)
    article.download()
    article.parse()
    article.nlp()
    return self.get_article_obj_from_article(article)
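# get_article_obj_from_article() is defined elsewhere in the same class. A minimal
# sketch of what such a companion method might do (an assumption, not the original):
# map the parsed newspaper Article onto a plain dictionary used by the rest of the app.
def get_article_obj_from_article(self, article):
    return {
        'url': article.url,
        'title': article.title,
        'text': article.text,
        'summary': article.summary,
        'keywords': article.keywords,
        'publish_date': article.publish_date,
    }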
import os


def conv(s):
    try:
        return int(s)
    except ValueError:
        return s


df = pd.read_csv('enk.csv')
query = df["url"].values
# query = ["https://en.wikipedia.org/wiki/Vaccine", "https://en.wikipedia.org/wiki/Vaccine"]

for i in range(len(query)):
    try:
        article = Article(query[i])
        article.download()
        article.parse()
        text = article.text
        blob = TextBlob(text)
        s = Textatistic(text)
        afb = len(article.images)
        vals = requests.get(query[i], timeout=4, allow_redirects=False).elapsed.total_seconds()
        st = "/&callback=process&key=57bf606e01a24537ac906a86dc27891f94a0f587"
        # zz = urlopen(url)
        quez = 'http://api.mywot.com/0.4/public_link_json2?hosts=' + query[i] + st
        stt = urllib.request.urlopen(quez).read()
        stt = str(stt)
        wot = re.findall(r'\d+', stt)
def set_text(self):
    if not self.text and self.url:
        a = Article(self.url)
        a.download()
        a.parse()
        self.text = a.text
def setUp(self):
    """Called before the first test case of this unit begins
    """
    self.article = Article(
        url='http://www.cnn.com/2013/11/27/travel/weather-'
            'thanksgiving/index.html?iref=allsearch')
def getTitle(title):
    # Despite its name, the `title` argument is the article URL to fetch.
    article = Article(title)
    article.download()
    article.parse()
    return article.title
def test_pre_download_parse(self):
    """Calling `parse()` before `download()` should yield an error
    """
    article = Article(self.article.url)
    self.assertRaises(ArticleException, article.parse)
def random_sentences():
    """
    This function uses the Newspaper3k package (https://newspaper.readthedocs.io/en/latest/)
    to download text from CNN articles. The first, second, and last sentence from each
    article are taken and stored. In addition, 100 other random sentences hand-picked by me
    from random websites are added. This new dataset of random sentences is saved to be
    used later.
    :return: nothing, but a dataset of random sentences is outputted
    """
    # First bit, extract the random sentences from the CNN articles
    urls = pd.read_excel("training_sentences.xlsx", sheet_name="Sheet2", index_col=0)

    sentences = []
    for url in urls["url"]:
        # url = "https://www.cnn.com/2021/02/12/tech/facebook-myanmar-military-intl-hnk/index.html"
        print(url)
        article = Article(url)
        article.download()
        try:
            article.parse()
            last_index = len(nltk.sent_tokenize(article.text)) - 1

            # str.replace() returns a new string, so the result must be reassigned
            first_sent = nltk.sent_tokenize(article.text)[0]
            first_sent = first_sent.replace("(CNN) -", '', 1)
            first_sent = first_sent.replace("(CNN)-", '', 1)
            first_sent = first_sent.replace("(CNN)", '', 1)

            second_sent = nltk.sent_tokenize(article.text)[1]
            second_sent = second_sent.replace("(CNN) -", '', 1)
            second_sent = second_sent.replace("(CNN)-", '', 1)
            second_sent = second_sent.replace("(CNN)", '', 1)

            last_sent = nltk.sent_tokenize(article.text)[last_index]
            last_sent = last_sent.replace("(CNN) -", '', 1)
            last_sent = last_sent.replace("(CNN)-", '', 1)
            last_sent = last_sent.replace("(CNN)", '', 1)

            sentences.append(first_sent)
            if first_sent != second_sent:
                sentences.append(second_sent)
            if second_sent != last_sent and first_sent != last_sent:
                sentences.append(last_sent)
            if first_sent == second_sent or first_sent == last_sent or second_sent == last_sent:
                print("There was the same sentence twice. It did not get duplicated.")
        except newspaper.article.ArticleException:
            print('didnt work for that url:', url)

    # Second bit, use the ~100 random sentences that I compiled to even the mix of
    # random to climate related sentences
    extra_sentences = pd.read_excel("training_sentences.xlsx", sheet_name="Sheet3", index_col=0)
    for sentence in extra_sentences["random_sentence"]:
        sentence = sentence.replace('\xa0', ' ')
        sentences.append(sentence)

    ##################################
    # SAVE THE SENTENCES
    ##################################
    print("Saving Sentences...")
    np.save("random.npy", np.array(sentences))
    print("Completed saving sentences!")
    print("Number of random sentences:", len(sentences))
    print(75 * '-')


# ==============================================================================#
# OLD CODE
# ==============================================================================#
# cnn_paper = newspaper.build('https://www.cnn.com')
#
# for article in cnn_paper.articles:
#     print(article.url)
#
# cnn_article_array = []
# for i in range(5):
#     cnn_article = cnn_paper.articles[i]
#     cnn_article.download()
#     cnn_article.parse()
#     cnn_article.nlp()
#     print(cnn_article.text)
def test_article_pdf_fetching(self):
    a = Article(url='https://www.adobe.com/pdf/pdfs/ISO32000-1PublicPatentLicense.pdf')
    a.download()
    self.assertNotEqual('%PDF-', a.html)
def sentiment(self, ticker, number_of_articles=50, text_boolean=False):
    '''
    classifier: default='nb'. Choose between a Naive Bayes classifier (input='nb')
    or NLTK's Sentiment Intensity Analyzer (input='si').

    The method scrapes the top "specified number of articles" from Google News.
    The classifier analyzes each article and averages the negative and positive
    scores to return a dictionary of scores: {'pos': x, 'neg': y}.

    ticker: the ticker symbol you want analyzed.
    number_of_articles: default=50. The number of articles to scrape from Google News.
    text_boolean: default=False. If True, the returned dictionary also includes the
    summary of the article whose score is closest to the average positive/negative
    score, under a third key: {'pos': x, 'neg': y, 'text': summary}.
    '''
    classifier = 'nb'
    ticker = ticker
    articles_examined = number_of_articles
    prefix = 'https://news.google.com/'
    url = 'https://news.google.com/search?q=' + ticker + '&hl=en-US&gl=US&ceid=US%3Aen'
    r1 = requests.get(url)
    coverpage = r1.content
    soup1 = BeautifulSoup(coverpage, 'html5lib')
    coverpage_news = soup1.find_all(
        'div', class_="NiLAwe y6IFtc R7GTQ keNKEd j7vNaf nID9nc")

    links = []
    for article in coverpage_news:
        links.append(prefix + article.a["href"])

    titles = []
    texts = []
    summaries = []
    counter = 0
    for link in links:
        print(link)
        try:
            url = link
            article = Article(url, language="en")
            article.download()
            article.parse()
            article.nlp()
            titles.append(article.title)        # collect the title of the article
            texts.append(article.text)          # collect the entire text of the article
            summaries.append(article.summary)   # collect the summary of the article
            # print(article.keywords)           # the keywords of the article
            counter += 1
            if counter >= articles_examined:
                break
        except newspaper.article.ArticleException:
            continue

    if classifier == 'nb':
        import pickle
        classifier_f = open("naivebayes.pickle", "rb")
        classifier = pickle.load(classifier_f)
        classifier_f.close()

        text_counter = 0
        texts_neg_sum = []
        texts_pos_sum = []
        result_te = ''
        for text in texts:
            print('text')
            prob_dist = classifier.prob_classify(text)
            texts_pos_sum.append(round(prob_dist.prob("pos"), 2))
            texts_neg_sum.append(round(prob_dist.prob("neg"), 2))
            text_counter += 1

        if sum(texts_neg_sum) > sum(texts_pos_sum):
            result_te = 'negative'
        elif sum(texts_neg_sum) < sum(texts_pos_sum):
            result_te = 'positive'

        n_sent = ((sum(texts_neg_sum) / text_counter) * 100)
        p_sent = ((sum(texts_pos_sum) / text_counter) * 100)

        if text_boolean == True:
            sent_list = []
            avg_num = 0
            if sum(texts_neg_sum) > sum(texts_pos_sum):
                sent_list = texts_neg_sum
                avg_num = n_sent
            elif sum(texts_neg_sum) < sum(texts_pos_sum):
                sent_list = texts_pos_sum
                avg_num = p_sent
            closest_sent = min(sent_list, key=lambda x: abs(x - avg_num))
            avg_summary = summaries[sent_list.index(closest_sent)]
            return {'pos': p_sent, 'neg': n_sent, 'text': avg_summary}
        elif text_boolean == False:
            return {'pos': p_sent, 'neg': n_sent}
        else:
            raise ValueError('text_boolean must be either True or False')
    else:
        raise ValueError(
            'Argument must be nb (Naive Bayes classifier) or si (Sentiment Intensity classifier)')
def newspaper_parser(self, sleep_time=5):
    logging.debug('running newspaper_parser() for secure sites...')

    results = []
    count = 0

    profile = webdriver.FirefoxProfile()
    browser = webdriver.Firefox(executable_path=r'gecko\geckodriver.exe')
    credential_names = list(self.credentials.keys())

    browser.get(self.login_url)
    cred1 = browser.find_element_by_id(credential_names[0])
    cred2 = browser.find_element_by_id(credential_names[1])
    cred1.send_keys(self.credentials[credential_names[0]])
    cred2.send_keys(self.credentials[credential_names[1]])
    time.sleep(10)
    browser.find_element_by_class_name(self.submit_id).click()
    time.sleep(10)
    cookies = browser.get_cookies()
    browser.close()

    s = requests.Session()
    for cookie in cookies:
        s.cookies.set(cookie['name'], cookie['value'])

    for l in self.links:
        try:
            page = s.get(l)
        except Exception as e:
            logging.error("issue bundling {} for {}, {}".format(l, self.searchTerm, e))
            print(e)
            time.sleep(20)
            continue

        soup = BeautifulSoup(page.content, features="lxml")

        article = Article(url=l)
        article.set_html(str(soup))
        article.parse()
        article.nlp()
        up_date = article.publish_date

        if self.newspaper == 'Wall Street Journal':
            soup = BeautifulSoup(article.html, features="lxml")
            # if no articles, stop
            pub_date = soup.find("meta", {"name": "article.published"}).get("content", None)
            up_date = soup.find("meta", {"name": "article.updated"}).get("content", None)
            article.publish_date = pub_date

        data = {
            'search': self.searchTerm,
            'title': article.title,
            'date_published': article.publish_date,
            'date_updated': up_date,
            'news_outlet': self.newspaper,
            'authors': article.authors,
            # 'feature_img': article.top_image,
            'article_link': article.canonical_link,
            'keywords': article.keywords,
            # 'movies': article.movies,
            'summary': article.summary,
            'text': article.text,
            'html': article.html,
        }
        results.append(data)

        time.sleep(sleep_time)
        count += 1

    print("done for ", self.searchTerm)
    return results
def get_twittable_sentences_from_url(url):
    article = Article(url)
    article.download()
    article.parse()
    best_sentences = nlp.summarize(title=article.title, text=article.text)
    return [s for s in best_sentences if len(s) < 140]
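# The helper above assumes `nlp` is newspaper's bundled summarization module
# (from newspaper import nlp) with its stopwords already loaded. A minimal usage
# sketch under that assumption, reusing a URL that appears elsewhere in this collection:
from newspaper import nlp

for sentence in get_twittable_sentences_from_url('https://www.bbc.com/news/technology-51851292'):
    print(len(sentence), sentence)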
import webbrowser
import newspaper
from newspaper import Article

url = "https://www.hindustantimes.com/india-news/pm-modi-asks-g20-for-an-effective-global-response-to-coronavirus-reports/story-myRgcYwmAhEX077ZZdGCbP.html"
toi_article = Article(url, language="en")  # en for English
toi_article.download()
toi_article.parse()
toi_article.nlp()

f = open('art.html', 'w')
a = "https://www.hindustantimes.com/rf/image_size_960x540/HT/p2/2020/03/26/Pictures/ahmedabad-ahmedabad-unorganised-hindustan-addresses-siddharaj-minister_1342d1d6-6f7e-11ea-ad54-628e87a77846.jpg"
message = """<html>
<head></head>
<body>
<h1>{title}</h1>
<h3>Authors:{auth}</h3>
<img src="{URL}">
<article>
<h4>Article Publish Date: {date} </h4>
<p>Summary:{summ}</p>
<p>Detailed News: {text}</p>
</article>
</body>
</html>"""
new_message = message.format(URL=a,
                             title=toi_article.title,
                             auth=toi_article.authors,
def getArticle(url):
    a = Article(url, language='ko')
    a.download()
    a.parse()
    return a
def __init__(self, url):
    self.article = NewsPlease.from_url(url)
    self.content = Article(url)
    self.content.download()
    self.content.parse()
def parse_content(self, response):
    # This function handles the detailed parsing of a news article.
    ID = 'songtengteng'
    website_name = '商务部贸易救济调查局'

    # Website section
    website_block = response.xpath(
        "//div[@class='position']/a[2]/text()").extract_first()

    news_url = response.meta['url']

    # Author
    news_author_list = response.xpath('//script')
    if len(news_author_list) != 0:
        news_author = news_author_list.re(
            'v.{2}\ss.{4}e\s=\s\"[\u4e00-\u9fa5]+\"')
        if news_author != []:
            news_author = news_author[0][13:].replace('"', '')
        else:
            news_author = '商务部贸易救济调查局'
    else:
        news_author = '商务部贸易救济调查局'

    # News publish time, normalized to the format YYYY MM DD HH:Mi:SS
    publish_time = response.meta['publish_time']
    year = publish_time[0:4]
    month = publish_time[5:7]
    day = publish_time[8:10]
    juti_time = publish_time[-8:]
    publish_time = year + month + day + ' ' + juti_time

    # Tags embedded in the news page
    news_tags = response.xpath('//script').re(
        'v.{2}\sc.+e\s=\s\"[\u4e00-\u9fa5]+\"')[0][14:].replace('"', '')

    # News title
    news_title = response.xpath('//h3/text()').extract_first()

    # News body text
    a = Article(response.url, language='zh')  # Chinese
    a.download()
    a.parse()
    news_content = a.text

    # Collect the article's images and build their file names
    image_urls = []
    image_names = []
    image_urls1 = response.xpath(
        '//p[@class="detailPic"]/img/@src|//div[@class="article_con"]/center/img/@src|//p[@style="text-align: center"]/img/@src'
    ).extract()
    if image_urls1 != []:
        image_urls = image_urls1
    for i in range(len(image_urls)):
        if i < 10 and i >= 0:
            image_name = news_title + '_000' + str(i)
            image_names.append(image_name)
        elif i < 100 and i >= 10:
            image_name = news_title + '_00' + str(i)
            image_names.append(image_name)
        elif i < 1000 and i >= 100:
            image_name = news_title + '_0' + str(i)
            image_names.append(image_name)
        else:
            image_name = news_title + str(i)
            image_names.append(image_name)

    yield self.getItem(
        id=ID,
        news_url=news_url,
        website_name=website_name,
        website_block=website_block,
        news_title=news_title,
        publish_time=publish_time,
        news_author=news_author,
        news_tags=news_tags,
        news_content=news_content,
        image_urls=image_urls,
        image_names=image_names,
    )
from urllib.parse import urlparse
import newspaper
from newspaper import Article

urlFileRead = open("newspaperURLS.txt", "r")
urlRead = urlFileRead.readlines()
urlFileAppend = open("newspaperURLS.txt", "a")

cnn_paper = newspaper.build('http://cnn.com')
for article in cnn_paper.articles:
    urlFileAppend.write(article.url + '\n')  # one URL per line

textfile_url = open("urltextfiles.txt", "w")

for urlLine in urlRead:
    first_article = Article(url=urlLine.strip())  # strip the trailing newline from the file line
    first_article.download()
    print(first_article.html)
    first_article.parse()
    print(first_article.publish_date)
    textfile_url.write(first_article.text)
    '''
    urlFinal = "\"" + urlLine.rstrip() + "\""
    print(urlFinal)
    '''
from newspaper import Article
import time
import sys, os
import json

# first filename will be i+1
# so i should be the latest number given to an article
i = 0

with open("links-sz-2020-07-13.txt", "r") as link_file:
    all_lines = link_file.readlines()

for link in all_lines[0:3]:
    article = Article(link.strip())  # strip the trailing newline from the link
    try:
        print(i, ": ", link)
        article.download()
        print(article.title)
        time.sleep(2)
        article.parse()
        article.nlp()
        # article.fetch_images()  # I am not working with images at the moment

        ## generate a filename
        i = i + 1
        filename = f'{i:05}'
        # should check, if file exists ...

        keep = article.meta_data['og']
        keep['authors'] = article.authors
        keep['text-link'] = filename
from newspaper import Article
import nltk
from gtts import gTTS
import os

# BBC News: Coronavirus: How can AI help fight the pandemic?
url = 'https://www.bbc.com/news/technology-51851292'

# Getting the article text using the newspaper3k library
article = Article(url)
article.download()
article.parse()

nltk.download('punkt')

# Applying NLP
article.nlp()

# Storing the first 250 characters of the article's text in article_text
article_text = article.text[:250]
print("First 250 characters of the article: \n", article_text)

# Selecting the article's language
article_language = 'en'  # English

# Applying Google Text-to-Speech
text_to_speech = gTTS(text=article_text, lang=article_language, slow=False)

# Saving the audio file
text_to_speech.save("text_to_speech_article.mp3")
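# The script above saves the audio but never plays it. One possible way to play it back
# (an assumption, not part of the original script) is the playsound package,
# installable with `pip install playsound`:
from playsound import playsound

playsound("text_to_speech_article.mp3")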
def read_doc(record):
    url = record.url
    article = None
    if url:
        article = Article(url, language="en")
    return url, article
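# read_doc() only constructs the Article; the caller still has to download and parse it.
# A minimal usage sketch with a hypothetical record object (the namedtuple and URL here
# are assumptions for illustration):
from collections import namedtuple

Record = namedtuple('Record', ['url'])

url, article = read_doc(Record(url='https://www.bbc.com/news/technology-51851292'))
if article is not None:
    article.download()
    article.parse()
    print(article.title)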