import pickle

from newsplease import NewsPlease
from tqdm import tqdm


def get_data(path, destination):
    # Collect unique links (the first token of each line), skipping entries
    # too short to be real URLs.
    links_list = set()
    with open(path, 'r', encoding='utf-8') as file:
        for line in file:
            link = line.split()[0]
            if len(link) < 10:
                continue
            links_list.add(link)
    links_list = list(links_list)

    final_outputs = {}
    important_keys = [
        'authors', 'date_publish', 'description', 'image_url', 'language',
        'title', 'maintext'
    ]

    # Crawl in batches; stepping by the batch size (rather than iterating
    # len(links_list) // batch_size times) also covers the final partial batch.
    batch_size = 200
    for i in tqdm(range(0, len(links_list), batch_size)):
        keys = links_list[i:i + batch_size]
        values = NewsPlease.from_urls(keys, timeout=6)
        for key, value in values.items():
            # Keep only the fields of interest (bug fix: the original indexed
            # paper_data by the URL instead of the field name).
            paper_data = {
                im_key: value.__dict__[im_key] for im_key in important_keys
            }
            final_outputs[key] = paper_data

    with open(destination, 'wb') as out_file:
        pickle.dump(final_outputs, out_file)
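# Illustrative usage of get_data; 'links.txt' and 'articles.p' are
# hypothetical file names, and the input file is expected to contain one URL
# at the start of each line:

get_data('links.txt', 'articles.p')

with open('articles.p', 'rb') as f:
    crawled = pickle.load(f)
print(len(crawled), 'articles crawled')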
import logging

from newsplease import NewsPlease

logger = logging.getLogger(__name__)


def parallel_crawl(urls):
    # Split the URLs into batches and crawl each batch with NewsPlease.
    url_batches = get_url_batches(urls)
    crawled_data = []
    for batch in url_batches:
        data = NewsPlease.from_urls(batch)
        crawled_data.append(data)
    # Merge the per-batch results into a single dict.
    data_crawled = get_dict(crawled_data)
    logger.info(f"got response {data_crawled}")
    return data_crawled
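# get_url_batches and get_dict are not defined in this snippet; a minimal
# sketch of what they might look like (the default batch size of 50 is an
# assumption):

def get_url_batches(urls, batch_size=50):
    # Return consecutive slices of the URL list.
    return [urls[i:i + batch_size] for i in range(0, len(urls), batch_size)]


def get_dict(crawled_data):
    # Flatten the list of per-batch {url: article} dicts into one dict.
    merged = {}
    for batch_result in crawled_data:
        merged.update(batch_result)
    return merged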
import pickle

from newsplease import NewsPlease


def article_crawler():
    # Crawl each slice of URLs in `batch` and pickle the results to their own
    # file, skipping slices that fail.
    n = 0
    for i in range(len(batch)):
        try:
            url_slice = batch[i]  # renamed: `slice` shadows the built-in
            print(n)
            slice_name = str(i) + '-NewsPlease-articleCrawl.p'
            article_information = NewsPlease.from_urls(url_slice)
            print(article_information)
            with open(slice_name, 'wb') as out_file:
                pickle.dump(article_information, out_file)
            n += 1
        except Exception:
            continue
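# `batch` is a module-level name not defined in this snippet, presumably a
# list of URL lists. A hypothetical way to build it from a flat URL list
# (the slice size of 100 and the example URLs are made up):

all_urls = ['https://example.com/article-1', 'https://example.com/article-2']
batch = [all_urls[i:i + 100] for i in range(0, len(all_urls), 100)]
article_crawler()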
import datetime

from newsplease import NewsPlease


def download_articles(search_term, n_articles, start, end=None):
    start_date = datetime.datetime.strptime(start, "%Y-%m-%d")
    end_date = start_date if end is None else datetime.datetime.strptime(
        end, "%Y-%m-%d")
    tbs = get_tbs(start_date, end_date)
    urls = find_urls(search_term, tbs, 10)
    valid_articles = []
    while len(valid_articles) < n_articles and len(urls) > 0:
        # Bug fix: request however many articles are still needed, instead of
        # the hard-coded 5 in the original.
        articles_left = n_articles - len(valid_articles)
        articles = NewsPlease.from_urls(urls[:articles_left])
        # Drop articles that came back without usable content.
        empty, articles = detect_empty_articles(articles)
        for new in articles:
            valid_articles.append(articles.get(new))
        urls = urls[articles_left:]
    return {
        "search_term": search_term,
        "start": start,
        "end": start if end is None else end,
        "articles": valid_articles
    }
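# Illustrative call; get_tbs, find_urls and detect_empty_articles are helpers
# assumed to be defined elsewhere in this project, and the query and dates
# are made up:

result = download_articles('quantum computing', n_articles=5,
                           start='2020-01-01', end='2020-01-31')
print(result['search_term'], len(result['articles']))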
# -*- coding: utf-8 -*-
"""Untitled3.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1nNtRq9yovCwQmXpYk6Ugq1Uufz7tMthT
"""

from newsplease import NewsPlease
import time

url1 = 'https://timesofindia.indiatimes.com/india/bengaluru-firm-to-build-moon-lander-for-nasa-2020-mission/articleshow/69684821.cms'
url2 = 'https://www.nytimes.com/2017/02/23/us/politics/cpac-stephen-bannon-reince-priebus.html?hp'

# Build a list of 100 copies of the same URL to time a batched crawl.
url = []
for i in range(100):
    url.append(url2)

tic = time.time()
a = NewsPlease.from_urls(url)
toc = time.time()
print(toc - tic)
print(a[url2].title)
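# For comparison, one might also time a single-article fetch with
# NewsPlease.from_url (this follow-up is not part of the original notebook):

tic = time.time()
single = NewsPlease.from_url(url2)
toc = time.time()
print(toc - tic)
print(single.title)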
import json
import re

import gensim
import pandas as pd
from gensim.corpora import Dictionary
from newsapi import NewsApiClient
from newsplease import NewsPlease
from nltk.corpus import stopwords

# Article, tokenize, remove_stopwords, format_topics_sentences, get_topics,
# NUM_TOPICS and path_to_lda_all are defined elsewhere in this project.


def main(argv):
    if len(argv) < 2:
        print("Usage: crowley date output_file")
        return
    date = argv[0]
    outputfile = argv[1]
    print(f'Date is "{date}"')
    print(f'Output file is "{outputfile}"')

    newsapi = NewsApiClient(open("token", 'r').read())

    # Get article URLs
    qc_articles = newsapi.get_everything(q='quantum computing',
                                         from_param=date,
                                         language='en',
                                         sort_by='relevancy')
    qp_articles = newsapi.get_everything(q='quantum physics',
                                         from_param=date,
                                         language='en',
                                         sort_by='relevancy')
    all_urls = []
    for article in qc_articles.get('articles'):
        all_urls.append(article.get('url'))
    for article in qp_articles.get('articles'):
        all_urls.append(article.get('url'))
    print("All articles ", len(all_urls))

    # Get content of URLs
    all_articles = NewsPlease.from_urls(all_urls)
    articles = {}
    for article in all_articles.values():
        articles[article.title] = Article(article.image_url, article.url,
                                          article.maintext)

    # Write the articles to the output file as JSON (mode 'w' already
    # truncates, so the original's separate truncate(0) call was redundant).
    json_data = {}
    json_article = {}
    for article in articles:
        art_value = articles[article]
        if art_value.url is not None and article is not None and \
                art_value.image_url is not None and art_value.maintext is not None:
            json_article[art_value.url] = art_value.maintext
            print("Successfully wrote article %s " % article)
    json_data['content'] = json_article
    try:
        with open(outputfile, 'w') as out_file:
            out_file.write(json.dumps(json_data))
    except OSError:
        print("Failed to open file %s for writing" % outputfile)

    # Clean the article texts: strip e-mail addresses, collapse whitespace
    # and drop apostrophes before tokenizing.
    df = pd.read_json(outputfile)
    data = df.content.values.tolist()
    data = [re.sub(r'\S*@\S*\s?', '', sent) for sent in data]
    data = [re.sub(r'\s+', ' ', sent) for sent in data]
    data = [re.sub(r"\'", "", sent) for sent in data]
    words_tokenized = tokenize(data)

    stop_words = stopwords.words('english')
    stop_words.extend([
        'from', 'subject', 're', 'edu', 'use', 'to', 'the', 'of', 'a', 'and',
        'that', 'in', 'is', 'can', 'with', 'for', 'are', 'has'
    ])
    without_stopwords = remove_stopwords(words_tokenized, stop_words)

    # Bug fix: the dictionary and corpus are needed below by
    # format_topics_sentences even when a trained model is loaded from disk,
    # so build them unconditionally instead of only in the training branch.
    print("Will generate dictionaries.")
    dictionary_all = Dictionary(without_stopwords)
    print("Will generate corpus")
    corpus_all = [dictionary_all.doc2bow(text) for text in without_stopwords]

    # Load the LDA model from disk, or train a new one if none exists.
    try:
        ldamodel_all = gensim.models.ldamodel.LdaModel.load(path_to_lda_all)
    except Exception:
        print("Could not find models on disk! Will train.")
        print("Will begin training...")
        ldamodel_all = gensim.models.ldamodel.LdaModel(corpus=corpus_all,
                                                       num_topics=NUM_TOPICS,
                                                       id2word=dictionary_all,
                                                       update_every=5,
                                                       chunksize=10000,
                                                       passes=1)
        ldamodel_all.save(path_to_lda_all)
        print("Done training models. Saved them on disk.")
    df_topic_sents_keywords = format_topics_sentences(
        ldamodel=ldamodel_all, corpus=corpus_all, texts=without_stopwords)

    # Format
    df_dominant_topic = df_topic_sents_keywords.reset_index()
    df_dominant_topic.columns = [
        'Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords',
        'Text'
    ]
    # Show
    print(df_dominant_topic.head(5))

    # Keep the single most representative sentence for each topic
    # (the original comment said "top 5", but the code takes head(1)).
    sent_topics_sorteddf_mallet = pd.DataFrame()
    sent_topics_outdf_grpd = df_topic_sents_keywords.groupby('Dominant_Topic')
    for i, grp in sent_topics_outdf_grpd:
        sent_topics_sorteddf_mallet = pd.concat([
            sent_topics_sorteddf_mallet,
            grp.sort_values(['Perc_Contribution'], ascending=[0]).head(1)
        ], axis=0)

    # Reset index
    sent_topics_sorteddf_mallet.reset_index(drop=True, inplace=True)
    # Format
    sent_topics_sorteddf_mallet.columns = [
        'Topic_Num', "Topic_Perc_Contrib", "Keywords", "Text"
    ]
    # Show
    print(sent_topics_sorteddf_mallet.head())

    all_topics = get_topics(ldamodel=ldamodel_all, num_words=10)
    for i, topic in enumerate(all_topics):
        print(i, topic)
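# Illustrative entry point for the script above; the command-line usage
# string in main suggests this invocation:

if __name__ == '__main__':
    import sys
    main(sys.argv[1:])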
urls = [
    'https://www.am.com.mx/leon',
    'https://www.am.com.mx/sanfranciscodelrincon',
    'https://www.mural.com/',
    'https://www.eldiariodechihuahua.mx/Delicias/',
    'https://www.elsoldeparral.com.mx/',
    'https://www.elnorte.com/',
    'http://www.el-mexicano.com.mx/inicio.htm',
    'https://www.elsudcaliforniano.com.mx/',
    'https://www.diariodequeretaro.com.mx/',
    'https://www.eloccidental.com.mx/',
    'https://www.elsoldemexico.com.mx/',
    'https://www.lavozdelafrontera.com.mx/',
    'https://www.elsoldesanluis.com.mx/',
    'http://www.milenio.com/temas/torreon',
    'http://www.milenio.com/estado-de-mexico',
    'http://www.milenio.com/leon',
    'http://www.milenio.com/hidalgo',
    'http://www.milenio.com/jalisco',
    'http://www.milenio.com/monterrey',
    'http://www.milenio.com/puebla',
    'http://www.milenio.com/tamaulipas',
    'http://www.milenio.com/temas/xalapa'
]

articles = NewsPlease.from_urls(urls, timeout=3)
for url in urls:
    # Guard against URLs that failed to download within the timeout.
    article = articles.get(url)
    if article is not None:
        insert_data_into_db(article)
retrieve_records()
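# insert_data_into_db and retrieve_records are not defined in this snippet.
# A minimal sqlite3-based sketch of what they might do (the database file,
# table layout and chosen columns are all assumptions):

import sqlite3

conn = sqlite3.connect('articles.db')
conn.execute('CREATE TABLE IF NOT EXISTS articles '
             '(url TEXT PRIMARY KEY, title TEXT, maintext TEXT)')


def insert_data_into_db(article):
    # Store the fields NewsPlease exposes on each crawled article object.
    conn.execute('INSERT OR REPLACE INTO articles VALUES (?, ?, ?)',
                 (article.url, article.title, article.maintext))
    conn.commit()


def retrieve_records():
    for row in conn.execute('SELECT url, title FROM articles'):
        print(row)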