def get_articles(output_path):
    config = dict(
        language='pl',
        fetch_images=False,
        MIN_WORD_COUNT=100,
        MIN_SENT_COUNT=5,
        memoize_articles=False,
    )
    papers = {}
    for url in SITES_URLS:
        paper = newspaper.build(url, **config)
        print(f"{url} contains {paper.size()} articles")
        papers[url] = paper

    print("Downloading...")
    news_pool.set(papers.values(), threads_per_source=2)
    news_pool.join()

    print("Parsing...")
    for paper in papers.values():
        paper.parse_articles()

    articles = [
        art.text for paper in papers.values() for art in paper.articles
    ]
    articles = [art.replace("\n", "") for art in articles]
    print(f"Scraped {len(articles)} articles")

    with open(output_path, "w", encoding='utf-8') as f:
        json.dump(articles, f)
def download_and_parse(self, url_list=None):
    if not self.submissions and url_list is None:
        logging.error('Must give a "url_list" or "query_reddit" first.')
        return
    self.set_articles(url_list)
    news_pool.set(self.articles)
    news_pool.join()
def loadNews(knownrealSites, s):
    articles = Article.objects.all()
    #for a in articles:
    #    print(a)
    papers = []
    for url in knownrealSites:
        real_paper = None
        try:
            real_paper = newspaper.build(url)
            papers.append(real_paper)
            print(url + ' contains ' + str(len(real_paper.articles)) + ' ' + s + ' articles')
        except:
            print(url)
            print('url is bad')
            continue

    news_pool.set(papers, threads_per_source=4)
    news_pool.join()

    for paper in papers:
        for article in paper.articles:
            # due to multithreading above we can assume every article has had download called on it.
            #for article in real_paper.articles:
            try:
                #article.download()
                article.parse()
                #print('article.authors:**************************\n'); print(article.authors)
                #print('article.text:**************************\n'); print(article.text)
                #print('article.url:**************************\n'); print(article.url)
                #print('article.title:**************************\n'); print(article.title)
                #article.nlp()
                #print('keywords:**************************\n'); print(article.keywords)
                #print('summary:**************************\n'); print(article.summary)
            except:
                print('issue with download/parse')
                continue
            #x,y,z = tweetParser.getSentiment(url,2000)
            #print(article.publish_date)
            a = Article(
                address=article.url,
                title=article.title,
                body=article.text,
                date=article.publish_date,
                result=s,
                #positive = x,
                #negative = y,
                #neutral = z,
            )
            #article.parse()
            #article.nlp()
            try:
                a.save()
                print('**************************article SAVED**************************')
            except:
                print('**************** article failed to save with field **************')
                continue
def get_newspapers(source_urls):
    papers = []
    for url in source_urls:
        papers.append(newspaper.build(url))
    news_pool.set(papers, threads_per_source=2)
    news_pool.join()
    return papers
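A possible usage sketch for get_newspapers() above; the source URL and the parse step are illustrative assumptions, not part of the original snippet.

# Hypothetical caller of get_newspapers(); the URL is a placeholder.
papers = get_newspapers(["https://example-news-site.com"])
for paper in papers:
    for article in paper.articles:
        article.parse()  # HTML was already downloaded by news_pool; parse() extracts fields
        print(article.title)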
def scrape_news():
    #t = time.time()
    ## connect
    client = MongoDB()
    with client as db:
        # #connect (not necessary)
        # connect(db)

        ## multi-threading
        eu_paper = newspaper.build('https://theguardian.com', memoize_articles=False, fetch_images=False)
        us_paper = newspaper.build('https://www.cbsnews.com/', memoize_articles=False, fetch_images=False)
        hk_paper = newspaper.build('http://scmp.com', memoize_articles=False, fetch_images=False)
        jp_paper = newspaper.build('https://www.japantimes.co.jp/', memoize_articles=False, fetch_images=False)

        papers = [eu_paper, us_paper, hk_paper, jp_paper]
        news_pool.set(papers, threads_per_source=2)  # (4*2) = 8 threads total
        news_pool.join()

        print("Size of EU paper: " + str(eu_paper.size()))
        print("Size of US paper: " + str(us_paper.size()))
        print("Size of HK paper: " + str(hk_paper.size()))
        print("Size of JP paper: " + str(jp_paper.size()))

        for paper in papers:
            for article in paper.articles:
                try:
                    article.parse()
                    print(len(article.text))
                    if len(article.text) > 100:
                        article.nlp()
                        item = {
                            'url': article.url,
                            'brand': paper.brand,
                            'title': article.title,
                            'text': article.text,
                            'keywords': article.keywords,
                            'summary': article.summary,
                            'date': dt.today(),
                            'date_str': dt.today().strftime('%Y-%m-%d')
                        }
                        db.news_items.insert_one(item)
                except Exception as e:
                    # In case it fails, skip article
                    print(e)
                    print("continuing...")
                    continue
def build_sources(self, param):
    replies = list()
    for sources in param:
        replies.append(
            newspaper.build('http://' + str(sources) + '.com', language='en'))
    news_pool.set(replies, threads_per_source=3)
    news_pool.join()
    return replies
def __init__(self):
    # create list containing news sites to scrape
    self.web_list = ['http://www.foxnews.com', 'http://www.usatoday.com']
    # setup newspaper to multi-thread news sources
    self.newsWebList = [newspaper.build(i, memoize_articles=True, fetch_images=False)
                        for i in self.web_list]
    news_pool.set(self.newsWebList, threads_per_source=10)
    news_pool.join()
    self.connectDB()
    self.compareArticle()
def download_all_articles(self):
    logging.info("Downloading all articles...")
    papers = self.create_source_feed_list()
    # Download feed from all sources in parallel threads
    news_pool.set(papers, threads_per_source=self.THREADS_PER_NEWS_SOURCE)
    news_pool.join()
    logging.info("Download complete.")
    logging.info(datetime.now())
def get_who_articles(self):
    covid_articles = newspaper.build(self.newspaper_link, memoize_articles=False)
    papers = [covid_articles, ]
    news_pool.set(papers, threads_per_source=4)
    news_pool.join()
    for index, article in enumerate(covid_articles.articles):
        print(article.url)
        article.parse()
        write_file = open('sites/articles/article' + str(index) + '.txt', 'w', encoding='utf-8')
        write_file.write(str(article.title) + "\n")
        write_file.write(textwrap.fill(article.text, width=120))
        write_file.close()
def main() -> List[List[str]]:
    papers = [newspaper.build(url, memoize_articles=False, fetch_images=False, verbose=DEBUG)
              for url in SITE_URLS]
    news_pool.set(papers, threads_per_source=THREADS_PER_SOURCE)
    news_pool.join()
    articles = []
    for paper in papers:
        articles.extend(get_articles(paper))
    print('Final number of articles:', len(articles))
    return articles
def test_download_works(self):
    config = Configuration()
    config.memoize_articles = False
    slate_paper = newspaper.build('http://slate.com', config=config)
    tc_paper = newspaper.build('http://techcrunch.com', config=config)
    espn_paper = newspaper.build('http://espn.com', config=config)
    print('Slate has %d articles TC has %d articles ESPN has %d articles' %
          (slate_paper.size(), tc_paper.size(), espn_paper.size()))
    papers = [slate_paper, tc_paper, espn_paper]
    news_pool.set(papers, threads_per_source=2)
    news_pool.join()
    print('Downloaded Slate mthread len', len(slate_paper.articles[0].html))
    print('Downloaded ESPN mthread len', len(espn_paper.articles[-1].html))
    print('Downloaded TC mthread len', len(tc_paper.articles[1].html))
def download_newspapers(sources):
    '''
    This function will download the data from the newspapers in the sources
    variable, and then save them to mongodb database.
    '''
    res = []
    paper = []
    l = newspaper.build(sources, memoize_articles=False)
    paper.append(l)
    news_pool.set(paper, threads_per_source=2)
    news_pool.join()
    for r in paper[0].articles:
        res.append(r)
    return res
def test_download_works(self):
    config = Configuration()
    config.memoize_articles = False
    slate_paper = newspaper.build('http://slate.com', config=config)
    tc_paper = newspaper.build('http://techcrunch.com', config=config)
    espn_paper = newspaper.build('http://espn.com', config=config)
    print('slate has %d articles tc has %d articles espn has %d articles' %
          (slate_paper.size(), tc_paper.size(), espn_paper.size()))
    papers = [slate_paper, tc_paper, espn_paper]
    news_pool.set(papers, threads_per_source=2)
    news_pool.join()
    print('Downloaded slate mthread len', len(slate_paper.articles[0].html))
    print('Downloaded espn mthread len', len(espn_paper.articles[-1].html))
    print('Downloaded tc mthread len', len(tc_paper.articles[1].html))
def collect_news():
    papers = []
    papers.append(newspaper.build('http://cnn.com', memoize_articles=True))
    papers.append(newspaper.build('http://www.bbc.com/news', memoize_articles=True))
    papers.append(newspaper.build('http://news.sky.com/world', memoize_articles=True))
    papers.append(newspaper.build('https://nytimes.com/section/world', memoize_articles=True))
    papers.append(newspaper.build('https://washingtonpost.com/world', memoize_articles=True))
    papers.append(newspaper.build('http://reuters.com/news/world', memoize_articles=True))

    news_pool.set(papers, threads_per_source=1)
    news_pool.join()

    news_list = []
    categories = fetch_20newsgroups(subset='train', shuffle=True)
    clf = joblib.load(os.path.join(os.path.dirname(__file__), 'model.pkl'))

    for paper in papers:
        for current_article in itertools.islice(paper.articles, 0, 5):
            current_article.download()
            current_article.parse()
            current_article.nlp()
            news_to_add = {
                'title': current_article.title,
                'keywords': current_article.keywords,
                'url': current_article.url,
                'category': news_predictor([current_article.text], categories, clf),
                'source': paper.brand,
                'collected': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            }
            news_list.append(news_to_add)
    return news_list
def extract_articles(url_list):
    """Extracts article text and keywords from url.

    Inputs
    ------
    url_list: list

    Returns
    -------
    generator with keywords parsed from article url list
    """
    articles = [Article(url) for url in url_list]
    news_pool.set(articles)
    news_pool.join()
    r = Rake()
    for article in articles:
        article.parse()
        r.extract_keywords_from_text(article.text)
        article_kwords = r.get_ranked_phrases()
        yield article_kwords
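A short, hedged usage sketch for the extract_articles() generator above; the URL is a placeholder assumption.

# Hypothetical caller; prints the top RAKE-ranked phrases per article.
for keywords in extract_articles(["https://example.com/some-article"]):
    print(keywords[:10])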
def main():
    # Build a news source
    # use memoize_articles flag to turn off article caching
    fox = newspaper.build("http://www.foxnews.com", memoize_articles=False)
    print(fox.size())
    msnbc = newspaper.build("http://www.msnbc.com", memoize_articles=False)
    print(msnbc.size())
    bbc = newspaper.build("http://www.bbc.com", memoize_articles=False)
    print(bbc.size())

    papers = [fox, msnbc, bbc]
    news_pool.set(papers, threads_per_source=2)  # 6 total
    news_pool.join()

    # extract and save articles
    saveFile("fox.json", downloadAndParse(fox))
    saveFile("msnbc.json", downloadAndParse(msnbc))
    saveFile("bbc.json", downloadAndParse(bbc))
def test_download_works(self):
    """
    """
    config = Configuration()
    config.is_memoize_articles = False
    slate_paper = newspaper.build('http://slate.com', config)
    tc_paper = newspaper.build('http://techcrunch.com', config)
    espn_paper = newspaper.build('http://espn.com', config)
    print 'slate has %d articles tc has %d articles espn has %d articles' \
        % (slate_paper.size(), tc_paper.size(), espn_paper.size())
    papers = [slate_paper, tc_paper, espn_paper]
    news_pool.set(papers, threads_per_source=2)
    news_pool.join()
    print 'Downloaded slate mthread len', len(slate_paper.articles[0].html)
    print 'Downloaded espn mthread len', len(espn_paper.articles[-1].html)
    print 'Downloaded tc mthread len', len(tc_paper.articles[1].html)
def pool():
    # Download all new news articles from our sources
    papers = []
    user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:78.0) Gecko/20100101 Firefox/78.0'
    config = Config()
    config.browser_user_agent = user_agent
    config.request_timeout = 10
    config.memoize_articles = False

    # Build a model of all articles from the website. Get only those we haven't retrieved before
    reuters_paper = newspaper.build('https://www.reuters.com/', memoize_articles=False)  #, request_timeout=10)
    bbc_paper = newspaper.build('https://www.bbc.co.uk/news', memoize_articles=False)  #, request_timeout=10)

    # We add the models of the news sources
    papers.append(reuters_paper)
    papers.append(bbc_paper)

    news_pool.set(papers, threads_per_source=8)
    news_pool.join()
    return papers
def main(argv):
    TOP_PATH = os.path.dirname(__file__)
    OUT_PATH = os.path.join(TOP_PATH, 'output')
    if not os.path.exists(OUT_PATH):
        os.makedirs(OUT_PATH)

    # Our permanent config for crawling
    config = Config()
    config.language = 'id'
    config.MIN_SENT_COUNT = 20
    config.memoize = False
    config.fetch_images = False
    config.verbose = True

    # Get contents of our source file
    sourcefile = os.path.join(TOP_PATH, "sources.txt")
    with open(os.path.join(sourcefile), 'r') as f:
        sourcelist = f.read().strip().split('\n')

    # Initialize our sources
    sources = [IntiSource(source, config=config) for source in sourcelist]

    # Make domain directories inside our output path and build sources
    for s in sources:
        if not os.path.exists(os.path.join(OUT_PATH, s.domain)):
            dom_path = os.path.join(OUT_PATH, s.domain)
            os.makedirs(dom_path)
        # Build
        s.build()
        if config.verbose:
            s.print_summary()

    # Multithreaded source downloading and parsing
    news_pool.set(sources, threads_per_source=4)
    news_pool.join()
    article_parse(sources)
def scrape_website(urls):
    papers = []
    for url in urls:
        if url:
            papers.append(newspaper.build(url, memoize_articles=False))

    for paper in papers:
        delete_queue = []  # articles to be deleted
        for article in paper.articles:
            if 'video' in article.url or 'videos' in article.url:
                delete_queue.append(article)
        for article in delete_queue:
            paper.articles.remove(article)

    news_pool.set(papers, threads_per_source=2)  # (2*2) = 4 threads in all
    news_pool.join()

    for paper in papers:
        paper.parse_articles()

    es = ElasticStorage.get_instance(dev=False)
    for paper in papers:
        es.store_articles(paper.articles, paper.url)
def scrape_articles(domains=DOMAINS):
    """Crawls domains and scrapes new web articles.
    """
    papers = [newspaper.build(s, memoize_articles=False) for s in domains]
    news_pool.set(papers, threads_per_source=1)
    news_pool.join()
    for domain, paper in zip(domains, papers):
        paper_source = parse_source(domain)
        for article in paper.articles:
            article_source = parse_source(article.url)
            if article_source != paper_source:
                continue
            article.parse()
            a = Article(url=article.url,
                        title=article.title,
                        text=article.text,
                        image=article.top_image,
                        domain=domain)
            a.save()
    n_articles = sum(map(lambda p: len(p.articles), papers))
    logmsg = '{} articles crawled'.format(n_articles)
    logger.info(logmsg)
print(firstarg, secondarg)
x = 0
allpapers = newspaper.popular_urls()
for n in allpapers:
    x += 1
if int(secondarg) < x:
    print("there are that many papers", x)

sourcearts = []
for paper in allpapers[int(firstarg):int(secondarg)]:
    sourcepaper = newspaper.build(paper)
    sourcearts.append(sourcepaper)

poolset = news_pool.set(sourcearts, threads_per_source=3)  # (3*2) = 6 threads total
pooljoin = news_pool.join()

iart = 0
for iart in range(len(sourcearts)):
    print("newspaper {}: {}".format(iart + 1, sourcearts[iart].size()))
iart = 0

try:
    connection = mysql.connector.connect(host='localhost',
                                         database='newspy',
                                         user='',
                                         password='')
    if connection.is_connected():
        db_Info = connection.get_server_info()
        print("Connected to MySQL Server version ", db_Info)
        cursor = connection.cursor()
def main():
    import newspaper
    from newspaper import news_pool
    import re
    import csv
    import unicodedata

    # Active list of news/media sources
    sources = ['http://fivethirtyeight.com']
    #sources = ['http://cnn.com','http://foxnews.com',
    #'http://npr.org','http://msnbc.com','http://cbs.com',
    #'http://economist.com','http://time.com','http://nytimes.com',
    #'http://espn.com','http://reuters.com','http://usatoday.com',
    #'http://bbc.com','http://fivethirtyeight.com']

    papers = {}  # Empty dictionary

    print("Building papers\n....\n...\n...")
    # Build dictionary, using url name for keys ex/ 'http://cnn.com' key will be 'cnn'
    for i in range(len(sources)):
        key = re.sub(r'(^https?:\/\/|\.com$|\.org$)', '', sources[i])
        papers[key] = newspaper.build(sources[i], memoize_articles=False)
        # Print number of articles added from "recent" list for logging purposes
        print(key, papers[key].size())

    print("Downloading articles (this may take a while)\n...\n...\n...")
    # Download all articles via multi-threading
    news_pool.set([x[1] for x in papers.items()], threads_per_source=2)  # Test various thread counts
    news_pool.join()

    print("Extracting text from articles \n...\n...\n...")
    # Parse all articles
    for i in papers:
        for j in range(papers[i].size()):
            # call to "download()" deprecated by news_pool.set & news_pool.join
            #papers[i].articles[j].download()
            papers[i].articles[j].parse()
            # extract keywords
            papers[i].articles[j].nlp()

    print("Writing new articles to dump file \n...\n...\n...")
    # Append articles to csv
    # Prototype format: col(1) = source, col(2) = title, col(3) = authors, col(4) = text
    with open('papers.csv', 'a') as outcsv:
        writer = csv.writer(outcsv)
        writer.writerow(["Source", "Date", "Title", "Authors", "Text", "Keywords"])
        for i in papers:
            source = i
            for j in range(papers[i].size()):
                # Grab key features
                title = unicodedata.normalize('NFKD', papers[i].articles[j].title).encode('ascii', 'ignore')
                authors = [x.encode('UTF-8') for x in papers[i].articles[j].authors]
                text = unicodedata.normalize('NFKD', papers[i].articles[j].text).encode('ascii', 'ignore')
                date = papers[i].articles[j].publish_date
                #date = unicodedata.normalize('NFKD',papers[i].articles[j].publish_date).encode('ascii','ignore')
                # Identify keywords, while we're at it
                keywords = [x.encode('UTF-8') for x in papers[i].articles[j].keywords]
                writer.writerow([source, date, title, authors, text, keywords])
import newspaper
from newspaper import news_pool

hq_paper = newspaper.build('https://www.huanqiu.com', language="zh")
sh_paper = newspaper.build('http://news.sohu.com', language="zh")
sn_paper = newspaper.build('https://news.sina.com.cn', language="zh")

papers = [hq_paper, sh_paper, sn_paper]
# 3 sources * 2 threads each = 6 threads
news_pool.set(papers, threads_per_source=2)
news_pool.join()

print(hq_paper.articles[0].html)
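A hedged continuation of the example above (not part of the original): after news_pool.join() each article holds only raw HTML, so parse() is still required before fields such as title and text are populated.

# Illustrative follow-up; assumes the hq_paper object built above.
for article in hq_paper.articles[:5]:
    article.parse()  # extracts title/text from the already-downloaded HTML
    print(article.title)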
def crawl():
    import newspaper
    from newspaper import news_pool

    memoize_articles = True
    conn = connect()
    threads_per_source = 4
    round = 0
    # loop indefinitely
    while True:
        count = get_news_source_count(conn)
        offset = 0
        limit = 15
        round += 1
        log.info("Crawling round %s.", round)
        while offset <= count:
            papers = []
            sources = get_news_sources(conn, offset, limit)
            offset += limit
            for source in sources:
                log.info("Creating newspaper for source %s", source[1])
                news_paper = newspaper.build(source[1],
                                             memoize_articles=memoize_articles,
                                             MIN_WORD_COUNT=100)
                papers.append(news_paper)
                log.info("Found %s articles from %s.", news_paper.size(), source[1])
            log.info("Creating a pool of newspapers for %s newspapers.", len(papers))
            news_pool.set(papers, threads_per_source=threads_per_source)
            log.info("Downloading articles for all newspapers.")
            start_time = time.time()
            news_pool.join()
            end_time = time.time() - start_time
            log.info("Downloading finished in %s", end_time)
            log.info("Storing downloaded articles in the database.")
            for paper in papers:
                # get the source id for this newspaper
                news_source_id = get_news_source(conn, paper.url)[0]
                # Get already crawled articles for this newspaper
                crawled_urls = articles_exist(conn, paper.article_urls())
                crawled_urls_size = 0
                if crawled_urls:
                    crawled_urls_size = len(crawled_urls)
                else:
                    crawled_urls = ['']
                log.info("For newspaper %s %s articles already crawled.", paper.url, crawled_urls_size)
                #articles = []
                #crawled_articles = articles_for_news_source(conn, news_source_id)
                article_count = 0
                for article in paper.articles:
                    # if the article is not crawled already
                    if article.url not in crawled_urls:
                        # parse it
                        try:
                            article.parse()
                            # check if it's a news article, and not some other page
                            if article.is_valid_body():
                                article_count += 1
                                insert_news_article(conn, article, news_source_id)
                        except:
                            pass
                        #Check if the combination title and publish date already exists for this newspaper
                        #publish_date = article.publish_date
                        #if publish_date:
                        #    publish_date = publish_date.replace(tzinfo=None)
                        #if (article.title, publish_date) not in crawled_articles:
                        #    #If not, add it for insertion
                        #    articles.append(article)
                        #    crawled_articles.append((article.title, publish_date))
                        #    log.info("Article '%s' publish date '%s' doesn't exists.", article.title, publish_date)
                        #else:
                        #    log.warn("Article '%s' already exists", article.url)
                log.info("For newspaper %s stored %s articles.", paper.url, article_count)
                #Insert the articles in the database
                #insert_news_articles(conn, list(set(articles)), news_source_id)
        time.sleep(1000)  # sleep for 1000 seconds before continuing
# ### Scraping articles

# In[11]:

title = []
author = []
published = []
body = []

# downloading articles
# multi-threading to be nicer to medium
articles = [Article(link, fetch_images=False) for link in links]
news_pool.set(articles, threads_per_source=6)
news_pool.join()

# getting title, author, publish date, and text body for each article
for i in range(0, len(articles)):
    try:
        articles[i].parse()
    except ArticleException:
        pass
    # appending each to the corresponding list
    title.append(articles[i].title)
    author.append(articles[i].authors)
    published.append(articles[i].publish_date)
print "yahoo built"
google = newspaper.build('http://www.usnews.com/')
print "google built"
bbc = newspaper.build('http://www.bbc.com/news/world/us_and_canada/')
print "bbc built"
nbc = newspaper.build('http://www.nbcnews.com/news/us-news')
print "nbcbuild"
cnn = newspaper.build('http://www.cnn.com/US/')
print "cnn"
abc = newspaper.build('http://abcnews.go.com/US/')
print "abc built"
fox = newspaper.build('http://www.foxnews.com/us/index.html')
print "fox built"

papers = [yahoo, google, bbc, nbc, cnn, abc, fox]
news_pool.set(papers, threads_per_source=2)
news_pool.join()

for Source in papers:
    for article in Source.articles:
        url = article.url
        htmlcode = article.html
        print url
        filename = "html/" + article.title + ".html"
        filename = filename.replace("'", "")
        print filename.encode('utf-8')
        htmlfile = open(filename.encode('utf-8'), "wb")
        htmlfile.write(htmlcode.encode('utf-8'))
        htmlfile.close()
        #HTML(filename).write_png(pngfilename)
            break
        elif str(veiculo) == '4':
            escolha = input('Digite o endereço (URL) do veículo que deseja: ')
            break
        else:
            print('Você não digitou um valor da lista. Digite apenas um número entre 1 e 4')
            print()
            continue
    except ValueError:
        print()

meio = newspaper.build(escolha, language='pt', memoize_articles=False)
fast = [meio]
news_pool.set(fast, threads_per_source=2)
print()
print('Total de registros coletados: ' + str(meio.size()))

listaurl = []
urlfinal = []
for article in meio.articles:
    listaurl.append(article.url)

for url in listaurl:
    if veiculo == "1":
        if "comments" not in url:
            if "especial" not in url:
                if "oauth" not in url:
                    if "aovivo" not in url:
                        if "2019" in url:
                            urlfinal.append(url)
    elif veiculo == "2":
Juila Sell
"""

# import required modules for webscraping and html parsing
import requests
import newspaper
from newspaper import news_pool
import sqlite3

# create list containing news sites to scrape
web_list = ['http://www.foxnews.com', 'http://www.usatoday.com']

# setup newspaper to multi-thread news sources
newsWebList = [newspaper.build(i) for i in web_list]
news_pool.set(newsWebList, threads_per_source=2)
news_pool.join()

# connect to Sqlite database and initiate / build table
con = sqlite3.connect('tnc.db')
with con:
    cur = con.cursor()
    cur.execute("DROP TABLE IF EXISTS NewsArticle")
    cur.execute("CREATE TABLE NewsArticle(Id TEXT, Number INT, Name TEXT, Count INT)")

# The News Counter Webscraper
def tncWebscraper():
    # iterates through sources
    for web_page in web_list:
        # set get request for html
        i = 0
def gather_different(self, extra_urls=None, only_extra=False, ignore_gotten=False, save=True):
    checklang = False
    if extra_urls:
        self.urls["extras"] = set(extra_urls)
        for url_ext in extra_urls:
            mine_article(url_ext)
    if not only_extra:
        print(self.newssites)
        if len(self.newssites) > 1 and type(self.newssites) is list:
            papers = [
                build(paper, config=self.config) for paper in self.newssites
            ]
        else:
            papers = build(self.newssites, config=self.config)
        log(f"Getting Data from {len(self.newssites)} newssites...")
        news_pool.set(papers, threads_per_source=2)
        news_pool.join()
        for art_pool, url in zip(papers, self.newssites):
            print(
                f"Handling newssite {int(self.newssites.index(url)) + 1}/{len(self.newssites)}"
            )
            for art in art_pool.articles:
                art.parse()
                if (str(art.url) not in self.urls["gotten"]) or ignore_gotten:
                    created = date_to_posix(dates=art.publish_date, list=False)
                    if created is not None and created != "None":
                        dic_temp = {
                            "link": str(art.url),
                            "text": str(art.text.replace(" ", "").replace("\n", "")),
                            "title": str(art.title),
                            "created": float(created),
                            "keywords": str(art.keywords),
                            "author": str(art.authors)
                        }
                        self.urls["gotten"] = np.append(self.urls["gotten"], art.url)
                        if checklang:
                            try:
                                if check_lang_is_en(str(art.text)):
                                    self.df_art = self.df_art.append(dic_temp, ignore_index=True)
                                else:
                                    print(f"Blocked: {dic_temp['text']}")
                            except json.decoder.JSONDecodeError as e:
                                error(e)
                                if check_lang_is_en(str(art.title)):
                                    self.df_art = self.df_art.append(dic_temp, ignore_index=True)
                                else:
                                    print(f"Blocked: {dic_temp['text']}")
                                    print("fixed?")
                        else:
                            self.df_art = self.df_art.append(dic_temp, ignore_index=True)
    if save:
        print(self.df_art)
        try:
            pass
            #print(self.df_art.to_string())
        except:
            pass
        update_hdf5(files["news_store"], "news_articles",
                    dataframe=self.df_art, mode="a", append=False)
print data.shape

# create list of Article objects
urls = data[1:, 0].tolist()  # for each line in csv
articles = []
for i in range(len(urls)):
    # print "iteration:{} {} ".format(i,urls[i])
    articles.append(Article(url=urls[i]))

# create a source of articles
news_source = Source("https://www.dummyurl.com")
news_source.articles = articles

# create a news_pool for threading purposes
news_pool.set([news_source], threads_per_source=2)
news_pool.join()

# iterate through article list to create a column for the csv
print "Parsing articles..."
article_list = []
labels = ['title', 'authors', 'text', 'keywords', 'summary', 'tags']
for article in articles:
    print "Parsing article {}".format(article.url)
    article.parse()
    article_list.append({
        labels[0]: article.title,
        labels[1]: article.authors,
        labels[2]: article.text,
        labels[3]: article.keywords,
__author__ = 'James'

import newspaper
from newspaper import Config, news_pool

config = Config()
config.set_language('en')
config.memoize_articles = False

reuters = newspaper.build(url='http://www.reuters.com', config=config)
indo = newspaper.build(url='http://www.independent.ie', config=config)

papers = [reuters, indo]
news_pool.set(paper_list=papers, threads_per_source=3)
news_pool.join()

for paper in papers:
    print(paper.brand + ": " + str(paper.size()) + " article(s)")
    # for article in paper.articles:
    #     print(article.title)

# print("-----------\nCATEGORIES\n-----------")
#
# for category in a.categories:
#     print(category.url)
#     b = newspaper.Source(url=category.url)
#     b.build()
#     print("\t-----------\n\tFEEDS\t\n-----------\t")
#     for feed_url in b.feed_urls():
#         print("\t->" + feed_url)
def post(self, request):
    form = SearchForm(request.POST)
    if form.is_valid():
        search_key = form.cleaned_data['search_keys']
        search_weight = form.cleaned_data['search_weight']
        search_key = search_key.split(',')
        search_weight = search_weight.split(',')
        search_key_weight = {}
        for l in range(len(search_key)):
            search_key_weight[search_key[l]] = search_weight[l]
        if detect(search_key[0]) != 'zh' and detect(search_key[0]) != 'zh-cn':
            # cnn_paper = newspaper.build('http://cnn.com', memoize_articles=False)
            # print(cnn_paper.size())
            # times_paper = newspaper.build('https://www.nytimes.com/', memoize_articles=False)
            # print(times_paper.size())
            # guardian_paper = newspaper.build('https://www.theguardian.com/us', memoize_articles=False)
            # print(guardian_paper.size())
            # abc_paper = newspaper.build('https://abcnews.go.com/', memoize_articles=False)
            # print(abc_paper.size())
            # bbc_paper = newspaper.build('https://www.bbc.com/', memoize_articles=False)
            # print(bbc_paper.size())
            boston_paper = newspaper.build('https://www.bostonglobe.com//', memoize_articles=False)
            print(boston_paper.size())
            seattle_paper = newspaper.build('https://www.seattletimes.com/', memoize_articles=False)
            print(seattle_paper.size())
            # papers = [cnn_paper, times_paper, guardian_paper, abc_paper, bbc_paper]
            papers = [boston_paper, seattle_paper]
            news_pool.set(papers, threads_per_source=2)  # (2*2) = 4 threads total
            news_pool.join()
            # for article in cnn_paper.articles:
            #     self.all_scrapy(article, search_key_weight)
            # for article in times_paper.articles:
            #     self.all_scrapy(article, search_key_weight)
            # for article in guardian_paper.articles:
            #     self.all_scrapy(article, search_key_weight)
            # for article in abc_paper.articles:
            #     self.all_scrapy(article, search_key_weight)
            # for article in bbc_paper.articles:
            #     self.all_scrapy(article, search_key_weight)
            for article in boston_paper.articles:
                self.all_scrapy(article, search_key_weight)
            for article in seattle_paper.articles:
                self.all_scrapy(article, search_key_weight)
        elif detect(search_key[0]) == 'zh-cn':
            qq_paper = newspaper.build('https://www.qq.com/', memoize_articles=False)
            print('qq_paper: ' + str(qq_paper.size()))
            # wy_paper = newspaper.build('https://news.163.com/', memoize_articles=False)
            # papers = [qq_paper, wy_paper]
            papers = [qq_paper]
            news_pool.set(papers, threads_per_source=2)  # (1*2) = 2 threads total
            news_pool.join()
            for article in qq_paper.articles:
                print('processing')
                self.all_scrapy(article, search_key_weight)
            # for article in wy_paper.articles:
            #     print('processing')
            #     self.all_scrapy(article, search_key_weight)
    else:
        form = SearchForm()
    return HttpResponseRedirect(reverse('searching:results', args=()))
                            memoize_articles=False)
globalnewsca = newspaper.build('https://globalnews.ca/',
                               memoize_articles=False)
thestar = newspaper.build('https://www.thestar.com/',
                          memoize_articles=False)
cna = newspaper.build('https://www.channelnewsasia.com/news/international',
                      memoize_articles=False)

# Combine all the sources
list_of_sources = [
    cnn, bbc, slate, breitbart, politico, thehill, cbc, washingtonpost,
    globeandmail, tc, gamespot, globalnewsca, thestar, cna
]

# Initiate Multi-Threading Downloads
# WARNING: keep the threads_per_source at a reasonable number
news_pool.set(list_of_sources, threads_per_source=4)  # 4 threads per each source
news_pool.join()

# Create our final dataframe
df_articles = pd.DataFrame()

# Create a download limit per sources
limit = 100

for source in list_of_sources:
    # temporary lists to store each element we want to extract
    list_title = []
    list_text = []
    list_source = []
    count = 0
client = MongoClient()
db = client['news_crawls']

outlet_list_file_open = open(sys.argv[1], 'r')
outlet_list_file = outlet_list_file_open.read()
outlet_list = outlet_list_file.split('\n')
outlet_list.pop(-1)

build_objects = []
for outlet in outlet_list:
    build_objects.append(np.build('http://' + outlet, memoize_articles=False))

news_pool.set(build_objects, threads_per_source=2)
news_pool.join()

for outlet in build_objects:
    for article in outlet.articles:
        count = db.italian_outlets.find({"url": article.url})
        if count.count() != 0:
            continue
        else:
            article.parse()
            print article.url
            db.italian_outlets.insert_one({
                "title": article.title,
                "text": article.text,
def scrape_newspapers(self, company_name, start_date, end_date, bodies=False):
    """ Build a list of the newspaper articles from a given url """
    def build_papers(news_url):
        return newspaper.build(news_url, language=self.language, memoize_articles=False)

    """ Return a relevant article matching company name and optional params
        such as start_date, end_date, bodies """
    def relevant_articles(papers):
        try:
            for article in papers.articles:
                """
                Let's analyse the HTML of the article to inspect the h1 (title) of the article.
                Reading documentation of newspaper3k suggests parse() is an expensive method,
                so try to limit overhead and only parse articles with a relevant title.
                """
                soup = BeautifulSoup(article.html, "html.parser")
                title = soup.find('h1').get_text()
                # If the company name is found within the headline of a news article
                # then parse the article for more information
                if title and company_name in title.lower():
                    article.parse()
                    if within_date_range(article.publish_date, start_date, end_date):
                        article_dict = {
                            "headline": article.title,
                            "source": article.url,
                            "published_date": article.publish_date,
                            "company_name": company_name
                        }
                        if bodies:
                            article_dict.update({"body": article.text})
                        yield article_dict
        except Exception as e:
            # log the error to a file, continue
            print("Exception:", e)
            pass

    articles = []
    company_name = company_name.lower()
    try:
        print("Downloading papers .....")
        papers = [build_papers(src) for src in self.news_urls]
        print("Papers downloaded", len(papers), papers)
        news_pool.set(papers, threads_per_source=2)
        news_pool.join()
    except Exception as e:
        # should log the error to a file in production then continue
        print("Exception:", e)
        pass
    finally:
        articles.extend(
            [article for p in papers for article in relevant_articles(p)])
    return articles
#!/usr/bin/python
# -*- coding: utf-8 -*-
import newspaper
from newspaper import news_pool
from pprint import pprint

# slate_paper = newspaper.build('http://slate.com')
# tc_paper = newspaper.build('http://techcrunch.com')
# espn_paper = newspaper.build('http://espn.com')

elpais = newspaper.build('http://elpais.com')
elmundo = newspaper.build('http://www.elmundo.es')
publico = newspaper.build('http://www.publico.es')

papers = [elpais, elmundo, publico]
news_pool.set(papers, threads_per_source=2)  # (3*2) = 6 threads total
news_pool.join()

print(len(papers))
pprint(papers)
print(len(elpais.articles))
print(len(elmundo.articles))
def main():
    import newspaper  # article download utility
    from newspaper import news_pool, Config, Article, Source
    import re  # regex
    import csv  # csv file-formatting
    import unicodedata  # string cleaning
    from datetime import datetime  # time-checking for cache-updates

    print("Retrieving sources and update times\n...")
    # Read active list of news/media sources
    f = open("sourcelist", "r")
    sources = f.read().splitlines()
    times = []
    #
    # ONGOING: update time storage and retrieval
    #   -dependent on if caching is sufficient

    papers = {}  # Empty dictionary

    print("Building papers\n....\n...\n...")
    # Store total and current number of articles for progress metrics
    total_articles = 0
    current_articles = 0

    # Build dictionary, using url name for keys ex/ 'http://cnn.com' key will be 'cnn'
    for i in range(len(sources)):
        key = re.sub(r'(^https?:\/\/|\.com\n$|\.org\n$)', '', sources[i])
        papers[key] = newspaper.build(sources[i], memoize_articles=True)
        # Print number of articles added from "recent" list for logging purposes
        total_articles = total_articles + papers[key].size()
        print(key, papers[key].size())

    print("Downloading articles (this may take a while)\n...\n...\n...")
    config = Config()
    config.fetch_images = False
    # Download all articles via multi-threading
    news_pool.set([x[1] for x in papers.items()], threads_per_source=2)  # Test various thread counts
    news_pool.join()

    print("Extracting text from articles and writing to dump files \n...\n...\n...")
    # Append articles to aggregate and individual csv's
    # Format: col(1) = source, col(2) = date, col(3) = title, col(4) = authors, col(5) = text, col(6) = keywords
    with open('papers.csv', 'a') as outcsv:
        # Setup aggregate csv writer
        writer = csv.writer(outcsv)
        #writer.writerow(["Source","Date","Title","Authors","Text","Keywords"])
        # Traverse sources
        for i in papers:
            # Setup single_source csv writing
            source = i
            ind_outcsv = open(str(i + ".csv"), 'a')
            ind_writer = csv.writer(ind_outcsv)
            # Traverse articles in source
            for j in range(papers[i].size()):
                # Parse articles and extract features
                current_articles += 1
                print("Processing " + str(i) + " article " + str(current_articles) +
                      " of " + str(total_articles) + " (" +
                      str("{0:.2f}".format((current_articles / float(total_articles) * 100), 2)) + " %)")
                try:
                    papers[i].articles[j].parse()
                    # Grab key features
                    title = unicodedata.normalize('NFKD', papers[i].articles[j].title).encode('ascii', 'ignore')
                    authors = [x.encode('UTF-8') for x in papers[i].articles[j].authors]
                    text = unicodedata.normalize('NFKD', papers[i].articles[j].text).encode('ascii', 'ignore')
                    date = papers[i].articles[j].publish_date
                    keywords = [x.encode('UTF-8') for x in papers[i].articles[j].keywords]
                    # Add new row to both single-source and aggregate files
                    ind_writer.writerow([source, date, title, authors, text, keywords])
                    writer.writerow([source, date, title, authors, text, keywords])
                    papers[i].articles[j].nlp()
                except httplib.BadStatusLine:
                    print("httplib.BadStatusLine, no dice")
def get_bot_response():
    while True:
        userText = request.args.get('msg')
        msg = str(userText)
        entrada = msg.lower()
        f = csv.writer(open('inputs.csv', 'a', encoding='utf-8'))
        f.writerow([msg])
        response = searchbot.get_response(userText)
        if float(response.confidence) >= 0.8:
            return str(searchbot.get_response(userText))
        elif userText == str('NÃO'):
            return str('Refaça a pergunta, por favor!')
        elif userText == str("SIM"):
            return str("Agradecemos o seu contato")
        elif float(response.confidence) == 0.0:
            entrada = msg
            # print(entrada)
            p1 = 'http://receita.economia.gov.br/@@busca?advanced_search=False&sort_on=&SearchableText='
            p2 = '&portal_type%3Alist=Document&created.query%3Arecord%3Alist%3Adate=1970-01-02&created.range%3Arecord=min'
            html = str(p1 + entrada + p2)
            stop2 = nltk.corpus.stopwords.words('portuguese')
            stop2.append('faço')
            stop2.append('um')
            stop2.append('gostaria')
            stop2.append('fazer')
            stop2.append('saber')
            stop2.append('posso')
            stop2.append('como')
            splitter = re.compile('\\W+')
            lista_palavras = []
            lista = [p for p in splitter.split(entrada) if p != '']
            for p in lista:
                if p not in stop2:
                    if len(p) > 1:
                        lista_palavras.append(p)
            ar = len(lista_palavras)
            ax = str(lista_palavras[0:ar])
            e = str(ax).replace(',', ' ').strip('[]')
            e.strip("'")
            #headers = {'User-Agent': 'Mozilla/5.0'}
            headers = {
                'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'
            }
            try:
                page = requests.get(html, headers=headers, verify=False, stream=False, timeout=7)
                soup = BeautifulSoup(page.content, 'lxml')
                cla = soup.find(class_='searchResults')
                links = cla.find_all('a')
            except (KeyError, IndexError, AttributeError):
                pass
            # namess = soup.find_all('a')
            # ra = (lista_palavras)
            # BUILD THE LIST OF LINKS FROM THE RFB SITE
            listr = []
            for link in links:
                texto = str(link.get_text()).lower().replace('ã', 'a').replace('-', ' ').replace('ç', 'c').split()
                time.sleep(0.5)
                # print(len(texto))
                url = str(link.get('href'))
                time.sleep(0.5)
                # print(len(url))
                urls = str(link.get('href')).lower().replace('/', ' ').replace('-', ' ').replace('.', ' ').split()
                time.sleep(0.5)
                # print(len(urls))
                if entrada in texto:
                    listr.append(url)
                for i in range(0, ar):
                    if lista_palavras[i] in texto:
                        listr.append(url)
                    elif lista_palavras[i] in urls:
                        listr.append(url)
                    else:
                        listr == []
                        pass
            listag = []
            rec = 'site:receita.economia.gov.br intitle:' + msg + " -filetype:pdf -.pdf"
            for urla in search(rec, tld='com.br', lang='pt-br', stop=4, pause=8):
                time.sleep(1)
                listag.append(urla)
            g = int(len(listag))
            # print(g)
            listago = []
            for z in range(0, g):
                ur = str(listag[z])
                listago.append(ur)
            # print(listago)
            # print(len(listago))
            qo = int(len(listago))
            # print(listr)
            # print(len(listr))
            listaunida = listago + listr
            conj = list(set(listaunida))
            # print(conj)
            # print(len(conj))
            # print(type(conj))
            # print(p)
            # print(len(p))
            j = len(conj)
            reports2 = []
            news_pool.set(reports2, threads_per_source=2)
            news_pool.join()
            for r in range(0, j):
                try:
                    ia = str(conj[r])
                    article = Article(ia, language="pt")
                    article.download()
                    article.parse()
                    article.text
                    article.nlp()
                    article.summary
                except:
                    pass
                reports2.append(str(article.summary).replace('\n', ' '))
            # print(len(reports2))
            resposta_finalc = set(reports2)
            print(resposta_finalc)
            if resposta_finalc == set():
                wikipedia.set_lang("pt")
                a = msg
                result = wikipedia.search(a, results=1)
                page = wikipedia.summary(result, sentences=6)
                content = page
                return str(content)
            else:
                try:
                    resposta_final = (str(resposta_finalc).replace('\n', ' ').replace('[', ' ')
                                      .replace(']', ' ').replace(',', ' ').replace("'", ' ')
                                      .replace('{', ' ').replace("}", ' '))
                    f = csv.writer(open('chats.csv', 'a', encoding='utf-8'))
                    f.writerow([msg + '\n' + resposta_final])
                    return str(resposta_final + '\n' +
                               'Encontrou a resposta que precisava? SIM ou NÃO?')
                except:
                    return str('Desculpe! Não encontrei uma resposta para sua pergunta. Poderia repetir com outros termos?')
def auto_article_go_getter():
    print("starting builds ", file=sys.stderr)
    cnn_paper = newspaper.build("https://www.cnn.com", memoize_articles=True, language='en')
    print("cnn_paper built", file=sys.stderr)
    nbc_paper = newspaper.build("https://www.nbcnews.com", memoize_articles=True, language='en')
    #print("nbc_paper built", file=sys.stderr)
    #nyt_paper = newspaper.build("https://www.nytimes.com/", memoize_articles=True, language='en')
    #print("nyt_paper built", file=sys.stderr)
    apn_paper = newspaper.build("https://apnews.com/", memoize_articles=True, language='en')
    print("apn_paper built", file=sys.stderr)
    abc_paper = newspaper.build("https://abcnews.go.com/", memoize_articles=True, language='en')
    print("abc_paper built", file=sys.stderr)
    papers = [cnn_paper, nbc_paper, apn_paper, abc_paper]

    verge_paper = newspaper.build("https://www.theverge.com/", memoize_articles=True, language='en')
    print("verge_paper built", file=sys.stderr)
    techP = [verge_paper]

    espn_paper = newspaper.build("https://www.espn.com/", memoize_articles=True, language='en')
    print("espn_paper built", file=sys.stderr)
    sportP = [espn_paper]

    et_paper = newspaper.build("https://ew.com/", memoize_articles=True, language='en')
    print("ew_paper built", file=sys.stderr)
    entertainmentP = [et_paper]

    crypto_paper = newspaper.build("https://cryptonews.com/", memoize_articles=True, language='en')
    print("crypto_paper built", file=sys.stderr)
    cryptoP = [crypto_paper]

    climate_paper = newspaper.build("https://www.climatechangenews.com/", memoize_articles=True, language='en')
    print("climate_paper built", file=sys.stderr)
    climateP = [climate_paper]

    print("all papers built", file=sys.stderr)
    count = 0
    article_list = []

    print("Starting pool threading", file=sys.stderr)
    print("Starting pool for papers", file=sys.stderr)
    news_pool.set(papers, threads_per_source=1000)
    news_pool.join()
    print("Finished pool threading for papers", file=sys.stderr)
    print("Starting pool for techp", file=sys.stderr)
    news_pool.set(techP, threads_per_source=1000)
    news_pool.join()
    print("Finished pool threading for techp", file=sys.stderr)
    print("Starting pool for sportp", file=sys.stderr)
    news_pool.set(sportP, threads_per_source=1000)
    news_pool.join()
    print("Finished pool threading for sportp", file=sys.stderr)
    print("Starting pool for entertainmentp", file=sys.stderr)
    news_pool.set(entertainmentP, threads_per_source=1000)
    news_pool.join()
    print("Finished pool threading for entertainmentp", file=sys.stderr)
    print("Starting pool for cryptop", file=sys.stderr)
    news_pool.set(cryptoP, threads_per_source=1000)
    news_pool.join()
    print("Finished pool threading for cryptop", file=sys.stderr)
    print("Starting pool for climatep", file=sys.stderr)
    news_pool.set(climateP, threads_per_source=1000)
    news_pool.join()
    print("Finished pool threading for climatep", file=sys.stderr)

    print("Saving articles to mongodb", file=sys.stderr)
    for build in papers:
        for news in (build.articles):
            if "politics" in news.url and "cnnespanol" not in news.url:
                news.parse()
                # call on text summarizer with text of article
                textSum = text_summarizer(news.text)
                if "apnews.com" in news.url:
                    textSum = news.text
                article = NewsArticle(
                    link=news.url,
                    image=news.top_image,
                    wing="political",
                    #text = news.text,
                    text=textSum,
                    title=news.title
                ).save()
                #email_services = ["hotmail", "gmail", "yahoo"]
                #email_contains_service = any(email_service in user_email for email_service in email_services)
            # note: the original tested the whole list with `in`, which is always False;
            # rewritten as any() over the individual keywords
            elif any(term in word_tokenize(news.text)
                     for term in ["stock", "net", "loss", "Q1", "Q2", "Q3", "Q4", "Gain"]):
                news.parse()
                # call on text summarizer with text of article
                textSum = text_summarizer(news.text)
                if "apnews.com" in news.url:
                    textSum = news.text
                article = NewsArticle(
                    link=news.url,
                    image=news.top_image,
                    wing="buisness",
                    text=textSum,
                    title=news.title
                ).save()
            elif "covid" in news.url or "corona" in news.url:
                news.parse()
                # call on text summarizer with text of article
                textSum = text_summarizer(news.text)
                if "apnews.com" in news.url:
                    textSum = news.text
                article = NewsArticle(
                    link=news.url,
                    image=news.top_image,
                    wing="covid",
                    text=textSum,
                    title=news.title
                ).save()
            count += 1

    for build in techP:
        for news in (build.articles):
            news.parse()
            # call on text summarizer with text of article
            textSum = text_summarizer(news.text)
            if "apnews.com" in news.url:
                textSum = news.text
            if "#comments" not in news.url:
                article = NewsArticle(
                    link=news.url,
                    image=news.top_image,
                    wing="tech",
                    text=textSum,
                    title=news.title
                ).save()

    for build in sportP:
        for news in (build.articles):
            news.parse()
            # call on text summarizer with text of article
            textSum = text_summarizer(news.text)
            article = NewsArticle(
                link=news.url,
                image=news.top_image,
                wing="sports",
                text=textSum,
                title=news.title
            ).save()

    for build in entertainmentP:
        for news in (build.articles):
            news.parse()
            # call on text summarizer with text of article
            textSum = text_summarizer(news.text)
            article = NewsArticle(
                link=news.url,
                image=news.top_image,
                wing="entertainment",
                text=textSum,
                title=news.title
            ).save()

    for build in cryptoP:
        for news in (build.articles):
            news.parse()
            # call on text summarizer with text of article
            textSum = text_summarizer(news.text)
            article = NewsArticle(
                link=news.url,
                image=news.top_image,
                wing="crypto",
                text=textSum,
                title=news.title
            ).save()

    for build in climateP:
        for news in (build.articles):
            news.parse()
            # call on text summarizer with text of article
            textSum = text_summarizer(news.text)
            article = NewsArticle(
                link=news.url,
                image=news.top_image,
                wing="climate",
                text=textSum,
                title=news.title
            ).save()

    print("Articles saved in mongodb", file=sys.stderr)