def get_configuration() -> newspaper.Config:
    """Return configuration for news site scraping."""
    conf = newspaper.Config()
    conf.memoize_articles = False
    conf.fetch_images = False
    conf.MIN_WORD_COUNT = 1
    conf.MAX_TEXT = 6 * 5000
    return conf
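A minimal usage sketch (an assumption, not part of the original code): the Config returned by get_configuration() would typically be handed to newspaper.build so that the memoization, image and length settings apply while the site is scanned. The site URL below is a placeholder.

import newspaper

conf = get_configuration()
paper = newspaper.build("https://example-news-site.com", config=conf)  # placeholder URL
for article in paper.articles[:5]:  # inspect the first few discovered articles
    article.download()
    article.parse()
    print(article.title)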
def calculate_article_word_count(url):
    config = newspaper.Config()
    config.browser_user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_4) " \
                                "AppleWebKit/537.36 (KHTML, like Gecko) " \
                                "Chrome/64.0.3282.186 Safari/537.36"
    article = newspaper.Article(url, config=config)
    article.download()
    article.parse()
    if len(article.text.split()) < 200:
        raise ValidationError('Could not find article')
    return len(article.text.split()) + len(article.title.split())
def download_article(url):
    try:
        # user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'
        user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:78.0) Gecko/20100101 Firefox/78.0'
        config = newspaper.Config()
        config.browser_user_agent = user_agent
        config.request_timeout = 10
        config.fetch_images = False
        config.memoize_articles = False
        article = newspaper.Article(url, config=config)
        article.download()
        article.parse()
        return article.text
    except Exception as exc:
        raise Exception("Error: Parsing failed.") from exc
def summarizeLinksToAudio(url, summary) -> str:
    """Summarize a list of urls into audio files."""
    results = []
    result = ""
    try:
        config = newspaper.Config()
        configNews(config)
        urls = getURLS(url, summary)
        for link in urls:
            results.append(summarizeLinkToAudio(link))
    except Exception as e:
        logging.exception(e)
    finally:
        result = "".join(results)
    return result
def genericScraper(self):
    config = newspaper.Config()
    config.browser_user_agent = ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1) '
                                 'AppleWebKit/602.2.14 (KHTML, like Gecko) '
                                 'Version/10.0.1 Safari/602.2.14')
    # config.request_timeout = 15
    # washingtonpost and usnews get funky when you set a user agent for some reason
    # (WaPo fails if the timeout isn't long, usnews throws a 403)
    if self.source not in ["washingtonpost", "usnews"]:
        a = newspaper.Article(self.url, config=config)
    else:
        a = newspaper.Article(self.url)
    try:
        # make sure page download goes smoothly
        a.download()
        a.parse()
    except Exception as e:
        print("Rejected - DOWNLOAD ERROR: ", e)
        return None
    text = cleanText(a.text)
    if len(text) < 500:
        # not much article text - the full article was likely not picked up; worst case,
        # a short article is rejected (probably not all that useful in the long run)
        print("Rejected - Article text was less than 500 characters, likely bad scraping job")
        return None
    # get title, author, date and images as necessary
    if not self.title and a.title:
        self.title = a.title
    if not self.author and a.authors:
        self.author = a.authors[0]
    if not self.date and a.publish_date:
        self.date = a.publish_date.strftime("%Y-%m-%d")
    if not self.images and a.top_image:
        self.images.append(a.top_image)
    article = Article(self.title, self.author, self.date, self.url, self.source,
                      text.strip(), self.images)
    return article
def process_article(self, article):
    """Scrape data from a news article at article.url.

    Args:
        article (ArticleObj): Article object to scrape - article.url must not be null.

    Returns:
        ArticleObj
    """
    print("processing article: " + str(article.title) + " - " + str(article.source))
    newspaper_article = None  # stays None if download/parse fails below
    try:
        user_agent = ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/50.0.2661.102 Safari/537.36')
        config = newspaper.Config()
        config.browser_user_agent = user_agent
        newspaper_article = newspaper.Article(article.url, language='en', config=config)
        newspaper_article.download()
        newspaper_article.parse()
    except Exception as e:
        # handle any exceptions
        print("Error parsing newspaper article")
        print(e)
    if newspaper_article:
        article.text = self.get_text(newspaper_article).encode('utf-8').decode("utf-8")
        # lang = langdetect.detect(article.text)
        lang = 'en'
        if lang == 'en':
            article.cleaned_text = self.clean_text(article.text)
            if len(article.text.split()) > 60 and article.title is not None:
                # article.entities = self.get_entities(article.text)
                article.entities = []
                # article.category = self.get_category(article.cleaned_text)
                article.category = ""
                if not article.img_url:
                    article.img_url = self.get_image_url(newspaper_article)
                article.desc = ' '.join(article.text.split()[:50]) + "..."
    return article
def build(newspaperURL):
    logprint("Fetching articles from {} ...\n"
             "This might take some time...".format(newspaperURL))
    start = time.time()
    config = newspaper.Config()
    config.MIN_WORD_COUNT = 700
    config.MIN_SENT_COUNT = 40
    paper = newspaper.build(
        url=newspaperURL,
        config=config,
        memoize_articles=False,
        fetch_images=False,
    )
    end = time.time()
    logprint("Done. Fetching took {} seconds.".format(end - start))
    return paper
def applyConfig(self):
    """ apply configuration """
    os.environ['HTTP_PROXY'] = ''
    os.environ['HTTPS_PROXY'] = ''
    try:
        newspaper_config = newspaper.Config()
        newspaper_config.memoize_articles = True
        newspaper_config.http_success_only = True
        newspaper_config.fetch_images = False
        newspaper_config.number_threads = 2
        newspaper_config.browser_user_agent = self.configData['user_agent']
        newspaper_config.request_timeout = self.configData['fetch_timeout']
        # add this to config data
        self.configData['newspaper_config'] = newspaper_config
        # default to no proxies so the lookups below never fail
        self.configData['proxies'] = {}
        # set OS environment variables for proxy server:
        if len(self.configData['proxy_url_http']) > 3 and \
           len(self.configData['proxy_url_https']) > 3:
            os.environ['HTTP_PROXY'] = self.configData['proxy_url_http']
            os.environ['HTTPS_PROXY'] = self.configData['proxy_url_https']
            self.configData['proxies'] = {
                "http": self.configData['proxy_url_http'],
                "https": self.configData['proxy_url_https']
            }
        # else:
        #     print("INFO: Not using any proxy servers: ",
        #           self.configData['proxy_url_http'], " or ",
        #           self.configData['proxy_url_https'])
        nltk.set_proxy(self.configData['proxies'])
        self.configData['newspaper_config'].proxies = self.configData['proxies']
        # print("INFO: For NLTK, using Proxy configuration: ", nltk.getproxies())
    except Exception as e:
        print("ERROR: Unable to set proxy parameters:", e)
def __init__(self, url="", keep_html=True): self.config = newspaper.Config() self.config.keep_article_html = True self.url = url # Validate URL try: self.parse_result = urllib.parse.urlparse(self.url) print(self.parse_result) except Exception as e: self.parse_result = False self.article_data = dict() else: # It's okay to Attempt a parse self.article_data = self.extract() finally: # Any Future Cleanup Goes Here self.article_data["url"] = url pass
def __init__(self, url: str, articles_limit=100, summary_sentences=5):
    """
    Initializes the information source with summaries of up to articles_limit articles.
    Each summary will have summary_sentences number of sentences.

    :param url: news website URL, e.g. "https://www.bbc.co.uk/"
    :param articles_limit: limit number of articles to fetch
    :param summary_sentences: number of sentences in each summary
    """
    config = newspaper.Config()
    config.MAX_SUMMARY_SENT = summary_sentences
    config.memoize_articles = False
    config.fetch_images = False
    self.paper = newspaper.build(url, config=config)

    summaries_list = []
    i = 0
    while len(summaries_list) < articles_limit and i < len(self.paper.articles):
        article = self.paper.articles[i]
        i += 1
        try:
            article.download()
            article.parse()
            article.nlp()
        except newspaper.article.ArticleException:
            continue
        if article.summary != str():
            summaries_list.append(article.summary.split('\n'))

    self.summaries = []
    for summary in summaries_list:
        sentence_to_id = dict(
            zip(
                range(InformationSource._info_id,
                      InformationSource._info_id + len(summary)),
                summary))
        InformationSource._info_id += len(summary)
        self.summaries.append(sentence_to_id)
def parse_article_url(url):
    user_agent = ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:78.0) '
                  'Gecko/20100101 Firefox/78.0')
    config = newspaper.Config()
    config.browser_user_agent = user_agent
    print(url)
    article = {}
    try:
        content = newspaper.Article(url, config=config)
        content.download()
        content.parse()
    except Exception:
        print("Invalid url! Please try again")
        return None
    article['title'] = content.title
    article['text'] = content.text
    article['link'] = content.url
    # publish_date can be None when newspaper cannot detect a date
    article['date'] = content.publish_date.isoformat() if content.publish_date else None
    return article
def applyNetworkConfig(self):
    """ Apply configuration for networking """
    os.environ['HTTP_PROXY'] = ''
    os.environ['HTTPS_PROXY'] = ''
    try:
        newspaper_config = newspaper.Config()
        newspaper_config.memoize_articles = False
        newspaper_config.http_success_only = True
        newspaper_config.fetch_images = False
        newspaper_config.number_threads = 2
        newspaper_config.browser_user_agent = self.user_agent
        newspaper_config.request_timeout = self.fetch_timeout
        newspaper_config.use_cached_categories = False
        # add this to config data
        self.newspaper_config = newspaper_config
        # set OS environment variables for proxy server:
        if len(self.proxy_url_http) > 3 and len(self.proxy_url_https) > 3:
            os.environ['HTTP_PROXY'] = self.proxy_url_http
            os.environ['HTTPS_PROXY'] = self.proxy_url_https
            self.proxies = {
                "http": self.proxy_url_http,
                "https": self.proxy_url_https
            }
        else:
            os.environ['HTTP_PROXY'] = ''
            os.environ['HTTPS_PROXY'] = ''
            self.proxy_url_http = None
            self.proxy_url_https = None
            self.proxies = {}
        nltk.set_proxy(self.proxies)
        self.newspaper_config.proxies = self.proxies
        # print("INFO: For NLTK, using Proxy configuration: ", nltk.getproxies())
    except Exception as e:
        print("ERROR: Unable to set proxy parameters:", e)
"link": "http://www.nbcnews.com/" }, "buzzfeed": { "rss": "http://www.buzzfeed.com/politics.xml", "link": "http://www.buzzfeed.com/" } } try: with open(script_path + 'scraped_articles.yaml') as data_file: data = yaml.load(data_file) except Exception as e: print(e) print('Unable to load previous scraped articles.') config = newspaper.Config() config.fetch_images = False config.memoize_articles = False #iterate through all sources. Stop after RSS feed exhausted, or LIMIT reached for source, value in sources.items(): site = { "rss": value['rss'], "link": value['link'], "articles": [], "link_set": set() } count = 1 print("Capturing articles from", source) d = fp.parse(value['rss']) if source not in data:
def convert_tweet(twitter_dumpfn):
    try:
        with open(twitter_dumpfn, 'r') as dumpf:
            data = json.loads(dumpf.read())

        if len(data['comments']) == 0:
            return

        jsonfn = '%s/%s/%s.json' % (QASRC_DIRFN, corpus_name, data['id'])
        if os.path.exists(jsonfn):
            return

        url = data['textUrl'] if 'textUrl' in data else ''
        text = ''
        if url:
            skip = False
            for blocked_url in BLOCKLIST:
                if blocked_url in url:
                    skip = True
            if skip:
                return
            logging.debug('%-20s: %s ... ' % (data['user'], url))
            config = newspaper.Config()
            config.browser_user_agent = random.choice(USER_AGENTS)
            article = Article(url=url, config=config)
            article.download()
            article.parse()
            text = article.text
            # print(text)

        if text:
            ds = {'info': text, 'date': data['date'], 'dlg': [data['text']]}
        else:
            ds = {'info': data['text'], 'date': data['date'], 'dlg': []}

        fav = 0
        for c in data['comments']:
            if c['favorites'] == 0:
                continue
            ds['dlg'].append(c['text'])
            fav += 1

        if (not text) and (fav == 0):
            return

        # print(repr(ds))
        with open(jsonfn, 'w') as jsonf:
            jsonf.write(json.dumps(ds))

        logging.debug('%-20s: %s written. %s' % (data['user'], jsonfn, url[:30]))

    except newspaper.article.ArticleException as ae:
        logging.info('%-20s: %s' % (data['user'], str(ae)))
    except:
        logging.exception('exception caught %s' % repr(data))
def newspaper_stories(words, search_type='or', search_level=0, urls=None,
                      display=True, memorize=False, language='en'):
    config = newspaper.Config()
    config.memoize_articles = memorize
    config.language = language
    config.fetch_images = False
    config.request_timeout = 20
    config.MIN_WORD_COUNT = 300
    config.MIN_SENT_COUNT = 10

    if urls is None or urls == 'top_news':
        news_urls = {
            'huffington': 'http://huffingtonpost.com',
            'reuters': 'http://www.reuters.com',
            'cbs-news': 'http://www.cbsnews.com',
            'usa-today': 'http://usatoday.com',
            'cnn': 'http://cnn.com',
            'npr': 'http://www.npr.org',
            'abc-news': 'http://abcnews.com',
            'us-news': 'http://www.usnews.com',
            'msn': 'http://msn.com',
            'pbs': 'http://www.pbs.org',
            'nbc-news': 'http://www.nbcnews.com',
            'msnbc': 'http://www.msnbc.com',
            'fox': 'http://www.foxnews.com'
        }
    elif urls == 'all_us_news':
        news_urls = {
            'abc-news': 'https://abcnews.go.com',
            'al-jazeera-english': 'http://www.aljazeera.com',
            'ars-technica': 'http://arstechnica.com',
            'associated-press': 'https://apnews.com/',
            'axios': 'https://www.axios.com',
            'bleacher-report': 'http://www.bleacherreport.com',
            'bloomberg': 'http://www.bloomberg.com',
            'breitbart-news': 'http://www.breitbart.com',
            'business-insider': 'http://www.businessinsider.com',
            'buzzfeed': 'https://www.buzzfeed.com',
            'cbs-news': 'http://www.cbsnews.com',
            'cnbc': 'http://www.cnbc.com',
            'cnn': 'http://us.cnn.com',
            'crypto-coins-news': 'https://www.ccn.com',
            'engadget': 'https://www.engadget.com',
            'entertainment-weekly': 'http://www.ew.com',
            'espn': 'http://espn.go.com',
            'espn-cric-info': 'http://www.espncricinfo.com/',
            'fortune': 'http://fortune.com',
            'fox-news': 'http://www.foxnews.com',
            'fox-sports': 'http://www.foxsports.com',
            'google-news': 'https://news.google.com',
            'hacker-news': 'https://news.ycombinator.com',
            'ign': 'http://www.ign.com',
            'mashable': 'http://mashable.com',
            'medical-news-today': 'http://www.medicalnewstoday.com',
            'msnbc': 'http://www.msnbc.com',
            'mtv-news': 'http://www.mtv.com/news',
            'national-geographic': 'http://news.nationalgeographic.com',
            'national-review': 'https://www.nationalreview.com/',
            'nbc-news': 'http://www.nbcnews.com',
            'new-scientist': 'https://www.newscientist.com/section/news',
            'newsweek': 'http://www.newsweek.com',
            'new-york-magazine': 'http://nymag.com',
            'next-big-future': 'https://www.nextbigfuture.com',
            'nfl-news': 'http://www.nfl.com/news',
            'nhl-news': 'https://www.nhl.com/news',
            'politico': 'https://www.politico.com',
            'polygon': 'http://www.polygon.com',
            'recode': 'http://www.recode.net',
            'reddit-r-all': 'https://www.reddit.com/r/all',
            'reuters': 'http://www.reuters.com',
            'techcrunch': 'https://techcrunch.com',
            'techradar': 'http://www.techradar.com',
            'american-conservative': 'http://www.theamericanconservative.com/',
            'hill': 'http://thehill.com',
            'huffington-post': 'http://www.huffingtonpost.com',
            'next-web': 'http://thenextweb.com',
            'verge': 'http://www.theverge.com',
            'wall-street-journal': 'http://www.wsj.com',
            'washington-post': 'https://www.washingtonpost.com',
            'washington-times': 'https://www.washingtontimes.com/',
            'time': 'http://time.com',
            'usa-today': 'http://www.usatoday.com/news',
            'vice-news': 'https://news.vice.com',
            'wired': 'https://www.wired.com'
        }
    elif urls == "texas_universities":
        news_urls = {
            'A&M': 'http://www.tamu.edu',
            'A&M-Commerce': 'http://www.tamuc.edu',
            'A&M-Corpus': 'http://www.tamucc.edu',
            'A&M-Kingsville': 'http://www.tamuk.edu',
            'A&M-Galveston': 'http://www.tamug.edu',
            'A&M-PrairieView': 'http://www.pvamu.edu',
            'A&M-International': 'http://www.tamiu.edu',
            'A&M-WestTexas': 'http://www.wtamu.edu',
            'Baylor': 'http://www.baylor.edu',
            'Rice': 'http://www.rice.edu',
            'SFAustin': 'http://www.sfasu.edu',
            'SMU': 'http://www.smu.edu',
            'SulRoss': 'http://www.sulross.edu',
            'TexasState': 'http://www.txstate.edu',
            'Texas_Tech': 'http://www.ttu.edu',
            'UDallas': 'http://www.udallas.edu',
            'UHouston': 'http://www.uh.edu',
            'UTexas': 'http://www.utexas.edu',
            'UT_Dallas': 'http://www.utdallas.edu',
            'UT_ElPaso': 'http://www.utep.edu',
            'UT_Houston': 'http://www.uth.edu',
            'UT_NorthTexas': 'http://www.unt.edu',
            'UT_SanAntonio': 'http://www.utsa.edu'
        }
    elif urls == 'popular':
        news_urls = {}
        agency_urls = newspaper.popular_urls()
        for i in range(len(agency_urls)):
            val = agency_urls[i]
            url = agency_urls[i].replace("http://", "")
            url = url.replace("www.", "")
            url = url.replace("blog.", "")
            url = url.replace("blogs.", "")
            url = url.replace(".com", "")
            url = url.replace(".net", "")
            url = url.replace(".au", "")
            url = url.replace(".org", "")
            url = url.replace(".co.uk", "")
            url = url.replace("the", "")
            url = url.replace(".", "-")
            url = url.replace('usa', 'usa-')
            if url == 'berkeley-edu':
                continue
            if url == 'beta-na-leagueoflegends':
                continue
            if url == 'bottomline-as-ucsb-edu':
                continue
            news_urls[url] = val
    else:
        news_urls = urls

    print("\nSearch Level {:<d}:".format(search_level), end="")
    if search_level == 0:
        print(" Screening URLs for search words")
        print(" URLs must contain one or more of:", end="")
    else:
        print(" No URL Screening")
        print(" Deep Search for Articles containing: ", end="")
    i = 0
    for word in words:
        i += 1
        if i < len(words):
            if search_type == 'or':
                print(word + " or ", end="")
            else:
                print(word + " & ", end="")
        else:
            print(word)

    df_articles = pd.DataFrame(columns=[
        'agency', 'url', 'length', 'keywords', 'title', 'summary', 'text'
    ])
    n_articles = {}
    today = str(date.today())
    for agency, url in news_urls.items():
        paper = newspaper.build(url, config=config)
        if display:
            print("\n{:>6d} Articles available from {:<s} on {:<10s}:".format(
                paper.size(), agency.upper(), today))
        article_collection = []
        for article in paper.articles:
            url_lower = article.url.lower()
            # Exclude articles that are in a language other than en
            # or contain mostly video or pictures.
            # search_level 0 only downloads articles with at least
            # one of the key words in their URL.
            # search_level 1 downloads all articles that appear to be
            # in English and are not mainly photos or videos.
            # With either search level, if an article is downloaded
            # it is scanned to see if it contains the search words.
            # It is also compared to other articles to verify that
            # it is not a duplicate of another article.

            # Special Filters for some Agencies
            if agency == 'cbs-news':
                if url_lower.find('.com') >= 0:
                    # secure-fly are duplicates of http
                    if article.url.find('secure-fly') >= 0:
                        continue
            if agency == 'usa-today':
                if url_lower.find('tunein.com') >= 0:
                    continue
            if agency == 'huffington':
                # Ignore huffington if it's not .com
                if url_lower.find('.com') < 0:
                    continue
            # Filter Articles that are primarily video, film or not en
            if url_lower.find('.video/') >= 0 or \
               url_lower.find('/video') >= 0 or \
               url_lower.find('/picture') >= 0 or \
               url_lower.find('.pictures/') >= 0 or \
               url_lower.find('/photo') >= 0 or \
               url_lower.find('.photos/') >= 0 or \
               url_lower.find('espanol') >= 0 or \
               url_lower.find('.mx/') >= 0 or \
               url_lower.find('/mx.') >= 0 or \
               url_lower.find('.fr/') >= 0 or \
               url_lower.find('/fr.') >= 0 or \
               url_lower.find('.de/') >= 0 or \
               url_lower.find('/de.') >= 0 or \
               url_lower.find('.it/') >= 0 or \
               url_lower.find('/it.') >= 0 or \
               url_lower.find('.gr/') >= 0 or \
               url_lower.find('/gr.') >= 0 or \
               url_lower.find('.se/') >= 0 or \
               url_lower.find('/se.') >= 0 or \
               url_lower.find('.es/') >= 0 or \
               url_lower.find('/es.') >= 0 or \
               url_lower.find('?button') >= 0 or \
               url_lower.find('calendar.') >= 0 or \
               url_lower.find('calendar/') >= 0 or \
               url_lower.find('/event/') >= 0 or \
               url_lower.find('engr.utexas') >= 0 or \
               url_lower.find('sites.smu.') >= 0:
                continue
            # Filter if search_level == 0, URL quick search
            if search_level == 0:
                # Verify url contains at least one of the key words
                found_it = False
                for word in words:
                    j = url_lower.find(word)
                    if j >= 0:
                        found_it = True
                        break
                if found_it:
                    # Article contains words and passes filters
                    # Save this article for full review
                    article_collection.append(article.url)
            else:
                # No URL screening, save for full review
                article_collection.append(article.url)

        n_to_review = len(article_collection)
        if display:
            print("{:>6d} Selected for download".format(n_to_review))

        for article_url in article_collection:
            article = Article(article_url, config=config)
            try:
                article.download()
            except:
                if display:
                    print("Cannot download:", article_url[0:79])
                continue
            n = 0         # Limit download failures
            stop_sec = 1  # Initial max wait time in seconds
            while n < 2:
                try:
                    article.parse()
                    n = 99
                except:
                    n += 1
                    # Initiate download again before new parse attempt
                    article.download()
                    # Timeout for stop_sec seconds waiting for download
                    t0 = time()
                    tlapse = 0
                    while tlapse < stop_sec:
                        tlapse = time() - t0
                    # Double wait time if needed for next exception
                    stop_sec = stop_sec + 1
            if n != 99:
                if display:
                    print("Cannot download:", article_url[0:79])
                n_to_review -= 1
                continue
            article.nlp()
            keywords = article.keywords
            title = article.title
            summary = article.summary
            text = article.text
            text_lower_case = text.lower()
            if search_type == 'or':
                found_it = False
                # Verify the text contains at least one of the key words
                for word in words:
                    j = text_lower_case.find(word)
                    if j >= 0:
                        found_it = True
                        break
            else:
                # search type 'and'
                found_it = True
                for word in words:
                    j = text_lower_case.find(word)
                    if j < 0:
                        found_it = False
                        break
            if found_it:
                # Article contains words and passes filters
                # Save this article for later full review
                length = len(text)
                df_story = pd.DataFrame([[
                    agency, article_url, length, keywords, title, summary, text
                ]], columns=[
                    'agency', 'url', 'length', 'keywords', 'title', 'summary', 'text'
                ])
                # Check for an identical story already in the file
                if df_articles.shape[0] == 0:
                    df_articles = df_articles.append(df_story)
                else:
                    # Verify this story is not already in df_articles
                    same_story = False
                    for i in range(df_articles.shape[0]):
                        if text == df_articles['text'].iloc[i]:
                            same_story = True
                            n_to_review -= 1
                            continue
                    if not same_story:
                        df_articles = df_articles.append(df_story)
            else:
                n_to_review -= 1
            print("=", end='')
        n_articles[agency] = [n_to_review, len(article_collection)]

    if display:
        print("\n\nArticles Selected by Agency:")
        for agency in news_urls:
            ratio = str(n_articles[agency][0]) + "/" + str(n_articles[agency][1])
            print("{:>10s} Articles from {:<s}".format(ratio, agency.upper()))
        print("\nArticles Collected on " + today + ":", df_articles.shape[0],
              'from', df_articles['agency'].nunique(), "Agencies.")
        print("\nSize Agency Title")
        print("*{:->78s}*".format("-"))
        for i in range(df_articles.shape[0]):
            k = len(df_articles['title'].iloc[i])
            if k > 63:
                for j in range(25):
                    k = 63 - j
                    if df_articles['title'].iloc[i][k] == " ":
                        break
                print("{:>5d} {:<10s} {:<63s}".format(
                    df_articles['length'].iloc[i],
                    df_articles['agency'].iloc[i],
                    df_articles['title'].iloc[i][0:k]))
                if len(df_articles['title'].iloc[i]) > 63:
                    print(" {:<60s}".format(
                        df_articles['title'].iloc[i][k:120]))
            else:
                print("{:>5d} {:<10s} {:<s}".format(
                    df_articles['length'].iloc[i],
                    df_articles['agency'].iloc[i],
                    df_articles['title'].iloc[i]))
        print("")
        print("*{:->78s}*".format("-"))
    return df_articles
newslinks = []
for line in lines:
    newslinks.append(line[:-1])
# f.close()


def getdatestring(day):
    year, month, day = str(day)[:10].split("-")
    # day = day.split(" ")[0]
    return year, month, day


# links to articles
summaries = []
user_agent = ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) '
              'AppleWebKit/537.36 (KHTML, like Gecko) '
              'Chrome/50.0.2661.102 Safari/537.36')
config = news.Config()
config.browser_user_agent = user_agent

x = 1
for link in newslinks:
    date, link = link.split(",", 1)
    print("article number " + str(x))
    a = news.Article(link, config=config)
    data = ""
    try:
        a.download()
        a.parse()
        a.nlp()
        # a.summary()
        data = a.text
    except news.article.ArticleException:
        print("something bad happened on article " + str(x) + ", for " + date)
import bs4 as bs
from datetime import date, datetime, timedelta
import time
import os
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common import action_chains
from selenium.common.exceptions import TimeoutException as TimeoutError_
from selenium.common.exceptions import MoveTargetOutOfBoundsException as OutOfBoundsError
import re
import json
from random import randint
import newspaper as nws  # assumed import; the snippet below uses nws.Config()

configuration = nws.Config()
configuration.fetch_images = False
configuration.follow_meta_refresh = True

league_list = ["Europa League", "FA Cup", "Championship", "Champions League",
               "EFL Cup", "Premier League", "La Liga", "League One", "League Two",
               "Bundesliga", "Serie A", "Ligue 1"]


def datespan(start_date, end_date, delta):
    """generates daily time stamps of the format yyyy-mm-dd.
    Takes: start and end date and a time step.
    Returns: an iterable date."""
    current_date = start_date
    while current_date < end_date:
        yield current_date
        current_date += delta  # advance by the given time step
def init_config():
    config = newspaper.Config()
    config.fetch_images = False
    config.verbose = True
    return config
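As a hedged illustration (not from the original source), the Config returned by init_config() can be applied to a single Article the same way the other snippets above do; the URL and the error handling here are assumptions.

import newspaper

config = init_config()
article = newspaper.Article("https://example.com/some-story", config=config)  # placeholder URL
try:
    article.download()
    article.parse()
    print(article.title, "-", len(article.text.split()), "words")
except newspaper.article.ArticleException as exc:
    # newspaper raises ArticleException when the page could not be downloaded or parsed
    print("Could not fetch article:", exc)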