def summarise_one(url, title=True, keywords=True, summary=False,
                  top_img_src=False):
    '''Get url and return summary'''
    # configuration for Newspaper to minimize processing time
    configure = Config()
    configure.fetch_images = False
    configure.MAX_SUMMARY = 300
    configure.MAX_SUMMARY_SENT = 3
    article = Article(url, config=configure)  # pass the config so the limits take effect
    try:
        article.download()
        article.parse()
    except:
        print(url)
    title = article.title
    if keywords or summary:
        try:
            article.nlp()
            if keywords:
                keywords = article.keywords
            if summary:
                summary = article.summary
        except:
            print('Newspaper error with nlp() call')
    if top_img_src:
        top_img_src = article.top_image
    return title, keywords, summary, top_img_src

def download_article(url):
    config = Config()
    config.follow_meta_refresh = True
    config.keep_article_html = True
    article = Article(url, config=config)
    article.download()
    return article

def get_text(url):
    """Given the URL of an article, extract its text.

    Arguments:
        url {str} -- URL of the article

    Returns:
        str -- text of the article
    """
    global CNT
    global NAN
    if CNT % 100 == 0:
        print(CNT)
    config = Config()
    config.keep_article_html = True
    try:
        article = Article(url, config=config)
        article.download()
        article.parse()
        return article.text
    except Exception as e:
        print(e)
        NAN += 1
        return np.nan
    finally:
        CNT += 1

def fetch_by_newspaper(url):
    is_linkedin_url = url.startswith(linkedinUrl)
    if is_linkedin_url:
        config = Config()
        config.MAX_TITLE = 1000
        article = get_article(url, config)
        article = replace_title_text_from_title_url(article)
    else:
        article = get_article(url)
    return json.dumps({
        "authors": article.authors,
        "html": article.html,
        "images": list(article.images),
        "movies": article.movies,
        "publish_date": article.publish_date.strftime("%s") if article.publish_date else None,
        "text": article.text,
        "title": article.title,
        "topimage": article.top_image
    }), 200, {'Content-Type': 'application/json'}

def extract(self, request: ExtractorRequest) -> ExtractorResponse:
    try:
        user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36"
        config = Config()
        config.browser_user_agent = user_agent
        article = Article(
            request.url,
            keep_article_html=True,
            fetch_images=False,
            config=config
        )
        article.download()
        article.parse()
        # article.nlp()
        text = article.text
        meta = {
            "source": "web_article",
            "source_url": article.source_url,
            "article_html": article.article_html,
            "title": article.title,
            "top_image": article.top_image,
            "images": article.images,
            "videos": article.movies,
            "meta_language": article.meta_lang,
            "meta_keywords": article.meta_keywords,
            "authors": article.authors,
            "publish_date": article.publish_date,
        }
        # construct response
        response = ExtractorResponse(meta=meta, text=text or "")
    except Exception as e:
        msg = f"Error using newspaper3k extractor: {str(e)}"
        log.error(msg)
        response = ExtractorResponse(error=msg)
    return response

def getPolarity(uniName):
    from GoogleNews import GoogleNews
    from newspaper import Article
    from newspaper import Config
    import pandas as pd
    from textblob import TextBlob

    uniName = uniName + ' Coronavirus'
    user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'
    config = Config()
    config.browser_user_agent = user_agent
    googlenews = GoogleNews(start='08/01/2020', end='09/26/2020')
    googlenews.search(uniName)
    result = googlenews.result()
    for i in range(0, 5):
        googlenews.getpage(i)
        result = googlenews.result()
    df = pd.DataFrame(result)
    sum = 0
    counter = 1
    for ind in df.index:
        try:
            article = Article(df['link'][ind], config=config)
            article.download()
            article.parse()
            article.nlp()
            testimonial = TextBlob(article.summary)
            counter += 1
            sum += testimonial.sentiment.polarity
        except:
            pass
    return sum / counter

def subjectivity():
    """
    Graphs the posts based on the relationship between subjectivity and popularity.

    :return: a graph with the x axis being the subjectivity value and the y axis being upvotes
    """
    scores = []
    for index, row in topics_data.iterrows():
        if index in actual_list:
            scores.append(row['score'])
    subs = []
    for index, row in topics_data.iterrows():
        if index in actual_list:
            url = row['url']
            if 'newsweek' in url or 'democracynow' in url:
                user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36'
                config = Config()
                config.browser_user_agent = user_agent
                article = Article(url, config=config)
            else:
                article = Article(url)
            article.download()
            article.parse()
            article.nlp()
            text = article.summary
            obj = TextBlob(text)
            subjectivity = obj.sentiment.subjectivity
            subs.append(subjectivity)
    plt.figure(figsize=(50, 10))
    plt.scatter(subs, scores)
    plt.xlabel('Subjectivity')
    plt.ylabel('Score')
    plt.title('Posts in r/politics')
    plt.show()

def handle(task, progress):
    url = task.url
    progress.set_status("Requesting page...")
    resp = http_downloader.page_text(url, json=False)
    if not resp:
        return False
    config = Config()
    config.memoize_articles = False
    config.verbose = False
    article = Article(url='', config=config)
    article.download()
    article.set_html(resp)
    article.parse()
    if not article.top_image:
        return None
    src = article.top_image
    if 'http' not in src:
        if 'https' in url:
            src = 'https://' + src.lstrip('/ ').strip()
        else:
            src = 'http://' + src.lstrip('/ ').strip()
    progress.set_status("Downloading image...")
    return http_downloader.download_binary(src, task.file, prog=progress, handler_id=tag)

def cache_art(url):
    headers = {
        'User-Agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:77.0) Gecko/20100101 Firefox/77.0",
        'From': '*****@*****.**'
    }
    r = requests.get(url, headers=headers)

    from newspaper import Config
    from newspaper import Article
    config = Config()
    config.browser_user_agent = headers['User-Agent']
    article = Article(url, keep_article_html=True, config=config)
    article.download()
    # article = Article(url='')
    # article.set_html(str(soup))
    article.parse()
    struct = {
        'html': article.article_html,
        'author': article.authors,
        'pubdate': article.publish_date,
        'text': article.text,
        'image': article.top_image,
        'rawhtml': str(r.content),
        'buzotext': scrape(r.content, 'html5lib')
    }
    return struct

def get_top_news(news_information, todays_date):
    load_dotenv(find_dotenv(r"path to env variable"))
    news_api_key = os.getenv("news_api")
    params = (
        ('country', 'us'),
        ('apiKey', f'{news_api_key}'),
    )
    response = requests.get("https://newsapi.org/v2/top-headlines", params=params)
    news_articles = response.json()['articles']
    for i in news_articles:
        user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:78.0) Gecko/20100101 Firefox/78.0"
        config = Config()
        config.browser_user_agent = user_agent
        config.request_timeout = 10
        article = Article(i['url'], config=config)
        print(i["url"])
        article.download()
        article.parse()
        article.nlp()
        summary = article.summary  # noqa
        news_information[i["source"]["name"]] = {
            "title": i["title"],
            "summary": summary,
            "url": i["url"],
            "date": f"{todays_date}"
        }  # noqa

def process_link(self, link=None, nlp=False):
    """Processes the links obtained by the get_links() method; extracts both the
    text and a summary of the article with the newspaper package.

    Args:
        link :: str
            URL of links stored in the dictionary returned by get_links()
        nlp :: bool
            Whether or not to perform nlp on the text of the link. This extracts
            a summary of the text, but is a somewhat expensive operation.

    Returns:
        article :: 'article' object
            Object that contains parsed properties for the link, such as summary,
            text and date.
    """
    # parameters for the processing
    config = Config()
    config.fetch_images = False      # no need for images
    config.memoize_articles = False  # no need for article caching
    try:
        article = Article(link, language="en", config=config)
        article.download()
        article.parse()
        if nlp:
            article.nlp()  # extract summary as per the newspaper API
    except:
        return None
    return article

def get_article_text(url: str) -> str:
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36 Edg/89.0.774.54'
    }
    text = ''
    try:
        config = Config()
        config.browser_user_agent = headers['User-Agent']
        article = Article(url, keep_article_html=True, fetch_images=False, config=config)
        article.download()
        article.parse()
        # article.nlp()
        text = article.text
        meta = {
            "source": "web_article",
            "source_url": article.source_url,
            "article_html": article.article_html,
            "title": article.title,
            "top_image": article.top_image,
            "images": article.images,
            "videos": article.movies,
            "meta_language": article.meta_lang,
            "meta_keywords": article.meta_keywords,
            "authors": article.authors,
            "publish_date": article.publish_date,
        }
        text = clean(text)
    except Exception as e:
        msg = f"Error using newspaper3k extractor: {str(e)}"
        print(msg)
    return text

def filterNews(url, keyword):
    """
    Filter news by keyword, checking each keyword against the article title.
    Use a browser user_agent and a timeout to work around request-timeout problems:
    https://stackoverflow.com/questions/63061172/newspaper3k-api-article-download-failed-with-httpsconnectionpool-port-443-read

    :param url:
    :param keyword:
    :return:
    """
    user_agent = (r"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                  r"Chrome/87.0.4280.67 Safari/537.36 Edg/87.0.664.47 ")
    config = Config()
    config.browser_user_agent = user_agent
    config.request_timeout = 30
    # Try-Except-Continue will skip to the next article in the For loop if there is an exception
    # https://github.com/codelucas/newspaper/issues/444 addresses the bug for URLs that are not found
    try:
        news = newspaper.Article(url, config=config)
        news.download()
        news.parse()
        for word in keyword:
            if word in news.title:
                # print(word)
                return True
        return False
    except ArticleException:
        return False

def home(request):
    main_url = "http://newsapi.org/v2/top-headlines?country=in&pageSize=17&apiKey=813ecab326254348a7d1e8ef73d2838f"
    # fetching data in json format
    open_news_page = requests.get(main_url).json()
    # getting all articles
    article = open_news_page["articles"]
    results = []
    user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'
    config = Config()
    config.browser_user_agent = user_agent
    for ar in article:
        article = Articles()
        intl = []
        intl.append(ar["title"])
        intl.append(ar["description"])
        intl.append(ar["urlToImage"])
        intl.append(ar["url"])
        toi_article = Article(intl[3], language="en", config=config)
        toi_article.download()
        # parse the article
        toi_article.parse()
        # perform natural language processing, i.e. nlp
        toi_article.nlp()
        summ = toi_article.summary
        intl.append(summ)
        results.append(intl)
    content = {'article': results}
    return render(request, 'summnews/index.html', content)

def extract_content(url):
    content = {}
    content = get_static_content()
    try:
        ua = UserAgent()
        config = Config()
        config.browser_user_agent = ua.chrome
        config.language = 'es'
        article = Article(url, config=config)
        article.download()
        article.parse()
        text = article.text
        content['text'] = text
        top_image = article.top_image
        content['image'] = top_image
        movielinks = []
        for movie in article.movies:
            movielinks.append(movie)
        content['videos'] = movielinks
    except Exception as e:
        print_exc(e)
    return content

def cache_art(url):
    headers = {
        'User-Agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:77.0) Gecko/20100101 Firefox/77.0",
        'From': '*****@*****.**'
    }
    from newspaper import Config
    from newspaper import Article
    config = Config()
    config.browser_user_agent = headers['User-Agent']
    article = Article(url, keep_article_html=True, config=config)
    article.download()
    article.parse()
    struct = {
        'html': article.article_html,
        'link': url,
        'author': article.authors,
        'pubdate': article.publish_date,
        'text': article.text,
        'title': article.title,
        'image': article.top_image,
        'description': article.meta_description
    }
    return struct

def GetArticle(FeedLink, request):
    user_agent = request.META['HTTP_USER_AGENT']
    config = Config()
    config.browser_user_agent = user_agent
    newarticle = Article(FeedLink, config=config)
    newarticle.download()
    newarticle.parse()
    return newarticle

def _getArticleObj(self):
    config = Config()
    config.keep_article_html = True
    article = Article(self.url, config=config)
    article.download()
    article.parse()
    article.nlp()
    return article

def parse_submission(url):
    '''
    Input: HTTP URL
    Output: list of keywords compiled from names found in the article title
    and keywords found in the article text
    '''
    user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'
    config = Config()
    config.browser_user_agent = user_agent
    a = Article(url, config=config)
    try:
        a.download()
        a.parse()
    except Exception as e:
        logging.error(e)
        return

    a.nlp()
    keywords = a.keywords
    title = a.title
    content = a.text
    thumbnail_image = get_thumbnail_image(a.html)

    # clean up content
    content = to_lower(content)
    content = strip_numbers(content)

    # clean up title
    title = to_lower(title)
    title = strip_numbers(title)

    # get names and identities from content text
    content_names = get_names(''.join(content))
    top_keywords = []
    for name in content_names:
        if name in title:
            # if the name is in the title, it is significant
            top_keywords.append(name)

    # find other keywords that are not names and are found in the title
    for keyword in keywords:
        if keyword in title and keyword not in stop_words and not keyword.isdigit():
            top_keywords.append(keyword)

    if not top_keywords:
        top_keywords = title.split(" ")
    else:
        # remove duplicates
        top_keywords = list(dict.fromkeys(top_keywords))
        top_keywords = remove_duplicates(top_keywords)

    return {
        'keywords': top_keywords,
        'title': a.title,
        'thumbnail_url': thumbnail_image
    }

def make_news_table(db, articles, logger):
    class News(db.Model):
        uuid = db.Column(UUIDType(), primary_key=True)
        url = db.Column(db.Text, nullable=False)
        title = db.Column(db.Text, nullable=False)
        picture = db.Column(URLType, nullable=True)
        icon = db.Column(URLType, nullable=True)
        body = db.Column(db.Text, nullable=False)
        publish_time = db.Column(db.DateTime, nullable=False)
        __tablename__ = 'News'

        def __repr__(self):
            return f'<News: {self.uuid}>'

    db.create_all()
    db.session.commit()
    urls = tuple(
        set(articles) -
        set(a[0] for a in News.query.with_entities(News.url).all()))
    if len(urls) > 0:
        from newspaper import Article, Config
        from newspaper.mthreading import NewsPool
        from uuid import uuid4

        config = Config()
        config.memoize_articles = False
        config.fetch_images = True
        config.request_timeout = 20
        config.thread_request_timeout = 20
        config.follow_meta_refresh = True
        config.MIN_WORD_COUNT = 150
        config.MIN_SENT_COUNT = 4
        config.browser_user_agent = 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
        news_pool = NewsPool(config=config)
        articles = tuple(Article(url, config=config) for url in urls)
        news_pool.set(articles, override_threads=4)
        logger.info(f"Articles being downloaded: {urls}")
        news_pool.join()
        for article in articles:
            logger.debug(f"Processing: {article.url}")
            if article.download_state == 1:
                logger.warn(f'Failed to download: {article.url}')
                continue
            article.parse()
            try:
                db.session.add(
                    News(uuid=uuid4(),
                         url=article.url,
                         title=article.title,
                         picture=article.top_image,
                         icon=article.meta_favicon,
                         body=article.text,
                         publish_time=article.publish_date))
            except Exception as e:
                logger.warn(e)
        db.session.commit()
    return News

def load_and_parse_full_article_text_and_image(url: str) -> Article:
    config = Config()
    config.MAX_SUMMARY_SENT = 8
    article = Article(url, config=config)
    article.set_html(load_page_safe(url))  # safer than article.download()
    article.parse()
    return article

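# load_page_safe() is referenced above but not shown. Below is a minimal sketch of
# what such a helper might look like; it is an assumption, not the original
# implementation. It fetches the page with requests and returns an empty string on
# any failure, so Article.set_html() never has to deal with an exception.
import requests


def load_page_safe(url: str) -> str:
    """Hypothetical helper: fetch a page's HTML, returning '' instead of raising."""
    try:
        resp = requests.get(
            url,
            timeout=10,
            headers={'User-Agent': 'Mozilla/5.0'},  # generic browser-like UA (assumed)
        )
        resp.raise_for_status()
        return resp.text
    except requests.RequestException:
        return ''
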
def handle(url, data, log):
    try:
        log.out(0, 'Downloading article...')
        resp = requests.get(url, headers={'User-Agent': data['user_agent']})
        if resp.status_code != 200:
            return False  #!cover
        config = Config()
        config.memoize_articles = False
        config.verbose = False
        article = Article(url='', config=config)
        log.out(0, 'Parsing article...')
        article.download()
        article.set_html(resp.text)
        article.parse()
        if article.top_image:
            src = article.top_image
            if 'http' not in src:  #!cover
                if 'https' in url:
                    src = 'https://' + src.lstrip('/ ').strip()
                else:
                    src = 'http://' + src.lstrip('/ ').strip()
            log.out(0, 'Newspaper located image: %s' % src)
            r = requests.get(src, headers={'User-Agent': data['user_agent']}, stream=True)
            if r.status_code == 200:
                content_type = r.headers['content-type']
                ext = mimetypes.guess_extension(content_type)
                if not ext or ext == '':  #!cover
                    log.out(1, 'NewsPaper Error locating file MIME Type: %s' % url)
                    return False
                if '.jp' in ext:
                    ext = '.jpg'  #!cover
                path = data['single_file'] % ext
                if not os.path.isfile(path):
                    if not os.path.isdir(data['parent_dir']):  #!cover
                        log.out(1, ("+Building dir: %s" % data['parent_dir']))
                        # Parent dir for the full filepath is supplied already.
                        os.makedirs(data['parent_dir'])
                    with open(path, 'wb') as f:
                        r.raw.decode_content = True
                        shutil.copyfileobj(r.raw, f)
                return path
            else:  #!cover
                log.out(0, ('\t\tError Reading Image: %s responded with code %i!' % (url, r.status_code)))
                return False
    except Exception as e:
        log.out(0, ('"Newspaper" Generic handler failed. ' + (str(e).strip())))
    return False  #!cover

def get_article(url):
    config = Config()
    config.request_timeout = 1.3
    article = Article(url, language='zh', config=config)
    try:
        article.download()
        article.parse()
        # article.nlp()
    except Exception:
        return
    return article.text

def extract_text(self, response):
    conf = ArticleConfig()
    conf.fetch_images = False
    conf.follow_meta_refresh = False
    conf.memoize_articles = False
    article = Article(url=response.url, config=conf)
    article.download(html=response.body)
    article.parse()
    return article.text

def parse(self, response):
    newsItem = CNNItem()
    newsItem['title'] = uniToAscii(
        response.selector.xpath('//meta[@property="og:title"]/@content').extract_first())
    newsItem['author'] = uniToAscii(
        response.selector.xpath('//meta[@name="author"]/@content').extract_first())
    newsItem['link'] = response.url
    print '^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n', newsItem['title']

    text = ""
    pars = response.selector.xpath('//*[@class="zn-body__paragraph"]//text()').extract()
    for par in pars:
        par_text = uniToAscii(par)
        if par_text[-1:] != '.':
            par_text += '.'
        text = text + par_text + " "
    newsItem['text'] = text

    keywords = uniToAscii(
        response.xpath("//meta[@name='keywords']/@content").extract_first())
    keywords_list = keywords.split(',')
    newsItem['keywords'] = [word.strip() for word in keywords_list]
    print '^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n', newsItem['title'], ' ', len(pars), ' ', newsItem['keywords']
    if newsItem['text']:
        print newsItem['text'][:100]
    yield newsItem

    config = Config()
    config.fetch_images = False
    config.memoize_articles = False
    url_build = newspaper.build(url=response.url, config=config)
    print response.url
    storyLinks = [article_url for article_url in url_build.article_urls()]
    print storyLinks
    for link in storyLinks:
        if re.match('.*/2017/03/0.*\.html', str(link)):
            print 'Fetching from ', str(link)
            yield scrapy.Request(str(link))

def nlp_feed(term, limiter=None):
    fresh_cache = True
    sources = set()
    config = Config()
    config.browser_user_agent = AGENT
    urls = search_fetch(term, limiter)
    print(urls)
    for url in urls[:1 + EVIDENCE_BATCH_SIZE]:
        # Exclude any links (usually Associated Press ones) which are themselves fact-checks,
        # which would be cheating... unless running on a closed domain, in which case
        # 'factcheck' is in every domain...
        if limiter is None and ('factcheck' in url or 'fact-check' in url):
            continue
        # Scrape requested URLs if they aren't currently in cache.
        if url in article_cache:
            wc = article_cache[url]
            if wc != '':
                sources.add((url, wc))
            print("Cache Hit on ", url[:max(len(url), 50)], "......")
        else:
            try:
                # newspaper can't handle the BBC, ergo a custom approach
                if 'www.bbc' in url:
                    article = requests.get(url)
                    if article.status_code != 200:
                        raise ArticleException
                    soup = BeautifulSoup(article.content, 'html.parser')
                    body = ' '.join(z.text for z in soup.findAll('p'))
                    sources.add((url, body))
                    if body not in article_cache:
                        article_cache[url] = body
                        fresh_cache = False
                else:
                    article = Article(url, config=config, language='en')
                    article.download()
                    article.parse()
                    sources.add((url, article.text))
                    if article.text not in article_cache:
                        article_cache[url] = article.text
                        fresh_cache = False
            except ArticleException:
                print("Couldn't fetch: ", url)
                article_cache[url] = ''
                fresh_cache = False
    # Only dump to disk if the cache has been modified
    if not fresh_cache:
        dump_to_disk()
    return sources

def getData(url):
    data_url = []
    try:
        user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'
        config = Config()
        config.browser_user_agent = user_agent
        page = Article(url, config=config)
        page.download()
        page.parse()
        data_url.append(page.text)
    except:
        print('***FAILED TO DOWNLOAD***', page.url)
    return data_url

def newss(self, lienn):
    config = Config()
    config.browser_user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'
    article = Article(lienn, language="fr", config=config)
    article.download()
    article.parse()
    title = article.title
    author = article.authors
    article.nlp()
    summary = article.summary
    keywords = article.keywords
    return title, lienn, author, summary, keywords

def get_keywords(self, url):
    user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'
    config = Config()
    config.browser_user_agent = user_agent
    paper = Article(url, config=config)
    try:
        paper.download()
        paper.parse()
        paper.nlp()
    except:
        return []
    return paper.keywords

def whoHasTimeToRead(url):
    is_article = valid_url(url, verbose=True)
    config = Config()
    config.MAX_KEYWORDS = 10
    if is_article:
        sumitup = {}
        b = Article(url=url, config=config)
        b.download()
        b.parse()
        b.nlp()
        sumNews = summary(b.title, b.text, b.keywords)
        sumTitle = b.title
        movies = b.movies[0] if len(b.movies) > 0 else "None"
        return sumNews, sumTitle, movies
    return "Nope"

def scrape(url):
    config = Config()
    config.browser_user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.141 Safari/537.36'
    article = Article(url['url'], config=config)
    article.download()
    article.parse()
    soup = BeautifulSoup(article.html, 'lxml')
    ga = LongformArticle()
    ga.url = url['url']
    ga.title = url['title']
    ga.link = url['url']
    ga.text = article.text
    return ga

def __init__(self, output_dir, dataframe, num_rows=None, user_agent=default_user_agent) -> None:
    self.output_dir = output_dir
    temp_df = dataframe
    if num_rows:
        self.df = temp_df.iloc[:num_rows]
    else:
        self.df = temp_df
    self.user_agent = user_agent
    self.config = Config()
    self.config.browser_user_agent = user_agent
    self.download_logs_dict = {
        "link": [],
        "download_state": [],
        "download_exception_msg": []
    }
    self.number_of_downloaded = 0
    self.preprocess_df()

def extract_article_html(url):
    config = Config()
    config.keep_article_html = True
    article = Article(url, config=config)
    article.download()
    article.parse()
    article_html = article.article_html
    html = lxml.html.fromstring(article_html)
    for tag in html.xpath('//*[@class]'):
        tag.attrib.pop('class')
    return lxml.html.tostring(html).decode('utf-8')

def __init__(self, url):
    self.url = url
    self.config = Config()
    self.config.memoize_articles = False
    # self.config.fetch_images = False
    self.config.browser_user_agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.35 Safari/537.36'
    self.config.request_timeout = 30
    # logger.info('url is %s', url)
    self.article = Article(url, config=self.config)
    self._import_article()
    self.results = {}
    self.results['sourceurl'] = url
    try:
        self.results['title'] = self.article.title
        self.results['authors'] = self.article.authors
        self.results['canonical_link'] = self.article.canonical_link
        self.results['url'] = self.article.url
        self.results['top_image'] = self.article.top_image
        self.results['images'] = list(self.article.images)
        self.results['movies'] = self.article.movies
        self.results['description'] = self.article.meta_description
        # self.results['meta_favicon'] = self.article.meta_favicon
        self.results['keywords'] = self.article.meta_keywords
        self.results['lang'] = self.article.meta_lang
        self.results['summary'] = self.article.summary
        self.results['tags'] = list(self.article.tags)
        self.results['text'] = self.article.text
    except Exception:
        logger.fatal('Unable to make results for: %s, %s', url, self.results)
        pass

def main(argv):
    sourcelist = []
    if len(argv) > 1:
        sourcefile = argv[1]
        try:
            with open(sourcefile, 'r') as f:
                sourcelist = f.read().strip().split('\n')
        except IOError:
            print("File does not exist")
    """
    Check for the existence of the memo cache.

    If it doesn't exist, create the memo cache and populate the top sources file
    with the specified sources.txt file. If none is specified, return an error
    and terminate.

    If the memo cache exists and a sources.txt is specified, check it against the
    top sources and add any new ones. If no sources.txt is specified, use the
    top sources file.
    """
    firstrun = False
    memo_cache_path = os.path.join(os.path.dirname(__file__), '.memo_cache')
    if not os.path.exists(memo_cache_path):
        if len(sourcelist) > 0:
            firstrun = True
            os.makedirs(memo_cache_path)
            with open(os.path.join(memo_cache_path, '.top_sources'), 'w') as f:
                [f.write(source + '\n') for source in sourcelist]
        else:
            print("You must specify an input file on the first run")
            print("An input file contains line-separated urls to the top-level domains you wish to crawl")
            raise SystemExit
    else:
        if len(sourcelist) > 0:
            with open(os.path.join(memo_cache_path, '.top_sources'), 'w') as f:
                [f.write(source + '\n') for source in sourcelist]
        else:
            with open(os.path.join(memo_cache_path, '.top_sources'), 'r') as f:
                sourcelist = f.read().split('\n')

    # this config applies to the entire crawling process
    config = Config()
    config.language = 'id'
    config.MIN_SENT_COUNT = 20
    config.memoize = True
    config.fetch_images = False

    top_sources = [IntiSource(url=source, config=config) for source in sourcelist]
    if firstrun:
        build_categories(top_sources)

def __init__(self, url):
    c = Config()
    c.keep_article_html = True
    article = Article(url=url, config=c)
    article.download()
    article.parse()
    try:
        article.nlp()
        summary = article.summary
        if summary == "":
            self.summary = "Summary not available!"
        else:
            self.summary = summary
    except Exception:
        self.summary = "Summary not available!"

def __init__(self):
    Config.__init__(self)
    self.memoize_articles = False
    self.fetch_images = False
    self.request_timeout = 60
    self.number_threads = 20
    self.keep_article_html = True
    self.MIN_WORD_COUNT = 20
    self.verbose = True
    self.urls = {
        'category1': [
            'a_url,articles,a_regex_for_links,a_tags',
            'b_url,article,,b.tags',
            'c_url,feed,,c.tags'
        ],
    }
    self.outputDir = 'content' + os.sep + 'category'
    self.max_number = 2000
    self.max_days = 100

def main(argv):
    if len(argv) > 1:
        htmlist = argv[1]
    else:
        htmlist = 'htmlist'

    # Our permanent config for html cleaning
    config = Config()
    config.language = 'id'
    config.MIN_SENT_COUNT = 20
    config.memoize = False
    config.fetch_images = False
    config.verbose = True

    cleaner = Article(url='', config=config)

    with open(htmlist, 'r') as f:
        htmfile = f.read().split('\n')

    raw = []
    for htm in htmfile:
        print(htm)
        if not htm.endswith("rss.html"):
            with open(htm, 'r') as f:
                h = f.read()
            cleaner.set_html(h)
            cleaner.parse()
            sentences = nlp.split_sentences(cleaner.text)
            # raw.append(sentences)
            with open('htm-out', 'a') as f:
                [f.write(r + '\n') for r in sentences]

def getConf(language, parser):
    '''
    Get a newspaper Config object.
    :return:
    '''
    conf = Config()
    conf.fetch_images = False
    conf.memoize_articles = False
    conf.set_language(language)
    conf.parser_class = parser
    return conf

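# A hedged usage sketch (the call site below is an assumption, not from the source):
# the Config returned by getConf() would typically be passed to newspaper.build()
# or Article() so that the language and parser settings take effect.
#
#   conf = getConf('en', 'lxml')
#   paper = newspaper.build('https://example.com', config=conf)
#   art = Article('https://example.com/some-story', config=conf)
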
def main(argv):
    TOP_PATH = os.path.dirname(__file__)
    OUT_PATH = os.path.join(TOP_PATH, 'output')
    if not os.path.exists(OUT_PATH):
        os.makedirs(OUT_PATH)

    # Our permanent config for crawling
    config = Config()
    config.language = 'id'
    config.MIN_SENT_COUNT = 20
    config.memoize = False
    config.fetch_images = False
    config.verbose = True

    # Get contents of our source file
    sourcefile = os.path.join(TOP_PATH, "sources.txt")
    with open(os.path.join(sourcefile), 'r') as f:
        sourcelist = f.read().strip().split('\n')

    # Initialize our sources
    sources = [IntiSource(source, config=config) for source in sourcelist]

    # Make domain directories inside our output path and build sources
    for s in sources:
        if not os.path.exists(os.path.join(OUT_PATH, s.domain)):
            dom_path = os.path.join(OUT_PATH, s.domain)
            os.makedirs(dom_path)
        # Build
        s.build()
        if config.verbose:
            s.print_summary()

    # Multithreaded source downloading and parsing
    news_pool.set(sources, threads_per_source=4)
    news_pool.join()
    article_parse(sources)

import sys

from newspaper import Article, Config
from lxml.etree import tostring

config = Config()
config.keep_article_html = True


def extract(url):
    article = Article(url=url, config=config)
    article.download()
    article.parse()
    return dict(
        title=article.title,
        text=article.text,
        html=article.html,
        image=article.top_image,
        top=article.clean_top_node,
        authors=article.authors,
    )


if __name__ == '__main__':
    if len(sys.argv) < 2:
        print("Usage : %s url" % (sys.argv[0]))
        sys.exit(1)
    a = extract(sys.argv[1])
    print(tostring(a['top']))

__author__ = 'James'

import newspaper
from newspaper import Config, news_pool

config = Config()
config.set_language('en')
config.memoize_articles = False

reuters = newspaper.build(url='http://www.reuters.com', config=config)
indo = newspaper.build(url='http://www.independent.ie', config=config)
papers = [reuters, indo]

news_pool.set(paper_list=papers, threads_per_source=3)
news_pool.join()

for paper in papers:
    print(paper.brand + ": " + str(paper.size()) + " article(s)")
    # for article in paper.articles:
    #     print(article.title)

# print("-----------\nCATEGORIES\n-----------")
#
# for category in a.categories:
#     print(category.url)
#     b = newspaper.Source(url=category.url)
#     b.build()
#     print("\t-----------\n\tFEEDS\t\n-----------\t")
#     for feed_url in b.feed_urls():
#         print("\t->" + feed_url)

def getFeeds(url):
    config = Config()
    config.fetch_images = False
    paper = newspaper.build(url, config=config)
    for feed_url in paper.feed_urls():  # or paper.article_urls()
        print(feed_url)

def main():
    import newspaper  # article download utility
    from newspaper import news_pool, Config, Article, Source
    import re  # regex
    import csv  # csv file-formatting
    import unicodedata  # string cleaning
    import httplib  # needed for the BadStatusLine handler below (this snippet is Python 2)
    from datetime import datetime  # time-checking for cache-updates

    print("Retrieving sources and update times\n...")

    # Read active list of news/media sources
    f = open("sourcelist", "r")
    sources = f.read().splitlines()
    times = []
    # ONGOING: update time storage and retrieval
    #   - dependent on whether caching is sufficient

    papers = {}  # Empty dictionary
    print("Building papers\n....\n...\n...")

    # Store total and current number of articles for progress metrics
    total_articles = 0
    current_articles = 0

    # Build dictionary, using url name for keys ex/ 'http://cnn.com' key will be 'cnn'
    for i in range(len(sources)):
        key = re.sub(r'(^https?:\/\/|\.com\n$|\.org\n$)', '', sources[i])
        papers[key] = newspaper.build(sources[i], memoize_articles=True)
        # Print number of articles added from "recent" list for logging purposes
        total_articles = total_articles + papers[key].size()
        print(key, papers[key].size())

    print("Downloading articles (this may take a while)\n...\n...\n...")
    config = Config()
    config.fetch_images = False

    # Download all articles via multi-threading
    news_pool.set([x[1] for x in papers.items()], threads_per_source=2)  # Test various thread counts
    news_pool.join()

    print("Extracting text from articles and writing to dump files \n...\n...\n...")

    # Append articles to aggregate and individual csv's
    # Format: col(1) = source, col(2) = date, col(3) = title, col(4) = authors,
    #         col(5) = text, col(6) = keywords
    with open('papers.csv', 'a') as outcsv:
        # Setup aggregate csv writer
        writer = csv.writer(outcsv)
        # writer.writerow(["Source","Date","Title","Authors","Text","Keywords"])

        # Traverse sources
        for i in papers:
            # Setup single_source csv writing
            source = i
            ind_outcsv = open(str(i + ".csv"), 'a')
            ind_writer = csv.writer(ind_outcsv)

            # Traverse articles in source
            for j in range(papers[i].size()):
                # Parse articles and extract features
                current_articles += 1
                print("Processing " + str(i) + " article " + str(current_articles) +
                      " of " + str(total_articles) + " (" +
                      str("{0:.2f}".format((current_articles / float(total_articles) * 100), 2)) + " %)")
                try:
                    papers[i].articles[j].parse()
                    papers[i].articles[j].nlp()  # nlp() must run before .keywords is populated
                    # Grab key features
                    title = unicodedata.normalize('NFKD', papers[i].articles[j].title).encode('ascii', 'ignore')
                    authors = [x.encode('UTF-8') for x in papers[i].articles[j].authors]
                    text = unicodedata.normalize('NFKD', papers[i].articles[j].text).encode('ascii', 'ignore')
                    date = papers[i].articles[j].publish_date
                    keywords = [x.encode('UTF-8') for x in papers[i].articles[j].keywords]
                    # Add new row to both single-source and aggregate files
                    ind_writer.writerow([source, date, title, authors, text, keywords])
                    writer.writerow([source, date, title, authors, text, keywords])
                except httplib.BadStatusLine:
                    print "httplib.BadStatusLine, no dice"