def retrieve_article(url):
    try:
        config = Configuration()
        config.fetch_images = False
        req = urllib.request.Request(
            url,
            headers={
                'User-Agent': "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.0.1) Gecko/20020919"
            })
        con = urllib.request.urlopen(req, timeout=10)
        html = ''.join([x for x in map(chr, con.read()) if ord(x) < 128])
        article = Article(url='', config=config)
        article.set_html(html)
        article.parse()
        text = ''.join([i if ord(i) < 128 else ' ' for i in str(article.text)])
        if len(text) < 300:
            article = Article(url='', config=config, language="id")
            article.set_html(html)
            article.parse()
            text = ''.join(
                [i if ord(i) < 128 else ' ' for i in str(article.text)])
        text = text.replace('\n', ' ').replace('\r', ' ').replace('\t', ' ')
        return text
    except Exception as e:
        print(e)
        return False
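# Hedged usage sketch for retrieve_article() above: the excerpt assumes these
# imports from its original module, and the URL below is a placeholder, not a
# value taken from the source.
import urllib.request

from newspaper import Article
from newspaper.configuration import Configuration

text = retrieve_article('http://example.com/some-article')
if text is not False:
    print(text[:200])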
def test_source_build(self):
    """
    builds a source object, validates it has no errors, prints out
    all valid categories and feed urls
    """
    DESC = """CNN.com delivers the latest breaking news and information on the latest top stories, weather, business, entertainment, politics, and more. For in-depth coverage, CNN.com provides special reports, video, audio, photo galleries, and interactive guides."""
    BRAND = 'cnn'

    config = Configuration()
    config.verbose = False
    s = Source('http://cnn.com', config=config)
    s.clean_memo_cache()
    s.build()

    assert s.brand == BRAND
    assert s.description == DESC

    # For this test case and a few more, I don't believe you can actually
    # assert two values to equal each other because some values are ever changing.
    # Instead, I'm just going to print some stuff out so it is just as easy to take
    # a glance and see if it looks OK.
    print('\t\tWe have %d articles currently!' % s.size())
    print()
    print('\t\t%s categories are: %s' % (s.url, str(s.category_urls())))
def test_source_build(self):
    """
    builds a source object, validates it has no errors, prints out
    all valid categories and feed urls
    """
    DESC = """CNN.com delivers the latest breaking news and information on the latest top stories, weather, business, entertainment, politics, and more. For in-depth coverage, CNN.com provides special reports, video, audio, photo galleries, and interactive guides."""
    BRAND = "cnn"

    config = Configuration()
    config.verbose = False
    s = Source("http://cnn.com", config=config)
    s.clean_memo_cache()
    s.build()

    assert s.brand == BRAND
    assert s.description == DESC

    # For this test case and a few more, I don't believe you can actually
    # assert two values to equal each other because some values are ever changing.
    # Instead, I'm just going to print some stuff out so it is just as easy to take
    # a glance and see if it looks OK.
    print("\t\tWe have %d articles currently!" % s.size())
    print()
    print("\t\t%s categories are: %s" % (s.url, str(s.category_urls())))
def extract(results):
    try:
        config = Configuration()
        config.fetch_images = False
        req = urllib.request.Request(
            results["url"],
            headers={
                'User-Agent': "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.0.1) Gecko/20020919"
            })
        con = urllib.request.urlopen(req, timeout=10)
        html = ''.join([x for x in map(chr, con.read()) if ord(x) < 128])
        article = Article(url='', config=config)
        article.set_html(html)
        article.parse()
        text = ''.join([i if ord(i) < 128 else ' ' for i in str(article.text)])
        if len(text) < 300:
            article = Article(url='', config=config, language="id")
            article.set_html(html)
            article.parse()
            text = ''.join(
                [i if ord(i) < 128 else ' ' for i in str(article.text)])
        text = text.replace('\n', ' ').replace('\r', ' ').replace('\t', ' ')
        print("=", end='', flush=True)
        return (results["url"], results["title"], text, article.publish_date)
    except Exception as e:
        print(e)
        return (results["url"], results["title"], None, None)
def newspaper_fulltext2(parser, language, url):
    '''
    This is a faster version of the function that uses some internal
    newspaper3k functions so that the lxml parse tree doesn't need to be
    recreated.

    Adapted from https://github.com/codelucas/newspaper/blob/master/newspaper/api.py#L71
    but modified to use an already existing lxml parser
    '''
    from newspaper.cleaners import DocumentCleaner
    from newspaper.configuration import Configuration
    from newspaper.extractors import ContentExtractor
    from newspaper.outputformatters import OutputFormatter

    config = Configuration()
    config.language = language
    config.keep_article_html = True
    extractor = ContentExtractor(config)
    document_cleaner = DocumentCleaner(config)
    output_formatter = OutputFormatter(config)
    doc = parser
    doc = document_cleaner.clean(doc)
    doc = extractor.calculate_best_node(doc)
    if doc is not None:
        doc = extractor.post_cleanup(doc)
        text, html = output_formatter.get_formatted(doc)
    else:
        text = ''
        html = ''
    return {
        'value': {
            'text': text,
            'html': html,
        },
        'pattern': 'newspaper3k',
    }
def newspaper_fulltext(parser, language):
    '''
    Adapted from https://github.com/codelucas/newspaper/blob/master/newspaper/api.py#L71
    but modified to use an already existing lxml parser
    '''
    from newspaper.cleaners import DocumentCleaner
    from newspaper.configuration import Configuration
    from newspaper.extractors import ContentExtractor
    from newspaper.outputformatters import OutputFormatter

    config = Configuration()
    config.language = language
    extractor = ContentExtractor(config)
    document_cleaner = DocumentCleaner(config)
    output_formatter = OutputFormatter(config)
    doc = parser
    doc = document_cleaner.clean(doc)
    top_node = extractor.calculate_best_node(doc)
    if top_node is not None:
        top_node = extractor.post_cleanup(top_node)
        text, html = output_formatter.get_formatted(top_node)
    else:
        text = None
        html = None
    return {
        'value': {
            'text': text,
            'html': html,
        },
        'pattern': 'newspaper3k',
    }
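# Hedged usage sketch for newspaper_fulltext() above: the "parser" argument is
# an already-built lxml document. Building one with newspaper's own parser, as
# other snippets in this collection do, might look like this; the HTML string
# is a placeholder.
from newspaper.configuration import Configuration

def _example_newspaper_fulltext(html_string):
    lxml_doc = Configuration().get_parser().fromstring(html_string)
    result = newspaper_fulltext(lxml_doc, 'en')
    return result['value']['text']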
def test_meta_refresh_no_url_redirect(self):
    config = Configuration()
    config.follow_meta_refresh = True
    article = Article('', config=config)
    html = mock_resource_with('ap_meta_refresh', 'html')
    article.download(input_html=html)
    article.parse()
    self.assertEqual(article.title, 'News from The Associated Press')
def test_meta_refresh_redirect(self):
    config = Configuration()
    config.follow_meta_refresh = True
    article = Article('', config=config)
    html = mock_resource_with('google_meta_refresh', 'html')
    article.download(input_html=html)
    article.parse()
    self.assertEqual(article.title, 'Example Domain')
def test_meta_refresh_no_url_redirect(self):
    config = Configuration()
    config.follow_meta_refresh = True
    article = Article('', config=config)
    html = mock_resource_with('ap_meta_refresh', 'html')
    article.download(html=html)
    article.parse()
    self.assertEqual(article.title, 'News from The Associated Press')
def test_meta_refresh_redirect(self):
    # TODO: We actually hit example.com in this unit test ... which is bad
    # Figure out how to mock an actual redirect
    config = Configuration()
    config.follow_meta_refresh = True
    article = Article('', config=config)
    html = mock_resource_with('google_meta_refresh', 'html')
    article.download(input_html=html)
    article.parse()
    self.assertEqual(article.title, 'Example Domain')
def __init__(self):
    self.config = Configuration()  # sets meta config for article and parser
    self.parser = self.config.get_parser()  # parser
    self.extractor = ContentExtractor(
        self.config)  # extracts info (author, tags, text, etc.) from parsed article
    self.doc_cleaner = DocumentCleaner(
        self.config)  # cleans unwanted tags and nodes from DOM
    self.formatter = OutputFormatter(
        self.config)  # outputs formatted text from parsed xpath nodes
def test_meta_refresh_redirect(self):
    # TODO: We actually hit example.com in this unit test ... which is bad
    # Figure out how to mock an actual redirect
    config = Configuration()
    config.follow_meta_refresh = True
    article = Article('', config=config)
    html = mock_resource_with('google_meta_refresh', 'html')
    article.download(html=html)
    article.parse()
    self.assertEqual(article.title, 'Example Domain')
def clean(html_content):
    config = Configuration()
    config.fetch_images = False
    # TODO: allow URL passing
    article = Article("http://example.com", config=config)
    article.set_html(html_content)
    article.is_downloaded = True
    article.parse()
    return article.text
def html_to_article(content, language):
    content = content.strip()
    if not len(content):
        return ''

    config = NewspaperConfig()
    config.language = language

    doc = config.get_parser().fromstring(content.strip())
    if doc is None:
        return ''

    # Split block-level elements with newlines
    for tag in _BLOCKLEVEL_TAGS:
        if tag in _MEANINGLESS_TAGS:
            continue
        for node in doc.xpath('//{}'.format(tag)):
            node.append(etree.Element('br'))
            node.append(etree.Element('br'))

    # Initial cleanup
    cleaner = _NewspaperCleaner(config)
    doc = cleaner.clean(doc)

    # Best node estimation
    extractor = NewspaperExtractor(config)
    top = extractor.calculate_best_node(doc)
    if top is None:
        del doc, cleaner, extractor
        etree.clear_error_log()
        return ''
    top = extractor.post_cleanup(top)

    # Cleanup dummy nodes used for estimation
    for dummy in top.xpath("//p[@newspaper='dummy']"):
        dummy.getparent().remove(dummy)

    # Custom formatting to avoid unnecessary computations
    formatter = NewspaperFormatter(config)
    formatter.top_node = top
    formatter.remove_negativescores_nodes()
    content = formatter.convert_to_html()
    content = str(content).strip()
    content = unescape(content)

    del doc, top, cleaner, extractor, formatter
    etree.clear_error_log()
    return content
def get_articles_c_tribune(complement):
    '''
    Given a string (complement) of the form 2011/01/01, get articles
    from the Chicago Tribune Archives.

    Inputs: a string called complement containing the date for a given day

    Returns: info dictionary for that day
             writes csv file with nltk scores for complement
    '''
    c_tribune = 'http://articles.chicagotribune.com/'
    archive_url = c_tribune + complement + '/'
    articles = {}
    pm = urllib3.PoolManager()
    html = pm.urlopen(url=archive_url, method="GET").data
    soup = bs4.BeautifulSoup(html, 'lxml')
    #print(soup)
    tag_list = soup.find_all('h3')
    if tag_list:
        for index, tag in enumerate(tag_list):
            rv = {}
            articles[index] = rv
            article = c_tribune + tag.a['href']
            #print(article)
            config = Configuration()
            config.browser_user_agent = get_user_agent()
            article_object = Article(article)
            article_object.download()
            if article_object:
                article_object.parse()
                if 'Death Notice:' in article_object.title:
                    continue
                title = article_object.title
                #date = article_object.publish_date
                text = article_object.text
                rv['article'] = title
                rv['pub_date'] = complement
                rv['nltk_score'] = get_nltk_score(text)
                rv['nltk_score_title'] = get_nltk_score(title)
                rv['source'] = 'Chicago Tribune'
    write_csv_pro(
        articles,
        'chicago_tribune_' + re.sub("/", "_", complement) + '.csv')
def test_meta_refresh_no_url_redirect(self):
    config = Configuration()
    config.follow_meta_refresh = True
    article = Article('', config=config)
    html = mock_resource_with('ap_meta_refresh', 'html')
    article.download(input_html=html)
    article.parse()
    self.assertEqual(article.title, 'News from The Associated Press')

# Err Fixed: Called before the first test case of this unit begins
@print_test
def test_pre_download_parse(self):
    article = Article(self.article.url)
    self.assertRaises(ArticleException, article.parse)
def get_articles_pro(complement):
    '''
    Given a string (complement) of the form 2011/01/01, get articles
    for a given day from ProPublica

    Inputs: a string called complement containing the date for a given day
            propublica tag_type = 'div'
            propublica class_type = 'excerpt-thumb'

    Returns: Dictionary with articles for that day
             Writes csv files with nltk scores
    '''
    propublica = 'https://www.propublica.org/archive/'
    archive_url = propublica + complement + '/'
    articles = {}
    pm = urllib3.PoolManager()
    html = pm.urlopen(url=archive_url, method="GET").data
    soup = bs4.BeautifulSoup(html, 'lxml')
    tag_list = soup.find_all('div', class_='excerpt-thumb')
    if tag_list:
        for index, tag in enumerate(tag_list):
            rv = {}
            articles[index] = rv
            article = tag.a['href']
            print(article)
            config = Configuration()
            config.browser_user_agent = get_user_agent()
            article_object = Article(article)
            article_object.download()
            if article_object:
                article_object.parse()
                title = article_object.title
                #date = article_object.publish_date
                text = article_object.text
                rv['article'] = title
                rv['pub_date'] = complement
                rv['nltk_score'] = get_nltk_score(text)
                rv['nltk_score_title'] = get_nltk_score(title)
                rv['source'] = 'ProPublica'
    write_csv_pro(articles,
                  'propublica_' + re.sub("/", "_", complement) + '.csv')
    return articles
def test_source_build(self):
    """builds a source object, validates it has no errors, prints out
    all valid categories and feed urls"""
    DESC = """CNN.com delivers the latest breaking news and information on the latest top stories, weather, business, entertainment, politics, and more. For in-depth coverage, CNN.com provides special reports, video, audio, photo galleries, and interactive guides."""
    BRAND = 'cnn'

    configs = Configuration()
    configs.verbose = False
    s = Source('http://cnn.com', configs=configs)
    s.clean_memo_cache()
    s.build()

    assert s.brand == BRAND
    assert s.description == DESC

    print('\t\tWe have %d articles currently!' % s.size())
def test_download_works(self):
    config = Configuration()
    config.memoize_articles = False
    slate_paper = newspaper.build('http://slate.com', config=config)
    tc_paper = newspaper.build('http://techcrunch.com', config=config)
    espn_paper = newspaper.build('http://espn.com', config=config)

    print('slate has %d articles tc has %d articles espn has %d articles' %
          (slate_paper.size(), tc_paper.size(), espn_paper.size()))

    papers = [slate_paper, tc_paper, espn_paper]
    news_pool.set(papers, threads_per_source=2)
    news_pool.join()

    print('Downloaded slate mthread len', len(slate_paper.articles[0].html))
    print('Downloaded espn mthread len', len(espn_paper.articles[-1].html))
    print('Downloaded tc mthread len', len(tc_paper.articles[1].html))
def test_download_works(self):
    config = Configuration()
    config.memoize_articles = False
    slate_paper = newspaper.build('http://slate.com', config=config)
    tc_paper = newspaper.build('http://techcrunch.com', config=config)
    espn_paper = newspaper.build('http://espn.com', config=config)

    print('Slate has %d articles TC has %d articles ESPN has %d articles' %
          (slate_paper.size(), tc_paper.size(), espn_paper.size()))

    papers = [slate_paper, tc_paper, espn_paper]
    news_pool.set(papers, threads_per_source=2)
    news_pool.join()

    print('Downloaded Slate mthread len', len(slate_paper.articles[0].html))
    print('Downloaded ESPN mthread len', len(espn_paper.articles[-1].html))
    print('Downloaded TC mthread len', len(tc_paper.articles[1].html))
def modified_fulltext(parser, language, url):
    '''
    Adapted from https://github.com/codelucas/newspaper/blob/master/newspaper/api.py#L71
    but modified to use an already existing lxml parser
    '''
    url_parsed = urlparse(url)
    from newspaper.cleaners import DocumentCleaner
    from newspaper.configuration import Configuration
    from newspaper.extractors import ContentExtractor
    from newspaper.outputformatters import OutputFormatter

    config = Configuration()
    config.language = language
    config.keep_article_html = True
    extractor = ContentExtractor(config)
    document_cleaner = DocumentCleaner(config)
    output_formatter = OutputFormatter(config)
    doc = parser
    doc = rm_ads(doc, url_parsed.hostname)
    doc = clean(document_cleaner, doc)
    #doc = document_cleaner.clean(doc)
    doc = calculate_best_node(extractor, doc)
    #doc = extractor.calculate_best_node(doc)
    if doc is not None:
        #doc = extractor.add_siblings(doc)
        doc = post_cleanup(doc)
        #doc = extractor.post_cleanup(doc)
        text, html = get_formatted(doc)
        #text, html = output_formatter.get_formatted(doc)
    else:
        text = ''
        html = ''
    return {
        'value': {
            'text': text,
            'html': html,
        },
        'pattern': 'modified',
    }
def fulltext(html, language='en'):
    """Takes article HTML string input and outputs the fulltext
    Input string is decoded via UnicodeDammit if needed
    """
    config = Configuration()
    config.language = language
    extractor = ContentExtractor(config)
    document_cleaner = DocumentCleaner(config)
    output_formatter = WithTagOutputFormatter(config)
    doc = config.get_parser().fromstring(html)
    doc = document_cleaner.clean(doc)
    top_node = extractor.calculate_best_node(doc)
    top_node = extractor.post_cleanup(top_node)
    text, article_html = output_formatter.get_formatted(top_node)
    return text, article_html
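# Hedged example call for fulltext() above. WithTagOutputFormatter is a custom
# formatter from the snippet's original project, so this only runs where that
# class is defined; the HTML literal is a placeholder.
sample_html = "<html><body><article><p>First paragraph of the story.</p></article></body></html>"
text, article_html = fulltext(sample_html, language='en')
print(text)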
def test_download_works(self):
    """
    """
    config = Configuration()
    config.is_memoize_articles = False
    slate_paper = newspaper.build('http://slate.com', config)
    tc_paper = newspaper.build('http://techcrunch.com', config)
    espn_paper = newspaper.build('http://espn.com', config)

    print('slate has %d articles tc has %d articles espn has %d articles'
          % (slate_paper.size(), tc_paper.size(), espn_paper.size()))

    papers = [slate_paper, tc_paper, espn_paper]
    news_pool.set(papers, threads_per_source=2)
    news_pool.join()

    print('Downloaded slate mthread len', len(slate_paper.articles[0].html))
    print('Downloaded espn mthread len', len(espn_paper.articles[-1].html))
    print('Downloaded tc mthread len', len(tc_paper.articles[1].html))
def get_info(dictionary):
    '''
    Get information for all the articles for the selected sections
    in La Jornada

    Inputs: Dictionary with selected sections as keys and list of urls
            representing articles in every section

    Returns: A dictionary with nltk scores for title and text for every
             article in every section
    '''
    rv = {}
    count = 0
    for key, item in dictionary.items():
        for i in item:
            config = Configuration()
            config.browser_user_agent = get_user_agent()
            article = Article(i, language='es')
            article.download()
            if article.is_downloaded == True:
                irv = {}
                rv[count] = irv
                article.parse()
                count = count + 1
                title = article.title
                tr_title = mtranslate.translate(title, "en", "auto")
                #print(title, key, count)
                date = article.publish_date.date()
                text = article.text
                tr_text = translate_article(text)
                #if key not in rv:
                irv['article'] = tr_title
                irv['pub_date'] = date
                irv['nltk_score'] = get_nltk_score(
                    tr_text)  # will be converted into sentiment score
                irv['source'] = 'Jornada'
                irv['nltk_score_title'] = get_nltk_score(tr_title)
                #rv[key].append((title, date, text))
    return rv
def ProcessArticle(urlStr, domain, htmlStr, cursor):
    config = Configuration()
    extractor = ContentExtractor(config)
    clean_doc = config.get_parser().fromstring(htmlStr)
    title = extractor.get_title(clean_doc)
    authors = extractor.get_authors(clean_doc)
    text = fulltext(htmlStr)

    text_keyws = list(nlp.keywords(text).keys())
    title_keyws = list(nlp.keywords(title).keys())
    keyws = list(set(title_keyws + text_keyws))

    summary_sents = nlp.summarize(title=title, text=text,
                                  max_sents=config.MAX_SUMMARY_SENT)
    summary = '\n'.join(summary_sents)

    if len(text) == 0:
        OnArticleProcessError(urlStr)
    else:
        StoreToDatabase(urlStr, domain, title, authors, text, keyws, summary,
                        cursor)
def __init__(self,
             html: str,
             url_to_visit: str,
             scraped_from: str,
             fake: bool,
             conf: Optional[Configuration] = None,
             do_nlp: bool = True):
    super().__init__(url='', config=conf if conf is not None else Configuration())
    super().set_html(html)
    super().parse()
    if do_nlp:
        super().nlp()
    self.fake = fake
    self.url_to_visit: str = url_to_visit
    self.scraped_from: str = scraped_from
    self.soup: BeautifulSoup = utreq.soup_from_response(html)
    self.actual_url: str = url_to_visit
    self.__text_length: Optional[int] = None
    self.scraped_date = datetime.now()
def get_data_from_html(html):
    result = {}

    parsed_html = Parser.fromstring(html)
    config = Configuration()
    extractor = ContentExtractor(config)
    formatter = OutputFormatter(config)
    cleaner = DocumentCleaner(config)

    result['title'] = extractor.get_title(parsed_html)

    publishing_date = extractor.get_publishing_date('', parsed_html)
    if publishing_date is None:
        publishing_date = datetime.datetime.now()
    result['published_at'] = publishing_date.isoformat()

    cleaned_html = cleaner.clean(parsed_html)
    top_node = extractor.calculate_best_node(cleaned_html)
    top_node = extractor.post_cleanup(top_node)
    result['content'], _ = formatter.get_formatted(top_node)

    return result
BLACKLIST_SUFFIX = [
    '.js', '.css', '.png', '.jpg', '.jpeg', '.pdf', '.ico', '.gif', '.m4a',
    '.woff2'
]

BLACKLIST_REGEX = [
    'http[s]?://(.*)signout(.*)'
]

NEWSPAPER_CONFIG = Configuration()
NEWSPAPER_CONFIG.fetch_images = False
NEWSPAPER_CONFIG.memoize_articles = False


class BaseCrawler:
    # Crawler Identifier
    crawler_id = 'com.base'

    # Rate limit configuration
    requests_per_sec = 1

    # robots.txt url
    robots_url = None

    # URLs of pages to crawl
    # start from
def setUp(self):
    self.extractor = newspaper.extractors.ContentExtractor(Configuration())
    self.parser = newspaper.parsers.Parser
__author__ = "Vishal Jasrotia"
__copyright__ = ""
__credits__ = ["Vishal Jasrotia"]
__license__ = ""
__version__ = "1.0.0"
__maintainer__ = "Vishal Jasrotia"
__email__ = "*****@*****.**"
__status__ = "Testing"

from newsly.Builder import NewsBuilder
from newspaper.configuration import Configuration

if __name__ == "__main__":
    #pass
    config = Configuration()
    config.memoize_articles = True  # True in production
    config.MAX_AUTHORS = 2
    config.MIN_WORD_COUNT = 300
    #config.MAX_SUMMARY = 900 applies to text, not to the summary. Don't use it.
    #TODO: Have a separate ArticleConfig and SourceConfig extend this!
    builder = NewsBuilder(config)
    builder.build()
    builder.print_source_vs_article_url()
class ArticleExtractionPipeline(object):
    def __init__(self):
        self.config = Configuration()  # sets meta config for article and parser
        self.parser = self.config.get_parser()  # parser
        self.extractor = ContentExtractor(
            self.config)  # extracts info (author, tags, text, etc.) from parsed article
        self.doc_cleaner = DocumentCleaner(
            self.config)  # cleans unwanted tags and nodes from DOM
        self.formatter = OutputFormatter(
            self.config)  # outputs formatted text from parsed xpath nodes

    # right now basically only works for RT
    # params: doc is parsed html from self.parser
    def find_date_from_html(self, doc):
        # https://github.com/Webhose/article-date-extractor/blob/master/articleDateExtractor/__init__.py
        candidates = self.parser.getElementsByTag(doc, tag="time")  # add more
        times = []
        for candidate in candidates:
            time_string = candidate.text
            for indicator in ["Edited", "Updated", "Published"]:
                if indicator in time_string:
                    # indicator probably followed by "at" or ":", actual time is after that
                    if "at" in time_string:
                        time_string = time_string.split("at", 1)[1]
                    elif ":" in time_string:
                        time_string = time_string.split(":", 1)[1]
                    break
            time = self.datetime_from_str(time_string)
            if time:
                times.append(time)
        if times:
            return min(times)
        else:
            return None

    def datetime_from_str(self, datetime_string):
        try:
            # otherwise can't compare naive and (timezone) offset-aware times
            return date_parser.parse(datetime_string).replace(tzinfo=None)
        except (ValueError, OverflowError, AttributeError, TypeError):
            return None

    # params: doc is parsed html from self.parser
    # TODO: generalize
    def get_date(self, url, doc):
        raw_date = (
            self.extractor.get_publishing_date(url, doc) or  # telesur, africanews
            self.extractor.get_meta_content(
                doc, "meta[name='LastModifiedDate']") or  # aljazeera, Sun, 07 January 2018 18:36:49 GMT
            self.extractor.get_meta_content(
                doc, "meta[name='Last-Modified']") or  # times of india, Jan 9, 2018, 05:18 IST
            self.extractor.get_meta_content(
                doc, "meta[property='og:updated_time']")  # diplomat, "2018-01-05 23:22:46"
        )
        if raw_date:
            return self.datetime_from_str(raw_date)
        else:
            return self.find_date_from_html(doc)

    # params: date is datetime object
    def recent_article(self, date, max_days_elapsed=3):
        return datetime.datetime.now() - date < datetime.timedelta(
            days=max_days_elapsed)

    def process_item(self, item, spider):
        doc = self.parser.fromstring(item["content"])
        item["title"] = self.extractor.get_title(doc)
        item["description"] = self.extractor.get_meta_description(doc)
        item["keywords"] = (self.extractor.get_meta_content(
            doc, "meta[name='news_keywords']") or
            self.extractor.get_meta_keywords(doc))
        item["date"] = self.get_date(item["url"], doc)
        # drop item if no date
        if not item["date"] or not self.recent_article(
                item["date"], max_days_elapsed=7):  # or not self.recent_article(item["date"])
            raise DropItem("Missing or invalid date for: {}".format(
                item["title"]))
        # clean:
        clean_doc = self.doc_cleaner.clean(doc)
        top_node = self.extractor.post_cleanup(
            self.extractor.calculate_best_node(clean_doc))
        item["content"] = self.formatter.get_formatted(top_node)[
            0]  # [1] returns html of article
        # drop item if article too short
        if len(item["content"]) < 600:
            raise DropItem("Not enough text: {}".format(item["title"]))
        logging.info("ARTICLE TITLE: {}".format(item["title"]))
        logging.info("\t time: {}".format(item["date"]))
        return item
def modified_fulltext(parser, language):
    '''
    Adapted from https://github.com/codelucas/newspaper/blob/master/newspaper/api.py#L71
    but modified to use an already existing lxml parser
    '''
    from newspaper.cleaners import DocumentCleaner
    from newspaper.configuration import Configuration
    from newspaper.extractors import ContentExtractor
    from newspaper.outputformatters import OutputFormatter

    def calculate_best_node(self, doc):
        top_node = None
        cxpath_body_nodes = lxml.etree.XPath('(//pre)|(//p)|(//td)')
        #nodes_to_check = self.nodes_to_check(doc)
        starting_boost = float(1.0)
        #cnt = 0
        #i = 0
        parent_nodes = []
        nodes_with_text = []

        #for node in nodes_to_check:
        for node in cxpath_body_nodes(doc):
            text_node = self.parser.getText(node)
            word_stats = self.stopwords_class(language=self.language). \
                get_stopword_count(text_node)
            high_link_density = self.is_highlink_density(node)
            if word_stats.get_stopword_count() > 2 and not high_link_density:
                nodes_with_text.append(node)

        nodes_number = len(nodes_with_text)
        negative_scoring = 0
        bottom_negativescore_nodes = float(nodes_number) * 0.25

        #for node in nodes_with_text:
        for i, node in enumerate(nodes_with_text):
            boost_score = float(0)
            # boost
            if self.is_boostable(node):
                #if cnt >= 0:
                if i >= 0:
                    boost_score = float((1.0 / starting_boost) * 50)
                    starting_boost += 1
            # nodes_number
            if nodes_number > 15:
                if (nodes_number - i) <= bottom_negativescore_nodes:
                    booster = float(
                        bottom_negativescore_nodes - (nodes_number - i))
                    boost_score = float(-pow(booster, float(2)))
                    negscore = abs(boost_score) + negative_scoring
                    if negscore > 40:
                        boost_score = float(5)

            text_node = self.parser.getText(node)
            word_stats = self.stopwords_class(language=self.language). \
                get_stopword_count(text_node)
            upscore = int(word_stats.get_stopword_count() + boost_score)

            parent_node = self.parser.getParent(node)
            self.update_score(parent_node, upscore)
            self.update_node_count(parent_node, 1)

            if parent_node not in parent_nodes:
                parent_nodes.append(parent_node)

            # Parent of parent node
            parent_parent_node = self.parser.getParent(parent_node)
            if parent_parent_node is not None:
                self.update_node_count(parent_parent_node, 1)
                self.update_score(parent_parent_node, upscore / 2)
                if parent_parent_node not in parent_nodes:
                    parent_nodes.append(parent_parent_node)
            #cnt += 1
            #i += 1

        top_node_score = 0
        for e in parent_nodes:
            score = self.get_score(e)
            if score > top_node_score:
                top_node = e
                top_node_score = score
            if top_node is None:
                top_node = e
        return top_node

    config = Configuration()
    config.language = language
    extractor = ContentExtractor(config)
    document_cleaner = DocumentCleaner(config)
    output_formatter = OutputFormatter(config)
    doc = parser
    #doc = document_cleaner.clean(doc)
    top_node = calculate_best_node(extractor, doc)
    if top_node is not None:
        top_node = extractor.post_cleanup(top_node)
        text, html = output_formatter.get_formatted(top_node)
    else:
        text = None
        html = None
    return {
        'value': {
            'text': text,
            'html': html,
        },
        'pattern': 'modified',
    }
from newspaper import NewsPool
from newspaper.configuration import Configuration
from difflib import SequenceMatcher
from urllib.parse import urlparse

ZIPS = 'zips'

blocklisted = ['http://www.legacy.com/']

with open('article_template.json') as file:
    article_template = json.load(file)

alog = open('article_log1.log', 'a')
slog = open('sources_log1.log', 'a')

## TODO do not fetch images
config = Configuration()
config.fetch_images = False


def similar(a, b):
    return SequenceMatcher(None, a, b).ratio()


def scrape_source(source):
    try:
        news_source = newspaper.build(source['url'], config=config)
        store_articles(source, news_source)
    except Exception as e:
        slog.write('\n' + datetime.now().isoformat() + '\t' + str(e))
def newspaper_config() -> Configuration:
    conf = Configuration()
    conf.MAX_TITLE = 500
    return conf
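# Hedged usage sketch: newspaper_config() above looks like a small factory for
# a tuned Configuration; passing the result to an Article is the usual pattern.
# The URL below is a placeholder, not taken from the source.
from newspaper import Article

def _example_newspaper_config():
    conf = newspaper_config()
    article = Article('http://example.com/news/story', config=conf)
    article.download()
    article.parse()
    return article.title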