def test_create_exist_article_warc(self):
    """ crawling an article whose warc already exists should replace it
    with a new one """
    wc.create_article_warc(ARTICLE_HTML)
    os.chdir("..")
    self.assertTrue(os.path.isfile(WARC_ARTICLE_DIRECTORY +
                                   "/http:__www.cbc.ca_news_world_nor-" +
                                   "easter-storm-blasts-eastern-u-s-with" +
                                   "-heavy-rain-and-snow-1.2851711" +
                                   ".warc.gz"))
def test_create_wrong_url_article_warc(self):
    """ crawling a bad article url should still succeed because the
    server returns a 404 page """
    try:
        os.chdir("..")
        os.remove(WARC_ARTICLE_DIRECTORY +
                  "/http:__www.cbc.ca_news_world_nor-easter-storm-blasts-" +
                  "eastern-u-s-with.warc.gz")
    except OSError:
        pass
    self.setUp()
    wc.create_article_warc(WRONG_ARTICLE_HTML)
    os.chdir("..")
    time.sleep(1)
    self.assertTrue(os.path.isfile(WARC_ARTICLE_DIRECTORY +
                                   "/http:__www.cbc.ca_news_world_nor-" +
                                   "easter-storm-blasts-eastern-u-s-with" +
                                   ".warc.gz"))
def test_create_article_warc(self):
    """ creating a warc from a real article url should work """
    try:
        os.chdir("..")
        os.remove(WARC_ARTICLE_DIRECTORY +
                  "/http:__www.cbc.ca_news_world_nor-easter-storm-blasts" +
                  "-eastern-u-s-with-heavy-rain-and-snow-1.2851711.warc.gz")
    except OSError:
        pass
    self.setUp()
    wc.create_article_warc(ARTICLE_HTML)
    os.chdir("..")
    time.sleep(1)
    self.assertTrue(os.path.isfile(WARC_ARTICLE_DIRECTORY +
                                   "/http:__www.cbc.ca_news_world_nor-" +
                                   "easter-storm-blasts-eastern-u-s-with" +
                                   "-heavy-rain-and-snow-1.2851711.warc.gz"))
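# The expected paths in the assertions above follow a simple naming
# convention: every '/' in the article url is replaced with '_' and a
# '.warc.gz' suffix is appended. A minimal sketch of that mapping; the
# helper name url_to_warc_filename is hypothetical, not part of the
# module under test:
def url_to_warc_filename(url):
    # replace path separators so the url can be used as a filename
    return url.replace("/", "_") + ".warc.gz"

# e.g. url_to_warc_filename("http://www.cbc.ca/news") would give
# "http:__www.cbc.ca_news.warc.gz", matching the paths asserted above.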
for line in article_file:
    if (len(line.split(' ')) == 2 and
            not line.split(' ') in article_queue):
        article_queue.append(line.split(' '))
article_file.seek(0)
article_file.truncate()
article_file.close()

if (len(article_queue) > 0):
    # get the first element in the queue
    line = article_queue.pop(0)
    url = line[0]
    warc_file_name = line[1].strip()
    print 'processing: ' + url + ' : ' + warc_file_name
    article_processes.append(
        warc_creator.create_article_warc(url, warc_file_name))
    # set a time out for the pdf generator
    p = warc_creator.create_article_pdf(url, warc_file_name)
    # wait for up to 200 seconds; on timeout, kill the process
    num_polls = 0
    while p.poll() is None:
        # Waiting for the process to finish.
        time.sleep(0.1)  # Avoid being a CPU busy loop.
        num_polls += 1
        if num_polls > 2000:
            # after 200 secs (2000 polls) the run is considered a
            # failure; the process is terminated and the url is put
            # on the failure list
            p.terminate()
            fail_name = "article_warc.stream.failure"
            fail = open(fail_name, "a")
            fail.write(url + "\n")
def parse_articles(referring_sites, db_keywords, source_sites,
                   twitter_accounts_explorer):
    """ (list of [str, newspaper.source.Source, str],
         list of str, list of str, str) -> None
    Downloads each db_article from the site, extracts its data, and
    compares it against the provided Foreign Sites and Keywords. Every
    db_article with a match is stored in the Django database.

    Keyword arguments:
    referring_sites -- List of [name, 'built_article'] of each site
    db_keywords     -- List of keywords
    source_sites    -- List of foreign sites
    """
    added, updated, failed, no_match = 0, 0, 0, 0
    # for each db_article in each site, download and parse the important data
    for site in referring_sites:
        # print "\n%s" % site[0]
        article_count = 0
        newspaper_articles = []
        crawlersource_articles = []
        logging.info("Site: %s Type:%i" % (site['name'], site['type']))
        # 0 = newspaper, 1 = crawler, 2 = both
        if (site["type"] == 0 or site["type"] == 2):
            logging.disable(logging.ERROR)
            newspaper_source = newspaper.build(site["url"],
                                               memoize_articles=False,
                                               keep_article_html=True,
                                               fetch_images=False,
                                               language='en',
                                               number_threads=1)
            logging.disable(logging.NOTSET)
            newspaper_articles = newspaper_source.articles
            article_count += newspaper_source.size()
            logging.info("populated {0} articles using newspaper".format(article_count))
        if (site["type"] == 1 or site["type"] == 2):
            crawlersource_articles = Crawler.CrawlerSource(site["url"])
            article_count += crawlersource_articles.probabilistic_n
            logging.debug("expecting {0} from plan b crawler".format(
                crawlersource_articles.probabilistic_n))
        article_iterator = itertools.chain(iter(newspaper_articles),
                                           crawlersource_articles)
        processed = 0
        for article in article_iterator:
            logging.info("Looking: %s" % article.url)
            # Check for any new command on the communication stream
            check_command()
            url = article.url
            # strip the 'www.' from the url
            if 'http://www.' in url:
                url = url[:7] + url[11:]
            elif 'https://www.' in url:
                url = url[:8] + url[12:]
            # Try to download and extract the useful data
            try:
                if (not article.is_downloaded):
                    article.download()
                if (not article.is_parsed):
                    article.parse()
                title = article.title
            except (KeyboardInterrupt, SystemExit):
                raise
            except:
                logging.warning("Could not parse article")
                title = ""
            # If downloading/parsing the page fails,
            # stop here and move on to the next db_article
            if not ((title == "") or (title == "Page not found")):
                logging.debug(u"found title: {0}".format(title))
                # Regex the keywords from the article's text
                keywords = get_keywords(article, db_keywords)
                logging.debug("keywords: {0}".format(str(keywords)))
                # Regex the links within the article's html
                sources = get_sources_sites(article.article_html, source_sites)
                logging.debug("sources: {0}".format(str(sources)))
                twitter_accounts = get_sources_twitter(article.article_html,
                                                       twitter_accounts_explorer)
                logging.debug("twitter_accounts: {0}".format(str(twitter_accounts[0])))
                # Store the parsed authors
                authors = article.authors
                # Try to parse the published date
                pub_date = get_pub_date(article)
                # If neither keywords, sources, nor twitter accounts
                # matched, stop here and move on to the next article
                if not (keywords == [] and sources[0] == [] and
                        twitter_accounts[0] == []):
                    logging.info("Found Match")
                    try:
                        logging.info("Requesting canonical url")
                        url = requests.get(url).url
                    except requests.RequestException:
                        logging.warning("Raised requests.RequestException")
                    except:
                        logging.warning("Could not resolve canonical url")
                    logging.debug("canonical url found")
                    # Check if the entry already exists
                    db_article_list = Article.objects.filter(url=url)
                    if not db_article_list:
                        logging.info("Adding new Article to the DB")
                        # If the db_article is new to the database,
                        # add it to the database
                        db_article = Article(title=title, url=url,
                                             domain=site["url"],
                                             date_added=timezone.localtime(
                                                 timezone.now()),
                                             date_published=pub_date)
                        db_article.save()
                        db_article = Article.objects.get(url=url)
                        for key in keywords:
                            db_article.keyword_set.create(name=key)
                        for author in authors:
                            db_article.author_set.create(name=author)
                        for account in twitter_accounts[0]:
                            db_article.sourcetwitter_set.create(name=account,
                                                                matched=True)
                        for account in twitter_accounts[1]:
                            db_article.sourcetwitter_set.create(name=account,
                                                                matched=False)
                        for source in sources[0]:
                            db_article.sourcesite_set.create(url=source[0],
                                                             domain=source[1],
                                                             matched=True,
                                                             local=(source[1] in site["url"]))
                        for source in sources[1]:
                            db_article.sourcesite_set.create(url=source[0],
                                                             domain=source[1],
                                                             matched=False,
                                                             local=(source[1] in site["url"]))
                        added += 1
                    else:
                        logging.info("Modifying existing Article in the DB")
                        # If the db_article already exists,
                        # update all fields except date_added
                        db_article = db_article_list[0]
                        db_article.title = title
                        db_article.url = url
                        db_article.domain = site["url"]
                        # Do not update the added date
                        # db_article.date_added = today
                        db_article.date_published = pub_date
                        db_article.save()
                        for key in keywords:
                            if not db_article.keyword_set.filter(name=key):
                                db_article.keyword_set.create(name=key)
                        for author in authors:
                            if not db_article.author_set.filter(name=author):
                                db_article.author_set.create(name=author)
                        for account in twitter_accounts[0]:
                            if not db_article.sourcetwitter_set.filter(name=account):
                                db_article.sourcetwitter_set.create(name=account,
                                                                    matched=True)
                        for account in twitter_accounts[1]:
                            if not db_article.sourcetwitter_set.filter(name=account):
                                db_article.sourcetwitter_set.create(name=account,
                                                                    matched=False)
                        for source in sources[0]:
                            if not db_article.sourcesite_set.filter(url=source[0]):
                                db_article.sourcesite_set.create(url=source[0],
                                                                 domain=source[1],
                                                                 matched=True,
                                                                 local=(source[1] in site["url"]))
                        for source in sources[1]:
                            if not db_article.sourcesite_set.filter(url=source[0]):
                                db_article.sourcesite_set.create(url=source[0],
                                                                 domain=source[1],
                                                                 matched=False,
                                                                 local=(source[1] in site["url"]))
                    warc_creator.create_article_warc(url)
                else:
                    logging.info("No matches")
            else:
                logging.info("No title found")
            processed += 1
            print(
                "%s (Article|%s) %i/%i \r" %
                (str(timezone.localtime(timezone.now()))[:-13],
                 site["name"], processed, article_count))
            # Null the db_article data to free the memory
            # newspaper_source.articles[db_article] = None
            # Null the article object's content to free the memory
            article.article_html = None
            article.text = None
            article.title = None
            article.source_url = None
            article.url = None
            article.top_img = None
            article.meta_img = None
            article.imgs = None
            article.movies = None
            article.keywords = None
            article.meta_keywords = None
            article.tags = None
            article.authors = None
            article.publish_date = None
            article.summary = None
            article.html = None
            article.is_parsed = None
            article.is_downloaded = None
            article.meta_description = None
            article.meta_lang = None
            article.meta_favicon = None
            article.meta_data = None
            article.canonical_link = None
            article.top_node = None
            article.clean_top_node = None
            article.doc = None
            article.clean_doc = None
            article.additional_data = None
            # log the local url copy; article.url was nulled just above
            logging.info("(%s | %i/%i) Finished looking: %s" %
                         (url, processed, article_count, url))
        logging.info("Finished Site: %s" % site['name'])
        print(
            "%s (Article|%s) %i/%i " %
            (str(timezone.localtime(timezone.now()))[:-13],
             site["name"], processed, article_count))
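# get_keywords above is described as regexing the db keywords out of the
# article text. A minimal sketch of one plausible implementation, under
# the assumption of case-insensitive whole-word matching; the actual
# function may differ, and the name get_keywords_sketch is hypothetical:
import re


def get_keywords_sketch(article, db_keywords):
    # return the subset of db_keywords found as whole words in the text
    matched = []
    for keyword in db_keywords:
        if re.search(r"\b" + re.escape(keyword) + r"\b",
                     article.text, re.IGNORECASE):
            matched.append(keyword)
    return matched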
# read the file and get all the urls
article_file = open(article_file_name, "r+")
for line in article_file:
    if (len(line.split(' ')) == 2 and
            not line.split(' ') in article_queue):
        article_queue.append(line.split(' '))
article_file.seek(0)
article_file.truncate()
article_file.close()

if (len(article_queue) > 0):
    # get the first element in the queue
    line = article_queue.pop(0)
    url = line[0]
    warc_file_name = line[1].strip()
    print 'processing: ' + url + ' : ' + warc_file_name
    article_processes.append(
        warc_creator.create_article_warc(url, warc_file_name))
    # set a time out for the pdf generator
    p = warc_creator.create_article_pdf(url, warc_file_name)
    # wait for up to 200 seconds; on timeout, kill the process
    num_polls = 0
    while p.poll() is None:
        # Waiting for the process to finish.
        time.sleep(0.1)  # Avoid being a CPU busy loop.
        num_polls += 1
        if num_polls > 2000:
            # after 200 secs (2000 polls) the run is considered a
            # failure; the process is terminated and the url is put
            # on the failure list
            p.terminate()
            fail_name = "article_warc.stream.failure"
            fail = open(fail_name, "a")
            fail.write(url + "\n")
# if no tasks remain, retry the urls that failed before
if os.stat(article_file_name).st_size == 0:
    article_file_name = "article_warc.stream.failure"
# read the file and get all the urls
article_file = open(article_file_name, "r+")
for url in article_file:
    if (url.strip() != "" and (not url.strip() in article_queue)):
        article_queue.append(url.strip())
article_file.seek(0)
article_file.truncate()
article_file.close()

if (len(article_queue) > 0):
    # get the first element in the queue
    url = article_queue.pop(0)
    article_processes.append(warc_creator.create_article_warc(url))
    # set a time out for the pdf generator
    p = warc_creator.create_article_pdf(url)
    # wait for up to 60 seconds; on timeout, kill the process
    num_polls = 0
    while p.poll() is None:
        # Waiting for the process to finish.
        time.sleep(0.1)  # Avoid being a CPU busy loop.
        num_polls += 1
        if num_polls > 600:
            # after 60 secs (600 polls) the run is considered a
            # failure; the process is terminated and the url is put
            # on the failure list
            p.terminate()
            fail_name = "article_warc.stream.failure"
            fail = open(fail_name, "a")
            fail.write(url + "\n")
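# The poll-sleep-terminate pattern above is repeated in every variant of
# this loop. A minimal sketch of how it could be factored out; the helper
# name wait_or_kill is hypothetical, not part of the existing codebase:
import time


def wait_or_kill(process, timeout_secs, poll_interval=0.1):
    """Wait for a subprocess to exit; terminate it after timeout_secs.

    Returns True if the process exited on its own, False if it was
    terminated because the timeout elapsed.
    """
    max_polls = int(timeout_secs / poll_interval)
    num_polls = 0
    while process.poll() is None:
        time.sleep(poll_interval)  # avoid a CPU busy loop
        num_polls += 1
        if num_polls > max_polls:
            process.terminate()
            return False
    return True

# With this helper, the 200-second variant above would reduce to:
#     if not wait_or_kill(p, 200):
#         fail = open("article_warc.stream.failure", "a")
#         fail.write(url + "\n")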
def parse_articles(referring_sites, db_keywords, source_sites,
                   twitter_accounts_explorer):
    """ (list of [str, newspaper.source.Source, str],
         list of str, list of str, str) -> None
    Downloads each db_article from the site, extracts its data, and
    compares it against the provided Foreign Sites and Keywords. Every
    db_article with a match is stored in the Django database.

    Keyword arguments:
    referring_sites -- List of [name, 'built_article'] of each site
    db_keywords     -- List of keywords
    source_sites    -- List of foreign sites
    """
    added, updated, failed, no_match = 0, 0, 0, 0
    # for each db_article in each site, download and parse the important data
    for site in referring_sites:
        # print "\n%s" % site[0]
        article_count = 0
        newspaper_articles = []
        crawlersource_articles = []
        logging.info("Site: %s Type:%i" % (site['name'], site['type']))
        # 0 = newspaper, 1 = crawler, 2 = both
        if (site["type"] == 0 or site["type"] == 2):
            logging.disable(logging.ERROR)
            newspaper_source = newspaper.build(site["url"],
                                               memoize_articles=False,
                                               keep_article_html=True,
                                               fetch_images=False,
                                               language='en',
                                               number_threads=1)
            logging.disable(logging.NOTSET)
            newspaper_articles = newspaper_source.articles
            article_count += newspaper_source.size()
            logging.info("populated {0} articles using newspaper".format(article_count))
        if (site["type"] == 1 or site["type"] == 2):
            crawlersource_articles = Crawler.Crawler(site["url"], site["filter"])
            article_count += crawlersource_articles.probabilistic_n
            logging.debug("expecting {0} from plan b crawler".format(
                crawlersource_articles.probabilistic_n))
        article_iterator = itertools.chain(iter(newspaper_articles),
                                           crawlersource_articles)
        processed = 0
        for article in article_iterator:
            # the iteration bookkeeping sits at the top of the loop
            # because `continue` is used extensively below
            processed += 1
            # Check for any new command on the communication stream
            check_command()
            if url_in_filter(article.url, site["filter"]):
                logging.info("Matches the filter, skipping {0}".format(article.url))
                continue
            print(
                "%s (Article|%s) %i/%i \r" %
                (str(timezone.localtime(timezone.now()))[:-13],
                 site["name"], processed, article_count))
            logging.info("Processing %s" % article.url)
            url = article.url
            # strip the 'www.' from the url
            if 'http://www.' in url:
                url = url[:7] + url[11:]
            elif 'https://www.' in url:
                url = url[:8] + url[12:]
            article = ExplorerArticle(article.url)
            # Try to download and extract the useful data
            if (not article.is_downloaded):
                if (not article.download()):
                    logging.warning("article skipped because download failed")
                    continue
            article.preliminary_parse()
            if not article.title:
                logging.info("article missing title, skipping")
                continue
            if not article.text:
                logging.info("article missing text, skipping")
                continue
            # Regex the keywords from the article's text
            keywords = get_keywords(article, db_keywords)
            logging.debug(u"matched keywords: {0}".format(repr(keywords)))
            # Regex the links within the article's html
            sources = get_sources_sites(article, source_sites)
            logging.debug(u"matched sources: {0}".format(repr(sources)))
            twitter_accounts = get_sources_twitter(article,
                                                   twitter_accounts_explorer)
            logging.debug(u"matched twitter_accounts: {0}".format(repr(twitter_accounts[0])))
            # [] converts to False, so this skips the article unless
            # keywords, sources, and twitter accounts all matched
            if ((not keywords) or (not sources[0]) or (not twitter_accounts[0])):
                logging.debug("skipping article because it's not a match")
                continue
            logging.info("match found")
            article.newspaper_parse()
            authors = article.authors
            pub_date = get_pub_date(article)
            # Check if the entry already exists
            db_article_list = Article.objects.filter(url=url)
            if not db_article_list:
                logging.info("Adding new Article to the DB")
                # If the db_article is new to the database,
                # add it to the database
                db_article = Article(title=article.title, url=url,
                                     domain=site["url"],
                                     date_added=timezone.localtime(
                                         timezone.now()),
                                     date_published=pub_date)
                db_article.save()
                db_article = Article.objects.get(url=url)
                for key in keywords:
                    db_article.keyword_set.create(name=key)
                for author in authors:
                    db_article.author_set.create(name=author)
                for account in twitter_accounts[0]:
                    db_article.sourcetwitter_set.create(name=account,
                                                        matched=True)
                for account in twitter_accounts[1]:
                    db_article.sourcetwitter_set.create(name=account,
                                                        matched=False)
                for source in sources[0]:
                    db_article.sourcesite_set.create(url=source[0],
                                                     domain=source[1],
                                                     matched=True,
                                                     local=(source[1] in site["url"]))
                for source in sources[1]:
                    db_article.sourcesite_set.create(url=source[0],
                                                     domain=source[1],
                                                     matched=False,
                                                     local=(source[1] in site["url"]))
                added += 1
            else:
                logging.info("Modifying existing Article in the DB")
                # If the db_article already exists,
                # update all fields except date_added
                db_article = db_article_list[0]
                db_article.title = article.title
                db_article.url = url
                db_article.domain = site["url"]
                # Do not update the added date
                # db_article.date_added = today
                db_article.date_published = pub_date
                db_article.save()
                for key in keywords:
                    if not db_article.keyword_set.filter(name=key):
                        db_article.keyword_set.create(name=key)
                for author in authors:
                    if not db_article.author_set.filter(name=author):
                        db_article.author_set.create(name=author)
                for account in twitter_accounts[0]:
                    if not db_article.sourcetwitter_set.filter(name=account):
                        db_article.sourcetwitter_set.create(name=account,
                                                            matched=True)
                for account in twitter_accounts[1]:
                    if not db_article.sourcetwitter_set.filter(name=account):
                        db_article.sourcetwitter_set.create(name=account,
                                                            matched=False)
                for source in sources[0]:
                    if not db_article.sourcesite_set.filter(url=source[0]):
                        db_article.sourcesite_set.create(url=source[0],
                                                         domain=source[1],
                                                         matched=True,
                                                         local=(source[1] in site["url"]))
                for source in sources[1]:
                    if not db_article.sourcesite_set.filter(url=source[0]):
                        db_article.sourcesite_set.create(url=source[0],
                                                         domain=source[1],
                                                         matched=False,
                                                         local=(source[1] in site["url"]))
            warc_creator.create_article_warc(url)
        logging.info("Finished Site: %s" % site['name'])
        print(
            "%s (Article|%s) %i/%i " %
            (str(timezone.localtime(timezone.now()))[:-13],
             site["name"], processed, article_count))
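# url_in_filter above decides whether a crawled url should be skipped for
# this site. Its implementation is not shown here; a minimal sketch under
# the assumption that site["filter"] is an iterable of regex patterns
# (the name url_in_filter_sketch and the filter format are hypothetical):
import re


def url_in_filter_sketch(url, filters):
    # True if the url matches any of the site's filter patterns
    for pattern in filters:
        if re.search(pattern, url):
            return True
    return False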
def parse_articles(populated_sites, db_keywords, foreign_sites):
    """ (list of [str, newspaper.source.Source, str],
         list of str, list of str) -> None
    Downloads each article from the site, extracts its data, and compares
    it against the provided Foreign Sites and Keywords. Every article with
    a match is stored in the Django database.

    Keyword arguments:
    populated_sites -- List of [name, 'built_article'] of each site
    db_keywords     -- List of keywords
    foreign_sites   -- List of foreign sites
    """
    added, updated, failed, no_match = 0, 0, 0, 0
    # for each article in each site, download and parse the important data
    for site in populated_sites:
        # print "\n%s" % site[0]
        article_count = site[1].size()
        processed = 0
        for k in range(len(site[1].articles)):
            art = site[1].articles[k]
            # Suppress all print statements, even newspaper's warnings
            sys.stdout = open(os.devnull, "w")
            sys.stderr = open(os.devnull, "w")
            # Check for any new command on the communication stream
            check_command()
            url = art.url
            # strip the 'www.' from the url
            if "http://www." in url:
                url = url[:7] + url[11:]
            elif "https://www." in url:
                url = url[:8] + url[12:]
            # Try to download and extract the useful data
            try:
                art.download()
                art.parse()
                title = art.title
            except:
                title = ""
            # If downloading/parsing the page fails,
            # stop here and move on to the next article
            if not ((title == "") or (title == "Page not found")):
                # Regex the keywords from the article's text
                keywords = get_keywords(art, db_keywords)
                # Regex the links within the article's html
                sources = get_sources(art.article_html, foreign_sites)
                # Store the parsed authors
                authors = art.authors
                # Try to parse the published date
                pub_date = get_pub_date(art)
                # If neither keywords nor sources matched,
                # stop here and move on to the next article
                if not (keywords == [] and sources == []):
                    # Check if the entry already exists
                    article_list = Article.objects.filter(url=url)
                    if not article_list:
                        # If the article is new to the database,
                        # add it to the database
                        article = Article(
                            title=title,
                            url=url,
                            url_origin=site[2],
                            date_added=timezone.localtime(timezone.now()),
                            date_published=pub_date,
                        )
                        article.save()
                        article = Article.objects.get(url=url)
                        for key in keywords:
                            article.keyword_set.create(keyword=key)
                        for author in authors:
                            article.author_set.create(author=author)
                        for source in sources:
                            article.source_set.create(url=source[0],
                                                      url_origin=source[1])
                        added += 1
                    else:
                        # If the article already exists,
                        # update all fields except date_added
                        article = article_list[0]
                        article.title = title
                        article.url = url
                        article.url_origin = site[2]
                        # Do not update the added date
                        # article.date_added = today
                        article.date_published = pub_date
                        article.save()
                        for key in keywords:
                            if not A_keyword.objects.filter(keyword=key):
                                article.keyword_set.create(keyword=key)
                        for author in authors:
                            if not Author.objects.filter(author=author):
                                article.author_set.create(author=author)
                        for source in sources:
                            if not Source.objects.filter(url=source[0]):
                                src = article.source_set.create(url=source[0])
                                src.url_origin = source[1]
                    warc_creator.create_article_warc(url)
            processed += 1
            # Let the output print back to normal for the minimal ui
            sys.stdout = sys.__stdout__
            # Print out minimal information
            sys.stdout.write(
                "%s (Article|%s) %i/%i \r" %
                (str(timezone.localtime(timezone.now()))[:-13],
                 site[0], processed, article_count))
            sys.stdout.flush()
            # Null the article data to free the memory
            site[1].articles[k] = None
        print(
            "%s (Article|%s) %i/%i " %
            (str(timezone.localtime(timezone.now()))[:-13],
             site[0], processed, article_count))
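# All three versions of parse_articles strip the leading 'www.' from the
# url with hard-coded slice offsets guarded by substring checks. A sketch
# of the same normalization factored into a helper; startswith() avoids
# mangling a url that merely contains 'http://www.' later in the string
# (the helper name normalize_url is hypothetical):
def normalize_url(url):
    # drop the 'www.' that immediately follows the scheme
    if url.startswith("http://www."):
        return "http://" + url[len("http://www."):]
    elif url.startswith("https://www."):
        return "https://" + url[len("https://www."):]
    return url

# e.g. normalize_url("http://www.cbc.ca/news") == "http://cbc.ca/news",
# matching the url[:7] + url[11:] slice used above.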