def parse_articles_per_site(db_keywords, source_sites, twitter_accounts_explorer, site):
    """Crawl one referring site and persist every matching article.

    Builds an article stream from newspaper and/or the fallback Crawler
    (selected by site.mode), skips articles caught by the site's URL
    filters, keeps only articles that match at least one keyword, source
    site, or twitter account, then inserts a new Article row (with its
    keyword/author/sourcesite/sourcetwitter relations) or updates the
    existing one, and finally enqueues the URL for WARC archiving.

    Args:
        db_keywords: keywords regex-matched against article text
            (passed through to get_keywords).
        source_sites: source sites matched against the article html
            (passed through to get_sources_sites).
        twitter_accounts_explorer: twitter accounts matched against the
            article (passed through to get_sources_twitter).
        site: ReferringSite model instance being crawled; site.mode
            selects the crawl strategy (0 = newspaper, 1 = crawler,
            2 = both).
    """
    logging.info("Started multiprocessing of Site: %s", site.name)
    #Setup logging for this site
    setup_logging(site.name)
    article_count = 0
    newspaper_articles = []
    crawlersource_articles = []
    logging.info("Site: %s Type:%i"%(site.name, site.mode))
    #0 = newspaper, 1 = crawler, 2 = both
    if(site.mode == 0 or site.mode == 2):
        # newspaper.build is noisy; suppress everything below ERROR while it
        # scrapes, then restore normal logging.
        logging.disable(logging.ERROR)
        newspaper_source = newspaper.build(site.url,
                                           memoize_articles=False,
                                           keep_article_html=True,
                                           fetch_images=False,
                                           number_threads=1)
        logging.disable(logging.NOTSET)
        newspaper_articles = newspaper_source.articles
        article_count += newspaper_source.size()
        logging.info("populated {0} articles using newspaper".format(article_count))
    if(site.mode == 1 or site.mode == 2):
        # Fallback ("plan b") crawler; it is iterable and only estimates how
        # many articles it will yield (probabilistic_n), so article_count is
        # approximate in crawler mode.
        crawlersource_articles = Crawler.Crawler(site)
        article_count += crawlersource_articles.probabilistic_n
        logging.debug("expecting {0} from plan b crawler".format(crawlersource_articles.probabilistic_n))
    # chain() already returns an iterator; the trailing .__iter__() is a no-op.
    article_iterator = itertools.chain(iter(newspaper_articles), crawlersource_articles).__iter__()
    processed = 0
    filters = site.referringsitefilter_set.all()
    # Manual while/next() loop (Python 2 .next()) so that a failure while
    # processing one article can be caught and the loop resumed; the outer
    # try/except at the bottom logs and swallows per-article exceptions.
    while True:
        try:
            try:
                article = article_iterator.next()
            except StopIteration:
                break
            #have to put all the iteration stuff at the top because I used continue extensively in this loop
            processed += 1
            if url_in_filter(article.url, filters):
                logging.info("Matches with filter, skipping the {0}".format(article.url))
                continue
            # Progress line on stdout: timestamp (seconds precision), site,
            # and processed/estimated-total counters.
            print(
                "%s (Article|%s) %i/%i \r" %
                (str(timezone.localtime(timezone.now()))[:-13],
                 site.name, processed, article_count))
            logging.info("Processing %s"%article.url)
            url = article.url
            # Normalize the URL by dropping a leading "www." after the scheme
            # so DB lookups are consistent.
            if 'http://www.' in url:
                url = url[:7] + url[11:]
            elif 'https://www.' in url:
                url = url[:8] + url[12:]
            # Re-wrap whatever the crawler/newspaper yielded in the project's
            # own ExplorerArticle type.
            article = ExplorerArticle(article.url)
            logging.debug("ExplorerArticle Created")
            # Try to download and extract the useful data
            if(not article.is_downloaded):
                if(not article.download()):
                    logging.warning("article skipped because download failed")
                    continue
            # After download, prefer the canonical URL for all DB operations.
            url = article.canonical_url
            if (not article.is_parsed):
                if (not article.preliminary_parse()):
                    logging.warning("article skipped because parse failed")
                    continue
            logging.debug("Article Parsed")
            logging.debug(u"Title: {0}".format(repr(article.title)))
            if not article.title:
                logging.info("article missing title, skipping")
                continue
            if not article.text:
                logging.info("article missing text, skipping")
                continue
            # Regex the keyword from the article's text
            keywords = get_keywords(article, db_keywords)
            logging.debug(u"matched keywords: {0}".format(repr(keywords)))
            # Regex the links within article's html
            # sources/twitter_accounts are pairs: index 0 = matched,
            # index 1 = unmatched (see the create() calls below).
            sources = get_sources_sites(article, source_sites)
            logging.debug(u"matched sources: {0}".format(repr(sources)))
            twitter_accounts = get_sources_twitter(article, twitter_accounts_explorer)
            logging.debug(u"matched twitter_accounts: {0}".format(repr(twitter_accounts[0])))
            if((not keywords) and (not sources[0]) and (not twitter_accounts[0])):#[] gets coverted to false
                logging.debug("skipping article because it's not a match")
                continue
            # Expensive newspaper extraction only runs once the cheap
            # preliminary match has succeeded; keywords are then re-checked
            # against the (cleaner) newspaper text.
            article.newspaper_parse()
            text = article._newspaper_text
            # Rerun the get_keywords with text parsed by newspaper.
            keywords = get_keywords(article, db_keywords)
            if((not keywords) and (not sources[0]) and (not twitter_accounts[0])):#[] gets coverted to false
                logging.debug("skipping article because it's not a match")
                continue
            logging.info("match found")
            #load selectors from db!
            #parameter is a namedtuple of "css" and "regex"
            # Per-site CSS selector overrides (field: 0=title, 1=authors,
            # 2=pub date, 3=mod date); fall back to newspaper's extraction
            # when no selector matches.
            title = article.evaluate_css_selectors(site.referringsitecssselector_set.filter(field=0)) or article.title
            authors = article.evaluate_css_selectors(site.referringsitecssselector_set.filter(field=1))
            if(authors):
                # Selector yields a single author string; wrap it in a list to
                # match the article.authors shape.
                authors = [authors]
            else:
                authors = article.authors
            pub_date = article.evaluate_css_selectors(site.referringsitecssselector_set.filter(field=2))
            if(pub_date):
                pub_date = dateutil.parser.parse(pub_date)
            else:
                pub_date = get_pub_date(article)
            mod_date = article.evaluate_css_selectors(site.referringsitecssselector_set.filter(field=3))
            language = article.language
            date_now=timezone.localtime(timezone.now())
            # Check if the entry already exists
            db_article_list = Article.objects.filter(url=url)
            if not db_article_list:
                logging.info("Adding new Article to the DB")
                # If the db_article is new to the database,
                # add it to the database
                db_article = Article(title=title, url=url,
                                     domain=site.url,
                                     date_added=date_now,
                                     date_last_seen=date_now,
                                     date_published=pub_date,
                                     date_modified=mod_date,
                                     language=language,
                                     text=text)
                db_article.save()
                # Re-fetch so the related-set create() calls below run against
                # the persisted row.
                db_article = Article.objects.get(url=url)
                for key in keywords:
                    db_article.keyword_set.create(name=key)
                for author in authors:
                    db_article.author_set.create(name=author)
                # twitter_accounts[0]/sources[0] were matched; [1] were found
                # but unmatched — both are stored, flagged by `matched`.
                for account in twitter_accounts[0]:
                    db_article.sourcetwitter_set.create(name = account, matched = True)
                for account in twitter_accounts[1]:
                    db_article.sourcetwitter_set.create(name = account, matched = False)
                # local: source domain is part of the referring site's own URL.
                for source in sources[0]:
                    db_article.sourcesite_set.create(url=source[0],
                                                     domain=source[1],
                                                     anchor_text=source[2],
                                                     matched=True,
                                                     local=(source[1] in site.url))
                for source in sources[1]:
                    db_article.sourcesite_set.create(url=source[0],
                                                     domain=source[1],
                                                     anchor_text=source[2],
                                                     matched=False,
                                                     local=(source[1] in site.url))
            else:
                logging.info("Modifying existing Article in the DB")
                # If the db_article already exists,
                # update all fields except date_added
                db_article = db_article_list[0]
                db_article.title = title
                db_article.url = url
                db_article.domain = site.url
                # Do not update the added date
                # db_article.date_added = today
                db_article.date_last_seen = date_now
                db_article.date_published = pub_date
                db_article.date_modified = mod_date
                db_article.language = language
                db_article.text = text
                db_article.save()
                # Same relation writes as the insert path, but guarded by a
                # filter() lookup so re-crawls don't create duplicate rows.
                for key in keywords:
                    if not db_article.keyword_set.filter(name=key):
                        db_article.keyword_set.create(name=key)
                for author in authors:
                    if not db_article.author_set.filter(name=author):
                        db_article.author_set.create(name=author)
                for account in twitter_accounts[0]:
                    if not db_article.sourcetwitter_set.filter(name=account):
                        db_article.sourcetwitter_set.create(name = account, matched = True)
                for account in twitter_accounts[1]:
                    if not db_article.sourcetwitter_set.filter(name=account):
                        db_article.sourcetwitter_set.create(name = account, matched = False)
                for source in sources[0]:
                    if not db_article.sourcesite_set.filter(url=source[0]):
                        db_article.sourcesite_set.create(url=source[0],
                                                         domain=source[1],
                                                         anchor_text=source[2],
                                                         matched=True,
                                                         local=(source[1] in site.url))
                for source in sources[1]:
                    if not db_article.sourcesite_set.filter(url=source[0]):
                        db_article.sourcesite_set.create(url=source[0],
                                                         domain=source[1],
                                                         anchor_text=source[2],
                                                         matched=False,
                                                         local=(source[1] in site.url))
            # Hand the (normalized/canonical) URL off for WARC archiving.
            warc_creator.enqueue_article(url)
        except (KeyboardInterrupt, SystemExit):
            raise
        except Exception as e:
            # Per-article failures are logged with traceback and the loop
            # continues with the next article.
            logging.exception("Unhandled exception while crawling: " + str(e))
    logging.info("Finished Site: %s"%site.name)
    # NOTE(review): "Finished Site" is logged twice, once before and once
    # after setup_logging(increment=False) — presumably so it lands in both
    # the per-site log and the parent log; confirm against setup_logging.
    setup_logging(increment=False)
    logging.info("Finished Site: %s"%site.name)
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'Frontend'))) import django os.environ['DJANGO_SETTINGS_MODULE'] = 'Frontend.settings' # For Models connecting with the Django Database from explorer.models import * from ExplorerArticle import ExplorerArticle if __name__ == "__main__": id = input("site id: ") django.setup() site = ReferringSite.objects.get(pk=id) url = input("url: ") article = ExplorerArticle(url) article.download() article.preliminary_parse() article.newspaper_parse() fields = {} for css in site.referringsitecssselector_set.all(): if not (css.field_choice in fields.keys()): fields[css.field_choice] = [] fields[css.field_choice].append({'pattern': css.pattern, 'regex': css.regex}) if(not len(fields.keys())): print "no fields" for key, value in fields.iteritems(): print "field \"{0}\"".format(key) print article.evaluate_css_selectors(value) print