import os
from glob import glob

import config
import util


def load_all_articles():
    """Loads every crawled article, caching the combined blob on disk."""
    all_blob_path = os.path.join(config.NPR_DATA_DIR, "all_articles.gz")
    if not os.path.exists(all_blob_path):
        # First run: merge the per-date files into a single blob and cache it.
        all_blobs = []
        for filename in glob(os.path.join(config.NPR_DATA_DIR, "*gz")):
            all_blobs.extend(util.load_from_disk(filename))
        util.save_to_disk(all_blob_path, all_blobs)
    else:
        all_blobs = util.load_from_disk(all_blob_path)
    return all_blobs
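# The caching above leans on util.save_to_disk / util.load_from_disk, which are
# not shown here. A minimal sketch of what they might look like, assuming the
# data is JSON-serialized and gzip-compressed when compress=True (the real
# helpers in util may differ):
import gzip
import json


def save_to_disk(path, obj, compress=True):
    """Write obj as JSON, gzip-compressed unless compress=False (assumed behavior)."""
    opener = gzip.open if compress else open
    with opener(path, 'wt') as f:
        json.dump(obj, f)


def load_from_disk(path):
    """Read an object written by save_to_disk; '.gz' paths are assumed to be gzipped."""
    opener = gzip.open if path.endswith('.gz') else open
    with opener(path, 'rt') as f:
        return json.load(f)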
to_top_n(article_scores, n=50)

# Print a few example matches as a sanity check.
show_best_articles(all_articles, petitions, article_scores, petition_number=10)
show_best_articles(all_articles, petitions, article_scores, petition_number=100)

petition_counts = load_petition_counts()
all_clean_blobs = get_all_json(all_articles, petitions, article_scores, petition_counts)

# Restrict to 2013 blobs with a decent number of articles.
blob_paths = []
URL_OUTPUT_PATH = '/post-files/20140101-whitehouse-petitions/blobs'
i = 0
for b in all_clean_blobs:
    if b['petition_date'].startswith('2013'):
        if isinstance(b['petition_close'], float):
            b['petition_close'] = None  # Clear non-string (float) close dates.
        b['articles'] = [a for a in b['articles'] if a['date'].startswith('2013')]
        if len(b['articles']) >= MIN_ARTICLES:
            filename = 'blob-{}.json'.format(i)
            save_to_disk('blobs/' + filename, b, compress=False)
            blob_paths.append({'fragment': str(abs(hash(b['petition_title']))),
                               'url': '{}/{}'.format(URL_OUTPUT_PATH, filename)})
            i += 1
save_to_disk('blobs/all_petitions.json', blob_paths, compress=False)
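# For reference, the filtering loop above only reads a handful of keys from each
# blob. A hypothetical example of a blob that would survive the 2013 filter
# (field names come from the code above; the values are purely illustrative):
example_blob = {
    'petition_title': 'Example petition title',
    'petition_date': '2013-06-15',       # must start with '2013'
    'petition_close': '2013-07-15',      # a float here gets reset to None
    'articles': [
        # Only articles whose 'date' starts with '2013' are kept, and at least
        # MIN_ARTICLES of them must remain after filtering.
        {'date': '2013-06-20'},
    ],
}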
from datetime import timedelta


def crawl_all_articles(num_days, start_date):
    """Crawls num_days days of articles, going forward from start_date."""
    for i in range(num_days):
        article_date = start_date + timedelta(days=i)
        filename = article_filename(article_date)
        save_to_disk(filename, articles_for_date(article_date))
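# A usage sketch (assuming article_filename and articles_for_date are defined
# elsewhere in the script): crawl the first 90 days of 2013, one file per day.
from datetime import date

crawl_all_articles(num_days=90, start_date=date(2013, 1, 1))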