def split_dataset(outdir, tmp_clusters_path):
    print('splitting dataset into train/val/test...')
    for i, c in enumerate(utils.read_jsonl(tmp_clusters_path)):
        if i % 1000 == 0:
            print(i, 'clusters done')
        outpath = outdir / (c['collection'] + '.jsonl')
        utils.write_jsonl([c], outpath, mode='a')
def cleanup_clusters(path, tmp_path):
    print('cleaning up:', path.name)
    for i, c in enumerate(utils.read_jsonl(path)):
        if i % 1000 == 0:
            print(i, 'clusters done')
        articles = []
        if 'wcep_articles_filled' in c:
            for a in c['wcep_articles_filled']:
                a['origin'] = 'WCEP'
                articles.append(a)
        if 'cc_articles_filled' in c:
            for a in c['cc_articles_filled']:
                a['origin'] = 'CommonCrawl'
                articles.append(a)
        c = {
            'id': c['id'],
            'date': c['date'],
            'summary': c['summary'],
            'articles': articles,
            'collection': c['collection'],
            'wiki_links': c['wiki_links'],
            'reference_urls': c['reference_urls'],
            'category': c['category'],
        }
        utils.write_jsonl([c], tmp_path, mode='a')
    shutil.move(tmp_path, path)
def add_cc_articles_to_clusters(clusters, cc_path, id_to_cluster_idx,
                                tmp_clusters_path):
    print('adding articles from CommonCrawl to clusters')
    n_clusters = len(clusters)
    n_clusters_done = 0
    for i, a in enumerate(utils.read_jsonl(cc_path)):
        if i % 10000 == 0:
            print(
                f'{i} cc articles done, {n_clusters_done}/{n_clusters} clusters done'
            )
        cluster_idx = id_to_cluster_idx[a['id']]
        c = clusters[cluster_idx]
        if c is not None:
            c['cc_articles_filled'].append(a)
            c['cc_ids_filled'].add(a['id'])
            # cluster is complete once all expected CC article ids are filled
            if c['cc_ids'] == c['cc_ids_filled']:
                del c['cc_ids'], c['cc_ids_filled']
                utils.write_jsonl([c], tmp_clusters_path, mode='a')
                clusters[cluster_idx] = None
                n_clusters_done += 1
    # remaining few clusters that only have WCEP but not CC articles
    for cluster_idx, c in enumerate(clusters):
        if c is not None and c['cc_ids'] == c['cc_ids_filled']:
            del c['cc_ids'], c['cc_ids_filled']
            utils.write_jsonl([c], tmp_clusters_path, mode='a')
            clusters[cluster_idx] = None
            n_clusters_done += 1
    print(
        f'{i} cc articles done, {n_clusters_done}/{n_clusters} clusters done')
def process_batch(items, out_path, jobs):
    logging.debug('extracting articles...')
    pool = multiprocessing.Pool(processes=jobs)
    try:
        articles = pool.map(extract_article, items)
        articles = [a for a in articles if a is not None]
        pool.close()
        logging.debug('extracting articles done')
    except KeyboardInterrupt:
        pool.terminate()
        sys.exit()
    utils.write_jsonl(articles, out_path, mode='a')
    new_record_ids = [x['id'] for x in items]
    logging.info(f'done-record-ids:{" ".join(new_record_ids)}')
    return articles
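# Hypothetical driver (a minimal sketch, not part of the original scripts):
# it assumes the module-level imports above plus the `read_input` and `batches`
# helpers used elsewhere in this repo, and shows how process_batch might be
# called to append extracted articles to the output file batch by batch.
def run_extraction_sketch(in_path, out_path, batch_size=100, jobs=4):
    # read_input returns a list of record dicts; batches splits it into chunks
    items = read_input(in_path)
    for batch in batches(items, batch_size):
        process_batch(batch, out_path, jobs)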
def add_cc_articles_to_clusters(clusters, cc_path, id_to_cluster_idx,
                                tmp_clusters_path):
    print('adding articles from CommonCrawl to clusters')
    n_clusters = len(clusters)
    n_clusters_done = 0
    for i, a in enumerate(utils.read_jsonl(cc_path)):
        if i % 10000 == 0:
            print(
                f'{i} cc articles done, {n_clusters_done}/{n_clusters} clusters done'
            )
        cluster_idx = id_to_cluster_idx[a['id']]
        c = clusters[cluster_idx]
        c.setdefault('cc_articles_filled', [])
        c['cc_articles_filled'].append(a)
        if len(c['cc_articles']) == len(c['cc_articles_filled']):
            utils.write_jsonl([c], tmp_clusters_path, mode='a')
            clusters[cluster_idx] = None
            n_clusters_done += 1
    print(
        f'{i} cc articles done, {n_clusters_done}/{n_clusters} clusters done')
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--input-file', type=str, help='Input dataset filename')
    parser.add_argument('--output-file', type=str, help='Output dataset filename')
    parser.add_argument('--random-seed', type=int, default=30, help='Random seed')
    parser.add_argument('--sample-size', type=int, default=5000, help='Dev size')
    args = parser.parse_args()

    # Load and sample KG data
    samples = sample_kg(args)

    # Write jsonl output
    write_jsonl(args.output_file, samples)
def main(args):
    outpath = pathlib.Path(args.o)
    done_urls = set()
    failed_articles = []
    n_done = 0
    n_success = 0

    if args.override and outpath.exists():
        outpath.unlink()
    elif outpath.exists():
        with open(outpath) as f:
            for line in f:
                a = json.loads(line)
                url = a['archive_url']
                if a['state'] == 'successful':
                    n_success += 1
                else:
                    failed_articles.append(a)
                n_done += 1
                done_urls.add(url)

    todo_articles = read_input(args.i)
    n_total = len(todo_articles)
    todo_articles = [
        a for a in todo_articles if a['archive_url'] not in done_urls
    ]
    print('failed articles from last run:', len(failed_articles))
    print('articles todo:', len(todo_articles))

    if args.repeat_failed:
        todo_articles = failed_articles + todo_articles
    if args.shuffle:
        random.shuffle(todo_articles)

    durations = []
    t1 = time.time()
    for todo_batch in batches(todo_articles, args.batchsize):
        pool = multiprocessing.Pool(processes=args.jobs)
        output = pool.map(extract_article, todo_batch)
        pool.close()

        articles = []
        for a in output:
            if a['state'] == 'successful':
                n_success += 1
                articles.append(a)
            done_urls.add(a['archive_url'])
            n_done += 1

        if articles:
            utils.write_jsonl(articles, outpath, mode='a')

        t2 = time.time()
        elapsed = t2 - t1
        durations.append(elapsed)
        t1 = t2

        print(f'{n_done}/{n_total} done, {n_success}/{n_done} successful')
        print('Average per-batch time (seconds):')
        print('last batch:', elapsed)
        print('last 10:', np.mean(durations[-10:]))
        print('overall:', np.mean(durations))
        print()
def main(args):
    data = utils.read_json(args.source)
    new_data = filter_data(data)
    _ = utils.write_jsonl(new_data, args.output)
def write_data(self, data, fname):
    _ = utils.write_jsonl(data, fname)
    return 0