def split_dataset(outdir, tmp_clusters_path):
    print('splitting dataset into train/val/test...')
    for i, c in enumerate(utils.read_jsonl(tmp_clusters_path)):
        if i % 1000 == 0:
            print(i, 'clusters done')
        outpath = outdir / (c['collection'] + '.jsonl')
        utils.write_jsonl([c], outpath, mode='a')
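These snippets all rely on a small utils helper module that is not shown on this page. As a rough sketch only, write_jsonl and read_jsonl implementations consistent with the call sites here (a list of dicts written one JSON object per line, with an optional append mode) could look like the following; the actual implementation in the source repository may differ.

import json

def write_jsonl(items, path, mode='w'):
    # Write (or append, with mode='a') one JSON object per line.
    with open(path, mode, encoding='utf-8') as f:
        for item in items:
            f.write(json.dumps(item) + '\n')

def read_jsonl(path):
    # Lazily yield one parsed JSON object per line.
    with open(path, encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if line:
                yield json.loads(line)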
def cleanup_clusters(path, tmp_path):
    print('cleaning up:', path.name)
    for i, c in enumerate(utils.read_jsonl(path)):
        if i % 1000 == 0:
            print(i, 'clusters done')
        articles = []
        if 'wcep_articles_filled' in c:
            for a in c['wcep_articles_filled']:
                a['origin'] = 'WCEP'
                articles.append(a)
        if 'cc_articles_filled' in c:
            for a in c['cc_articles_filled']:
                a['origin'] = 'CommonCrawl'
                articles.append(a)

        c = {
            'id': c['id'],
            'date': c['date'],
            'summary': c['summary'],
            'articles': articles,
            'collection': c['collection'],
            'wiki_links': c['wiki_links'],
            'reference_urls': c['reference_urls'],
            'category': c['category']
        }

        utils.write_jsonl([c], tmp_path, mode='a')

    shutil.move(tmp_path, path)
def add_cc_articles_to_clusters(clusters, cc_path, id_to_cluster_idx,
                                tmp_clusters_path):
    print('adding articles from CommonCrawl to clusters')
    n_clusters = len(clusters)
    n_clusters_done = 0
    for i, a in enumerate(utils.read_jsonl(cc_path)):
        if i % 10000 == 0:
            print(
                f'{i} cc articles done, {n_clusters_done}/{n_clusters} clusters done'
            )
        cluster_idx = id_to_cluster_idx[a['id']]
        c = clusters[cluster_idx]

        if c is not None:
            c['cc_articles_filled'].append(a)
            c['cc_ids_filled'].add(a['id'])
            if c['cc_ids'] == c['cc_ids_filled']:
                del c['cc_ids'], c['cc_ids_filled']
                utils.write_jsonl([c], tmp_clusters_path, mode='a')
                clusters[cluster_idx] = None
                n_clusters_done += 1

    # remaining clusters that only have WCEP but no CC articles
    for j, c in enumerate(clusters):
        if c is not None and c['cc_ids'] == c['cc_ids_filled']:
            del c['cc_ids'], c['cc_ids_filled']
            utils.write_jsonl([c], tmp_clusters_path, mode='a')
            clusters[j] = None
            n_clusters_done += 1

    print(
        f'{i} cc articles done, {n_clusters_done}/{n_clusters} clusters done')
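Both versions of add_cc_articles_to_clusters expect an id_to_cluster_idx lookup mapping a CommonCrawl article id to the index of its cluster; it is not built in these snippets. A plausible sketch, assuming each cluster stores its pending CommonCrawl article ids under 'cc_ids' as the function above does, would be:

def build_id_to_cluster_idx(clusters):
    # Map each CommonCrawl article id to the index of the cluster that needs it.
    id_to_cluster_idx = {}
    for idx, c in enumerate(clusters):
        for article_id in c['cc_ids']:
            id_to_cluster_idx[article_id] = idx
    return id_to_cluster_idx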
def process_batch(items, out_path, jobs):
    logging.debug('extracting articles...')
    pool = multiprocessing.Pool(processes=jobs)
    try:
        articles = pool.map(extract_article, items)
        articles = [a for a in articles if a is not None]
        pool.close()
        logging.debug('extracting articles done')
    except KeyboardInterrupt:
        pool.terminate()
        sys.exit()
    utils.write_jsonl(articles, out_path, mode='a')
    new_record_ids = [x['id'] for x in items]
    logging.info(f'done-record-ids:{" ".join(new_record_ids)}')
    return articles
def add_cc_articles_to_clusters(clusters, cc_path, id_to_cluster_idx,
                                tmp_clusters_path):
    print('adding articles from CommonCrawl to clusters')
    n_clusters = len(clusters)
    n_clusters_done = 0
    for i, a in enumerate(utils.read_jsonl(cc_path)):
        if i % 10000 == 0:
            print(
                f'{i} cc articles done, {n_clusters_done}/{n_clusters} clusters done'
            )
        cluster_idx = id_to_cluster_idx[a['id']]
        c = clusters[cluster_idx]
        c.setdefault('cc_articles_filled', [])
        c['cc_articles_filled'].append(a)
        if len(c['cc_articles']) == len(c['cc_articles_filled']):
            utils.write_jsonl([c], tmp_clusters_path, mode='a')
            clusters[cluster_idx] = None
            n_clusters_done += 1
    print(
        f'{i} cc articles done, {n_clusters_done}/{n_clusters} clusters done')
Example #6
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument('--input-file', type=str, help='Dataset filename')
    parser.add_argument('--output-file', type=str, help='Dataset filename')
    parser.add_argument('--random-seed',
                        type=int,
                        default=30,
                        help='Random seed')
    parser.add_argument('--sample-size',
                        type=int,
                        default=5000,
                        help='Dev size')

    args = parser.parse_args()

    # Load KG data
    samples = sample_kg(args)

    # Write jsonl file
    write_jsonl(args.output_file, samples)
def main(args):

    outpath = pathlib.Path(args.o)
    done_urls = set()
    failed_articles = []
    n_done = 0
    n_success = 0

    if args.override and outpath.exists():
        outpath.unlink()

    elif outpath.exists():
        with open(outpath) as f:
            for line in f:
                a = json.loads(line)
                url = a['archive_url']
                if a['state'] == 'successful':
                    n_success += 1
                else:
                    failed_articles.append(a)
                n_done += 1
                done_urls.add(url)

    todo_articles = read_input(args.i)
    n_total = len(todo_articles)
    todo_articles = [
        a for a in todo_articles if a['archive_url'] not in done_urls
    ]

    print('failed articles from last run:', len(failed_articles))
    print('articles todo:', len(todo_articles))

    if args.repeat_failed:
        todo_articles = failed_articles + todo_articles

    if args.shuffle:
        random.shuffle(todo_articles)

    durations = []
    t1 = time.time()
    for todo_batch in batches(todo_articles, args.batchsize):

        pool = multiprocessing.Pool(processes=args.jobs)
        output = pool.map(extract_article, todo_batch)
        pool.close()

        articles = []
        for a in output:
            if a['state'] == 'successful':
                n_success += 1
                articles.append(a)
                done_urls.add(a['archive_url'])
            n_done += 1

        if articles:
            utils.write_jsonl(articles, outpath, mode='a')

        t2 = time.time()
        elapsed = t2 - t1
        durations.append(elapsed)
        t1 = t2

        print(f'{n_done}/{n_total} done, {n_success}/{n_done} successful')
        print('Average per-batch time (seconds):')
        print('last batch:', elapsed)
        print('last 10:', np.mean(durations[-10:]))
        print('overall:', np.mean(durations))
        print()
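The batches helper used in the loop above is not defined in this snippet. Assuming it simply yields fixed-size slices of the todo list, a minimal version would be:

def batches(items, batch_size):
    # Yield consecutive slices of at most batch_size items.
    for i in range(0, len(items), batch_size):
        yield items[i:i + batch_size]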
Example #8
def main(args):
    data = utils.read_json(args.source)
    new_data = filter_data(data)
    _ = utils.write_jsonl(new_data, args.output)
Example #9
def write_data(self, data, fname):
    _ = utils.write_jsonl(data, fname)
    return 0