Ejemplo n.º 1
0
        'fpath': raw_train.filenames[idx]
    } for idx in raw_ids_train]
    # One record per test document: its id, raw text, and source file path.
    raw_data_test = [
        {'id': idx, 'text': raw_test.data[idx], 'fpath': raw_test.filenames[idx]}
        for idx in raw_ids_test
    ]
    if REMOVE_HEADER:
        # Keep each document from its first blank line onward (the blank line
        # itself is retained). Train documents are processed before test ones.
        # NOTE: `str.index` raises ValueError for a text with no blank line.
        for doc in raw_data_train + raw_data_test:
            body_start = doc['text'].index('\n\n')
            doc['text'] = doc['text'][body_start:]

    # keep this pseudo-ProdLDA version: every artifact goes under ./replicated
    Path("replicated").mkdir(exist_ok=True)

    # count matrices, converted to COO form right before each save
    for split_counts, npz_path in (
        (raw_counts_train, "./replicated/train.npz"),
        (raw_counts_test, "./replicated/test.npz"),
    ):
        save_sparse(sparse.coo_matrix(split_counts), npz_path)

    save_json(vocab, "./replicated/train.vocab.json")

    # tokens for each split
    for split_tokens, tokens_path in (
        (raw_tokens_train, "./replicated/train.tokens.json"),
        (raw_tokens_test, "./replicated/test.tokens.json"),
    ):
        save_json(split_tokens, tokens_path)

    # full document records for each split
    for split_docs, jsonlist_path in (
        (raw_data_train, "./replicated/train.jsonlist"),
        (raw_data_test, "./replicated/test.jsonlist"),
    ):
        save_jsonlist(split_docs, jsonlist_path)

    # document ids, in the same order as the records written above
    for split_docs, ids_path in (
        (raw_data_train, "./replicated/train.ids.json"),
        (raw_data_test, "./replicated/test.ids.json"),
    ):
        save_json([d['id'] for d in split_docs], ids_path)
    ## Alignment -- currently ok, but not great
Ejemplo n.º 2
0
    outdir.mkdir(exist_ok=True)

    # copy over the train files unchanged:
    # (path relative to indir, file name inside outdir)
    train_files = (
        ("train.jsonlist", "train.jsonlist"),
        ("processed/train.npz", "train.npz"),
        ("processed/train.ids.json", "train.ids.json"),
        ("processed/train.vocab.json", "train.vocab.json"),
    )
    for src_rel, dst_name in train_files:
        shutil.copy(Path(indir, src_rel), Path(outdir, dst_name))

    # read in test
    test_jsonlist = utils.load_jsonlist(Path(indir, "test.jsonlist"))
    test_counts = utils.load_sparse(Path(indir, "processed/test.npz"))
    test_ids = utils.load_json(Path(indir, "processed/test.ids.json"))

    # split into a dev set: half of the old test data becomes dev, the other
    # half stays as test. The fixed random_state keeps the split reproducible.
    # NOTE(review): presumably scikit-learn's train_test_split (import not
    # visible here); the result is unpacked as a (dev, test) pair per input.
    halves = train_test_split(
        test_jsonlist,
        test_counts,
        test_ids,
        test_size=0.5,
        random_state=11225,
    )
    (dev_jsonlist, test_jsonlist,
     dev_counts, test_counts,
     dev_ids, test_ids) = halves

    # save: documents, counts, then ids -- dev first, test second
    split_outputs = (
        (dev_jsonlist, "dev.jsonlist", dev_counts, "dev.npz",
         dev_ids, "dev.ids.json"),
        (test_jsonlist, "test.jsonlist", test_counts, "test.npz",
         test_ids, "test.ids.json"),
    )
    for (split_docs, docs_name, split_counts, counts_name,
         split_ids, ids_name) in split_outputs:
        utils.save_jsonlist(split_docs, Path(outdir, docs_name))
        utils.save_sparse(split_counts, Path(outdir, counts_name))
        utils.save_json(split_ids, Path(outdir, ids_name))
Ejemplo n.º 3
0
# python 3.8
from pathlib import Path
import json

from scipy import sparse
import numpy as np

from utils import save_sparse, load_json, save_json

if __name__ == "__main__":
    # Convert the dense text dumps under ./intermediate into the sparse
    # matrices and ordered vocabulary list stored under ./aligned.
    outdir = Path("aligned")
    outdir.mkdir(exist_ok=True)

    # data files: read each dense matrix and convert it to COO form
    train = sparse.coo_matrix(np.loadtxt("intermediate/train.txt"))
    test = sparse.coo_matrix(np.loadtxt("intermediate/test.txt"))

    save_sparse(train, "aligned/train")
    save_sparse(test, "aligned/test")

    # reorder vocabulary, save as list: keys of the loaded mapping, sorted by
    # the value stored for each key
    vocab_index = load_json("intermediate/vocab_dict.json")
    vocab = sorted(vocab_index, key=vocab_index.get)
    save_json(vocab, "aligned/train.vocab.json")
    # Ids that the replication run assigned to its train and dev splits.
    repl_train_ids = load_json("./replicated/dev/train.ids.json")
    repl_dev_ids = load_json("./replicated/dev/dev.ids.json")

    # Set copies give O(1) membership tests; the lists above would be scanned
    # once per document otherwise.
    # NOTE(review): assumes ids are hashable JSON scalars (ints/strings).
    repl_train_id_set = set(repl_train_ids)
    repl_dev_id_set = set(repl_dev_ids)

    data = load_jsonlist(Path(dev_dir, "train.jsonlist"))
    counts = load_sparse(Path(dev_dir, "train.npz"))
    ids = load_json(Path(dev_dir, "train.ids.json"))

    # split based on how the replication data was split
    # One flag per document, computed once and reused for both the document
    # list and the matching rows of `counts`.
    in_train = [doc['id'] in repl_train_id_set for doc in data]
    in_dev = [doc['id'] in repl_dev_id_set for doc in data]

    data_train = [doc for doc, keep in zip(data, in_train) if keep]
    data_dev = [doc for doc, keep in zip(data, in_dev) if keep]

    # dtype=bool keeps an empty mask usable as a boolean index (np.array([])
    # would default to float64).
    counts_train = counts[np.array(in_train, dtype=bool), :]
    counts_dev = counts[np.array(in_dev, dtype=bool), :]

    ids_train = [doc_id for doc_id in ids if doc_id in repl_train_id_set]
    ids_dev = [doc_id for doc_id in ids if doc_id in repl_dev_id_set]

    # documents, count rows and ids must stay aligned within each split
    assert (len(data_train) == counts_train.shape[0] == len(ids_train))
    assert (len(data_dev) == counts_dev.shape[0] == len(ids_dev))

    # save -- note that the train.* files read above are overwritten in place
    save_jsonlist(data_train, Path(dev_dir, "train.jsonlist"))
    save_jsonlist(data_dev, Path(dev_dir, "dev.jsonlist"))

    save_sparse(counts_train, Path(dev_dir, "train.npz"))
    save_sparse(counts_dev, Path(dev_dir, "dev.npz"))

    save_json(ids_train, Path(dev_dir, "train.ids.json"))
    save_json(ids_dev, Path(dev_dir, "dev.ids.json"))
Ejemplo n.º 5
0
    # per-split id lists
    for split_ids, ids_path in (
        (train_ids, f"{args.output_dir}/train.ids.json"),
        (val_ids, f"{args.output_dir}/dev.ids.json"),
        (test_ids, f"{args.output_dir}/test.ids.json"),
    ):
        utils.save_json(split_ids, ids_path)

    # save the raw text: one {"id", "text"} record per document, produced
    # lazily by pairing each id with its document
    for split_ids, split_docs, jsonlist_path in (
        (train_ids, train_doc_list, f"{args.output_dir}/train.jsonlist"),
        (val_ids, val_doc_list, f"{args.output_dir}/dev.jsonlist"),
        (test_ids, test_doc_list, f"{args.output_dir}/test.jsonlist"),
    ):
        records = (
            {"id": doc_id, "text": text}
            for doc_id, text in zip(split_ids, split_docs)
        )
        utils.save_jsonlist(records, jsonlist_path)

    # per-split vector matrices
    for split_vectors, npz_path in (
        (train_vectors, f'{args.output_dir}/train.npz'),
        (val_vectors, f'{args.output_dir}/dev.npz'),
        (test_vectors, f'{args.output_dir}/test.npz'),
    ):
        utils.save_sparse(split_vectors, npz_path)