'fpath': raw_train.filenames[idx] } for idx in raw_ids_train] raw_data_test = [{ 'id': idx, 'text': raw_test.data[idx], 'fpath': raw_test.filenames[idx] } for idx in raw_ids_test] if REMOVE_HEADER: for doc in raw_data_train: doc['text'] = doc['text'][doc['text'].index('\n\n'):] for doc in raw_data_test: doc['text'] = doc['text'][doc['text'].index('\n\n'):] # keep this pseudo-ProdLDA version Path("replicated").mkdir(exist_ok=True) save_sparse(sparse.coo_matrix(raw_counts_train), "./replicated/train.npz") save_sparse(sparse.coo_matrix(raw_counts_test), "./replicated/test.npz") save_json(vocab, "./replicated/train.vocab.json") save_json(raw_tokens_train, "./replicated/train.tokens.json") save_json(raw_tokens_test, "./replicated/test.tokens.json") save_jsonlist(raw_data_train, "./replicated/train.jsonlist") save_jsonlist(raw_data_test, "./replicated/test.jsonlist") save_json([d['id'] for d in raw_data_train], "./replicated/train.ids.json") save_json([d['id'] for d in raw_data_test], "./replicated/test.ids.json") ## Alignment -- currently ok, but not great
outdir.mkdir(exist_ok=True)

# Copy the training files through unchanged.
shutil.copy(Path(indir, "train.jsonlist"), Path(outdir, "train.jsonlist"))
for fname in ("train.npz", "train.ids.json", "train.vocab.json"):
    shutil.copy(Path(indir, "processed", fname), Path(outdir, fname))

# Load the held-out test data: raw documents, count matrix, and ids.
test_jsonlist = utils.load_jsonlist(Path(indir, "test.jsonlist"))
test_counts = utils.load_sparse(Path(indir, "processed/test.npz"))
test_ids = utils.load_json(Path(indir, "processed/test.ids.json"))

# Carve a dev set out of the test data: 50/50, fixed seed so the split
# is reproducible across runs.
dev_jsonlist, test_jsonlist, dev_counts, test_counts, dev_ids, test_ids = (
    train_test_split(test_jsonlist, test_counts, test_ids,
                     test_size=0.5, random_state=11225))

# Write out both halves (dev first, matching the original save order).
for split, docs, counts, ids in (
    ("dev", dev_jsonlist, dev_counts, dev_ids),
    ("test", test_jsonlist, test_counts, test_ids),
):
    utils.save_jsonlist(docs, Path(outdir, f"{split}.jsonlist"))
    utils.save_sparse(counts, Path(outdir, f"{split}.npz"))
    utils.save_json(ids, Path(outdir, f"{split}.ids.json"))
# python 3.8
from pathlib import Path
import json
from scipy import sparse
import numpy as np
from utils import save_sparse, load_json, save_json


def main():
    """Convert dense intermediate matrices to sparse format and save an
    index-ordered vocabulary list under ``aligned/``."""
    outdir = Path("aligned")
    outdir.mkdir(exist_ok=True)

    # Dense whitespace-delimited count matrices -> sparse COO on disk.
    for split in ("train", "test"):
        dense = np.loadtxt(f"intermediate/{split}.txt")
        save_sparse(sparse.coo_matrix(dense), f"aligned/{split}")

    # The vocab file maps token -> column index; invert it into a list
    # whose position matches the count-matrix columns.
    vocab_dict = load_json("intermediate/vocab_dict.json")
    vocab = [tok for tok, _ in sorted(vocab_dict.items(), key=lambda kv: kv[1])]
    save_json(vocab, "aligned/train.vocab.json")


if __name__ == "__main__":
    main()
# Reproduce the replication data's train/dev split on this dataset.
# The id collections are converted to sets up front: each comprehension
# below tests membership once per document, and `in` on a plain list
# would make the whole pass quadratic in corpus size.
repl_train_ids = set(load_json("./replicated/dev/train.ids.json"))
repl_dev_ids = set(load_json("./replicated/dev/dev.ids.json"))

data = load_jsonlist(Path(dev_dir, "train.jsonlist"))
counts = load_sparse(Path(dev_dir, "train.npz"))
ids = load_json(Path(dev_dir, "train.ids.json"))

# Split documents, count-matrix rows, and ids the same way the
# replication data was split.
data_train = [doc for doc in data if doc['id'] in repl_train_ids]
data_dev = [doc for doc in data if doc['id'] in repl_dev_ids]
counts_train = counts[
    np.array([doc['id'] in repl_train_ids for doc in data]), :]
counts_dev = counts[np.array([doc['id'] in repl_dev_ids for doc in data]), :]
# `doc_id` rather than `id`, which would shadow the builtin.
ids_train = [doc_id for doc_id in ids if doc_id in repl_train_ids]
ids_dev = [doc_id for doc_id in ids if doc_id in repl_dev_ids]

# Sanity check: all three representations of each split stayed aligned.
assert (len(data_train) == counts_train.shape[0] == len(ids_train))
assert (len(data_dev) == counts_dev.shape[0] == len(ids_dev))

# Save (train.jsonlist / train.npz / train.ids.json are overwritten
# with their post-split versions).
save_jsonlist(data_train, Path(dev_dir, "train.jsonlist"))
save_jsonlist(data_dev, Path(dev_dir, "dev.jsonlist"))
save_sparse(counts_train, Path(dev_dir, "train.npz"))
save_sparse(counts_dev, Path(dev_dir, "dev.npz"))
save_json(ids_train, Path(dev_dir, "train.ids.json"))
save_json(ids_dev, Path(dev_dir, "dev.ids.json"))
# All three splits share the same naming scheme under the output dir;
# note the validation split is written under the name "dev".
splits = (
    ("train", train_ids, train_doc_list, train_vectors),
    ("dev", val_ids, val_doc_list, val_vectors),
    ("test", test_ids, test_doc_list, test_vectors),
)

# Document ids, one JSON list per split.
for name, ids, _docs, _vecs in splits:
    utils.save_json(ids, f"{args.output_dir}/{name}.ids.json")

# save the raw text
for name, ids, docs, _vecs in splits:
    utils.save_jsonlist(
        ({"id": doc_id, "text": text} for doc_id, text in zip(ids, docs)),
        f"{args.output_dir}/{name}.jsonlist",
    )

# Sparse count matrices.
for name, _ids, _docs, vectors in splits:
    utils.save_sparse(vectors, f'{args.output_dir}/{name}.npz')