# Trim each document so its text starts at the first blank line ('\n\n').
# The slice begins AT the separator, so the blank line itself is kept.
# `str.index` raises ValueError for a document that has no blank line.
# NOTE(review): presumably this removes a header block -- confirm with the
# data source.
for split_docs in (raw_data_train, raw_data_test):
    for doc in split_docs:
        body_start = doc['text'].index('\n\n')
        doc['text'] = doc['text'][body_start:]

# keep this pseudo-ProdLDA version
Path("replicated").mkdir(exist_ok=True)

# Document-term counts, stored as sparse COO matrices.
save_sparse(
    sparse.coo_matrix(raw_counts_train),
    "./replicated/train.npz",
)
save_sparse(
    sparse.coo_matrix(raw_counts_test),
    "./replicated/test.npz",
)

# Vocabulary and the token lists for each split.
save_json(vocab, "./replicated/train.vocab.json")
save_json(raw_tokens_train, "./replicated/train.tokens.json")
save_json(raw_tokens_test, "./replicated/test.tokens.json")

# The (now trimmed) documents themselves.
save_jsonlist(raw_data_train, "./replicated/train.jsonlist")
save_jsonlist(raw_data_test, "./replicated/test.jsonlist")

# Document ids, written in the same order as the documents above.
save_json(
    [doc['id'] for doc in raw_data_train],
    "./replicated/train.ids.json",
)
save_json(
    [doc['id'] for doc in raw_data_test],
    "./replicated/test.ids.json",
)

## Alignment -- currently ok, but not great
# tf-idf transform
# The transformer is fitted on the original and the replicated count
# matrices stacked row-wise, so both share the same fitted statistics.
tfidf = TfidfTransformer()
tfidf.fit(
    np.vstack([
        orig_counts_train,
        orig_counts_test,
        raw_counts_train,
        raw_counts_test,
    ])
)
# Ids that the replication run assigned to its train and dev splits.
repl_train_ids = load_json("./replicated/dev/train.ids.json")
repl_dev_ids = load_json("./replicated/dev/dev.ids.json")

# Build the lookup sets once: testing membership against the loaded lists
# inside the per-document loops below would cost O(len(ids)) per document.
# Assumes the ids are hashable JSON scalars (strings or numbers).
repl_train_id_set = set(repl_train_ids)
repl_dev_id_set = set(repl_dev_ids)

data = load_jsonlist(Path(dev_dir, "train.jsonlist"))
counts = load_sparse(Path(dev_dir, "train.npz"))
ids = load_json(Path(dev_dir, "train.ids.json"))

# split based on how the replication data was split
# One boolean mask per split, reused for both the documents and the rows of
# the count matrix so the two selections cannot drift apart. The explicit
# dtype keeps the mask boolean even when `data` is empty.
train_mask = np.array(
    [doc['id'] in repl_train_id_set for doc in data], dtype=bool)
dev_mask = np.array(
    [doc['id'] in repl_dev_id_set for doc in data], dtype=bool)

data_train = [doc for doc, keep in zip(data, train_mask) if keep]
data_dev = [doc for doc, keep in zip(data, dev_mask) if keep]
counts_train = counts[train_mask, :]
counts_dev = counts[dev_mask, :]
ids_train = [doc_id for doc_id in ids if doc_id in repl_train_id_set]
ids_dev = [doc_id for doc_id in ids if doc_id in repl_dev_id_set]

# Sanity check: documents, count rows and ids must agree in size per split.
assert len(data_train) == counts_train.shape[0] == len(ids_train), (
    "train split: documents, count rows and ids differ in length")
assert len(data_dev) == counts_dev.shape[0] == len(ids_dev), (
    "dev split: documents, count rows and ids differ in length")

# save
# NOTE: the train.* files in dev_dir are the same files that were loaded
# above, so they are overwritten in place with the reduced train split.
save_jsonlist(data_train, Path(dev_dir, "train.jsonlist"))
save_jsonlist(data_dev, Path(dev_dir, "dev.jsonlist"))
save_sparse(counts_train, Path(dev_dir, "train.npz"))
save_sparse(counts_dev, Path(dev_dir, "dev.npz"))
save_json(ids_train, Path(dev_dir, "train.ids.json"))
save_json(ids_dev, Path(dev_dir, "dev.ids.json"))
outdir.mkdir(exist_ok=True)

# copy over the train files
# (source path relative to indir, file name inside outdir)
train_files = (
    ("train.jsonlist", "train.jsonlist"),
    ("processed/train.npz", "train.npz"),
    ("processed/train.ids.json", "train.ids.json"),
    ("processed/train.vocab.json", "train.vocab.json"),
)
for src_rel, dst_name in train_files:
    shutil.copy(Path(indir, src_rel), Path(outdir, dst_name))

# read in test
held_out_docs = utils.load_jsonlist(Path(indir, "test.jsonlist"))
held_out_counts = utils.load_sparse(Path(indir, "processed/test.npz"))
held_out_ids = utils.load_json(Path(indir, "processed/test.ids.json"))

# split into a dev set
# Documents, counts and ids go through one call so they are split together.
# The first element of each returned pair is kept as dev, the second as the
# new test set. The split is 50/50 with a pinned random_state.
split_parts = train_test_split(
    held_out_docs,
    held_out_counts,
    held_out_ids,
    test_size=0.5,
    random_state=11225,
)
(dev_jsonlist, test_jsonlist,
 dev_counts, test_counts,
 dev_ids, test_ids) = split_parts

# save
# dev split
utils.save_jsonlist(dev_jsonlist, Path(outdir, "dev.jsonlist"))
utils.save_sparse(dev_counts, Path(outdir, "dev.npz"))
utils.save_json(dev_ids, Path(outdir, "dev.ids.json"))

# reduced test split
utils.save_jsonlist(test_jsonlist, Path(outdir, "test.jsonlist"))
utils.save_sparse(test_counts, Path(outdir, "test.npz"))
utils.save_json(test_ids, Path(outdir, "test.ids.json"))
utils.save_json(vocab_list, f"{args.output_dir}/train.vocab.json") train_ids = list(range(len(train_doc_list))) val_ids = list(range(len(val_doc_list))) test_ids = list(range(len(test_doc_list))) # save ids utils.save_json(train_ids, f"{args.output_dir}/train.ids.json") utils.save_json(val_ids, f"{args.output_dir}/dev.ids.json") utils.save_json(test_ids, f"{args.output_dir}/test.ids.json") # save the raw text utils.save_jsonlist( ({ "id": id, "text": text } for id, text in zip(train_ids, train_doc_list)), f"{args.output_dir}/train.jsonlist", ) utils.save_jsonlist( ({ "id": id, "text": text } for id, text in zip(val_ids, val_doc_list)), f"{args.output_dir}/dev.jsonlist", ) utils.save_jsonlist( ({ "id": id, "text": text } for id, text in zip(test_ids, test_doc_list)),