def load(self, path, features='BoW', match_avitm=True):
    if path[:2] == '~/':
        path = os.path.join(os.path.expanduser(path[:2]), path[2:])

    ### Specify the file locations
    train_path = path + '/train.npz'
    dev_path = path + '/dev.npz'
    test_path = path + '/test.npz'
    vocab_path = path + '/train.vocab.json'

    ### Load train
    train_csr = load_sparse(train_path)
    train = np.array(train_csr.todense()).astype('float32')

    ### Load dev
    self.dev_counts = load_sparse(dev_path).tocsc()  # will be used for NPMI

    ### Load test
    test_csr = load_sparse(test_path)
    test = np.array(test_csr.todense()).astype('float32')

    ### Load vocab
    # ENCODING = "ISO-8859-1"
    ENCODING = "utf-8"
    with open(vocab_path, encoding=ENCODING) as f:
        vocab_list = json.load(f)

    # construct maps
    vocab2dim = dict(zip(vocab_list, range(len(vocab_list))))
    dim2vocab = reverse_dict(vocab2dim)

    return ([train, None, test, None, None, None],
            [None, None, None],
            [vocab2dim, dim2vocab, None, None])
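# --- Illustrative sketch (not part of the original file) ---
# `load_sparse` and `reverse_dict` are helpers imported elsewhere in the repo; the
# definition below is an assumption about what `reverse_dict` does (invert the
# word -> index map), shown only to make the two vocab maps above concrete.
def reverse_dict_sketch(d):
    """Swap keys and values; vocab indices are unique, so there are no collisions."""
    return {v: k for k, v in d.items()}

# Example: a made-up three-word vocabulary round-trips through both maps.
example_vocab2dim = {"apple": 0, "banana": 1, "cherry": 2}
example_dim2vocab = reverse_dict_sketch(example_vocab2dim)
assert example_dim2vocab[1] == "banana"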
def load(self, data_path, features='BoW', match_avitm=True):
    ### Specify the file locations
    train_path = data_path + '/train.npz'
    dev_path = data_path + '/dev.npz'
    test_path = data_path + '/test.npz'
    vocab_path = data_path + '/train.vocab.json'

    ### Load train
    train_csr = load_sparse(train_path)
    train_counts = np.array(train_csr.todense()).astype('float32')
    train_bert_logits = np.load(self.logit_path + "/train.npy")

    if self.logit_clip is not None:
        # limit the document representations to the top k labels
        doc_tokens = np.sum(train_counts > 0, axis=1)
        vocab_size = train_counts.shape[1]
        for i, (row, total) in enumerate(zip(train_bert_logits, doc_tokens)):
            k = self.logit_clip * total  # keep this many logits
            if k < vocab_size:
                # mask everything below the per-document quantile threshold
                min_logit = np.quantile(row, 1 - k / vocab_size)
                train_bert_logits[i, train_bert_logits[i] < min_logit] = -np.inf
        # min_logits = np.quantile(train_bert_logits, np.quantile(train_counts.sum(1), 0.9) / 20_000, axis=1)
        # train_bert_logits[train_bert_logits < min_logits.reshape(-1, 1)] = -np.inf

    # concatenate after clipping so the masked logits are what end up in `train`
    train = np.concatenate([train_counts, train_bert_logits], axis=1)

    ### Load dev
    self.dev_counts = load_sparse(dev_path).tocsc()  # will be used for NPMI

    ### Load test
    test_csr = load_sparse(test_path)
    test_counts = np.array(test_csr.todense()).astype('float32')
    test_bert_logits = np.ones_like(test_counts)
    test = np.concatenate([test_counts, test_bert_logits], axis=1)

    ### Load vocab
    # ENCODING = "ISO-8859-1"
    ENCODING = "utf-8"
    with open(vocab_path, encoding=ENCODING) as f:
        vocab_list = json.load(f)

    # construct maps
    vocab2dim = dict(zip(vocab_list, range(len(vocab_list))))
    dim2vocab = reverse_dict(vocab2dim)

    return ([train, None, test, None, None, None],
            [None, None, None],
            [vocab2dim, dim2vocab, None, None])
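# --- Illustrative sketch (not part of the original file) ---
# Standalone toy version of the `logit_clip` step above: for each document, keep
# roughly `clip * n_unique_tokens` of the largest BERT logits and mask the rest to
# -inf via a per-row quantile threshold. Array shapes and values are made up.
import numpy as np

rng = np.random.default_rng(0)
toy_logits = rng.normal(size=(2, 10)).astype("float32")      # 2 docs, vocab of 10
toy_counts = np.array([[1, 0, 2, 0, 1, 0, 0, 3, 0, 0],
                       [0, 1, 0, 0, 0, 2, 1, 0, 1, 1]], dtype="float32")

clip = 2  # plays the role of self.logit_clip
doc_tokens = np.sum(toy_counts > 0, axis=1)
vocab_size = toy_counts.shape[1]
for i, (row, total) in enumerate(zip(toy_logits, doc_tokens)):
    k = clip * total
    if k < vocab_size:
        min_logit = np.quantile(row, 1 - k / vocab_size)
        toy_logits[i, toy_logits[i] < min_logit] = -np.inf

# Roughly clip * doc_tokens logits survive per document (all of them if k >= vocab_size).
print((toy_logits > -np.inf).sum(axis=1))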
def toks_to_onehot(doc, vocab):
    tokens = [vocab[word] for word in doc]
    return np.bincount(tokens, minlength=len(vocab))


if __name__ == "__main__":
    ## Re-processing
    REMOVE_HEADER = True

    with open("./aligned/train.vocab.json", "r") as infile:
        vocab = json.load(infile)
    vocab_dict = dict(zip(vocab, range(len(vocab))))

    # Read in the ProdLDA 20ng data
    orig_counts_train = load_sparse("./aligned/train.npz").todense()
    orig_counts_test = load_sparse("./aligned/test.npz").todense()

    # Get the original raw text
    raw_train = fetch_20newsgroups(data_home="./intermediate", subset="train")
    raw_test = fetch_20newsgroups(data_home="./intermediate", subset="test")

    # Turn the raw text into count data
    wnl = WordNetLemmatizer()
    raw_tokens_train = [
        process_raw_doc(doc, vocab_dict, wnl, REMOVE_HEADER)
        for doc in tqdm(raw_train.data)
    ]
    raw_tokens_test = [
        process_raw_doc(doc, vocab_dict, wnl, REMOVE_HEADER)
        for doc in tqdm(raw_test.data)
    ]
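# --- Illustrative check (not part of the original script) ---
# Small sanity check of `toks_to_onehot` with a made-up vocabulary (relies on the
# numpy import from the full script): repeated tokens accumulate, so the vector is
# really a bag-of-words count rather than a strict one-hot.
toy_vocab = {"the": 0, "cat": 1, "sat": 2}
assert toks_to_onehot(["the", "cat", "the"], toy_vocab).tolist() == [2, 1, 0]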
indir = Path("../data/20ng")
outdir = Path("processed-dev")
outdir = Path(indir, outdir)
outdir.mkdir(exist_ok=True)

# copy over the train files
shutil.copy(Path(indir, "train.jsonlist"), Path(outdir, "train.jsonlist"))
shutil.copy(Path(indir, "processed/train.npz"), Path(outdir, "train.npz"))
shutil.copy(Path(indir, "processed/train.ids.json"), Path(outdir, "train.ids.json"))
shutil.copy(Path(indir, "processed/train.vocab.json"), Path(outdir, "train.vocab.json"))

# read in test
test_jsonlist = utils.load_jsonlist(Path(indir, "test.jsonlist"))
test_counts = utils.load_sparse(Path(indir, "processed/test.npz"))
test_ids = utils.load_json(Path(indir, "processed/test.ids.json"))

# split into a dev set
dev_jsonlist, test_jsonlist, dev_counts, test_counts, dev_ids, test_ids = (
    train_test_split(test_jsonlist, test_counts, test_ids,
                     test_size=0.5, random_state=11225))

# save
utils.save_jsonlist(dev_jsonlist, Path(outdir, "dev.jsonlist"))
utils.save_sparse(dev_counts, Path(outdir, "dev.npz"))
utils.save_json(dev_ids, Path(outdir, "dev.ids.json"))
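# --- Illustrative note (not part of the original script) ---
# sklearn's train_test_split returns a (first-split, second-split) pair for every
# array passed in, in order, so with test_size=0.5 the first half of each pair is
# what this script treats as "dev". Toy check with made-up data:
from sklearn.model_selection import train_test_split

toy_items = list(range(10))
toy_labels = [i % 2 for i in toy_items]
dev_items, test_items, dev_labels, test_labels = train_test_split(
    toy_items, toy_labels, test_size=0.5, random_state=11225)
assert len(dev_items) == len(test_items) == 5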
import utils

if __name__ == "__main__":
    outdir = Path("processed-dev")
    outdir.mkdir(exist_ok=True)

    # copy over the train files
    shutil.copy("train.jsonlist", Path(outdir, "train.jsonlist"))
    shutil.copy("processed/train.npz", Path(outdir, "train.npz"))
    shutil.copy("processed/train.ids.json", Path(outdir, "train.ids.json"))
    shutil.copy("processed/train.vocab.json", Path(outdir, "train.vocab.json"))

    # read in test
    test_jsonlist = utils.load_jsonlist("test.jsonlist")
    test_counts = utils.load_sparse("processed/test.npz")
    test_ids = utils.load_json("processed/test.ids.json")

    # split into a dev set
    dev_jsonlist, test_jsonlist, dev_counts, test_counts, dev_ids, test_ids = (
        train_test_split(test_jsonlist, test_counts, test_ids,
                         test_size=0.5, random_state=11225))

    # save
    utils.save_jsonlist(dev_jsonlist, Path(outdir, "dev.jsonlist"))
    utils.save_sparse(dev_counts, Path(outdir, "dev.npz"))
    utils.save_json(dev_ids, Path(outdir, "dev.ids.json"))
from utils import load_jsonlist, save_jsonlist, load_sparse, save_sparse, load_json, save_json

if __name__ == "__main__":
    dev_dir = Path("./aligned/dev")
    dev_dir.mkdir(exist_ok=True)
    for fpath in Path("./aligned").glob("*"):
        if fpath.is_file():
            shutil.copy(str(fpath), str(Path(dev_dir, fpath.name)))

    # Load the ids from the replicated dev split
    repl_train_ids = load_json("./replicated/dev/train.ids.json")
    repl_dev_ids = load_json("./replicated/dev/dev.ids.json")

    data = load_jsonlist(Path(dev_dir, "train.jsonlist"))
    counts = load_sparse(Path(dev_dir, "train.npz"))
    ids = load_json(Path(dev_dir, "train.ids.json"))

    # split based on how the replication data was split
    data_train = [doc for doc in data if doc['id'] in repl_train_ids]
    data_dev = [doc for doc in data if doc['id'] in repl_dev_ids]
    counts_train = counts[np.array([doc['id'] in repl_train_ids for doc in data]), :]
    counts_dev = counts[np.array([doc['id'] in repl_dev_ids for doc in data]), :]
    ids_train = [id for id in ids if id in repl_train_ids]
    ids_dev = [id for id in ids if id in repl_dev_ids]

    assert (len(data_train) == counts_train.shape[0] == len(ids_train))
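# --- Illustrative sketch (not part of the original script) ---
# Toy version of the mask-based row selection above, assuming `load_sparse` returns
# a scipy CSR matrix (as it appears to elsewhere in the repo): a boolean mask built
# from document ids selects matrix rows in the same order as the jsonlist, keeping
# the documents, counts, and ids aligned. Ids and counts below are made up.
import numpy as np
from scipy import sparse

toy_ids = ["a", "b", "c", "d"]
toy_counts = sparse.csr_matrix(np.arange(8).reshape(4, 2))
keep = {"b", "d"}
mask = np.array([doc_id in keep for doc_id in toy_ids])
subset = toy_counts[mask, :]
assert subset.shape[0] == 2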