# Imports shared by the snippets below. `file_handling` (aliased `fh`) is
# assumed to be this repo's I/O helper module (sparse-matrix and JSON
# loading); adjust the import if the module lives elsewhere. `compute_npmi`,
# `compute_npmi_at_n`, and `read_result_from_file` are defined elsewhere in
# the codebase.
import argparse
import os
import shutil
import sys
from datetime import datetime
from pathlib import Path

import numpy as np
import pandas as pd
import torch
import tqdm

import file_handling as fh


def load_data(input_dir, input_prefix, log_file, vocab=None):
    print("Loading data")
    # load the word counts and convert to a dense matrix
    temp = fh.load_sparse(os.path.join(input_dir, input_prefix + '.npz')).todense()
    X = np.array(temp, dtype='float32')

    # load the vocabulary
    if vocab is None:
        vocab = fh.read_json(os.path.join(input_dir, input_prefix + '.vocab.json'))

    n_items, vocab_size = X.shape
    assert vocab_size == len(vocab)

    # load labels if present; otherwise fall back to a dummy label column
    label_file = os.path.join(input_dir, input_prefix + '.labels.npz')
    if os.path.exists(label_file):
        print("Loading labels")
        temp = fh.load_sparse(label_file).todense()
        labels = np.array(temp, dtype='float32')
    else:
        print("Label file not found")
        labels = np.zeros([n_items, 1], dtype='float32')
    assert len(labels) == n_items

    # report the most common words as a sanity check
    counts_sum = X.sum(axis=0)
    order = list(np.argsort(counts_sum).tolist())
    order.reverse()
    print("Most common words: ", ' '.join([vocab[i] for i in order[:10]]))

    return X, vocab, labels
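# Illustrative usage of the variant above (a sketch, not part of the original
# code): the directory and prefix are hypothetical. `load_data` expects
# <prefix>.npz, <prefix>.vocab.json, and optionally <prefix>.labels.npz under
# `input_dir`.
def _example_load_data():
    X, vocab, labels = load_data('data/imdb', 'train', log_file=None)
    print(X.shape, len(vocab), labels.shape)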
def load_word_counts(input_dir, input_prefix, vocab=None):
    print("Loading data")
    # load the word counts as a sparse CSR matrix
    X = fh.load_sparse(os.path.join(input_dir, input_prefix + '.npz')).tocsr()

    # load the vocabulary
    if vocab is None:
        vocab = fh.read_json(os.path.join(input_dir, input_prefix + '.vocab.json'))
    n_items, vocab_size = X.shape
    assert vocab_size == len(vocab)
    print("Loaded %d documents with %d features" % (n_items, vocab_size))

    ids = fh.read_json(os.path.join(input_dir, input_prefix + '.ids.json'))

    # filter out empty documents and return a boolean selector that callers
    # can use to filter labels and covariates to match
    row_sums = np.array(X.sum(axis=1)).reshape((n_items,))
    row_selector = np.array(row_sums > 0, dtype=bool)
    print("Found %d non-empty documents" % np.sum(row_selector))
    X = X[row_selector, :]
    ids = [doc_id for i, doc_id in enumerate(ids) if row_selector[i]]

    return X, vocab, row_selector, ids
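# Illustrative usage (a sketch; the paths and label file are hypothetical):
# the returned boolean `row_selector` keeps external arrays such as labels
# aligned with the filtered document matrix.
def _example_align_labels():
    X, vocab, row_selector, ids = load_word_counts('data/imdb', 'train')
    labels = np.load('data/imdb/train.labels.npy')  # hypothetical label file
    labels = labels[row_selector]                   # drop rows for empty docs
    assert X.shape[0] == labels.shape[0]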
def load_and_compute_npmi(topics_file, ref_vocab_file, ref_counts_file, n_vals,
                          cols_to_skip=0, output_file=None):
    print("Loading reference counts")
    ref_vocab = fh.read_json(ref_vocab_file)
    # CSC format makes column (word) slicing fast when computing co-occurrences
    ref_counts = fh.load_sparse(ref_counts_file).tocsc()
    compute_npmi(topics_file, ref_vocab, ref_counts, n_vals, cols_to_skip, output_file)
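# A minimal sketch of the NPMI statistic that `compute_npmi` reports, assuming
# `ref_counts` is a documents-by-vocabulary sparse matrix and probabilities
# are document-level occurrence frequencies. This is an illustration of the
# metric, not the original implementation.
def _example_npmi_pair(ref_counts, i, j):
    n_docs = ref_counts.shape[0]
    col_i = np.array((ref_counts[:, i] > 0).todense()).ravel()
    col_j = np.array((ref_counts[:, j] > 0).todense()).ravel()
    p_i = col_i.sum() / n_docs
    p_j = col_j.sum() / n_docs
    p_ij = (col_i & col_j).sum() / n_docs
    if p_ij == 0.0:
        return -1.0  # by convention, words that never co-occur get the minimum
    # NPMI = log(p_ij / (p_i * p_j)) / -log(p_ij), bounded in [-1, 1]
    return np.log(p_ij / (p_i * p_j)) / -np.log(p_ij)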
def load_data(input_dir: str, input_prefix: str, vocab_size=None, vocab=None, col_sel=None):
    print("Loading data")
    temp = fh.load_sparse(os.path.join(input_dir, input_prefix + '.npz')).todense()
    n_items, temp_size = temp.shape
    print("Loaded %d documents with %d features" % (n_items, temp_size))

    if vocab is None:
        col_sel = None
        vocab = fh.read_json(os.path.join(input_dir, input_prefix + '.vocab.json'))

    # filter the vocabulary by word frequency
    if vocab_size is not None:
        print("Filtering vocabulary to the most common %d terms" % int(vocab_size))
        col_sums = np.array(temp.sum(axis=0)).reshape((len(vocab),))
        order = list(np.argsort(col_sums))
        order.reverse()
        col_sel = np.array(np.zeros(len(vocab)), dtype=bool)
        for i in range(int(vocab_size)):
            col_sel[order[i]] = True
        temp = temp[:, col_sel]
        vocab = [word for i, word in enumerate(vocab) if col_sel[i]]
    elif col_sel is not None:
        # reuse a column selector computed on another split (e.g. train)
        print("Using given vocabulary")
        temp = temp[:, col_sel]

    X = np.array(temp, dtype='float32')
    n_items, vocab_size = X.shape
    assert vocab_size == len(vocab)
    print("Loaded %d documents with %d features" % (n_items, vocab_size))

    # filter out empty documents
    non_empty_sel = X.sum(axis=1) > 0
    print("Found %d non-empty documents" % np.sum(non_empty_sel))
    X = X[non_empty_sel, :]

    counts_sum = X.sum(axis=0)
    order = list(np.argsort(counts_sum).tolist())
    order.reverse()
    print("Most common words: ", ' '.join([vocab[i] for i in order[:10]]))
    # also return the 200 most common words
    num = list(vocab[i] for i in order[:200])

    return X, vocab, col_sel, num
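# Illustrative usage of the variant above (a sketch; the directory and
# prefixes are hypothetical): compute the vocabulary filter on the training
# split, then reuse the filtered vocab and column selector when loading the
# test split, so both matrices share the same columns.
def _example_shared_vocab():
    train_X, vocab, col_sel, top_words = load_data('data/imdb', 'train', vocab_size=2000)
    test_X, _, _, _ = load_data('data/imdb', 'test', vocab=vocab, col_sel=col_sel)
    assert train_X.shape[1] == test_X.shape[1]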
def load_word_counts(input_dir, input_prefix, vocab=None):
    print("Loading data")
    # load the word counts and convert to a dense matrix
    temp = fh.load_sparse(os.path.join(input_dir, input_prefix + '.npz')).todense()
    X = np.array(temp, dtype='float32')

    # load the vocabulary
    if vocab is None:
        vocab = fh.read_json(os.path.join(input_dir, input_prefix + '.vocab.json'))
    n_items, vocab_size = X.shape
    assert vocab_size == len(vocab)
    print("Loaded %d documents with %d features" % (n_items, vocab_size))

    # filter out empty documents and return a boolean selector
    # for filtering labels and covariates to match
    row_selector = X.sum(axis=1) > 0
    print("Found %d non-empty documents" % np.sum(row_selector))
    X = X[row_selector, :]

    return X, vocab, row_selector
def load_data(input_dir, input_prefix, label_file_name=None, covar_file_names=None,
              vocab_size=None, vocab=None, col_sel=None):
    print("Loading data")
    temp = fh.load_sparse(os.path.join(input_dir, input_prefix + '.npz')).todense()
    n_items, temp_size = temp.shape
    print("Loaded %d documents with %d features" % (n_items, temp_size))

    if vocab is None:
        col_sel = None
        vocab = fh.read_json(os.path.join(input_dir, input_prefix + '.vocab.json'))

    # filter the vocabulary by word frequency
    if vocab_size is not None:
        print("Filtering vocabulary to the most common %d terms" % int(vocab_size))
        col_sums = np.array(temp.sum(axis=0)).reshape((len(vocab),))
        order = list(np.argsort(col_sums))
        order.reverse()
        col_sel = np.array(np.zeros(len(vocab)), dtype=bool)
        for i in range(int(vocab_size)):
            col_sel[order[i]] = True
        temp = temp[:, col_sel]
        vocab = [word for i, word in enumerate(vocab) if col_sel[i]]
    elif col_sel is not None:
        print("Using given vocabulary")
        temp = temp[:, col_sel]

    X = np.array(temp, dtype='float32')
    n_items, vocab_size = X.shape
    assert vocab_size == len(vocab)
    print("Loaded %d documents with %d features" % (n_items, vocab_size))

    # filter out empty documents
    non_empty_sel = X.sum(axis=1) > 0
    print("Found %d non-empty documents" % np.sum(non_empty_sel))
    X = X[non_empty_sel, :]
    n_items, vocab_size = X.shape

    if label_file_name is not None:
        label_file = os.path.join(input_dir, input_prefix + '.' + label_file_name + '.csv')
        if os.path.exists(label_file):
            print("Loading labels from", label_file)
            temp = pd.read_csv(label_file, header=0, index_col=0)
            label_names = temp.columns
            # record the position of an explicit 'NA' label column, if any
            if 'NA' in label_names:
                na_label_index = list(label_names).index('NA')
            else:
                na_label_index = len(label_names) + 1
            labels = np.array(temp.values)
            labels = labels[non_empty_sel, :]
            n, n_labels = labels.shape
            assert n == n_items
            print("%d labels" % n_labels)
        else:
            print("Label file not found:", label_file)
            sys.exit()

        # infer the label type: one-hot rows are categorical, other binary
        # matrices are bernoulli, and anything else is treated as real-valued
        if (np.sum(labels, axis=1) == 1).all() and (np.sum(labels == 0) + np.sum(labels == 1) == labels.size):
            label_type = 'categorical'
        elif np.sum(labels == 0) + np.sum(labels == 1) == labels.size:
            label_type = 'bernoulli'
        else:
            label_type = 'real'
        print("Found labels of type %s" % label_type)
    else:
        labels = None
        label_names = None
        label_type = None
        na_label_index = None

    if covar_file_names is not None:
        covariate_list = []
        covariate_names_list = []
        covar_file_names = covar_file_names.split(',')
        for covar_file_name in covar_file_names:
            covariates_file = os.path.join(input_dir, input_prefix + '.' + covar_file_name + '.csv')
            if os.path.exists(covariates_file):
                print("Loading covariates from", covariates_file)
                temp = pd.read_csv(covariates_file, header=0, index_col=0)
                covariate_names = temp.columns
                covariates = np.array(temp.values, dtype=np.float32)
                covariates = covariates[non_empty_sel, :]
                n, n_covariates = covariates.shape
                assert n == n_items
                covariate_list.append(covariates)
                covariate_names_list.extend(covariate_names)
            else:
                print("Covariates file not found:", covariates_file)
                sys.exit()
        covariates = np.hstack(covariate_list)
        covariate_names = covariate_names_list
        n, n_covariates = covariates.shape

        if (np.sum(covariates, axis=1) == 1).all() and (np.sum(covariates == 0) + np.sum(covariates == 1) == covariates.size):
            covariates_type = 'categorical'
        else:
            covariates_type = 'other'
        print("Found covariates of type %s" % covariates_type)

        assert n == n_items
        print("%d covariates" % n_covariates)
    else:
        covariates = None
        covariate_names = None
        covariates_type = None

    counts_sum = X.sum(axis=0)
    order = list(np.argsort(counts_sum).tolist())
    order.reverse()
    print("Most common words: ", ' '.join([vocab[i] for i in order[:10]]))

    return X, vocab, labels, label_names, na_label_index, label_type, covariates, covariate_names, covariates_type, col_sel
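# The label-type inference above, distilled into a standalone helper for
# illustration (this helper is not part of the original code):
def _infer_label_type(labels):
    is_binary = np.sum(labels == 0) + np.sum(labels == 1) == labels.size
    if (np.sum(labels, axis=1) == 1).all() and is_binary:
        return 'categorical'  # each row is one-hot
    if is_binary:
        return 'bernoulli'    # binary, but rows may have several 1s
    return 'real'

# e.g. _infer_label_type(np.array([[1, 0], [0, 1]]))       -> 'categorical'
#      _infer_label_type(np.array([[1, 1], [0, 1]]))       -> 'bernoulli'
#      _infer_label_type(np.array([[0.3, 2.1], [1.0, 0.5]])) -> 'real'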
def load_data(input_dir, input_prefix, label_file_name=None, covar_file_names=None, vocab=None):
    print("Loading data")
    temp = fh.load_sparse(os.path.join(input_dir, input_prefix + '.npz')).todense()
    X = np.array(temp, dtype='float32')
    if vocab is None:
        vocab = fh.read_json(os.path.join(input_dir, input_prefix + '.vocab.json'))
    n_items, vocab_size = X.shape
    assert vocab_size == len(vocab)
    print("Loaded %d documents with %d features" % (n_items, vocab_size))

    # filter out empty documents
    non_empty_sel = X.sum(axis=1) > 0
    print("Found %d non-empty documents" % np.sum(non_empty_sel))
    X = X[non_empty_sel, :]
    n_items, vocab_size = X.shape

    if label_file_name is not None:
        label_file = os.path.join(input_dir, input_prefix + '.' + label_file_name + '.csv')
        if os.path.exists(label_file):
            print("Loading labels from", label_file)
            temp = pd.read_csv(label_file, header=0, index_col=0)
            label_names = temp.columns
            labels = np.array(temp.values)
            labels = labels[non_empty_sel, :]
            n, n_labels = labels.shape
            assert n == n_items
            print("%d labels" % n_labels)
        else:
            print("Label file not found:", label_file)
            sys.exit()

        # infer the label type (categorical / bernoulli / real), as above
        if (np.sum(labels, axis=1) == 1).all() and (np.sum(labels == 0) + np.sum(labels == 1) == labels.size):
            label_type = 'categorical'
        elif np.sum(labels == 0) + np.sum(labels == 1) == labels.size:
            label_type = 'bernoulli'
        else:
            label_type = 'real'
        print("Found labels of type %s" % label_type)
    else:
        labels = None
        label_names = None
        label_type = None

    if covar_file_names is not None:
        covariate_list = []
        covariate_names_list = []
        covar_file_names = covar_file_names.split(',')
        for covar_file_name in covar_file_names:
            covariates_file = os.path.join(input_dir, input_prefix + '.' + covar_file_name + '.csv')
            if os.path.exists(covariates_file):
                print("Loading covariates from", covariates_file)
                temp = pd.read_csv(covariates_file, header=0, index_col=0)
                covariate_names = temp.columns
                covariates = np.array(temp.values, dtype=np.float32)
                covariates = covariates[non_empty_sel, :]
                n, n_covariates = covariates.shape
                assert n == n_items
                covariate_list.append(covariates)
                covariate_names_list.extend(covariate_names)
            else:
                print("Covariates file not found:", covariates_file)
                sys.exit()
        covariates = np.hstack(covariate_list)
        covariate_names = covariate_names_list
        n, n_covariates = covariates.shape

        if (np.sum(covariates, axis=1) == 1).all() and (np.sum(covariates == 0) + np.sum(covariates == 1) == covariates.size):
            covariates_type = 'categorical'
        else:
            covariates_type = 'other'
        print("Found covariates of type %s" % covariates_type)

        assert n == n_items
        print("%d covariates" % n_covariates)
    else:
        covariates = None
        covariate_names = None
        covariates_type = None

    counts_sum = X.sum(axis=0)
    order = list(np.argsort(counts_sum).tolist())
    order.reverse()
    print("Most common words: ", ' '.join([vocab[i] for i in order[:10]]))

    return X, vocab, labels, label_names, label_type, covariates, covariate_names, covariates_type
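# Illustrative call to the variant above (directory and file-name stems are
# hypothetical): labels come from <prefix>.<label_file_name>.csv, and several
# covariate files can be given as a comma-separated string of stems.
def _example_load_with_covariates():
    X, vocab, labels, label_names, label_type, covars, covar_names, covars_type = load_data(
        'data/congress', 'train', label_file_name='party', covar_file_names='year,state')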
run_parser.add_argument("--dev-folds", type=int) run_parser.add_argument("--npmi-words", type=int, default=10) run_parser.add_argument("--min-acceptable-npmi", type=float, default=0.) run_parser.add_argument( "--ext-counts-fpath", ) run_parser.add_argument( "--ext-vocab-fpath", ) run_args, additional_args = run_parser.parse_known_args() outdir_parser = argparse.ArgumentParser() outdir_parser.add_argument("-o") outdir_args, _ = outdir_parser.parse_known_args(additional_args) nyt_counts = fh.load_sparse(run_args.ext_counts_fpath) nyt_vocab = fh.read_json(run_args.ext_vocab_fpath) np.random.seed(run_args.global_seed) run_seeds = iter([ 121958, 671155, 131932, 365838, 259178, 921881, 616685, 919314, 130398, 5591, 11235, 2020, 19, 8000, 1001, 12345, ]) # copy over code Path(outdir_args.o).mkdir(parents=True, exist_ok=True) shutil.copy("run_scholar.py", Path(outdir_args.o, "run_scholar.py")) shutil.copy("scholar.py", Path(outdir_args.o, "scholar.py")) if Path(outdir_args.o, "dev_metrics.csv").exists(): old_path = Path(outdir_args.o, "dev_metrics.csv")
def get_results_data(
    basedir,
    pattern,
    ignore_cols_with_same_vals=True,
    coherence_reference_dir="/fs/clip-political/scholar/congress_votes_dwnom",
):
    """
    Get the results data in folders matching `pattern` in `basedir`
    """
    dirs = [(p.name, p) for p in Path(basedir).glob(pattern) if p.is_dir()]
    ref_vocab = fh.read_json(Path(coherence_reference_dir, "train.vocab.json"))
    ref_counts = fh.load_sparse(Path(coherence_reference_dir, "test.npz")).tocsc()

    experiments = pd.DataFrame()
    column_names = []
    for run_name, run_dir in tqdm.tqdm(dirs):
        model_path = Path(run_dir, 'torch_model.pt')
        try:
            checkpoint = torch.load(model_path, map_location='cpu')
        except FileNotFoundError:
            continue

        npmi_internal = None
        try:
            topics = fh.read_text(Path(run_dir, "topic.txt"))
        except FileNotFoundError:
            print(f"topic.txt not found for {run_name}. Will not calculate npmi")
        else:
            npmi_internal = compute_npmi_at_n(
                topics=topics,
                ref_vocab=ref_vocab,
                ref_counts=ref_counts,
                n=10,  # could change?
                silent=True,
            )

        model_time = datetime.fromtimestamp(model_path.stat().st_mtime).strftime('%Y-%m-%d %H:%M')
        run_data = {
            'run_name': run_name,
            'git_hash': checkpoint['git_hash'],
            'date': model_time,

            # hyperparameters
            **checkpoint['options'].__dict__,  # works if we switch to argparse as well

            # results
            'saved_at_epoch': checkpoint['epoch'],
            'accuracy_train': read_result_from_file(Path(run_dir, 'accuracy.train.txt')),
            'accuracy_dev': read_result_from_file(Path(run_dir, 'accuracy.dev.txt')),
            'accuracy_dev_from_chkpt': checkpoint['dev_metrics']['accuracy'],
            'accuracy_test': read_result_from_file(Path(run_dir, 'accuracy.test.txt')),
            'perplexity_dev': read_result_from_file(Path(run_dir, 'perplexity.dev.txt')),
            'perplexity_test': read_result_from_file(Path(run_dir, 'perplexity.test.txt')),
            'npmi_internal': npmi_internal,  # coherence against the reference corpus
            'maw': read_result_from_file(Path(run_dir, 'maw.txt')),
        }

        # keep longest set of cols for data ordering (python>=3.6 keeps dict key order)
        if len(run_data.keys()) > len(column_names):
            column_names = list(run_data.keys())
        experiments = experiments.append(run_data, ignore_index=True)

    # reorder columns
    experiments = experiments[column_names]
    if ignore_cols_with_same_vals:
        # remove any columns where the values have not been altered run-to-run
        # see https://stackoverflow.com/a/39658662/5712749
        nunique_vals = experiments.apply(pd.Series.nunique)
        cols_to_drop = nunique_vals[nunique_vals <= 1].index
        experiments = experiments.drop(cols_to_drop, axis=1)

    return experiments.sort_values(by=['date'])
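# Illustrative usage (the base directory and glob pattern are hypothetical):
# collect one row per run directory and write a flat summary table.
def _example_collect_results():
    df = get_results_data('runs', 'sweep_*')
    df.to_csv('results_summary.csv', index=False)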