def load_word_counts(input_dir, input_prefix, vocab=None):
    print("Loading data")
    # load the word counts (kept as a sparse CSR matrix; the dense conversion below is disabled)
    #temp = fh.load_sparse(os.path.join(input_dir, input_prefix + '.npz')).todense()
    #X = np.array(temp, dtype='float32')
    X = fh.load_sparse(os.path.join(input_dir, input_prefix + '.npz')).tocsr()

    # load the vocabulary
    if vocab is None:
        vocab = fh.read_json(os.path.join(input_dir, input_prefix + '.vocab.json'))
    n_items, vocab_size = X.shape
    assert vocab_size == len(vocab)
    print("Loaded %d documents with %d features" % (n_items, vocab_size))

    ids = fh.read_json(os.path.join(input_dir, input_prefix + '.ids.json'))

    # filter out empty documents and return a boolean selector for filtering labels and covariates
    #row_selector = np.array(X.sum(axis=1) > 0, dtype=bool)
    row_sums = np.array(X.sum(axis=1)).reshape((n_items,))
    row_selector = np.array(row_sums > 0, dtype=bool)
    print("Found %d non-empty documents" % np.sum(row_selector))
    X = X[row_selector, :]
    ids = [doc_id for i, doc_id in enumerate(ids) if row_selector[i]]

    return X, vocab, row_selector, ids
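# A minimal, self-contained sketch (not from the original code) of why
# load_word_counts returns row_selector: the same boolean mask that drops
# empty rows from X can align any per-document array (labels, covariates).
# The toy matrix and labels below are invented for illustration.
def _demo_row_selector():
    import numpy as np
    from scipy import sparse

    X = sparse.csr_matrix(np.array([[1, 0], [0, 0], [2, 3]], dtype='float32'))
    labels = np.array([[1], [0], [1]])
    row_sums = np.array(X.sum(axis=1)).reshape((X.shape[0],))
    row_selector = np.array(row_sums > 0, dtype=bool)
    # the middle (empty) document is dropped from both arrays
    assert X[row_selector, :].shape[0] == labels[row_selector, :].shape[0] == 2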
def load_from_files(self, debug=False, debug_index=None):
    vocab = vocabulary_with_counts.VocabWithCounts(self.get_prefix(), add_oov=True,
                                                   read_from_filename=self.get_vocab_filename())
    index = fh.read_json(self.get_index_filename())
    feature_counts = fh.unpickle_data(self.get_feature_filename())
    oov_counts = fh.read_json(self.get_oov_count_filename())

    # TESTING
    if debug:
        if debug_index is None:
            # randint is inclusive on both ends, so exclude len(index) itself
            item_index = random.randint(0, len(index) - 1)
        else:
            item_index = debug_index
        item = index[item_index]
        counts = feature_counts[item_index, :]
        print(item)
        print(counts.indices)
        print(counts.data)
        print(vocab.get_tokens(counts.indices))
        print(oov_counts[item_index])

    self.feature_counts = feature_counts
    self.index = index
    self.vocab = vocab
    self.oov_counts = oov_counts
def load_data(input_dir, input_prefix, log_file, vocab=None):
    print("Loading data")
    temp = fh.load_sparse(os.path.join(input_dir, input_prefix + '.npz')).todense()
    X = np.array(temp, dtype='float32')
    if vocab is None:
        vocab = fh.read_json(os.path.join(input_dir, input_prefix + '.vocab.json'))
    lists_of_indices = fh.read_json(os.path.join(input_dir, input_prefix + '.indices.json'))
    index_arrays = [np.array(l, dtype='int32') for l in lists_of_indices]
    n_items, vocab_size = X.shape
    print(n_items, len(index_arrays))
    assert vocab_size == len(vocab)
    assert n_items == len(index_arrays)
    log(log_file, "Loaded %d documents with %d features" % (n_items, vocab_size))

    label_file = os.path.join(input_dir, input_prefix + '.labels.npz')
    if os.path.exists(label_file):
        print("Loading labels")
        temp = fh.load_sparse(label_file).todense()
        labels = np.array(temp, dtype='float32')
    else:
        print("Label file not found")
        labels = np.zeros([n_items, 1], dtype='float32')
    assert len(labels) == n_items

    counts_sum = X.sum(axis=0)
    order = list(np.argsort(counts_sum).tolist())
    order.reverse()
    print("Most common words: ", ' '.join([vocab[i] for i in order[:10]]))

    return X, vocab, index_arrays, labels
def read_from_file(self, filename):
    json_obj = fh.read_json(filename)
    self.index2token = json_obj['index2token']
    self.counts = Counter(json_obj['counts'])
    self.doc_counts = Counter(json_obj['doc_counts'])
    self.token2index = dict(zip(self.index2token, range(len(self.index2token))))
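# For reference, a sketch of the JSON layout read_from_file expects; the field
# names come from the code above, the example values are invented:
# {
#     "index2token": ["the", "cat"],
#     "counts": {"the": 10, "cat": 2},
#     "doc_counts": {"the": 5, "cat": 2}
# }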
def extract_features(self, source, write_to_file=True, vocab_source=None):
    print("Extracting ngram tokens")
    # read in a dict of {document_key: text}
    data = fh.read_json(source)
    all_items = list(data.keys())
    tokens = self.extract_tokens_from_text(data)
    if vocab_source is None:
        vocab = self.make_vocabulary(tokens, all_items)
        vocab.prune(self.min_df)
        self.vocab = vocab
    else:
        vocab = self.load_vocabulary(vocab_source)
        self.vocab = vocab
    feature_counts, index = self.extract_feature_counts(all_items, tokens, vocab)
    if write_to_file:
        vocab.write_to_file(self.get_vocab_filename())
        fh.write_to_json(index, self.get_index_filename(), sort_keys=False)
        fh.pickle_data(feature_counts, self.get_feature_filename())
    self.feature_counts = feature_counts
    self.index = index
    self.column_names = np.array(self.vocab.index2token)
    self.do_transformations()
def main(call=None):
    # handle command line
    parser = argparse.ArgumentParser()
    parser.add_argument("model_path", help="path for model directory")
    parser.add_argument("-n", dest="n_words", type=int, default=30,
                        help="number of words to show in each topic")
    options = parser.parse_args(call)
    model_path = options.model_path
    n_words = options.n_words

    ## load Beta
    beta = np.load(os.path.join(model_path, 'beta.npz'))['beta']
    ## load vocab
    vocab = fh.read_json(os.path.join(model_path, 'vocab.json'))

    # get and print topics
    topics = get_top_n_topic_words(beta, vocab, n_words)
    for topic in topics:
        topicstring = ' '.join(topic)
        print(topicstring)
def load_from_files(self, vocab_source=None):
    self.vocab = self.load_vocabulary(vocab_source=vocab_source)
    index = fh.read_json(self.get_index_filename())
    feature_counts = fh.unpickle_data(self.get_feature_filename())
    self.feature_counts = feature_counts
    self.index = index
    self.column_names = np.array(self.vocab.index2token)
    self.do_transformations()
def load_and_compute_npmi(topics_file, ref_vocab_file, ref_counts_file, n_vals,
                          cols_to_skip=0, output_file=None):
    print("Loading reference counts")
    ref_vocab = fh.read_json(ref_vocab_file)
    ref_counts = fh.load_sparse(ref_counts_file).tocsc()
    compute_npmi(topics_file, ref_vocab, ref_counts, n_vals, cols_to_skip, output_file)
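# compute_npmi is called above but not defined in this file. Here is a minimal,
# self-contained sketch of NPMI over one topic's top-n words against a reference
# document-term matrix. The handling of zero co-occurrence (score 0) is an
# assumption here, not necessarily what the original compute_npmi does.
def npmi_for_topic(words, ref_vocab, ref_counts, n=10):
    import numpy as np

    index = dict(zip(ref_vocab, range(len(ref_vocab))))
    cols = [index[w] for w in words[:n] if w in index]
    n_docs = ref_counts.shape[0]
    # binary document-presence matrix for the selected words
    presence = (ref_counts[:, cols] > 0).astype(np.float32)
    doc_freq = np.array(presence.sum(axis=0)).ravel()
    pair_scores = []
    for a in range(len(cols)):
        for b in range(a + 1, len(cols)):
            joint = float(presence[:, a].multiply(presence[:, b]).sum())
            if joint == 0:
                pair_scores.append(0.0)  # assumed convention for unseen pairs
            else:
                # pmi = log(p(a,b) / (p(a) p(b))); npmi normalizes by -log p(a,b)
                pmi = np.log(joint * n_docs / (doc_freq[a] * doc_freq[b]))
                pair_scores.append(pmi / -np.log(joint / n_docs))
    return float(np.mean(pair_scores)) if pair_scores else 0.0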
def main(): usage = "%prog msa_db.csv data_dir output_file.jsonlist" parser = OptionParser(usage=usage) #parser.add_option('--keyword', dest='key', default=None, # help='Keyword argument: default=%default') #parser.add_option('--boolarg', action="store_true", dest="boolarg", default=False, # help='Keyword argument: default=%default') (options, args) = parser.parse_args() msa_db = args[0] data_dir = args[1] output_filename = args[2] articles = [] exclude = [ 'murderpedia.org', 'www.gunviolencearchive.org', 'www.fbi.gov', 'en.wikipedia.org', 'www.history.com', 'web.archive.org' ] df = pd.read_csv(msa_db, header=0) index = df.index for i in index: row = df.loc[i] caseid = row['CaseID'] title = row['Title'] names = row['Shooter Name'].split() #subdirs = glob.glob(os.path.join(data_dir, '*_*')) subdir = os.path.join(data_dir, str(caseid) + '_' + '_'.join(names)) if not os.path.exists(subdir): files = glob.glob( os.path.join(data_dir, str(caseid) + '_*', '*.json')) else: files = glob.glob(os.path.join(subdir, '*.json')) print(subdir, len(files)) for f in files: data = fh.read_json(f) text = data['text'] url = data['url'] parts = url.split('/') domain = parts[2] if len(text) > 200: if domain not in exclude: articles.append({ 'id': str(i), 'caseid': str(caseid), 'event_name': title, 'text': text }) fh.write_jsonlist(articles, output_filename, sort_keys=False)
def load_background_freq(input_dir, input_prefix, vocab):
    word_freq_file = os.path.join(input_dir, input_prefix + '.word_freq.json')
    if os.path.exists(word_freq_file):
        print("Loading background frequencies")
        log_word_freq = np.log(np.array(fh.read_json(word_freq_file)))
        order = np.argsort(log_word_freq)
        for i in range(10):
            print('%d %s %0.3f' % (i, vocab[order[-i - 1]],
                                   np.exp(log_word_freq[order[-i - 1]])))
    else:
        print("*** Background word frequency file not found! ***")
        log_word_freq = None
    return log_word_freq
def preprocess_data(csv_file, parsed_dir, output_dir, output_prefix, parse_prefix):
    df = pd.read_csv(csv_file, header=0, index_col=0)
    n_rows, n_columns = df.shape
    print(df.shape)

    files = glob.glob(os.path.join(parsed_dir, '*.json'))
    n_files = len(files)
    #assert n_files == n_rows

    coref_input = []
    pos_tags_all = set()
    print("Parsing %d documents" % n_files)
    for i in range(n_files):
        if i % 1000 == 0 and i > 0:
            print(i)
        valid = df.loc[i, 'matching']
        name = str(df.loc[i, 'shooter_names'])
        # fix an important name error
        name = re.sub('Marteen', 'Mateen', name)
        names = name.split()
        age = str(df.loc[i, 'age'])
        if valid:
            filename = os.path.join(parsed_dir, parse_prefix + '_' + str(i) + '.json')
            parse = fh.read_json(filename)
            # get the text and convert to tokens
            sentences, lemmas, pos_tags, speakers, dependencies, target_mentions, age_pos_tags = \
                process_parse(parse, names, age)
            pos_tags_all.update(age_pos_tags)
            # write output for e2e-coref
            coref_input.append({"id": i,
                                "clusters": [],
                                "doc_key": "nw",
                                "sentences": sentences,
                                "lemmas": lemmas,
                                "speakers": speakers,
                                "pos_tags": pos_tags,
                                "dependencies": dependencies,
                                "coref": [target_mentions]})
            print(i, names, age, len(target_mentions))

    fh.write_jsonlist(coref_input, os.path.join(output_dir, output_prefix + '.parsed.jsonlist'))
def main(): print("Reading model 1") beta1 = np.load(os.path.join(model_path1, 'beta.npz'))['beta'] vocab1 = fh.read_json(os.path.join(model_path1, 'vocab.json')) topics1 = get_top_n_topic_words(beta1, vocab1, n_words) print("Reading model 2") beta2 = np.load(os.path.join(model_path2, 'beta.npz'))['beta'] vocab2 = fh.read_json(os.path.join(model_path2, 'vocab.json')) topics2 = get_top_n_topic_words(beta2, vocab2, n_words) print("Matching topics") topic_match_tuples, topic_match_scores = get_topic_matched_pairs(beta1, beta2) for pair, score in zip(topic_match_tuples, topic_match_scores): print(str(score) + "\t" + str(pair)) topicnum1 = pair[0] topicnum2 = pair[1] topicstring1 = ' '.join(topics1[topicnum1]) topicstring2 = ' '.join(topics2[topicnum2]) print(topicstring1) print(topicstring2)
def load_data(input_dir: str, input_prefix: str, vocab_size=None, vocab=None, col_sel=None):
    print("Loading data")
    temp = fh.load_sparse(os.path.join(input_dir, input_prefix + '.npz')).todense()
    n_items, temp_size = temp.shape
    print("Loaded %d documents with %d features" % (n_items, temp_size))

    if vocab is None:
        col_sel = None
        vocab = fh.read_json(os.path.join(input_dir, input_prefix + '.vocab.json'))
        # filter vocabulary by word frequency
        if vocab_size is not None:
            print("Filtering vocabulary to the most common %d terms" % int(vocab_size))
            col_sums = np.array(temp.sum(axis=0)).reshape((len(vocab), ))
            order = list(np.argsort(col_sums))
            order.reverse()
            col_sel = np.array(np.zeros(len(vocab)), dtype=bool)
            for i in range(int(vocab_size)):
                col_sel[order[i]] = True
            temp = temp[:, col_sel]
            vocab = [word for i, word in enumerate(vocab) if col_sel[i]]
    elif col_sel is not None:
        print("Using given vocabulary")
        temp = temp[:, col_sel]

    X = np.array(temp, dtype='float32')
    n_items, vocab_size = X.shape
    assert vocab_size == len(vocab)
    print("Loaded %d documents with %d features" % (n_items, vocab_size))

    # filter out empty documents
    non_empty_sel = X.sum(axis=1) > 0
    print("Found %d non-empty documents" % np.sum(non_empty_sel))
    X = X[non_empty_sel, :]

    counts_sum = X.sum(axis=0)
    order = list(np.argsort(counts_sum).tolist())
    order.reverse()
    print("Most common words: ", ' '.join([vocab[i] for i in order[:10]]))
    num = list(vocab[i] for i in order[:200])

    return X, vocab, col_sel, num
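# The vocabulary-filtering loop above, condensed into a stand-alone helper.
# This is an equivalent sketch for clarity, not a drop-in from the original code:
# it builds a boolean column selector for the k most frequent terms.
def top_k_column_selector(counts, k):
    import numpy as np
    col_sums = np.asarray(counts.sum(axis=0)).ravel()
    col_sel = np.zeros(len(col_sums), dtype=bool)
    col_sel[np.argsort(col_sums)[::-1][:k]] = True  # mark the k largest column sums
    return col_sel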
def load_word_counts(input_dir, input_prefix, vocab=None):
    print("Loading data")
    # load the word counts and convert to a dense matrix
    temp = fh.load_sparse(os.path.join(input_dir, input_prefix + '.npz')).todense()
    X = np.array(temp, dtype='float32')
    # load the vocabulary
    if vocab is None:
        vocab = fh.read_json(os.path.join(input_dir, input_prefix + '.vocab.json'))
    n_items, vocab_size = X.shape
    assert vocab_size == len(vocab)
    print("Loaded %d documents with %d features" % (n_items, vocab_size))

    # filter out empty documents and return a boolean selector for filtering labels and covariates
    row_selector = X.sum(axis=1) > 0
    print("Found %d non-empty documents" % np.sum(row_selector))
    X = X[row_selector, :]

    return X, vocab, row_selector
def extract_features(self, source, write_to_file=True):
    print("Extracting ngram tokens:")
    # read in a dict of {document_key: text}
    data = fh.read_json(source)
    all_items = list(data.keys())
    tokens = self.extract_tokens_from_file(data, self.get_n())
    vocab = self.make_vocabulary(tokens, all_items)
    feature_counts, oov_counts = self.extract_feature_counts(all_items, tokens, vocab)

    if write_to_file:
        vocab.write_to_file(self.get_vocab_filename())
        fh.write_to_json(all_items, self.get_index_filename(), sort_keys=False)
        fh.pickle_data(feature_counts, self.get_feature_filename())
        fh.write_to_json(oov_counts, self.get_oov_count_filename(), sort_keys=False)

    self.feature_counts = feature_counts
    self.index = all_items
    self.vocab = vocab
def load_and_process_data(infile, vocab_size, parser, strip_html=False, vocab=None,
                          label_list=None, use_mallet_stopwords=False, replace_num=False,
                          lemmatize=False, log_transform=False, keep_nonalphanum=False,
                          only_alpha=False, min_length=1):
    mallet_stopwords = None
    if use_mallet_stopwords:
        print("Using MALLET stopwords")
        mallet_stopwords = fh.read_text('mallet_stopwords.txt')
        mallet_stopwords = {s.strip() for s in mallet_stopwords}

    print("Reading data files")
    item_dict = fh.read_json(infile)
    n_items = len(item_dict)

    parsed = []
    labels = []
    print("Parsing %d documents" % n_items)
    word_counts = Counter()
    doc_counts = Counter()
    keys = list(item_dict.keys())
    keys.sort()
    for i, k in enumerate(keys):
        item = item_dict[k]
        if i % 1000 == 0 and i > 0:
            print(i)
        text = item['text']
        label = item['label']
        labels.append(label)
        if strip_html:
            # remove each pair of angle brackets and everything within them
            text = re.sub('<[^>]+>', '', text)
        parse = parser(text)
        # remove white space from tokens
        if lemmatize:
            words = [re.sub(r'\s', '', token.lemma_) for token in parse]
        else:
            words = [re.sub(r'\s', '', token.orth_) for token in parse]
        # convert to lower case and drop empty strings
        words = [word.lower() for word in words if len(word) >= min_length]
        # remove stop words
        if use_mallet_stopwords:
            words = [word for word in words if word not in mallet_stopwords]
        # remove tokens that don't contain letters or numbers
        if only_alpha:
            words = [word for word in words if re.match('^[a-zA-Z]*$', word) is not None]
        if not keep_nonalphanum:
            words = [word for word in words if re.match('[a-zA-Z0-9]', word) is not None]
        # convert numbers to a number symbol
        if replace_num:
            words = ['<NUM>' if re.match('[0-9]', word) is not None else word for word in words]
        # store the parsed documents
        parsed.append(words)
        # keep track of the number of documents with each word
        word_counts.update(words)
        doc_counts.update(set(words))

    print("Size of full vocabulary=%d" % len(word_counts))

    if vocab is None:
        most_common = doc_counts.most_common(n=vocab_size)
        words, counts = zip(*most_common)
        print("Most common words:")
        for w in range(20):
            print(words[w], doc_counts[words[w]], word_counts[words[w]])
        vocab = list(words)
        vocab.sort()
        total_words = np.sum(list(word_counts.values()))
        word_freqs = np.array([word_counts[v] for v in vocab]) / float(total_words)
    else:
        word_freqs = None

    vocab_index = dict(zip(vocab, range(vocab_size)))

    if label_list is None:
        label_list = list(set(labels))
        label_list.sort()
    n_labels = len(label_list)
    label_index = dict(zip(label_list, range(n_labels)))

    X = np.zeros([n_items, vocab_size], dtype=int)
    y = []
    dat_strings = []
    svm_strings = []
    mallet_strings = []
    lists_of_indices = []  # an alternative representation of each document as a list of indices

    print("First document:")
    print(' '.join(parsed[0]))

    counter = Counter()
    print("Converting to count representations")
    count = 0
    total_tokens = 0
    for i, words in enumerate(parsed):
        indices = [vocab_index[word] for word in words if word in vocab_index]
        word_subset = [word for word in words if word in vocab_index]
        counter.clear()
        counter.update(indices)
        # only include non-empty documents
        if len(counter.keys()) > 0:
            # update the counts
            mallet_strings.append(str(i) + '\t' + 'en' + '\t' + ' '.join(word_subset))
            values = list(counter.values())
            if log_transform:
                # apply the log transform from Salakhutdinov and Hinton
                values = np.array(np.round(np.log(1 + np.array(values, dtype='float'))), dtype=int)
            X[np.ones(len(counter.keys()), dtype=int) * count, list(counter.keys())] += values
            total_tokens += len(word_subset)
            y_vector = np.zeros(n_labels)
            y_vector[label_index[labels[i]]] = 1
            y.append(y_vector)
            #y.append(label_index[labels[i]])
            # save the list of indices
            lists_of_indices.append(indices)
            dat_string = str(int(len(counter))) + ' '
            dat_string += ' '.join([str(int(k)) + ':' + str(int(v))
                                    for k, v in zip(list(counter.keys()), list(counter.values()))])
            dat_strings.append(dat_string)
            svm_string = 'target '
            svm_string += ' '.join([vocab[int(k)] + ':' + str(int(v))
                                    for k, v in zip(list(counter.keys()), list(counter.values()))])
            svm_strings.append(svm_string)
            #text_map[count] = words
            count += 1

    print("Found %d non-empty documents" % count)
    print("Total tokens = %d" % total_tokens)

    # drop the items that don't have any words in the vocabulary
    X = np.array(X[:count, :], dtype=int)
    temp = np.array(y)
    y = np.array(temp[:count], dtype=int)
    sparse_y = sparse.csr_matrix(y)

    # convert to a sparse representation
    sparse_X = sparse.csr_matrix(X)
    sparse_X_sage = sparse.csr_matrix(X, dtype=float)

    vocab_for_sage = np.zeros((vocab_size,), dtype=object)
    vocab_for_sage[:] = vocab
    tr_aspect = np.ones([n_items, 1], dtype=float)
    widx = np.arange(vocab_size, dtype=float) + 1
    sage_output = {'tr_data': sparse_X_sage, 'tr_aspect': tr_aspect, 'widx': widx,
                   'vocab': vocab_for_sage}

    return (sparse_X, vocab, lists_of_indices, sparse_y, label_list, word_freqs,
            dat_strings[:count], mallet_strings[:count], sage_output, svm_strings[:count])
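# The "log transform from Salakhutdinov and Hinton" applied above, isolated as
# a self-contained sketch: raw counts c are replaced by round(log(1 + c)).
def log_transform_counts(values):
    import numpy as np
    return np.array(np.round(np.log(1 + np.array(values, dtype='float'))), dtype=int)

# e.g. log_transform_counts([0, 1, 2, 10]) -> array([0, 1, 1, 2])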
def load_data(input_dir, input_prefix, label_file_name=None, covar_file_names=None,
              vocab_size=None, vocab=None, col_sel=None):
    print("Loading data")
    temp = fh.load_sparse(os.path.join(input_dir, input_prefix + '.npz')).todense()
    n_items, temp_size = temp.shape
    print("Loaded %d documents with %d features" % (n_items, temp_size))

    if vocab is None:
        col_sel = None
        vocab = fh.read_json(os.path.join(input_dir, input_prefix + '.vocab.json'))
        # filter vocabulary by word frequency
        if vocab_size is not None:
            print("Filtering vocabulary to the most common %d terms" % int(vocab_size))
            col_sums = np.array(temp.sum(axis=0)).reshape((len(vocab), ))
            order = list(np.argsort(col_sums))
            order.reverse()
            col_sel = np.array(np.zeros(len(vocab)), dtype=bool)
            for i in range(int(vocab_size)):
                col_sel[order[i]] = True
            temp = temp[:, col_sel]
            vocab = [word for i, word in enumerate(vocab) if col_sel[i]]
    elif col_sel is not None:
        print("Using given vocabulary")
        temp = temp[:, col_sel]

    X = np.array(temp, dtype='float32')
    n_items, vocab_size = X.shape
    assert vocab_size == len(vocab)
    print("Loaded %d documents with %d features" % (n_items, vocab_size))

    # filter out empty documents
    non_empty_sel = X.sum(axis=1) > 0
    print("Found %d non-empty documents" % np.sum(non_empty_sel))
    X = X[non_empty_sel, :]
    n_items, vocab_size = X.shape

    if label_file_name is not None:
        label_file = os.path.join(input_dir, input_prefix + '.' + label_file_name + '.csv')
        if os.path.exists(label_file):
            print("Loading labels from", label_file)
            temp = pd.read_csv(label_file, header=0, index_col=0)
            label_names = temp.columns
            if 'NA' in label_names:
                na_label_index = list(label_names).index('NA')
            else:
                na_label_index = len(label_names) + 1
            labels = np.array(temp.values)
            labels = labels[non_empty_sel, :]
            n, n_labels = labels.shape
            assert n == n_items
            print("%d labels" % n_labels)
        else:
            print("Label file not found:", label_file)
            sys.exit()
        if (np.sum(labels, axis=1) == 1).all() and (np.sum(labels == 0) + np.sum(labels == 1) == labels.size):
            label_type = 'categorical'
        elif np.sum(labels == 0) + np.sum(labels == 1) == labels.size:
            label_type = 'bernoulli'
        else:
            label_type = 'real'
        print("Found labels of type %s" % label_type)
    else:
        labels = None
        label_names = None
        label_type = None
        na_label_index = None

    if covar_file_names is not None:
        covariate_list = []
        covariate_names_list = []
        covar_file_names = covar_file_names.split(',')
        for covar_file_name in covar_file_names:
            covariates_file = os.path.join(input_dir, input_prefix + '.' + covar_file_name + '.csv')
            if os.path.exists(covariates_file):
                print("Loading covariates from", covariates_file)
                temp = pd.read_csv(covariates_file, header=0, index_col=0)
                covariate_names = temp.columns
                covariates = np.array(temp.values, dtype=np.float32)
                covariates = covariates[non_empty_sel, :]
                n, n_covariates = covariates.shape
                assert n == n_items
                covariate_list.append(covariates)
                covariate_names_list.extend(covariate_names)
            else:
                print("Covariates file not found:", covariates_file)
                sys.exit()
        covariates = np.hstack(covariate_list)
        covariate_names = covariate_names_list
        n, n_covariates = covariates.shape

        if (np.sum(covariates, axis=1) == 1).all() and (np.sum(covariates == 0) + np.sum(covariates == 1) == covariates.size):
            covariates_type = 'categorical'
        else:
            covariates_type = 'other'
        print("Found covariates of type %s" % covariates_type)

        assert n == n_items
        print("%d covariates" % n_covariates)
    else:
        covariates = None
        covariate_names = None
        covariates_type = None

    counts_sum = X.sum(axis=0)
    order = list(np.argsort(counts_sum).tolist())
    order.reverse()
    print("Most common words: ", ' '.join([vocab[i] for i in order[:10]]))

    return X, vocab, labels, label_names, na_label_index, label_type, covariates, covariate_names, covariates_type, col_sel
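# The label-type rule used by load_data above, isolated as a sketch: one-hot
# rows are 'categorical', any other all-0/1 matrix is 'bernoulli', else 'real'.
def infer_label_type(labels):
    import numpy as np
    is_binary = (np.sum(labels == 0) + np.sum(labels == 1) == labels.size)
    if is_binary and (np.sum(labels, axis=1) == 1).all():
        return 'categorical'
    if is_binary:
        return 'bernoulli'
    return 'real'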
def preprocess_data(csv_file, parsed_dir, output_dir, output_prefix, parse_prefix):
    df = pd.read_csv(csv_file, header=0, index_col=0)
    n_rows, n_columns = df.shape
    print(df.shape)

    files = glob.glob(os.path.join(parsed_dir, '*.json'))
    n_files = len(files)
    #assert n_files == n_rows

    coref_input = []
    pos_tags_all = set()
    print("Parsing %d documents" % n_files)
    for i in range(n_files):
        if i % 1000 == 0 and i > 0:
            print(i)
        valid = df.loc[i, 'matching']
        name = str(df.loc[i, 'shooter_names'])
        # fix an important name error
        name = re.sub('Marteen', 'Mateen', name)
        names = name.split()
        age = str(df.loc[i, 'age'])
        event_name = 'msa-' + re.sub(r'\s', '-', df.loc[i, 'title'])
        msa_index = int(df.loc[i, 'df_index'])
        if msa_index == 272:    # Kalamazoo duplicate
            print("Skipping", i, event_name)
        elif msa_index == 276:  # Belfair duplicate
            print("Skipping", i, event_name)
        elif msa_index == 293:  # Sherman, Texas duplicate
            print("Skipping", i, event_name)
        elif msa_index == 280:  # Chelsea, MA duplicate
            print("Skipping", i, event_name)
        elif msa_index == 283:  # Kansas City duplicate
            print("Skipping", i, event_name)
        elif msa_index == 331:  # Cape Coral
            print("Skipping", i, event_name)
        elif valid:
            filename = os.path.join(parsed_dir, parse_prefix + '_' + str(i) + '.json')
            parse = fh.read_json(filename)
            # get the text and convert to tokens
            sentences, sentences_tagged, target_mentions, pos_tags, dependencies = \
                process_parse(parse, names, age, event_name)
            sentences_pruned = []
            for sent in sentences_tagged:
                tokens = [token for token in sent if token != '__DROP__']
                sentences_pruned.append(' '.join(tokens))
            text_pruned = ' '.join(sentences_pruned)
            # write output for e2e-coref
            coref_input.append({"id": i,
                                "clusters": [],
                                "doc_key": "nw",
                                "sentences": sentences,
                                "text_tagged": text_pruned,
                                "pos_tags": pos_tags,
                                "dependencies": dependencies,
                                "coref": [target_mentions]})
            print(i, names, age, len(target_mentions))

    fh.write_jsonlist(coref_input, os.path.join(output_dir, output_prefix + '.parsed.jsonlist'))
def read_from_file(self, filename):
    self.index2token = fh.read_json(filename)
    self.token2index = dict(zip(self.index2token, range(len(self.index2token))))
def get_results_data(
        basedir,
        pattern,
        ignore_cols_with_same_vals=True,
        coherence_reference_dir="/fs/clip-political/scholar/congress_votes_dwnom"):
    """
    Get the results data in folders matching `pattern` in `basedir`
    """
    dirs = [(p.name, p) for p in Path(basedir).glob(pattern) if p.is_dir()]

    ref_vocab = fh.read_json(Path(coherence_reference_dir, "train.vocab.json"))
    ref_counts = fh.load_sparse(Path(coherence_reference_dir, "test.npz")).tocsc()

    experiments = pd.DataFrame()
    column_names = []
    for run_name, run_dir in tqdm.tqdm(dirs):
        model_path = Path(run_dir, 'torch_model.pt')
        try:
            checkpoint = torch.load(model_path, map_location='cpu')
        except FileNotFoundError:
            continue

        npmi_internal = None
        try:
            topics = fh.read_text(Path(run_dir, "topic.txt"))
        except FileNotFoundError:
            print(f"topic.txt not found for {run_name}. Will not calculate npmi")
        else:
            npmi_internal = compute_npmi_at_n(
                topics=topics,
                ref_vocab=ref_vocab,
                ref_counts=ref_counts,
                n=10,  # could change?
                silent=True,
            )

        model_time = (datetime.fromtimestamp(model_path.stat().st_mtime)
                      .strftime('%Y-%m-%d %H:%M'))

        run_data = {
            'run_name': run_name,
            'git_hash': checkpoint['git_hash'],
            'date': model_time,

            # hyperparameters
            **checkpoint['options'].__dict__,  # works if we switch to argparse as well

            # results
            'saved_at_epoch': checkpoint['epoch'],
            'npmi_internal': npmi_internal,
            'accuracy_train': read_result_from_file(Path(run_dir, 'accuracy.train.txt')),
            'accuracy_dev': read_result_from_file(Path(run_dir, 'accuracy.dev.txt')),
            'accuracy_dev_from_chkpt': checkpoint['dev_metrics']['accuracy'],
            'accuracy_test': read_result_from_file(Path(run_dir, 'accuracy.test.txt')),
            'perplexity_dev': read_result_from_file(Path(run_dir, 'perplexity.dev.txt')),
            'perplexity_test': read_result_from_file(Path(run_dir, 'perplexity.test.txt')),
            'maw': read_result_from_file(Path(run_dir, 'maw.txt')),
        }

        # keep longest set of cols for data ordering (python>=3.6 keeps dict key order)
        if len(run_data.keys()) > len(column_names):
            column_names = list(run_data.keys())
        experiments = experiments.append(run_data, ignore_index=True)

    # reorder columns
    experiments = experiments[column_names]
    if ignore_cols_with_same_vals:
        # remove any columns where the values have not been altered run-to-run
        # see https://stackoverflow.com/a/39658662/5712749
        nunique_vals = experiments.apply(pd.Series.nunique)
        cols_to_drop = nunique_vals[nunique_vals <= 1].index
        experiments = experiments.drop(cols_to_drop, axis=1)

    return experiments.sort_values(by=['date'])
def load_data(input_dir, input_prefix, label_file_name=None, covar_file_names=None, vocab=None):
    print("Loading data")
    temp = fh.load_sparse(os.path.join(input_dir, input_prefix + '.npz')).todense()
    X = np.array(temp, dtype='float32')
    if vocab is None:
        vocab = fh.read_json(os.path.join(input_dir, input_prefix + '.vocab.json'))
    n_items, vocab_size = X.shape
    assert vocab_size == len(vocab)
    print("Loaded %d documents with %d features" % (n_items, vocab_size))

    # filter out empty documents
    non_empty_sel = X.sum(axis=1) > 0
    print("Found %d non-empty documents" % np.sum(non_empty_sel))
    X = X[non_empty_sel, :]
    n_items, vocab_size = X.shape

    if label_file_name is not None:
        label_file = os.path.join(input_dir, input_prefix + '.' + label_file_name + '.csv')
        if os.path.exists(label_file):
            print("Loading labels from", label_file)
            temp = pd.read_csv(label_file, header=0, index_col=0)
            label_names = temp.columns
            labels = np.array(temp.values)
            labels = labels[non_empty_sel, :]
            n, n_labels = labels.shape
            assert n == n_items
            print("%d labels" % n_labels)
        else:
            print("Label file not found:", label_file)
            sys.exit()
        if (np.sum(labels, axis=1) == 1).all() and (np.sum(labels == 0) + np.sum(labels == 1) == labels.size):
            label_type = 'categorical'
        elif np.sum(labels == 0) + np.sum(labels == 1) == labels.size:
            label_type = 'bernoulli'
        else:
            label_type = 'real'
        print("Found labels of type %s" % label_type)
    else:
        labels = None
        label_names = None
        label_type = None

    if covar_file_names is not None:
        covariate_list = []
        covariate_names_list = []
        covar_file_names = covar_file_names.split(',')
        for covar_file_name in covar_file_names:
            covariates_file = os.path.join(input_dir, input_prefix + '.' + covar_file_name + '.csv')
            if os.path.exists(covariates_file):
                print("Loading covariates from", covariates_file)
                temp = pd.read_csv(covariates_file, header=0, index_col=0)
                covariate_names = temp.columns
                covariates = np.array(temp.values, dtype=np.float32)
                covariates = covariates[non_empty_sel, :]
                n, n_covariates = covariates.shape
                assert n == n_items
                covariate_list.append(covariates)
                covariate_names_list.extend(covariate_names)
            else:
                print("Covariates file not found:", covariates_file)
                sys.exit()
        covariates = np.hstack(covariate_list)
        covariate_names = covariate_names_list
        n, n_covariates = covariates.shape

        if (np.sum(covariates, axis=1) == 1).all() and (np.sum(covariates == 0) + np.sum(covariates == 1) == covariates.size):
            covariates_type = 'categorical'
        else:
            covariates_type = 'other'
        print("Found covariates of type %s" % covariates_type)

        assert n == n_items
        print("%d covariates" % n_covariates)
    else:
        covariates = None
        covariate_names = None
        covariates_type = None

    counts_sum = X.sum(axis=0)
    order = list(np.argsort(counts_sum).tolist())
    order.reverse()
    print("Most common words: ", ' '.join([vocab[i] for i in order[:10]]))

    return X, vocab, labels, label_names, label_type, covariates, covariate_names, covariates_type
run_parser.add_argument("--npmi-words", type=int, default=10) run_parser.add_argument("--min-acceptable-npmi", type=float, default=0.) run_parser.add_argument( "--ext-counts-fpath", ) run_parser.add_argument( "--ext-vocab-fpath", ) run_args, additional_args = run_parser.parse_known_args() outdir_parser = argparse.ArgumentParser() outdir_parser.add_argument("-o") outdir_args, _ = outdir_parser.parse_known_args(additional_args) nyt_counts = fh.load_sparse(run_args.ext_counts_fpath) nyt_vocab = fh.read_json(run_args.ext_vocab_fpath) np.random.seed(run_args.global_seed) run_seeds = iter([ 121958, 671155, 131932, 365838, 259178, 921881, 616685, 919314, 130398, 5591, 11235, 2020, 19, 8000, 1001, 12345, ]) # copy over code Path(outdir_args.o).mkdir(parents=True, exist_ok=True) shutil.copy("run_scholar.py", Path(outdir_args.o, "run_scholar.py")) shutil.copy("scholar.py", Path(outdir_args.o, "scholar.py")) if Path(outdir_args.o, "dev_metrics.csv").exists(): old_path = Path(outdir_args.o, "dev_metrics.csv") ctime = datetime.fromtimestamp(old_path.stat().st_ctime).strftime("%Y-%m-%d")
def main(): usage = "%prog model_file.npz vocab_file.json" parser = OptionParser(usage=usage) parser.add_option('--sparsity_thresh', dest='sparsity_thresh', default=1e-3, help='Sparsity threshold: default=%default') parser.add_option('--interactions', action="store_true", dest="interactions", default=False, help='Print interaction topics: default=%default') parser.add_option( '--n_pos', dest='n_pos', default=7, help='Number of positive terms to display: default=%default') parser.add_option( '--n_neg', dest='n_neg', default=4, help='Number of negative terms to display: default=%default') parser.add_option( '--max_classes', dest='max_classes', default=None, help='Maximum number of classes to display: default=%default') (options, args) = parser.parse_args() model_file = args[0] vocab_file = args[1] params = np.load(model_file) vocab = fh.read_json(vocab_file) n_pos = int(options.n_pos) n_neg = int(options.n_neg) max_classes = options.max_classes sparsity_threshold = options.sparsity_thresh interactions = options.interactions dv = params['d_v'] n_topics = params['d_t'] n_classes = params['n_classes'] if max_classes is not None: n_classes = int(max_classes) if n_topics > 1: print("\nTopics:") weights = np.array(params['W_decoder']) mean_sparsity = 0.0 for j in range(n_topics): order = list(np.argsort(weights[:, j]).tolist()) order.reverse() highest = ' '.join([vocab[i] for i in order[:n_pos]]) lowest = ' '.join([vocab[i] for i in order[-n_neg:]]) min_w = weights[:, j].min() max_w = weights[:, j].max() mean_w = weights[:, j].mean() sparsity = np.array(np.abs(weights[:, j]) < sparsity_threshold, dtype=float).sum() / float(dv) mean_sparsity += sparsity print("%d %s / %s (%0.3f, %0.3f, %0.3f) [%0.5f]" % (j, highest, lowest, min_w, mean_w, max_w, sparsity)) sparsity = np.array(np.abs(weights) < sparsity_threshold, dtype=float).sum() / float(dv * n_topics) print("Topic sparsity = %0.3f" % sparsity) if n_classes > 1: print("\nClasses:") weights = np.array(params['W_decoder_label']) mean_sparsity = 0.0 for j in range(n_classes): order = list(np.argsort(weights[:, j]).tolist()) order.reverse() highest = ' '.join([vocab[i] for i in order[:n_pos]]) lowest = ' '.join([vocab[i] for i in order[-n_neg:]]) min_w = weights[:, j].min() max_w = weights[:, j].max() mean_w = weights[:, j].mean() sparsity = np.array(np.abs(weights[:, j]) < sparsity_threshold, dtype=float).sum() / float(dv) mean_sparsity += sparsity print("%d %s / %s (%0.3f, %0.3f, %0.3f) [%0.5f]" % (j, highest, lowest, min_w, mean_w, max_w, sparsity)) sparsity = np.array(np.abs(weights) < sparsity_threshold, dtype=float).sum() / float(dv * n_classes) print("Covariate sparsity = %0.3f" % sparsity) if params['use_interactions']: print("\nInteractions:") interaction_weights = np.array(params['W_decoder_inter']) if interactions: mean_sparsity = 0.0 for j in range(n_topics): for k in range(n_classes): index = k + j * n_classes weights_sum = interaction_weights[:, index] order = list(np.argsort(weights_sum).tolist()) order.reverse() highest = ' '.join([vocab[i] for i in order[:n_pos]]) lowest = ' '.join([vocab[i] for i in order[-n_neg:]]) min_w = weights_sum.min() max_w = weights_sum.max() mean_w = weights_sum.mean() sparsity = np.array( np.abs(weights_sum) < sparsity_threshold, dtype=float).sum() / float(dv) mean_sparsity += sparsity print("%d/%d %s / %s (%0.3f, %0.3f, %0.3f) [%0.5f]" % (j, k, highest, lowest, min_w, mean_w, max_w, sparsity)) sparsity = np.array(np.abs(interaction_weights) < sparsity_threshold, dtype=float).sum() / float( dv * n_topics * 
n_classes) print("Interaction sparsity = %0.3f" % sparsity) print("\nWith interactions (but no labels):") topic_weights = np.array(params['W_decoder']) interaction_weights = np.array(params['W_decoder_inter']) if interactions: mean_sparsity = 0.0 for j in range(n_topics): print(j) for k in range(n_classes): index = k + j * n_classes weights_sum = topic_weights[:, j] + interaction_weights[:, index] order = list(np.argsort(weights_sum).tolist()) order.reverse() highest = ' '.join([vocab[i] for i in order[:n_pos]]) lowest = ' '.join([vocab[i] for i in order[-n_neg:]]) min_w = weights_sum.min() max_w = weights_sum.max() mean_w = weights_sum.mean() sparsity = np.array( np.abs(weights_sum) < sparsity_threshold, dtype=float).sum() / float(dv) mean_sparsity += sparsity print("%d/%d %s / %s (%0.3f, %0.3f, %0.3f) [%0.5f]" % (j, k, highest, lowest, min_w, mean_w, max_w, sparsity))
def main(): usage = "%prog msa_db.csv articles.csv parsed_dir output_file.csv" parser = OptionParser(usage=usage) parser.add_option('--prefix', dest='parse_prefix', default='all', help='Prefix of parsed files: default=%default') #parser.add_option('--boolarg', action="store_true", dest="boolarg", default=False, # help='Keyword argument: default=%default') (options, args) = parser.parse_args() msa_csv = args[0] articles_csv = args[1] parsed_dir = args[2] outfile = args[3] parse_prefix = options.parse_prefix if os.path.exists(outfile): raise FileExistsError("outfile already exists!") msa_df = pd.read_csv(msa_csv, header=0) print(msa_df.shape) df = pd.read_csv(articles_csv, header=0, index_col=0) n_rows, n_columns = df.shape print(df.shape) files = glob.glob(os.path.join(parsed_dir, '*.json')) n_files = len(files) assert n_files == n_rows msa_df['n_total_articles'] = 0 msa_df['n_valid_articles'] = 0 msa_df['n_terrorism_mentions'] = 0 msa_df['n_unnegated_terrorism_mentions'] = 0 msa_df['n_mental_mentions'] = 0 msa_df['n_islam_mentions'] = 0 msa_df['n_immigrant_mentions'] = 0 for i in msa_df.index: date = pd.to_datetime(msa_df.loc[i, 'Date']) msa_df.loc[i, 'date'] = date msa_df.loc[i, 'year'] = date.year #msa_df = msa_df[msa_df.year >= 1990] for i in range(n_files): if i % 100 == 0 and i > 0: print(i) msa_id = df.loc[i, 'df_index'] caseid = df.loc[i, 'caseid'] name = str(df.loc[i, 'shooter_names']) # fix an important name error name = re.sub('Marteen', 'Mateen', name) names = name.split() age = str(df.loc[i, 'age']) age_string = str(age) + '-year-old' city = str(df.loc[i, 'city']) title = df.loc[i, 'title'] if msa_id == 272: # Kalamzoo duplicate print("Skipping", i, title) elif msa_id == 276: # Belfair duplicate print("Skipping", i, title) elif msa_id == 293: # Sherman, Texas duplicate print("Skipping", i, title) elif msa_id == 280: # Chelsea, MA duplicate print("Skipping", i, title) elif msa_id == 283: # Kansas City duplicate print("Skipping", i, title) elif msa_id == 331: # Cape Coral print("Skipping", i, title) else: age_found = False name_found = False city_found = False filename = os.path.join(parsed_dir, parse_prefix + '_' + str(i) + '.json') parse = fh.read_json(filename) sentences = parse['sentences'] for sentence in sentences: tokens = [token['word'] for token in sentence['tokens']] lower_tokens = [token.lower() for token in tokens] sentence_text = ' '.join(tokens) if age_string in lower_tokens: age_found = True if city in sentence_text: city_found = True for name in names: if name in tokens: name_found = True msa_df.loc[msa_id, 'n_total_articles'] += 1 if age_found or city_found or name_found: msa_df.loc[msa_id, 'n_valid_articles'] += 1 terrorism_mention = False unnegated_terrorism_mention = False mental_mention = False islam_mention = False immigrant_mention = False for sentence in sentences: tokens = [ token['word'].lower() for token in sentence['tokens'] ] sentence_text = ' '.join(tokens) if 'terrorism' in tokens or 'terrorist' in tokens: terrorism_mention = True if 'not' in tokens or re.match('no\s*\S* evidence', sentence_text): print(sentence_text) else: unnegated_terrorism_mention = True if 'mental' in tokens: mental_mention = True if 'islam' in tokens or 'islamic' in tokens or 'muslim' in tokens or 'muslims' in tokens: islam_mention = True if 'immigrant' in tokens or 'migrant' in tokens or 'naturalized' in tokens or 'immigrated' in tokens: immigrant_mention = True if terrorism_mention: msa_df.loc[msa_id, 'n_terrorism_mentions'] += 1 if unnegated_terrorism_mention: 
msa_df.loc[msa_id, 'n_unnegated_terrorism_mentions'] += 1 if mental_mention: msa_df.loc[msa_id, 'n_mental_mentions'] += 1 if islam_mention: msa_df.loc[msa_id, 'n_islam_mentions'] += 1 if immigrant_mention: msa_df.loc[msa_id, 'n_immigrant_mentions'] += 1 msa_df.to_csv(outfile) print(msa_df.n_valid_articles.sum())