def main(): usage = "%prog tagged.jsonlist output_file.txt" parser = OptionParser(usage=usage) #parser.add_option('--keyword', dest='key', default=None, # help='Keyword argument: default=%default') #parser.add_option('--boolarg', action="store_true", dest="boolarg", default=False, # help='Keyword argument: default=%default') (options, args) = parser.parse_args() infile = args[0] outfile = args[1] outlines = [] lines = fh.read_jsonlist(infile) n_lines = len(lines) order = np.arange(n_lines) np.random.shuffle(order) print(order[:10]) for i in order: line = lines[i] text = line['text_tagged'] text = re.sub('_', '-', text) text = re.sub('\d', '#', text) text = text.lower() outlines.append(text) fh.write_list_to_text(outlines, outfile)
def predict_labels_and_evaluate(model, X, Y, PC, TC, output_dir=None, subset='train', batch_size=200):
    # Predict labels for all instances using the classifier network and evaluate the accuracy
    pred_probs = predict_label_probs(model, X, PC, TC, batch_size, eta_bn_prop=0.0)
    predictions = np.argmax(pred_probs, axis=1)
    accuracy = float(np.sum(predictions == np.argmax(Y, axis=1)) / float(len(Y)))
    print(subset, "accuracy on labels = %0.4f" % accuracy)
    if output_dir is not None:
        np.savez(os.path.join(output_dir, 'pred_probs.' + subset + '.npz'), pred_probs=pred_probs)
        fh.write_list_to_text([str(accuracy)], os.path.join(output_dir, 'accuracy.' + subset + '.txt'))
def print_and_save_weights(options, model, vocab, prior_covar_names=None, topic_covar_names=None):

    # print background
    bg = model.get_bg()
    if not options.no_bg:
        print_top_bg(bg, vocab)

    # print topics
    emb = model.get_weights()
    print("Topics:")
    maw, sparsity = print_top_words(emb, vocab)
    print("sparsity in topics = %0.4f" % sparsity)
    save_weights(options.output_dir, emb, bg, vocab, sparsity_threshold=1e-5)

    fh.write_list_to_text(['{:.4f}'.format(maw)], os.path.join(options.output_dir, 'maw.txt'))
    fh.write_list_to_text(['{:.4f}'.format(sparsity)], os.path.join(options.output_dir, 'sparsity.txt'))

    if prior_covar_names is not None:
        prior_weights = model.get_prior_weights()
        print("Topic prior associations:")
        print("Covariates:", ' '.join(prior_covar_names))
        for k in range(options.n_topics):
            output = str(k) + ': '
            for c in range(len(prior_covar_names)):
                output += '%.4f ' % prior_weights[c, k]
            print(output)
        if options.output_dir is not None:
            np.savez(os.path.join(options.output_dir, 'prior_w.npz'), weights=prior_weights, names=prior_covar_names)

    if topic_covar_names is not None:
        beta_c = model.get_covar_weights()
        print("Covariate deviations:")
        maw, sparsity = print_top_words(beta_c, vocab, topic_covar_names)
        print("sparsity in covariates = %0.4f" % sparsity)
        if options.output_dir is not None:
            np.savez(os.path.join(options.output_dir, 'beta_c.npz'), beta=beta_c, names=topic_covar_names)

        if options.interactions:
            print("Covariate interactions")
            beta_ci = model.get_covar_interaction_weights()
            print(beta_ci.shape)
            if topic_covar_names is not None:
                names = [str(k) + ':' + c for k in range(options.n_topics) for c in topic_covar_names]
            else:
                names = None
            maw, sparsity = print_top_words(beta_ci, vocab, names)
            if options.output_dir is not None:
                np.savez(os.path.join(options.output_dir, 'beta_ci.npz'), beta=beta_ci, names=names)
            print("sparsity in covariate interactions = %0.4f" % sparsity)
def save_weights(output_dir, beta, bg, feature_names, sparsity_threshold=1e-5):
    np.savez(os.path.join(output_dir, 'beta.npz'), beta=beta)
    if bg is not None:
        np.savez(os.path.join(output_dir, 'bg.npz'), bg=bg)
    fh.write_to_json(feature_names, os.path.join(output_dir, 'vocab.json'), sort_keys=False)

    topics_file = os.path.join(output_dir, 'topics.txt')
    lines = []
    for i in range(len(beta)):
        # write the top 100 words per topic, ignoring weights below the sparsity threshold
        order = list(np.argsort(beta[i]))
        order.reverse()
        pos_words = [feature_names[j] for j in order[:100] if beta[i][j] > sparsity_threshold]
        output = ' '.join(pos_words)
        lines.append(output)

    fh.write_list_to_text(lines, topics_file)
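# Added for illustration (not part of the original code): a small example of how the
# files written by save_weights() can be read back. The directory name is hypothetical,
# and this assumes the imports used elsewhere in this file (numpy as np, os).
def load_saved_weights_example(output_dir='output'):
    beta = np.load(os.path.join(output_dir, 'beta.npz'))['beta']   # topic-word weight matrix
    with open(os.path.join(output_dir, 'topics.txt')) as f:
        topics = [line.strip() for line in f]                      # one line of top words per topic
    return beta, topics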
def predict_labels_and_evaluate(model, X, Y, C, output_dir=None, subset='train'):
    """
    Predict labels for all instances using the classifier network and evaluate the accuracy
    """
    predictions = predict_labels(model, X, C)
    accuracy = float(np.sum(predictions == np.argmax(Y, axis=1)) / float(len(Y)))
    print(subset, "accuracy on labels = %0.4f" % accuracy)
    if output_dir is not None:
        fh.write_list_to_text([str(accuracy)], os.path.join(output_dir, 'accuracy.' + subset + '.txt'))
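# A minimal worked example of the accuracy computation above, with made-up values:
# Y holds one-hot gold labels, `predictions` holds predicted class indices, and
# accuracy is the fraction of rows where the prediction matches argmax(Y).
# Assumes numpy is imported as np, as elsewhere in this file.
def _accuracy_example():
    Y = np.array([[1, 0], [0, 1], [0, 1]])        # one-hot labels for 3 instances
    predictions = np.array([0, 1, 0])             # predicted class indices
    accuracy = float(np.sum(predictions == np.argmax(Y, axis=1))) / float(len(Y))
    print("accuracy = %0.4f" % accuracy)          # prints 0.6667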
def compute_npmi(topics_file, ref_vocab, ref_counts, n_vals, cols_to_skip=0, output_file=None):
    print("Loading topics")
    topics = fh.read_text(topics_file)

    mean_vals = []
    for n in n_vals:
        mean_npmi = compute_npmi_at_n(topics, ref_vocab, ref_counts, n, cols_to_skip=cols_to_skip)
        mean_vals.append(mean_npmi)

    if output_file is not None:
        lines = [str(n) + ' ' + str(v) for n, v in zip(n_vals, mean_vals)]
        fh.write_list_to_text(lines, output_file)
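# compute_npmi_at_n() is called above but not defined in this section. The sketch below
# is an assumption about how such a helper could compute mean NPMI over all pairs among
# the top-n words of each topic; it is not the original helper. It assumes numpy is
# imported as np, that ref_counts is a scipy.sparse document-term count matrix whose
# columns are aligned with ref_vocab, and that each line of `topics` lists words by weight.
def compute_npmi_at_n_sketch(topics, ref_vocab, ref_counts, n=10, cols_to_skip=0):
    vocab_index = dict(zip(ref_vocab, range(len(ref_vocab))))
    binarized = (ref_counts > 0).astype(int)          # document-level occurrence indicators
    n_docs = float(binarized.shape[0])
    topic_means = []
    for topic in topics:
        words = [w for w in topic.split()[cols_to_skip:cols_to_skip + n] if w in vocab_index]
        npmi_vals = []
        for a, word_i in enumerate(words):
            occ_i = binarized[:, vocab_index[word_i]]
            for word_j in words[a + 1:]:
                occ_j = binarized[:, vocab_index[word_j]]
                p_i = occ_i.sum() / n_docs
                p_j = occ_j.sum() / n_docs
                p_ij = occ_i.multiply(occ_j).sum() / n_docs
                if p_ij == 0.0:
                    npmi_vals.append(-1.0)            # common convention for pairs that never co-occur
                else:
                    # NPMI = PMI normalized by -log p(i, j); base of the log cancels
                    npmi_vals.append(np.log(p_ij / (p_i * p_j)) / -np.log(p_ij))
        topic_means.append(np.mean(npmi_vals))
    return float(np.mean(topic_means))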
def preprocess_data(train_infile, output_dir, vocab_size, label_type, test_prop, use_mallet_stopwords=False, replace_num=False, group_size=1, only_alpha=False, min_length=3):

    print("Loading SpaCy")
    parser = English()

    with codecs.open(train_infile, 'r', encoding='utf-8') as f:
        lines = f.readlines()
    n_items = len(lines)
    n_test = int(test_prop * n_items)
    n_train = n_items - n_test
    train_indices = np.random.choice(range(n_items), n_train, replace=False)
    test_indices = list(set(range(n_items)) - set(train_indices))

    train_X, train_vocab, train_indices, train_y, label_list, word_freqs, train_dat, train_mallet_strings, train_sage_output, train_svm_strings, label_index = load_and_process_data(train_infile, vocab_size, parser, label_type, train_indices, use_mallet_stopwords=use_mallet_stopwords, replace_num=replace_num, group_size=group_size, only_alpha=only_alpha, min_length=min_length)
    test_X, _, test_indices, test_y, _, _, test_dat, test_mallet_strings, test_sage_output, test_svm_strings, _ = load_and_process_data(train_infile, vocab_size, parser, label_type, test_indices, vocab=train_vocab, label_list=label_list, label_index=label_index, use_mallet_stopwords=use_mallet_stopwords, replace_num=replace_num, group_size=group_size, only_alpha=only_alpha, min_length=min_length)

    fh.save_sparse(train_X, os.path.join(output_dir, 'train.npz'))
    fh.write_to_json(train_vocab, os.path.join(output_dir, 'train.vocab.json'))
    fh.write_to_json(train_indices, os.path.join(output_dir, 'train.indices.json'))
    fh.save_sparse(train_y, os.path.join(output_dir, 'train.labels.npz'))
    fh.save_sparse(test_X, os.path.join(output_dir, 'test.npz'))
    fh.write_to_json(test_indices, os.path.join(output_dir, 'test.indices.json'))
    fh.save_sparse(test_y, os.path.join(output_dir, 'test.labels.npz'))
    fh.write_to_json(list(word_freqs.tolist()), os.path.join(output_dir, 'train.word_freq.json'))
    fh.write_list_to_text(train_dat, os.path.join(output_dir, 'train.dat'))

    n_labels = len(label_list)
    label_dict = dict(zip(range(n_labels), label_list))
    fh.write_to_json(label_dict, os.path.join(output_dir, 'train.label_list.json'))

    # save output for Mallet
    fh.write_list_to_text(train_mallet_strings, os.path.join(output_dir, 'train.mallet.txt'))
    fh.write_list_to_text(test_mallet_strings, os.path.join(output_dir, 'test.mallet.txt'))

    # save output for Jacob Eisenstein's SAGE code
    train_sage_output['te_data'] = test_sage_output['tr_data']
    train_sage_output['te_aspect'] = test_sage_output['tr_aspect']
    savemat(os.path.join(output_dir, 'sage.mat'), train_sage_output)

    # save output in SVM format
    fh.write_list_to_text(train_svm_strings, os.path.join(output_dir, 'train.svm.txt'))
    fh.write_list_to_text(test_svm_strings, os.path.join(output_dir, 'test.svm.txt'))
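# Small illustration of the random train/test split used above, with made-up sizes:
# np.random.choice draws train indices without replacement and the remainder becomes
# the test set (assumes numpy is imported as np).
def _train_test_split_example(n_items=10, test_prop=0.2):
    n_test = int(test_prop * n_items)
    train_indices = np.random.choice(range(n_items), n_items - n_test, replace=False)
    test_indices = sorted(set(range(n_items)) - set(train_indices))
    return list(train_indices), test_indices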
def process_subset(items, parsed, label_fields, label_lists, vocab, output_dir, output_prefix):
    n_items = len(items)
    vocab_size = len(vocab)
    vocab_index = dict(zip(vocab, range(vocab_size)))

    ids = []
    for i, item in enumerate(items):
        if 'id' in item:
            ids.append(item['id'])
    if len(ids) != n_items:
        ids = [str(i) for i in range(n_items)]

    # create a label index using string representations
    for label_field in label_fields:
        label_list = label_lists[label_field]
        n_labels = len(label_list)
        label_list_strings = [str(label) for label in label_list]
        label_index = dict(zip(label_list_strings, range(n_labels)))

        # convert labels to a data frame
        if n_labels > 0:
            label_matrix = np.zeros([n_items, n_labels], dtype=int)
            label_vector = np.zeros(n_items, dtype=int)

            for i, item in enumerate(items):
                label = item[label_field]
                label_matrix[i, label_index[str(label)]] = 1
                label_vector[i] = label_index[str(label)]

            labels_df = pd.DataFrame(label_matrix, index=ids, columns=label_list_strings)
            labels_df.to_csv(os.path.join(output_dir, output_prefix + '.' + label_field + '.csv'))
            label_vector_df = pd.DataFrame(label_vector, index=ids, columns=[label_field])
            if n_labels == 2:
                label_vector_df.to_csv(os.path.join(output_dir, output_prefix + '.' + label_field + '_vector.csv'))

    rows = []
    cols = []
    vals = []

    dat_strings = []
    dat_labels = []
    mallet_strings = []
    fast_text_lines = []

    counter = Counter()
    word_counter = Counter()
    doc_lines = []
    print("Converting to count representations")

    for i, words in enumerate(parsed):
        # get the vocab indices of words that are in the vocabulary
        indices = [vocab_index[word] for word in words if word in vocab_index]
        word_subset = [word for word in words if word in vocab_index]

        counter.clear()
        counter.update(indices)
        word_counter.clear()
        word_counter.update(word_subset)

        if len(counter.keys()) > 0:
            # update the counts
            mallet_strings.append(str(i) + '\t' + 'en' + '\t' + ' '.join(word_subset))

            dat_string = str(int(len(counter))) + ' '
            dat_string += ' '.join([str(k) + ':' + str(int(v)) for k, v in zip(list(counter.keys()), list(counter.values()))])
            dat_strings.append(dat_string)

            # for the dat format, assume just one label is given
            if len(label_fields) > 0:
                label = items[i][label_fields[-1]]
                dat_labels.append(str(label_index[str(label)]))

            values = list(counter.values())
            rows.extend([i] * len(counter))
            token_indices = sorted(counter.keys())
            cols.extend(list(token_indices))
            vals.extend([counter[k] for k in token_indices])

    # convert to a sparse representation
    sparse_X = sparse.coo_matrix((vals, (rows, cols)), shape=(n_items, vocab_size)).tocsr()
    fh.save_sparse(sparse_X, os.path.join(output_dir, output_prefix + '.npz'))
    print("Size of {:s} document-term matrix:".format(output_prefix), sparse_X.shape)

    fh.write_to_json(ids, os.path.join(output_dir, output_prefix + '.ids.json'))

    # save output for Mallet
    fh.write_list_to_text(mallet_strings, os.path.join(output_dir, output_prefix + '.mallet.txt'))

    # save output for David Blei's LDA/SLDA code
    fh.write_list_to_text(dat_strings, os.path.join(output_dir, output_prefix + '.data.dat'))
    if len(dat_labels) > 0:
        fh.write_list_to_text(dat_labels, os.path.join(output_dir, output_prefix + '.' + label_field + '.dat'))

    # save output for Jacob Eisenstein's SAGE code:
    #sparse_X_sage = sparse.csr_matrix(X, dtype=float)
    vocab_for_sage = np.zeros((vocab_size,), dtype=object)
    vocab_for_sage[:] = vocab

    # for SAGE, assume only a single label has been given
    if len(label_fields) > 0:
        # convert the label matrix to a vector of labels for SAGE
        sage_aspect = np.argmax(np.array(labels_df.values, dtype=float), axis=1) + 1
    else:
        sage_aspect = np.ones([n_items, 1], dtype=float)
    sage_no_aspect = np.array([n_items, 1], dtype=float)
    widx = np.arange(vocab_size, dtype=float) + 1

    return sparse_X, sage_aspect, sage_no_aspect, widx, vocab_for_sage
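# The .data.dat lines written above follow the sparse format expected by David Blei's
# lda-c/slda code: "<n_unique_terms> <term_id>:<count> ...". A small worked example
# with made-up vocabulary indices (sorting the keys is optional):
def _dat_line_example():
    from collections import Counter
    counter = Counter({3: 5, 4: 2, 17: 1})        # vocab index -> count for one document
    dat_string = str(len(counter)) + ' ' + ' '.join(
        '%d:%d' % (k, v) for k, v in sorted(counter.items()))
    return dat_string                              # "3 3:5 4:2 17:1"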
def preprocess_data(train_infile, test_infile, output_dir, vocab_size, use_mallet_stopwords=False, replace_num=False, lemmatize=False, log_transform=False, keep_nonalphanum=False, only_alpha=False, min_length=1):

    print("Loading SpaCy")
    parser = English()

    train_X, train_vocab, train_indices, train_y, label_list, word_freqs, train_dat, train_mallet_strings, train_sage_output, train_svm_strings = load_and_process_data(train_infile, vocab_size, parser, use_mallet_stopwords=use_mallet_stopwords, replace_num=replace_num, lemmatize=lemmatize, log_transform=log_transform, keep_nonalphanum=keep_nonalphanum, only_alpha=only_alpha, min_length=min_length)
    test_X, _, test_indices, test_y, _, _, test_dat, test_mallet_strings, test_sage_output, test_svm_strings = load_and_process_data(test_infile, vocab_size, parser, vocab=train_vocab, label_list=label_list, use_mallet_stopwords=use_mallet_stopwords, replace_num=replace_num, lemmatize=lemmatize, log_transform=log_transform, keep_nonalphanum=keep_nonalphanum, only_alpha=only_alpha, min_length=min_length)

    fh.save_sparse(train_X, os.path.join(output_dir, 'train.npz'))
    fh.write_to_json(train_vocab, os.path.join(output_dir, 'train.vocab.json'))
    fh.write_to_json(train_indices, os.path.join(output_dir, 'train.indices.json'))
    fh.save_sparse(train_y, os.path.join(output_dir, 'train.labels.npz'))
    fh.save_sparse(test_X, os.path.join(output_dir, 'test.npz'))
    fh.write_to_json(test_indices, os.path.join(output_dir, 'test.indices.json'))
    fh.save_sparse(test_y, os.path.join(output_dir, 'test.labels.npz'))

    n_labels = len(label_list)
    label_dict = dict(zip(range(n_labels), label_list))
    fh.write_to_json(label_dict, os.path.join(output_dir, 'train.label_list.json'))
    fh.write_to_json(list(word_freqs.tolist()), os.path.join(output_dir, 'train.word_freq.json'))

    # save output for David Blei's lda-c code
    fh.write_list_to_text(train_dat, os.path.join(output_dir, 'train.dat'))
    fh.write_list_to_text(test_dat, os.path.join(output_dir, 'test.dat'))

    # save output for Mallet
    fh.write_list_to_text(train_mallet_strings, os.path.join(output_dir, 'train.mallet.txt'))
    fh.write_list_to_text(test_mallet_strings, os.path.join(output_dir, 'test.mallet.txt'))

    # save output for Jacob Eisenstein's SAGE code:
    train_sage_output['te_data'] = test_sage_output['tr_data']
    train_sage_output['te_aspect'] = test_sage_output['tr_aspect']
    savemat(os.path.join(output_dir, 'sage.mat'), train_sage_output)

    # save output in SVM format
    fh.write_list_to_text(train_svm_strings, os.path.join(output_dir, 'train.svm.txt'))
    fh.write_list_to_text(test_svm_strings, os.path.join(output_dir, 'test.svm.txt'))
def predict_labels_and_evaluate(model, network_architecture, X, Y, C, output_dir=None, subset='train'):
    predictions = predict_labels(model, network_architecture, X, C)
    accuracy = float(np.sum(predictions == np.argmax(Y, axis=1)) / float(len(Y)))
    print(subset, "accuracy on labels = %0.4f" % accuracy)
    if output_dir is not None:
        fh.write_list_to_text([str(accuracy)], os.path.join(output_dir, 'accuracy.' + subset + '.txt'))
def main(): usage = "%prog input_dir train_prefix" parser = OptionParser(usage=usage) parser.add_option('-a', dest='alpha', default=1.0, help='Hyperparameter for logistic normal prior: default=%default') parser.add_option('-k', dest='n_topics', default=20, help='Size of latent representation (~num topics): default=%default') parser.add_option('-b', dest='batch_size', default=200, help='Size of minibatches: default=%default') parser.add_option('-l', dest='learning_rate', default=0.002, help='Initial learning rate: default=%default') parser.add_option('-m', dest='momentum', default=0.99, help='beta1 for Adam: default=%default') parser.add_option('-e', dest='epochs', default=250, help='Number of epochs: default=%default') parser.add_option('--en_layers', dest='encoder_layers', default=1, help='Number of encoder layers [0|1|2]: default=%default') parser.add_option('--emb_dim', dest='embedding_dim', default=300, help='Dimension of input embeddings: default=%default') parser.add_option('--en_short', action="store_true", dest="encoder_shortcuts", default=False, help='Use shortcut connections on encoder: default=%default') parser.add_option('--labels', dest='label_name', default=None, help='Read labels from input_dir/[train|test]_prefix.label_name.csv: default=%default') parser.add_option('--covars', dest='covar_names', default=None, help='Read covars from files with these names (comma-separated): default=%default') parser.add_option('--label_emb_dim', dest='label_emb_dim', default=0, help='Class embedding dimension [0 = identity]: default=%default') parser.add_option('--covar_emb_dim', dest='covar_emb_dim', default=0, help='Covariate embedding dimension [0 = identity]: default=%default') parser.add_option('--min_covar_count', dest='min_covar_count', default=None, help='Drop binary covariates that occur less than this in training: default=%default') parser.add_option('--covar_inter', action="store_true", dest="covar_interactions", default=False, help='Use covariate interactions in model: default=%default') parser.add_option('--c_layers', dest='classifier_layers', default=1, help='Number of layers in (generative) classifier [0|1|2]: default=%default') parser.add_option('--exclude_covars', action="store_true", dest="exclude_covars", default=False, help='Exclude covariates from the classifier: default=%default') parser.add_option('-r', action="store_true", dest="regularize", default=False, help='Apply adaptive regularization for sparsity in topics: default=%default') parser.add_option('-t', dest='test_prefix', default=None, help='Prefix of test set: default=%default') parser.add_option('-f', dest='final_evaluate', default=None, help='perform final evaluation on test set') parser.add_option('-d', dest='dev_prefix', default=None, help='Prefix of dev set: default=%default') parser.add_option('-o', dest='output_dir', default='output', help='Output directory: default=%default') parser.add_option('--w2v', dest='word2vec_file', default=None, help='Use this word2vec .bin file to initialize and fix embeddings: default=%default') parser.add_option('--vocab_size', dest='vocab_size', default=None, help='Filter the vocabulary keeping the most common n words: default=%default') parser.add_option('--update_bg', action="store_true", dest="update_bg", default=False, help='Update background parameters: default=%default') parser.add_option('--no_bg', action="store_true", dest="no_bg", default=False, help='Do not use background freq: default=%default') parser.add_option('--no_bn_anneal', action="store_true", 
dest="no_bn_anneal", default=False, help='Do not anneal away from batchnorm: default=%default') parser.add_option('--dev_folds', dest='dev_folds', default=0, help='Number of dev folds: default=%default') parser.add_option('--dev_fold', dest='dev_fold', default=0, help='Fold to use as dev (if dev_folds > 0): default=%default') parser.add_option('--opt', dest='optimizer', default='adam', help='Optimization algorithm to use [adam|adagrad|sgd]: default=%default') parser.add_option('--threads', dest='threads', default=8, help='Use this to limit the number of CPUs: default=%default') parser.add_option('--seed', dest='seed', default=None, help='Random seed: default=%default') (options, args) = parser.parse_args() input_dir = args[0] train_prefix = args[1] alpha = float(options.alpha) n_topics = int(options.n_topics) batch_size = int(options.batch_size) learning_rate = float(options.learning_rate) adam_beta1 = float(options.momentum) n_epochs = int(options.epochs) encoder_layers = int(options.encoder_layers) embedding_dim = int(options.embedding_dim) encoder_shortcuts = options.encoder_shortcuts label_file_name = options.label_name covar_file_names = options.covar_names use_covar_interactions = options.covar_interactions label_emb_dim = int(options.label_emb_dim) covar_emb_dim = int(options.covar_emb_dim) min_covar_count = options.min_covar_count classifier_layers = int(options.classifier_layers) covars_in_classifier = not options.exclude_covars auto_regularize = options.regularize test_prefix = options.test_prefix dev_prefix = options.dev_prefix output_dir = options.output_dir word2vec_file = options.word2vec_file vocab_size = options.vocab_size update_background = options.update_bg no_bg = options.no_bg bn_anneal = not options.no_bn_anneal dev_folds = int(options.dev_folds) final_evaluate = options.final_evaluate dev_fold = int(options.dev_fold) optimizer = options.optimizer seed = options.seed threads = int(options.threads) if seed is not None: seed = int(seed) rng = np.random.RandomState(seed) else: rng = np.random.RandomState(np.random.randint(0, 100000)) train_X, vocab, train_labels, label_names, na_label_index, label_type, train_covariates, covariate_names, covariates_type, col_sel = load_data(input_dir, train_prefix, label_file_name, covar_file_names, vocab_size=vocab_size) n_train, dv = train_X.shape if train_labels is not None: _, n_labels = train_labels.shape # convert binary labels to a single dimensional vector #if binary and n_classes == 2 and not generative: # train_labels = np.argmax(train_labels, axis=1) # train_labels = train_labels.reshape((n_train, 1)) # n_classes = 1 else: n_labels = 0 if train_covariates is not None: _, n_covariates = train_covariates.shape if min_covar_count is not None and int(min_covar_count) > 0: print("Removing rare covariates") covar_sums = train_covariates.sum(axis=0).reshape((n_covariates, )) covariate_selector = covar_sums > int(min_covar_count) train_covariates = train_covariates[:, covariate_selector] covariate_names = [name for i, name in enumerate(covariate_names) if covariate_selector[i]] n_covariates = len(covariate_names) else: n_covariates = 0 if dev_prefix is not None: dev_X, _, dev_labels, _, _, _, dev_covariates, _, _, _ = load_data(input_dir, dev_prefix, label_file_name, covar_file_names, vocab=vocab, col_sel=col_sel) n_dev, _ = dev_X.shape if dev_labels is not None: _, n_labels_dev = dev_labels.shape assert n_labels_dev == n_labels #if binary and n_classes == 2 and not generative: # test_labels = np.argmax(test_labels, axis=1) # 
test_labels = test_labels.reshape((n_test, 1)) # n_classes = 1 if dev_covariates is not None: if min_covar_count is not None and int(min_covar_count) > 0: dev_covariates = dev_covariates[:, covariate_selector] _, n_covariates_dev = dev_covariates.shape assert n_covariates_dev == n_covariates else: dev_X = None n_dev = 0 dev_labels = None dev_covariates = None if test_prefix is not None: if final_evaluate: test_X, _, test_labels, _, _, _, test_covariates, _, _, _ = load_data(input_dir, test_prefix, label_file_name, covar_file_names, vocab=vocab, col_sel=col_sel) n_test, _ = test_X.shape if test_labels is not None: _, n_labels_test = test_labels.shape assert n_labels_test == n_labels #if binary and n_classes == 2 and not generative: # test_labels = np.argmax(test_labels, axis=1) # test_labels = test_labels.reshape((n_test, 1)) # n_classes = 1 if test_covariates is not None: if min_covar_count is not None and int(min_covar_count) > 0: test_covariates = test_covariates[:, covariate_selector] _, n_covariates_test = test_covariates.shape assert n_covariates_test == n_covariates else: test_X = None n_test = 0 test_labels = None test_covariates = None is_labeled = pd.read_csv(os.path.join(input_dir, "train.is_labeled.csv"), names=['labeled']).labeled init_bg = get_init_bg(train_X) init_beta = None update_beta = True if no_bg: if n_topics == 1: init_beta = init_bg.copy() init_beta = init_beta.reshape([1, len(vocab)]) update_beta = False init_bg = np.zeros_like(init_bg) network_architecture = make_network(dv, encoder_layers, embedding_dim, n_topics, encoder_shortcuts, label_type, n_labels, label_emb_dim, covariates_type, n_covariates, covar_emb_dim, use_covar_interactions, classifier_layers, covars_in_classifier) # make_network() print("Network architecture:") for key, val in network_architecture.items(): print(key + ':', val) # load pretrained word vectors if word2vec_file is not None: vocab_size = len( vocab) vocab_dict = dict(zip(vocab, range(vocab_size))) embeddings = np.array(rng.rand(vocab_size, 300) * 0.25 - 0.5, dtype=np.float32) count = 0 print("Loading word vectors") pretrained = gensim.models.KeyedVectors.load_word2vec_format(word2vec_file, binary=False) for word, index in vocab_dict.items(): if word in pretrained: count += 1 embeddings[index, :] = pretrained[word] print("Found embeddings for %d words" % count) update_embeddings = False else: embeddings = None update_embeddings = True tf.reset_default_graph() model = Student(network_architecture, alpha=alpha, learning_rate=learning_rate, batch_size=batch_size, init_embeddings=embeddings, update_embeddings=update_embeddings, init_bg=init_bg, update_background=update_background, init_beta=init_beta, update_beta=update_beta, threads=threads, regularize=auto_regularize, optimizer=optimizer, adam_beta1=adam_beta1, seed=seed) # train full model print("Optimizing full model") model = train(model, network_architecture, train_X, train_labels, train_covariates, is_labeled=is_labeled, regularize=auto_regularize, training_epochs=n_epochs, batch_size=batch_size, rng=rng, X_dev=dev_X, Y_dev=dev_labels, C_dev=dev_covariates, bn_anneal=bn_anneal) fh.makedirs(output_dir) # print background bg = model.get_bg() if not no_bg: print_top_bg(bg, vocab) # print topics emb = model.get_weights() print("Topics:") maw, sparsity = print_top_words(emb, vocab) print("sparsity in topics = %0.4f" % sparsity) save_weights(output_dir, emb, bg, vocab, sparsity_threshold=1e-5) fh.write_list_to_text(['{:.4f}'.format(maw)], os.path.join(output_dir, 'maw.txt')) 
fh.write_list_to_text(['{:.4f}'.format(sparsity)], os.path.join(output_dir, 'sparsity.txt')) if n_covariates > 0: emb_c = model.get_covar_weights() print("Covariate deviations:") if covar_emb_dim == 0: maw, sparsity = print_top_words(emb_c, vocab, covariate_names, n_top_words=16) else: maw, sparsity = print_top_words(emb_c, vocab) print("sparsity in covariates = %0.4f" % sparsity) if use_covar_interactions: print("Covariate interactions") emb_ci = model.get_covar_inter_weights() print(emb_ci.shape) if covariate_names is not None: names = [str(k) + ':' + c for k in range(n_topics) for c in covariate_names] else: names = None maw, sparsity = print_top_words(emb_ci, vocab, names) print("sparsity in covariate interactions = %0.4f" % sparsity) print("Combined covariates and interactions:") if covar_emb_dim > 0: print_covariate_embeddings(model, covariate_names, output_dir) # Evaluate perplexity on dev and test dataa if dev_X is not None: perplexity = evaluate_perplexity(model, dev_X, dev_labels, dev_covariates, eta_bn_prop=0.0) print("Dev perplexity = %0.4f" % perplexity) fh.write_list_to_text([str(perplexity)], os.path.join(output_dir, 'perplexity.dev.txt')) if test_X is not None: if final_evaluate: perplexity = evaluate_perplexity(model, test_X, test_labels, test_covariates, eta_bn_prop=0.0) print("Test perplexity = %0.4f" % perplexity) fh.write_list_to_text([str(perplexity)], os.path.join(output_dir, 'perplexity.test.txt')) if n_covariates > 0 and covariates_type == 'categorical': print("Predicting categorical covariates") predictions = infer_categorical_covariate(model, network_architecture, train_X, train_labels) accuracy = float(np.sum(predictions == np.argmax(train_covariates, axis=1)) / float(len(train_covariates))) print("Train accuracy on covariates = %0.4f" % accuracy) if dev_X is not None: predictions = infer_categorical_covariate(model, network_architecture, dev_X, dev_labels) accuracy = float(np.sum(predictions == np.argmax(dev_covariates, axis=1)) / float(len(dev_covariates))) print("Dev accuracy on covariates = %0.4f" % accuracy) if test_X is not None: if final_evaluate: predictions = infer_categorical_covariate(model, network_architecture, test_X, test_labels) accuracy = float(np.sum(predictions == np.argmax(test_covariates, axis=1)) / float(len(test_covariates))) print("Test accuracy on covariates = %0.4f" % accuracy) if n_labels > 0: print("Predicting labels") predict_labels_and_evaluate(model, network_architecture, train_X, train_labels, train_covariates, output_dir, subset='train') if dev_X is not None: predict_labels_and_evaluate(model, network_architecture, dev_X, dev_labels, dev_covariates, output_dir, subset='dev') if test_X is not None: if final_evaluate: predict_labels_and_evaluate(model, network_architecture, test_X, test_labels, test_covariates, output_dir, subset='test') # Print associations between topics and labels if n_labels > 0 and n_labels < 7: print("Label probabilities based on topics") print("Labels:", ' '.join([name for name in label_names])) for k in range(n_topics): Z = np.zeros([1, n_topics]).astype('float32') Z[0, k] = 1.0 if n_covariates > 0: C = np.zeros([1, n_covariates]).astype('float32') else: C = None probs = model.predict_from_topics(Z, C) output = str(k) + ': ' for i in range(n_labels): output += '%.4f ' % probs[0, i] print(output) if n_covariates > 0: all_probs = np.zeros([n_covariates, n_topics]) for k in range(n_topics): Z = np.zeros([1, n_topics]).astype('float32') Z[0, k] = 1.0 Y = None for c in range(n_covariates): C = np.zeros([1, 
n_covariates]).astype('float32') C[0, c] = 1.0 probs = model.predict_from_topics(Z, C) all_probs[c, k] = probs[0, 0] np.savez(os.path.join(output_dir, 'covar_topic_probs.npz'), probs=all_probs) # save document representations theta = model.compute_theta(train_X, train_labels, train_covariates) np.savez(os.path.join(output_dir, 'train.theta.npz'), theta=theta) if dev_X is not None: if dev_labels is None: dev_Y = None else: dev_Y = np.zeros_like(dev_labels) theta = model.compute_theta(dev_X, dev_Y, dev_covariates) np.savez(os.path.join(output_dir, 'dev.theta.npz'), theta=theta) if n_test > 0: if final_evaluate: if test_labels is None: test_Y = None else: test_Y = np.zeros_like(test_labels) theta = model.compute_theta(test_X, test_Y, test_covariates) np.savez(os.path.join(output_dir, 'test.theta.npz'), theta=theta)
def main(): usage = "%prog input_dir train_prefix" parser = OptionParser(usage=usage) parser.add_option( '-a', dest='alpha', default=1.0, help='Hyperparameter for logistic normal prior: default=%default') parser.add_option( '-k', dest='n_topics', default=20, help='Size of latent representation (~num topics): default=%default') parser.add_option('-b', dest='batch_size', default=200, help='Size of minibatches: default=%default') parser.add_option('-l', dest='learning_rate', default=0.002, help='Initial learning rate: default=%default') parser.add_option('-m', dest='momentum', default=0.99, help='beta1 for Adam: default=%default') parser.add_option('-e', dest='epochs', default=200, help='Number of epochs: default=%default') parser.add_option('--emb_dim', dest='embedding_dim', default=300, help='Dimension of input embeddings: default=%default') parser.add_option( '--labels', dest='label_name', default=None, help= 'Read labels from input_dir/[train|test]_prefix.label_name.csv: default=%default' ) parser.add_option( '--covars', dest='covar_names', default=None, help= 'Read covars from files with these names (comma-separated): default=%default' ) parser.add_option( '--label_emb_dim', dest='label_emb_dim', default=-1, help='Class embedding dimension [0 = identity]: default=%default') parser.add_option( '--covar_emb_dim', dest='covar_emb_dim', default=-1, help='Covariate embedding dimension [0 = identity]: default=%default') parser.add_option( '--min_covar_count', dest='min_covar_count', default=None, help= 'Drop binary covariates that occur less than this in training: default=%default' ) parser.add_option( '--c_layers', dest='classifier_layers', default=1, help= 'Number of layers in (generative) classifier [0|1|2]: default=%default' ) parser.add_option('-t', dest='test_prefix', default=None, help='Prefix of test set: default=%default') parser.add_option('-o', dest='output_dir', default='output', help='Output directory: default=%default') parser.add_option( '--w2v', dest='word2vec_file', default=None, help= 'Use this word2vec .bin file to initialize and fix embeddings: default=%default' ) parser.add_option('--update_bg', action="store_true", dest="update_bg", default=False, help='Update background parameters: default=%default') parser.add_option('--no_bg', action="store_true", dest="no_bg", default=False, help='Do not use background freq: default=%default') parser.add_option( '--no_bn_anneal', action="store_true", dest="no_bn_anneal", default=False, help='Do not anneal away from batchnorm: default=%default') parser.add_option( '--test_samples', dest='test_samples', default=20, help= 'Number of samples to use in computing test perplexity: default=%default' ) parser.add_option('--dev_folds', dest='dev_folds', default=0, help='Number of dev folds: default=%default') parser.add_option( '--dev_fold', dest='dev_fold', default=0, help='Fold to use as dev (if dev_folds > 0): default=%default') (options, args) = parser.parse_args() input_dir = args[0] train_prefix = args[1] alpha = float(options.alpha) n_topics = int(options.n_topics) batch_size = int(options.batch_size) learning_rate = float(options.learning_rate) adam_beta1 = float(options.momentum) n_epochs = int(options.epochs) embedding_dim = int(options.embedding_dim) label_file_name = options.label_name covar_file_names = options.covar_names label_emb_dim = int(options.label_emb_dim) covar_emb_dim = int(options.covar_emb_dim) min_covar_count = options.min_covar_count classifier_layers = int(options.classifier_layers) test_prefix = options.test_prefix 
output_dir = options.output_dir word2vec_file = options.word2vec_file update_background = options.update_bg no_bg = options.no_bg bn_anneal = not options.no_bn_anneal test_samples = int(options.test_samples) dev_folds = int(options.dev_folds) dev_fold = int(options.dev_fold) rng = np.random.RandomState(np.random.randint(0, 100000)) # load the training data train_X, vocab, train_labels, label_names, label_type, train_covariates, covariate_names, covariates_type = load_data( input_dir, train_prefix, label_file_name, covar_file_names) n_train, dv = train_X.shape if train_labels is not None: _, n_labels = train_labels.shape else: n_labels = 0 if train_covariates is not None: _, n_covariates = train_covariates.shape if min_covar_count is not None and int(min_covar_count) > 0: print("Removing rare covariates") covar_sums = train_covariates.sum(axis=0).reshape((n_covariates, )) covariate_selector = covar_sums > int(min_covar_count) train_covariates = train_covariates[:, covariate_selector] covariate_names = [ name for i, name in enumerate(covariate_names) if covariate_selector[i] ] n_covariates = len(covariate_names) else: n_covariates = 0 # split into train and dev if dev_folds > 0: n_dev = int(n_train / dev_folds) indices = np.array(range(n_train), dtype=int) rng.shuffle(indices) if dev_fold < dev_folds - 1: dev_indices = indices[n_dev * dev_fold:n_dev * (dev_fold + 1)] else: dev_indices = indices[n_dev * dev_fold:] train_indices = list(set(indices) - set(dev_indices)) dev_X = train_X[dev_indices, :] train_X = train_X[train_indices, :] if train_labels is not None: dev_labels = train_labels[dev_indices, :] train_labels = train_labels[train_indices, :] else: dev_labels = None if train_covariates is not None: dev_covariates = train_covariates[dev_indices, :] train_covariates = train_covariates[train_indices, :] else: dev_covariates = None n_train = len(train_indices) else: dev_X = None dev_labels = None dev_covariates = None n_dev = 0 # load the test data if test_prefix is not None: test_X, _, test_labels, _, _, test_covariates, _, _ = load_data( input_dir, test_prefix, label_file_name, covar_file_names, vocab=vocab) n_test, _ = test_X.shape if test_labels is not None: _, n_labels_test = test_labels.shape assert n_labels_test == n_labels if test_covariates is not None: if min_covar_count is not None and int(min_covar_count) > 0: test_covariates = test_covariates[:, covariate_selector] _, n_covariates_test = test_covariates.shape assert n_covariates_test == n_covariates else: test_X = None n_test = 0 test_labels = None test_covariates = None # initialize the background using overall word frequencies init_bg = get_init_bg(train_X) if no_bg: init_bg = np.zeros_like(init_bg) # combine the network configuration parameters into a dictionary network_architecture = make_network(dv, embedding_dim, n_topics, label_type, n_labels, label_emb_dim, covariates_type, n_covariates, covar_emb_dim, classifier_layers) # make_network() print("Network architecture:") for key, val in network_architecture.items(): print(key + ':', val) # load pretrained word vectors if word2vec_file is not None: vocab_size = len(vocab) vocab_dict = dict(zip(vocab, range(vocab_size))) embeddings = np.array(rng.rand(vocab_size, 300) * 0.25 - 0.5, dtype=np.float32) count = 0 print("Loading word vectors") pretrained = gensim.models.KeyedVectors.load_word2vec_format( word2vec_file, binary=True) for word, index in vocab_dict.items(): if word in pretrained: count += 1 embeddings[index, :] = pretrained[word] print("Found embeddings for %d words" 
% count) update_embeddings = False else: embeddings = None update_embeddings = True # create the model model = Scholar(network_architecture, alpha=alpha, learning_rate=learning_rate, init_embeddings=embeddings, update_embeddings=update_embeddings, init_bg=init_bg, update_background=update_background, adam_beta1=adam_beta1) # train the model print("Optimizing full model") model = train(model, network_architecture, train_X, train_labels, train_covariates, training_epochs=n_epochs, batch_size=batch_size, rng=rng, X_dev=dev_X, Y_dev=dev_labels, C_dev=dev_covariates, bn_anneal=bn_anneal) # make output directory fh.makedirs(output_dir) # print background bg = model.get_bg() if not no_bg: print_top_bg(bg, vocab) # print topics emb = model.get_weights() print("Topics:") maw, sparsity = print_top_words(emb, vocab) print("sparsity in topics = %0.4f" % sparsity) save_weights(output_dir, emb, bg, vocab, sparsity_threshold=1e-5) fh.write_list_to_text(['{:.4f}'.format(maw)], os.path.join(output_dir, 'maw.txt')) fh.write_list_to_text(['{:.4f}'.format(sparsity)], os.path.join(output_dir, 'sparsity.txt')) if n_covariates > 0: beta_c = model.get_covar_weights() print("Covariate deviations:") if covar_emb_dim == 0: maw, sparsity = print_top_words(beta_c, vocab, covariate_names) else: maw, sparsity = print_top_words(beta_c, vocab) print("sparsity in covariates = %0.4f" % sparsity) if output_dir is not None: np.savez(os.path.join(output_dir, 'beta_c.npz'), beta=beta_c, names=covariate_names) # Evaluate perplexity on dev and test dataa if dev_X is not None: perplexity = evaluate_perplexity(model, dev_X, dev_labels, dev_covariates, eta_bn_prop=0.0, n_samples=test_samples) print("Dev perplexity = %0.4f" % perplexity) fh.write_list_to_text([str(perplexity)], os.path.join(output_dir, 'perplexity.dev.txt')) if test_X is not None: perplexity = evaluate_perplexity(model, test_X, test_labels, test_covariates, eta_bn_prop=0.0, n_samples=test_samples) print("Test perplexity = %0.4f" % perplexity) fh.write_list_to_text([str(perplexity)], os.path.join(output_dir, 'perplexity.test.txt')) # evaluate accuracy on predicting categorical covariates if n_covariates > 0 and covariates_type == 'categorical': print("Predicting categorical covariates") predictions = infer_categorical_covariate(model, network_architecture, train_X, train_labels) accuracy = float( np.sum(predictions == np.argmax(train_covariates, axis=1)) / float(len(train_covariates))) print("Train accuracy on covariates = %0.4f" % accuracy) if output_dir is not None: fh.write_list_to_text([str(accuracy)], os.path.join(output_dir, 'accuracy.train.txt')) if dev_X is not None: predictions = infer_categorical_covariate(model, network_architecture, dev_X, dev_labels) accuracy = float( np.sum(predictions == np.argmax(dev_covariates, axis=1)) / float(len(dev_covariates))) print("Dev accuracy on covariates = %0.4f" % accuracy) if output_dir is not None: fh.write_list_to_text([str(accuracy)], os.path.join(output_dir, 'accuracy.dev.txt')) if test_X is not None: predictions = infer_categorical_covariate(model, network_architecture, test_X, test_labels) accuracy = float( np.sum(predictions == np.argmax(test_covariates, axis=1)) / float(len(test_covariates))) print("Test accuracy on covariates = %0.4f" % accuracy) if output_dir is not None: fh.write_list_to_text([str(accuracy)], os.path.join(output_dir, 'accuracy.test.txt')) # evaluate accuracy on predicting labels if n_labels > 0: print("Predicting labels") predict_labels_and_evaluate(model, train_X, train_labels, 
train_covariates, output_dir, subset='train') if dev_X is not None: predict_labels_and_evaluate(model, dev_X, dev_labels, dev_covariates, output_dir, subset='dev') if test_X is not None: predict_labels_and_evaluate(model, test_X, test_labels, test_covariates, output_dir, subset='test') # Print associations between topics and labels if n_labels > 0 and n_labels < 7: print("Label probabilities based on topics") print("Labels:", ' '.join([name for name in label_names])) for k in range(n_topics): Z = np.zeros([1, n_topics]).astype('float32') Z[0, k] = 1.0 Y = None if n_covariates > 0: C = np.zeros([1, n_covariates]).astype('float32') else: C = None probs = model.predict_from_topics(Z, C) output = str(k) + ': ' for i in range(n_labels): output += '%.4f ' % probs[0, i] print(output) if n_covariates > 0: all_probs = np.zeros([n_covariates, n_topics]) for k in range(n_topics): Z = np.zeros([1, n_topics]).astype('float32') Z[0, k] = 1.0 Y = None for c in range(n_covariates): C = np.zeros([1, n_covariates]).astype('float32') C[0, c] = 1.0 probs = model.predict_from_topics(Z, C) all_probs[c, k] = probs[0, 0] np.savez(os.path.join(output_dir, 'covar_topic_probs.npz'), probs=all_probs) # save document representations print("Getting topic proportions") theta = model.compute_theta(train_X, train_labels, train_covariates) print("Saving topic proportions") np.savez(os.path.join(output_dir, 'theta.train.npz'), theta=theta) if dev_X is not None: dev_Y = np.zeros_like(dev_labels) print("Getting topic proportions for dev data") theta = model.compute_theta(dev_X, dev_Y, dev_covariates) print("Saving topic proportions") np.savez(os.path.join(output_dir, 'theta.dev.npz'), theta=theta) if n_test > 0: test_Y = np.zeros_like(test_labels) print("Getting topic proportions for test data") theta = model.compute_theta(test_X, test_Y, test_covariates) print("Saving topic proportions") np.savez(os.path.join(output_dir, 'theta.test.npz'), theta=theta)
def main(): usage = "%prog input_dir" parser = OptionParser(usage=usage) parser.add_option( '-k', dest='n_topics', type=int, default=20, help='Size of latent representation (~num topics): default=%default') parser.add_option('-l', dest='learning_rate', type=float, default=0.002, help='Initial learning rate: default=%default') parser.add_option('-m', dest='momentum', type=float, default=0.99, help='beta1 for Adam: default=%default') parser.add_option('--batch-size', dest='batch_size', type=int, default=200, help='Size of minibatches: default=%default') parser.add_option('--epochs', type=int, default=200, help='Number of epochs: default=%default') parser.add_option('--train-prefix', type=str, default='train', help='Prefix of train set: default=%default') parser.add_option('--test-prefix', type=str, default=None, help='Prefix of test set: default=%default') parser.add_option( '--labels', type=str, default=None, help= 'Read labels from input_dir/[train|test].labels.csv: default=%default') parser.add_option( '--prior-covars', type=str, default=None, help= 'Read prior covariates from files with these names (comma-separated): default=%default' ) parser.add_option( '--topic-covars', type=str, default=None, help= 'Read topic covariates from files with these names (comma-separated): default=%default' ) parser.add_option( '--interactions', action="store_true", default=False, help= 'Use interactions between topics and topic covariates: default=%default' ) parser.add_option( '--min-prior-covar-count', type=int, default=None, help= 'Drop prior covariates with less than this many non-zero values in the training dataa: default=%default' ) parser.add_option( '--min-topic-covar-count', type=int, default=None, help= 'Drop topic covariates with less than this many non-zero values in the training dataa: default=%default' ) parser.add_option( '--l1-topics', type=float, default=0.0, help='Regularization strength on topic weights: default=%default') parser.add_option( '--l1-topic-covars', type=float, default=0.0, help= 'Regularization strength on topic covariate weights: default=%default') parser.add_option( '--l1-interactions', type=float, default=0.0, help= 'Regularization strength on topic covariate interaction weights: default=%default' ) parser.add_option( '--l2-prior-covars', type=float, default=0.0, help= 'Regularization strength on prior covariate weights: default=%default') parser.add_option('-o', dest='output_dir', type=str, default='output', help='Output directory: default=%default') parser.add_option('--emb-dim', type=int, default=300, help='Dimension of input embeddings: default=%default') parser.add_option( '--w2v', dest='word2vec_file', type=str, default=None, help= 'Use this word2vec .bin file to initialize and fix embeddings: default=%default' ) parser.add_option( '--alpha', type=float, default=1.0, help='Hyperparameter for logistic normal prior: default=%default') parser.add_option('--no-bg', action="store_true", default=False, help='Do not use background freq: default=%default') parser.add_option('--dev-folds', type=int, default=0, help='Number of dev folds: default=%default') parser.add_option( '--dev-fold', type=int, default=0, help='Fold to use as dev (if dev_folds > 0): default=%default') parser.add_option('--device', type=int, default=None, help='GPU to use: default=%default') parser.add_option('--seed', type=int, default=None, help='Random seed: default=%default') options, args = parser.parse_args() input_dir = args[0] if options.seed is not None: rng = np.random.RandomState(options.seed) 
else: rng = np.random.RandomState(np.random.randint(0, 100000)) # load the training data train_X, vocab, row_selector = load_word_counts(input_dir, options.train_prefix) train_labels, label_type, label_names, n_labels = load_labels( input_dir, options.train_prefix, row_selector, options) train_prior_covars, prior_covar_selector, prior_covar_names, n_prior_covars = load_covariates( input_dir, options.train_prefix, row_selector, options.prior_covars, options.min_prior_covar_count) train_topic_covars, topic_covar_selector, topic_covar_names, n_topic_covars = load_covariates( input_dir, options.train_prefix, row_selector, options.topic_covars, options.min_topic_covar_count) options.n_train, vocab_size = train_X.shape options.n_labels = n_labels if n_labels > 0: print("Train label proportions:", np.mean(train_labels, axis=0)) # split into training and dev if desired train_indices, dev_indices = train_dev_split(options, rng) train_X, dev_X = split_matrix(train_X, train_indices, dev_indices) train_labels, dev_labels = split_matrix(train_labels, train_indices, dev_indices) train_prior_covars, dev_prior_covars = split_matrix( train_prior_covars, train_indices, dev_indices) train_topic_covars, dev_topic_covars = split_matrix( train_topic_covars, train_indices, dev_indices) n_train, _ = train_X.shape # load the test data if options.test_prefix is not None: test_X, _, row_selector = load_word_counts(input_dir, options.test_prefix, vocab=vocab) test_labels, _, _, _ = load_labels(input_dir, options.test_prefix, row_selector, options) test_prior_covars, _, _, _ = load_covariates( input_dir, options.test_prefix, row_selector, options.prior_covars, covariate_selector=prior_covar_selector) test_topic_covars, _, _, _ = load_covariates( input_dir, options.test_prefix, row_selector, options.topic_covars, covariate_selector=topic_covar_selector) n_test, _ = test_X.shape else: test_X = None n_test = 0 test_labels = None test_prior_covars = None test_topic_covars = None # initialize the background using overall word frequencies init_bg = get_init_bg(train_X) if options.no_bg: init_bg = np.zeros_like(init_bg) # combine the network configuration parameters into a dictionary network_architecture = make_network(options, vocab_size, label_type, n_labels, n_prior_covars, n_topic_covars) print("Network architecture:") for key, val in network_architecture.items(): print(key + ':', val) # load word vectors embeddings, update_embeddings = load_word_vectors(options, rng, vocab) # create the model model = Scholar(network_architecture, alpha=options.alpha, learning_rate=options.learning_rate, init_embeddings=embeddings, update_embeddings=update_embeddings, init_bg=init_bg, adam_beta1=options.momentum, device=options.device) # train the model print("Optimizing full model") model = train(model, network_architecture, train_X, train_labels, train_prior_covars, train_topic_covars, training_epochs=options.epochs, batch_size=options.batch_size, rng=rng, X_dev=dev_X, Y_dev=dev_labels, PC_dev=dev_prior_covars, TC_dev=dev_topic_covars) # make output directory fh.makedirs(options.output_dir) # display and save weights print_and_save_weights(options, model, vocab, prior_covar_names, topic_covar_names) # Evaluate perplexity on dev and test data if dev_X is not None: perplexity = evaluate_perplexity(model, dev_X, dev_labels, dev_prior_covars, dev_topic_covars, options.batch_size, eta_bn_prop=0.0) print("Dev perplexity = %0.4f" % perplexity) fh.write_list_to_text([str(perplexity)], os.path.join(options.output_dir, 'perplexity.dev.txt')) if 
test_X is not None: perplexity = evaluate_perplexity(model, test_X, test_labels, test_prior_covars, test_topic_covars, options.batch_size, eta_bn_prop=0.0) print("Test perplexity = %0.4f" % perplexity) fh.write_list_to_text([str(perplexity)], os.path.join(options.output_dir, 'perplexity.test.txt')) # evaluate accuracy on predicting labels if n_labels > 0: print("Predicting labels") predict_labels_and_evaluate(model, train_X, train_labels, train_prior_covars, train_topic_covars, options.output_dir, subset='train') if dev_X is not None: predict_labels_and_evaluate(model, dev_X, dev_labels, dev_prior_covars, dev_topic_covars, options.output_dir, subset='dev') if test_X is not None: predict_labels_and_evaluate(model, test_X, test_labels, test_prior_covars, test_topic_covars, options.output_dir, subset='test') # print label probabilities for each topic print_topic_label_associations(options, label_names, model, n_prior_covars, n_topic_covars) # save document representations print("Saving document representations") save_document_representations(model, train_X, train_labels, train_prior_covars, train_topic_covars, options.output_dir, 'train', batch_size=options.batch_size) if dev_X is not None: save_document_representations(model, dev_X, dev_labels, dev_prior_covars, dev_topic_covars, options.output_dir, 'dev', batch_size=options.batch_size) if n_test > 0: save_document_representations(model, test_X, test_labels, test_prior_covars, test_topic_covars, options.output_dir, 'test', batch_size=options.batch_size)
def process_subset(items, parsed, label_field, label_list, vocab, output_dir, output_prefix): n_items = len(items) n_labels = len(label_list) vocab_size = len(vocab) vocab_index = dict(zip(vocab, range(vocab_size))) ids = [] for i, item in enumerate(items): if 'id' in item: ids.append(item['id']) if len(ids) != n_items: ids = [str(i) for i in range(n_items)] # create a label index using string representations label_list_strings = [str(label) for label in label_list] label_index = dict(zip(label_list_strings, range(n_labels))) # convert labels to a data frame if n_labels > 0: label_matrix = np.zeros([n_items, n_labels], dtype=int) label_vector = np.zeros(n_items, dtype=int) for i, item in enumerate(items): id = ids[i] label = item[label_field] label_matrix[i, label_index[str(label)]] = 1 label_vector[i] = label_index[str(label)] labels_df = pd.DataFrame(label_matrix, index=ids, columns=label_list_strings) labels_df.to_csv( os.path.join(output_dir, output_prefix + '.' + label_field + '.csv')) label_vector_df = pd.DataFrame(label_vector, index=ids, columns=[label_field]) label_vector_df.to_csv( os.path.join(output_dir, output_prefix + '.label_vector.csv')) else: print("No labels found") X = np.zeros([n_items, vocab_size], dtype=int) dat_strings = [] dat_labels = [] mallet_strings = [] fast_text_lines = [] counter = Counter() word_counter = Counter() doc_lines = [] print("Converting to count representations") for i, words in enumerate(parsed): # get the vocab indices of words that are in the vocabulary indices = [vocab_index[word] for word in words if word in vocab_index] word_subset = [word for word in words if word in vocab_index] counter.clear() counter.update(indices) word_counter.clear() word_counter.update(word_subset) if len(counter.keys()) > 0: # udpate the counts mallet_strings.append( str(i) + '\t' + 'en' + '\t' + ' '.join(word_subset)) dat_string = str(int(len(counter))) + ' ' dat_string += ' '.join([ str(k) + ':' + str(int(v)) for k, v in zip(list(counter.keys()), list(counter.values())) ]) dat_strings.append(dat_string) if label_field is not None: label = items[i][label_field] dat_labels.append(str(label_index[str(label)])) values = list(counter.values()) X[np.ones(len(counter.keys()), dtype=int) * i, list(counter.keys())] += values # convert to a sparse representation sparse_X = sparse.csr_matrix(X) fh.save_sparse(sparse_X, os.path.join(output_dir, output_prefix + '.npz')) print(sparse_X.shape) print(len(dat_strings)) fh.write_to_json(ids, os.path.join(output_dir, output_prefix + '.ids.json')) # save output for Mallet fh.write_list_to_text( mallet_strings, os.path.join(output_dir, output_prefix + '.mallet.txt')) # save output for David Blei's LDA/SLDA code fh.write_list_to_text( dat_strings, os.path.join(output_dir, output_prefix + '.data.dat')) if len(dat_labels) > 0: fh.write_list_to_text( dat_labels, os.path.join(output_dir, output_prefix + '.' + label_field + '.dat')) # save output for Jacob Eisenstein's SAGE code: sparse_X_sage = sparse.csr_matrix(X, dtype=float) vocab_for_sage = np.zeros((vocab_size, ), dtype=np.object) vocab_for_sage[:] = vocab if n_labels > 0: # convert array to vector of labels for SAGE sage_aspect = np.argmax(np.array(labels_df.values, dtype=float), axis=1) + 1 else: sage_aspect = np.ones([n_items, 1], dtype=float) sage_no_aspect = np.array([n_items, 1], dtype=float) widx = np.arange(vocab_size, dtype=float) + 1 return sparse_X_sage, sage_aspect, sage_no_aspect, widx, vocab_for_sage
def process_subset(
    items,
    ids,
    parsed,
    labels,
    label_fields,
    label_lists,
    vocab,
    output_dir,
    output_prefix,
    count_dtype=int,  # np.int is deprecated; the builtin int is equivalent here
):
    n_items = len(items)
    vocab_size = len(vocab)
    vocab_index = dict(zip(vocab, range(vocab_size)))

    if not ids or len(ids) != n_items:
        ids = [str(i) for i in range(n_items)]

    # create a label index using string representations
    if labels:
        labels_df = pd.DataFrame.from_records(labels, index=ids)

        for label_field in label_fields:
            labels_df_subset = pd.get_dummies(labels_df[label_field])

            # for any classes not present in the subset, add 0 columns
            # (handles case where classes appear in only one of train or test)
            for category in label_lists[label_field]:
                if category not in labels_df_subset:
                    labels_df_subset[category] = 0
            labels_df_subset.to_csv(
                os.path.join(output_dir, output_prefix + "." + label_field + ".csv"))

            if labels_df[label_field].nunique() == 2:
                labels_df_subset.iloc[:, 1].to_csv(
                    os.path.join(output_dir, output_prefix + "." + label_field + "_vector.csv"),
                    header=[label_field],
                )
        # used later
        label_index = dict(zip(labels_df_subset.columns, range(len(labels_df_subset))))

    X = np.zeros([n_items, vocab_size], dtype=count_dtype)

    dat_strings = []
    dat_labels = []
    mallet_strings = []
    fast_text_lines = []

    counter = Counter()
    word_counter = Counter()
    doc_lines = []
    print("Converting to count representations")

    for i, words in enumerate(parsed):
        # get the vocab indices of words that are in the vocabulary
        words = words.split()
        indices = [vocab_index[word] for word in words if word in vocab_index]
        word_subset = [word for word in words if word in vocab_index]

        counter.clear()
        counter.update(indices)
        word_counter.clear()
        word_counter.update(word_subset)

        if len(counter.keys()) > 0:
            # update the counts
            mallet_strings.append(str(i) + "\t" + "en" + "\t" + " ".join(word_subset))

            dat_string = str(int(len(counter))) + " "
            dat_string += " ".join([str(k) + ":" + str(int(v))
                                    for k, v in zip(list(counter.keys()), list(counter.values()))])
            dat_strings.append(dat_string)

            # for the dat format, assume just one label is given
            if len(label_fields) > 0:
                label = labels[i][label_fields[-1]]
                dat_labels.append(str(label_index[str(label)]))

            values = np.array(list(counter.values()), dtype=count_dtype)
            X[np.ones(len(counter.keys()), dtype=int) * i, list(counter.keys())] += values

    # convert to a sparse representation
    sparse_X = sparse.csr_matrix(X)
    fh.save_sparse(sparse_X, os.path.join(output_dir, output_prefix + ".npz"))

    print("Size of {:s} document-term matrix:".format(output_prefix), sparse_X.shape)

    fh.write_to_json(ids, os.path.join(output_dir, output_prefix + ".ids.json"))

    # save output for Mallet
    fh.write_list_to_text(mallet_strings, os.path.join(output_dir, output_prefix + ".mallet.txt"))

    # save output for David Blei's LDA/SLDA code
    fh.write_list_to_text(dat_strings, os.path.join(output_dir, output_prefix + ".data.dat"))
    if len(dat_labels) > 0:
        fh.write_list_to_text(
            dat_labels,
            os.path.join(output_dir, output_prefix + "." + label_field + ".dat"),
        )

    # save output for Jacob Eisenstein's SAGE code:
    sparse_X_sage = sparse.csr_matrix(X, dtype=float)
    vocab_for_sage = np.zeros((vocab_size,), dtype=object)  # np.object is deprecated; use the builtin
    vocab_for_sage[:] = vocab

    # for SAGE, assume only a single label has been given
    if len(label_fields) > 0:
        # convert array to vector of labels for SAGE
        sage_aspect = np.argmax(np.array(labels_df_subset.values, dtype=float), axis=1) + 1
    else:
        sage_aspect = np.ones([n_items, 1], dtype=float)
    sage_no_aspect = np.array([n_items, 1], dtype=float)
    widx = np.arange(vocab_size, dtype=float) + 1

    return sparse_X_sage, sage_aspect, sage_no_aspect, widx, vocab_for_sage
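# Minimal usage sketch for this multi-label variant (illustrative values only, not
# from the original pipeline): unlike the variant above, `parsed` holds whitespace-
# joined strings, labels are passed as per-document records, and the class lists are
# supplied per label field.
#
# items = [{'id': 'd0'}, {'id': 'd1'}]
# ids = ['d0', 'd1']
# parsed = ['good movie', 'bad movie']
# labels = [{'sentiment': 'pos'}, {'sentiment': 'neg'}]
# process_subset(items, ids, parsed, labels, ['sentiment'],
#                {'sentiment': ['pos', 'neg']}, ['bad', 'good', 'movie'],
#                'output', 'train')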
def main():
    usage = "%prog input_dir"
    parser = OptionParser(usage=usage)
    parser.add_option('-k', dest='n_topics', default=100,
                      help='Size of latent representation (~num topics): default=%default')
    parser.add_option('-r', action="store_true", dest="regularize", default=False,
                      help='Apply adaptive regularization for sparsity in topics: default=%default')
    parser.add_option('-o', dest='output_dir', default='output',
                      help='Output directory: default=%default')
    parser.add_option('--vocab-size', dest='vocab_size', default=None,
                      help='Filter the vocabulary keeping the most common n words: default=%default')
    parser.add_option('--no-bg', action="store_true", dest="no_bg", default=False,
                      help='Do not use background freq: default=%default')
    parser.add_option('--no-bn-anneal', action="store_true", dest="no_bn_anneal", default=False,
                      help='Do not anneal away from batchnorm: default=%default')
    parser.add_option('--opt', dest='optimizer', default='adam',
                      help='Optimization algorithm to use [adam|adagrad|sgd]: default=%default')
    parser.add_option('--dev-folds', dest='dev_folds', default=0,
                      help='Number of dev folds: default=%default')
    parser.add_option('--dev-fold', dest='dev_fold', default=0,
                      help='Fold to use as dev (if dev_folds > 0): default=%default')
    parser.add_option('--test-prefix', dest='test_prefix', default=None,
                      help='Prefix of test set: default=%default')
    parser.add_option('--labels', dest='label_name', default=None,
                      help='Read labels from input_dir/[train|test]_prefix.label_name.csv: default=%default')

    (options, args) = parser.parse_args()

    input_dir = args[0]
    dev_folds = int(options.dev_folds)
    dev_fold = int(options.dev_fold)
    label_file_name = options.label_name

    # model hyperparameters
    alpha = 1.0
    n_topics = int(options.n_topics)
    batch_size = 200
    # learning_rate = 0.002
    learning_rate = 0.001
    adam_beta1 = 0.99
    n_epochs = 450
    encoder_layers = 1  # number of encoder layers [0|1|2]
    encoder_shortcuts = False
    classifier_layers = 1  # [0|1|2]
    auto_regularize = options.regularize
    output_dir = options.output_dir
    # word2vec_file = "/home/lcw2/share/embeddings/GoogleNews-vectors-negative300/GoogleNews-vectors-negative300.bin"
    word2vec_file = "../embeddings/Tencent_AILab_ChineseEmbedding/Tencent_AILab_ChineseEmbedding.bin"
    # word2vec_file = "C:\\\\Soft\\share\\GoogleNews-vectors-negative300.bin"
    embedding_dim = 200
    vocab_size = options.vocab_size
    update_background = False
    no_bg = options.no_bg
    bn_anneal = True
    optimizer = options.optimizer
    seed = 1
    threads = 4

    if seed is not None:
        seed = int(seed)
        rng = np.random.RandomState(seed)
    else:
        rng = np.random.RandomState(np.random.randint(0, 100000))

    # kb embedding file
    # kb2vec_file = "/home/lcw2/github/my_vaetm/data/kb2vec/WikiData.KB.100d.zh.pickle"
    kb2vec_file = "./data/kb2vec/WikiData.KB.100d.zh.v2.pickle"
    kb_dim = 100

    test_prefix = 'test'

    # load the training data
    train_prefix = 'train'
    train_X, vocab, train_labels, label_names, label_type, col_sel, num = load_data(
        input_dir, train_prefix, label_file_name, vocab_size=vocab_size)
    n_train, dv = train_X.shape

    if train_labels is not None:
        _, n_labels = train_labels.shape
        print('n_labels:', n_labels)
    else:
        n_labels = 0

    if test_prefix == 'test':
        test_X, _, test_labels, _, _, _, _ = load_data(input_dir, test_prefix, label_file_name, vocab=vocab)
        n_test, _ = test_X.shape
        if test_labels is not None:
            _, n_labels_test = test_labels.shape
            assert n_labels_test == n_labels

    # split training data into train and dev
    if dev_folds > 0:
        n_dev = int(n_train / dev_folds)
        indices = np.array(range(n_train), dtype=int)
        rng.shuffle(indices)
        if dev_fold < dev_folds - 1:
            dev_indices = indices[n_dev * dev_fold: n_dev * (dev_fold + 1)]
        else:
            dev_indices = indices[n_dev * dev_fold:]
        train_indices = list(set(indices) - set(dev_indices))
        dev_X = train_X[dev_indices, :]
        train_X = train_X[train_indices, :]
        n_train = len(train_indices)
    else:
        dev_X = None

    # initialize the background using the overall frequency of terms
    init_bg = get_init_bg(train_X)
    init_beta = None
    update_beta = True
    # if no_bg:
    #     if n_topics == 1:
    #         init_beta = init_bg.copy()
    #         init_beta = init_beta.reshape([1, len(vocab)])
    #         update_beta = False
    #     init_bg = np.zeros_like(init_bg)

    label_emb_dim = -1

    # create the network configuration
    network_architecture = make_network(dv, encoder_layers, embedding_dim, n_topics,
                                        encoder_shortcuts, label_type, n_labels,
                                        label_emb_dim, classifier_layers)

    print("Network architecture:")
    for key, val in network_architecture.items():
        print(key + ':', val)

    # load pretrained word vectors
    if word2vec_file is not None:
        vocab_size = len(vocab)
        vocab_dict = dict(zip(vocab, range(vocab_size)))
        embeddings = np.array(rng.rand(vocab_size, embedding_dim) * 0.25 - 0.5, dtype=np.float32)
        count = 0
        print("Loading word vectors")
        if word2vec_file[-3:] == 'bin':
            pretrained = gensim.models.KeyedVectors.load(word2vec_file)
        else:
            pretrained = gensim.models.KeyedVectors.load_word2vec_format(word2vec_file, binary=False)
        for word, index in vocab_dict.items():
            if word in pretrained:
                count += 1
                embeddings[index, :] = pretrained[word]
        print("Found word embeddings for %d words" % count)
        print('shape of word embeddings:', embeddings.shape)
    else:
        print("No embeddings for words!")
        exit()

    # load pretrained entity vectors
    # if kb2vec_file is not None:
    #     vocab_size = len(vocab)
    #     vocab_dict = dict(zip(vocab, range(vocab_size)))
    #     entity_embeddings = np.array(rng.rand(vocab_size, kb_dim) * 0.25 - 0.5, dtype=np.float32)
    #     count = 0
    #
    #     print("Loading entity vectors...")
    #     pretrained = None
    #     with open(kb2vec_file, 'rb') as f:
    #         pretrained = pickle.load(f)
    #     print('# of entities:', len(pretrained))
    #     vocab_counter = collections.Counter()
    #     vocab_counter.update(s for s in num if s in pretrained)
    #     print(vocab_counter.most_common(10))
    #     h = open('./output/topics.txt', 'r', encoding='utf-8')
    #     read_data = h.read()
    #     a = read_data.split()
    #     print('# of topics:', len(a))
    #     for word, index in vocab_dict.items():
    #         if word in pretrained and word in a:
    #             print(word)
    #         if word in pretrained:
    #         # elif word in pretrained and word not in a:
    #             count += 1
    #             entity_embeddings[index, :] = pretrained[word]
    #
    #     print("Found entity embeddings for %d words" % count)
    #     print('shape of entity embeddings:', entity_embeddings.shape)
    # else:
    #     print("No embeddings for knowledge entities!")
    #     exit()

    tf.reset_default_graph()

    # create the model
    model = VaeTm(network_architecture, alpha=alpha,
                  learning_rate=learning_rate,
                  batch_size=batch_size,
                  # init_embeddings=embeddings,
                  # entity_embeddings=entity_embeddings,
                  init_bg=init_bg,
                  update_background=update_background, init_beta=init_beta,
                  update_beta=update_beta, threads=threads,
                  regularize=auto_regularize, optimizer=optimizer,
                  adam_beta1=adam_beta1, seed=seed)

    # train the model
    print("Optimizing full model")
    model = train(model, network_architecture, train_X, train_labels, vocab,
                  regularize=auto_regularize, training_epochs=n_epochs,
                  batch_size=batch_size, rng=rng, bn_anneal=bn_anneal, X_dev=dev_X)

    # create output directory
    fh.makedirs(output_dir)

    # print background
    bg = model.get_bg()
    if not no_bg:
        print_top_bg(bg, vocab)

    # print topics
    emb = model.get_weights()
    print("Topics:")
    maw, sparsity, topics = print_top_words(emb, vocab)
    print("sparsity in topics = %0.4f" % sparsity)
    save_weights(output_dir, emb, bg, vocab, sparsity_threshold=1e-5)

    fh.write_list_to_text(['{:.4f}'.format(maw)], os.path.join(output_dir, 'maw.txt'))
    fh.write_list_to_text(['{:.4f}'.format(sparsity)], os.path.join(output_dir, 'sparsity.txt'))

    # print('Predicting training representations...')
    # reps, preds = model.predict(train_X)
    # # print('rep-0:', reps[0])
    # # print('rep-0:', reps[1])
    # fh.write_matrix_to_text(reps, os.path.join(output_dir, 'train_representation.txt'))
    # if test_X is not None:
    #     print('Predicting testing representations...')
    #     reps, preds = model.predict(test_X)
    #     # print('rep-0:', reps[0])
    #     # print('rep-0:', reps[1])
    #     fh.write_matrix_to_text(reps, os.path.join(output_dir, 'test_representation.txt'))

    # evaluate perplexity on dev and test data
    if dev_X is not None:
        perplexity = evaluate_perplexity(model, dev_X, eta_bn_prop=0.0)
        print("Dev perplexity = %0.4f" % perplexity)
        fh.write_list_to_text([str(perplexity)], os.path.join(output_dir, 'perplexity.dev.txt'))

    if test_X is not None:
        perplexity = evaluate_perplexity(model, test_X, test_labels, eta_bn_prop=0.0)
        print("Test perplexity = %0.4f" % perplexity)
        fh.write_list_to_text([str(perplexity)], os.path.join(output_dir, 'perplexity.test.txt'))

    # evaluate accuracy on labels
    if n_labels > 0:
        print("Predicting labels")
        predict_labels_and_evaluate(model, train_X, train_labels, None, output_dir, subset='train')
        if dev_X is not None:
            predict_labels_and_evaluate(model, dev_X, dev_labels, None, output_dir, subset='dev')
        if test_X is not None:
            predict_labels_and_evaluate(model, test_X, test_labels, None, output_dir, subset='test')

    # save document representations
    theta = model.compute_theta(train_X, train_labels)
    np.savez(os.path.join(output_dir, 'theta.train.npz'), theta=theta)

    compute_npmi_at_n(topics, vocab, train_X)
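# Entry-point guard (assumed; it is not visible in the flattened source at this point,
# but it matches the OptionParser-style main() above and lets the script be run directly).
if __name__ == '__main__':
    main()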