Example #1
def main():
    usage = "%prog tagged.jsonlist output_file.txt"
    parser = OptionParser(usage=usage)
    #parser.add_option('--keyword', dest='key', default=None,
    #                  help='Keyword argument: default=%default')
    #parser.add_option('--boolarg', action="store_true", dest="boolarg", default=False,
    #                  help='Keyword argument: default=%default')

    (options, args) = parser.parse_args()
    infile = args[0]
    outfile = args[1]

    outlines = []
    lines = fh.read_jsonlist(infile)
    n_lines = len(lines)
    order = np.arange(n_lines)
    np.random.shuffle(order)
    print(order[:10])

    for i in order:
        line = lines[i]
        text = line['text_tagged']
        text = re.sub('_', '-', text)
        text = re.sub(r'\d', '#', text)
        text = text.lower()
        outlines.append(text)

    fh.write_list_to_text(outlines, outfile)
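
Note: these examples depend on a project-local file-handling module (imported as fh) that is not shown here. Below is a minimal sketch of the two helpers used above, assuming read_jsonlist parses one JSON object per line and write_list_to_text writes one string per line; these are hypothetical stand-ins, not the original implementation.

# Hypothetical stand-ins for the fh helpers used above (assumed behavior).
import codecs
import json

def read_jsonlist(input_filename):
    # Read a JSON-lines file: one JSON object per line.
    items = []
    with codecs.open(input_filename, 'r', encoding='utf-8') as input_file:
        for line in input_file:
            line = line.strip()
            if line:
                items.append(json.loads(line))
    return items

def write_list_to_text(lines, output_filename):
    # Write one string per line to a UTF-8 text file.
    with codecs.open(output_filename, 'w', encoding='utf-8') as output_file:
        output_file.write('\n'.join(lines) + '\n')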
Example #2
def predict_labels_and_evaluate(model,
                                X,
                                Y,
                                PC,
                                TC,
                                output_dir=None,
                                subset='train',
                                batch_size=200):
    # Predict labels for all instances using the classifier network and evaluate the accuracy
    pred_probs = predict_label_probs(model,
                                     X,
                                     PC,
                                     TC,
                                     batch_size,
                                     eta_bn_prop=0.0)
    np.savez(os.path.join(output_dir, 'pred_probs.' + subset + '.npz'),
             pred_probs=pred_probs)
    predictions = np.argmax(pred_probs, axis=1)
    accuracy = float(
        np.sum(predictions == np.argmax(Y, axis=1)) / float(len(Y)))
    print(subset, "accuracy on labels = %0.4f" % accuracy)
    if output_dir is not None:
        fh.write_list_to_text([str(accuracy)],
                              os.path.join(output_dir,
                                           'accuracy.' + subset + '.txt'))
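
The function above writes the per-instance label probabilities and the accuracy to disk; they can be read back with standard numpy and file I/O. The snippet below assumes the same output_dir and subset='train'; the directory name is illustrative.

import os
import numpy as np

output_dir = 'output'  # hypothetical directory passed to the function above
pred_probs = np.load(os.path.join(output_dir, 'pred_probs.train.npz'))['pred_probs']
predictions = np.argmax(pred_probs, axis=1)
with open(os.path.join(output_dir, 'accuracy.train.txt')) as f:
    accuracy = float(f.read().strip())
print(predictions[:10], "accuracy =", accuracy)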
Example #3
def print_and_save_weights(options, model, vocab, prior_covar_names=None, topic_covar_names=None):

    # print background
    bg = model.get_bg()
    if not options.no_bg:
        print_top_bg(bg, vocab)

    # print topics
    emb = model.get_weights()
    print("Topics:")
    maw, sparsity = print_top_words(emb, vocab)
    print("sparsity in topics = %0.4f" % sparsity)
    save_weights(options.output_dir, emb, bg, vocab, sparsity_threshold=1e-5)

    fh.write_list_to_text(['{:.4f}'.format(maw)], os.path.join(options.output_dir, 'maw.txt'))
    fh.write_list_to_text(['{:.4f}'.format(sparsity)], os.path.join(options.output_dir, 'sparsity.txt'))

    if prior_covar_names is not None:
        prior_weights = model.get_prior_weights()
        print("Topic prior associations:")
        print("Covariates:", ' '.join(prior_covar_names))
        for k in range(options.n_topics):
            output = str(k) + ': '
            for c in range(len(prior_covar_names)):
                output += '%.4f ' % prior_weights[c, k]
            print(output)
        if options.output_dir is not None:
            np.savez(os.path.join(options.output_dir, 'prior_w.npz'), weights=prior_weights, names=prior_covar_names)

    if topic_covar_names is not None:
        beta_c = model.get_covar_weights()
        print("Covariate deviations:")
        maw, sparsity = print_top_words(beta_c, vocab, topic_covar_names)
        print("sparsity in covariates = %0.4f" % sparsity)
        if options.output_dir is not None:
            np.savez(os.path.join(options.output_dir, 'beta_c.npz'), beta=beta_c, names=topic_covar_names)

        if options.interactions:
            print("Covariate interactions")
            beta_ci = model.get_covar_interaction_weights()
            print(beta_ci.shape)
            if topic_covar_names is not None:
                names = [str(k) + ':' + c for k in range(options.n_topics) for c in topic_covar_names]
            else:
                names = None
            maw, sparsity = print_top_words(beta_ci, vocab, names)
            if options.output_dir is not None:
                np.savez(os.path.join(options.output_dir, 'beta_ci.npz'), beta=beta_ci, names=names)
            print("sparsity in covariate interactions = %0.4f" % sparsity)
Example #4
def save_weights(output_dir, beta, bg, feature_names, sparsity_threshold=1e-5):
    np.savez(os.path.join(output_dir, 'beta.npz'), beta=beta)
    if bg is not None:
        np.savez(os.path.join(output_dir, 'bg.npz'), bg=bg)
    fh.write_to_json(feature_names, os.path.join(output_dir, 'vocab.json'), sort_keys=False)

    topics_file = os.path.join(output_dir, 'topics.txt')
    lines = []
    for i in range(len(beta)):
        order = list(np.argsort(beta[i]))
        order.reverse()
        pos_words = [feature_names[j] for j in order[:100] if beta[i][j] > sparsity_threshold]
        output = ' '.join(pos_words)
        lines.append(output)

    fh.write_list_to_text(lines, topics_file)
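
The topics.txt file written above contains one topic per line, with the top words (up to 100 above the sparsity threshold) separated by spaces. A quick way to read it back for inspection; the directory is an assumption based on the code above.

import os

output_dir = 'output'  # hypothetical; whatever was passed to save_weights
with open(os.path.join(output_dir, 'topics.txt')) as f:
    topics = [line.strip().split() for line in f if line.strip()]
for k, words in enumerate(topics):
    print(k, ' '.join(words[:10]))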
Example #5
def predict_labels_and_evaluate(model,
                                X,
                                Y,
                                C,
                                output_dir=None,
                                subset='train'):
    """
    Predict labels for all instances using the classifier network and evaluate the accuracy
    """
    predictions = predict_labels(model, X, C)
    accuracy = float(
        np.sum(predictions == np.argmax(Y, axis=1)) / float(len(Y)))
    print(subset, "accuracy on labels = %0.4f" % accuracy)
    if output_dir is not None:
        fh.write_list_to_text([str(accuracy)],
                              os.path.join(output_dir,
                                           'accuracy.' + subset + '.txt'))
Example #6
def compute_npmi(topics_file,
                 ref_vocab,
                 ref_counts,
                 n_vals,
                 cols_to_skip=0,
                 output_file=None):
    print("Loading topics")
    topics = fh.read_text(topics_file)

    mean_vals = []
    for n in n_vals:
        mean_npmi = compute_npmi_at_n(topics,
                                      ref_vocab,
                                      ref_counts,
                                      n,
                                      cols_to_skip=cols_to_skip)
        mean_vals.append(mean_npmi)

    if output_file is not None:
        lines = [str(n) + ' ' + str(v) for n, v in zip(n_vals, mean_vals)]
        fh.write_list_to_text(lines, output_file)
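
When output_file is given, each line written above has the form "<n> <mean NPMI at n>". A small parser for that format; the filename below is illustrative.

npmi_at_n = {}
with open('npmi.txt') as f:  # hypothetical output_file passed to compute_npmi
    for line in f:
        n_str, val_str = line.split()
        npmi_at_n[int(n_str)] = float(val_str)
print(npmi_at_n)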
Example #7
def preprocess_data(train_infile, output_dir, vocab_size, label_type, test_prop, use_mallet_stopwords=False, replace_num=False, group_size=1, only_alpha=False, min_length=3):

    print("Loading SpaCy")
    parser = English()

    with codecs.open(train_infile, 'r', encoding='utf-8') as f:
        lines = f.readlines()
    n_items = len(lines)
    n_test = int(test_prop * n_items)
    n_train = n_items - n_test
    train_indices = np.random.choice(range(n_items), n_train, replace=False)
    test_indices = list(set(range(n_items)) - set(train_indices))

    train_X, train_vocab, train_indices, train_y, label_list, word_freqs, train_dat, train_mallet_strings, train_sage_output, train_svm_strings, label_index = load_and_process_data(train_infile, vocab_size, parser, label_type, train_indices, use_mallet_stopwords=use_mallet_stopwords, replace_num=replace_num, group_size=group_size, only_alpha=only_alpha, min_length=min_length)
    test_X, _, test_indices, test_y, _, _, test_dat, test_mallet_strings, test_sage_output, test_svm_strings, _ = load_and_process_data(train_infile, vocab_size, parser, label_type, test_indices, vocab=train_vocab, label_list=label_list, label_index=label_index, use_mallet_stopwords=use_mallet_stopwords, replace_num=replace_num, group_size=group_size, only_alpha=only_alpha, min_length=min_length)
    fh.save_sparse(train_X, os.path.join(output_dir, 'train.npz'))
    fh.write_to_json(train_vocab, os.path.join(output_dir, 'train.vocab.json'))
    fh.write_to_json(train_indices, os.path.join(output_dir, 'train.indices.json'))
    fh.save_sparse(train_y, os.path.join(output_dir, 'train.labels.npz'))
    fh.save_sparse(test_X, os.path.join(output_dir, 'test.npz'))
    fh.write_to_json(test_indices, os.path.join(output_dir, 'test.indices.json'))
    fh.save_sparse(test_y, os.path.join(output_dir, 'test.labels.npz'))
    fh.write_to_json(list(word_freqs.tolist()), os.path.join(output_dir, 'train.word_freq.json'))
    fh.write_list_to_text(train_dat, os.path.join(output_dir, 'train.dat'))
    n_labels = len(label_list)
    label_dict = dict(zip(range(n_labels), label_list))
    fh.write_to_json(label_dict, os.path.join(output_dir, 'train.label_list.json'))

    fh.write_list_to_text(train_mallet_strings, os.path.join(output_dir, 'train.mallet.txt'))
    fh.write_list_to_text(test_mallet_strings, os.path.join(output_dir, 'test.mallet.txt'))

    train_sage_output['te_data'] = test_sage_output['tr_data']
    train_sage_output['te_aspect'] = test_sage_output['tr_aspect']
    savemat(os.path.join(output_dir, 'sage.mat'), train_sage_output)

    fh.write_list_to_text(train_svm_strings, os.path.join(output_dir, 'train.svm.txt'))
    fh.write_list_to_text(test_svm_strings, os.path.join(output_dir, 'test.svm.txt'))
Example #8
def process_subset(items, parsed, label_fields, label_lists, vocab, output_dir, output_prefix):
    n_items = len(items)
    vocab_size = len(vocab)
    vocab_index = dict(zip(vocab, range(vocab_size)))

    ids = []
    for i, item in enumerate(items):
        if 'id' in item:
            ids.append(item['id'])
    if len(ids) != n_items:
        ids = [str(i) for i in range(n_items)]

    # create a label index using string representations
    for label_field in label_fields:
        label_list = label_lists[label_field]
        n_labels = len(label_list)
        label_list_strings = [str(label) for label in label_list]
        label_index = dict(zip(label_list_strings, range(n_labels)))

        # convert labels to a data frame
        if n_labels > 0:
            label_matrix = np.zeros([n_items, n_labels], dtype=int)
            label_vector = np.zeros(n_items, dtype=int)

            for i, item in enumerate(items):
                label = item[label_field]
                label_matrix[i, label_index[str(label)]] = 1
                label_vector[i] = label_index[str(label)]

            labels_df = pd.DataFrame(label_matrix, index=ids, columns=label_list_strings)
            labels_df.to_csv(os.path.join(output_dir, output_prefix + '.' + label_field + '.csv'))
            label_vector_df = pd.DataFrame(label_vector, index=ids, columns=[label_field])
            if n_labels == 2:
                label_vector_df.to_csv(os.path.join(output_dir, output_prefix + '.' + label_field + '_vector.csv'))

    rows = []
    cols = []
    vals = []

    dat_strings = []
    dat_labels = []
    mallet_strings = []
    fast_text_lines = []

    counter = Counter()
    word_counter = Counter()
    doc_lines = []
    print("Converting to count representations")
    for i, words in enumerate(parsed):
        # get the vocab indices of words that are in the vocabulary
        indices = [vocab_index[word] for word in words if word in vocab_index]
        word_subset = [word for word in words if word in vocab_index]

        counter.clear()
        counter.update(indices)
        word_counter.clear()
        word_counter.update(word_subset)

        if len(counter.keys()) > 0:
            # update the counts
            mallet_strings.append(str(i) + '\t' + 'en' + '\t' + ' '.join(word_subset))

            dat_string = str(int(len(counter))) + ' '
            dat_string += ' '.join([str(k) + ':' + str(int(v)) for k, v in zip(list(counter.keys()), list(counter.values()))])
            dat_strings.append(dat_string)

            # for dat format, assume just one label is given
            if len(label_fields) > 0:
                label = items[i][label_fields[-1]]
                dat_labels.append(str(label_index[str(label)]))

            values = list(counter.values())
            rows.extend([i] * len(counter))
            token_indices = sorted(counter.keys())
            cols.extend(list(token_indices))
            vals.extend([counter[k] for k in token_indices])

    # convert to a sparse representation
    sparse_X = sparse.coo_matrix((vals, (rows, cols)), shape=(n_items, vocab_size)).tocsr()
    fh.save_sparse(sparse_X, os.path.join(output_dir, output_prefix + '.npz'))

    print("Size of {:s} document-term matrix:".format(output_prefix), sparse_X.shape)

    fh.write_to_json(ids, os.path.join(output_dir, output_prefix + '.ids.json'))

    # save output for Mallet
    fh.write_list_to_text(mallet_strings, os.path.join(output_dir, output_prefix + '.mallet.txt'))

    # save output for David Blei's LDA/SLDA code
    fh.write_list_to_text(dat_strings, os.path.join(output_dir, output_prefix + '.data.dat'))
    if len(dat_labels) > 0:
        fh.write_list_to_text(dat_labels, os.path.join(output_dir, output_prefix + '.' + label_field + '.dat'))

    # save output for Jacob Eisenstein's SAGE code:
    #sparse_X_sage = sparse.csr_matrix(X, dtype=float)
    vocab_for_sage = np.zeros((vocab_size,), dtype=object)
    vocab_for_sage[:] = vocab

    # for SAGE, assume only a single label has been given
    if len(label_fields) > 0:
        # convert array to vector of labels for SAGE
        sage_aspect = np.argmax(np.array(labels_df.values, dtype=float), axis=1) + 1
    else:
        sage_aspect = np.ones([n_items, 1], dtype=float)
    sage_no_aspect = np.ones([n_items, 1], dtype=float)
    widx = np.arange(vocab_size, dtype=float) + 1

    return sparse_X, sage_aspect, sage_no_aspect, widx, vocab_for_sage
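
Several of these examples store document-term matrices with fh.save_sparse. A plausible sketch of that helper and its counterpart follows, assuming they simply wrap scipy's sparse .npz routines; this is an assumption, as the original helper is not shown.

from scipy import sparse

def save_sparse(sparse_matrix, output_filename):
    # Store a scipy sparse matrix in compressed .npz format.
    sparse.save_npz(output_filename, sparse_matrix.tocsr())

def load_sparse(input_filename):
    # Load a matrix previously written by save_sparse.
    return sparse.load_npz(input_filename)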
Example #9
def preprocess_data(train_infile, test_infile, output_dir, vocab_size, use_mallet_stopwords=False, replace_num=False, lemmatize=False, log_transform=False, keep_nonalphanum=False, only_alpha=False, min_length=1):

    print("Loading SpaCy")
    parser = English()
    train_X, train_vocab, train_indices, train_y, label_list, word_freqs, train_dat, train_mallet_strings, train_sage_output, train_svm_strings = load_and_process_data(train_infile, vocab_size, parser, use_mallet_stopwords=use_mallet_stopwords, replace_num=replace_num, lemmatize=lemmatize, log_transform=log_transform, keep_nonalphanum=keep_nonalphanum, only_alpha=only_alpha, min_length=min_length)
    test_X, _, test_indices, test_y, _, _, test_dat, test_mallet_strings, test_sage_output, test_svm_strings = load_and_process_data(test_infile, vocab_size, parser, vocab=train_vocab, label_list=label_list, use_mallet_stopwords=use_mallet_stopwords, replace_num=replace_num, lemmatize=lemmatize, log_transform=log_transform, keep_nonalphanum=keep_nonalphanum, only_alpha=only_alpha, min_length=min_length)
    fh.save_sparse(train_X, os.path.join(output_dir, 'train.npz'))
    fh.write_to_json(train_vocab, os.path.join(output_dir, 'train.vocab.json'))
    fh.write_to_json(train_indices, os.path.join(output_dir, 'train.indices.json'))
    fh.save_sparse(train_y, os.path.join(output_dir, 'train.labels.npz'))
    fh.save_sparse(test_X, os.path.join(output_dir, 'test.npz'))
    fh.write_to_json(test_indices, os.path.join(output_dir, 'test.indices.json'))
    fh.save_sparse(test_y, os.path.join(output_dir, 'test.labels.npz'))
    n_labels = len(label_list)
    label_dict = dict(zip(range(n_labels), label_list))
    fh.write_to_json(label_dict, os.path.join(output_dir, 'train.label_list.json'))
    fh.write_to_json(list(word_freqs.tolist()), os.path.join(output_dir, 'train.word_freq.json'))

    # save output for David Blei's lda-c code
    fh.write_list_to_text(train_dat, os.path.join(output_dir, 'train.dat'))
    fh.write_list_to_text(test_dat, os.path.join(output_dir, 'test.dat'))

    # save output for Mallet
    fh.write_list_to_text(train_mallet_strings, os.path.join(output_dir, 'train.mallet.txt'))
    fh.write_list_to_text(test_mallet_strings, os.path.join(output_dir, 'test.mallet.txt'))

    # save output for Jacob Eisenstein's SAGE code:
    train_sage_output['te_data'] = test_sage_output['tr_data']
    train_sage_output['te_aspect'] = test_sage_output['tr_aspect']
    savemat(os.path.join(output_dir, 'sage.mat'), train_sage_output)

    # save output in SVM format
    fh.write_list_to_text(train_svm_strings, os.path.join(output_dir, 'train.svm.txt'))
    fh.write_list_to_text(test_svm_strings, os.path.join(output_dir, 'test.svm.txt'))
Example #10
def predict_labels_and_evaluate(model, network_architecture, X, Y, C, output_dir=None, subset='train'):
    predictions = predict_labels(model, network_architecture, X, C)
    accuracy = float(np.sum(predictions == np.argmax(Y, axis=1)) / float(len(Y)))
    print(subset, "accuracy on labels = %0.4f" % accuracy)
    if output_dir is not None:
        fh.write_list_to_text([str(accuracy)], os.path.join(output_dir, 'accuracy.' + subset + '.txt'))
Example #11
def main():
    usage = "%prog input_dir train_prefix"
    parser = OptionParser(usage=usage)
    parser.add_option('-a', dest='alpha', default=1.0,
                      help='Hyperparameter for logistic normal prior: default=%default')
    parser.add_option('-k', dest='n_topics', default=20,
                      help='Size of latent representation (~num topics): default=%default')
    parser.add_option('-b', dest='batch_size', default=200,
                      help='Size of minibatches: default=%default')
    parser.add_option('-l', dest='learning_rate', default=0.002,
                      help='Initial learning rate: default=%default')
    parser.add_option('-m', dest='momentum', default=0.99,
                      help='beta1 for Adam: default=%default')
    parser.add_option('-e', dest='epochs', default=250,
                      help='Number of epochs: default=%default')
    parser.add_option('--en_layers', dest='encoder_layers', default=1,
                      help='Number of encoder layers [0|1|2]: default=%default')
    parser.add_option('--emb_dim', dest='embedding_dim', default=300,
                      help='Dimension of input embeddings: default=%default')
    parser.add_option('--en_short', action="store_true", dest="encoder_shortcuts", default=False,
                      help='Use shortcut connections on encoder: default=%default')
    parser.add_option('--labels', dest='label_name', default=None,
                      help='Read labels from input_dir/[train|test]_prefix.label_name.csv: default=%default')
    parser.add_option('--covars', dest='covar_names', default=None,
                      help='Read covars from files with these names (comma-separated): default=%default')
    parser.add_option('--label_emb_dim', dest='label_emb_dim', default=0,
                      help='Class embedding dimension [0 = identity]: default=%default')
    parser.add_option('--covar_emb_dim', dest='covar_emb_dim', default=0,
                      help='Covariate embedding dimension [0 = identity]: default=%default')
    parser.add_option('--min_covar_count', dest='min_covar_count', default=None,
                      help='Drop binary covariates that occur fewer than this many times in training: default=%default')
    parser.add_option('--covar_inter', action="store_true", dest="covar_interactions", default=False,
                      help='Use covariate interactions in model: default=%default')
    parser.add_option('--c_layers', dest='classifier_layers', default=1,
                      help='Number of layers in (generative) classifier [0|1|2]: default=%default')
    parser.add_option('--exclude_covars', action="store_true", dest="exclude_covars", default=False,
                      help='Exclude covariates from the classifier: default=%default')
    parser.add_option('-r', action="store_true", dest="regularize", default=False,
                      help='Apply adaptive regularization for sparsity in topics: default=%default')
    parser.add_option('-t', dest='test_prefix', default=None,
                      help='Prefix of test set: default=%default')
    parser.add_option('-f', dest='final_evaluate', default=None,
                      help='Perform final evaluation on test set: default=%default')
    parser.add_option('-d', dest='dev_prefix', default=None,
                      help='Prefix of dev set: default=%default')
    parser.add_option('-o', dest='output_dir', default='output',
                      help='Output directory: default=%default')
    parser.add_option('--w2v', dest='word2vec_file', default=None,
                      help='Use this word2vec .bin file to initialize and fix embeddings: default=%default')
    parser.add_option('--vocab_size', dest='vocab_size', default=None,
                      help='Filter the vocabulary keeping the most common n words: default=%default')
    parser.add_option('--update_bg', action="store_true", dest="update_bg", default=False,
                      help='Update background parameters: default=%default')
    parser.add_option('--no_bg', action="store_true", dest="no_bg", default=False,
                      help='Do not use background freq: default=%default')
    parser.add_option('--no_bn_anneal', action="store_true", dest="no_bn_anneal", default=False,
                      help='Do not anneal away from batchnorm: default=%default')
    parser.add_option('--dev_folds', dest='dev_folds', default=0,
                      help='Number of dev folds: default=%default')
    parser.add_option('--dev_fold', dest='dev_fold', default=0,
                      help='Fold to use as dev (if dev_folds > 0): default=%default')
    parser.add_option('--opt', dest='optimizer', default='adam',
                      help='Optimization algorithm to use [adam|adagrad|sgd]: default=%default')
    parser.add_option('--threads', dest='threads', default=8,
                      help='Use this to limit the number of CPUs: default=%default')
    parser.add_option('--seed', dest='seed', default=None,
                      help='Random seed: default=%default')

    (options, args) = parser.parse_args()

    input_dir = args[0]
    train_prefix = args[1]

    alpha = float(options.alpha)
    n_topics = int(options.n_topics)
    batch_size = int(options.batch_size)
    learning_rate = float(options.learning_rate)
    adam_beta1 = float(options.momentum)
    n_epochs = int(options.epochs)
    encoder_layers = int(options.encoder_layers)
    embedding_dim = int(options.embedding_dim)
    encoder_shortcuts = options.encoder_shortcuts
    label_file_name = options.label_name
    covar_file_names = options.covar_names
    use_covar_interactions = options.covar_interactions
    label_emb_dim = int(options.label_emb_dim)
    covar_emb_dim = int(options.covar_emb_dim)
    min_covar_count = options.min_covar_count
    classifier_layers = int(options.classifier_layers)
    covars_in_classifier = not options.exclude_covars
    auto_regularize = options.regularize
    test_prefix = options.test_prefix
    dev_prefix = options.dev_prefix
    output_dir = options.output_dir
    word2vec_file = options.word2vec_file
    vocab_size = options.vocab_size
    update_background = options.update_bg
    no_bg = options.no_bg
    bn_anneal = not options.no_bn_anneal
    dev_folds = int(options.dev_folds)
    final_evaluate = options.final_evaluate
    dev_fold = int(options.dev_fold)
    optimizer = options.optimizer
    seed = options.seed
    threads = int(options.threads)
    if seed is not None:
        seed = int(seed)
        rng = np.random.RandomState(seed)
    else:
        rng = np.random.RandomState(np.random.randint(0, 100000))

    train_X, vocab, train_labels, label_names, na_label_index, label_type, train_covariates, covariate_names, covariates_type, col_sel = load_data(input_dir, train_prefix, label_file_name, covar_file_names, vocab_size=vocab_size)
    n_train, dv = train_X.shape

    if train_labels is not None:
        _, n_labels = train_labels.shape
        # convert binary labels to a single dimensional vector
        #if binary and n_classes == 2 and not generative:
        #    train_labels = np.argmax(train_labels, axis=1)
        #    train_labels = train_labels.reshape((n_train, 1))
        #    n_classes = 1
    else:
        n_labels = 0

    if train_covariates is not None:
        _, n_covariates = train_covariates.shape
        if min_covar_count is not None and int(min_covar_count) > 0:
            print("Removing rare covariates")
            covar_sums = train_covariates.sum(axis=0).reshape((n_covariates, ))
            covariate_selector = covar_sums > int(min_covar_count)
            train_covariates = train_covariates[:, covariate_selector]
            covariate_names = [name for i, name in enumerate(covariate_names) if covariate_selector[i]]
            n_covariates = len(covariate_names)

    else:
        n_covariates = 0

    if dev_prefix is not None:
        dev_X, _, dev_labels, _, _, _, dev_covariates, _, _, _ = load_data(input_dir, dev_prefix, label_file_name, covar_file_names, vocab=vocab, col_sel=col_sel)
        n_dev, _ = dev_X.shape
        if dev_labels is not None:
            _, n_labels_dev = dev_labels.shape
            assert n_labels_dev == n_labels
            #if binary and n_classes == 2 and not generative:
            #    test_labels = np.argmax(test_labels, axis=1)
            #    test_labels = test_labels.reshape((n_test, 1))
            #    n_classes = 1
        if dev_covariates is not None:
            if min_covar_count is not None and int(min_covar_count) > 0:
                dev_covariates = dev_covariates[:, covariate_selector]
            _, n_covariates_dev = dev_covariates.shape
            assert n_covariates_dev == n_covariates

    else:
        dev_X = None
        n_dev = 0
        dev_labels = None
        dev_covariates = None

    if test_prefix is not None and final_evaluate:
        test_X, _, test_labels, _, _, _, test_covariates, _, _, _ = load_data(input_dir, test_prefix, label_file_name, covar_file_names, vocab=vocab, col_sel=col_sel)
        n_test, _ = test_X.shape
        if test_labels is not None:
            _, n_labels_test = test_labels.shape
            assert n_labels_test == n_labels
            #if binary and n_classes == 2 and not generative:
            #    test_labels = np.argmax(test_labels, axis=1)
            #    test_labels = test_labels.reshape((n_test, 1))
            #    n_classes = 1
        if test_covariates is not None:
            if min_covar_count is not None and int(min_covar_count) > 0:
                test_covariates = test_covariates[:, covariate_selector]
            _, n_covariates_test = test_covariates.shape
            assert n_covariates_test == n_covariates
    else:
        test_X = None
        n_test = 0
        test_labels = None
        test_covariates = None

    is_labeled = pd.read_csv(os.path.join(input_dir, "train.is_labeled.csv"), names=['labeled']).labeled

    init_bg = get_init_bg(train_X)
    init_beta = None
    update_beta = True
    if no_bg:
        if n_topics == 1:
            init_beta = init_bg.copy()
            init_beta = init_beta.reshape([1, len(vocab)])
            update_beta = False
        init_bg = np.zeros_like(init_bg)


    network_architecture = make_network(dv, encoder_layers, embedding_dim,
                                        n_topics, encoder_shortcuts, label_type, n_labels, label_emb_dim,
                                        covariates_type, n_covariates, covar_emb_dim, use_covar_interactions,
                                        classifier_layers, covars_in_classifier)  # make_network()

    print("Network architecture:")
    for key, val in network_architecture.items():
        print(key + ':', val)

    # load pretrained word vectors
    if word2vec_file is not None:
        vocab_size = len(vocab)
        vocab_dict = dict(zip(vocab, range(vocab_size)))
        embeddings = np.array(rng.rand(vocab_size, 300) * 0.25 - 0.5, dtype=np.float32)
        count = 0
        print("Loading word vectors")
        pretrained = gensim.models.KeyedVectors.load_word2vec_format(word2vec_file, binary=False)

        for word, index in vocab_dict.items():
            if word in pretrained:
                count += 1
                embeddings[index, :] = pretrained[word]

        print("Found embeddings for %d words" % count)
        update_embeddings = False
    else:
        embeddings = None
        update_embeddings = True

    tf.reset_default_graph()

    model = Student(network_architecture, alpha=alpha, learning_rate=learning_rate, batch_size=batch_size, init_embeddings=embeddings, update_embeddings=update_embeddings, init_bg=init_bg, update_background=update_background, init_beta=init_beta, update_beta=update_beta, threads=threads, regularize=auto_regularize, optimizer=optimizer, adam_beta1=adam_beta1, seed=seed)

    # train full model
    print("Optimizing full model")
    model = train(model, network_architecture, train_X, train_labels, train_covariates, is_labeled=is_labeled, regularize=auto_regularize, training_epochs=n_epochs, batch_size=batch_size, rng=rng, X_dev=dev_X, Y_dev=dev_labels, C_dev=dev_covariates, bn_anneal=bn_anneal)

    fh.makedirs(output_dir)

    # print background
    bg = model.get_bg()
    if not no_bg:
        print_top_bg(bg, vocab)

    # print topics
    emb = model.get_weights()
    print("Topics:")
    maw, sparsity = print_top_words(emb, vocab)
    print("sparsity in topics = %0.4f" % sparsity)
    save_weights(output_dir, emb, bg, vocab, sparsity_threshold=1e-5)

    fh.write_list_to_text(['{:.4f}'.format(maw)], os.path.join(output_dir, 'maw.txt'))
    fh.write_list_to_text(['{:.4f}'.format(sparsity)], os.path.join(output_dir, 'sparsity.txt'))

    if n_covariates > 0:
        emb_c = model.get_covar_weights()
        print("Covariate deviations:")
        if covar_emb_dim == 0:
            maw, sparsity = print_top_words(emb_c, vocab, covariate_names, n_top_words=16)
        else:
            maw, sparsity = print_top_words(emb_c, vocab)
        print("sparsity in covariates = %0.4f" % sparsity)
        if use_covar_interactions:
            print("Covariate interactions")
            emb_ci = model.get_covar_inter_weights()
            print(emb_ci.shape)
            if covariate_names is not None:
                names = [str(k) + ':' + c for k in range(n_topics) for c in covariate_names]
            else:
                names = None
            maw, sparsity = print_top_words(emb_ci, vocab, names)
            print("sparsity in covariate interactions = %0.4f" % sparsity)
            print("Combined covariates and interactions:")

        if covar_emb_dim > 0:
            print_covariate_embeddings(model, covariate_names, output_dir)

    # Evaluate perplexity on dev and test data
    if dev_X is not None:
        perplexity = evaluate_perplexity(model, dev_X, dev_labels, dev_covariates, eta_bn_prop=0.0)
        print("Dev perplexity = %0.4f" % perplexity)
        fh.write_list_to_text([str(perplexity)], os.path.join(output_dir, 'perplexity.dev.txt'))

    if test_X is not None:
        if final_evaluate:
            perplexity = evaluate_perplexity(model, test_X, test_labels, test_covariates, eta_bn_prop=0.0)
            print("Test perplexity = %0.4f" % perplexity)
            fh.write_list_to_text([str(perplexity)], os.path.join(output_dir, 'perplexity.test.txt'))

    if n_covariates > 0 and covariates_type == 'categorical':
        print("Predicting categorical covariates")
        predictions = infer_categorical_covariate(model, network_architecture, train_X, train_labels)
        accuracy = float(np.sum(predictions == np.argmax(train_covariates, axis=1)) / float(len(train_covariates)))
        print("Train accuracy on covariates = %0.4f" % accuracy)

        if dev_X is not None:
            predictions = infer_categorical_covariate(model, network_architecture, dev_X, dev_labels)
            accuracy = float(np.sum(predictions == np.argmax(dev_covariates, axis=1)) / float(len(dev_covariates)))
            print("Dev accuracy on covariates = %0.4f" % accuracy)

        if test_X is not None:
            if final_evaluate:
                predictions = infer_categorical_covariate(model, network_architecture, test_X, test_labels)
                accuracy = float(np.sum(predictions == np.argmax(test_covariates, axis=1)) / float(len(test_covariates)))
                print("Test accuracy on covariates = %0.4f" % accuracy)

    if n_labels > 0:
        print("Predicting labels")
        predict_labels_and_evaluate(model, network_architecture, train_X, train_labels, train_covariates, output_dir, subset='train')

        if dev_X is not None:
            predict_labels_and_evaluate(model, network_architecture, dev_X, dev_labels, dev_covariates, output_dir, subset='dev')

        if test_X is not None:
            if final_evaluate:
                predict_labels_and_evaluate(model, network_architecture, test_X, test_labels, test_covariates, output_dir, subset='test')

    # Print associations between topics and labels
    if n_labels > 0 and n_labels < 7:
        print("Label probabilities based on topics")
        print("Labels:", ' '.join([name for name in label_names]))
        for k in range(n_topics):
            Z = np.zeros([1, n_topics]).astype('float32')
            Z[0, k] = 1.0
            if n_covariates > 0:
                C = np.zeros([1, n_covariates]).astype('float32')
            else:
                C = None
            probs = model.predict_from_topics(Z, C)
            output = str(k) + ': '
            for i in range(n_labels):
                output += '%.4f ' % probs[0, i]
            print(output)

        if n_covariates > 0:
            all_probs = np.zeros([n_covariates, n_topics])
            for k in range(n_topics):
                Z = np.zeros([1, n_topics]).astype('float32')
                Z[0, k] = 1.0
                Y = None
                for c in range(n_covariates):
                    C = np.zeros([1, n_covariates]).astype('float32')
                    C[0, c] = 1.0
                    probs = model.predict_from_topics(Z, C)
                    all_probs[c, k] = probs[0, 0]
            np.savez(os.path.join(output_dir, 'covar_topic_probs.npz'), probs=all_probs)

    # save document representations
    theta = model.compute_theta(train_X, train_labels, train_covariates)
    np.savez(os.path.join(output_dir, 'train.theta.npz'), theta=theta)

    if dev_X is not None:
        if dev_labels is None:
            dev_Y = None
        else:
            dev_Y = np.zeros_like(dev_labels)
        theta = model.compute_theta(dev_X, dev_Y, dev_covariates)
        np.savez(os.path.join(output_dir, 'dev.theta.npz'), theta=theta)

    if n_test > 0:
        if final_evaluate:
            if test_labels is None:
                test_Y = None
            else:
                test_Y = np.zeros_like(test_labels)
            theta = model.compute_theta(test_X, test_Y, test_covariates)
            np.savez(os.path.join(output_dir, 'test.theta.npz'), theta=theta)
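
Example #11 ends by saving the inferred document-topic proportions as .npz files; reading them back is a one-liner with numpy. The directory below is illustrative; use whatever output_dir was passed on the command line.

import os
import numpy as np

theta = np.load(os.path.join('output', 'train.theta.npz'))['theta']
print(theta.shape)        # (n_train_documents, n_topics)
print(theta[0].round(3))  # topic proportions for the first training document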
Example #12
def main():
    usage = "%prog input_dir train_prefix"
    parser = OptionParser(usage=usage)
    parser.add_option(
        '-a',
        dest='alpha',
        default=1.0,
        help='Hyperparameter for logistic normal prior: default=%default')
    parser.add_option(
        '-k',
        dest='n_topics',
        default=20,
        help='Size of latent representation (~num topics): default=%default')
    parser.add_option('-b',
                      dest='batch_size',
                      default=200,
                      help='Size of minibatches: default=%default')
    parser.add_option('-l',
                      dest='learning_rate',
                      default=0.002,
                      help='Initial learning rate: default=%default')
    parser.add_option('-m',
                      dest='momentum',
                      default=0.99,
                      help='beta1 for Adam: default=%default')
    parser.add_option('-e',
                      dest='epochs',
                      default=200,
                      help='Number of epochs: default=%default')
    parser.add_option('--emb_dim',
                      dest='embedding_dim',
                      default=300,
                      help='Dimension of input embeddings: default=%default')
    parser.add_option(
        '--labels',
        dest='label_name',
        default=None,
        help=
        'Read labels from input_dir/[train|test]_prefix.label_name.csv: default=%default'
    )
    parser.add_option(
        '--covars',
        dest='covar_names',
        default=None,
        help=
        'Read covars from files with these names (comma-separated): default=%default'
    )
    parser.add_option(
        '--label_emb_dim',
        dest='label_emb_dim',
        default=-1,
        help='Class embedding dimension [0 = identity]: default=%default')
    parser.add_option(
        '--covar_emb_dim',
        dest='covar_emb_dim',
        default=-1,
        help='Covariate embedding dimension [0 = identity]: default=%default')
    parser.add_option(
        '--min_covar_count',
        dest='min_covar_count',
        default=None,
        help=
        'Drop binary covariates that occur fewer than this many times in training: default=%default'
    )
    parser.add_option(
        '--c_layers',
        dest='classifier_layers',
        default=1,
        help=
        'Number of layers in (generative) classifier [0|1|2]: default=%default'
    )
    parser.add_option('-t',
                      dest='test_prefix',
                      default=None,
                      help='Prefix of test set: default=%default')
    parser.add_option('-o',
                      dest='output_dir',
                      default='output',
                      help='Output directory: default=%default')
    parser.add_option(
        '--w2v',
        dest='word2vec_file',
        default=None,
        help=
        'Use this word2vec .bin file to initialize and fix embeddings: default=%default'
    )
    parser.add_option('--update_bg',
                      action="store_true",
                      dest="update_bg",
                      default=False,
                      help='Update background parameters: default=%default')
    parser.add_option('--no_bg',
                      action="store_true",
                      dest="no_bg",
                      default=False,
                      help='Do not use background freq: default=%default')
    parser.add_option(
        '--no_bn_anneal',
        action="store_true",
        dest="no_bn_anneal",
        default=False,
        help='Do not anneal away from batchnorm: default=%default')
    parser.add_option(
        '--test_samples',
        dest='test_samples',
        default=20,
        help=
        'Number of samples to use in computing test perplexity: default=%default'
    )
    parser.add_option('--dev_folds',
                      dest='dev_folds',
                      default=0,
                      help='Number of dev folds: default=%default')
    parser.add_option(
        '--dev_fold',
        dest='dev_fold',
        default=0,
        help='Fold to use as dev (if dev_folds > 0): default=%default')

    (options, args) = parser.parse_args()

    input_dir = args[0]
    train_prefix = args[1]

    alpha = float(options.alpha)
    n_topics = int(options.n_topics)
    batch_size = int(options.batch_size)
    learning_rate = float(options.learning_rate)
    adam_beta1 = float(options.momentum)
    n_epochs = int(options.epochs)
    embedding_dim = int(options.embedding_dim)
    label_file_name = options.label_name
    covar_file_names = options.covar_names
    label_emb_dim = int(options.label_emb_dim)
    covar_emb_dim = int(options.covar_emb_dim)
    min_covar_count = options.min_covar_count
    classifier_layers = int(options.classifier_layers)
    test_prefix = options.test_prefix
    output_dir = options.output_dir
    word2vec_file = options.word2vec_file
    update_background = options.update_bg
    no_bg = options.no_bg
    bn_anneal = not options.no_bn_anneal
    test_samples = int(options.test_samples)
    dev_folds = int(options.dev_folds)
    dev_fold = int(options.dev_fold)
    rng = np.random.RandomState(np.random.randint(0, 100000))

    # load the training data
    train_X, vocab, train_labels, label_names, label_type, train_covariates, covariate_names, covariates_type = load_data(
        input_dir, train_prefix, label_file_name, covar_file_names)
    n_train, dv = train_X.shape

    if train_labels is not None:
        _, n_labels = train_labels.shape
    else:
        n_labels = 0

    if train_covariates is not None:
        _, n_covariates = train_covariates.shape
        if min_covar_count is not None and int(min_covar_count) > 0:
            print("Removing rare covariates")
            covar_sums = train_covariates.sum(axis=0).reshape((n_covariates, ))
            covariate_selector = covar_sums > int(min_covar_count)
            train_covariates = train_covariates[:, covariate_selector]
            covariate_names = [
                name for i, name in enumerate(covariate_names)
                if covariate_selector[i]
            ]
            n_covariates = len(covariate_names)
    else:
        n_covariates = 0

    # split into train and dev
    if dev_folds > 0:
        n_dev = int(n_train / dev_folds)
        indices = np.array(range(n_train), dtype=int)
        rng.shuffle(indices)
        if dev_fold < dev_folds - 1:
            dev_indices = indices[n_dev * dev_fold:n_dev * (dev_fold + 1)]
        else:
            dev_indices = indices[n_dev * dev_fold:]
        train_indices = list(set(indices) - set(dev_indices))
        dev_X = train_X[dev_indices, :]
        train_X = train_X[train_indices, :]
        if train_labels is not None:
            dev_labels = train_labels[dev_indices, :]
            train_labels = train_labels[train_indices, :]
        else:
            dev_labels = None
        if train_covariates is not None:
            dev_covariates = train_covariates[dev_indices, :]
            train_covariates = train_covariates[train_indices, :]
        else:
            dev_covariates = None
        n_train = len(train_indices)
    else:
        dev_X = None
        dev_labels = None
        dev_covariates = None
        n_dev = 0

    # load the test data
    if test_prefix is not None:
        test_X, _, test_labels, _, _, test_covariates, _, _ = load_data(
            input_dir,
            test_prefix,
            label_file_name,
            covar_file_names,
            vocab=vocab)
        n_test, _ = test_X.shape
        if test_labels is not None:
            _, n_labels_test = test_labels.shape
            assert n_labels_test == n_labels
        if test_covariates is not None:
            if min_covar_count is not None and int(min_covar_count) > 0:
                test_covariates = test_covariates[:, covariate_selector]
            _, n_covariates_test = test_covariates.shape
            assert n_covariates_test == n_covariates

    else:
        test_X = None
        n_test = 0
        test_labels = None
        test_covariates = None

    # initialize the background using overall word frequencies
    init_bg = get_init_bg(train_X)
    if no_bg:
        init_bg = np.zeros_like(init_bg)

    # combine the network configuration parameters into a dictionary
    network_architecture = make_network(dv, embedding_dim, n_topics,
                                        label_type, n_labels, label_emb_dim,
                                        covariates_type, n_covariates,
                                        covar_emb_dim,
                                        classifier_layers)  # make_network()

    print("Network architecture:")
    for key, val in network_architecture.items():
        print(key + ':', val)

    # load pretrained word vectors
    if word2vec_file is not None:
        vocab_size = len(vocab)
        vocab_dict = dict(zip(vocab, range(vocab_size)))
        embeddings = np.array(rng.rand(vocab_size, 300) * 0.25 - 0.5,
                              dtype=np.float32)
        count = 0
        print("Loading word vectors")
        pretrained = gensim.models.KeyedVectors.load_word2vec_format(
            word2vec_file, binary=True)

        for word, index in vocab_dict.items():
            if word in pretrained:
                count += 1
                embeddings[index, :] = pretrained[word]

        print("Found embeddings for %d words" % count)
        update_embeddings = False
    else:
        embeddings = None
        update_embeddings = True

    # create the model
    model = Scholar(network_architecture,
                    alpha=alpha,
                    learning_rate=learning_rate,
                    init_embeddings=embeddings,
                    update_embeddings=update_embeddings,
                    init_bg=init_bg,
                    update_background=update_background,
                    adam_beta1=adam_beta1)

    # train the model
    print("Optimizing full model")
    model = train(model,
                  network_architecture,
                  train_X,
                  train_labels,
                  train_covariates,
                  training_epochs=n_epochs,
                  batch_size=batch_size,
                  rng=rng,
                  X_dev=dev_X,
                  Y_dev=dev_labels,
                  C_dev=dev_covariates,
                  bn_anneal=bn_anneal)

    # make output directory
    fh.makedirs(output_dir)

    # print background
    bg = model.get_bg()
    if not no_bg:
        print_top_bg(bg, vocab)

    # print topics
    emb = model.get_weights()
    print("Topics:")
    maw, sparsity = print_top_words(emb, vocab)
    print("sparsity in topics = %0.4f" % sparsity)
    save_weights(output_dir, emb, bg, vocab, sparsity_threshold=1e-5)

    fh.write_list_to_text(['{:.4f}'.format(maw)],
                          os.path.join(output_dir, 'maw.txt'))
    fh.write_list_to_text(['{:.4f}'.format(sparsity)],
                          os.path.join(output_dir, 'sparsity.txt'))

    if n_covariates > 0:
        beta_c = model.get_covar_weights()
        print("Covariate deviations:")
        if covar_emb_dim == 0:
            maw, sparsity = print_top_words(beta_c, vocab, covariate_names)
        else:
            maw, sparsity = print_top_words(beta_c, vocab)
        print("sparsity in covariates = %0.4f" % sparsity)
        if output_dir is not None:
            np.savez(os.path.join(output_dir, 'beta_c.npz'),
                     beta=beta_c,
                     names=covariate_names)

    # Evaluate perplexity on dev and test data
    if dev_X is not None:
        perplexity = evaluate_perplexity(model,
                                         dev_X,
                                         dev_labels,
                                         dev_covariates,
                                         eta_bn_prop=0.0,
                                         n_samples=test_samples)
        print("Dev perplexity = %0.4f" % perplexity)
        fh.write_list_to_text([str(perplexity)],
                              os.path.join(output_dir, 'perplexity.dev.txt'))

    if test_X is not None:
        perplexity = evaluate_perplexity(model,
                                         test_X,
                                         test_labels,
                                         test_covariates,
                                         eta_bn_prop=0.0,
                                         n_samples=test_samples)
        print("Test perplexity = %0.4f" % perplexity)
        fh.write_list_to_text([str(perplexity)],
                              os.path.join(output_dir, 'perplexity.test.txt'))

    # evaluate accuracy on predicting categorical covariates
    if n_covariates > 0 and covariates_type == 'categorical':
        print("Predicting categorical covariates")
        predictions = infer_categorical_covariate(model, network_architecture,
                                                  train_X, train_labels)
        accuracy = float(
            np.sum(predictions == np.argmax(train_covariates, axis=1)) /
            float(len(train_covariates)))
        print("Train accuracy on covariates = %0.4f" % accuracy)
        if output_dir is not None:
            fh.write_list_to_text([str(accuracy)],
                                  os.path.join(output_dir,
                                               'accuracy.train.txt'))

        if dev_X is not None:
            predictions = infer_categorical_covariate(model,
                                                      network_architecture,
                                                      dev_X, dev_labels)
            accuracy = float(
                np.sum(predictions == np.argmax(dev_covariates, axis=1)) /
                float(len(dev_covariates)))
            print("Dev accuracy on covariates = %0.4f" % accuracy)
            if output_dir is not None:
                fh.write_list_to_text([str(accuracy)],
                                      os.path.join(output_dir,
                                                   'accuracy.dev.txt'))

        if test_X is not None:
            predictions = infer_categorical_covariate(model,
                                                      network_architecture,
                                                      test_X, test_labels)
            accuracy = float(
                np.sum(predictions == np.argmax(test_covariates, axis=1)) /
                float(len(test_covariates)))
            print("Test accuracy on covariates = %0.4f" % accuracy)
            if output_dir is not None:
                fh.write_list_to_text([str(accuracy)],
                                      os.path.join(output_dir,
                                                   'accuracy.test.txt'))

    # evaluate accuracy on predicting labels
    if n_labels > 0:
        print("Predicting labels")
        predict_labels_and_evaluate(model,
                                    train_X,
                                    train_labels,
                                    train_covariates,
                                    output_dir,
                                    subset='train')

        if dev_X is not None:
            predict_labels_and_evaluate(model,
                                        dev_X,
                                        dev_labels,
                                        dev_covariates,
                                        output_dir,
                                        subset='dev')

        if test_X is not None:
            predict_labels_and_evaluate(model,
                                        test_X,
                                        test_labels,
                                        test_covariates,
                                        output_dir,
                                        subset='test')

    # Print associations between topics and labels
    if n_labels > 0 and n_labels < 7:
        print("Label probabilities based on topics")
        print("Labels:", ' '.join([name for name in label_names]))
        for k in range(n_topics):
            Z = np.zeros([1, n_topics]).astype('float32')
            Z[0, k] = 1.0
            Y = None
            if n_covariates > 0:
                C = np.zeros([1, n_covariates]).astype('float32')
            else:
                C = None
            probs = model.predict_from_topics(Z, C)
            output = str(k) + ': '
            for i in range(n_labels):
                output += '%.4f ' % probs[0, i]
            print(output)

        if n_covariates > 0:
            all_probs = np.zeros([n_covariates, n_topics])
            for k in range(n_topics):
                Z = np.zeros([1, n_topics]).astype('float32')
                Z[0, k] = 1.0
                Y = None
                for c in range(n_covariates):
                    C = np.zeros([1, n_covariates]).astype('float32')
                    C[0, c] = 1.0
                    probs = model.predict_from_topics(Z, C)
                    all_probs[c, k] = probs[0, 0]
            np.savez(os.path.join(output_dir, 'covar_topic_probs.npz'),
                     probs=all_probs)

    # save document representations
    print("Getting topic proportions")
    theta = model.compute_theta(train_X, train_labels, train_covariates)
    print("Saving topic proportions")
    np.savez(os.path.join(output_dir, 'theta.train.npz'), theta=theta)

    if dev_X is not None:
        dev_Y = np.zeros_like(dev_labels)
        print("Getting topic proportions for dev data")
        theta = model.compute_theta(dev_X, dev_Y, dev_covariates)
        print("Saving topic proportions")
        np.savez(os.path.join(output_dir, 'theta.dev.npz'), theta=theta)

    if n_test > 0:
        test_Y = np.zeros_like(test_labels)
        print("Getting topic proportions for test data")
        theta = model.compute_theta(test_X, test_Y, test_covariates)
        print("Saving topic proportions")
        np.savez(os.path.join(output_dir, 'theta.test.npz'), theta=theta)
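
Examples #11 and #12 initialize the background term with get_init_bg, which the comment in Example #12 describes as using overall word frequencies but whose implementation is not shown. Below is a hedged sketch consistent with that description; it is assumed behavior, not the original code.

import numpy as np

def get_init_bg(X):
    # X: documents-by-vocabulary count matrix (dense array or scipy sparse).
    word_totals = np.array(X.sum(axis=0), dtype=float).reshape(-1)
    word_totals += 1.0  # smoothing so that unseen words do not produce log(0)
    return np.log(word_totals / word_totals.sum())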
Example #13
def main():
    usage = "%prog input_dir"
    parser = OptionParser(usage=usage)
    parser.add_option(
        '-k',
        dest='n_topics',
        type=int,
        default=20,
        help='Size of latent representation (~num topics): default=%default')
    parser.add_option('-l',
                      dest='learning_rate',
                      type=float,
                      default=0.002,
                      help='Initial learning rate: default=%default')
    parser.add_option('-m',
                      dest='momentum',
                      type=float,
                      default=0.99,
                      help='beta1 for Adam: default=%default')
    parser.add_option('--batch-size',
                      dest='batch_size',
                      type=int,
                      default=200,
                      help='Size of minibatches: default=%default')
    parser.add_option('--epochs',
                      type=int,
                      default=200,
                      help='Number of epochs: default=%default')
    parser.add_option('--train-prefix',
                      type=str,
                      default='train',
                      help='Prefix of train set: default=%default')
    parser.add_option('--test-prefix',
                      type=str,
                      default=None,
                      help='Prefix of test set: default=%default')
    parser.add_option(
        '--labels',
        type=str,
        default=None,
        help=
        'Read labels from input_dir/[train|test].labels.csv: default=%default')
    parser.add_option(
        '--prior-covars',
        type=str,
        default=None,
        help=
        'Read prior covariates from files with these names (comma-separated): default=%default'
    )
    parser.add_option(
        '--topic-covars',
        type=str,
        default=None,
        help=
        'Read topic covariates from files with these names (comma-separated): default=%default'
    )
    parser.add_option(
        '--interactions',
        action="store_true",
        default=False,
        help=
        'Use interactions between topics and topic covariates: default=%default'
    )
    parser.add_option(
        '--min-prior-covar-count',
        type=int,
        default=None,
        help=
        'Drop prior covariates with fewer than this many non-zero values in the training data: default=%default'
    )
    parser.add_option(
        '--min-topic-covar-count',
        type=int,
        default=None,
        help=
        'Drop topic covariates with fewer than this many non-zero values in the training data: default=%default'
    )
    parser.add_option(
        '--l1-topics',
        type=float,
        default=0.0,
        help='Regularization strength on topic weights: default=%default')
    parser.add_option(
        '--l1-topic-covars',
        type=float,
        default=0.0,
        help=
        'Regularization strength on topic covariate weights: default=%default')
    parser.add_option(
        '--l1-interactions',
        type=float,
        default=0.0,
        help=
        'Regularization strength on topic covariate interaction weights: default=%default'
    )
    parser.add_option(
        '--l2-prior-covars',
        type=float,
        default=0.0,
        help=
        'Regularization strength on prior covariate weights: default=%default')
    parser.add_option('-o',
                      dest='output_dir',
                      type=str,
                      default='output',
                      help='Output directory: default=%default')
    parser.add_option('--emb-dim',
                      type=int,
                      default=300,
                      help='Dimension of input embeddings: default=%default')
    parser.add_option(
        '--w2v',
        dest='word2vec_file',
        type=str,
        default=None,
        help=
        'Use this word2vec .bin file to initialize and fix embeddings: default=%default'
    )
    parser.add_option(
        '--alpha',
        type=float,
        default=1.0,
        help='Hyperparameter for logistic normal prior: default=%default')
    parser.add_option('--no-bg',
                      action="store_true",
                      default=False,
                      help='Do not use background freq: default=%default')
    parser.add_option('--dev-folds',
                      type=int,
                      default=0,
                      help='Number of dev folds: default=%default')
    parser.add_option(
        '--dev-fold',
        type=int,
        default=0,
        help='Fold to use as dev (if dev_folds > 0): default=%default')
    parser.add_option('--device',
                      type=int,
                      default=None,
                      help='GPU to use: default=%default')
    parser.add_option('--seed',
                      type=int,
                      default=None,
                      help='Random seed: default=%default')

    options, args = parser.parse_args()

    input_dir = args[0]

    if options.seed is not None:
        rng = np.random.RandomState(options.seed)
    else:
        rng = np.random.RandomState(np.random.randint(0, 100000))

    # load the training data
    train_X, vocab, row_selector = load_word_counts(input_dir,
                                                    options.train_prefix)
    train_labels, label_type, label_names, n_labels = load_labels(
        input_dir, options.train_prefix, row_selector, options)
    train_prior_covars, prior_covar_selector, prior_covar_names, n_prior_covars = load_covariates(
        input_dir, options.train_prefix, row_selector, options.prior_covars,
        options.min_prior_covar_count)
    train_topic_covars, topic_covar_selector, topic_covar_names, n_topic_covars = load_covariates(
        input_dir, options.train_prefix, row_selector, options.topic_covars,
        options.min_topic_covar_count)
    options.n_train, vocab_size = train_X.shape
    options.n_labels = n_labels

    if n_labels > 0:
        print("Train label proportions:", np.mean(train_labels, axis=0))

    # split into training and dev if desired
    train_indices, dev_indices = train_dev_split(options, rng)
    train_X, dev_X = split_matrix(train_X, train_indices, dev_indices)
    train_labels, dev_labels = split_matrix(train_labels, train_indices,
                                            dev_indices)
    train_prior_covars, dev_prior_covars = split_matrix(
        train_prior_covars, train_indices, dev_indices)
    train_topic_covars, dev_topic_covars = split_matrix(
        train_topic_covars, train_indices, dev_indices)

    n_train, _ = train_X.shape

    # load the test data
    if options.test_prefix is not None:
        test_X, _, row_selector = load_word_counts(input_dir,
                                                   options.test_prefix,
                                                   vocab=vocab)
        test_labels, _, _, _ = load_labels(input_dir, options.test_prefix,
                                           row_selector, options)
        test_prior_covars, _, _, _ = load_covariates(
            input_dir,
            options.test_prefix,
            row_selector,
            options.prior_covars,
            covariate_selector=prior_covar_selector)
        test_topic_covars, _, _, _ = load_covariates(
            input_dir,
            options.test_prefix,
            row_selector,
            options.topic_covars,
            covariate_selector=topic_covar_selector)
        n_test, _ = test_X.shape

    else:
        test_X = None
        n_test = 0
        test_labels = None
        test_prior_covars = None
        test_topic_covars = None

    # initialize the background using overall word frequencies
    init_bg = get_init_bg(train_X)
    if options.no_bg:
        init_bg = np.zeros_like(init_bg)

    # combine the network configuration parameters into a dictionary
    network_architecture = make_network(options, vocab_size, label_type,
                                        n_labels, n_prior_covars,
                                        n_topic_covars)

    print("Network architecture:")
    for key, val in network_architecture.items():
        print(key + ':', val)

    # load word vectors
    embeddings, update_embeddings = load_word_vectors(options, rng, vocab)

    # create the model
    model = Scholar(network_architecture,
                    alpha=options.alpha,
                    learning_rate=options.learning_rate,
                    init_embeddings=embeddings,
                    update_embeddings=update_embeddings,
                    init_bg=init_bg,
                    adam_beta1=options.momentum,
                    device=options.device)

    # train the model
    print("Optimizing full model")
    model = train(model,
                  network_architecture,
                  train_X,
                  train_labels,
                  train_prior_covars,
                  train_topic_covars,
                  training_epochs=options.epochs,
                  batch_size=options.batch_size,
                  rng=rng,
                  X_dev=dev_X,
                  Y_dev=dev_labels,
                  PC_dev=dev_prior_covars,
                  TC_dev=dev_topic_covars)

    # make output directory
    fh.makedirs(options.output_dir)

    # display and save weights
    print_and_save_weights(options, model, vocab, prior_covar_names,
                           topic_covar_names)

    # Evaluate perplexity on dev and test data
    if dev_X is not None:
        perplexity = evaluate_perplexity(model,
                                         dev_X,
                                         dev_labels,
                                         dev_prior_covars,
                                         dev_topic_covars,
                                         options.batch_size,
                                         eta_bn_prop=0.0)
        print("Dev perplexity = %0.4f" % perplexity)
        fh.write_list_to_text([str(perplexity)],
                              os.path.join(options.output_dir,
                                           'perplexity.dev.txt'))

    if test_X is not None:
        perplexity = evaluate_perplexity(model,
                                         test_X,
                                         test_labels,
                                         test_prior_covars,
                                         test_topic_covars,
                                         options.batch_size,
                                         eta_bn_prop=0.0)
        print("Test perplexity = %0.4f" % perplexity)
        fh.write_list_to_text([str(perplexity)],
                              os.path.join(options.output_dir,
                                           'perplexity.test.txt'))

    # evaluate accuracy on predicting labels
    if n_labels > 0:
        print("Predicting labels")
        predict_labels_and_evaluate(model,
                                    train_X,
                                    train_labels,
                                    train_prior_covars,
                                    train_topic_covars,
                                    options.output_dir,
                                    subset='train')

        if dev_X is not None:
            predict_labels_and_evaluate(model,
                                        dev_X,
                                        dev_labels,
                                        dev_prior_covars,
                                        dev_topic_covars,
                                        options.output_dir,
                                        subset='dev')

        if test_X is not None:
            predict_labels_and_evaluate(model,
                                        test_X,
                                        test_labels,
                                        test_prior_covars,
                                        test_topic_covars,
                                        options.output_dir,
                                        subset='test')

    # print label probabilities for each topic
    print_topic_label_associations(options, label_names, model, n_prior_covars,
                                   n_topic_covars)

    # save document representations
    print("Saving document representations")
    save_document_representations(model,
                                  train_X,
                                  train_labels,
                                  train_prior_covars,
                                  train_topic_covars,
                                  options.output_dir,
                                  'train',
                                  batch_size=options.batch_size)

    if dev_X is not None:
        save_document_representations(model,
                                      dev_X,
                                      dev_labels,
                                      dev_prior_covars,
                                      dev_topic_covars,
                                      options.output_dir,
                                      'dev',
                                      batch_size=options.batch_size)

    if n_test > 0:
        save_document_representations(model,
                                      test_X,
                                      test_labels,
                                      test_prior_covars,
                                      test_topic_covars,
                                      options.output_dir,
                                      'test',
                                      batch_size=options.batch_size)
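
The train/dev split above goes through helpers (train_dev_split, split_matrix) whose implementations are not shown here. A minimal sketch of the kind of fold-based split they presumably perform, using hypothetical names of my own, is:

import numpy as np

def simple_dev_split(n_items, dev_folds, dev_fold, rng):
    # return (train_indices, dev_indices); with dev_folds == 0 there is no dev set
    if dev_folds == 0:
        return np.arange(n_items), None
    indices = rng.permutation(n_items)
    fold_size = n_items // dev_folds
    start = dev_fold * fold_size
    end = n_items if dev_fold == dev_folds - 1 else start + fold_size
    dev_indices = indices[start:end]
    train_indices = np.setdiff1d(indices, dev_indices)
    return train_indices, dev_indices

def simple_split_matrix(X, train_indices, dev_indices):
    # slice any per-document matrix, or pass it through when there is no dev split
    if X is None or dev_indices is None:
        return X, None
    return X[train_indices, :], X[dev_indices, :]
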
Example #14
0
def process_subset(items, parsed, label_field, label_list, vocab, output_dir,
                   output_prefix):
    n_items = len(items)
    n_labels = len(label_list)
    vocab_size = len(vocab)
    vocab_index = dict(zip(vocab, range(vocab_size)))

    ids = []
    for i, item in enumerate(items):
        if 'id' in item:
            ids.append(item['id'])
    if len(ids) != n_items:
        ids = [str(i) for i in range(n_items)]

    # create a label index using string representations
    label_list_strings = [str(label) for label in label_list]
    label_index = dict(zip(label_list_strings, range(n_labels)))

    # convert labels to a data frame
    if n_labels > 0:
        label_matrix = np.zeros([n_items, n_labels], dtype=int)
        label_vector = np.zeros(n_items, dtype=int)

        for i, item in enumerate(items):
            id = ids[i]
            label = item[label_field]
            label_matrix[i, label_index[str(label)]] = 1
            label_vector[i] = label_index[str(label)]

        labels_df = pd.DataFrame(label_matrix,
                                 index=ids,
                                 columns=label_list_strings)
        labels_df.to_csv(
            os.path.join(output_dir,
                         output_prefix + '.' + label_field + '.csv'))
        label_vector_df = pd.DataFrame(label_vector,
                                       index=ids,
                                       columns=[label_field])
        label_vector_df.to_csv(
            os.path.join(output_dir, output_prefix + '.label_vector.csv'))

    else:
        print("No labels found")

    X = np.zeros([n_items, vocab_size], dtype=int)

    dat_strings = []
    dat_labels = []
    mallet_strings = []
    fast_text_lines = []

    counter = Counter()
    word_counter = Counter()
    doc_lines = []
    print("Converting to count representations")
    for i, words in enumerate(parsed):
        # get the vocab indices of words that are in the vocabulary
        indices = [vocab_index[word] for word in words if word in vocab_index]
        word_subset = [word for word in words if word in vocab_index]

        counter.clear()
        counter.update(indices)
        word_counter.clear()
        word_counter.update(word_subset)

        if len(counter.keys()) > 0:
            # update the counts
            mallet_strings.append(
                str(i) + '\t' + 'en' + '\t' + ' '.join(word_subset))

            dat_string = str(int(len(counter))) + ' '
            dat_string += ' '.join([
                str(k) + ':' + str(int(v))
                for k, v in zip(list(counter.keys()), list(counter.values()))
            ])
            dat_strings.append(dat_string)

            if label_field is not None:
                label = items[i][label_field]
                dat_labels.append(str(label_index[str(label)]))

            values = list(counter.values())
            X[np.ones(len(counter.keys()), dtype=int) * i,
              list(counter.keys())] += values

    # convert to a sparse representation
    sparse_X = sparse.csr_matrix(X)
    fh.save_sparse(sparse_X, os.path.join(output_dir, output_prefix + '.npz'))

    print(sparse_X.shape)
    print(len(dat_strings))

    fh.write_to_json(ids, os.path.join(output_dir,
                                       output_prefix + '.ids.json'))

    # save output for Mallet
    fh.write_list_to_text(
        mallet_strings, os.path.join(output_dir,
                                     output_prefix + '.mallet.txt'))

    # save output for David Blei's LDA/SLDA code
    fh.write_list_to_text(
        dat_strings, os.path.join(output_dir, output_prefix + '.data.dat'))
    if len(dat_labels) > 0:
        fh.write_list_to_text(
            dat_labels,
            os.path.join(output_dir,
                         output_prefix + '.' + label_field + '.dat'))

    # save output for Jacob Eisenstein's SAGE code:
    sparse_X_sage = sparse.csr_matrix(X, dtype=float)
    vocab_for_sage = np.zeros((vocab_size, ), dtype=object)
    vocab_for_sage[:] = vocab
    if n_labels > 0:
        # convert array to vector of labels for SAGE
        sage_aspect = np.argmax(np.array(labels_df.values, dtype=float),
                                axis=1) + 1
    else:
        sage_aspect = np.ones([n_items, 1], dtype=float)
    sage_no_aspect = np.array([n_items, 1], dtype=float)
    widx = np.arange(vocab_size, dtype=float) + 1

    return sparse_X_sage, sage_aspect, sage_no_aspect, widx, vocab_for_sage
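
Materializing the dense n_items x vocab_size array X before converting it can be costly for large corpora. As an alternative (my own sketch, not part of the original function), the same document-term matrix can be assembled directly in sparse form:

from collections import Counter

from scipy import sparse

def counts_to_sparse(parsed, vocab_index, n_items):
    # build a CSR document-term matrix straight from per-document counters
    rows, cols, vals = [], [], []
    for i, words in enumerate(parsed):
        counts = Counter(vocab_index[w] for w in words if w in vocab_index)
        for j, c in counts.items():
            rows.append(i)
            cols.append(j)
            vals.append(c)
    shape = (n_items, len(vocab_index))
    return sparse.coo_matrix((vals, (rows, cols)), shape=shape, dtype=int).tocsr()
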
Example #15
0
def process_subset(
    items,
    ids,
    parsed,
    labels,
    label_fields,
    label_lists,
    vocab,
    output_dir,
    output_prefix,
    count_dtype=int,
):
    n_items = len(items)
    vocab_size = len(vocab)
    vocab_index = dict(zip(vocab, range(vocab_size)))

    if not ids or len(ids) != n_items:
        ids = [str(i) for i in range(n_items)]

    # create a label index using string representations
    if labels:
        labels_df = pd.DataFrame.from_records(labels, index=ids)

        for label_field in label_fields:
            labels_df_subset = pd.get_dummies(labels_df[label_field])

            # for any classes not present in the subset, add 0 columns
            # (handles case where classes appear in only one of train or test)
            for category in label_lists[label_field]:
                if category not in labels_df_subset:
                    labels_df_subset[category] = 0

            labels_df_subset.to_csv(
                os.path.join(output_dir,
                             output_prefix + "." + label_field + ".csv"))
            if labels_df[label_field].nunique() == 2:
                labels_df_subset.iloc[:, 1].to_csv(
                    os.path.join(
                        output_dir,
                        output_prefix + "." + label_field + "_vector.csv"),
                    header=[label_field],
                )
            # used later
            label_index = dict(
                zip(labels_df_subset.columns,
                    range(len(labels_df_subset.columns))))
    X = np.zeros([n_items, vocab_size], dtype=count_dtype)

    dat_strings = []
    dat_labels = []
    mallet_strings = []
    fast_text_lines = []

    counter = Counter()
    word_counter = Counter()
    doc_lines = []
    print("Converting to count representations")
    for i, words in enumerate(parsed):
        # get the vocab indices of words that are in the vocabulary
        words = words.split()

        indices = [vocab_index[word] for word in words if word in vocab_index]
        word_subset = [word for word in words if word in vocab_index]

        counter.clear()
        counter.update(indices)
        word_counter.clear()
        word_counter.update(word_subset)

        if len(counter.keys()) > 0:
            # update the counts
            mallet_strings.append(
                str(i) + "\t" + "en" + "\t" + " ".join(word_subset))

            dat_string = str(int(len(counter))) + " "
            dat_string += " ".join([
                str(k) + ":" + str(int(v))
                for k, v in zip(list(counter.keys()), list(counter.values()))
            ])
            dat_strings.append(dat_string)

            # for the dat format, assume just one label is given
            if len(label_fields) > 0:
                label = labels[i][label_fields[-1]]
                dat_labels.append(str(label_index[str(label)]))
            values = np.array(list(counter.values()), dtype=count_dtype)
            X[np.ones(len(counter.keys()), dtype=int) * i,
              list(counter.keys())] += values

    # convert to a sparse representation
    sparse_X = sparse.csr_matrix(X)
    fh.save_sparse(sparse_X, os.path.join(output_dir, output_prefix + ".npz"))

    print("Size of {:s} document-term matrix:".format(output_prefix),
          sparse_X.shape)

    fh.write_to_json(ids, os.path.join(output_dir,
                                       output_prefix + ".ids.json"))

    # save output for Mallet
    fh.write_list_to_text(
        mallet_strings, os.path.join(output_dir,
                                     output_prefix + ".mallet.txt"))

    # save output for David Blei's LDA/SLDA code
    fh.write_list_to_text(
        dat_strings, os.path.join(output_dir, output_prefix + ".data.dat"))
    if len(dat_labels) > 0:
        fh.write_list_to_text(
            dat_labels,
            os.path.join(output_dir,
                         output_prefix + "." + label_field + ".dat"),
        )

    # save output for Jacob Eisenstein's SAGE code:
    sparse_X_sage = sparse.csr_matrix(X, dtype=float)
    vocab_for_sage = np.zeros((vocab_size, ), dtype=object)
    vocab_for_sage[:] = vocab

    # for SAGE, assume only a single label has been given
    if len(label_fields) > 0:
        # convert array to vector of labels for SAGE
        sage_aspect = (
            np.argmax(np.array(labels_df_subset.values, dtype=float), axis=1) +
            1)
    else:
        sage_aspect = np.ones([n_items, 1], dtype=float)
    sage_no_aspect = np.array([n_items, 1], dtype=float)
    widx = np.arange(vocab_size, dtype=float) + 1

    return sparse_X_sage, sage_aspect, sage_no_aspect, widx, vocab_for_sage
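
The label handling above combines pd.get_dummies with a loop that re-adds classes missing from the current subset. A small, self-contained illustration of that behaviour on toy data:

import pandas as pd

labels_df = pd.DataFrame({'sentiment': ['pos', 'neg', 'pos']}, index=['a', 'b', 'c'])
dummies = pd.get_dummies(labels_df['sentiment'])  # columns: 'neg', 'pos'

# a class that only appears in another split gets an all-zero column
for category in ['neg', 'neu', 'pos']:
    if category not in dummies:
        dummies[category] = 0

print(dummies[['neg', 'neu', 'pos']])
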
Example #16
0
def main():
    usage = "%prog input_dir"
    parser = OptionParser(usage=usage)
    parser.add_option(
        '-k',
        dest='n_topics',
        default=100,
        help='Size of latent representation (~num topics): default=%default')
    parser.add_option(
        '-r',
        action="store_true",
        dest="regularize",
        default=False,
        help=
        'Apply adaptive regularization for sparsity in topics: default=%default'
    )
    parser.add_option('-o',
                      dest='output_dir',
                      default='output',
                      help='Output directory: default=%default')
    parser.add_option(
        '--vocab-size',
        dest='vocab_size',
        default=None,
        help=
        'Filter the vocabulary keeping the most common n words: default=%default'
    )
    parser.add_option('--no-bg',
                      action="store_true",
                      dest="no_bg",
                      default=False,
                      help='Do not use background freq: default=%default')
    parser.add_option(
        '--no-bn-anneal',
        action="store_true",
        dest="no_bn_anneal",
        default=False,
        help='Do not anneal away from batchnorm: default=%default')
    parser.add_option(
        '--opt',
        dest='optimizer',
        default='adam',
        help=
        'Optimization algorithm to use [adam|adagrad|sgd]: default=%default')
    parser.add_option('--dev-folds',
                      dest='dev_folds',
                      default=0,
                      help='Number of dev folds: default=%default')
    parser.add_option(
        '--dev-fold',
        dest='dev_fold',
        default=0,
        help='Fold to use as dev (if dev_folds > 0): default=%default')
    parser.add_option('--test-prefix',
                      dest='test_prefix',
                      default=None,
                      help='Prefix of test set: default=%default')
    parser.add_option(
        '--labels',
        dest='label_name',
        default=None,
        help=
        'Read labels from input_dir/[train|test]_prefix.label_name.csv: default=%default'
    )

    (options, args) = parser.parse_args()

    input_dir = args[0]

    dev_folds = int(options.dev_folds)
    dev_fold = int(options.dev_fold)
    label_file_name = options.label_name

    alpha = 1.0
    n_topics = int(options.n_topics)
    batch_size = 200
    # learning_rate = 0.002
    learning_rate = 0.001
    adam_beta1 = 0.99
    n_epochs = 450
    encoder_layers = 1  # number of encoder layers [0|1|2]
    encoder_shortcuts = False
    classifier_layers = 1  # number of classifier layers [0|1|2]
    auto_regularize = options.regularize
    output_dir = options.output_dir
    # word2vec_file = "/home/lcw2/share/embeddings/GoogleNews-vectors-negative300/GoogleNews-vectors-negative300.bin"
    word2vec_file = "../embeddings/Tencent_AILab_ChineseEmbedding/Tencent_AILab_ChineseEmbedding.bin"
    # word2vec_file = "C:\\\\Soft\\share\\GoogleNews-vectors-negative300.bin"
    embedding_dim = 200
    vocab_size = options.vocab_size
    update_background = False
    no_bg = options.no_bg
    bn_anneal = True
    optimizer = options.optimizer
    seed = 1
    threads = 4
    if seed is not None:
        seed = int(seed)
        rng = np.random.RandomState(seed)
    else:
        rng = np.random.RandomState(np.random.randint(0, 100000))
    # kb embedding file
    # kb2vec_file = "/home/lcw2/github/my_vaetm/data/kb2vec/WikiData.KB.100d.zh.pickle"
    kb2vec_file = "./data/kb2vec/WikiData.KB.100d.zh.v2.pickle"
    kb_dim = 100
    test_prefix = 'test'

    # load the training data
    train_prefix = 'train'
    train_X, vocab, train_labels, label_names, label_type, col_sel, num = load_data(
        input_dir, train_prefix, label_file_name, vocab_size=vocab_size)
    n_train, dv = train_X.shape
    if train_labels is not None:
        _, n_labels = train_labels.shape
        print('n_labels:', n_labels)
    else:
        n_labels = 0

    if test_prefix == 'test':
        test_X, _, test_labels, _, _, _, _ = load_data(input_dir,
                                                       test_prefix,
                                                       label_file_name,
                                                       vocab=vocab)
        n_test, _ = test_X.shape
        if test_labels is not None:
            _, n_labels_test = test_labels.shape
            assert n_labels_test == n_labels

    # split training data into train and dev
    if dev_folds > 0:
        n_dev = int(n_train / dev_folds)
        indices = np.array(range(n_train), dtype=int)
        rng.shuffle(indices)
        if dev_fold < dev_folds - 1:
            dev_indices = indices[n_dev * dev_fold:n_dev * (dev_fold + 1)]
        else:
            dev_indices = indices[n_dev * dev_fold:]
        train_indices = list(set(indices) - set(dev_indices))
        dev_X = train_X[dev_indices, :]
        # also split the labels so the dev evaluation below has matching targets
        dev_labels = train_labels[dev_indices, :] if train_labels is not None else None
        train_X = train_X[train_indices, :]
        if train_labels is not None:
            train_labels = train_labels[train_indices, :]
        n_train = len(train_indices)
    else:
        dev_X = None
        dev_labels = None

    # initialize the background using the overall frequency of terms
    init_bg = get_init_bg(train_X)
    init_beta = None
    update_beta = True
    # if no_bg:
    #     if n_topics == 1:
    #         init_beta = init_bg.copy()
    #         init_beta = init_beta.reshape([1, len(vocab)])
    #         update_beta = False
    #     init_bg = np.zeros_like(init_bg)

    label_emb_dim = -1
    # create the network configuration
    network_architecture = make_network(dv, encoder_layers, embedding_dim,
                                        n_topics, encoder_shortcuts,
                                        label_type, n_labels, label_emb_dim,
                                        classifier_layers)

    print("Network architecture:")
    for key, val in network_architecture.items():
        print(key + ':', val)

    # load pretrained word vectors
    if word2vec_file is not None:
        vocab_size = len(vocab)
        vocab_dict = dict(zip(vocab, range(vocab_size)))
        embeddings = np.array(rng.rand(vocab_size, embedding_dim) * 0.25 - 0.5,
                              dtype=np.float32)
        count = 0
        print("Loading word vectors")
        if word2vec_file[-3:] == 'bin':
            pretrained = gensim.models.KeyedVectors.load(word2vec_file)
        else:
            pretrained = gensim.models.KeyedVectors.load_word2vec_format(
                word2vec_file, binary=False)

        for word, index in vocab_dict.items():
            if word in pretrained:
                count += 1
                embeddings[index, :] = pretrained[word]

        print("Found word embeddings for %d words" % count)
        print('shape of word embeddings:', embeddings.shape)
    else:
        print("No embeddings for words!")
        exit()

    # load pretrained entity vectors
    # if kb2vec_file is not None:
    #     vocab_size = len(vocab)
    #     vocab_dict = dict(zip(vocab, range(vocab_size)))
    #     entity_embeddings = np.array(rng.rand(vocab_size, kb_dim) * 0.25 - 0.5, dtype=np.float32)
    #     count = 0
    #
    #     print("Loading emtity vectors...")
    #     pretrained = None
    #     with open(kb2vec_file, 'rb') as f:
    #         pretrained = pickle.load(f)
    #     print('# of entities:', len(pretrained))
    #     vocab_counter = collections.Counter()
    #     vocab_counter.update(s for s in num if s in pretrained)
    #     print(vocab_counter.most_common(10))
    #     h = open('./output/topics.txt', 'r', encoding='utf-8')
    #     read_data = h.read()
    #     a = read_data.split()
    #     print('#of topic',len(a))
    #     for word, index in vocab_dict.items():
    #         if word in pretrained and word in a:
    #             print(word)
    #         if word in pretrained:
    #         # elif word in pretrained and word not in a:
    #             count += 1
    #             entity_embeddings[index, :] = pretrained[word]
    #
    #     print("Found entity embeddings for %d words" % count)
    #     print('shape of entity embeddings:', entity_embeddings.shape)
    # else:
    #     print("No embeddings for knowledge entities!")
    #     exit()

    tf.reset_default_graph()

    # create the model
    model = VaeTm(network_architecture,
                  alpha=alpha,
                  learning_rate=learning_rate,
                  batch_size=batch_size,
                  # init_embeddings=embeddings,
                  # entity_embeddings=entity_embeddings,
                  init_bg=init_bg,
                  update_background=update_background,
                  init_beta=init_beta,
                  update_beta=update_beta,
                  threads=threads,
                  regularize=auto_regularize,
                  optimizer=optimizer,
                  adam_beta1=adam_beta1,
                  seed=seed)

    # train the model
    print("Optimizing full model")
    model = train(model,
                  network_architecture,
                  train_X,
                  train_labels,
                  vocab,
                  regularize=auto_regularize,
                  training_epochs=n_epochs,
                  batch_size=batch_size,
                  rng=rng,
                  bn_anneal=bn_anneal,
                  X_dev=dev_X)

    # create output directory
    fh.makedirs(output_dir)

    # print background
    bg = model.get_bg()
    if not no_bg:
        print_top_bg(bg, vocab)

    # print topics
    emb = model.get_weights()
    print("Topics:")
    maw, sparsity, topics = print_top_words(emb, vocab)
    print("sparsity in topics = %0.4f" % sparsity)
    save_weights(output_dir, emb, bg, vocab, sparsity_threshold=1e-5)

    fh.write_list_to_text(['{:.4f}'.format(maw)],
                          os.path.join(output_dir, 'maw.txt'))
    fh.write_list_to_text(['{:.4f}'.format(sparsity)],
                          os.path.join(output_dir, 'sparsity.txt'))

    # print('Predicting training representations...')
    # reps, preds = model.predict(train_X)
    # # print('rep-0:', reps[0])
    # # print('rep-0:', reps[1])
    # fh.write_matrix_to_text(reps, os.path.join(output_dir, 'train_representation.txt'))

    # if test_X is not None:
    #     print('Predicting testing representations...')
    #     reps, preds = model.predict(test_X)
    #     # print('rep-0:', reps[0])
    #     # print('rep-0:', reps[1])
    #     fh.write_matrix_to_text(reps, os.path.join(output_dir, 'test_representation.txt'))

    # Evaluate perplexity on dev and test data
    if dev_X is not None:
        perplexity = evaluate_perplexity(model, dev_X, eta_bn_prop=0.0)
        print("Dev perplexity = %0.4f" % perplexity)
        fh.write_list_to_text([str(perplexity)],
                              os.path.join(output_dir, 'perplexity.dev.txt'))

    if test_X is not None:
        perplexity = evaluate_perplexity(model,
                                         test_X,
                                         test_labels,
                                         eta_bn_prop=0.0)
        print("Test perplexity = %0.4f" % perplexity)
        fh.write_list_to_text([str(perplexity)],
                              os.path.join(output_dir, 'perplexity.test.txt'))

    # evaluate accuracy on labels
    if n_labels > 0:
        print("Predicting labels")
        predict_labels_and_evaluate(model,
                                    train_X,
                                    train_labels,
                                    None,
                                    output_dir,
                                    subset='train')

        if dev_X is not None:
            predict_labels_and_evaluate(model,
                                        dev_X,
                                        dev_labels,
                                        None,
                                        output_dir,
                                        subset='dev')

        if test_X is not None:
            predict_labels_and_evaluate(model,
                                        test_X,
                                        test_labels,
                                        None,
                                        output_dir,
                                        subset='test')

    # save document representations
    theta = model.compute_theta(train_X, train_labels)
    np.savez(os.path.join(output_dir, 'theta.train.npz'), theta=theta)
    compute_npmi_at_n(topics, vocab, train_X)
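
compute_npmi_at_n is not shown in this example. For reference, a common way to score topic coherence with NPMI over the top-n words of each topic, using document co-occurrence counts from the training matrix, is sketched below; this is an assumption about what such a function typically computes, not its actual implementation:

import numpy as np
from scipy import sparse

def npmi_at_n(topics, vocab, counts, n=10):
    # topics: list of lists of top words per topic (an assumed format)
    # counts: documents x vocabulary term-count matrix (dense or scipy sparse)
    vocab_index = {word: i for i, word in enumerate(vocab)}
    if sparse.issparse(counts):
        counts = np.asarray(counts.todense())
    binary = (counts > 0).astype(float)
    n_docs = float(binary.shape[0])
    topic_scores = []
    for words in topics:
        cols = [vocab_index[w] for w in words[:n] if w in vocab_index]
        pair_scores = []
        for a in range(len(cols)):
            for b in range(a + 1, len(cols)):
                p_a = binary[:, cols[a]].sum() / n_docs
                p_b = binary[:, cols[b]].sum() / n_docs
                p_ab = (binary[:, cols[a]] * binary[:, cols[b]]).sum() / n_docs
                if 0.0 < p_ab < 1.0:
                    pair_scores.append(np.log(p_ab / (p_a * p_b)) / -np.log(p_ab))
                else:
                    # never co-occur -> -1; always co-occur -> 1
                    pair_scores.append(-1.0 if p_ab == 0.0 else 1.0)
        topic_scores.append(np.mean(pair_scores) if pair_scores else 0.0)
    print("Mean NPMI = %0.4f" % np.mean(topic_scores))
    return topic_scores
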