Example no. 1
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('train_path', type=str, help='path to the train corpus file')
    parser.add_argument('test_path', type=str, help='path to the test corpus file')
    parser.add_argument('train_label', type=str, help='path to the train label file')
    parser.add_argument('test_label', type=str, help='path to the test label file')
    parser.add_argument('out_dir', type=str, help='path to the output dir')
    parser.add_argument('-nv', '--n_val', type=int, default=1000, help='validation set size')
    args = parser.parse_args()

    docs = load_corpus(args.train_path)['docs'].items()
    test_docs = load_corpus(args.test_path)['docs']

    np.random.seed(0)
    np.random.shuffle(docs)
    n_val = args.n_val
    train_docs = dict(docs[:-n_val])
    val_docs = dict(docs[-n_val:])

    # doc_labels = load_json(args.train_label)
    # test_labels = load_json(args.test_label)
    doc_labels = None
    test_labels = None
    train = corpus2libsvm(train_docs, doc_labels, os.path.join(args.out_dir, 'train.libsvm'))
    val = corpus2libsvm(val_docs, doc_labels, os.path.join(args.out_dir, 'val.libsvm'))
    test = corpus2libsvm(test_docs, test_labels, os.path.join(args.out_dir, 'test.libsvm'))

    import pdb;pdb.set_trace()
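
The `load_corpus` and `corpus2libsvm` helpers above are imported from elsewhere and not shown. For reference only, a minimal sketch of what such a converter could look like, assuming each document is a `{word_index: count}` dict and `doc_labels` maps document keys to integer class ids (the function name and exact format are assumptions, not the project's actual implementation):

def corpus2libsvm_sketch(docs, doc_labels, out_path):
    # Write one LIBSVM-style line per document: '<label> <index>:<value> ...'
    # Indices are emitted 1-based and sorted, as most LIBSVM readers expect.
    rows = []
    with open(out_path, 'w') as f:
        for name, bow in docs.items():
            label = doc_labels[name] if doc_labels else 0  # 0 when labels are unavailable
            feats = ' '.join('%d:%s' % (int(idx) + 1, val)
                             for idx, val in sorted(bow.items(), key=lambda x: int(x[0])))
            f.write('%s %s\n' % (label, feats))
            rows.append((name, label))
    return rows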
Example no. 2
def train(args):
    corpus = load_corpus(args.input)
    n_vocab, docs = len(corpus['vocab']), corpus['docs']
    corpus.clear()  # save memory

    X_docs = []
    for k in docs.keys():
        X_docs.append(vecnorm(doc2vec(docs[k], n_vocab), 'logmax1', 0))
        del docs[k]

    np.random.seed(0)
    np.random.shuffle(X_docs)
    # X_docs_noisy = corrupted_matrix(np.r_[X_docs], 0.1)

    n_val = args.n_val
    # X_train = np.r_[X_docs[:-n_val]]
    # X_val = np.r_[X_docs[-n_val:]]
    X_train = np.r_[X_docs[:-n_val]]
    del X_docs[:-n_val]
    X_val = np.r_[X_docs]
    del X_docs

    start = timeit.default_timer()

    vae = VarAutoEncoder(n_vocab,
                         args.n_dim,
                         comp_topk=args.comp_topk,
                         ctype=args.ctype,
                         save_model=args.save_model)
    vae.fit([X_train, X_train], [X_val, X_val],
            nb_epoch=args.n_epoch,
            batch_size=args.batch_size)

    print 'runtime: %ss' % (timeit.default_timer() - start)
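
Several of the snippets below convert each document with `vecnorm(doc2vec(docs[k], n_vocab), 'logmax1', 0)` before training. Both helpers are defined elsewhere; a rough sketch of the assumed behavior (densify a `{word_index: count}` dict, then apply a log/max normalization) is given here only to make the preprocessing explicit:

import numpy as np

def doc2vec_sketch(doc, n_vocab):
    # Assumed behavior: turn a {word_index: count} dict into a dense count vector.
    vec = np.zeros(n_vocab)
    for idx, count in doc.items():
        vec[int(idx)] = count
    return vec

def vecnorm_logmax1_sketch(vec, eps=0):
    # Assumed 'logmax1' normalization: log(1 + x) scaled by log(1 + max(x)).
    vec = np.asarray(vec, dtype=float) + eps
    denom = np.log1p(vec.max())
    return np.log1p(vec) / denom if denom > 0 else vec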
Example no. 3
def extract_dict(args):
    corpus = load_corpus(args.input)
    vocab = corpus['vocab']
    with io.open(os.path.join(args.output_dir, 'dict.corpus'),
                 'w',
                 encoding='utf-8') as f:
        f.write(json.dumps(vocab, ensure_ascii=False))
    print 'Generate the dictionary!'
Example no. 4
def test(args):
    corpus = load_corpus(args.corpus[0])
    docs, vocab_dict = corpus['docs'], corpus['vocab']
    doc_codes = doc_word2vec(docs,
                             revdict(vocab_dict),
                             args.load_model,
                             args.output,
                             avg=True)
Example no. 5
def test(args):
    corpus = load_corpus(args.input)
    vocab, docs = corpus['vocab'], corpus['docs']
    n_vocab = len(vocab)

    doc_keys = docs.keys()
    X_docs = []
    for k in doc_keys:
        X_docs.append(vecnorm(doc2vec(docs[k], n_vocab), 'logmax1', 0))
        del docs[k]
    X_docs = np.r_[X_docs]

    ae = load_ae_model(args.load_model)
    doc_codes = ae.predict(X_docs)
    dump_json(dict(zip(doc_keys, doc_codes.tolist())), args.output)
    print 'Saved doc codes file to %s' % args.output

    if args.save_topics:
        topics_strength = get_topics_strength(ae, revdict(vocab), topn=50)
        print_topics(topics_strength)
        # save_topics_strength(topics_strength, args.save_topics)
        save_chinese_topics_strength(topics_strength, args.save_topics)
        # topics = get_topics(ae, revdict(vocab), topn=10)
        # write_file(topics, args.save_topics)
        print 'Saved topics file to %s' % args.save_topics

    if args.word_clouds:
        queries = ['interest', 'trust', 'cash', 'payment', 'rate', 'price', 'stock', 'share', 'award', 'risk', 'security', 'bank', 'company',
            'service', 'grant', 'agreement', 'proxy', 'loan', 'capital', 'asset', 'bonus', 'shareholder', 'income', 'financial', 'net', 'purchase',
            'position', 'management', 'loss', 'salary', 'stockholder', 'due', 'business', 'transaction', 'govern', 'trading',
            'tax', 'march', 'april', 'june', 'july']
        weights = ae.get_weights()[0]
        weights = unitmatrix(weights) # normalize
        word_cloud(weights, vocab, queries, save_file=args.word_clouds)

        print 'Saved word clouds file to %s' % args.word_clouds

    if args.sample_words:
        revocab = revdict(vocab)
        queries = ['weapon', 'christian', 'compani', 'israel', 'law', 'hockey', 'comput', 'space']
        words = []
        for each in queries:
            if each in vocab:
                words.append(get_similar_words(ae, vocab[each], revocab, topn=11))
        write_file(words, args.sample_words)
        print 'Saved sample words file to %s' % args.sample_words
    if args.translate_words:
        revocab = revdict(vocab)
        queries = [['father', 'man', 'woman'], ['mother', 'woman', 'man']]
        for each in queries:
            print each
            print translate_words(ae, each, vocab, revocab, topn=10)
    if args.calc_distinct:
        # mean, std = calc_pairwise_cosine(ae)
        # print 'Average pairwise angle (pi): %s (%s)' % (mean / math.pi, std / math.pi)
        sd = calc_pairwise_dev(ae)
        print 'Average squared deviation from 0 (90 degree): %s' % sd
Example no. 6
def get_words(args):
    corpus = load_corpus(args.input_corpus)
    filename_corpus_dict = corpus['docs']
    vocab_dict = corpus['vocab']
    
    # we have to invert the dict
    dictionary = dict((v,k) for k, v in vocab_dict.iteritems())

    filename_label_dict = load_json(args.input_label)

    print 'Finish loading data'

    label_vocab_dict = {}

    # start counting words
    for filename in filename_corpus_dict:
        vocab_num_dict = filename_corpus_dict[filename]
        label = filename_label_dict[filename]
        try:
            label_vocab_dict[label]
        except:
            label_vocab_dict[label] = {}
        for vocab in vocab_num_dict:
            num = vocab_num_dict[vocab]
            # print 'If num is a int? : ', isinstance(num, int)
            try:
                label_vocab_dict[label][vocab] += num
            except:
                label_vocab_dict[label][vocab] = num

    print 'Finish counting word frequency'

    label_topword_dict = {}
    label_num = len(label_vocab_dict)
    print 'Label num is ', label_num
    topn = args.topn
    for label in label_vocab_dict:
        vocab_num_dict = label_vocab_dict[label]
        label_topword_dict[label] = sorted(vocab_num_dict, key = vocab_num_dict.__getitem__, reverse = True)[:topn]

    print 'Finish sorting the top n words'

    dump_json(label_topword_dict, args.output_json)
    print 'Finish writing the json file'

    for label in label_topword_dict:
        filename_o = args.output_dir + 'label-' + str(label) + '.txt'
        print 'filename =' , filename_o
        file_o = open(filename_o, 'w')
        for word_index in label_topword_dict[label]:
            # print 'Is word_index a int:', isinstance(word_index, int)
            text = dictionary[int(word_index)]
            text += '\n'
            file_o.write(text.encode('utf-8'))
        file_o.close()
    print 'Finish writing files!'
Example no. 7
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('corpus', type=str, help='path to the corpus file')
    parser.add_argument('labels', type=str, help='path to the labels file')
    parser.add_argument('-bs',
                        '--batch_size',
                        type=int,
                        default=100,
                        help='batch size (default 100)')
    parser.add_argument('out_dir', type=str, help='path to the output dir')
    args = parser.parse_args()

    corpus = load_corpus(args.corpus)
    doc_labels = load_json(args.labels)
    vocab, docs = corpus['vocab'], corpus['docs']
    n_vocab = len(vocab)
    doc_names = docs.keys()
    X_docs = [doc2vec(x, n_vocab) for x in docs.values()]

    out_dir = args.out_dir
    # attributes
    attrs = zip(*sorted(vocab.items(), key=lambda d: d[1]))[0]  # terms ordered by vocab index
    dump_pickle(attrs, os.path.join(out_dir, 'attributes.p'))

    # batches
    bs = args.batch_size
    batches = [bs * (x + 1) for x in range(int(len(docs) / bs) - 1)]
    batches.append(len(docs))
    dump_pickle(batches, os.path.join(out_dir, 'batches.p'))

    # bow_batch_x
    for i in range(len(batches)):
        dump_pickle(X_docs[batches[i - 1] if i > 0 else 0:batches[i]],
                    os.path.join(out_dir, 'bow_batch_%s.p' % batches[i]))

    # # docs_names_batch_x
    # for i in range(len(batches)):
    #     dump_pickle(doc_names[batches[i - 1] if i > 0 else 0: batches[i]], os.path.join(out_dir, 'docs_names_batch_%s.p' % batches[i]))

    # class_indices_batch_x
    for i in range(len(batches)):
        data = [
            doc_labels[doc_names[idx]]
            for idx in range(batches[i - 1] if i > 0 else 0, batches[i])
        ]
        dump_pickle(
            data, os.path.join(out_dir,
                               'class_indices_batch_%s.p' % batches[i]))

    import pdb
    pdb.set_trace()
Example no. 8
def test(args):
    corpus = load_corpus(args.input)
    vocab, docs = corpus['vocab'], corpus['docs']
    n_vocab = len(vocab)

    doc_keys = docs.keys()
    X_docs = []
    for k in doc_keys:
        X_docs.append(vecnorm(doc2vec(docs[k], n_vocab), 'logmax1', 0))
        del docs[k]
    X_docs = np.r_[X_docs]

    model = AutoEncoder
    # model = DeepAutoEncoder
    ae = load_model(model, args.load_arch, args.load_weights)

    doc_codes = ae.encoder.predict(X_docs)
    dump_json(dict(zip(doc_keys, doc_codes.tolist())), args.output)
    print('Saved doc codes file to %s' % args.output)

    if args.save_topics:
        topics_strength = get_topics_strength(ae, revdict(vocab), topn=10)
        save_topics_strength(topics_strength, args.save_topics)
        # topics = get_topics(ae, revdict(vocab), topn=10)
        # write_file(topics, args.save_topics)
        print('Saved topics file to %s' % args.save_topics)

    if args.sample_words:
        revocab = revdict(vocab)
        queries = [
            'weapon', 'christian', 'compani', 'israel', 'law', 'hockey',
            'comput', 'space'
        ]
        words = []
        for each in queries:
            words.append(get_similar_words(ae, vocab[each], revocab, topn=11))
        write_file(words, args.sample_words)
        print('Saved sample words file to %s' % args.sample_words)
    if args.translate_words:
        revocab = revdict(vocab)
        queries = [['father', 'man', 'woman'], ['mother', 'woman', 'man']]
        for each in queries:
            print(each)
            print(translate_words(ae, each, vocab, revocab, topn=10))
    if args.calc_distinct:
        # mean, std = calc_pairwise_cosine(ae)
        # print('Average pairwise angle (pi): %s (%s)' % (mean / math.pi, std / math.pi))
        sd = calc_pairwise_dev(ae)
        print('Average squared deviation from 0 (90 degree): %s' % sd)
Example no. 9
def gen_docs(args):
    #corpus = load_corpus("./data/20news/output/test.corpus")
    corpus = load_corpus(args.input_corpus)
    vocab, docs = corpus['vocab'], corpus['docs']
    n_vocab = len(vocab)

    #new docs
    new_docs = {}

    for doc_key in docs.keys():
        if doc_key.startswith(args.startswith):
            new_docs[doc_key] = docs[doc_key]

    print("{},共有文档:{}".format(args.startswith, len(new_docs)))

    dump_json({"vocab": vocab, "docs": new_docs}, args.output)
Example no. 10
def test(args):
    corpus = load_corpus(args.corpus)
    vocab, docs = corpus['vocab'], corpus['docs']
    doc_bow = {}
    for k in docs.keys():
        bows = []
        for idx, count in docs[k].iteritems():
            bows.append((int(idx), count))
        doc_bow[k] = bows
        del docs[k]

    lda = load_model(args.load_model)
    generate_doc_codes(lda, doc_bow, args.output)
    print 'Saved doc codes file to %s' % args.output

    if args.word_clouds:
        queries = ['interest', 'trust', 'cash', 'payment', 'rate', 'price', 'stock', 'share', 'award', 'risk', 'security', 'bank', 'company',\
             'service', 'grant', 'agreement', 'proxy', 'loan', 'capital', 'asset', 'bonus', 'shareholder', 'income', 'financial', 'net', 'purchase',\
             'position', 'management', 'loss', 'salary', 'stockholder', 'due', 'business', 'transaction', 'govern', 'trading',\
             'tax', 'march', 'june']
        # queries = ['interest', 'trust', 'cash', 'payment', 'rate', 'price', 'stock', 'share', \
        #      'award', 'risk', 'security', 'bank', 'company', 'service', 'grant', 'agreement', \
        #      'proxy', 'loan', 'capital', 'asset', 'bonus', 'shareholder', 'income', 'financial', \
        #      'net', 'purchase', 'position', 'management', 'loss', 'salary', 'stockholder', 'due', \
        #      'business', 'transaction', 'govern', 'trading', 'tax', 'three', 'four', 'five', \
        #      'eleven', 'thirteen', 'fifteen', 'eighteen', 'twenty']
        weights = lda.state.get_lambda()
        weights = np.apply_along_axis(lambda x: x / x.sum(), 1,
                                      weights)  # get dist.
        # weights = unitmatrix(weights, axis=1) # normalize
        word_cloud(weights.T, vocab, queries, save_file=args.word_clouds)

        print 'Saved word clouds file to %s' % args.word_clouds

    if args.save_topics:
        topics_prob = show_topics_prob(lda)
        save_topics_prob(topics_prob, args.save_topics)
        # topics = show_topics(lda)
        # write_file(topics, args.save_topics)
        print 'Saved topics file to %s' % args.save_topics

    if args.calc_distinct:
        # mean, std = calc_pairwise_cosine(lda)
        # print 'Average pairwise angle (pi): %s (%s)' % (mean / math.pi, std / math.pi)
        sd = calc_pairwise_dev(lda)
        print 'Average squared deviation from 0 (90 degree): %s' % sd
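
`generate_doc_codes`, `show_topics_prob`, and `calc_pairwise_dev` come from the project's own modules. With a gensim `LdaModel`, generating per-document topic codes could look roughly like the sketch below (an assumption about the helper's behavior, not its actual code):

import json

def generate_doc_codes_sketch(lda, doc_bow, output_path):
    # Map each document key to its dense topic distribution and dump to JSON.
    codes = {}
    for name, bow in doc_bow.items():
        dist = [0.0] * lda.num_topics
        # gensim returns a sparse list of (topic_id, probability) pairs
        for topic_id, prob in lda.get_document_topics(bow, minimum_probability=0.0):
            dist[topic_id] = float(prob)
        codes[name] = dist
    with open(output_path, 'w') as f:
        json.dump(codes, f)
    return codes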
Example no. 11
def test(args):
    corpus = load_corpus(args.input)
    vocab, docs = corpus['vocab'], corpus['docs']
    n_vocab = len(vocab)

    doc_keys = docs.keys()
    X_docs = []
    for k in doc_keys:
        X_docs.append(vecnorm(doc2vec(docs[k], n_vocab), 'logmax1', 0))
        del docs[k]
    X_docs = np.r_[X_docs]

    vae = load_vae_model(args.load_model)

    doc_codes = vae.predict(X_docs)
    dump_json(dict(zip(doc_keys, doc_codes.tolist())), args.output)
    print 'Saved doc codes file to %s' % args.output
Example no. 12
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--corpus',
                        required=True,
                        type=str,
                        help='path to the corpus file')
    parser.add_argument('-mf',
                        '--mod_file',
                        required=True,
                        type=str,
                        help='path to the word2vec mod file')
    parser.add_argument('-sw',
                        '--sample_words',
                        type=str,
                        help='path to the output sample words file')
    parser.add_argument('-o',
                        '--output',
                        type=str,
                        help='path to the output doc codes file')
    args = parser.parse_args()

    corpus = load_corpus(args.corpus)
    docs, vocab_dict = corpus['docs'], corpus['vocab']
    w2v = load_w2v(args.mod_file)

    # doc_codes = doc_word2vec(w2v, docs, revdict(vocab_dict), args.output, avg=True)
    if args.sample_words:
        queries = [
            'weapon', 'christian', 'compani', 'israel', 'law', 'hockey',
            'comput', 'space'
        ]
        words = []
        for each in queries:
            words.append(get_similar_words(w2v, each, topn=5))
        write_file(words, args.sample_words)
        print('Saved sample words file to %s' % args.sample_words)

    import pdb
    pdb.set_trace()
Example no. 13
def train(args):
    corpus = load_corpus(args.corpus)
    docs, vocab_dict = corpus['docs'], corpus['vocab']
    doc_bow = []
    doc_keys = docs.keys()
    for k in doc_keys:
        bows = []
        for idx, count in docs[k].iteritems():
            bows.append((int(idx), count))
        doc_bow.append(bows)
        del docs[k]
    vocab_dict = dict([(int(y), x) for x, y in vocab_dict.iteritems()])

    n_samples = len(doc_bow)
    doc_bow = np.array(doc_bow)
    np.random.seed(0)
    val_idx = np.random.choice(range(n_samples), args.n_val, replace=False)
    train_idx = list(set(range(n_samples)) - set(val_idx))
    dbow_train = doc_bow[train_idx].tolist()
    dbow_val = doc_bow[val_idx].tolist()
    del doc_bow

    start = timeit.default_timer()
    lda = train_lda(dbow_train, vocab_dict, args.n_topics, args.n_iter,
                    args.save_model)
    print 'runtime: %ss' % (timeit.default_timer() - start)

    if args.output:
        doc_keys = np.array(doc_keys)
        generate_doc_codes(lda,
                           dict(zip(doc_keys[train_idx].tolist(), dbow_train)),
                           args.output + '.train')
        generate_doc_codes(lda, dict(zip(doc_keys[val_idx].tolist(),
                                         dbow_val)), args.output + '.val')
        print 'Saved doc codes file to %s and %s' % (args.output + '.train',
                                                     args.output + '.val')
Example no. 14
def train(args):
    corpus = load_corpus(args.input)
    n_vocab, docs = len(corpus['vocab']), corpus['docs']
    corpus.clear()  # save memory
    doc_keys = docs.keys()
    X_docs = []
    for k in doc_keys:
        X_docs.append(vecnorm(doc2vec(docs[k], n_vocab), 'logmax1', 0))
        del docs[k]
    X_docs = np.r_[X_docs]

    if args.noise == 'gs':
        X_docs_noisy = add_gaussian_noise(X_docs, 0.1)
    elif args.noise == 'sp':
        X_docs_noisy = add_salt_pepper_noise(X_docs, 0.1)
    elif args.noise == 'mn':
        X_docs_noisy = add_masking_noise(X_docs, 0.01)
    else:
        pass

    n_samples = X_docs.shape[0]
    np.random.seed(0)
    val_idx = np.random.choice(range(n_samples), args.n_val, replace=False)
    train_idx = list(set(range(n_samples)) - set(val_idx))
    X_train = X_docs[train_idx]
    X_val = X_docs[val_idx]
    del X_docs

    if args.noise:
        # X_train_noisy = X_docs_noisy[:-n_val]
        # X_val_noisy = X_docs_noisy[-n_val:]
        X_train_noisy = X_docs_noisy[train_idx]
        X_val_noisy = X_docs_noisy[val_idx]
        print 'added %s noise' % args.noise
    else:
        X_train_noisy = X_train
        X_val_noisy = X_val

    start = timeit.default_timer()

    ae = AutoEncoder(n_vocab,
                     args.n_dim,
                     comp_topk=args.comp_topk,
                     ctype=args.ctype,
                     save_model=args.save_model)
    ae.fit([X_train_noisy, X_train], [X_val_noisy, X_val], nb_epoch=args.n_epoch, \
            batch_size=args.batch_size, contractive=args.contractive)

    print 'runtime: %ss' % (timeit.default_timer() - start)

    if args.output:
        train_doc_codes = ae.encoder.predict(X_train)
        val_doc_codes = ae.encoder.predict(X_val)
        doc_keys = np.array(doc_keys)
        dump_json(
            dict(zip(doc_keys[train_idx].tolist(), train_doc_codes.tolist())),
            args.output + '.train')
        dump_json(
            dict(zip(doc_keys[val_idx].tolist(), val_doc_codes.tolist())),
            args.output + '.val')
        print 'Saved doc codes file to %s and %s' % (args.output + '.train',
                                                     args.output + '.val')
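
The corruption helpers `add_gaussian_noise`, `add_salt_pepper_noise`, and `add_masking_noise` are imported from elsewhere. For reference, minimal numpy sketches of the standard denoising-autoencoder corruptions they presumably implement (function names and details are assumptions):

import numpy as np

def add_gaussian_noise_sketch(X, scale):
    # Additive zero-mean Gaussian noise.
    return X + np.random.normal(0.0, scale, X.shape)

def add_masking_noise_sketch(X, frac):
    # Zero out a random fraction of entries (masking noise).
    mask = np.random.rand(*X.shape) >= frac
    return X * mask

def add_salt_pepper_noise_sketch(X, frac):
    # Drive a random fraction of entries to the matrix min or max.
    X = X.copy()
    noisy = np.random.rand(*X.shape) < frac
    salt = np.random.rand(*X.shape) < 0.5
    X[noisy & salt] = X.max()
    X[noisy & ~salt] = X.min()
    return X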
Example no. 15
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('train_doc_codes',
                        type=str,
                        help='path to the train doc codes file')
    parser.add_argument('train_doc_labels',
                        type=str,
                        help='path to the train doc labels file')
    parser.add_argument('test_doc_codes',
                        type=str,
                        help='path to the test doc codes file')
    parser.add_argument('test_doc_labels',
                        type=str,
                        help='path to the test doc labels file')
    parser.add_argument('-nv',
                        '--n_val',
                        type=int,
                        default=1000,
                        help='size of validation set (default 1000)')
    parser.add_argument(
        '-qi',
        '--query_info',
        type=str,
        help='path to the query corpus (for getting doc length info)')
    parser.add_argument('-ml',
                        '--multilabel',
                        action='store_true',
                        help='multilabel flag')
    args = parser.parse_args()

    # autoencoder
    train_doc_codes = load_json(args.train_doc_codes)
    train_doc_labels = load_json(args.train_doc_labels)
    test_doc_codes = load_json(args.test_doc_codes)
    test_doc_labels = load_json(args.test_doc_labels)
    X_train = np.r_[train_doc_codes.values()]
    Y_train = np.array([train_doc_labels[i] for i in train_doc_codes])
    X_test = np.r_[test_doc_codes.values()]
    Y_test = np.array([test_doc_labels[i] for i in test_doc_codes])

    # # DocNADE
    # train_doc_codes = load_json(args.train_doc_codes)
    # train_doc_labels = load_json(args.train_doc_labels)
    # test_doc_codes = load_json(args.test_doc_codes)
    # test_doc_labels = load_json(args.test_doc_labels)
    # X_train = []
    # for each in train_doc_codes.values():
    #     X_train.append([float(x) for x in each])
    # X_test = []
    # for each in test_doc_codes.values():
    #     X_test.append([float(x) for x in each])

    # X_train = np.r_[X_train]
    # Y_train = np.array([train_doc_labels[i] for i in train_doc_codes])
    # X_test = np.r_[X_test]
    # Y_test = np.array([test_doc_labels[i] for i in test_doc_codes])

    # # DBN
    # X_train = np.array(load_marshal(args.train_doc_codes))
    # Y_train = np.array(load_marshal(args.train_doc_labels))
    # X_test = np.array(load_marshal(args.test_doc_codes))
    # Y_test = np.array(load_marshal(args.test_doc_labels))

    seed = 7
    np.random.seed(seed)
    val_idx = np.random.choice(range(X_train.shape[0]),
                               args.n_val,
                               replace=False)
    train_idx = list(set(range(X_train.shape[0])) - set(val_idx))
    X_new_train = X_train[train_idx]
    Y_new_train = Y_train[train_idx]
    X_new_val = X_train[val_idx]
    Y_new_val = Y_train[val_idx]
    print 'train: %s, val: %s, test: %s' % (
        X_new_train.shape[0], X_new_val.shape[0], X_test.shape[0])

    results = retrieval(X_new_train, Y_new_train, X_new_val, Y_new_val,\
                        fractions=[0.001], multilabel=args.multilabel)
    print 'precision on val set: %s' % results

    if not args.query_info:
        results = retrieval(X_train, Y_train, X_test, Y_test,\
                        fractions=[0.001, 0.002, 0.005, 0.01, 0.02, 0.05, 0.1, 0.2, 0.5, 1.0], multilabel=args.multilabel)
    else:
        query_docs = load_corpus(args.query_info)['docs']
        len_test = [sum(query_docs[i].values()) for i in test_doc_codes]
        results = retrieval_by_doclength(X_train,
                                         Y_train,
                                         X_test,
                                         Y_test,
                                         len_test,
                                         fraction=0.001,
                                         multilabel=args.multilabel)
    print 'precision on test set: %s' % results
    import pdb
    pdb.set_trace()
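
`retrieval` and `retrieval_by_doclength` are project helpers. The usual metric behind them ranks the training documents by cosine similarity to each query document and reports the average fraction of same-label documents within the top `fraction * n_train` results. A rough single-label sketch of that evaluation (an assumption about the helper's behavior):

import numpy as np

def retrieval_sketch(X_train, Y_train, X_query, Y_query, fractions=(0.001,)):
    # Precision@fraction: share of same-label docs among each query's nearest training docs.
    def unit(X):
        norms = np.linalg.norm(X, axis=1, keepdims=True)
        return X / np.maximum(norms, 1e-12)

    sims = unit(X_query).dot(unit(X_train).T)   # (n_query, n_train) cosine similarities
    order = np.argsort(-sims, axis=1)           # best match first
    results = {}
    for frac in fractions:
        k = max(1, int(frac * X_train.shape[0]))
        hits = Y_train[order[:, :k]] == Y_query[:, None]
        results[frac] = float(hits.mean())
    return results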
Example no. 16
def train(args):
    corpus = load_corpus(args.input)
    n_vocab, docs = len(corpus['vocab']), corpus['docs']
    # vocab = corpus['vocab']
    corpus.clear()  # save memory
    doc_keys = docs.keys()
    X_docs = []
    for k in doc_keys:
        X_docs.append(vecnorm(doc2vec(docs[k], n_vocab), 'logmax1', 0))
        del docs[k]
    X_docs = np.r_[X_docs]

    if args.noise == 'gs':
        X_docs_noisy = add_gaussian_noise(X_docs, 0.1)
    elif args.noise == 'sp':
        X_docs_noisy = add_salt_pepper_noise(X_docs, 0.1)
    elif args.noise == 'mn':
        X_docs_noisy = add_masking_noise(X_docs, 0.01)
    else:
        pass

    n_samples = X_docs.shape[0]
    np.random.seed(0)
    val_idx = np.random.choice(range(n_samples), args.n_val, replace=False)
    train_idx = list(set(range(n_samples)) - set(val_idx))
    X_train = X_docs[train_idx]
    X_val = X_docs[val_idx]
    del X_docs

    if args.noise:
        # X_train_noisy = X_docs_noisy[:-n_val]
        # X_val_noisy = X_docs_noisy[-n_val:]
        X_train_noisy = X_docs_noisy[train_idx]
        X_val_noisy = X_docs_noisy[val_idx]
        print 'added %s noise' % args.noise
    else:
        X_train_noisy = X_train
        X_val_noisy = X_val

    start = timeit.default_timer()

    ae = AutoEncoder(n_vocab,
                     args.n_dim,
                     comp_topk=args.comp_topk,
                     ctype=args.ctype,
                     save_model=args.save_model)
    ae.fit([X_train_noisy, X_train], [X_val_noisy, X_val], nb_epoch=args.n_epoch, \
            batch_size=args.batch_size, contractive=args.contractive)

    print 'runtime: %ss' % (timeit.default_timer() - start)

    if args.output:
        train_doc_codes = ae.encoder.predict(X_train)
        val_doc_codes = ae.encoder.predict(X_val)
        doc_keys = np.array(doc_keys)
        dump_json(
            dict(zip(doc_keys[train_idx].tolist(), train_doc_codes.tolist())),
            args.output + '.train')
        dump_json(
            dict(zip(doc_keys[val_idx].tolist(), val_doc_codes.tolist())),
            args.output + '.val')
        print 'Saved doc codes file to %s and %s' % (args.output + '.train',
                                                     args.output + '.val')

    def unitmatrix(matrix, norm='l2', axis=1):
        if norm == 'l1':
            maxtrixlen = np.sum(np.abs(matrix), axis=axis)
        if norm == 'l2':
            maxtrixlen = np.linalg.norm(matrix, axis=axis)

        if np.any(maxtrixlen <= 0):
            return matrix
        else:
            maxtrixlen = maxtrixlen.reshape(
                1, len(maxtrixlen)) if axis == 0 else maxtrixlen.reshape(
                    len(maxtrixlen), 1)
            return matrix / maxtrixlen

    def calc_pairwise_dev(weights):
        # the average squared deviation from 0 (90 degree)
        weights = unitmatrix(weights, axis=0)  # normalize
        n = weights.shape[1]
        score = 0.
        for i in range(n):
            for j in range(i + 1, n):
                score += (weights[:, i].dot(weights[:, j]))**2

        return np.sqrt(2. * score / n / (n - 1))

    from keras.models import load_model
Example no. 17
def kmeans2(args):
    sentense_vec_dic = load_corpus(args.input)
    vec_name_u = load_corpus(args.question_name)
    print("if sentense_vec is a dict:")
    print(isinstance(sentense_vec_dic,dict))
    print("if vec_name is a ls:")
    print(isinstance(vec_name_u,list))
    vec = []
    vec_name = []

    for key in vec_name_u:
        filename = key.encode('utf-8')
        if filename in sentense_vec_dic.keys():
            vec.append(sentense_vec_dic[filename])
            vec_name.append(filename)

    print "file number is ", len(vec_name)
    sentense_vec_X = np.array(vec)

    print "doing k-means...."
    kmeans = KMeans(n_clusters=args.cluster_num, random_state=0).fit(sentense_vec_X)

    print "generate label"
    label_ls = kmeans.labels_

    filename_label_dic = {}
    filesize = len(vec_name)
    for i in range(filesize):
        filename_label_dic[vec_name[i]] = label_ls[i]
    text_filename = args.text_file
    filename_text_dict = {}
    try:
        fp =  open(text_filename, 'r')
        count_doc = 0
        while 1:
            lines = fp.readlines()
            if not lines:
                break
            for sentense in lines:
                # print(sentense)
                text = sentense.decode('utf-8').strip('\r\n')
                count_doc += 1
                doc_name = 'line-' + str(count_doc)
                filename_text_dict[doc_name] = text
    except Exception as e:
        raise e

    label_text_ls = []
    for i in range(args.cluster_num):
        ls = []
        label_text_ls.append(ls)

    for key in filename_label_dic:
        label = filename_label_dic[key]
        content = filename_text_dict[key]
        # print 'content of ', content, 'and the label is [', label, ']'
        label_text_ls[label].append(content)


    file_dict = {}
    for i in range(args.cluster_num):
        filename_o = args.output_dir + 'label-' + str(i) + '.txt'
        print 'filename =' , filename_o
        file_o = open(filename_o, 'w')
        for text in label_text_ls[i]:
            text += '\n'
            file_o.write(text.encode('utf-8'))
        file_o.close()
Example no. 18
def get_word_relationship(args):
    corpus = load_corpus(args.input_corpus)
    doc_vec_dict = corpus['docs']
    vocab_dict = corpus['vocab']
    print 'Load corpus'

    # we have to invert the dict
    dictionary = dict((v,k) for k, v in vocab_dict.iteritems())

    # Here the input top words path is the json file of the label-topwords_ls;
    # it should be a dict where each key is a label and its value is the list of top words
    top_words_path = args.input_topwords
    label_topwordls = load_json(top_words_path)
    print 'Load top words of each label'

    label_topwords_vocabnum_dict = {}
    label_topwordindexls_dict = {}
    for label in label_topwordls:
        label_topwords_vocabnum_dict[label] = {}
        topwords_index_ls = []
        for word in label_topwordls[label]:
            topwords_index_ls.append(word)
            label_topwords_vocabnum_dict[label][word] = {}
        label_topwordindexls_dict[label] = topwords_index_ls

    print 'Finish changing words into indices'

    # in order to save memory and speed things up, only the word-word frequencies of words
    # in the top word list are calculated

    for label in label_topwordindexls_dict:
        print 'Doing label', str(label)
        topwords_idx_set = set(label_topwordindexls_dict[label])

        for filename in doc_vec_dict:
            word_vec_dict = doc_vec_dict[filename]
            result_word_ls = get_word_list(word_vec_dict, topwords_idx_set)
            for word in result_word_ls:
                for doc_word in word_vec_dict:
                    try:
                        label_topwords_vocabnum_dict[label][word][doc_word] += word_vec_dict[doc_word]
                    except:
                        label_topwords_vocabnum_dict[label][word][doc_word] = word_vec_dict[doc_word]

    print 'Finish building the dict of label-topwords-words-num!'

    # now we should get the top of words

    topn = args.topn

    # it is a dict-dict-ls ({label:{words:[top_relative words]}})
    label_topwords_relativewords = {}
    for label in label_topwords_vocabnum_dict:
        label_topwords_relativewords[label] = {}
        for word in label_topwords_vocabnum_dict[label]:
            vocab_num_dict = label_topwords_vocabnum_dict[label][word]
            label_topwords_relativewords[label][word] = sorted(vocab_num_dict,
                key=vocab_num_dict.__getitem__, reverse = True)[:topn]

    print 'Finish sorting the top n words'

    dump_json(label_topwords_relativewords, args.output_json)
    print 'Finish writing the json file'

    for label in label_topwords_relativewords:
        filename_o = args.output_dir + 'label-' + str(label) + '.txt'
        print 'filename =' , filename_o
        file_o = open(filename_o, 'w')
        for word_index in label_topwords_relativewords[label]:
            # print 'Is word_index a int:', isinstance(word_index, int)
            text = dictionary[int(word_index)]
            text += ': '
            for top_relative_wordidx in label_topwords_relativewords[label][word_index]:
                text += dictionary[int(top_relative_wordidx)]
                text += ', '
            text += '\n'
            file_o.write(text.encode('utf-8'))
        file_o.close()
    print 'Finish writing files!'
Example no. 19
    def learn_embedding(self, graph=None, edge_f=None, is_weighted=False, no_python=False, path_output="", dataset=""):

        n_dim = self._d
        method = "sdne"
        input = path_output + '/train.corpus'
        path_graph_embedding = path_source + "embedding/" + dataset + "/embedding_gem_sdne_" + dataset + "_" + str(
            n_dim) + ".txt"
        path_graph_embedding_id = path_source + "embedding/" + dataset + "/id_gem_" + method + "_" + dataset + "_" + str(
            n_dim) + ".txt"

        save_model = 'model'
        optimizer = "adadelta"

        val_split = 0.0214

        batch_size = self._batch_size
        comp_topk = self._comp_topk
        optimizer = self._optimizer
        lr = self._lr
        alpha = self._alpha
        kfactor = self._kfactor
        gamma = self._gamma
        select_diff = self._select_diff
        select_loss = self._select_loss
        select_graph_np_diff = self._select_graph_np_diff

        contractive = None
        ctype = "kcomp"
        n_dim = 128
        nb_epoch = 1000
        save_model = 'model'

        if not graph and not edge_f:
            raise Exception('graph/edge_f needed')
        if not graph:
            graph = graph_util.loadGraphFromEdgeListTxt(edge_f)
        num_nodes = graph.number_of_nodes()
        graph3 = nx.DiGraph()
        graph3.add_nodes_from(range(0, num_nodes))
        f1 = csv.reader(open(edge_f, "r"), delimiter=' ')
        for x, y in f1:
            # print(x,y)
            graph3.add_edge(int(x), int(y))
        S = nx.to_scipy_sparse_matrix(graph, nodelist=sorted(graph.nodes()))
        t1 = time()
        S = (S + S.T) / 2
        node_num = graph.number_of_nodes()
        edges_num = graph.number_of_edges()
        dict_nodes = {k: v for v, k in enumerate(sorted(graph.nodes()))}

        ## Load Graph Embeddings
        if (path_graph_embedding.endswith(".txt")):
            print("Loading SDNE embeddings")
            graph_embeddings = np.loadtxt(path_graph_embedding, delimiter=',')
            with open(path_graph_embedding_id) as temp_file:
                graph_embedding_id = [line.rstrip('\n') for line in temp_file]
            dict_graph = {k: v for v, k in enumerate(graph_embedding_id)}

        else:
            raise Exception('sdne embeddings do not exist')
            # unreachable after the raise:
            # graph_embeddings = pickle.load(open(path_graph_embedding, "rb"))

        ## Load text data
        print("Loading textual corpus")
        corpus = load_corpus(input)
        n_vocab = len(corpus['vocab'])
        docs = corpus['docs']
        corpus.clear()  # save memory
        doc_keys = np.array(list(docs))
        dict_doc = {int(k): v for v, k in enumerate((doc_keys))}

        X_docs = []
        for k in list(docs):
            X_docs.append(vecnorm(doc2vec(docs[k], n_vocab), 'logmax1', 0))
            del docs[k]
        X_docs = np.r_[X_docs]
        # dump_json(dict(zip(doc_keys.tolist(), X_docs.tolist())), path_source+'embedding\\'+dataset+'\\bow.txt')

        text_vector = self.get_node_representation(graph, X_docs, dict_doc)
        graph_vector = self.get_node_representation(graph, graph_embeddings, dict_nodes)

        # return S,node_num,edges_num,graph_embeddings, X_docs,n_vocab, doc_keys, text_vector, graph_vector

        train_data = [text_vector, text_vector, graph_vector]

        result, _Y, model = fit_quadruple_hyperas(n_vocab, n_dim, comp_topk=comp_topk, ctype=ctype,
                                                  save_model=save_model,
                                                  kfactor=kfactor, alpha=alpha, gamma=gamma, num_nodes=node_num,
                                                  num_edges=edges_num,
                                                  train_data=train_data, test_data=X_docs, val_split=val_split,
                                                  nb_epoch=nb_epoch, \
                                                  batch_size=batch_size, contractive=contractive, optimizer=optimizer,
                                                  lr=lr,
                                                  select_diff=select_diff, select_loss=select_loss,
                                                  select_graph_np_diff=select_graph_np_diff)

        dump_json(dict(zip(doc_keys.tolist(), _Y.tolist())),
                  path_source + 'embedding\\' + dataset + '\\predicted_cage_embedding.txt')
        print('Saved doc codes file')

        self._Y = _Y
        self._node_num = node_num
        self._X = X_docs
        _Y_id = doc_keys.tolist()

        return _Y, _Y_id, len(result.history["loss"]), t1
Example no. 20
def kmeans(args):
    sentense_vec_dic = load_corpus(args.input)
    print("if sentense_vec is a dict:")
    print(isinstance(sentense_vec_dic, dict))
    vec = []
    vec_name = []
    for key in sentense_vec_dic:
        vec.append(sentense_vec_dic[key])
        vec_name.append(key)
    print "dict size is ", len(sentense_vec_dic)
    sentense_vec_X = np.array(vec)

    print "doing k-means...."
    if args.is_large_set:
        print "Do it in large data set"
        kmeans = MiniBatchKMeans(n_clusters=args.cluster_num,
                                 random_state=0).fit(sentense_vec_X)
    else:
        print "Do it in small data set"
        kmeans = KMeans(n_clusters=args.cluster_num,
                        random_state=0).fit(sentense_vec_X)

    print "generate label"
    label_ls = kmeans.labels_

    filename_label_dic = {}
    filesize = len(sentense_vec_dic)
    for i in range(filesize):
        filename_label_dic[vec_name[i]] = int(label_ls[i])

    if args.output_json:
        print 'Write the label to the json file'
        dump_json(filename_label_dic, args.output_json)
        # with io.open(args.output_json, 'w', encoding='utf-8') as f:
        #     f.write(json.dumps(filename_label_dic, ensure_ascii=False))
        print 'Finish writing filename_label dict to file'

    text_filename = args.text_file
    filename_text_dict = {}
    try:
        fp = open(text_filename, 'r')
        count_doc = 0
        while 1:
            lines = fp.readlines()
            if not lines:
                break
            for sentense in lines:
                # print(sentense)
                text = sentense.decode('utf-8').strip('\r\n')
                count_doc += 1
                doc_name = 'line-' + str(count_doc)
                filename_text_dict[doc_name] = text
    except Exception as e:
        raise e

    label_text_ls = []
    for i in range(args.cluster_num):
        ls = []
        label_text_ls.append(ls)

    for key in filename_label_dic:
        label = filename_label_dic[key]
        content = filename_text_dict[key]
        # print 'content of ', content, 'and the label is [', label, ']'
        label_text_ls[label].append(content)

    file_dict = {}
    for i in range(args.cluster_num):
        filename_o = args.output_dir + 'label-' + str(i) + '.txt'
        print 'filename =', filename_o
        file_o = open(filename_o, 'w')
        for text in label_text_ls[i]:
            text += '\n'
            file_o.write(text.encode('utf-8'))
        file_o.close()
Example no. 21
def test(args):
    corpus = load_corpus(args.input)
    vocab, docs = corpus['vocab'], corpus['docs']
    n_vocab = len(vocab)

    doc_keys = list(docs.keys())
    X_docs = []
    for k in doc_keys:
        X_docs.append(vecnorm(doc2vec(docs[k], n_vocab), 'logmax1', 0))
        del docs[k]
    X_docs = np.r_[X_docs]

    ae = load_ae_model(args.load_model)
    doc_codes = ae.predict(X_docs)
    dump_json(dict(zip(doc_keys, doc_codes.tolist())), args.output)
    print('Saved doc codes file to %s' % args.output)

    if args.save_topics:
        topics_strength = get_topics_strength(ae, revdict(vocab), topn=10)
        save_topics_strength(topics_strength, args.save_topics)
        # topics = get_topics(ae, revdict(vocab), topn=10)
        # write_file(topics, args.save_topics)
        print('Saved topics file to %s' % args.save_topics)

    if args.word_clouds:
        queries = [
            'interest', 'trust', 'cash', 'payment', 'rate', 'price', 'stock',
            'share', 'award', 'risk', 'security', 'bank', 'company', 'service',
            'grant', 'agreement', 'proxy', 'loan', 'capital', 'asset', 'bonus',
            'shareholder', 'income', 'financial', 'net', 'purchase',
            'position', 'management', 'loss', 'salary', 'stockholder', 'due',
            'business', 'transaction', 'govern', 'trading', 'tax', 'march',
            'april', 'june', 'july'
        ]
        weights = ae.get_weights()[0]
        weights = unitmatrix(weights)  # normalize
        word_cloud(weights, vocab, queries, save_file=args.word_clouds)

        print('Saved word clouds file to %s' % args.word_clouds)

    if args.sample_words:
        revocab = revdict(vocab)
        while True:
            print("----------------------------\n? ", end='')
            sys.stdout.flush()
            query = sys.stdin.readline()
            query = re.sub(r'[^\w\s-]', ' ',
                           query)  # remove punctuations except hyphen
            query_words = []
            for word in query.lower().split():  # convert to lowercase
                if word not in stopwords.words('english'):  # remove stop words
                    query_words.append(word)

            # ===== make the query length to be (32) = times_steps size
            """long_enough = False
                while not long_enough:
                        for word in query_words:
                                query_vectors.append(word2vec_map[word])
                                if len(query_vectors) == 32:
                                        long_enough = True
                                        break"""
            words = []
            for each in query_words:
                words.append(
                    get_similar_words(ae, vocab[each], revocab, topn=11))
            write_file(words, args.sample_words)
            print('Saved sample words file to %s' % args.sample_words)
    if args.translate_words:
        revocab = revdict(vocab)
        queries = [['father', 'man', 'woman'], ['mother', 'woman', 'man']]
        for each in queries:
            print(each)
            print(translate_words(ae, each, vocab, revocab, topn=10))
    if args.calc_distinct:
        # mean, std = calc_pairwise_cosine(ae)
        # print 'Average pairwise angle (pi): %s (%s)' % (mean / math.pi, std / math.pi)
        sd = calc_pairwise_dev(ae)
        print('Average squared deviation from 0 (90 degree): %s' % sd)