Example #1
def train_model(keyword2idx, idx2keyword, word2cnt, author2tags, model):
    # Stack the pretrained keyword embeddings column-wise as fixed entity vectors.
    evs = np.zeros((DIMENSION, len(idx2keyword)))
    for i in range(len(idx2keyword)):
        evs[:, i] = model[idx2keyword[i]]

    # Hyperparameters for the neural tensor network.
    params = {
        "embedding_size": DIMENSION,
        "num_entities": len(idx2keyword),
        "slice_size": 4,
        "lamda": 1e-2,
        "batch_size": 1000,
        "corrupt_size": 10,
        "num_iterations": 500,
        "save_period": 10,
        "batch_iterations": 5,
        "ev_fixed": True,
        "threshold": -0.0,
        "save_file": "ntn_model.dump",
    }
    # Resume (or initialize) the network from the dump file, keeping the entity
    # vectors fixed, then train on (author, keyword) pairs.
    network = ntn.my_neural_tensor_network(params, init_evs=evs, load_file="ntn_model.dump")
    data = get_training_data(author2tags, keyword2idx, model, word2cnt)
    network.train(data)
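
For context, a minimal driver for train_model might look like the sketch below. Word2Vec.load and run.get_reverse_index follow Example #2; load_author2tags and load_word_counts are hypothetical placeholders for whatever loaders the project actually uses.

def main():
    # Load the pretrained embedding model and the keyword <-> index mappings.
    model = gensim.models.Word2Vec.load('../embedding/author_word.model')
    keyword2idx, idx2keyword = run.get_reverse_index(model)
    author2tags = load_author2tags()   # hypothetical: author -> list of keyword tags
    word2cnt = load_word_counts()      # hypothetical: keyword -> corpus frequency
    train_model(keyword2idx, idx2keyword, word2cnt, author2tags, model)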
Example #2
def test():
    # Load the pretrained word2vec model and the keyword <-> index mappings.
    model = gensim.models.Word2Vec.load('../embedding/author_word.model')
    keyword2idx, idx2keyword = run.get_reverse_index(model)

    # Stack the pretrained keyword embeddings column-wise as fixed entity vectors.
    evs = np.zeros((DIMENSION, len(idx2keyword)))
    for i in range(len(idx2keyword)):
        evs[:, i] = model[idx2keyword[i]]

    logging.info('loading neural tensor network')
    params = {'embedding_size': DIMENSION, 'num_entities': len(idx2keyword), 'slice_size': 4,
              'lamda': 1e-2, 'batch_size': 1000, 'corrupt_size': 10, 'num_iterations': 500,
              'save_period': 10, 'batch_iterations': 5, 'ev_fixed': True, 'threshold': -0.0,
              'save_file': 'ntn_model.dump'}
    network = ntn.my_neural_tensor_network(params, init_evs=evs, load_file='ntn_model.dump')

    for jconf in ['KDD', 'ICML']:
        persons = load_persons(jconf)
        author2words = load_author2words(jconf)
        word2label = load_word_label(jconf)
        # Collect (author_id, word_id) pairs and their gold labels for every
        # labelled keyword of every author at this conference.
        test_data, test_label = [], []
        for author in persons:
            words = author2words[author]
            for word in words:
                if word not in word2label or word not in keyword2idx:
                    continue
                author_id, word_id = keyword2idx['A_' + str(author)], keyword2idx[word]
                test_data.append([author_id, word_id])
                label = 1 if word2label[word] == 1 else 0
                test_label.append(label)
        test_data = np.array(test_data, dtype=np.int32)

        logging.info('predicting %s' % jconf)

        # Score each (author, word) pair and dump "author,word,gold_label,score" rows.
        p_data = network.predict(test_data, score=True)
        with open('ntn_predict_' + jconf + '.out', 'w') as fout:
            for i in range(len(test_label)):
                author = idx2keyword[test_data[i][0]]
                word = idx2keyword[test_data[i][1]]
                fout.write(",".join([author, word, str(test_label[i]), str(p_data[i])]) + '\n')
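
Each output row is author,word,gold_label,score, so the dump can be scored offline. Below is a minimal sketch, assuming the author and word fields contain no commas and that a 0.0 score cut-off (mirroring params['threshold']) is sensible; evaluate_predictions is not part of the original code.

def evaluate_predictions(path, threshold=0.0):
    # Read "author,word,gold_label,score" rows and compute precision / recall
    # at a fixed score threshold (the 0.0 default is an assumption).
    tp = fp = fn = 0
    with open(path) as fin:
        for line in fin:
            author, word, label, score = line.strip().split(',')
            gold, pred = int(label), float(score) >= threshold
            if pred and gold == 1:
                tp += 1
            elif pred and gold == 0:
                fp += 1
            elif not pred and gold == 1:
                fn += 1
    precision = tp / float(tp + fp) if tp + fp else 0.0
    recall = tp / float(tp + fn) if tp + fn else 0.0
    return precision, recall

precision, recall = evaluate_predictions('ntn_predict_KDD.out')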