def train_model(keyword2idx, idx2keyword, word2cnt, author2tags, model):
    """Train the neural tensor network on author/tag data.

    A ``DIMENSION x len(idx2keyword)`` matrix is filled column by column with
    ``model[idx2keyword[i]]`` and handed to ``ntn.my_neural_tensor_network``
    as ``init_evs``.  The network is then trained on the output of
    ``get_training_data``.

    Args:
        keyword2idx: Keyword -> index mapping; only forwarded to
            ``get_training_data`` here.
        idx2keyword: Index -> keyword lookup, read at positions
            ``0 .. len(idx2keyword) - 1``.
        word2cnt: Forwarded unchanged to ``get_training_data``.
        author2tags: Forwarded unchanged to ``get_training_data``.
        model: Embedding lookup supporting ``model[keyword]`` (presumably a
            gensim Word2Vec model -- confirm against the caller).

    Returns:
        None.  The function is called for the effect of ``network.train``.
    """
    entity_count = len(idx2keyword)

    # One column of initial entity vectors per keyword index.
    init_vectors = np.zeros((DIMENSION, entity_count))
    for column in range(entity_count):
        init_vectors[:, column] = model[idx2keyword[column]]

    # The same dump path is passed both as the "save_file" hyper-parameter
    # and as the constructor's load_file argument.
    dump_path = "ntn_model.dump"
    hyper_params = dict(
        embedding_size=DIMENSION,
        num_entities=entity_count,
        slice_size=4,
        lamda=1e-2,
        batch_size=1000,
        corrupt_size=10,
        num_iterations=500,
        save_period=10,
        batch_iterations=5,
        ev_fixed=True,
        threshold=-0.0,
        save_file=dump_path,
    )

    tensor_net = ntn.my_neural_tensor_network(
        hyper_params,
        init_evs=init_vectors,
        load_file=dump_path,
    )
    training_data = get_training_data(author2tags, keyword2idx, model, word2cnt)
    tensor_net.train(training_data)
def test():
    """Score author/keyword pairs for KDD and ICML with the dumped NTN model.

    Loads the Word2Vec model from ``../embedding/author_word.model``, builds
    the initial entity-vector matrix from it, and constructs the network with
    ``load_file='ntn_model.dump'``.  For each conference it collects
    ``[author_id, word_id]`` pairs together with a 0/1 label, asks the network
    for scores, and writes one ``author,word,label,score`` line per pair to
    ``ntn_predict_<conf>.out``.

    Pairs are dropped when the word has no label or is missing from
    ``keyword2idx``.  Authors whose ``'A_<author>'`` key is missing from
    ``keyword2idx`` are skipped with a warning instead of raising
    ``KeyError``.

    Returns:
        None.  Results are written to the output files.
    """
    model = gensim.models.Word2Vec.load('../embedding/author_word.model')
    keyword2idx, idx2keyword = run.get_reverse_index(model)

    # One column of initial entity vectors per keyword index.
    num_entities = len(idx2keyword)
    evs = np.zeros((DIMENSION, num_entities))
    for i in range(num_entities):
        evs[:, i] = model[idx2keyword[i]]

    logging.info('loading neural tensor network')
    # Same hyper-parameter values that train_model in this file uses.
    params = {
        'embedding_size': DIMENSION,
        'num_entities': num_entities,
        'slice_size': 4,
        'lamda': 1e-2,
        'batch_size': 1000,
        'corrupt_size': 10,
        'num_iterations': 500,
        'save_period': 10,
        'batch_iterations': 5,
        'ev_fixed': True,
        'threshold': -0.0,
        'save_file': 'ntn_model.dump',
    }
    network = ntn.my_neural_tensor_network(
        params, init_evs=evs, load_file='ntn_model.dump')

    for jconf in ['KDD', 'ICML']:
        persons = load_persons(jconf)
        author2words = load_author2words(jconf)
        word2label = load_word_label(jconf)

        test_data, test_label = [], []
        for author in persons:
            words = author2words[author]

            # Authors are stored in the vocabulary under an 'A_' prefix.
            # Look the id up once per author and skip authors the embedding
            # does not know, mirroring how unknown words are skipped below.
            author_key = 'A_' + str(author)
            if author_key not in keyword2idx:
                logging.warning(
                    'skipping author %s: %s not in vocabulary',
                    author, author_key)
                continue
            author_id = keyword2idx[author_key]

            for word in words:
                if word not in word2label or word not in keyword2idx:
                    continue
                test_data.append([author_id, keyword2idx[word]])
                # Any label other than 1 counts as negative.
                test_label.append(1 if word2label[word] == 1 else 0)

        test_data = np.array(test_data, dtype=np.int32)

        logging.info('predicting %s', jconf)
        p_data = network.predict(test_data, score=True)

        # The context manager closes the file even if writing a row fails.
        with open('ntn_predict_' + jconf + '.out', 'w') as fout:
            for i, label in enumerate(test_label):
                author = idx2keyword[test_data[i][0]]
                word = idx2keyword[test_data[i][1]]
                fout.write(
                    ",".join([author, word, str(label), str(p_data[i])]) + '\n')