def main():
    """
    Train, tune and evaluate the path-based LSTM relation classifier.

    Configuration is read from the module-level ``args`` object; the fields
    used here are ``dataset_prefix``, ``corpus_prefix``, ``embeddings_file``,
    ``num_epochs``, ``num_hidden_layers`` and ``model_prefix_file``.

    Steps:
        1. Read the relation labels from ``<dataset_prefix>/relations.txt``
           (one label per line) and index them by position.
        2. Load the train / val / test sets and the processed corpus.
        3. Load word embeddings restricted to the dataset vocabulary.
        4. Load the path features and the (x, y) word vectors.
        5. For every (learning rate, word dropout) combination: train a
           classifier, score it on the validation set and save it.
        6. Keep the model with the best validation F1, save it, evaluate it on
           the test set, write its predictions and the top scoring paths per
           relation.

    The statements are written so that they behave the same under the print
    statement and the print function (single-argument ``print(...)``,
    ``items()``, ``list(d.keys())``, explicit ``write``).

    :return:
    """
    np.random.seed(133)

    # Load the relations
    with codecs.open(args.dataset_prefix + '/relations.txt', 'r', 'utf-8') as f_in:
        relations = [line.strip() for line in f_in]
    relation_index = {relation: i for i, relation in enumerate(relations)}

    # Load the datasets
    print('Loading the dataset...')
    train_set = load_dataset(args.dataset_prefix + '/train.tsv', relations)
    val_set = load_dataset(args.dataset_prefix + '/val.tsv', relations)
    test_set = load_dataset(args.dataset_prefix + '/test.tsv', relations)
    y_train = [relation_index[label] for label in train_set.values()]
    y_val = [relation_index[label] for label in val_set.values()]
    y_test = [relation_index[label] for label in test_set.values()]

    # list() keeps the concatenation valid when keys() returns a view.
    # The order train + val + test is relied upon by the slicing below.
    dataset_keys = list(train_set.keys()) + list(val_set.keys()) + list(test_set.keys())
    print('Done!')

    # Load the resource (processed corpus)
    print('Loading the corpus...')
    corpus = KnowledgeResource(args.corpus_prefix)
    print('Done!')

    # Get the vocabulary
    vocabulary = get_vocabulary(corpus, dataset_keys)

    # Load the word embeddings
    print('Initializing word embeddings...')
    word_vectors, word_index = load_embeddings(args.embeddings_file, vocabulary)
    word_inverted_index = {i: w for w, i in word_index.items()}

    # Load the paths and create the feature vectors
    print('Loading path files...')
    x_y_vectors, dataset_instances, pos_index, dep_index, dir_index, pos_inverted_index, dep_inverted_index, \
        dir_inverted_index = load_paths_and_word_vectors(corpus, dataset_keys, word_index)
    print('Number of words %d, number of pos tags: %d, number of dependency labels: %d, number of directions: %d' %
          (len(word_index), len(pos_index), len(dep_index), len(dir_index)))

    # Split the instances back into train / val / test (same order as dataset_keys)
    num_train = len(train_set)
    num_train_val = num_train + len(val_set)

    X_train = dataset_instances[:num_train]
    X_val = dataset_instances[num_train:num_train_val]
    X_test = dataset_instances[num_train_val:]

    x_y_vectors_train = x_y_vectors[:num_train]
    x_y_vectors_val = x_y_vectors[num_train:num_train_val]
    x_y_vectors_test = x_y_vectors[num_train_val:]

    # Tune the hyper-parameters using the validation set
    alphas = [0.001]
    word_dropout_rates = [0.0]  # [0.0, 0.2, 0.4]
    f1_results = []
    models = []
    descriptions = []

    for alpha in alphas:
        for word_dropout_rate in word_dropout_rates:

            # Create the classifier
            classifier = PathLSTMClassifier(
                num_lemmas=len(word_index), num_pos=len(pos_index), num_dep=len(dep_index),
                num_directions=len(dir_index), n_epochs=args.num_epochs, num_relations=len(relations),
                lemma_embeddings=word_vectors, dropout=word_dropout_rate, alpha=alpha,
                use_xy_embeddings=True, num_hidden_layers=args.num_hidden_layers)

            print('Training with learning rate = %f, dropout = %f...' % (alpha, word_dropout_rate))
            classifier.fit(X_train, y_train, x_y_vectors=x_y_vectors_train)

            pred = classifier.predict(X_val, x_y_vectors=x_y_vectors_val)
            # 'do_full_reoprt' (sic) is the keyword spelling used by this call in the original code
            precision, recall, f1, support = evaluate(y_val, pred, relations, do_full_reoprt=False)
            print('Learning rate = %f, dropout = %f, Precision: %.3f, Recall: %.3f, F1: %.3f' %
                  (alpha, word_dropout_rate, precision, recall, f1))
            f1_results.append(f1)
            models.append(classifier)

            # Save intermediate models
            # NOTE(review): the file name only encodes the dropout rate, so two
            # learning rates with the same dropout would overwrite each other.
            classifier.save_model(args.model_prefix_file + '.' + str(word_dropout_rate),
                                  [word_index, pos_index, dep_index, dir_index])
            descriptions.append('Learning rate = %f, dropout = %f' % (alpha, word_dropout_rate))

    # Pick the configuration with the highest validation F1
    best_index = np.argmax(f1_results)
    classifier = models[best_index]
    description = descriptions[best_index]
    print('Best hyper-parameters: ' + description)

    # Save the best model to a file
    print('Saving the model...')
    classifier.save_model(args.model_prefix_file, [word_index, pos_index, dep_index, dir_index])

    # Evaluate on the test set
    print('Evaluation:')
    pred = classifier.predict(X_test, x_y_vectors=x_y_vectors_test)
    precision, recall, f1, support = evaluate(y_test, pred, relations, do_full_reoprt=True)
    print('Precision: %.3f, Recall: %.3f, F1: %.3f' % (precision, recall, f1))

    # Write the predictions to a file
    output_predictions(args.model_prefix_file + '.predictions', relations, pred,
                       list(test_set.keys()), y_test)

    # Retrieve k-best scoring paths for each class
    all_paths = unique([path for path_list in dataset_instances for path in path_list])
    # NOTE(review): 0.7 is presumably a score threshold -- confirm against get_top_k_paths
    top_k = classifier.get_top_k_paths(all_paths, relation_index, 0.7)

    for i, relation in enumerate(relations):
        with codecs.open(args.model_prefix_file + '.paths.' + relation, 'w', 'utf-8') as f_out:
            for path, score in top_k[i]:
                path_str = '_'.join([
                    reconstruct_edge(edge, word_inverted_index, pos_inverted_index,
                                     dep_inverted_index, dir_inverted_index)
                    for edge in path
                ])
                # One tab-separated "path<TAB>score" record per line
                f_out.write('\t'.join([path_str, str(score)]) + '\n')
def main():
    """
    Train and evaluate the integrated (path-based plus distributional) LSTM
    classifier on a four-label relation dataset.

    Positional command-line arguments (read from ``sys.argv``):
        sys.argv[5]  -- corpus prefix handed to ``load_paths``
        sys.argv[6]  -- dataset prefix; ``train.tsv`` / ``test.tsv`` / ``val.tsv``
                        are appended to it directly (no separator is added)
        sys.argv[7]  -- output file; also the stem of the ``.predictions`` and
                        ``.paths`` files
        sys.argv[8]  -- embeddings file
        sys.argv[9]  -- alpha, parsed as float (reported as the learning rate)
        sys.argv[10] -- word dropout rate, parsed as float

    NOTE(review): argv positions 1-4 are not read in this function; presumably
    they are consumed elsewhere -- confirm.

    :return:
    """
    corpus_prefix = sys.argv[5]
    dataset_prefix = sys.argv[6]
    output_file = sys.argv[7]
    embeddings_file = sys.argv[8]
    alpha = float(sys.argv[9])
    word_dropout_rate = float(sys.argv[10])

    np.random.seed(133)

    # Label names; a label's position in this list is its class id
    relations = ['none', 'hypernym', 'hyponym', "synonym"]
    label_ids = dict((name, position) for position, name in enumerate(relations))

    # Read the three splits
    print('Loading the dataset...')
    train_set = load_dataset(dataset_prefix + 'train.tsv')
    test_set = load_dataset(dataset_prefix + 'test.tsv')
    val_set = load_dataset(dataset_prefix + 'val.tsv')

    train_pairs = list(train_set.keys())
    test_pairs = list(test_set.keys())

    y_train = [label_ids[train_set[pair]] for pair in train_pairs]
    y_test = [label_ids[test_set[pair]] for pair in test_pairs]

    # To tune hyper-parameters on the validation set, also build:
    # y_val = [1 if 'True' in val_set[key] else 0 for key in val_set.keys()]

    # Order matters: the slices below assume train, then test, then val
    dataset_keys = train_pairs + test_pairs + list(val_set.keys())
    print('Done!')

    # Word embeddings and the lemma <-> id maps
    print('Initializing word embeddings...')
    if embeddings_file is not None:
        wv, lemma_index = load_embeddings(embeddings_file)

    lemma_inverted_index = dict((idx, lemma) for lemma, idx in lemma_index.items())

    # Path features and (x, y) vectors for every pair in dataset_keys
    print('Loading path files...')
    (x_y_vectors, dataset_instances,
     pos_index, dep_index, dir_index,
     pos_inverted_index, dep_inverted_index, dir_inverted_index) = load_paths(
        corpus_prefix, dataset_keys, lemma_index)
    print('Done!')
    print('Number of lemmas %d, number of pos tags: %d, number of dependency labels: %d, number of directions: %d' %
          (len(lemma_index), len(pos_index), len(dep_index), len(dir_index)))

    # Boundaries of the train and test segments inside the concatenated data
    train_end = len(train_set)
    test_end = train_end + len(test_set)

    X_train = dataset_instances[:train_end]
    X_test = dataset_instances[train_end:test_end]
    # Validation instances, if needed:
    # X_val = dataset_instances[len(train_set)+len(test_set):]

    x_y_vectors_train = x_y_vectors[:train_end]
    x_y_vectors_test = x_y_vectors[train_end:test_end]
    # Validation (x, y) vectors, if needed:
    # x_y_vectors_val = x_y_vectors[len(train_set)+len(test_set):]

    # Build the model
    classifier = PathLSTMClassifier(
        num_lemmas=len(lemma_index),
        num_pos=len(pos_index),
        num_dep=len(dep_index),
        num_directions=len(dir_index),
        n_epochs=3,
        num_relations=NUM_CLASSES,
        lemma_embeddings=wv,
        dropout=word_dropout_rate,
        alpha=alpha,
        use_xy_embeddings=True,
    )

    print('Training with learning rate = %f, dropout = %f...' % (alpha, word_dropout_rate))
    classifier.fit(X_train, y_train, x_y_vectors=x_y_vectors_train)
    print(X_train[:10])

    # Micro-averaged scores on the test split
    print('Evaluation:')
    pred = classifier.predict(X_test, x_y_vectors=x_y_vectors_test)
    precision, recall, f1, _support = precision_recall_fscore_support(y_test, pred, average='micro')
    print('Precision: %.3f, Recall: %.3f, F1: %.3f' % (precision, recall, f1))

    # Persist the model together with its index dictionaries
    classifier.save_model(output_file, [lemma_index, pos_index, dep_index, dir_index])

    # Persist the test predictions
    output_predictions(output_file + '.predictions', relations, pred, test_pairs, y_test)

    # Dump the best scoring paths, one "path<TAB>score" record per line
    flattened_paths = []
    for path_list in dataset_instances:
        flattened_paths.extend(path_list)
    all_paths = unique(flattened_paths)
    top_k = classifier.get_top_k_paths(all_paths, 1000)

    with codecs.open(output_file + '.paths', 'w', 'utf-8') as f_out:
        for path, score in top_k:
            path_str = '_'.join(
                reconstruct_edge(edge, lemma_inverted_index, pos_inverted_index,
                                 dep_inverted_index, dir_inverted_index)
                for edge in path)
            print('\t'.join([path_str, str(score)]), file=f_out)
def main():
    """
    Trains the LSTM-based path-based method for hypernymy detection.

    Positional command-line arguments (read from ``sys.argv``):
        sys.argv[5]  -- corpus prefix handed to ``load_paths``
        sys.argv[6]  -- dataset prefix; ``train.tsv`` / ``test.tsv`` / ``val.tsv``
                        are appended to it directly (no separator is added)
        sys.argv[7]  -- output file; also the stem of the ``.predictions`` and
                        ``.paths`` files
        sys.argv[8]  -- embeddings file
        sys.argv[9]  -- alpha, parsed as float (reported as the learning rate)
        sys.argv[10] -- word dropout rate, parsed as float

    NOTE(review): argv positions 1-4 are not read in this function; presumably
    they are consumed elsewhere -- confirm.

    The statements are written so that they behave the same under the print
    statement and the print function (single-argument ``print(...)``,
    ``items()``, ``list(d.keys())``, explicit ``write``).

    :return:
    """
    corpus_prefix = sys.argv[5]
    dataset_prefix = sys.argv[6]
    output_file = sys.argv[7]
    embeddings_file = sys.argv[8]
    alpha = float(sys.argv[9])
    word_dropout_rate = float(sys.argv[10])

    np.random.seed(133)
    relations = ['False', 'True']

    # Load the datasets
    print('Loading the dataset...')
    train_set = load_dataset(dataset_prefix + 'train.tsv')
    test_set = load_dataset(dataset_prefix + 'test.tsv')
    val_set = load_dataset(dataset_prefix + 'val.tsv')

    # A pair is positive when its label contains the substring 'True'
    y_train = [1 if 'True' in train_set[key] else 0 for key in train_set.keys()]
    y_test = [1 if 'True' in test_set[key] else 0 for key in test_set.keys()]

    # Uncomment if you'd like to load the validation set (e.g. to tune the hyper-parameters)
    # y_val = [1 if 'True' in val_set[key] else 0 for key in val_set.keys()]

    # list() keeps the concatenation valid when keys() returns a view.
    # The order train + test + val is relied upon by the slicing below.
    dataset_keys = list(train_set.keys()) + list(test_set.keys()) + list(val_set.keys())
    print('Done!')

    # Load the word embeddings
    print('Initializing word embeddings...')
    # sys.argv entries are always strings, so this guard is always true here
    if embeddings_file is not None:
        wv, lemma_index = load_embeddings(embeddings_file)

    lemma_inverted_index = {i: w for w, i in lemma_index.items()}

    # Load the paths and create the feature vectors
    print('Loading path files...')
    dataset_instances, pos_index, dep_index, dir_index, pos_inverted_index, dep_inverted_index, dir_inverted_index = \
        load_paths(corpus_prefix, dataset_keys, lemma_index)
    print('Done!')
    print('Number of lemmas %d, number of pos tags: %d, number of dependency labels: %d, number of directions: %d' %
          (len(lemma_index), len(pos_index), len(dep_index), len(dir_index)))

    # Split the instances back into train / test (same order as dataset_keys)
    num_train = len(train_set)
    num_train_test = num_train + len(test_set)

    X_train = dataset_instances[:num_train]
    X_test = dataset_instances[num_train:num_train_test]

    # Uncomment if you'd like to load the validation set (e.g. to tune the hyper-parameters)
    # X_val = dataset_instances[len(train_set)+len(test_set):]

    # Create the classifier
    classifier = PathLSTMClassifier(num_lemmas=len(lemma_index), num_pos=len(pos_index),
                                    num_dep=len(dep_index), num_directions=len(dir_index),
                                    n_epochs=5, num_relations=2, lemma_embeddings=wv,
                                    dropout=word_dropout_rate, alpha=alpha, use_xy_embeddings=False)

    print('Training with learning rate = %f, dropout = %f...' % (alpha, word_dropout_rate))
    classifier.fit(X_train, y_train)

    print('Evaluation:')
    pred = classifier.predict(X_test)
    p, r, f1, support = precision_recall_fscore_support(y_test, pred, average='binary')
    print('Precision: %.3f, Recall: %.3f, F1: %.3f' % (p, r, f1))

    # Save the best model to a file
    classifier.save_model(output_file, [lemma_index, pos_index, dep_index, dir_index])

    # Write the predictions to a file
    output_predictions(output_file + '.predictions', relations, pred, list(test_set.keys()), y_test)

    # Retrieve k-best scoring paths
    all_paths = unique([path for path_list in dataset_instances for path in path_list])
    top_k = classifier.get_top_k_paths(all_paths, 1000)

    with codecs.open(output_file + '.paths', 'w', 'utf-8') as f_out:
        for path, score in top_k:
            path_str = '_'.join([reconstruct_edge(edge, lemma_inverted_index, pos_inverted_index,
                                                  dep_inverted_index, dir_inverted_index)
                                 for edge in path])
            # One tab-separated "path<TAB>score" record per line
            f_out.write('\t'.join([path_str, str(score)]) + '\n')
def main():
    """
    Trains the LSTM-based path-based method for hypernymy detection.

    All settings are taken positionally from ``sys.argv``:
        sys.argv[5]  -- corpus prefix, forwarded to ``load_paths``
        sys.argv[6]  -- dataset prefix; the file names ``train.tsv``,
                        ``test.tsv`` and ``val.tsv`` are concatenated to it
                        as-is, without inserting a path separator
        sys.argv[7]  -- output file for the model; ``.predictions`` and
                        ``.paths`` are appended for the other outputs
        sys.argv[8]  -- embeddings file
        sys.argv[9]  -- alpha as a float (printed as the learning rate)
        sys.argv[10] -- word dropout rate as a float

    NOTE(review): nothing in this function reads argv positions 1-4;
    presumably another component consumes them -- confirm.

    Only constructs that behave identically under the print statement and the
    print function are used (single-argument ``print(...)``, ``items()``,
    ``list(d.keys())`` and an explicit ``write`` for file output).

    :return:
    """
    corpus_prefix = sys.argv[5]
    dataset_prefix = sys.argv[6]
    output_file = sys.argv[7]
    embeddings_file = sys.argv[8]
    alpha = float(sys.argv[9])
    word_dropout_rate = float(sys.argv[10])

    np.random.seed(133)
    relations = ['False', 'True']

    # Load the datasets
    print('Loading the dataset...')
    train_set = load_dataset(dataset_prefix + 'train.tsv')
    test_set = load_dataset(dataset_prefix + 'test.tsv')
    val_set = load_dataset(dataset_prefix + 'val.tsv')

    # Binary targets: 1 when the stored label contains 'True', else 0
    y_train = [
        1 if 'True' in train_set[key] else 0 for key in train_set.keys()
    ]
    y_test = [1 if 'True' in test_set[key] else 0 for key in test_set.keys()]

    # Uncomment if you'd like to load the validation set (e.g. to tune the hyper-parameters)
    # y_val = [1 if 'True' in val_set[key] else 0 for key in val_set.keys()]

    # Materialize the keys so '+' works even when keys() is a view object.
    # Keys are concatenated as train, test, val; the slices below depend on it.
    dataset_keys = (list(train_set.keys()) + list(test_set.keys()) +
                    list(val_set.keys()))
    print('Done!')

    # Load the word embeddings
    print('Initializing word embeddings...')
    # Values taken from sys.argv are strings, never None, so this always runs
    if embeddings_file is not None:
        wv, lemma_index = load_embeddings(embeddings_file)

    lemma_inverted_index = {i: w for w, i in lemma_index.items()}

    # Load the paths and create the feature vectors
    print('Loading path files...')
    dataset_instances, pos_index, dep_index, dir_index, pos_inverted_index, dep_inverted_index, dir_inverted_index = \
        load_paths(corpus_prefix, dataset_keys, lemma_index)
    print('Done!')
    print('Number of lemmas %d, number of pos tags: %d, number of dependency labels: %d, number of directions: %d' %
          (len(lemma_index), len(pos_index), len(dep_index), len(dir_index)))

    # Recover the train and test portions of the concatenated instances
    train_size = len(train_set)
    test_size = len(test_set)

    X_train = dataset_instances[:train_size]
    X_test = dataset_instances[train_size:train_size + test_size]

    # Uncomment if you'd like to load the validation set (e.g. to tune the hyper-parameters)
    # X_val = dataset_instances[len(train_set)+len(test_set):]

    # Create the classifier
    classifier = PathLSTMClassifier(num_lemmas=len(lemma_index),
                                    num_pos=len(pos_index),
                                    num_dep=len(dep_index),
                                    num_directions=len(dir_index),
                                    n_epochs=5,
                                    num_relations=2,
                                    lemma_embeddings=wv,
                                    dropout=word_dropout_rate,
                                    alpha=alpha,
                                    use_xy_embeddings=False)

    print('Training with learning rate = %f, dropout = %f...' % (
        alpha, word_dropout_rate))
    classifier.fit(X_train, y_train)

    print('Evaluation:')
    pred = classifier.predict(X_test)
    p, r, f1, support = precision_recall_fscore_support(y_test,
                                                        pred,
                                                        average='binary')
    print('Precision: %.3f, Recall: %.3f, F1: %.3f' % (p, r, f1))

    # Save the best model to a file
    classifier.save_model(output_file,
                          [lemma_index, pos_index, dep_index, dir_index])

    # Write the predictions to a file
    output_predictions(output_file + '.predictions', relations, pred,
                       list(test_set.keys()), y_test)

    # Retrieve k-best scoring paths
    all_paths = unique(
        [path for path_list in dataset_instances for path in path_list])
    top_k = classifier.get_top_k_paths(all_paths, 1000)

    with codecs.open(output_file + '.paths', 'w', 'utf-8') as f_out:
        for path, score in top_k:
            path_str = '_'.join([
                reconstruct_edge(edge, lemma_inverted_index,
                                 pos_inverted_index, dep_inverted_index,
                                 dir_inverted_index) for edge in path
            ])
            # Each record is "path<TAB>score" followed by a newline
            f_out.write('\t'.join([path_str, str(score)]) + '\n')
def main():
    """
    Train, tune and evaluate the path-based LSTM relation classifier, printing
    diagnostic dumps of the intermediate data along the way.

    Configuration is read from the module-level ``args`` object; the fields
    used here are ``dataset_prefix``, ``corpus_prefix``, ``embeddings_file``,
    ``num_epochs``, ``num_hidden_layers`` and ``model_prefix_file``.

    Steps:
        1. Read the relation labels from ``<dataset_prefix>/relations.txt``
           (one label per line) and index them by position.
        2. Load the train / val / test sets and the processed corpus.
        3. Load word embeddings restricted to the dataset vocabulary.
        4. Load the path features and the (x, y) word vectors.
        5. For every (learning rate, word dropout) combination: train a
           classifier, score it on the validation set and save it.
        6. Keep the model with the best validation F1, save it, evaluate it on
           the test set, write its predictions and the top scoring paths per
           relation.

    Diagnostic output is formatted as a single string per ``print`` call so
    that it reads the same under the print statement and the print function
    (a multi-argument ``print(a, b)`` prints a tuple repr under the former).

    :return:
    """
    np.random.seed(133)  # The seed is for when we want repeatable results.

    # Load the relations
    with codecs.open(args.dataset_prefix + '/relations.txt', 'r', 'utf-8') as f_in:
        relations = [line.strip() for line in f_in]
    relation_index = {relation: i for i, relation in enumerate(relations)}
    print('relation_index :')
    pprint.pprint(relation_index)

    # Load the datasets
    print('Loading the dataset...')
    train_set = load_dataset(args.dataset_prefix + '/train.tsv', relations)
    val_set = load_dataset(args.dataset_prefix + '/val.tsv', relations)
    test_set = load_dataset(args.dataset_prefix + '/test.tsv', relations)
    print('test_set  %s' % (test_set,))
    print('\n')

    y_train = [relation_index[label] for label in train_set.values()]
    y_val = [relation_index[label] for label in val_set.values()]
    y_test = [relation_index[label] for label in test_set.values()]
    print('y_test  %s' % (y_test,))
    print('\n')
    print('test_set.keys()  %s' % (list(test_set.keys()),))

    # list() keeps the concatenation valid when keys() returns a view.
    # The order train + val + test is relied upon by the slicing below.
    dataset_keys = list(train_set.keys()) + list(val_set.keys()) + list(test_set.keys())
    print('Done!')
    print('\n')

    # Load the resource (processed corpus)
    print('Loading the corpus...')
    corpus = KnowledgeResource(args.corpus_prefix)
    print('corpus %s' % (corpus,))
    print('Done!')

    # Get the vocabulary
    vocabulary = get_vocabulary(corpus, dataset_keys)

    # Load the word embeddings
    print('Initializing word embeddings...')
    word_vectors, word_index = load_embeddings(args.embeddings_file, vocabulary)
    # word_index maps each word to an integer id, e.g. u'wesley': 10467
    # (uncomment to inspect; the dump is very large)
    # print('word_vectors', word_vectors)
    # print('word_index', word_index)
    word_inverted_index = {i: w for w, i in word_index.items()}

    # Load the paths and create the feature vectors
    print('Loading path files...')
    x_y_vectors, dataset_instances, pos_index, dep_index, dir_index, pos_inverted_index, dep_inverted_index, \
        dir_inverted_index = load_paths_and_word_vectors(corpus, dataset_keys, word_index)
    print('Number of words %d, number of pos tags: %d, number of dependency labels: %d, number of directions: %d' %
          (len(word_index), len(pos_index), len(dep_index), len(dir_index)))

    # Split the instances back into train / val / test (same order as dataset_keys)
    num_train = len(train_set)
    num_train_val = num_train + len(val_set)

    X_train = dataset_instances[:num_train]
    X_val = dataset_instances[num_train:num_train_val]
    X_test = dataset_instances[num_train_val:]
    print('X_test %s' % (X_test,))

    x_y_vectors_train = x_y_vectors[:num_train]
    x_y_vectors_val = x_y_vectors[num_train:num_train_val]
    x_y_vectors_test = x_y_vectors[num_train_val:]
    print('x_y_vectors_test %s' % (x_y_vectors_test,))

    # Tune the hyper-parameters using the validation set
    alphas = [0.001]
    word_dropout_rates = [0.0]  # [0.0, 0.2, 0.4]
    f1_results = []
    models = []
    descriptions = []

    for alpha in alphas:
        for word_dropout_rate in word_dropout_rates:

            # Create the classifier
            classifier = PathLSTMClassifier(
                num_lemmas=len(word_index), num_pos=len(pos_index), num_dep=len(dep_index),
                num_directions=len(dir_index), n_epochs=args.num_epochs, num_relations=len(relations),
                lemma_embeddings=word_vectors, dropout=word_dropout_rate, alpha=alpha,
                use_xy_embeddings=True, num_hidden_layers=args.num_hidden_layers)

            print('Training with learning rate = %f, dropout = %f...' % (alpha, word_dropout_rate))
            classifier.fit(X_train, y_train, x_y_vectors=x_y_vectors_train)

            pred = classifier.predict(X_val, x_y_vectors=x_y_vectors_val)
            # 'do_full_reoprt' (sic) is the keyword spelling used by this call in the original code
            precision, recall, f1, support = evaluate(y_val, pred, relations, do_full_reoprt=False)
            print('Learning rate = %f, dropout = %f, Precision: %.3f, Recall: %.3f, F1: %.3f' %
                  (alpha, word_dropout_rate, precision, recall, f1))
            f1_results.append(f1)
            models.append(classifier)

            # Save intermediate models
            # NOTE(review): the file name only encodes the dropout rate, so two
            # learning rates with the same dropout would overwrite each other.
            classifier.save_model(args.model_prefix_file + '.' + str(word_dropout_rate),
                                  [word_index, pos_index, dep_index, dir_index])
            descriptions.append('Learning rate = %f, dropout = %f' % (alpha, word_dropout_rate))

    # Pick the configuration with the highest validation F1
    best_index = np.argmax(f1_results)
    classifier = models[best_index]
    description = descriptions[best_index]
    print('Best hyper-parameters: ' + description)

    # Save the best model to a file
    print('Saving the model...')
    classifier.save_model(args.model_prefix_file, [word_index, pos_index, dep_index, dir_index])

    # Evaluate on the test set
    print('Evaluation:')
    pred = classifier.predict(X_test, x_y_vectors=x_y_vectors_test)
    precision, recall, f1, support = evaluate(y_test, pred, relations, do_full_reoprt=True)
    print('Precision: %.3f, Recall: %.3f, F1: %.3f' % (precision, recall, f1))

    # Write the predictions to a file
    output_predictions(args.model_prefix_file + '.predictions', relations, pred,
                       list(test_set.keys()), y_test)

    # Retrieve k-best scoring paths for each class
    all_paths = unique([path for path_list in dataset_instances for path in path_list])
    # NOTE(review): 0.7 is presumably a score threshold -- confirm against get_top_k_paths
    top_k = classifier.get_top_k_paths(all_paths, relation_index, 0.7)

    for i, relation in enumerate(relations):
        with codecs.open(args.model_prefix_file + '.paths.' + relation, 'w', 'utf-8') as f_out:
            for path, score in top_k[i]:
                path_str = '_'.join([
                    reconstruct_edge(edge, word_inverted_index, pos_inverted_index,
                                     dep_inverted_index, dir_inverted_index)
                    for edge in path
                ])
                # One tab-separated "path<TAB>score" record per line
                f_out.write('\t'.join([path_str, str(score)]) + '\n')