def load_paths(corpus_prefix, dataset_keys, lemma_index):
    """
    Override load_paths from lstm_common to include (x, y) vectors
    :param corpus_prefix: prefix of the processed corpus files
    :param dataset_keys: list of (x, y) term-pairs
    :param lemma_index: mapping from lemma to ID
    :return: the path dictionaries and the POS/dependency/direction indices with their inverses
    """
    # Define the dictionaries that assign a running ID to every new string
    pos_index = defaultdict(count(0).next)
    dep_index = defaultdict(count(0).next)
    dir_index = defaultdict(count(0).next)
    dummy = pos_index['#UNKNOWN#']
    dummy = dep_index['#UNKNOWN#']
    dummy = dir_index['#UNKNOWN#']

    # Load the resource (processed corpus)
    print 'Loading the corpus...'
    corpus = KnowledgeResource(corpus_prefix)
    print 'Done!'

    keys = [(corpus.get_id_by_term(str(x)), corpus.get_id_by_term(str(y))) for (x, y) in dataset_keys]
    paths_x_to_y = [{vectorize_path(path, lemma_index, pos_index, dep_index, dir_index): count
                     for path, count in get_paths(corpus, x_id, y_id).iteritems()}
                    for (x_id, y_id) in keys]
    paths_x_to_y = [{p: c for p, c in paths_x_to_y[i].iteritems() if p is not None} for i in range(len(keys))]
    paths = paths_x_to_y

    empty = [dataset_keys[i] for i, path_list in enumerate(paths) if len(path_list.keys()) == 0]
    print 'Pairs without paths:', len(empty), ', all dataset:', len(dataset_keys)

    pos_inverted_index = {i: p for p, i in pos_index.iteritems()}
    dep_inverted_index = {i: p for p, i in dep_index.iteritems()}
    dir_inverted_index = {i: p for p, i in dir_index.iteritems()}

    return paths, pos_index, dep_index, dir_index, pos_inverted_index, dep_inverted_index, dir_inverted_index
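# A minimal, standalone sketch of the defaultdict(count(0).next) idiom used above:
# every key receives a fresh integer ID on first lookup, repeated lookups return the
# same ID, so the dictionary doubles as a vocabulary index.
from collections import defaultdict
from itertools import count

pos_index_example = defaultdict(count(0).next)
unknown_id = pos_index_example['#UNKNOWN#']   # 0 - reserved for the unknown marker
noun_id = pos_index_example['NN']             # 1
verb_id = pos_index_example['VB']             # 2
assert pos_index_example['NN'] == 1           # same key, same ID

# The inverted index recovers the original string from an ID.
pos_inverted_example = {i: p for p, i in pos_index_example.iteritems()}
assert pos_inverted_example[2] == 'VB'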
def predict():
    # The LSTM-based integrated pattern-based and distributional method
    # for multiclass semantic relations classification
    corpus_prefix = CORPUS
    dataset_prefix = DATA_MODEL
    model_file_prefix = MODEL_OUTPUT

    # Load the relations
    with codecs.open(dataset_prefix + '/relations.txt', 'r', 'utf-8') as f_in:
        relations = [line.strip() for line in f_in]
        relation_index = {relation: i for i, relation in enumerate(relations)}
    print relation_index

    # Load the dataset to predict on: a tab-separated file of term-pairs.
    # The row order is preserved so the predictions can be written back next to the
    # original rows (no deduplication or reordering).
    print 'Loading the dataset...'
    with codecs.open(DATA_PREDICT, 'r', 'utf-8') as f_in:
        dataset = [tuple(line.strip().split('\t')) for line in f_in]

    # Load the resource (processed corpus)
    print 'Loading the corpus...'
    corpus = KnowledgeResource(corpus_prefix)
    print 'Done!'

    # Load the pre-trained model file
    classifier, word_index, pos_index, dep_index, dir_index = load_model(model_file_prefix)

    # Load the paths and create the feature vectors
    print 'Loading path files...'
    x_y_vectors_test, X_test = load_paths_and_word_vectors(corpus, dataset, word_index,
                                                           pos_index, dep_index, dir_index)

    lemma_inverted_index = {i: p for p, i in word_index.iteritems()}
    pos_inverted_index = {i: p for p, i in pos_index.iteritems()}
    dep_inverted_index = {i: p for p, i in dep_index.iteritems()}
    dir_inverted_index = {i: p for p, i in dir_index.iteritems()}

    pred = classifier.predict(X_test, x_y_vectors=x_y_vectors_test)

    # Write out the prediction results as an extra column appended to the input file
    df = pd.read_csv(DATA_PREDICT, sep='\t', header=None, index_col=None)
    df['predict'] = pred
    df.to_csv(DATA_PREDICT, sep='\t', header=False, index=False)
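# A minimal sketch of the round trip performed at the end of predict(), assuming the
# prediction file is a headerless, tab-separated list of term-pairs (file names and
# values below are illustrative only):
import pandas as pd

df_example = pd.DataFrame([('cat', 'animal'), ('paris', 'france')])  # stand-in for DATA_PREDICT
pred_example = [2, 5]              # one predicted label index per row, aligned with the file
df_example['predict'] = pred_example
df_example.to_csv('predictions_example.tsv', sep='\t', header=False, index=False)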
def main():
    """
    Load a pre-trained model of the LSTM-based integrated path-based and distributional
    method for hypernymy detection, and test it on the test set
    :return:
    """
    corpus_prefix = sys.argv[3]
    dataset_prefix = sys.argv[4]
    model_file_prefix = sys.argv[5]

    # Load the datasets
    print 'Loading the dataset...'
    test_set = load_dataset(dataset_prefix + 'test.tsv')
    y_test = [1 if 'True' in label else 0 for label in test_set.values()]

    # Load the resource (processed corpus)
    print 'Loading the corpus...'
    corpus = KnowledgeResource(corpus_prefix)
    print 'Done!'

    # Load the model
    classifier, lemma_index, pos_index, dep_index, dir_index = load_model(model_file_prefix)

    # Load the paths and create the feature vectors
    print 'Loading path files...'
    x_y_vectors_test, X_test = load_paths(corpus, test_set.keys(), lemma_index,
                                          pos_index, dep_index, dir_index)

    print 'Evaluation:'
    pred = classifier.predict(X_test, x_y_vectors=x_y_vectors_test)
    p, r, f1, support = precision_recall_fscore_support(y_test, pred, average='binary')
    print 'Precision: %.3f, Recall: %.3f, F1: %.3f' % (p, r, f1)

    # Write the predictions to a file
    relations = ['False', 'True']
    output_predictions(model_file_prefix + '.test_predictions', relations, pred, test_set.keys(), y_test)
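# Illustrative invocation for the function above. The script name and flag names are
# assumptions; only the positional offset is taken from the code, which starts reading
# its own arguments at sys.argv[3], leaving the first two slots to backend flags:
#
#   python test_integrated.py --dynet-mem 4096 corpus/wiki dataset/ model/best
#
example_argv = ['test_integrated.py', '--dynet-mem', '4096', 'corpus/wiki', 'dataset/', 'model/best']
corpus_prefix_ex, dataset_prefix_ex, model_file_prefix_ex = example_argv[3:6]
# Note: the dataset prefix needs a trailing slash, since 'test.tsv' is appended directly.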
def main():
    # The LSTM-based path-based method for multiclass semantic relations classification
    corpus_prefix = sys.argv[5]
    dataset_prefix = sys.argv[6]
    model_file_prefix = sys.argv[7]

    # Load the relations
    with codecs.open(dataset_prefix + '/relations.txt', 'r', 'utf-8') as f_in:
        relations = [line.strip() for line in f_in]
        relation_index = {relation: i for i, relation in enumerate(relations)}

    # Load the datasets
    print 'Loading the dataset...'
    test_set = load_dataset(dataset_prefix + '/test.tsv', relations)
    y_test = [relation_index[label] for label in test_set.values()]

    # Load the resource (processed corpus)
    print 'Loading the corpus...'
    corpus = KnowledgeResource(corpus_prefix)
    print 'Done!'

    # Load the pre-trained model file
    classifier, word_index, pos_index, dep_index, dir_index = load_model(model_file_prefix)

    # Load the paths and create the feature vectors
    print 'Loading path files...'
    X_test = load_paths(corpus, test_set.keys(), word_index, pos_index, dep_index, dir_index)
    print 'Number of words %d, number of pos tags: %d, number of dependency labels: %d, number of directions: %d' % \
        (len(word_index), len(pos_index), len(dep_index), len(dir_index))

    print 'Evaluation:'
    pred = classifier.predict(X_test)
    precision, recall, f1, support = evaluate(y_test, pred, relations, do_full_reoprt=True)
    print 'Precision: %.3f, Recall: %.3f, F1: %.3f' % (precision, recall, f1)
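# evaluate() is defined elsewhere in the codebase; a stand-in with the same call shape,
# built on scikit-learn, might look like the following (the 'weighted' averaging and the
# use of classification_report for the full report are assumptions):
from sklearn.metrics import precision_recall_fscore_support, classification_report

def evaluate_sketch(y_true, y_pred, relations, do_full_report=False):
    if do_full_report:
        print classification_report(y_true, y_pred, target_names=relations)
    precision, recall, f1, support = precision_recall_fscore_support(y_true, y_pred, average='weighted')
    return precision, recall, f1, support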
def main():
    # The LSTM-based integrated pattern-based and distributional method
    # for multiclass semantic relations classification
    corpus_prefix = sys.argv[5]
    dataset_prefix = sys.argv[6]
    model_prefix_file = sys.argv[7]
    embeddings_file = sys.argv[8]
    num_hidden_layers = int(sys.argv[9])

    np.random.seed(133)

    # Load the relations
    with codecs.open(dataset_prefix + '/relations.txt', 'r', 'utf-8') as f_in:
        relations = [line.strip() for line in f_in]
        relation_index = {relation: i for i, relation in enumerate(relations)}

    # Load the datasets
    print 'Loading the dataset...'
    train_set = load_dataset(dataset_prefix + '/train.tsv', relations)
    print "Len of train set: " + str(len(train_set))
    val_set = load_dataset(dataset_prefix + '/val.tsv', relations)
    print "Len of val set: " + str(len(val_set))
    test_set = load_dataset(dataset_prefix + '/test.tsv', relations)
    print "Len of test set: " + str(len(test_set))

    y_train = [relation_index[label] for label in train_set.values()]
    y_val = [relation_index[label] for label in val_set.values()]
    y_test = [relation_index[label] for label in test_set.values()]
    dataset_keys = train_set.keys() + val_set.keys() + test_set.keys()
    print 'Done!'

    # Load the resource (processed corpus)
    print 'Loading the corpus...'
    corpus = KnowledgeResource(corpus_prefix)
    print 'Done!'

    # Get the vocabulary
    vocabulary = get_vocabulary(corpus, dataset_keys)

    # Load the word embeddings
    print 'Initializing word embeddings...'
    word_vectors, word_index = load_embeddings(embeddings_file, vocabulary)
    word_inverted_index = {i: w for w, i in word_index.iteritems()}

    # Load the paths and create the feature vectors
    print 'Loading path files...'
    x_y_vectors, dataset_instances, pos_index, dep_index, dir_index, pos_inverted_index, dep_inverted_index, \
        dir_inverted_index = load_paths_and_word_vectors(corpus, dataset_keys, word_index)
    print 'Number of words %d, number of pos tags: %d, number of dependency labels: %d, number of directions: %d' % \
        (len(word_index), len(pos_index), len(dep_index), len(dir_index))

    X_train = dataset_instances[:len(train_set)]
    X_val = dataset_instances[len(train_set):len(train_set) + len(val_set)]
    X_test = dataset_instances[len(train_set) + len(val_set):]

    x_y_vectors_train = x_y_vectors[:len(train_set)]
    x_y_vectors_val = x_y_vectors[len(train_set):len(train_set) + len(val_set)]
    x_y_vectors_test = x_y_vectors[len(train_set) + len(val_set):]

    # Tune the hyper-parameters using the validation set
    alphas = [0.001]
    word_dropout_rates = [0.0, 0.2, 0.4]
    f1_results = []
    models = []
    descriptions = []

    for alpha in alphas:
        for word_dropout_rate in word_dropout_rates:

            # Create the classifier
            classifier = PathLSTMClassifier(num_lemmas=len(word_index), num_pos=len(pos_index),
                                            num_dep=len(dep_index), num_directions=len(dir_index),
                                            num_negation_markers=3, n_epochs=5,
                                            num_relations=len(relations), lemma_embeddings=word_vectors,
                                            dropout=word_dropout_rate, alpha=alpha,
                                            use_xy_embeddings=True, num_hidden_layers=num_hidden_layers)

            print 'Training with learning rate = %f, dropout = %f...' % (alpha, word_dropout_rate)
            classifier.fit(X_train, y_train, x_y_vectors=x_y_vectors_train)

            pred = classifier.predict(X_val, x_y_vectors=x_y_vectors_val)
            precision, recall, f1, support = evaluate(y_val, pred, relations, do_full_reoprt=False)
            print 'Learning rate = %f, dropout = %f, Precision: %.3f, Recall: %.3f, F1: %.3f' % \
                (alpha, word_dropout_rate, precision, recall, f1)
            f1_results.append(f1)
            models.append(classifier)

            # Save intermediate models
            classifier.save_model(model_prefix_file + '.' + str(word_dropout_rate),
                                  [word_index, pos_index, dep_index, dir_index])
            descriptions.append('Learning rate = %f, dropout = %f' % (alpha, word_dropout_rate))

    best_index = np.argmax(f1_results)
    classifier = models[best_index]
    description = descriptions[best_index]
    print 'Best hyper-parameters: ' + description

    # Save the best model to a file
    print 'Saving the model...'
    classifier.save_model(model_prefix_file, [word_index, pos_index, dep_index, dir_index])

    # Evaluate on the test set
    print 'Evaluation:'
    pred = classifier.predict(X_test, x_y_vectors=x_y_vectors_test)
    precision, recall, f1, support = evaluate(y_test, pred, relations, do_full_reoprt=True)
    print 'Precision: %.3f, Recall: %.3f, F1: %.3f' % (precision, recall, f1)

    # Write the predictions to a file
    output_predictions(model_prefix_file + '.predictions', relations, pred, test_set.keys(), y_test)

    # Retrieve the k-best scoring paths for each class
    all_paths = unique([path for path_list in dataset_instances for path in path_list])
    top_k = classifier.get_top_k_paths(all_paths, relation_index, 0.7)

    for i, relation in enumerate(relations):
        with codecs.open(model_prefix_file + '.paths.' + relation, 'w', 'utf-8') as f_out:
            for path, score in top_k[i]:
                path_str = '_'.join([reconstruct_edge(edge, word_inverted_index, pos_inverted_index,
                                                      dep_inverted_index, dir_inverted_index)
                                     for edge in path])
                print >> f_out, '\t'.join([path_str, str(score)])
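# Slicing dataset_instances back into train/val/test relies on load_paths_and_word_vectors
# returning exactly one instance per key, in the same order as dataset_keys (train keys
# first, then val, then test). A toy illustration of that invariant:
train_ex = {('cat', 'animal'): 'hypernym', ('car', 'wheel'): 'meronym'}
val_ex = {('dog', 'animal'): 'hypernym'}
test_ex = {('paris', 'france'): 'random'}

keys_ex = train_ex.keys() + val_ex.keys() + test_ex.keys()
instances_ex = ['features(%s, %s)' % pair for pair in keys_ex]  # stand-in for the extractor

X_train_ex = instances_ex[:len(train_ex)]
X_val_ex = instances_ex[len(train_ex):len(train_ex) + len(val_ex)]
X_test_ex = instances_ex[len(train_ex) + len(val_ex):]
assert len(X_train_ex) == len(train_ex) and len(X_test_ex) == len(test_ex)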
def main():
    np.random.seed(133)  # The seed is for when we want repeatable results.

    # Load the relations
    with codecs.open(args.dataset_prefix + '/relations.txt', 'r', 'utf-8') as f_in:
        relations = [line.strip() for line in f_in]
        relation_index = {relation: i for i, relation in enumerate(relations)}
    print('relation_index :')
    pprint.pprint(relation_index)

    # Load the datasets
    print 'Loading the dataset...'
    train_set = load_dataset(args.dataset_prefix + '/train.tsv', relations)
    val_set = load_dataset(args.dataset_prefix + '/val.tsv', relations)
    test_set = load_dataset(args.dataset_prefix + '/test.tsv', relations)
    print("test_set ", test_set)
    print('\n')
    y_train = [relation_index[label] for label in train_set.values()]
    y_val = [relation_index[label] for label in val_set.values()]
    y_test = [relation_index[label] for label in test_set.values()]
    print("y_test ", y_test)
    print('\n')
    print("test_set.keys() ", test_set.keys())
    dataset_keys = train_set.keys() + val_set.keys() + test_set.keys()
    print 'Done!'
    print('\n')

    # Load the resource (processed corpus)
    print 'Loading the corpus...'
    corpus = KnowledgeResource(args.corpus_prefix)
    print("corpus", corpus)
    print 'Done!'

    # Get the vocabulary
    vocabulary = get_vocabulary(corpus, dataset_keys)

    # Load the word embeddings
    print 'Initializing word embeddings...'
    word_vectors, word_index = load_embeddings(args.embeddings_file, vocabulary)
    # print('word_vectors', word_vectors)
    # print('word_index', word_index)
    word_inverted_index = {i: w for w, i in word_index.iteritems()}

    # Load the paths and create the feature vectors
    print 'Loading path files...'
    x_y_vectors, dataset_instances, pos_index, dep_index, dir_index, pos_inverted_index, dep_inverted_index, \
        dir_inverted_index = load_paths_and_word_vectors(corpus, dataset_keys, word_index)
    print 'Number of words %d, number of pos tags: %d, number of dependency labels: %d, number of directions: %d' % \
        (len(word_index), len(pos_index), len(dep_index), len(dir_index))

    X_train = dataset_instances[:len(train_set)]
    X_val = dataset_instances[len(train_set):len(train_set) + len(val_set)]
    X_test = dataset_instances[len(train_set) + len(val_set):]
    print('X_test', X_test)

    x_y_vectors_train = x_y_vectors[:len(train_set)]
    x_y_vectors_val = x_y_vectors[len(train_set):len(train_set) + len(val_set)]
    x_y_vectors_test = x_y_vectors[len(train_set) + len(val_set):]
    print('x_y_vectors_test', x_y_vectors_test)

    # Tune the hyper-parameters using the validation set
    alphas = [0.001]
    word_dropout_rates = [0.0]  # [0.0, 0.2, 0.4]
    f1_results = []
    models = []
    descriptions = []

    for alpha in alphas:
        for word_dropout_rate in word_dropout_rates:

            # Create the classifier
            classifier = PathLSTMClassifierKeras(num_lemmas=len(word_index), num_pos=len(pos_index),
                                                 num_dep=len(dep_index), num_directions=len(dir_index),
                                                 n_epochs=args.num_epochs, num_relations=len(relations),
                                                 lemma_embeddings=word_vectors, dropout=word_dropout_rate,
                                                 alpha=alpha, use_xy_embeddings=True,
                                                 num_hidden_layers=args.num_hidden_layers)

            print 'Training with learning rate = %f, dropout = %f...' % (alpha, word_dropout_rate)
            classifier.fit(X_train, y_train, x_y_vectors=x_y_vectors_train)

            pred = classifier.predict(X_val, x_y_vectors=x_y_vectors_val)
            precision, recall, f1, support = evaluate(y_val, pred, relations, do_full_reoprt=False)
            print 'Learning rate = %f, dropout = %f, Precision: %.3f, Recall: %.3f, F1: %.3f' % \
                (alpha, word_dropout_rate, precision, recall, f1)
            f1_results.append(f1)
            models.append(classifier)

            # Save intermediate models
            classifier.save_model(args.model_prefix_file + '.' + str(word_dropout_rate),
                                  [word_index, pos_index, dep_index, dir_index])
            descriptions.append('Learning rate = %f, dropout = %f' % (alpha, word_dropout_rate))

    best_index = np.argmax(f1_results)
    classifier = models[best_index]
    description = descriptions[best_index]
    print 'Best hyper-parameters: ' + description

    # Save the best model to a file
    print 'Saving the model...'
    classifier.save_model(args.model_prefix_file, [word_index, pos_index, dep_index, dir_index])

    # Evaluate on the test set
    print 'Evaluation:'
    pred = classifier.predict(X_test, x_y_vectors=x_y_vectors_test)
    precision, recall, f1, support = evaluate(y_test, pred, relations, do_full_reoprt=True)
    print 'Precision: %.3f, Recall: %.3f, F1: %.3f' % (precision, recall, f1)

    # Write the predictions to a file
    output_predictions(args.model_prefix_file + '.predictions', relations, pred, test_set.keys(), y_test)

    # Retrieve the k-best scoring paths for each class
    all_paths = unique([path for path_list in dataset_instances for path in path_list])
    top_k = classifier.get_top_k_paths(all_paths, relation_index, 0.7)

    for i, relation in enumerate(relations):
        with codecs.open(args.model_prefix_file + '.paths.' + relation, 'w', 'utf-8') as f_out:
            for path, score in top_k[i]:
                path_str = '_'.join([reconstruct_edge(edge, word_inverted_index, pos_inverted_index,
                                                      dep_inverted_index, dir_inverted_index)
                                     for edge in path])
                print >> f_out, '\t'.join([path_str, str(score)])
def main():
    """
    Applies bi-directional search in two phases to find the shortest paths between every
    term-pair in the dataset: the first phase finds the nodes along the shortest paths,
    while the second reconstructs the paths themselves.
    """
    # Get the arguments
    args = docopt("""Find the shortest paths between every term-pair in the dataset.

    Usage:
        search.py <dataset_path> <resource_matrix_path> <resource_entities_path> <resource_properties_path> <resource_l2r_path> <max_path_length> <allow_reversed_edges> <find_relevant_nodes> <relevant_nodes_file> <paths_out_file>

        <dataset_path> = the dataset file
        <resource_matrix_path> = the resource adjacency matrix file (.mm/.npz)
        <resource_entities_path> = the entity str-id map file
        <resource_properties_path> = the property str-id map file
        <resource_l2r_path> = the edges file
        <max_path_length> = the maximum path length
        <allow_reversed_edges> = whether reversed edges are allowed in this resource
        <find_relevant_nodes> = whether to find the relevant nodes (or use the results file)
        <relevant_nodes_file> = relevant nodes file (input / output)
        <paths_out_file> = the paths file (output)
    """)

    dataset_file = args['<dataset_path>']
    resource_mat_file = args['<resource_matrix_path>']
    entity_map_file = args['<resource_entities_path>']
    property_map_file = args['<resource_properties_path>']
    edges_file = args['<resource_l2r_path>']
    max_length = int(args['<max_path_length>'])
    allow_reversed_edges = args['<allow_reversed_edges>'][0].upper() == 'T'
    do_find_relevant_nodes = args['<find_relevant_nodes>'][0].upper() == 'T'
    relevant_nodes_file = args['<relevant_nodes_file>']
    paths_file = args['<paths_out_file>']

    initialize_logger()

    # Find relevant nodes
    if do_find_relevant_nodes:

        # Load the resource
        resource = KnowledgeResource(resource_mat_file, entity_map_file, property_map_file,
                                     edges_file, allow_reversed_edges)
        adjacency_matrix = resource.adjacency_matrix
        term_to_id = resource.term_to_id

        # Load the dataset
        dataset = load_data_labels(dataset_file, adjacency_matrix)

        node_finder = RelevantNodesFinder(adjacency_matrix)
        relevant_nodes = find_relevant_nodes(dataset, max_length, relevant_nodes_file, term_to_id, node_finder)
    else:
        # Load the resource partially, according to the relevant nodes
        relevant_nodes = load_relevant_nodes(relevant_nodes_file)
        resource = KnowledgeResource(resource_mat_file, entity_map_file, property_map_file,
                                     edges_file, allow_reversed_edges, get_all_nodes(relevant_nodes))
        adjacency_matrix = resource.adjacency_matrix
        term_to_id = resource.term_to_id

        # Load the dataset
        dataset = load_data_labels(dataset_file, adjacency_matrix)

    path_finder = PathFinder(resource)
    paths_output = open(paths_file, 'w')
    pair_num = 0

    # For each term-pair, find relevant nodes and then find paths
    for (x, y) in dataset.keys():

        pair_num = pair_num + 1

        x_id = -1
        if x in term_to_id:
            x_id = term_to_id[x]

        y_id = -1
        if y in term_to_id:
            y_id = term_to_id[y]

        # Limit the search space using the relevant nodes and find paths
        nodes = relevant_nodes[(x_id, y_id)]
        l2r_edges, r2l_edges = resource.get_nodes_edges(nodes)
        paths = path_finder.find_shortest_paths(x_id, y_id, max_length, l2r_edges, r2l_edges)

        paths_output.write('pair number ' + str(pair_num) + ': ' + x + '->' + y + '\n')
        for path in paths:
            paths_output.write(nice_print_path(path, resource.id_to_prop) + '\n')

    paths_output.close()
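# An illustrative, simplified version of the first phase described in the docstring above:
# find the nodes that lie on some shortest path between two terms by running one BFS from
# each endpoint and keeping the nodes whose two distances sum to the shortest length.
# The repository's RelevantNodesFinder / PathFinder handle directed and reversed edges and
# are more involved; this sketch uses a tiny undirected adjacency-list graph.
from collections import deque

def bfs_distances(graph, source):
    dist = {source: 0}
    queue = deque([source])
    while queue:
        u = queue.popleft()
        for v in graph.get(u, []):
            if v not in dist:
                dist[v] = dist[u] + 1
                queue.append(v)
    return dist

def nodes_on_shortest_paths(graph, x, y, max_length):
    dist_x = bfs_distances(graph, x)
    dist_y = bfs_distances(graph, y)
    if y not in dist_x or dist_x[y] > max_length:
        return set()
    shortest = dist_x[y]
    return set(v for v in dist_x if v in dist_y and dist_x[v] + dist_y[v] == shortest)

graph_ex = {'cat': ['feline', 'pet'], 'feline': ['cat', 'animal'],
            'pet': ['cat', 'animal'], 'animal': ['feline', 'pet']}
print nodes_on_shortest_paths(graph_ex, 'cat', 'animal', 4)  # cat, feline, pet, animal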
def main():
    args = docopt("""The LSTM-based integrated pattern-based and distributional method for multiclass
    semantic relations classification

    Usage:
        parse_wikipedia.py <corpus_prefix> <dataset_prefix> <model_prefix_file> <embeddings_file> <num_hidden_layers>

        <corpus_prefix> = the prefix of the processed corpus files
        <dataset_prefix> = the directory containing relations.txt and the train/val/test files
        <model_prefix_file> = the output model file prefix
        <embeddings_file> = the pre-trained word embeddings file
        <num_hidden_layers> = the number of hidden layers
    """)

    corpus_prefix = args['<corpus_prefix>']
    dataset_prefix = args['<dataset_prefix>']
    model_prefix_file = args['<model_prefix_file>']
    embeddings_file = args['<embeddings_file>']
    num_hidden_layers = int(args['<num_hidden_layers>'])

    np.random.seed(133)

    # Load the relations
    with codecs.open(dataset_prefix + '/relations.txt', 'r', 'utf-8') as f_in:
        relations = [line.strip() for line in f_in]
        relation_index = {relation: i for i, relation in enumerate(relations)}

    # Load the datasets
    print 'Loading the dataset...'
    train_set = load_dataset(dataset_prefix + '/train.tsv', relations)
    val_set = load_dataset(dataset_prefix + '/val.tsv', relations)
    test_set = load_dataset(dataset_prefix + '/test.tsv', relations)
    y_train = [relation_index[label] for label in train_set.values()]
    y_val = [relation_index[label] for label in val_set.values()]
    y_test = [relation_index[label] for label in test_set.values()]
    dataset_keys = train_set.keys() + val_set.keys() + test_set.keys()
    print 'Done!'

    # Load the resource (processed corpus)
    print 'Loading the corpus...'
    corpus = KnowledgeResource(corpus_prefix)
    print 'Done!'

    # Get the vocabulary
    vocabulary = get_vocabulary(corpus, dataset_keys)

    # Load the word embeddings
    print 'Initializing word embeddings...'
    word_vectors, word_index = load_embeddings(embeddings_file, vocabulary)

    # Load the paths and create the feature vectors
    print 'Loading path files...'
    x_y_vectors, dataset_instances, pos_index, dep_index, dir_index, pos_inverted_index, dep_inverted_index, \
        dir_inverted_index = load_paths_and_word_vectors(corpus, dataset_keys, word_index)
    print 'Number of words %d, number of pos tags: %d, number of dependency labels: %d, number of directions: %d' % \
        (len(word_index), len(pos_index), len(dep_index), len(dir_index))

    X_train = dataset_instances[:len(train_set)]
    X_val = dataset_instances[len(train_set):len(train_set) + len(val_set)]
    X_test = dataset_instances[len(train_set) + len(val_set):]

    x_y_vectors_train = x_y_vectors[:len(train_set)]
    x_y_vectors_val = x_y_vectors[len(train_set):len(train_set) + len(val_set)]
    x_y_vectors_test = x_y_vectors[len(train_set) + len(val_set):]

    # Tune the hyper-parameters using the validation set
    epochs = [10, 15, 20]
    word_dropout_rates = [0.0, 0.2, 0.4]
    f1_results = []
    descriptions = []
    model_prefixes = []

    for word_dropout_rate in word_dropout_rates:
        for n_epochs in epochs:

            # Create the classifier
            classifier = PathLSTMClassifier(num_lemmas=len(word_index), num_pos=len(pos_index),
                                            num_dep=len(dep_index), num_directions=len(dir_index),
                                            n_epochs=n_epochs, num_relations=len(relations),
                                            lemma_embeddings=word_vectors, dropout=word_dropout_rate,
                                            num_hidden_layers=num_hidden_layers)

            description = 'dropout = %.2f, num epochs = %d' % (word_dropout_rate, n_epochs)
            print 'Training with ' + description + '...'
            classifier.fit(X_train, y_train, x_y_vectors=x_y_vectors_train)

            pred = classifier.predict(X_val, x_y_vectors=x_y_vectors_val)
            precision, recall, f1, support = evaluate(y_val, pred, relations, do_full_reoprt=False)
            print 'Dropout = %f, num epochs = %d, Precision: %.3f, Recall: %.3f, F1: %.3f' % \
                (word_dropout_rate, n_epochs, precision, recall, f1)
            f1_results.append(f1)

            # Save intermediate models
            curr_model_prefix = '%s_%.2f_%d' % (model_prefix_file, word_dropout_rate, n_epochs)
            model_prefixes.append(curr_model_prefix)
            classifier.save_model(curr_model_prefix, [word_index, pos_index, dep_index, dir_index])
            descriptions.append(description)
            classifier.close()

    best_index = np.argmax(f1_results)
    description = descriptions[best_index]
    print 'Best hyper-parameters: ' + description

    # Save the best model to a file
    print 'Saving the model...'
    best_model_prefix = model_prefixes[best_index]
    for file in glob.glob(best_model_prefix + '.*'):
        shutil.copy(file, model_prefix_file + file[file.index(best_model_prefix) + len(best_model_prefix):])
    classifier, word_index, pos_index, dep_index, dir_index = PathLSTMClassifier.load_model(model_prefix_file)

    # Evaluate on the test set
    print 'Evaluation:'
    pred = classifier.predict(X_test, x_y_vectors=x_y_vectors_test)
    precision, recall, f1, support = evaluate(y_test, pred, relations, do_full_reoprt=True)
    print 'Precision: %.3f, Recall: %.3f, F1: %.3f' % (precision, recall, f1)
    classifier.close()

    # Write the predictions to a file
    output_predictions(model_prefix_file + '.predictions', relations, pred, test_set.keys(), y_test)
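# The copy step above maps each checkpoint file of the winning configuration onto the final
# model prefix by stripping the configuration-specific part of the name. A small illustration
# of the string arithmetic (file names are made up):
model_prefix_ex = 'out/model'
best_prefix_ex = 'out/model_0.20_15'
file_ex = 'out/model_0.20_15.params'
suffix_ex = file_ex[file_ex.index(best_prefix_ex) + len(best_prefix_ex):]  # '.params'
assert model_prefix_ex + suffix_ex == 'out/model.params'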
def main():
    np.random.seed(133)

    # Load the relations
    with codecs.open(args.dataset_prefix + '/relations.txt', 'r', 'utf-8') as f_in:
        relations = [line.strip() for line in f_in]
        relation_index = {relation: i for i, relation in enumerate(relations)}

    # Load the datasets
    print 'Loading the dataset...'
    train_set = load_dataset(args.dataset_prefix + '/train.tsv', relations)
    val_set = load_dataset(args.dataset_prefix + '/val.tsv', relations)
    test_set = load_dataset(args.dataset_prefix + '/test.tsv', relations)
    y_train = [relation_index[label] for label in train_set.values()]
    y_val = [relation_index[label] for label in val_set.values()]
    y_test = [relation_index[label] for label in test_set.values()]
    dataset_keys = train_set.keys() + val_set.keys() + test_set.keys()
    print 'Done!'

    # Load the resource (processed corpus)
    print 'Loading the corpus...'
    corpus = KnowledgeResource(args.corpus_prefix)
    print 'Done!'

    # Get the vocabulary
    vocabulary = get_vocabulary(corpus, dataset_keys)

    # Load the word embeddings
    print 'Initializing word embeddings...'
    word_vectors, lemma_index = load_embeddings(args.embeddings_file, vocabulary)
    lemma_inverted_index = {i: w for w, i in lemma_index.iteritems()}

    # Load the paths and create the feature vectors
    print 'Loading path files...'
    dataset_instances, pos_index, dep_index, dir_index, pos_inverted_index, dep_inverted_index, \
        dir_inverted_index = load_paths(corpus, dataset_keys, lemma_index)
    print 'Number of words %d, number of pos tags: %d, number of dependency labels: %d, number of directions: %d' % \
        (len(lemma_index), len(pos_index), len(dep_index), len(dir_index))

    X_train = dataset_instances[:len(train_set)]
    X_val = dataset_instances[len(train_set):len(train_set) + len(val_set)]
    X_test = dataset_instances[len(train_set) + len(val_set):]

    # Tune the hyper-parameters using the validation set
    alphas = [0.001]
    word_dropout_rates = [0.0, 0.2, 0.4]
    f1_results = []
    models = []
    descriptions = []

    for alpha in alphas:
        for word_dropout_rate in word_dropout_rates:

            # Create the classifier
            classifier = PathLSTMClassifier(num_lemmas=len(lemma_index), num_pos=len(pos_index),
                                            num_dep=len(dep_index), num_directions=len(dir_index),
                                            n_epochs=args.num_epochs, num_relations=len(relations),
                                            lemma_embeddings=word_vectors, dropout=word_dropout_rate,
                                            alpha=alpha, use_xy_embeddings=False,
                                            num_hidden_layers=args.num_hidden_layers)

            print 'Training with learning rate = %f, dropout = %f...' % (alpha, word_dropout_rate)
            classifier.fit(X_train, y_train)

            pred = classifier.predict(X_val)
            precision, recall, f1, support = evaluate(y_val, pred, relations, do_full_reoprt=False)
            print 'Learning rate = %f, dropout = %f, Precision: %.3f, Recall: %.3f, F1: %.3f' % \
                (alpha, word_dropout_rate, precision, recall, f1)
            f1_results.append(f1)
            models.append(classifier)

            # Save intermediate model
            classifier.save_model(args.model_prefix_file + '.' + str(word_dropout_rate),
                                  [lemma_index, pos_index, dep_index, dir_index])
            descriptions.append('Learning rate = %f, dropout = %f' % (alpha, word_dropout_rate))

    best_index = np.argmax(f1_results)
    classifier = models[best_index]
    description = descriptions[best_index]
    print 'Best hyper-parameters: ' + description

    # Save the best model to a file
    print 'Saving the model...'
    classifier.save_model(args.model_prefix_file, [lemma_index, pos_index, dep_index, dir_index])

    # Evaluate on the test set
    print 'Evaluation:'
    pred = classifier.predict(X_test)
    precision, recall, f1, support = evaluate(y_test, pred, relations, do_full_reoprt=True)
    print 'Precision: %.3f, Recall: %.3f, F1: %.3f' % (precision, recall, f1)
def main():
    print_config(opt)

    # Load the relations
    with codecs.open(args.dataset_prefix + '/relations.txt', 'r', 'utf-8') as f_in:
        relations = [line.strip() for line in f_in]
        relation_index = {relation: i for i, relation in enumerate(relations)}

    # Load the datasets
    if args.debug:
        trainname = '../datasets/wn-bo/train_sample.tsv'
        print 'Loading the dataset...', trainname, '*' * 10
        train_set = load_dataset(trainname, relations)
        val_set = load_dataset(trainname, relations)
        test_set = load_dataset(trainname, relations)
    else:
        trainname = '/' + args.trainname + '.tsv'
        valname = '/' + args.valname + '.tsv'
        testname = '/' + args.testname + '.tsv'
        print 'Loading the dataset...', trainname, '*' * 10
        train_set = load_dataset(args.dataset_prefix + trainname, relations)
        print 'Loading the dataset...', valname, '*' * 10
        val_set = load_dataset(args.dataset_prefix + valname, relations)
        print 'Loading the dataset...', testname, '*' * 10
        test_set = load_dataset(args.dataset_prefix + testname, relations)

    # y_train = [relation_index[label] for label in train_set.values()]
    # y_val = [relation_index[label] for label in val_set.values()]
    # y_test = [relation_index[label] for label in test_set.values()]
    dataset_keys = train_set.keys() + val_set.keys() + test_set.keys()

    # Add (x, root) pairs to dataset_keys
    vocab = set()
    for (x, y) in dataset_keys:
        vocab.add(x)
        vocab.add(y)
    dataset_keys += [(term, 'root007') for term in vocab]

    if not args.debug:
        trees = read_tree_file("../datasets/wn-bo/wn-bo-trees-4-11-50-train533-lower.ptb",
                               given_root=args.given_root_train, filter_root=args.filter_root,
                               allow_up=args.allow_up)
        trees_val = read_tree_file("../datasets/wn-bo/wn-bo-trees-4-11-50-dev114-lower.ptb",
                                   given_root=args.given_root_test, filter_root=args.filter_root,
                                   allow_up=args.allow_up)
        trees_test = read_tree_file("../datasets/wn-bo/wn-bo-trees-4-11-50-test114-lower.ptb",
                                    given_root=args.given_root_test, filter_root=args.filter_root,
                                    allow_up=args.allow_up)
        trees_semeval = read_edge_files("../datasets/SemEval-2016/original/", given_root=True,
                                        filter_root=args.filter_root, allow_up=False)
    else:
        trees = read_tree_file("../datasets/wn-bo/train_sample.ptb2",
                               given_root=args.given_root_train, filter_root=args.filter_root,
                               allow_up=args.allow_up)
        trees_val = read_tree_file("../datasets/wn-bo/train_sample.ptb2",
                                   given_root=args.given_root_train, filter_root=args.filter_root,
                                   allow_up=args.allow_up)
        trees_test = read_tree_file("../datasets/wn-bo/train_sample.ptb2",
                                    given_root=args.given_root_test, filter_root=args.filter_root,
                                    allow_up=args.allow_up)
        trees_semeval = read_tree_file("../datasets/wn-bo/train_sample.ptb2",
                                       given_root=args.given_root_test, filter_root=args.filter_root,
                                       allow_up=args.allow_up)

    # Load the resource (processed corpus)
    print 'Loading the corpus...', args.corpus_prefix, '*' * 10
    corpus = KnowledgeResource(args.corpus_prefix)

    preload_pkl = 'pickled_data/preload_data_{}_debug{}.pkl'.format(args.model_prefix_file, args.debug)
    if not os.path.exists(preload_pkl):
        print 'Loading the vocabulary...'
        # path_lemmas_name = "pickled_data/path_lemmas_3in1.pkl"
        # print 'reload path_lemmas from:', path_lemmas_name
        # path_lemmas = pickle.load(open(path_lemmas_name, 'rb'))
        path_lemmas, x_y_words, keys = get_vocabulary(corpus, dataset_keys, None)
        if not args.debug:
            pickle.dump(path_lemmas, open('pickled_data/path_lemmas_{}.pkl'.format(args.model_prefix_file), 'wb'))
            pickle.dump(x_y_words, open('pickled_data/x_y_words_{}.pkl'.format(args.model_prefix_file), 'wb'))

        # Load the word embeddings
        print 'Initializing word embeddings...'
        word_vectors, word_index, word_set = load_embeddings(args.embeddings_file, path_lemmas,
                                                             x_y_words, debug=args.debug)

        # Load the paths and create the feature vectors
        print 'Loading path files...'
        dataset_instances, pos_index, dep_index, dir_index, pos_inverted_index, dep_inverted_index, \
            dir_inverted_index = load_paths_and_word_vectors(corpus, dataset_keys, word_index, keys)

        print 'saving pkl...'
        pickle.dump((word_vectors, word_index, word_set, dataset_instances,
                     pos_index, dep_index, dir_index,
                     pos_inverted_index, dep_inverted_index, dir_inverted_index),
                    open(preload_pkl, 'wb'))
    else:
        print 'Data loaded from', preload_pkl, 'make sure pkl is correct'
        (word_vectors, word_index, word_set, dataset_instances,
         pos_index, dep_index, dir_index,
         pos_inverted_index, dep_inverted_index, dir_inverted_index) = pickle.load(open(preload_pkl, 'rb'))

    print 'Number of words %d, number of pos tags: %d, number of dependency labels: %d, number of directions: %d' % \
        (len(word_index), len(pos_index), len(dep_index), len(dir_index))

    # dataset_instances is now (paths, x_y_vectors, features)
    X_train = dataset_instances[:len(train_set)]
    X_val = dataset_instances[len(train_set):len(train_set) + len(val_set)]
    X_test = dataset_instances[len(train_set) + len(val_set):]
    print len(X_train), len(X_val), len(X_test)
    # check_data(train_set, X_train, word_set)
    # check_data(val_set, X_val, word_set)
    # check_data(test_set, X_test, word_set)
    # save_path_info(dataset_keys, dataset_instances)

    # scores_save = []
    # scores_save_test = []
    # prob_save = []
    # prob_save_test = []

    policy = Policy(dataset_keys, dataset_instances, num_lemmas=len(word_index), num_pos=len(pos_index),
                    num_dep=len(dep_index), num_directions=len(dir_index), opt=opt,
                    num_relations=len(relations), lemma_embeddings=word_vectors)
    trainer = dy.AdamTrainer(policy.model, alpha=args.lr)

    n_epoch = 1000

    best = [0] * 6
    best_idx = [0] * 6
    best_val = [0] * 6
    best_val_idx = [0] * 6
    best_test = [0] * 6
    best_test_idx = [0] * 6
    best_semeval = [0] * 6
    best_semeval_idx = [0] * 6
    policy_save_test = defaultdict(list)
    wrong_total_l = []

    # check_limit(trees, policy, policy.unk_hard)
    # check_limit(trees, policy, policy.unk_soft)
    # check_limit(trees_test, policy, policy.unk_hard)
    # check_limit(trees_test, policy, policy.unk_soft)
    # exit(0)

    # TRAIN / TEST START HERE
    if args.load_model_file is None:
        for epoch in range(n_epoch):
            best, best_idx = train(epoch, trees, policy, trainer, best, best_idx, wrong_total_l)
            # policy_save_test, best_test, best_test_idx = test(epoch, trees_test, policy, policy_save_test,
            #                                                   best_test, best_test_idx)
            _, best_val, best_val_idx = test_single(epoch, trees_val, policy, [], best_val, best_val_idx,
                                                    wrong_total_l)
            policy_save_test, best_test, best_test_idx = test_single(epoch, trees_test, policy, policy_save_test,
                                                                     best_test, best_test_idx, wrong_total_l)
    else:
        load_candidate_from_pickle(trees_semeval)
        _, best_semeval, best_semeval_idx = test_single(0, trees_semeval, policy, [], best_semeval,
                                                        best_semeval_idx, wrong_total_l, reward_type='print_each')
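# The preprocessing branch above is a compute-once cache: build the expensive feature
# structures on the first run, pickle them, and reload the pickle on later runs. A generic
# sketch of the same pattern (the path and the build function are placeholders):
import os
import pickle

def cached(pkl_path, build_fn):
    if os.path.exists(pkl_path):
        # The cache is keyed only by the file name; delete the pickle if the inputs change.
        with open(pkl_path, 'rb') as f_in:
            return pickle.load(f_in)
    result = build_fn()
    with open(pkl_path, 'wb') as f_out:
        pickle.dump(result, f_out)
    return result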