def main(model_params, model_name, data_folder, word_embeddings, test_set,
         property_index, save_folder, load_model, result_folder):
    with open(model_params) as f:
        model_params = json.load(f)

    embeddings, word2idx = embedding_utils.load(data_folder + word_embeddings)
    print("Loaded embeddings:", embeddings.shape)

    def check_data(data):
        for g in data:
            if 'vertexSet' not in g:
                print("vertexSet missed\n")

    print("Reading the property index")
    with open(data_folder + "models/" + model_name + ".property2idx") as f:
        property2idx = ast.literal_eval(f.read())

    max_sent_len = 36
    print("Max sentence length set to: {}".format(max_sent_len))

    # Pick the graph-to-indices conversion that matches the model architecture.
    graphs_to_indices = sp_models.to_indices_and_entity_pair
    if model_name == "ContextAware":
        graphs_to_indices = sp_models.to_indices_with_real_entities_and_entity_nums_with_vertex_padding_and_entity_pair
    elif model_name == "PCNN":
        graphs_to_indices = sp_models.to_indices_with_relative_positions_and_pcnn_mask_and_entity_pair
    elif model_name == "CNN":
        graphs_to_indices = sp_models.to_indices_with_relative_positions_and_entity_pair
    elif model_name == "GPGNN":
        graphs_to_indices = sp_models.to_indices_with_real_entities_and_entity_nums_with_vertex_padding_and_entity_pair

    _, position2idx = embedding_utils.init_random(
        np.arange(-max_sent_len, max_sent_len), 1, add_all_zeroes=True)

    training_data = None

    n_out = len(property2idx)
    print("N_out:", n_out)

    model = get_model(model_name)(model_params, embeddings, max_sent_len,
                                  n_out).cuda()
    model.load_state_dict(torch.load(save_folder + load_model))

    print("Testing")
    print("Results on the test set")
    test_set, _ = io.load_relation_graphs_from_file(data_folder + test_set)
    test_as_indices = list(
        graphs_to_indices(test_set,
                          word2idx,
                          property2idx,
                          max_sent_len,
                          embeddings=embeddings,
                          position2idx=position2idx))

    print("Start testing!")
    result_file = open(result_folder + "_" + model_name, "w")
    for i in tqdm(
            range(int(test_as_indices[0].shape[0] /
                      model_params['batch_size']))):
        sentence_input = test_as_indices[0][i * model_params['batch_size']:
                                            (i + 1) * model_params['batch_size']]
        entity_markers = test_as_indices[1][i * model_params['batch_size']:
                                            (i + 1) * model_params['batch_size']]
        labels = test_as_indices[2][i * model_params['batch_size']:
                                    (i + 1) * model_params['batch_size']]

        # `volatile=True` disables autograd history at inference time; this
        # targets the pre-0.4 PyTorch API used throughout this script.
        if model_name == "GPGNN":
            output = model(
                Variable(torch.from_numpy(sentence_input.astype(int)),
                         volatile=True).cuda(),
                Variable(torch.from_numpy(entity_markers.astype(int)),
                         volatile=True).cuda(),
                test_as_indices[3][i * model_params['batch_size']:
                                   (i + 1) * model_params['batch_size']])
        elif model_name == "PCNN":
            output = model(
                Variable(torch.from_numpy(sentence_input.astype(int)),
                         volatile=True).cuda(),
                Variable(torch.from_numpy(entity_markers.astype(int)),
                         volatile=True).cuda(),
                Variable(torch.from_numpy(
                    np.array(test_as_indices[3][i * model_params['batch_size']:
                                                (i + 1) * model_params['batch_size']])).float(),
                         requires_grad=False,
                         volatile=True).cuda())
        else:
            output = model(
                Variable(torch.from_numpy(sentence_input.astype(int)),
                         volatile=True).cuda(),
                Variable(torch.from_numpy(entity_markers.astype(int)),
                         volatile=True).cuda())

        score = F.softmax(output, dim=-1)
        score = to_np(score).reshape(-1, n_out)
        labels = labels.reshape(-1)
        p_indices = labels != 0  # drop the ALL_ZERO / padding label
        score = score[p_indices].tolist()
        labels = labels[p_indices].tolist()

        if model_name not in ("LSTM", "PCNN", "CNN"):
            entity_pairs = test_as_indices[-1][i * model_params['batch_size']:
                                               (i + 1) * model_params['batch_size']]
            entity_pairs = reduce(lambda x, y: x + y, entity_pairs)
        else:
            entity_pairs = test_as_indices[-1][i * model_params['batch_size']:
                                               (i + 1) * model_params['batch_size']]

        for scores_row, label, entity_pair in zip(score, labels, entity_pairs):
            for index, k in enumerate(scores_row):
                result_file.write(
                    str(index) + "\t" + str(k) + "\t" +
                    str(1 if index == label else 0) + "\t" +
                    entity_pair[0] + "\t" + entity_pair[1] + "\n")
    result_file.close()
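# Example invocation (a sketch; every path and file name below is an
# assumption patterned on the defaults used elsewhere in this codebase,
# not repo-verified):
#
#     main(model_params="model_params.json",
#          model_name="GPGNN",
#          data_folder="../data/",
#          word_embeddings="glove/glove.6B.50d.txt",
#          test_set="semantic-graphs-filtered-held-out.02_06.json",
#          property_index=None,
#          save_folder="../models/",
#          load_model="GPGNN-4.out",
#          result_folder="result/")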
def test(): """ Main Configurations """ model_name = "RECON" load_model = "RECON-{}.out" # you should choose the proper model to load # device_id = 0 data_folder = "./data/WikipediaWikidataDistantSupervisionAnnotations.v1.0/enwiki-20160501/" save_folder = "./models/RECON/" result_folder = "result/" model_params = "model_params.json" word_embeddings = "./glove.6B/glove.6B.50d.txt" test_set = "semantic-graphs-filtered-held-out.02_06.json" gat_embedding_file = None gat_relation_embedding_file = None if "RECON" in model_name: context_data_file = "./data/WikipediaWikidataDistantSupervisionAnnotations.v1.0/entities_context.json" if "KGGAT" in model_name: gat_embedding_file = './models/GAT/WikipediaWikidataDistantSupervisionAnnotations/final_entity_embeddings.json' gat_entity2id_file = './data/GAT/WikipediaWikidataDistantSupervisionAnnotations.v1.0/entity2id.txt' if model_name == "RECON": gat_relation_embedding_file = './re/models/GAT_sep_space/WikipediaWikidataDistantSupervisionAnnotations/final_relation_embeddings.json' gat_relation2id_file = './data/GAT_sep_space/WikipediaWikidataDistantSupervisionAnnotations.v1.0/relation2id.txt' w_ent2rel_all_rels_file = './re/models/GAT_sep_space/WikipediaWikidataDistantSupervisionAnnotations/W_ent2rel.json.npy' use_char_vocab = False # a file to store property2idx # if is None use model_name.property2idx property_index = None with open(model_params) as f: model_params = json.load(f) global args save_folder = args.save_folder if args.test_file != '': test_set = args.test_file result_folder = args.result_folder model_params['batch_size'] = args.batch_size if not os.path.exists(result_folder): os.makedirs(result_folder) char_vocab_file = os.path.join(save_folder, "char_vocab.json") sp_models.set_max_edges( model_params['max_num_nodes'] * (model_params['max_num_nodes'] - 1), model_params['max_num_nodes']) if context_data_file: with open(context_data_file, 'r') as f: context_data = json.load(f) if gat_embedding_file: with open(gat_embedding_file, 'r') as f: gat_embeddings = json.load(f) with open(gat_relation_embedding_file, 'r') as f: gat_relation_embeddings = json.load(f) if gat_relation_embedding_file: W_ent2rel_all_rels = np.load(w_ent2rel_all_rels_file) with open(gat_entity2id_file, 'r') as f: gat_entity2idx = {} data = f.read() lines = data.split('\n') for line in lines: line_arr = line.split(' ') if len(line_arr) == 2: gat_entity2idx[line_arr[0].strip()] = line_arr[1].strip() with open(gat_relation2id_file, 'r') as f: gat_relation2idx = {} data = f.read() lines = data.split('\n') for line in lines: line_arr = line.split(' ') if len(line_arr) == 2: gat_relation2idx[line_arr[0].strip()] = line_arr[1].strip() embeddings, word2idx = embedding_utils.load(word_embeddings) print("Loaded embeddings:", embeddings.shape) def check_data(data): for g in data: if (not 'vertexSet' in g): print("vertexSet missed\n") print("Reading the property index") with open(os.path.join(save_folder, model_name + ".property2idx")) as f: property2idx = ast.literal_eval(f.read()) idx2property = {v: k for k, v in property2idx.items()} print("Reading the entity index") with open(os.path.join(save_folder, model_name + ".entity2idx")) as f: entity2idx = ast.literal_eval(f.read()) idx2entity = {v: k for k, v in entity2idx.items()} context_data['ALL_ZERO'] = { 'desc': '', 'label': 'ALL_ZERO', 'instances': [], 'aliases': [] } with open(char_vocab_file, 'r') as f: char_vocab = json.load(f) max_sent_len = 36 print("Max sentence length set to: {}".format(max_sent_len)) graphs_to_indices = 
sp_models.to_indices_and_entity_pair if model_name == "ContextAware": graphs_to_indices = sp_models.to_indices_with_real_entities_and_entity_nums_with_vertex_padding elif model_name == "PCNN": graphs_to_indices = sp_models.to_indices_with_relative_positions_and_pcnn_mask elif model_name == "CNN": graphs_to_indices = sp_models.to_indices_with_relative_positions elif model_name == "GPGNN": graphs_to_indices = sp_models.to_indices_with_real_entities_and_entity_nums_with_vertex_padding elif model_name == "RECON-EAC": graphs_to_indices = sp_models.to_indices_with_real_entities_and_entity_nums_with_vertex_padding elif model_name == "RECON-EAC-KGGAT": graphs_to_indices = sp_models.to_indices_with_real_entities_and_entity_nums_with_vertex_padding elif model_name == "RECON": graphs_to_indices = sp_models.to_indices_with_real_entities_and_entity_nums_with_vertex_padding _, position2idx = embedding_utils.init_random(np.arange( -max_sent_len, max_sent_len), 1, add_all_zeroes=True) training_data = None n_out = len(property2idx) print("N_out:", n_out) if "RECON" not in model_name: model = get_model(model_name)(model_params, embeddings, max_sent_len, n_out) elif model_name == "RECON-EAC": model = get_model(model_name)(model_params, embeddings, max_sent_len, n_out, char_vocab) elif model_name == "RECON-EAC-KGGAT": model = get_model(model_name)(model_params, embeddings, max_sent_len, n_out, char_vocab) elif model_name == "RECON": model = get_model(model_name)(model_params, embeddings, max_sent_len, n_out, char_vocab, gat_relation_embeddings, W_ent2rel_all_rels, idx2property, gat_relation2idx) model = model.cuda() model.load_state_dict(torch.load(os.path.join(save_folder, load_model))) print("Testing") print("Results on the test set") test_set, _ = io.load_relation_graphs_from_file(data_folder + test_set, data='nyt') test_as_indices = list( graphs_to_indices(test_set, word2idx, property2idx, max_sent_len, embeddings=embeddings, position2idx=position2idx, entity2idx=entity2idx)) print("Start testing!") result_file = open(os.path.join(result_folder, "_" + model_name), "w") test_f1 = 0.0 for i in tqdm( range(int(test_as_indices[0].shape[0] / model_params['batch_size']))): sentence_input = test_as_indices[0][ indices[i * model_params['batch_size']:(i + 1) * model_params['batch_size']]] entity_markers = test_as_indices[1][ indices[i * model_params['batch_size']:(i + 1) * model_params['batch_size']]] labels = test_as_indices[2][indices[i * model_params['batch_size']:(i + 1) * model_params['batch_size']]] if "RECON" in model_name: entity_indices = test_as_indices[4][ indices[i * model_params['batch_size']:(i + 1) * model_params['batch_size']]] unique_entities, unique_entities_surface_forms, max_occurred_entity_in_batch_pos = context_utils.get_batch_unique_entities( test_as_indices[4][indices[i * model_params['batch_size']:(i + 1) * model_params['batch_size']]], test_as_indices[5][indices[i * model_params['batch_size']:(i + 1) * model_params['batch_size']]]) unique_entities_context_indices = context_utils.get_context_indices( unique_entities, unique_entities_surface_forms, context_data, idx2entity, word2idx, char_vocab, model_params['conv_filter_size'], max_sent_len=32, max_num_contexts=32, max_char_len=10, data='nyt') entities_position = context_utils.get_entity_location_unique_entities( unique_entities, entity_indices) if model_name == "RECON-EAC-KGGAT": gat_entity_embeddings = context_utils.get_gat_entity_embeddings( entity_indices, entity2idx, idx2entity, gat_entity2idx, gat_embeddings) elif model_name == 
"RECON": gat_entity_embeddings, nonzero_gat_entity_embeddings, nonzero_entity_pos = context_utils.get_selected_gat_entity_embeddings( entity_indices, entity2idx, idx2entity, gat_entity2idx, gat_embeddings) with torch.no_grad(): if model_name == "RECON": output = model( Variable(torch.from_numpy( sentence_input.astype(int))).cuda(), Variable(torch.from_numpy( entity_markers.astype(int))).cuda(), test_as_indices[3][indices[i * model_params['batch_size']:(i + 1) * model_params['batch_size']]], Variable(torch.from_numpy(unique_entities.astype( np.long))).cuda(), Variable(torch.from_numpy(entity_indices.astype( np.long))).cuda(), Variable( torch.from_numpy( unique_entities_context_indices[0].astype( np.long))).cuda(), Variable( torch.from_numpy( unique_entities_context_indices[1].astype( np.long))).cuda(), Variable( torch.from_numpy( unique_entities_context_indices[2].astype( bool))).cuda(), Variable(torch.from_numpy( entities_position.astype(int))).cuda(), max_occurred_entity_in_batch_pos, Variable(torch.from_numpy( nonzero_gat_entity_embeddings.astype(np.float32)), requires_grad=False).cuda(), nonzero_entity_pos, Variable(torch.from_numpy( gat_entity_embeddings.astype(np.float32)), requires_grad=False).cuda()) elif model_name == "RECON-EAC-KGGAT": output = model( Variable(torch.from_numpy( sentence_input.astype(int))).cuda(), Variable(torch.from_numpy( entity_markers.astype(int))).cuda(), test_as_indices[3][indices[i * model_params['batch_size']:(i + 1) * model_params['batch_size']]], Variable(torch.from_numpy(unique_entities.astype( np.long))).cuda(), Variable(torch.from_numpy(entity_indices.astype( np.long))).cuda(), Variable( torch.from_numpy( unique_entities_context_indices[0].astype( np.long))).cuda(), Variable( torch.from_numpy( unique_entities_context_indices[1].astype( np.long))).cuda(), Variable( torch.from_numpy( unique_entities_context_indices[2].astype( bool))).cuda(), Variable(torch.from_numpy( entities_position.astype(int))).cuda(), max_occurred_entity_in_batch_pos, Variable(torch.from_numpy( gat_entity_embeddings.astype(np.float32)), requires_grad=False).cuda()) elif model_name == "RECON-EAC": output = model( Variable(torch.from_numpy( sentence_input.astype(int))).cuda(), Variable(torch.from_numpy( entity_markers.astype(int))).cuda(), test_as_indices[3][indices[i * model_params['batch_size']:(i + 1) * model_params['batch_size']]], Variable(torch.from_numpy(unique_entities.astype( np.long))).cuda(), Variable(torch.from_numpy(entity_indices.astype( np.long))).cuda(), Variable( torch.from_numpy( unique_entities_context_indices[0].astype( np.long))).cuda(), Variable( torch.from_numpy( unique_entities_context_indices[1].astype( np.long))).cuda(), Variable( torch.from_numpy( unique_entities_context_indices[2].astype( bool))).cuda(), Variable(torch.from_numpy( entities_position.astype(int))).cuda(), max_occurred_entity_in_batch_pos) elif model_name == "GPGNN": output = model( Variable(torch.from_numpy( sentence_input.astype(int))).cuda(), Variable(torch.from_numpy( entity_markers.astype(int))).cuda(), test_as_indices[3][indices[i * model_params['batch_size']:(i + 1) * model_params['batch_size']]]) elif model_name == "PCNN": output = model( Variable(torch.from_numpy( sentence_input.astype(int))).cuda(), Variable(torch.from_numpy( entity_markers.astype(int))).cuda(), Variable(torch.from_numpy( np.array(test_as_indices[3] [i * model_params['batch_size']:(i + 1) * model_params['batch_size']])).float(), requires_grad=False).cuda()) else: output = model( Variable(torch.from_numpy( 
sentence_input.astype(int))).cuda(), Variable(torch.from_numpy( entity_markers.astype(int))).cuda()) _, predicted = torch.max(output, dim=1) labels_copy = labels.reshape(-1).tolist() predicted = predicted.data.tolist() p_indices = np.array(labels_copy) != 0 predicted = np.array(predicted)[p_indices].tolist() labels_copy = np.array(labels_copy)[p_indices].tolist() _, _, add_f1 = evaluation_utils.evaluate_instance_based( predicted, labels_copy, empty_label=p0_index) test_f1 += add_f1 score = F.softmax(output, dim=-1) score = to_np(score).reshape(-1, n_out) labels = labels.reshape(-1) p_indices = labels != 0 score = score[p_indices].tolist() labels = labels[p_indices].tolist() pred_labels = r = np.argmax(score, axis=-1) indices = [i for i in range(len(p_indices)) if p_indices[i]] if (model_name != "LSTM" and model_name != "PCNN" and model_name != "CNN"): entity_pairs = test_as_indices[-1][i * model_params['batch_size']:(i + 1) * model_params['batch_size']] entity_pairs = reduce(lambda x, y: x + y, entity_pairs) else: entity_pairs = test_as_indices[-1][i * model_params['batch_size']:(i + 1) * model_params['batch_size']] start_idx = i * model_params['batch_size'] for index, (i, j, entity_pair) in enumerate(zip(score, labels, entity_pairs)): sent = ' '.join(test_set[start_idx + indices[index] // (model_params['max_num_nodes'] * (model_params['max_num_nodes'] - 1))] ['tokens']).strip() result_file.write("{} | {} | {} | {} | {} | {}\n".format( sent, entity_pair[0], entity_pair[1], idx2property[pred_labels[index]], idx2property[labels[index]], score[index][pred_labels[index]])) print( "Test f1: ", test_f1 * 1.0 / (test_as_indices[0].shape[0] / model_params['batch_size'])) result_file.close()
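# Note: `to_np` is used above but not defined in this section. A minimal
# definition consistent with its usage (detach a CUDA tensor and convert it
# to a NumPy array, under the pre-0.4 PyTorch Variable API used throughout
# this file) would be:
#
#     def to_np(x):
#         return x.data.cpu().numpy()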
args = parser.parse_args()

data_folder = args.data_folder
model_name = args.model_name
mode = args.mode

with open(args.model_params) as f:
    model_params = json.load(f)

embeddings, word2idx = embedding_utils.load(data_folder + args.word_embeddings)
print("Loaded embeddings:", embeddings.shape)

training_data, _ = io.load_relation_graphs_from_file(data_folder + args.train_set,
                                                     load_vertices=True)
val_data, _ = io.load_relation_graphs_from_file(data_folder + args.val_set,
                                                load_vertices=True)

if args.s:
    training_data = training_data[:len(training_data) // 3]
    print("Training data size set to: {}".format(len(training_data)))
    val_data = val_data[:len(val_data) // 3]
    print("Validation data size set to: {}".format(len(val_data)))

if mode in ['test', 'train-plus-test']:
    print("Reading the property index")
    with open(data_folder + "keras-models/" + model_name + ".property2idx") as f:
parser = argparse.ArgumentParser()
parser.add_argument('model_name')
parser.add_argument('val_set')
parser.add_argument('save_to')
parser.add_argument('--data_folder', default="../../../data/")
parser.add_argument('--word_embeddings', default="glove/glove.6B.50d.txt")
args = parser.parse_args()

data_folder = args.data_folder
model_name = args.model_name

word2idx = embedding_utils.load_word_index(data_folder + args.word_embeddings)

val_data, _ = io.load_relation_graphs_from_file(data_folder + args.val_set,
                                                load_vertices=True)
print("Applying the model to a dataset of size: {}".format(len(val_data)))

print("Reading the property index")
with open(data_folder + "keras-models/" + model_name + ".property2idx") as f:
    property2idx = ast.literal_eval(f.read())
n_out = len(property2idx)
print("N_out:", n_out)
idx2property = {v: k for k, v in property2idx.items()}

with open(data_folder + "properties-with-labels.txt") as infile:
    property2label = {
        l.split("\t")[0]: l.split("\t")[1].strip()
        for l in infile.readlines()
    }
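# Example invocation (a sketch; the script filename is hypothetical, the
# arguments mirror the parser defined above):
#
#     python apply_model.py ContextAware semantic-graphs-filtered-validation.02_06.json predictions.json \
#         --data_folder ../../../data/ --word_embeddings glove/glove.6B.50d.txt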
def main(model_params, model_name, data_folder, word_embeddings, train_set,
         val_set, property_index, learning_rate, shuffle_data, save_folder,
         save_model, grad_clip):
    if not os.path.exists(save_folder):
        os.mkdir(save_folder)

    with open(model_params) as f:
        model_params = json.load(f)

    embeddings, word2idx = embedding_utils.load(data_folder + word_embeddings)
    print("Loaded embeddings:", embeddings.shape)

    def check_data(data):
        for g in data:
            if 'vertexSet' not in g:
                print("vertexSet missed\n")

    training_data, _ = io.load_relation_graphs_from_file(data_folder + train_set,
                                                         load_vertices=True)
    val_data, _ = io.load_relation_graphs_from_file(data_folder + val_set,
                                                    load_vertices=True)
    check_data(training_data)
    check_data(val_data)

    if property_index:
        print("Reading the property index from parameter")
        # Use the function parameter here; `args` is not in scope inside main().
        with open(data_folder + property_index) as f:
            property2idx = ast.literal_eval(f.read())
    else:
        _, property2idx = embedding_utils.init_random(
            {e["kbID"] for g in training_data for e in g["edgeSet"]} | {"P0"},
            1,
            add_all_zeroes=True,
            add_unknown=True)

    max_sent_len = max(len(g["tokens"]) for g in training_data)
    print("Max sentence length:", max_sent_len)
    max_sent_len = 36
    print("Max sentence length set to: {}".format(max_sent_len))

    graphs_to_indices = sp_models.to_indices
    if model_name == "ContextAware":
        graphs_to_indices = sp_models.to_indices_with_real_entities_and_entity_nums_with_vertex_padding
    elif model_name == "PCNN":
        graphs_to_indices = sp_models.to_indices_with_relative_positions_and_pcnn_mask
    elif model_name == "CNN":
        graphs_to_indices = sp_models.to_indices_with_relative_positions
    elif model_name == "GPGNN":
        graphs_to_indices = sp_models.to_indices_with_real_entities_and_entity_nums_with_vertex_padding

    _, position2idx = embedding_utils.init_random(
        np.arange(-max_sent_len, max_sent_len), 1, add_all_zeroes=True)

    train_as_indices = list(
        graphs_to_indices(training_data,
                          word2idx,
                          property2idx,
                          max_sent_len,
                          embeddings=embeddings,
                          position2idx=position2idx))
    training_data = None

    n_out = len(property2idx)
    print("N_out:", n_out)

    val_as_indices = list(
        graphs_to_indices(val_data,
                          word2idx,
                          property2idx,
                          max_sent_len,
                          embeddings=embeddings,
                          position2idx=position2idx))
    val_data = None

    print("Save property dictionary.")
    with open(data_folder + "models/" + model_name + ".property2idx",
              'w') as outfile:
        outfile.write(str(property2idx))

    print("Training the model")
    print("Initialize the model")
    model = get_model(model_name)(model_params, embeddings, max_sent_len,
                                  n_out).cuda()

    loss_func = nn.CrossEntropyLoss(ignore_index=0).cuda()
    opt = torch.optim.Adam(filter(lambda p: p.requires_grad,
                                  model.parameters()),
                           lr=learning_rate,
                           weight_decay=model_params['weight_decay'])

    indices = np.arange(train_as_indices[0].shape[0])

    step = 0
    for train_epoch in range(model_params['nb_epoch']):
        if shuffle_data:
            np.random.shuffle(indices)
        f1 = 0
        for i in tqdm(
                range(int(train_as_indices[0].shape[0] /
                          model_params['batch_size']))):
            opt.zero_grad()

            sentence_input = train_as_indices[0][
                indices[i * model_params['batch_size']:(i + 1) *
                        model_params['batch_size']]]
            entity_markers = train_as_indices[1][
                indices[i * model_params['batch_size']:(i + 1) *
                        model_params['batch_size']]]
            labels = train_as_indices[2][
                indices[i * model_params['batch_size']:(i + 1) *
                        model_params['batch_size']]]

            if model_name == "GPGNN":
                output = model(
                    Variable(torch.from_numpy(sentence_input.astype(int))).cuda(),
                    Variable(torch.from_numpy(entity_markers.astype(int))).cuda(),
                    train_as_indices[3][indices[i * model_params['batch_size']:
                                                (i + 1) * model_params['batch_size']]])
            elif model_name == "PCNN":
                output = model(
                    Variable(torch.from_numpy(sentence_input.astype(int))).cuda(),
                    Variable(torch.from_numpy(entity_markers.astype(int))).cuda(),
                    Variable(torch.from_numpy(
                        np.array(train_as_indices[3][i * model_params['batch_size']:
                                                     (i + 1) * model_params['batch_size']])).float(),
                             requires_grad=False).cuda())
            else:
                output = model(
                    Variable(torch.from_numpy(sentence_input.astype(int))).cuda(),
                    Variable(torch.from_numpy(entity_markers.astype(int))).cuda())

            loss = loss_func(
                output,
                Variable(torch.from_numpy(labels.astype(int))).view(-1).cuda())
            loss.backward()
            torch.nn.utils.clip_grad_norm(model.parameters(), grad_clip)
            opt.step()

            _, predicted = torch.max(output, dim=1)
            labels = labels.reshape(-1).tolist()
            predicted = predicted.data.tolist()
            p_indices = np.array(labels) != 0
            predicted = np.array(predicted)[p_indices].tolist()
            labels = np.array(labels)[p_indices].tolist()
            _, _, add_f1 = evaluation_utils.evaluate_instance_based(
                predicted, labels, empty_label=p0_index)
            f1 += add_f1
        print("Train f1: ",
              f1 / (train_as_indices[0].shape[0] / model_params['batch_size']))

        val_f1 = 0
        for i in tqdm(
                range(int(val_as_indices[0].shape[0] /
                          model_params['batch_size']))):
            sentence_input = val_as_indices[0][i * model_params['batch_size']:
                                               (i + 1) * model_params['batch_size']]
            entity_markers = val_as_indices[1][i * model_params['batch_size']:
                                               (i + 1) * model_params['batch_size']]
            labels = val_as_indices[2][i * model_params['batch_size']:
                                       (i + 1) * model_params['batch_size']]
            if model_name == "GPGNN":
                output = model(
                    Variable(torch.from_numpy(sentence_input.astype(int)),
                             volatile=True).cuda(),
                    Variable(torch.from_numpy(entity_markers.astype(int)),
                             volatile=True).cuda(),
                    val_as_indices[3][i * model_params['batch_size']:
                                      (i + 1) * model_params['batch_size']])
            elif model_name == "PCNN":
                output = model(
                    Variable(torch.from_numpy(sentence_input.astype(int)),
                             volatile=True).cuda(),
                    Variable(torch.from_numpy(entity_markers.astype(int)),
                             volatile=True).cuda(),
                    Variable(torch.from_numpy(
                        np.array(val_as_indices[3][i * model_params['batch_size']:
                                                   (i + 1) * model_params['batch_size']])).float(),
                             volatile=True).cuda())
            else:
                output = model(
                    Variable(torch.from_numpy(sentence_input.astype(int)),
                             volatile=True).cuda(),
                    Variable(torch.from_numpy(entity_markers.astype(int)),
                             volatile=True).cuda())

            _, predicted = torch.max(output, dim=1)
            labels = labels.reshape(-1).tolist()
            predicted = predicted.data.tolist()
            p_indices = np.array(labels) != 0
            predicted = np.array(predicted)[p_indices].tolist()
            labels = np.array(labels)[p_indices].tolist()
            _, _, add_f1 = evaluation_utils.evaluate_instance_based(
                predicted, labels, empty_label=p0_index)
            val_f1 += add_f1
        print(
            "Validation f1: ",
            val_f1 / (val_as_indices[0].shape[0] / model_params['batch_size']))

        # save model
        if train_epoch % 5 == 0 and save_model:
            torch.save(
                model.state_dict(),
                "{0}{1}-{2}.out".format(save_folder, model_name,
                                        str(train_epoch)))

        step = step + 1
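# For reference: `evaluation_utils.evaluate_instance_based` (used above) is
# assumed to return micro-averaged (precision, recall, f1) over instances,
# ignoring the empty label. A self-contained sketch of that contract, not
# the repository's implementation:
def evaluate_instance_based_sketch(predicted, gold, empty_label=None):
    # Count true positives among non-empty predictions.
    tp = sum(1 for p, g in zip(predicted, gold) if p == g and p != empty_label)
    pred_pos = sum(1 for p in predicted if p != empty_label)
    gold_pos = sum(1 for g in gold if g != empty_label)
    precision = tp / pred_pos if pred_pos else 0.0
    recall = tp / gold_pos if gold_pos else 0.0
    f1 = (2 * precision * recall / (precision + recall)
          if precision + recall else 0.0)
    return precision, recall, f1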
def train():
    """ Main Configurations """
    model_name = "RECON"
    data_folder = "./data/WikipediaWikidataDistantSupervisionAnnotations.v1.0/enwiki-20160501/"
    save_folder = "./models/RECON/"

    model_params = "model_params.json"
    word_embeddings = "glove.6B.50d.txt"

    train_set = "semantic-graphs-filtered-training.02_06.json"
    val_set = "semantic-graphs-filtered-validation.02_06.json"

    use_char_vocab = False

    gat_embedding_file = None
    gat_relation_embedding_file = None
    context_data_file = None  # default, so the check below is safe for non-RECON models

    # Enter the appropriate file paths here
    if "RECON" in model_name:
        context_data_file = "./data/WikipediaWikidataDistantSupervisionAnnotations.v1.0/entities_context.json"
    if "KGGAT" in model_name:
        gat_embedding_file = './models/GAT/WikipediaWikidataDistantSupervisionAnnotations/final_entity_embeddings.json'
        gat_entity2id_file = './data/GAT/WikipediaWikidataDistantSupervisionAnnotations.v1.0/entity2id.txt'
    if model_name == "RECON":
        # Point to the trained model/embedding/data files
        gat_relation_embedding_file = './models/GAT/WikipediaWikidataDistantSupervisionAnnotations/final_relation_embeddings.json'
        gat_relation2id_file = './data/GAT/WikipediaWikidataDistantSupervisionAnnotations.v1.0/relation2id.txt'
        w_ent2rel_all_rels_file = './models/GAT/WikipediaWikidataDistantSupervisionAnnotations/W_ent2rel.json.npy'

    # a file to store property2idx
    # if it is None, use model_name.property2idx
    property_index = None
    learning_rate = 1e-3
    shuffle_data = True
    save_model = True
    grad_clip = 0.25
    # os.environ["CUDA_VISIBLE_DEVICES"] = str(device_id)

    with open(model_params) as f:
        model_params = json.load(f)

    global args
    save_folder = args.save_folder
    model_params['batch_size'] = args.batch_size
    model_params['nb_epoch'] = args.epochs
    val_results_file = os.path.join(save_folder, 'val_results.json')

    char_vocab_file = os.path.join(save_folder, "char_vocab.json")

    if not os.path.exists(save_folder):
        os.mkdir(save_folder)

    sp_models.set_max_edges(
        model_params['max_num_nodes'] * (model_params['max_num_nodes'] - 1),
        model_params['max_num_nodes'])

    if context_data_file:
        with open(context_data_file, 'r') as f:
            context_data = json.load(f)
    if gat_embedding_file:
        with open(gat_embedding_file, 'r') as f:
            gat_embeddings = json.load(f)
        with open(gat_relation_embedding_file, 'r') as f:
            gat_relation_embeddings = json.load(f)
    if gat_relation_embedding_file:
        W_ent2rel_all_rels = np.load(w_ent2rel_all_rels_file)
        with open(gat_entity2id_file, 'r') as f:
            gat_entity2idx = {}
            for line in f.read().split('\n'):
                line_arr = line.split(' ')
                if len(line_arr) == 2:
                    gat_entity2idx[line_arr[0].strip()] = line_arr[1].strip()
        with open(gat_relation2id_file, 'r') as f:
            gat_relation2idx = {}
            for line in f.read().split('\n'):
                line_arr = line.split(' ')
                if len(line_arr) == 2:
                    gat_relation2idx[line_arr[0].strip()] = line_arr[1].strip()

    embeddings, word2idx = embedding_utils.load(data_folder + word_embeddings)
    print("Loaded embeddings:", embeddings.shape)

    def check_data(data):
        for g in data:
            if 'vertexSet' not in g:
                print("vertexSet missed\n")

    training_data, _ = io.load_relation_graphs_from_file(data_folder + train_set,
                                                         load_vertices=True,
                                                         data='nyt')
    if not use_char_vocab:
        char_vocab = context_utils.make_char_vocab(training_data)
        print("Save char vocab dictionary.")
        with open(char_vocab_file, 'w') as outfile:
            json.dump(char_vocab, outfile, indent=4)
    else:
        with open(char_vocab_file, 'r') as f:
            char_vocab = json.load(f)

    val_data, _ = io.load_relation_graphs_from_file(data_folder + val_set,
                                                    load_vertices=True,
                                                    data="nyt")
    check_data(training_data)
    check_data(val_data)

    if property_index:
        print("Reading the property index from parameter")
        with open(data_folder + args.property_index) as f:
            property2idx = ast.literal_eval(f.read())
        with open(data_folder + args.entity_index) as f:
            entity2idx = ast.literal_eval(f.read())
    else:
        _, property2idx = embedding_utils.init_random(
            {e["kbID"] for g in training_data for e in g["edgeSet"]} | {"P0"},
            1,
            add_all_zeroes=True,
            add_unknown=True)
        _, entity2idx = context_utils.init_random(
            {kbID for kbID, _ in context_data.items()},
            model_params['embedding_dim'],
            add_all_zeroes=True,
            add_unknown=True)
    # Inverse maps, needed for context lookup and for the RECON model below.
    idx2property = {v: k for k, v in property2idx.items()}
    idx2entity = {v: k for k, v in entity2idx.items()}
    context_data['ALL_ZERO'] = {
        'desc': '',
        'label': 'ALL_ZERO',
        'instances': [],
        'aliases': []
    }

    max_sent_len = max(len(g["tokens"]) for g in training_data)
    print("Max sentence length:", max_sent_len)
    max_sent_len = 36
    print("Max sentence length set to: {}".format(max_sent_len))

    # All graph-based models (GPGNN and the RECON family) share the same
    # graph-to-indices conversion.
    graphs_to_indices = sp_models.to_indices
    if model_name == "ContextAware":
        graphs_to_indices = sp_models.to_indices_with_real_entities_and_entity_nums_with_vertex_padding
    elif model_name == "PCNN":
        graphs_to_indices = sp_models.to_indices_with_relative_positions_and_pcnn_mask
    elif model_name == "CNN":
        graphs_to_indices = sp_models.to_indices_with_relative_positions
    elif model_name in ("GPGNN", "RECON-EAC", "RECON-EAC-KGGAT", "RECON"):
        graphs_to_indices = sp_models.to_indices_with_real_entities_and_entity_nums_with_vertex_padding

    _, position2idx = embedding_utils.init_random(
        np.arange(-max_sent_len, max_sent_len), 1, add_all_zeroes=True)

    train_as_indices = list(
        graphs_to_indices(training_data,
                          word2idx,
                          property2idx,
                          max_sent_len,
                          embeddings=embeddings,
                          position2idx=position2idx,
                          entity2idx=entity2idx))
    training_data = None

    n_out = len(property2idx)
    print("N_out:", n_out)

    val_as_indices = list(
        graphs_to_indices(val_data,
                          word2idx,
                          property2idx,
                          max_sent_len,
                          embeddings=embeddings,
                          position2idx=position2idx,
                          entity2idx=entity2idx))
    val_data = None

    print("Save property dictionary.")
    with open(os.path.join(save_folder, model_name + ".property2idx"),
              'w') as outfile:
        outfile.write(str(property2idx))
    print("Save entity dictionary.")
    with open(os.path.join(save_folder, model_name + ".entity2idx"),
              'w') as outfile:
        outfile.write(str(entity2idx))

    print("Training the model")
    print("Initialize the model")
    if "RECON" not in model_name:
        model = get_model(model_name)(model_params, embeddings, max_sent_len,
                                      n_out)
    elif model_name in ("RECON-EAC", "RECON-EAC-KGGAT"):
        model = get_model(model_name)(model_params, embeddings, max_sent_len,
                                      n_out, char_vocab)
    elif model_name == "RECON":
        model = get_model(model_name)(model_params, embeddings, max_sent_len,
                                      n_out, char_vocab,
                                      gat_relation_embeddings,
                                      W_ent2rel_all_rels, idx2property,
                                      gat_relation2idx)
    model = model.cuda()

    loss_func = nn.CrossEntropyLoss(ignore_index=0).cuda()
    opt = torch.optim.Adam(filter(lambda p: p.requires_grad,
                                  model.parameters()),
                           lr=learning_rate,
                           weight_decay=model_params['weight_decay'])

    indices = np.arange(train_as_indices[0].shape[0])

    step = 0
    val_results = []
    for train_epoch in range(model_params['nb_epoch']):
        if shuffle_data:
            np.random.shuffle(indices)
        f1 = 0
        for i in tqdm(
                range(int(train_as_indices[0].shape[0] /
                          model_params['batch_size']))):
            opt.zero_grad()

            sentence_input = train_as_indices[0][
                indices[i * model_params['batch_size']:(i + 1) *
                        model_params['batch_size']]]
            entity_markers = train_as_indices[1][
                indices[i * model_params['batch_size']:(i + 1) *
                        model_params['batch_size']]]
            labels = train_as_indices[2][
                indices[i * model_params['batch_size']:(i + 1) *
                        model_params['batch_size']]]
            if "RECON" in model_name:
                entity_indices = train_as_indices[4][
                    indices[i * model_params['batch_size']:(i + 1) *
                            model_params['batch_size']]]
                unique_entities, unique_entities_surface_forms, max_occurred_entity_in_batch_pos = context_utils.get_batch_unique_entities(
                    train_as_indices[4][indices[i * model_params['batch_size']:
                                                (i + 1) * model_params['batch_size']]],
                    train_as_indices[5][indices[i * model_params['batch_size']:
                                                (i + 1) * model_params['batch_size']]])
                unique_entities_context_indices = context_utils.get_context_indices(
                    unique_entities,
                    unique_entities_surface_forms,
                    context_data,
                    idx2entity,
                    word2idx,
                    char_vocab,
                    model_params['conv_filter_size'],
                    max_sent_len=32,
                    max_num_contexts=32,
                    max_char_len=10,
                    data='nyt')
                entities_position = context_utils.get_entity_location_unique_entities(
                    unique_entities, entity_indices)
                if model_name == "RECON-EAC-KGGAT":
                    gat_entity_embeddings = context_utils.get_gat_entity_embeddings(
                        entity_indices, entity2idx, idx2entity, gat_entity2idx,
                        gat_embeddings)
                elif model_name == "RECON":
                    gat_entity_embeddings, nonzero_gat_entity_embeddings, nonzero_entity_pos = context_utils.get_selected_gat_entity_embeddings(
                        entity_indices, entity2idx, idx2entity, gat_entity2idx,
                        gat_embeddings)

            if model_name == "RECON":
                output = model(
                    Variable(torch.from_numpy(sentence_input.astype(int))).cuda(),
                    Variable(torch.from_numpy(entity_markers.astype(int))).cuda(),
                    train_as_indices[3][indices[i * model_params['batch_size']:
                                                (i + 1) * model_params['batch_size']]],
                    Variable(torch.from_numpy(unique_entities.astype(np.long))).cuda(),
                    Variable(torch.from_numpy(entity_indices.astype(np.long))).cuda(),
                    Variable(torch.from_numpy(
                        unique_entities_context_indices[0].astype(np.long))).cuda(),
                    Variable(torch.from_numpy(
                        unique_entities_context_indices[1].astype(np.long))).cuda(),
                    Variable(torch.from_numpy(
                        unique_entities_context_indices[2].astype(bool))).cuda(),
                    Variable(torch.from_numpy(entities_position.astype(int))).cuda(),
                    max_occurred_entity_in_batch_pos,
                    Variable(torch.from_numpy(
                        nonzero_gat_entity_embeddings.astype(np.float32)),
                             requires_grad=False).cuda(),
                    nonzero_entity_pos,
                    Variable(torch.from_numpy(
                        gat_entity_embeddings.astype(np.float32)),
                             requires_grad=False).cuda())
            elif model_name == "RECON-EAC-KGGAT":
                output = model(
                    Variable(torch.from_numpy(sentence_input.astype(int))).cuda(),
                    Variable(torch.from_numpy(entity_markers.astype(int))).cuda(),
                    train_as_indices[3][indices[i * model_params['batch_size']:
                                                (i + 1) * model_params['batch_size']]],
                    Variable(torch.from_numpy(unique_entities.astype(np.long))).cuda(),
                    Variable(torch.from_numpy(entity_indices.astype(np.long))).cuda(),
                    Variable(torch.from_numpy(
                        unique_entities_context_indices[0].astype(np.long))).cuda(),
                    Variable(torch.from_numpy(
                        unique_entities_context_indices[1].astype(np.long))).cuda(),
                    Variable(torch.from_numpy(
                        unique_entities_context_indices[2].astype(bool))).cuda(),
                    Variable(torch.from_numpy(entities_position.astype(int))).cuda(),
                    max_occurred_entity_in_batch_pos,
                    Variable(torch.from_numpy(
                        gat_entity_embeddings.astype(np.float32)),
                             requires_grad=False).cuda())
            elif model_name == "RECON-EAC":
                output = model(
                    Variable(torch.from_numpy(sentence_input.astype(int))).cuda(),
                    Variable(torch.from_numpy(entity_markers.astype(int))).cuda(),
                    train_as_indices[3][indices[i * model_params['batch_size']:
                                                (i + 1) * model_params['batch_size']]],
                    Variable(torch.from_numpy(unique_entities.astype(np.long))).cuda(),
                    Variable(torch.from_numpy(entity_indices.astype(np.long))).cuda(),
                    Variable(torch.from_numpy(
                        unique_entities_context_indices[0].astype(np.long))).cuda(),
                    Variable(torch.from_numpy(
                        unique_entities_context_indices[1].astype(np.long))).cuda(),
                    Variable(torch.from_numpy(
                        unique_entities_context_indices[2].astype(bool))).cuda(),
                    Variable(torch.from_numpy(entities_position.astype(int))).cuda(),
                    max_occurred_entity_in_batch_pos)
            elif model_name == "GPGNN":
                output = model(
                    Variable(torch.from_numpy(sentence_input.astype(int))).cuda(),
                    Variable(torch.from_numpy(entity_markers.astype(int))).cuda(),
                    train_as_indices[3][indices[i * model_params['batch_size']:
                                                (i + 1) * model_params['batch_size']]])
            elif model_name == "PCNN":
                output = model(
                    Variable(torch.from_numpy(sentence_input.astype(int))).cuda(),
                    Variable(torch.from_numpy(entity_markers.astype(int))).cuda(),
                    Variable(torch.from_numpy(
                        np.array(train_as_indices[3][i * model_params['batch_size']:
                                                     (i + 1) * model_params['batch_size']])).float(),
                             requires_grad=False).cuda())
            else:
                output = model(
                    Variable(torch.from_numpy(sentence_input.astype(int))).cuda(),
                    Variable(torch.from_numpy(entity_markers.astype(int))).cuda())

            loss = loss_func(
                output,
                Variable(torch.from_numpy(labels.astype(int))).view(-1).cuda())
            loss.backward()
            torch.nn.utils.clip_grad_norm(model.parameters(), grad_clip)
            opt.step()

            _, predicted = torch.max(output, dim=1)
            labels = labels.reshape(-1).tolist()
            predicted = predicted.data.tolist()
            p_indices = np.array(labels) != 0
            predicted = np.array(predicted)[p_indices].tolist()
            labels = np.array(labels)[p_indices].tolist()
            _, _, add_f1 = evaluation_utils.evaluate_instance_based(
                predicted, labels, empty_label=p0_index)
            f1 += add_f1

        train_f1 = f1 / (train_as_indices[0].shape[0] /
                         model_params['batch_size'])
        print("Train f1: ", train_f1)

        val_f1 = 0
        for i in tqdm(
                range(int(val_as_indices[0].shape[0] /
                          model_params['batch_size']))):
            sentence_input = val_as_indices[0][i * model_params['batch_size']:
                                               (i + 1) * model_params['batch_size']]
            entity_markers = val_as_indices[1][i * model_params['batch_size']:
                                               (i + 1) * model_params['batch_size']]
            labels = val_as_indices[2][i * model_params['batch_size']:
                                       (i + 1) * model_params['batch_size']]
            if "RECON" in model_name:
                entity_indices = val_as_indices[4][i * model_params['batch_size']:
                                                   (i + 1) * model_params['batch_size']]
                unique_entities, unique_entities_surface_forms, max_occurred_entity_in_batch_pos = context_utils.get_batch_unique_entities(
                    val_as_indices[4][i * model_params['batch_size']:
                                      (i + 1) * model_params['batch_size']],
                    val_as_indices[5][i * model_params['batch_size']:
                                      (i + 1) * model_params['batch_size']])
                unique_entities_context_indices = context_utils.get_context_indices(
                    unique_entities,
                    unique_entities_surface_forms,
                    context_data,
                    idx2entity,
                    word2idx,
                    char_vocab,
                    model_params['conv_filter_size'],
                    max_sent_len=32,
                    max_num_contexts=32,
                    max_char_len=10,
                    data='nyt')
                entities_position = context_utils.get_entity_location_unique_entities(
                    unique_entities, entity_indices)
                if model_name == "RECON-EAC-KGGAT":
                    gat_entity_embeddings = context_utils.get_gat_entity_embeddings(
                        entity_indices, entity2idx, idx2entity, gat_entity2idx,
                        gat_embeddings)
                elif model_name == "RECON":
                    gat_entity_embeddings, nonzero_gat_entity_embeddings, nonzero_entity_pos = context_utils.get_selected_gat_entity_embeddings(
                        entity_indices, entity2idx, idx2entity, gat_entity2idx,
                        gat_embeddings)

            # All branches below slice the *validation* sentence-pair data
            # for this batch.
            if model_name == "RECON":
                output = model(
                    Variable(torch.from_numpy(sentence_input.astype(int))).cuda(),
                    Variable(torch.from_numpy(entity_markers.astype(int))).cuda(),
                    val_as_indices[3][i * model_params['batch_size']:
                                      (i + 1) * model_params['batch_size']],
                    Variable(torch.from_numpy(unique_entities.astype(np.long))).cuda(),
                    Variable(torch.from_numpy(entity_indices.astype(np.long))).cuda(),
                    Variable(torch.from_numpy(
                        unique_entities_context_indices[0].astype(np.long))).cuda(),
                    Variable(torch.from_numpy(
                        unique_entities_context_indices[1].astype(np.long))).cuda(),
                    Variable(torch.from_numpy(
                        unique_entities_context_indices[2].astype(bool))).cuda(),
                    Variable(torch.from_numpy(entities_position.astype(int))).cuda(),
                    max_occurred_entity_in_batch_pos,
                    Variable(torch.from_numpy(
                        nonzero_gat_entity_embeddings.astype(np.float32)),
                             requires_grad=False).cuda(),
                    nonzero_entity_pos,
                    Variable(torch.from_numpy(
                        gat_entity_embeddings.astype(np.float32)),
                             requires_grad=False).cuda())
            elif model_name == "RECON-EAC-KGGAT":
                output = model(
                    Variable(torch.from_numpy(sentence_input.astype(int))).cuda(),
                    Variable(torch.from_numpy(entity_markers.astype(int))).cuda(),
                    val_as_indices[3][i * model_params['batch_size']:
                                      (i + 1) * model_params['batch_size']],
                    Variable(torch.from_numpy(unique_entities.astype(np.long))).cuda(),
                    Variable(torch.from_numpy(entity_indices.astype(np.long))).cuda(),
                    Variable(torch.from_numpy(
                        unique_entities_context_indices[0].astype(np.long))).cuda(),
                    Variable(torch.from_numpy(
                        unique_entities_context_indices[1].astype(np.long))).cuda(),
                    Variable(torch.from_numpy(
                        unique_entities_context_indices[2].astype(bool))).cuda(),
                    Variable(torch.from_numpy(entities_position.astype(int))).cuda(),
                    max_occurred_entity_in_batch_pos,
                    Variable(torch.from_numpy(
                        gat_entity_embeddings.astype(np.float32)),
                             requires_grad=False).cuda())
            elif model_name == "RECON-EAC":
                output = model(
                    Variable(torch.from_numpy(sentence_input.astype(int))).cuda(),
                    Variable(torch.from_numpy(entity_markers.astype(int))).cuda(),
                    val_as_indices[3][i * model_params['batch_size']:
                                      (i + 1) * model_params['batch_size']],
                    Variable(torch.from_numpy(unique_entities.astype(np.long))).cuda(),
                    Variable(torch.from_numpy(entity_indices.astype(np.long))).cuda(),
                    Variable(torch.from_numpy(
                        unique_entities_context_indices[0].astype(np.long))).cuda(),
                    Variable(torch.from_numpy(
                        unique_entities_context_indices[1].astype(np.long))).cuda(),
                    Variable(torch.from_numpy(
                        unique_entities_context_indices[2].astype(bool))).cuda(),
                    Variable(torch.from_numpy(entities_position.astype(int))).cuda(),
                    max_occurred_entity_in_batch_pos)
            elif model_name == "GPGNN":
                output = model(
                    Variable(torch.from_numpy(sentence_input.astype(int))).cuda(),
                    Variable(torch.from_numpy(entity_markers.astype(int))).cuda(),
                    val_as_indices[3][i * model_params['batch_size']:
                                      (i + 1) * model_params['batch_size']])
            elif model_name == "PCNN":
                output = model(
                    Variable(torch.from_numpy(sentence_input.astype(int))).cuda(),
                    Variable(torch.from_numpy(entity_markers.astype(int))).cuda(),
                    Variable(torch.from_numpy(
                        np.array(val_as_indices[3][i * model_params['batch_size']:
                                                   (i + 1) * model_params['batch_size']])).float(),
                             requires_grad=False).cuda())
            else:
                output = model(
                    Variable(torch.from_numpy(sentence_input.astype(int))).cuda(),
                    Variable(torch.from_numpy(entity_markers.astype(int))).cuda())

            _, predicted = torch.max(output, dim=1)
            labels = labels.reshape(-1).tolist()
            predicted = predicted.data.tolist()
            p_indices = np.array(labels) != 0
            predicted = np.array(predicted)[p_indices].tolist()
            labels = np.array(labels)[p_indices].tolist()
            _, _, add_f1 = evaluation_utils.evaluate_instance_based(
                predicted, labels, empty_label=p0_index)
            val_f1 += add_f1

        val_f1 = val_f1 / (val_as_indices[0].shape[0] /
                           model_params['batch_size'])
        print("Validation f1: ", val_f1)
        val_results.append({'train_f1': train_f1, 'val_f1': val_f1})

        # save model
        if train_epoch % 1 == 0 and save_model:
            torch.save(
                model.state_dict(),
                "{0}{1}-{2}.out".format(save_folder, model_name,
                                        str(train_epoch)))
        step = step + 1

    with open(val_results_file, 'w') as f:
        json.dump(val_results, f, indent=4, cls=context_utils.CustomEncoder)
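# After training, `val_results.json` holds one {"train_f1", "val_f1"} record
# per epoch and a checkpoint is saved every epoch, so the best checkpoint can
# be located like this (illustrative sketch based on the layout above):
#
#     with open(os.path.join(save_folder, 'val_results.json')) as f:
#         results = json.load(f)
#     best_epoch = max(range(len(results)), key=lambda e: results[e]['val_f1'])
#     best_checkpoint = "{0}{1}-{2}.out".format(save_folder, model_name, best_epoch)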
import json
import logging

import nltk  # needed for the Stanford NER/POS taggers below
import tqdm

from semanticgraph import io

logging.basicConfig(level=logging.DEBUG)

if __name__ == "__main__":
    logger = logging.getLogger(__name__)
    logger.setLevel(logging.ERROR)

    data_folder = "../data/"

    relations_data, _ = io.load_relation_graphs_from_file(
        "/home/local/UKP/sorokin/IdeaProjects/semantic-parsing-training-data-pipeline/data/training-data/semantic-graphs-filtered-training.02_06.json",
        load_vertices=False)
    logging.debug('Loaded, size: {}'.format(len(relations_data)))

    ne_tagger = nltk.tag.stanford.StanfordNERTagger(
        "/home/local/UKP/sorokin/IdeaProjects/question-answering/resources/models-3.7.0/edu/stanford/nlp/models/ner/english.all.3class.caseless.distsim.crf.ser.gz",
        path_to_jar=
        "/home/local/UKP/sorokin/IdeaProjects/question-answering/resources/stanford-ner-2015-12-09/stanford-ner-3.6.0.jar"
    )
    pos_tagger = nltk.tag.stanford.StanfordPOSTagger(
        "/home/local/UKP/sorokin/IdeaProjects/question-answering/resources/models-3.7.0/edu/stanford/nlp/models/pos-tagger/english-caseless-left3words-distsim.tagger",
        path_to_jar=
        "/home/local/UKP/sorokin/IdeaProjects/question-answering/resources/stanford-postagger-full-2015-12-09/stanford-postagger-3.6.0.jar"
    )

    webquestions_utterances_tokens = [
        q_obj['tokens'] for q_obj in relations_data
    ]