Example #1
def main(model_params, model_name, data_folder, word_embeddings, test_set,
         property_index, save_folder, load_model, result_folder):

    with open(model_params) as f:
        model_params = json.load(f)

    embeddings, word2idx = embedding_utils.load(data_folder + word_embeddings)
    print("Loaded embeddings:", embeddings.shape)

    def check_data(data):
        for g in data:
            if 'vertexSet' not in g:
                print("vertexSet missing")

    print("Reading the property index")
    with open(data_folder + "models/" + model_name + ".property2idx") as f:
        property2idx = ast.literal_eval(f.read())

    max_sent_len = 36
    print("Max sentence length set to: {}".format(max_sent_len))

    graphs_to_indices = sp_models.to_indices_and_entity_pair
    if model_name == "ContextAware":
        graphs_to_indices = sp_models.to_indices_with_real_entities_and_entity_nums_with_vertex_padding_and_entity_pair
    elif model_name == "PCNN":
        graphs_to_indices = sp_models.to_indices_with_relative_positions_and_pcnn_mask_and_entity_pair
    elif model_name == "CNN":
        graphs_to_indices = sp_models.to_indices_with_relative_positions_and_entity_pair
    elif model_name == "GPGNN":
        graphs_to_indices = sp_models.to_indices_with_real_entities_and_entity_nums_with_vertex_padding_and_entity_pair
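    # The default converter (to_indices_and_entity_pair) covers the remaining
    # baselines, e.g. the LSTM model referenced later in this snippet.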

    _, position2idx = embedding_utils.init_random(
        np.arange(-max_sent_len, max_sent_len), 1, add_all_zeroes=True)

    training_data = None

    n_out = len(property2idx)
    print("N_out:", n_out)

    model = get_model(model_name)(model_params, embeddings, max_sent_len,
                                  n_out).cuda()
    model.load_state_dict(torch.load(save_folder + load_model))
    print("Testing")

    print("Results on the test set")
    test_set, _ = io.load_relation_graphs_from_file(data_folder + test_set)
    test_as_indices = list(
        graphs_to_indices(test_set,
                          word2idx,
                          property2idx,
                          max_sent_len,
                          embeddings=embeddings,
                          position2idx=position2idx))

    print("Start testing!")
    result_file = open(result_folder + "_" + model_name, "w")
    batch_size = model_params['batch_size']
    for i in tqdm(range(test_as_indices[0].shape[0] // batch_size)):
        sentence_input = test_as_indices[0][i * batch_size:(i + 1) * batch_size]
        entity_markers = test_as_indices[1][i * batch_size:(i + 1) * batch_size]
        labels = test_as_indices[2][i * batch_size:(i + 1) * batch_size]

        if model_name == "GPGNN":
            output = model(
                Variable(torch.from_numpy(sentence_input.astype(int)),
                         volatile=True).cuda(),
                Variable(torch.from_numpy(entity_markers.astype(int)),
                         volatile=True).cuda(),
                test_as_indices[3][i * model_params['batch_size']:(i + 1) *
                                   model_params['batch_size']])
        elif model_name == "PCNN":
            output = model(
                Variable(torch.from_numpy(sentence_input.astype(int)),
                         volatile=True).cuda(),
                Variable(torch.from_numpy(entity_markers.astype(int)),
                         volatile=True).cuda(),
                Variable(torch.from_numpy(
                    np.array(test_as_indices[3]
                             [i * model_params['batch_size']:(i + 1) *
                              model_params['batch_size']])).float(),
                         requires_grad=False,
                         volatile=True).cuda())
        else:
            output = model(
                Variable(torch.from_numpy(sentence_input.astype(int)),
                         volatile=True).cuda(),
                Variable(torch.from_numpy(entity_markers.astype(int)),
                         volatile=True).cuda())

        score = F.softmax(output, dim=-1)
        score = to_np(score).reshape(-1, n_out)
        labels = labels.reshape(-1)
        p_indices = labels != 0
        score = score[p_indices].tolist()
        labels = labels[p_indices].tolist()
        if (model_name != "LSTM" and model_name != "PCNN"
                and model_name != "CNN"):
            entity_pairs = test_as_indices[-1][i *
                                               model_params['batch_size']:(i +
                                                                           1) *
                                               model_params['batch_size']]
            entity_pairs = reduce(lambda x, y: x + y, entity_pairs)
        else:
            entity_pairs = test_as_indices[-1][i *
                                               model_params['batch_size']:(i +
                                                                           1) *
                                               model_params['batch_size']]
        # Loop variables renamed; the originals (`i`, `j`) shadowed the batch
        # counter `i` above.
        for probs, label, entity_pair in zip(score, labels, entity_pairs):
            for index, prob in enumerate(probs):
                result_file.write(
                    str(index) + "\t" + str(prob) + "\t" +
                    str(1 if index == label else 0) + "\t" + entity_pair[0] +
                    "\t" + entity_pair[1] + "\n")
    result_file.close()
Example #2
def test():
    """ Main Configurations """
    model_name = "RECON"
    load_model = "RECON-{}.out"  # you should choose the proper model to load
    # device_id = 0

    data_folder = "./data/WikipediaWikidataDistantSupervisionAnnotations.v1.0/enwiki-20160501/"
    save_folder = "./models/RECON/"
    result_folder = "result/"

    model_params = "model_params.json"
    word_embeddings = "./glove.6B/glove.6B.50d.txt"

    test_set = "semantic-graphs-filtered-held-out.02_06.json"

    context_data_file = None
    gat_embedding_file = None
    gat_relation_embedding_file = None
    if "RECON" in model_name:
        context_data_file = "./data/WikipediaWikidataDistantSupervisionAnnotations.v1.0/entities_context.json"
    if "KGGAT" in model_name:
        gat_embedding_file = './models/GAT/WikipediaWikidataDistantSupervisionAnnotations/final_entity_embeddings.json'
        gat_entity2id_file = './data/GAT/WikipediaWikidataDistantSupervisionAnnotations.v1.0/entity2id.txt'
    if model_name == "RECON":
        gat_relation_embedding_file = './re/models/GAT_sep_space/WikipediaWikidataDistantSupervisionAnnotations/final_relation_embeddings.json'
        gat_relation2id_file = './data/GAT_sep_space/WikipediaWikidataDistantSupervisionAnnotations.v1.0/relation2id.txt'
        w_ent2rel_all_rels_file = './re/models/GAT_sep_space/WikipediaWikidataDistantSupervisionAnnotations/W_ent2rel.json.npy'

    use_char_vocab = False

    # a file to store property2idx
    # if is None use model_name.property2idx
    property_index = None

    with open(model_params) as f:
        model_params = json.load(f)
    global args
    save_folder = args.save_folder
    if args.test_file != '':
        test_set = args.test_file
    result_folder = args.result_folder
    model_params['batch_size'] = args.batch_size
    if not os.path.exists(result_folder):
        os.makedirs(result_folder)

    char_vocab_file = os.path.join(save_folder, "char_vocab.json")

    sp_models.set_max_edges(
        model_params['max_num_nodes'] * (model_params['max_num_nodes'] - 1),
        model_params['max_num_nodes'])
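    # With n = max_num_nodes, n * (n - 1) is the number of ordered entity
    # pairs, i.e. the maximum number of directed edges in a sentence graph.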

    if context_data_file:
        with open(context_data_file, 'r') as f:
            context_data = json.load(f)
    if gat_embedding_file:
        with open(gat_embedding_file, 'r') as f:
            gat_embeddings = json.load(f)
        with open(gat_relation_embedding_file, 'r') as f:
            gat_relation_embeddings = json.load(f)
    if gat_relation_embedding_file:
        W_ent2rel_all_rels = np.load(w_ent2rel_all_rels_file)
        with open(gat_entity2id_file, 'r') as f:
            gat_entity2idx = {}
            data = f.read()
            lines = data.split('\n')
            for line in lines:
                line_arr = line.split(' ')
                if len(line_arr) == 2:
                    gat_entity2idx[line_arr[0].strip()] = line_arr[1].strip()
        with open(gat_relation2id_file, 'r') as f:
            gat_relation2idx = {}
            data = f.read()
            lines = data.split('\n')
            for line in lines:
                line_arr = line.split(' ')
                if len(line_arr) == 2:
                    gat_relation2idx[line_arr[0].strip()] = line_arr[1].strip()
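
    # The two blocks above parse "<name> <id>" lines identically; a local
    # helper (a sketch, not in the original code) could replace both:
    def _load_id_map(path):
        # Map "<name> <id>" lines to a dict, skipping malformed lines.
        with open(path, 'r') as fh:
            rows = (line.split(' ') for line in fh.read().split('\n'))
            return {a.strip(): b.strip()
                    for a, b in (r for r in rows if len(r) == 2)}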

    embeddings, word2idx = embedding_utils.load(word_embeddings)
    print("Loaded embeddings:", embeddings.shape)

    def check_data(data):
        for g in data:
            if 'vertexSet' not in g:
                print("vertexSet missing")

    print("Reading the property index")
    with open(os.path.join(save_folder, model_name + ".property2idx")) as f:
        property2idx = ast.literal_eval(f.read())
    idx2property = {v: k for k, v in property2idx.items()}
    print("Reading the entity index")
    with open(os.path.join(save_folder, model_name + ".entity2idx")) as f:
        entity2idx = ast.literal_eval(f.read())
    idx2entity = {v: k for k, v in entity2idx.items()}
    context_data['ALL_ZERO'] = {
        'desc': '',
        'label': 'ALL_ZERO',
        'instances': [],
        'aliases': []
    }
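    # Blank context entry for the 'ALL_ZERO' padding id, presumably so that
    # context lookups on padded entities do not fail.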

    with open(char_vocab_file, 'r') as f:
        char_vocab = json.load(f)

    max_sent_len = 36
    print("Max sentence length set to: {}".format(max_sent_len))

    graphs_to_indices = sp_models.to_indices_and_entity_pair
    if model_name == "ContextAware":
        graphs_to_indices = sp_models.to_indices_with_real_entities_and_entity_nums_with_vertex_padding
    elif model_name == "PCNN":
        graphs_to_indices = sp_models.to_indices_with_relative_positions_and_pcnn_mask
    elif model_name == "CNN":
        graphs_to_indices = sp_models.to_indices_with_relative_positions
    elif model_name == "GPGNN":
        graphs_to_indices = sp_models.to_indices_with_real_entities_and_entity_nums_with_vertex_padding
    elif model_name == "RECON-EAC":
        graphs_to_indices = sp_models.to_indices_with_real_entities_and_entity_nums_with_vertex_padding
    elif model_name == "RECON-EAC-KGGAT":
        graphs_to_indices = sp_models.to_indices_with_real_entities_and_entity_nums_with_vertex_padding
    elif model_name == "RECON":
        graphs_to_indices = sp_models.to_indices_with_real_entities_and_entity_nums_with_vertex_padding

    _, position2idx = embedding_utils.init_random(
        np.arange(-max_sent_len, max_sent_len), 1, add_all_zeroes=True)

    training_data = None

    n_out = len(property2idx)
    print("N_out:", n_out)

    if "RECON" not in model_name:
        model = get_model(model_name)(model_params, embeddings, max_sent_len,
                                      n_out)
    elif model_name == "RECON-EAC":
        model = get_model(model_name)(model_params, embeddings, max_sent_len,
                                      n_out, char_vocab)
    elif model_name == "RECON-EAC-KGGAT":
        model = get_model(model_name)(model_params, embeddings, max_sent_len,
                                      n_out, char_vocab)
    elif model_name == "RECON":
        model = get_model(model_name)(model_params, embeddings, max_sent_len,
                                      n_out, char_vocab,
                                      gat_relation_embeddings,
                                      W_ent2rel_all_rels, idx2property,
                                      gat_relation2idx)

    model = model.cuda()
    model.load_state_dict(torch.load(os.path.join(save_folder, load_model)))

    print("Testing")

    print("Results on the test set")
    test_set, _ = io.load_relation_graphs_from_file(data_folder + test_set,
                                                    data='nyt')
    test_as_indices = list(
        graphs_to_indices(test_set,
                          word2idx,
                          property2idx,
                          max_sent_len,
                          embeddings=embeddings,
                          position2idx=position2idx,
                          entity2idx=entity2idx))

    print("Start testing!")
    result_file = open(os.path.join(result_folder, "_" + model_name), "w")
    test_f1 = 0.0
    batch_size = model_params['batch_size']
    # `indices` is used for batch slicing below but is never initialised in
    # this snippet; at test time an unshuffled arange (mirroring the training
    # loop's index array) is the natural choice.
    indices = np.arange(test_as_indices[0].shape[0])
    for i in tqdm(range(test_as_indices[0].shape[0] // batch_size)):
        batch_idx = indices[i * batch_size:(i + 1) * batch_size]
        sentence_input = test_as_indices[0][batch_idx]
        entity_markers = test_as_indices[1][batch_idx]
        labels = test_as_indices[2][batch_idx]
        if "RECON" in model_name:
            entity_indices = test_as_indices[4][
                indices[i * model_params['batch_size']:(i + 1) *
                        model_params['batch_size']]]
            unique_entities, unique_entities_surface_forms, max_occurred_entity_in_batch_pos = context_utils.get_batch_unique_entities(
                test_as_indices[4][indices[i *
                                           model_params['batch_size']:(i + 1) *
                                           model_params['batch_size']]],
                test_as_indices[5][indices[i *
                                           model_params['batch_size']:(i + 1) *
                                           model_params['batch_size']]])
            unique_entities_context_indices = context_utils.get_context_indices(
                unique_entities,
                unique_entities_surface_forms,
                context_data,
                idx2entity,
                word2idx,
                char_vocab,
                model_params['conv_filter_size'],
                max_sent_len=32,
                max_num_contexts=32,
                max_char_len=10,
                data='nyt')
            entities_position = context_utils.get_entity_location_unique_entities(
                unique_entities, entity_indices)
        if model_name == "RECON-EAC-KGGAT":
            gat_entity_embeddings = context_utils.get_gat_entity_embeddings(
                entity_indices, entity2idx, idx2entity, gat_entity2idx,
                gat_embeddings)
        elif model_name == "RECON":
            gat_entity_embeddings, nonzero_gat_entity_embeddings, nonzero_entity_pos = context_utils.get_selected_gat_entity_embeddings(
                entity_indices, entity2idx, idx2entity, gat_entity2idx,
                gat_embeddings)

        with torch.no_grad():
            if model_name == "RECON":
                output = model(
                    Variable(torch.from_numpy(
                        sentence_input.astype(int))).cuda(),
                    Variable(torch.from_numpy(
                        entity_markers.astype(int))).cuda(),
                    test_as_indices[3][indices[i *
                                               model_params['batch_size']:(i +
                                                                           1) *
                                               model_params['batch_size']]],
                    Variable(torch.from_numpy(unique_entities.astype(
                        np.long))).cuda(),
                    Variable(torch.from_numpy(entity_indices.astype(
                        np.long))).cuda(),
                    Variable(
                        torch.from_numpy(
                            unique_entities_context_indices[0].astype(
                                np.long))).cuda(),
                    Variable(
                        torch.from_numpy(
                            unique_entities_context_indices[1].astype(
                                np.long))).cuda(),
                    Variable(
                        torch.from_numpy(
                            unique_entities_context_indices[2].astype(
                                bool))).cuda(),
                    Variable(torch.from_numpy(
                        entities_position.astype(int))).cuda(),
                    max_occurred_entity_in_batch_pos,
                    Variable(torch.from_numpy(
                        nonzero_gat_entity_embeddings.astype(np.float32)),
                             requires_grad=False).cuda(), nonzero_entity_pos,
                    Variable(torch.from_numpy(
                        gat_entity_embeddings.astype(np.float32)),
                             requires_grad=False).cuda())
            elif model_name == "RECON-EAC-KGGAT":
                output = model(
                    Variable(torch.from_numpy(
                        sentence_input.astype(int))).cuda(),
                    Variable(torch.from_numpy(
                        entity_markers.astype(int))).cuda(),
                    test_as_indices[3][indices[i *
                                               model_params['batch_size']:(i +
                                                                           1) *
                                               model_params['batch_size']]],
                    Variable(torch.from_numpy(unique_entities.astype(
                        np.long))).cuda(),
                    Variable(torch.from_numpy(entity_indices.astype(
                        np.long))).cuda(),
                    Variable(
                        torch.from_numpy(
                            unique_entities_context_indices[0].astype(
                                np.long))).cuda(),
                    Variable(
                        torch.from_numpy(
                            unique_entities_context_indices[1].astype(
                                np.long))).cuda(),
                    Variable(
                        torch.from_numpy(
                            unique_entities_context_indices[2].astype(
                                bool))).cuda(),
                    Variable(torch.from_numpy(
                        entities_position.astype(int))).cuda(),
                    max_occurred_entity_in_batch_pos,
                    Variable(torch.from_numpy(
                        gat_entity_embeddings.astype(np.float32)),
                             requires_grad=False).cuda())
            elif model_name == "RECON-EAC":
                output = model(
                    Variable(torch.from_numpy(
                        sentence_input.astype(int))).cuda(),
                    Variable(torch.from_numpy(
                        entity_markers.astype(int))).cuda(),
                    test_as_indices[3][indices[i *
                                               model_params['batch_size']:(i +
                                                                           1) *
                                               model_params['batch_size']]],
                    Variable(torch.from_numpy(unique_entities.astype(
                        np.long))).cuda(),
                    Variable(torch.from_numpy(entity_indices.astype(
                        np.long))).cuda(),
                    Variable(
                        torch.from_numpy(
                            unique_entities_context_indices[0].astype(
                                np.long))).cuda(),
                    Variable(
                        torch.from_numpy(
                            unique_entities_context_indices[1].astype(
                                np.long))).cuda(),
                    Variable(
                        torch.from_numpy(
                            unique_entities_context_indices[2].astype(
                                bool))).cuda(),
                    Variable(torch.from_numpy(
                        entities_position.astype(int))).cuda(),
                    max_occurred_entity_in_batch_pos)
            elif model_name == "GPGNN":
                output = model(
                    Variable(torch.from_numpy(
                        sentence_input.astype(int))).cuda(),
                    Variable(torch.from_numpy(
                        entity_markers.astype(int))).cuda(),
                    test_as_indices[3][indices[i *
                                               model_params['batch_size']:(i +
                                                                           1) *
                                               model_params['batch_size']]])
            elif model_name == "PCNN":
                output = model(
                    Variable(torch.from_numpy(
                        sentence_input.astype(int))).cuda(),
                    Variable(torch.from_numpy(
                        entity_markers.astype(int))).cuda(),
                    Variable(torch.from_numpy(
                        np.array(test_as_indices[3]
                                 [i * model_params['batch_size']:(i + 1) *
                                  model_params['batch_size']])).float(),
                             requires_grad=False).cuda())
            else:
                output = model(
                    Variable(torch.from_numpy(
                        sentence_input.astype(int))).cuda(),
                    Variable(torch.from_numpy(
                        entity_markers.astype(int))).cuda())

            _, predicted = torch.max(output, dim=1)
            labels_copy = labels.reshape(-1).tolist()
            predicted = predicted.data.tolist()
            p_indices = np.array(labels_copy) != 0
            predicted = np.array(predicted)[p_indices].tolist()
            labels_copy = np.array(labels_copy)[p_indices].tolist()

            _, _, add_f1 = evaluation_utils.evaluate_instance_based(
                predicted, labels_copy, empty_label=p0_index)
            test_f1 += add_f1

        score = F.softmax(output, dim=-1)
        score = to_np(score).reshape(-1, n_out)
        labels = labels.reshape(-1)
        p_indices = labels != 0
        score = score[p_indices].tolist()
        labels = labels[p_indices].tolist()
        pred_labels = np.argmax(score, axis=-1)
        # Positions of the non-empty instances within the flattened batch;
        # kept under a new name so the batch index array `indices` defined
        # before the loop is not clobbered.
        pos_indices = [k for k in range(len(p_indices)) if p_indices[k]]
        if (model_name != "LSTM" and model_name != "PCNN"
                and model_name != "CNN"):
            entity_pairs = test_as_indices[-1][i *
                                               model_params['batch_size']:(i +
                                                                           1) *
                                               model_params['batch_size']]
            entity_pairs = reduce(lambda x, y: x + y, entity_pairs)
        else:
            entity_pairs = test_as_indices[-1][i *
                                               model_params['batch_size']:(i +
                                                                           1) *
                                               model_params['batch_size']]

        start_idx = i * batch_size
        for index, (probs, label, entity_pair) in enumerate(
                zip(score, labels, entity_pairs)):
            # Map the flat (non-empty) instance index back to its sentence.
            sent = ' '.join(test_set[start_idx + pos_indices[index] //
                                     (model_params['max_num_nodes'] *
                                      (model_params['max_num_nodes'] - 1))]
                            ['tokens']).strip()
            result_file.write("{} | {} | {} | {} | {} | {}\n".format(
                sent, entity_pair[0], entity_pair[1],
                idx2property[pred_labels[index]], idx2property[label],
                probs[pred_labels[index]]))

    print(
        "Test f1: ", test_f1 * 1.0 /
        (test_as_indices[0].shape[0] / model_params['batch_size']))
    result_file.close()
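
# `evaluation_utils.evaluate_instance_based` is not shown in these examples.
# A plausible reading, given how it is called (predictions, gold labels, and
# an `empty_label` to ignore), is micro-averaged precision/recall/F1 over the
# non-empty classes; the sketch below is an assumption, not the repo's code.
def micro_prf(predicted, gold, empty_label=0):
    tp = sum(1 for p, g in zip(predicted, gold)
             if p == g and g != empty_label)
    pred_pos = sum(1 for p in predicted if p != empty_label)
    gold_pos = sum(1 for g in gold if g != empty_label)
    precision = tp / pred_pos if pred_pos else 0.0
    recall = tp / gold_pos if gold_pos else 0.0
    f1 = (2 * precision * recall / (precision + recall)
          if precision + recall else 0.0)
    return precision, recall, f1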
Example #3
    args = parser.parse_args()

    data_folder = args.data_folder
    model_name = args.model_name
    mode = args.mode

    with open(args.model_params) as f:
        model_params = json.load(f)

    embeddings, word2idx = embedding_utils.load(data_folder +
                                                args.word_embeddings)
    print("Loaded embeddings:", embeddings.shape)

    training_data, _ = io.load_relation_graphs_from_file(data_folder +
                                                         args.train_set,
                                                         load_vertices=True)

    val_data, _ = io.load_relation_graphs_from_file(data_folder + args.val_set,
                                                    load_vertices=True)

    if args.s:
        training_data = training_data[:len(training_data) // 3]
        print("Training data size set to: {}".format(len(training_data)))
        val_data = val_data[:len(val_data) // 3]
        print("Validation data size set to: {}".format(len(val_data)))

    if mode in ['test', 'train-plus-test']:
        print("Reading the property index")
        with open(data_folder + "keras-models/" + model_name +
                  ".property2idx") as f:
            property2idx = ast.literal_eval(f.read())
    parser = argparse.ArgumentParser()
    parser.add_argument('model_name')
    parser.add_argument('val_set')
    parser.add_argument('save_to')
    parser.add_argument('--data_folder', default="../../../data/")
    parser.add_argument('--word_embeddings', default="glove/glove.6B.50d.txt")

    args = parser.parse_args()

    data_folder = args.data_folder
    model_name = args.model_name

    word2idx = embedding_utils.load_word_index(data_folder +
                                               args.word_embeddings)

    val_data, _ = io.load_relation_graphs_from_file(data_folder + args.val_set,
                                                    load_vertices=True)

    print("Applying the model to a dataset of size: {}".format(len(val_data)))

    print("Reading the property index")
    with open(data_folder + "keras-models/" + model_name +
              ".property2idx") as f:
        property2idx = ast.literal_eval(f.read())
    n_out = len(property2idx)
    print("N_out:", n_out)
    idx2property = {v: k for k, v in property2idx.items()}
    with open(data_folder + "properties-with-labels.txt") as infile:
        property2label = {
            l.split("\t")[0]: l.split("\t")[1].strip()
            for l in infile.readlines()
        }
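
    # The comprehension above assumes one "<pid>\t<label>" pair per line
    # (e.g. "P19\tplace of birth" in Wikidata terms). An equivalent loader
    # that tolerates blank lines and tabs inside labels; a sketch, not part
    # of the original:
    def load_property_labels(path):
        labels = {}
        with open(path) as infile:
            for line in infile:
                pid, _, label = line.rstrip('\n').partition('\t')
                if pid:
                    labels[pid] = label.strip()
        return labels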
Example #5
def main(model_params, model_name, data_folder, word_embeddings, train_set,
         val_set, property_index, learning_rate, shuffle_data, save_folder,
         save_model, grad_clip):
    if not os.path.exists(save_folder):
        os.mkdir(save_folder)

    with open(model_params) as f:
        model_params = json.load(f)

    embeddings, word2idx = embedding_utils.load(data_folder + word_embeddings)
    print("Loaded embeddings:", embeddings.shape)

    def check_data(data):
        for g in data:
            if 'vertexSet' not in g:
                print("vertexSet missing")

    training_data, _ = io.load_relation_graphs_from_file(data_folder +
                                                         train_set,
                                                         load_vertices=True)

    val_data, _ = io.load_relation_graphs_from_file(data_folder + val_set,
                                                    load_vertices=True)

    check_data(training_data)
    check_data(val_data)

    if property_index:
        print("Reading the property index from parameter")
        # `property_index` is the function argument; there is no `args` in
        # scope inside this function.
        with open(data_folder + property_index) as f:
            property2idx = ast.literal_eval(f.read())
    else:
        _, property2idx = embedding_utils.init_random(
            {e["kbID"]
             for g in training_data for e in g["edgeSet"]} | {"P0"},
            1,
            add_all_zeroes=True,
            add_unknown=True)

    max_sent_len = max(len(g["tokens"]) for g in training_data)
    print("Max sentence length:", max_sent_len)

    max_sent_len = 36
    print("Max sentence length set to: {}".format(max_sent_len))

    graphs_to_indices = sp_models.to_indices
    if model_name == "ContextAware":
        graphs_to_indices = sp_models.to_indices_with_real_entities_and_entity_nums_with_vertex_padding
    elif model_name == "PCNN":
        graphs_to_indices = sp_models.to_indices_with_relative_positions_and_pcnn_mask
    elif model_name == "CNN":
        graphs_to_indices = sp_models.to_indices_with_relative_positions
    elif model_name == "GPGNN":
        graphs_to_indices = sp_models.to_indices_with_real_entities_and_entity_nums_with_vertex_padding

    _, position2idx = embedding_utils.init_random(
        np.arange(-max_sent_len, max_sent_len), 1, add_all_zeroes=True)
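    # position2idx maps each relative token offset in [-max_sent_len,
    # max_sent_len) to an embedding index (init_random also reserves an
    # all-zeroes entry); it is passed to graphs_to_indices below for the
    # relative-position features of the CNN/PCNN converters.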

    train_as_indices = list(
        graphs_to_indices(training_data,
                          word2idx,
                          property2idx,
                          max_sent_len,
                          embeddings=embeddings,
                          position2idx=position2idx))

    training_data = None

    n_out = len(property2idx)
    print("N_out:", n_out)

    val_as_indices = list(
        graphs_to_indices(val_data,
                          word2idx,
                          property2idx,
                          max_sent_len,
                          embeddings=embeddings,
                          position2idx=position2idx))
    val_data = None

    print("Save property dictionary.")
    with open(data_folder + "models/" + model_name + ".property2idx",
              'w') as outfile:
        outfile.write(str(property2idx))

    print("Training the model")

    print("Initialize the model")
    model = get_model(model_name)(model_params, embeddings, max_sent_len,
                                  n_out).cuda()

    loss_func = nn.CrossEntropyLoss(ignore_index=0).cuda()
    opt = torch.optim.Adam(filter(lambda p: p.requires_grad,
                                  model.parameters()),
                           lr=learning_rate,
                           weight_decay=model_params['weight_decay'])

    indices = np.arange(train_as_indices[0].shape[0])

    step = 0
    for train_epoch in range(model_params['nb_epoch']):
        if (shuffle_data):
            np.random.shuffle(indices)
        f1 = 0
        for i in tqdm(
                range(
                    int(train_as_indices[0].shape[0] /
                        model_params['batch_size']))):
            opt.zero_grad()

            sentence_input = train_as_indices[0][
                indices[i * model_params['batch_size']:(i + 1) *
                        model_params['batch_size']]]
            entity_markers = train_as_indices[1][
                indices[i * model_params['batch_size']:(i + 1) *
                        model_params['batch_size']]]
            labels = train_as_indices[2][
                indices[i * model_params['batch_size']:(i + 1) *
                        model_params['batch_size']]]

            if model_name == "GPGNN":
                output = model(
                    Variable(torch.from_numpy(
                        sentence_input.astype(int))).cuda(),
                    Variable(torch.from_numpy(
                        entity_markers.astype(int))).cuda(),
                    train_as_indices[3][
                        indices[i * model_params['batch_size']:(i + 1) *
                                model_params['batch_size']]])
            elif model_name == "PCNN":
                output = model(
                    Variable(torch.from_numpy(
                        sentence_input.astype(int))).cuda(),
                    Variable(torch.from_numpy(
                        entity_markers.astype(int))).cuda(),
                    Variable(torch.from_numpy(
                        np.array(train_as_indices[3]
                                 [i * model_params['batch_size']:(i + 1) *
                                  model_params['batch_size']])).float(),
                             requires_grad=False).cuda())
            else:
                output = model(
                    Variable(torch.from_numpy(
                        sentence_input.astype(int))).cuda(),
                    Variable(torch.from_numpy(
                        entity_markers.astype(int))).cuda())

            loss = loss_func(
                output,
                Variable(torch.from_numpy(labels.astype(int))).view(-1).cuda())

            loss.backward()
            # clip_grad_norm was renamed to clip_grad_norm_ in PyTorch 0.4.
            torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
            opt.step()

            _, predicted = torch.max(output, dim=1)
            labels = labels.reshape(-1).tolist()
            predicted = predicted.data.tolist()
            p_indices = np.array(labels) != 0
            predicted = np.array(predicted)[p_indices].tolist()
            labels = np.array(labels)[p_indices].tolist()

            _, _, add_f1 = evaluation_utils.evaluate_instance_based(
                predicted, labels, empty_label=p0_index)
            f1 += add_f1

        print("Train f1: ",
              f1 / (train_as_indices[0].shape[0] / model_params['batch_size']))

        val_f1 = 0
        for i in tqdm(
                range(
                    int(val_as_indices[0].shape[0] /
                        model_params['batch_size']))):
            sentence_input = val_as_indices[0][i * model_params['batch_size']:
                                               (i + 1) * model_params['batch_size']]
            entity_markers = val_as_indices[1][i * model_params['batch_size']:
                                               (i + 1) * model_params['batch_size']]
            labels = val_as_indices[2][i * model_params['batch_size']:(i + 1) *
                                       model_params['batch_size']]
            # `volatile=True` was removed in PyTorch 0.4; wrap validation in
            # torch.no_grad() instead.
            with torch.no_grad():
                if model_name == "GPGNN":
                    output = model(
                        Variable(torch.from_numpy(sentence_input.astype(int))).cuda(),
                        Variable(torch.from_numpy(entity_markers.astype(int))).cuda(),
                        val_as_indices[3][i * model_params['batch_size']:(i + 1) *
                                          model_params['batch_size']])
                elif model_name == "PCNN":
                    output = model(
                        Variable(torch.from_numpy(sentence_input.astype(int))).cuda(),
                        Variable(torch.from_numpy(entity_markers.astype(int))).cuda(),
                        Variable(torch.from_numpy(
                            np.array(val_as_indices[3]
                                     [i * model_params['batch_size']:(i + 1) *
                                      model_params['batch_size']])).float()).cuda())
                else:
                    output = model(
                        Variable(torch.from_numpy(sentence_input.astype(int))).cuda(),
                        Variable(torch.from_numpy(entity_markers.astype(int))).cuda())

            _, predicted = torch.max(output, dim=1)
            labels = labels.reshape(-1).tolist()
            predicted = predicted.data.tolist()
            p_indices = np.array(labels) != 0
            predicted = np.array(predicted)[p_indices].tolist()
            labels = np.array(labels)[p_indices].tolist()

            _, _, add_f1 = evaluation_utils.evaluate_instance_based(
                predicted, labels, empty_label=p0_index)
            val_f1 += add_f1
        print(
            "Validation f1: ",
            val_f1 / (val_as_indices[0].shape[0] / model_params['batch_size']))

        # save model
        if (train_epoch % 5 == 0 and save_model):
            torch.save(
                model.state_dict(),
                "{0}{1}-{2}.out".format(save_folder, model_name,
                                        str(train_epoch)))

        step = step + 1
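
# Checkpoints are written above as "{save_folder}{model_name}-{epoch}.out",
# which matches what the test-time snippets load with
# model.load_state_dict(torch.load(...)). A minimal reload sketch (the helper
# name is ours, not the original repo's):
def load_checkpoint(model, save_folder, model_name, epoch):
    import os
    import torch
    path = os.path.join(save_folder, "{}-{}.out".format(model_name, epoch))
    model.load_state_dict(torch.load(path))
    model.eval()  # disable dropout/batch-norm updates for inference
    return model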
Example #6
def train():
    """ Main Configurations """
    model_name = "RECON"
    data_folder = "./data/WikipediaWikidataDistantSupervisionAnnotations.v1.0/enwiki-20160501/"
    save_folder = "./models/RECON/"

    model_params = "model_params.json"
    word_embeddings = "glove.6B.50d.txt"
    train_set = "semantic-graphs-filtered-training.02_06.json"
    val_set = "semantic-graphs-filtered-validation.02_06.json"

    use_char_vocab = False

    context_data_file = None
    gat_embedding_file = None
    gat_relation_embedding_file = None
    # Enter the appropriate file paths here
    if "RECON" in model_name:
        context_data_file = "./data/WikipediaWikidataDistantSupervisionAnnotations.v1.0/entities_context.json"
    if "KGGAT" in model_name:
        gat_embedding_file = './models/GAT/WikipediaWikidataDistantSupervisionAnnotations/final_entity_embeddings.json'
        gat_entity2id_file = './data/GAT/WikipediaWikidataDistantSupervisionAnnotations.v1.0/entity2id.txt'
    if model_name == "RECON":
        # Point to the trained model/embedding/data files
        gat_relation_embedding_file = './models/GAT/WikipediaWikidataDistantSupervisionAnnotations/final_relation_embeddings.json'
        gat_relation2id_file = './data/GAT/WikipediaWikidataDistantSupervisionAnnotations.v1.0/relation2id.txt'
        w_ent2rel_all_rels_file = './models/GAT/WikipediaWikidataDistantSupervisionAnnotations/W_ent2rel.json.npy'

    # a file to store property2idx
    # if is None use model_name.property2idx
    property_index = None
    learning_rate = 1e-3
    shuffle_data = True
    save_model = True
    grad_clip = 0.25
    # os.environ["CUDA_VISIBLE_DEVICES"] = str(device_id)

    with open(model_params) as f:
        model_params = json.load(f)
    global args
    save_folder = args.save_folder
    model_params['batch_size'] = args.batch_size
    model_params['nb_epoch'] = args.epochs
    val_results_file = os.path.join(save_folder, 'val_results.json')

    char_vocab_file = os.path.join(save_folder, "char_vocab.json")

    if not os.path.exists(save_folder):
        os.mkdir(save_folder)

    sp_models.set_max_edges(
        model_params['max_num_nodes'] * (model_params['max_num_nodes'] - 1),
        model_params['max_num_nodes'])

    if context_data_file:
        with open(context_data_file, 'r') as f:
            context_data = json.load(f)
    if gat_embedding_file:
        with open(gat_embedding_file, 'r') as f:
            gat_embeddings = json.load(f)
        with open(gat_relation_embedding_file, 'r') as f:
            gat_relation_embeddings = json.load(f)
    if gat_relation_embedding_file:
        W_ent2rel_all_rels = np.load(w_ent2rel_all_rels_file)
        with open(gat_entity2id_file, 'r') as f:
            gat_entity2idx = {}
            data = f.read()
            lines = data.split('\n')
            for line in lines:
                line_arr = line.split(' ')
                if len(line_arr) == 2:
                    gat_entity2idx[line_arr[0].strip()] = line_arr[1].strip()
        with open(gat_relation2id_file, 'r') as f:
            gat_relation2idx = {}
            data = f.read()
            lines = data.split('\n')
            for line in lines:
                line_arr = line.split(' ')
                if len(line_arr) == 2:
                    gat_relation2idx[line_arr[0].strip()] = line_arr[1].strip()

    embeddings, word2idx = embedding_utils.load(data_folder + word_embeddings)
    print("Loaded embeddings:", embeddings.shape)

    def check_data(data):
        for g in data:
            if 'vertexSet' not in g:
                print("vertexSet missing")

    training_data, _ = io.load_relation_graphs_from_file(data_folder +
                                                         train_set,
                                                         load_vertices=True,
                                                         data='nyt')
    if not use_char_vocab:
        char_vocab = context_utils.make_char_vocab(training_data)
        print("Save char vocab dictionary.")
        with open(char_vocab_file, 'w') as outfile:
            json.dump(char_vocab, outfile, indent=4)
    else:
        with open(char_vocab_file, 'r') as f:
            char_vocab = json.load(f)

    val_data, _ = io.load_relation_graphs_from_file(data_folder + val_set,
                                                    load_vertices=True,
                                                    data="nyt")

    check_data(training_data)
    check_data(val_data)

    if property_index:
        print("Reading the property index from parameter")
        with open(data_folder + args.property_index) as f:
            property2idx = ast.literal_eval(f.read())
        with open(data_folder + args.entity_index) as f:
            entity2idx = ast.literal_eval(f.read())
    else:
        _, property2idx = embedding_utils.init_random(
            {e["kbID"]
             for g in training_data for e in g["edgeSet"]} | {"P0"},
            1,
            add_all_zeroes=True,
            add_unknown=True)
        _, entity2idx = context_utils.init_random(
            {kbID
             for kbID, _ in context_data.items()},
            model_params['embedding_dim'],
            add_all_zeroes=True,
            add_unknown=True)
    idx2entity = {v: k for k, v in entity2idx.items()}
    context_data['ALL_ZERO'] = {
        'desc': '',
        'label': 'ALL_ZERO',
        'instances': [],
        'aliases': []
    }

    max_sent_len = max(len(g["tokens"]) for g in training_data)
    print("Max sentence length:", max_sent_len)

    max_sent_len = 36
    print("Max sentence length set to: {}".format(max_sent_len))

    graphs_to_indices = sp_models.to_indices
    if model_name == "ContextAware":
        graphs_to_indices = sp_models.to_indices_with_real_entities_and_entity_nums_with_vertex_padding
    elif model_name == "PCNN":
        graphs_to_indices = sp_models.to_indices_with_relative_positions_and_pcnn_mask
    elif model_name == "CNN":
        graphs_to_indices = sp_models.to_indices_with_relative_positions
    elif model_name == "GPGNN":
        graphs_to_indices = sp_models.to_indices_with_real_entities_and_entity_nums_with_vertex_padding
    elif model_name == "RECON-EAC":
        graphs_to_indices = sp_models.to_indices_with_real_entities_and_entity_nums_with_vertex_padding
    elif model_name == "RECON-EAC-KGGAT":
        graphs_to_indices = sp_models.to_indices_with_real_entities_and_entity_nums_with_vertex_padding
    elif model_name == "RECON":
        graphs_to_indices = sp_models.to_indices_with_real_entities_and_entity_nums_with_vertex_padding

    _, position2idx = embedding_utils.init_random(
        np.arange(-max_sent_len, max_sent_len), 1, add_all_zeroes=True)

    train_as_indices = list(
        graphs_to_indices(training_data,
                          word2idx,
                          property2idx,
                          max_sent_len,
                          embeddings=embeddings,
                          position2idx=position2idx,
                          entity2idx=entity2idx))

    training_data = None

    n_out = len(property2idx)
    print("N_out:", n_out)

    val_as_indices = list(
        graphs_to_indices(val_data,
                          word2idx,
                          property2idx,
                          max_sent_len,
                          embeddings=embeddings,
                          position2idx=position2idx,
                          entity2idx=entity2idx))
    val_data = None

    print("Save property dictionary.")
    with open(os.path.join(save_folder, model_name + ".property2idx"),
              'w') as outfile:
        outfile.write(str(property2idx))
    print("Save entity dictionary.")
    with open(os.path.join(save_folder, model_name + ".entity2idx"),
              'w') as outfile:
        outfile.write(str(entity2idx))
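    # These dictionaries are persisted via str() and parsed back with
    # ast.literal_eval in the test snippets; json.dump/json.load would be an
    # equally valid, more interoperable round trip (a sketch, not the
    # original on-disk format):
    #   with open(os.path.join(save_folder, model_name + ".property2idx"),
    #             'w') as outfile:
    #       json.dump(property2idx, outfile, indent=4)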

    print("Training the model")

    print("Initialize the model")

    if "RECON" not in model_name:
        model = get_model(model_name)(model_params, embeddings, max_sent_len,
                                      n_out)
    elif model_name == "RECON-EAC":
        model = get_model(model_name)(model_params, embeddings, max_sent_len,
                                      n_out, char_vocab)
    elif model_name == "RECON-EAC-KGGAT":
        model = get_model(model_name)(model_params, embeddings, max_sent_len,
                                      n_out, char_vocab)
    elif model_name == "RECON":
        model = get_model(model_name)(model_params, embeddings, max_sent_len,
                                      n_out, char_vocab,
                                      gat_relation_embeddings,
                                      W_ent2rel_all_rels, idx2property,
                                      gat_relation2idx)

    model = model.cuda()
    loss_func = nn.CrossEntropyLoss(ignore_index=0).cuda()

    opt = torch.optim.Adam(filter(lambda p: p.requires_grad,
                                  model.parameters()),
                           lr=learning_rate,
                           weight_decay=model_params['weight_decay'])

    indices = np.arange(train_as_indices[0].shape[0])

    step = 0
    val_results = []
    for train_epoch in range(model_params['nb_epoch']):
        if (shuffle_data):
            np.random.shuffle(indices)
        f1 = 0
        for i in tqdm(
                range(
                    int(train_as_indices[0].shape[0] /
                        model_params['batch_size']))):
            opt.zero_grad()

            batch_idx = indices[i * model_params['batch_size']:(i + 1) *
                                model_params['batch_size']]
            sentence_input = train_as_indices[0][batch_idx]
            entity_markers = train_as_indices[1][batch_idx]
            labels = train_as_indices[2][batch_idx]
            if "RECON" in model_name:
                entity_indices = train_as_indices[4][
                    indices[i * model_params['batch_size']:(i + 1) *
                            model_params['batch_size']]]
                unique_entities, unique_entities_surface_forms, max_occurred_entity_in_batch_pos = context_utils.get_batch_unique_entities(
                    train_as_indices[4]
                    [indices[i * model_params['batch_size']:(i + 1) *
                             model_params['batch_size']]], train_as_indices[5]
                    [indices[i * model_params['batch_size']:(i + 1) *
                             model_params['batch_size']]])
                unique_entities_context_indices = context_utils.get_context_indices(
                    unique_entities,
                    unique_entities_surface_forms,
                    context_data,
                    idx2entity,
                    word2idx,
                    char_vocab,
                    model_params['conv_filter_size'],
                    max_sent_len=32,
                    max_num_contexts=32,
                    max_char_len=10,
                    data='nyt')
                entities_position = context_utils.get_entity_location_unique_entities(
                    unique_entities, entity_indices)
            if model_name == "RECON-EAC-KGGAT":
                gat_entity_embeddings = context_utils.get_gat_entity_embeddings(
                    entity_indices, entity2idx, idx2entity, gat_entity2idx,
                    gat_embeddings)
            elif model_name == "RECON":
                gat_entity_embeddings, nonzero_gat_entity_embeddings, nonzero_entity_pos = context_utils.get_selected_gat_entity_embeddings(
                    entity_indices, entity2idx, idx2entity, gat_entity2idx,
                    gat_embeddings)

            if model_name == "RECON":
                output = model(
                    Variable(torch.from_numpy(
                        sentence_input.astype(int))).cuda(),
                    Variable(torch.from_numpy(
                        entity_markers.astype(int))).cuda(),
                    train_as_indices[3][
                        indices[i * model_params['batch_size']:(i + 1) *
                                model_params['batch_size']]],
                    Variable(torch.from_numpy(unique_entities.astype(
                        np.long))).cuda(),
                    Variable(torch.from_numpy(entity_indices.astype(
                        np.long))).cuda(),
                    Variable(
                        torch.from_numpy(
                            unique_entities_context_indices[0].astype(
                                np.long))).cuda(),
                    Variable(
                        torch.from_numpy(
                            unique_entities_context_indices[1].astype(
                                np.long))).cuda(),
                    Variable(
                        torch.from_numpy(
                            unique_entities_context_indices[2].astype(
                                bool))).cuda(),
                    Variable(torch.from_numpy(
                        entities_position.astype(int))).cuda(),
                    max_occurred_entity_in_batch_pos,
                    Variable(torch.from_numpy(
                        nonzero_gat_entity_embeddings.astype(np.float32)),
                             requires_grad=False).cuda(), nonzero_entity_pos,
                    Variable(torch.from_numpy(
                        gat_entity_embeddings.astype(np.float32)),
                             requires_grad=False).cuda())
            elif model_name == "RECON-EAC-KGGAT":
                output = model(
                    Variable(torch.from_numpy(
                        sentence_input.astype(int))).cuda(),
                    Variable(torch.from_numpy(
                        entity_markers.astype(int))).cuda(),
                    train_as_indices[3][
                        indices[i * model_params['batch_size']:(i + 1) *
                                model_params['batch_size']]],
                    Variable(torch.from_numpy(unique_entities.astype(
                        np.long))).cuda(),
                    Variable(torch.from_numpy(entity_indices.astype(
                        np.long))).cuda(),
                    Variable(
                        torch.from_numpy(
                            unique_entities_context_indices[0].astype(
                                np.long))).cuda(),
                    Variable(
                        torch.from_numpy(
                            unique_entities_context_indices[1].astype(
                                np.long))).cuda(),
                    Variable(
                        torch.from_numpy(
                            unique_entities_context_indices[2].astype(
                                bool))).cuda(),
                    Variable(torch.from_numpy(
                        entities_position.astype(int))).cuda(),
                    max_occurred_entity_in_batch_pos,
                    Variable(torch.from_numpy(
                        gat_entity_embeddings.astype(np.float32)),
                             requires_grad=False).cuda())
            elif model_name == "RECON-EAC":
                output = model(
                    Variable(torch.from_numpy(
                        sentence_input.astype(int))).cuda(),
                    Variable(torch.from_numpy(
                        entity_markers.astype(int))).cuda(),
                    train_as_indices[3][
                        indices[i * model_params['batch_size']:(i + 1) *
                                model_params['batch_size']]],
                    Variable(torch.from_numpy(unique_entities.astype(
                        np.long))).cuda(),
                    Variable(torch.from_numpy(entity_indices.astype(
                        np.long))).cuda(),
                    Variable(
                        torch.from_numpy(
                            unique_entities_context_indices[0].astype(
                                np.long))).cuda(),
                    Variable(
                        torch.from_numpy(
                            unique_entities_context_indices[1].astype(
                                np.long))).cuda(),
                    Variable(
                        torch.from_numpy(
                            unique_entities_context_indices[2].astype(
                                bool))).cuda(),
                    Variable(torch.from_numpy(
                        entities_position.astype(int))).cuda(),
                    max_occurred_entity_in_batch_pos)
            elif model_name == "GPGNN":
                output = model(
                    Variable(torch.from_numpy(
                        sentence_input.astype(int))).cuda(),
                    Variable(torch.from_numpy(
                        entity_markers.astype(int))).cuda(),
                    train_as_indices[3][
                        indices[i * model_params['batch_size']:(i + 1) *
                                model_params['batch_size']]])
            elif model_name == "PCNN":
                output = model(
                    Variable(torch.from_numpy(
                        sentence_input.astype(int))).cuda(),
                    Variable(torch.from_numpy(
                        entity_markers.astype(int))).cuda(),
                    Variable(torch.from_numpy(
                        np.array(train_as_indices[3]
                                 [i * model_params['batch_size']:(i + 1) *
                                  model_params['batch_size']])).float(),
                             requires_grad=False).cuda())
            else:
                output = model(
                    Variable(torch.from_numpy(
                        sentence_input.astype(int))).cuda(),
                    Variable(torch.from_numpy(
                        entity_markers.astype(int))).cuda())

            loss = loss_func(
                output,
                Variable(torch.from_numpy(labels.astype(int))).view(-1).cuda())

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
            opt.step()

            _, predicted = torch.max(output, dim=1)
            labels = labels.reshape(-1).tolist()
            predicted = predicted.data.tolist()
            p_indices = np.array(labels) != 0
            predicted = np.array(predicted)[p_indices].tolist()
            labels = np.array(labels)[p_indices].tolist()

            _, _, add_f1 = evaluation_utils.evaluate_instance_based(
                predicted, labels, empty_label=p0_index)
            f1 += add_f1

        train_f1 = f1 / (train_as_indices[0].shape[0] /
                         model_params['batch_size'])
        print("Train f1: ", train_f1)

        val_f1 = 0
        for i in tqdm(
                range(
                    int(val_as_indices[0].shape[0] /
                        model_params['batch_size']))):
            sentence_input = val_as_indices[0][i *
                                               model_params['batch_size']:(i +
                                                                           1) *
                                               model_params['batch_size']]
            entity_markers = val_as_indices[1][i *
                                               model_params['batch_size']:(i +
                                                                           1) *
                                               model_params['batch_size']]
            labels = val_as_indices[2][i * model_params['batch_size']:(i + 1) *
                                       model_params['batch_size']]
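            # RECON variants additionally need, for every unique entity in the
            # batch, its index and the indices of its textual contexts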
            if "RECON" in model_name:
                entity_indices = val_as_indices[4][i *
                                                   model_params['batch_size']:
                                                   (i + 1) *
                                                   model_params['batch_size']]
                unique_entities, unique_entities_surface_forms, max_occurred_entity_in_batch_pos = context_utils.get_batch_unique_entities(
                    val_as_indices[4][i * model_params['batch_size']:(i + 1) *
                                      model_params['batch_size']],
                    val_as_indices[5][i * model_params['batch_size']:(i + 1) *
                                      model_params['batch_size']])
                unique_entities_context_indices = context_utils.get_context_indices(
                    unique_entities,
                    unique_entities_surface_forms,
                    context_data,
                    idx2entity,
                    word2idx,
                    char_vocab,
                    model_params['conv_filter_size'],
                    max_sent_len=32,
                    max_num_contexts=32,
                    max_char_len=10,
                    data='nyt')
                entities_position = context_utils.get_entity_location_unique_entities(
                    unique_entities, entity_indices)
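            # KG-based variants also look up pretrained GAT entity embeddings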
            if model_name == 'RECON-EAC-KGGAT':
                gat_entity_embeddings = context_utils.get_gat_entity_embeddings(
                    entity_indices, entity2idx, idx2entity, gat_entity2idx,
                    gat_embeddings)
            elif model_name == "RECON":
                gat_entity_embeddings, nonzero_gat_entity_embeddings, nonzero_entity_pos = context_utils.get_selected_gat_entity_embeddings(
                    entity_indices, entity2idx, idx2entity, gat_entity2idx,
                    gat_embeddings)

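            # Forward pass; each model variant expects a different input signature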
            if model_name == "RECON":
                output = model(
                    Variable(torch.from_numpy(
                        sentence_input.astype(int))).cuda(),
                    Variable(torch.from_numpy(
                        entity_markers.astype(int))).cuda(),
                    val_as_indices[3][i * model_params['batch_size']:(i + 1) *
                                      model_params['batch_size']],
                    Variable(torch.from_numpy(unique_entities.astype(
                        np.int64))).cuda(),
                    Variable(torch.from_numpy(entity_indices.astype(
                        np.int64))).cuda(),
                    Variable(
                        torch.from_numpy(
                            unique_entities_context_indices[0].astype(
                                np.int64))).cuda(),
                    Variable(
                        torch.from_numpy(
                            unique_entities_context_indices[1].astype(
                                np.int64))).cuda(),
                    Variable(
                        torch.from_numpy(
                            unique_entities_context_indices[2].astype(
                                bool))).cuda(),
                    Variable(torch.from_numpy(
                        entities_position.astype(int))).cuda(),
                    max_occurred_entity_in_batch_pos,
                    Variable(torch.from_numpy(
                        nonzero_gat_entity_embeddings.astype(np.float32)),
                             requires_grad=False).cuda(), nonzero_entity_pos,
                    Variable(torch.from_numpy(
                        gat_entity_embeddings.astype(np.float32)),
                             requires_grad=False).cuda())
            elif model_name == "RECON-EAC-KGGAT":
                output = model(
                    Variable(torch.from_numpy(
                        sentence_input.astype(int))).cuda(),
                    Variable(torch.from_numpy(
                        entity_markers.astype(int))).cuda(),
                    val_as_indices[3][i * model_params['batch_size']:(i + 1) *
                                      model_params['batch_size']],
                    Variable(torch.from_numpy(unique_entities.astype(
                        np.int64))).cuda(),
                    Variable(torch.from_numpy(entity_indices.astype(
                        np.int64))).cuda(),
                    Variable(
                        torch.from_numpy(
                            unique_entities_context_indices[0].astype(
                                np.int64))).cuda(),
                    Variable(
                        torch.from_numpy(
                            unique_entities_context_indices[1].astype(
                                np.int64))).cuda(),
                    Variable(
                        torch.from_numpy(
                            unique_entities_context_indices[2].astype(
                                bool))).cuda(),
                    Variable(torch.from_numpy(
                        entities_position.astype(int))).cuda(),
                    max_occurred_entity_in_batch_pos,
                    Variable(torch.from_numpy(
                        gat_entity_embeddings.astype(np.float32)),
                             requires_grad=False).cuda())
            elif model_name == "RECON-EAC":
                output = model(
                    Variable(torch.from_numpy(
                        sentence_input.astype(int))).cuda(),
                    Variable(torch.from_numpy(
                        entity_markers.astype(int))).cuda(),
                    val_as_indices[3][i * model_params['batch_size']:(i + 1) *
                                      model_params['batch_size']],
                    Variable(torch.from_numpy(unique_entities.astype(
                        np.int64))).cuda(),
                    Variable(torch.from_numpy(entity_indices.astype(
                        np.int64))).cuda(),
                    Variable(
                        torch.from_numpy(
                            unique_entities_context_indices[0].astype(
                                np.int64))).cuda(),
                    Variable(
                        torch.from_numpy(
                            unique_entities_context_indices[1].astype(
                                np.int64))).cuda(),
                    Variable(
                        torch.from_numpy(
                            unique_entities_context_indices[2].astype(
                                bool))).cuda(),
                    Variable(torch.from_numpy(
                        entities_position.astype(int))).cuda(),
                    max_occurred_entity_in_batch_pos)
            elif model_name == "GPGNN":
                output = model(
                    Variable(torch.from_numpy(
                        sentence_input.astype(int))).cuda(),
                    Variable(torch.from_numpy(
                        entity_markers.astype(int))).cuda(),
                    val_as_indices[3][i * model_params['batch_size']:(i + 1) *
                                      model_params['batch_size']])
            elif model_name == "PCNN":
                output = model(
                    Variable(torch.from_numpy(
                        sentence_input.astype(int))).cuda(),
                    Variable(torch.from_numpy(
                        entity_markers.astype(int))).cuda(),
                    Variable(torch.from_numpy(
                        np.array(val_as_indices[3]
                                 [i * model_params['batch_size']:(i + 1) *
                                  model_params['batch_size']])).float(),
                             requires_grad=False).cuda())
            else:
                output = model(
                    Variable(torch.from_numpy(
                        sentence_input.astype(int))).cuda(),
                    Variable(torch.from_numpy(
                        entity_markers.astype(int))).cuda())

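            # Score validation batches with the same empty-relation mask as training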
            _, predicted = torch.max(output, dim=1)
            labels = labels.reshape(-1).tolist()
            predicted = predicted.data.tolist()
            p_indices = np.array(labels) != 0
            predicted = np.array(predicted)[p_indices].tolist()
            labels = np.array(labels)[p_indices].tolist()

            _, _, add_f1 = evaluation_utils.evaluate_instance_based(
                predicted, labels, empty_label=p0_index)
            val_f1 += add_f1

        val_f1 = val_f1 / (val_as_indices[0].shape[0] /
                           model_params['batch_size'])
        print("Validation f1: ", val_f1)

        val_results.append({'train_f1': train_f1, 'val_f1': val_f1})

        # save a checkpoint after every epoch
        if save_model:
            torch.save(
                model.state_dict(),
                "{0}{1}-{2}.out".format(save_folder, model_name,
                                        str(train_epoch)))

        step += 1

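        # Write the per-epoch train/validation F1 history to disk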
        with open(val_results_file, 'w') as f:
            json.dump(val_results,
                      f,
                      indent=4,
                      cls=context_utils.CustomEncoder)

import json
import logging

import nltk
import tqdm

from semanticgraph import io

logging.basicConfig(level=logging.DEBUG)

if __name__ == "__main__":

    logger = logging.getLogger(__name__)
    logger.setLevel(logging.ERROR)

    data_folder = "../data/"

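    # Load the training relation graphs; vertex data is not needed here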
    relations_data, _ = io.load_relation_graphs_from_file(
        "/home/local/UKP/sorokin/IdeaProjects/semantic-parsing-training-data-pipeline/data/training-data/semantic-graphs-filtered-training.02_06.json",
        load_vertices=False)
    logging.debug('Loaded, size: {}'.format(len(relations_data)))

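    # Caseless Stanford NER and POS taggers, loaded from local model/jar paths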
    ne_tagger = nltk.tag.stanford.StanfordNERTagger(
        "/home/local/UKP/sorokin/IdeaProjects/question-answering/resources/models-3.7.0/edu/stanford/nlp/models/ner/english.all.3class.caseless.distsim.crf.ser.gz",
        path_to_jar=
        "/home/local/UKP/sorokin/IdeaProjects/question-answering/resources/stanford-ner-2015-12-09/stanford-ner-3.6.0.jar"
    )
    pos_tagger = nltk.tag.stanford.StanfordPOSTagger(
        "/home/local/UKP/sorokin/IdeaProjects/question-answering/resources/models-3.7.0/edu/stanford/nlp/models/pos-tagger/english-caseless-left3words-distsim.tagger",
        path_to_jar=
        "/home/local/UKP/sorokin/IdeaProjects/question-answering/resources/stanford-postagger-full-2015-12-09/stanford-postagger-3.6.0.jar"
    )
    webquestions_utterances_tokens = [
        q_obj['tokens'] for q_obj in relations_data