logging.info("Loading the dictionaries")
    d = "Wiki" if args.dataset == "figer" else "OntoNotes"
    dicts = joblib.load(DATA_LOCATION + "dict_" + args.dataset + ".pkl")

    logging.info("Loading the datasets")
    train_dataset = joblib.load(DATA_LOCATION + "train_dataset" + "_11072019.pkl")
    dev_dataset = joblib.load(DATA_LOCATION + "dev_dataset" + "_11072019.pkl")
    test_dataset = joblib.load(DATA_LOCATION + "test_dataset" + "_11072019.pkl")
    print("fetching the FNER dataset")
    logging.info("train_size:{}".format(train_dataset["data"].shape[0]))
    logging.info("dev_size: {}".format(dev_dataset["data"].shape[0]))
    logging.info("test_size: {}".format(test_dataset["data"].shape[0]))

    # batch_size : 1000, context_length : 10
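    # this Batcher variant additionally receives the dataset's Doc2Vec vectors and sentence_in fields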
    train_batcher = Batcher(train_dataset["storage"], train_dataset["data"], 1000, 10, dicts["id2vec"],
                            train_dataset['Doc2Vec'], train_dataset['sentence_in'])
    dev_batcher = Batcher(dev_dataset["storage"], dev_dataset["data"], dev_dataset["data"].shape[0], 10,
                          dicts["id2vec"], dev_dataset['Doc2Vec'], dev_dataset['sentence_in'])
    test_batcher = Batcher(test_dataset["storage"], test_dataset["data"], test_dataset["data"].shape[0], 10,
                           dicts["id2vec"], test_dataset['Doc2Vec'], test_dataset['sentence_in'])
    # step_par_epoch = args.steps_per_epoch

    Model = Model(type=args.dataset, encoder=args.encoder, hier=args.hier, feature=args.feature,
                  dropout=keep_prob_, decay=True, session_graph=session_graph,
                  learning_rate_fner=args.learning_rate_fner,
                  embedding_matrix=dicts['id2vec'], lstm_layer=lstm_size, doc_vec=eval(args.doc_2_vec))
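    # args.doc_2_vec is a string, so eval() presumably converts it (e.g. "True" -> True) before it reaches the model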

    previous_f1 = 0.0
    print("Model Loaded")

    for epoch in range(args.epochs):
Example 2
    logging.basicConfig(filename=Log_location + log_file_name,
                        filemode='w',
                        level=logging.DEBUG)
    logging.info("This is the experiment number: %s", experiment_number)
    logging.info("This experiment has changes: %s", experiment_info)
    logging.info("LSTM dropout: {}".format(keep_prob_))
    logging.info("number of training epochs: {}".format(epochs))
    logging.info("steps per epoch LM: {}".format(args.steps_per_epoch_lm))
    logging.info("learning_rate LM: {}".format(args.learning_rate_lm))
    logging.info("Loading the dictionaries")

    data_dicts = joblib.load(args.data_location + "dict_wikifact_selected.pkl")
    train_data_ = args.data_location + 'train_selected.txt.gz'
    dev_data_ = args.data_location + 'dev_selected.txt.gz'
    test_data_ = args.data_location + 'test_selected.txt.gz'
    print("data_dict loaded")
    train_batcher = Batcher(train_data_, data_dicts['word2id'], 5, limit=True)
    print("train object formed")
    dev_batcher = Batcher(dev_data_, data_dicts['word2id'], 1)
    print("dev object formed")
    test_batcher = Batcher(test_data_, data_dicts['word2id'], 1)

    Model = Language_Model(embedding_matrix=data_dicts['id2vec'],
                           lstm_layer=lstm_size,
                           dropout=keep_prob_,
                           learning_rate=args.learning_rate_lm,
                           layer_type=args.layer_type)
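    # presumably tracks the best dev perplexity so far; initialised very high so the first evaluation counts as an improvement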
    prev_perplexity = 1e+10
    for epoch in range(epochs):
        logging.info("Epochs ==> {}".format(epoch))
        batch_loss_2 = []
        for i in range(args.steps_per_epoch_lm):
Example 3
d = "Wiki" if args.dataset == "figer" else "OntoNotes"
dicts = joblib.load("data/" + d + "/dicts_" + args.dataset + ".pkl")

print "Loading the datasets"
train_dataset = joblib.load("data/" + d + "/train_" + args.dataset + ".pkl")
dev_dataset = joblib.load("data/" + d + "/dev_" + args.dataset + ".pkl")
test_dataset = joblib.load("data/" + d + "/test_" + args.dataset + ".pkl")

print
print "train_size:", train_dataset["data"].shape[0]
print "dev_size: ", dev_dataset["data"].shape[0]
print "test_size: ", test_dataset["data"].shape[0]

print "Creating batchers"
# batch_size : 1000, context_length : 10
train_batcher = Batcher(train_dataset["storage"], train_dataset["data"], 1000,
                        10, dicts["id2vec"])
dev_batcher = Batcher(dev_dataset["storage"], dev_dataset["data"],
                      dev_dataset["data"].shape[0], 10, dicts["id2vec"])
test_batcher = Batcher(test_dataset["storage"], test_dataset["data"],
                       test_dataset["data"].shape[0], 10, dicts["id2vec"])

step_par_epoch = 2000 if args.dataset == "figer" else 150

print "start trainning"
for epoch in range(5):
    train_batcher.shuffle()
    print "epoch", epoch
    for i in range(step_par_epoch):
        context_data, mention_representation_data, target_data, feature_data = train_batcher.next()
        model.train(context_data, mention_representation_data, target_data,
                    feature_data)
Example 4
    logging.info("Loading the dictionaries")
    d = "Wiki" if args.dataset == "figer" else "OntoNotes"
    dicts = joblib.load(DATA_LOCATION + "dict_" + args.dataset + ".pkl")

    logging.info("Loading the datasets")
    train_dataset = joblib.load(DATA_LOCATION + "train_dataset" + "_11072019.pkl")
    dev_dataset = joblib.load(DATA_LOCATION + "dev_dataset" + "_11072019.pkl")
    test_dataset = joblib.load(DATA_LOCATION + "test_dataset" + "_11072019.pkl")
    print("fetching the FNER dataset")
    logging.info("train_size:{}".format(train_dataset["data"].shape[0]))
    logging.info("dev_size: {}".format(dev_dataset["data"].shape[0]))
    logging.info("test_size: {}".format(test_dataset["data"].shape[0]))

    # batch_size : 1000, context_length : 10
    train_batcher = Batcher(train_dataset["storage"], train_dataset["data"], 1000, 10, dicts["id2vec"],
                            train_dataset['Doc2Vec'], train_dataset['sentence_in'])
    dev_batcher = Batcher(dev_dataset["storage"], dev_dataset["data"], dev_dataset["data"].shape[0], 10,
                          dicts["id2vec"], dev_dataset['Doc2Vec'], dev_dataset['sentence_in'])
    test_batcher = Batcher(test_dataset["storage"], test_dataset["data"], test_dataset["data"].shape[0], 10,
                           dicts["id2vec"], test_dataset['Doc2Vec'], test_dataset['sentence_in'])
    # step_par_epoch = args.steps_per_epoch

    Model = Model(type=args.dataset, encoder=args.encoder, hier=args.hier, feature=args.feature,
                  dropout=keep_prob_, decay=True, session_graph=session_graph,
                  learning_rate_fner=args.learning_rate_fner,
                  embedding_matrix=dicts['id2vec'], lstm_layer=lstm_size, doc_vec=eval(args.doc_2_vec))

    previous_f1 = 0.0
    logging.info("test evaluation FNER: ======>")
    Model.load(sess_save_location, model_name, experiment_number)
    type_embed = Model.get_type_embed()
Example 5
    dicts = joblib.load(DATA_LOCATION_2 + "dict_" + args.dataset + ".pkl")

    logging.info("Loading the datasets")
    train_dataset = joblib.load(DATA_LOCATION_2 + "train_dataset" +
                                "_11072019.pkl")
    dev_dataset = joblib.load(DATA_LOCATION_2 + "dev_dataset" +
                              "_11072019.pkl")
    test_dataset = joblib.load(DATA_LOCATION_2 + "test_dataset" +
                               "_11072019.pkl")
    print("fetching the FNER dataset")
    logging.info("train_size:{}".format(train_dataset["data"].shape[0]))
    logging.info("dev_size: {}".format(dev_dataset["data"].shape[0]))
    logging.info("test_size: {}".format(test_dataset["data"].shape[0]))

    test_batcher = Batcher(test_dataset["storage"], test_dataset["data"],
                           test_dataset["data"].shape[0], 10, dicts["id2vec"],
                           test_dataset['Doc2Vec'],
                           test_dataset['sentence_in'])
    print("loading KGE dataset")
    with open(DATA_LOCATION + "embedding_matrix.pkl", "rb") as f:
        embedding_matrix = pickle.load(f)
    test_data = joblib.load(DATA_LOCATION + "test_vec_dict.pkl")
    head_vec_test = test_data["Head"]
    tail_vec_test = test_data["tail"]
    relation_vec_test = test_data["relation"]
    y_output_test = test_data["score"]

    Model = NAM_Modified(embedding_matrix_fner=dicts['id2vec'],
                         lstm_layer=lstm_size,
                         balance=args.balance,
                         type=args.dataset,
                         encoder=args.encoder,
Example 6
 logging.basicConfig(filename=Log_location + log_file_name, filemode='w', level=logging.DEBUG)
 logging.info("This is the experiment number: %s", experiment_number)
 logging.info("This experiment has changes: %s", experiment_info)
 logging.info("LSTM dropout: {}".format(keep_prob_))
 logging.info("number of training epochs: {}".format(epochs))
 logging.info("steps per epoch LM: {}".format(args.steps_per_epoch_lm))
 logging.info("steps per epoch KGE: {}".format(args.steps_per_epoch_kge))
 # logging.info("KGE dataset: {}".format(args.dataset_kge))
 logging.info("learning_rate KGE: {}".format(args.learning_rate_kge))
 logging.info("learning_rate LM: {}".format(args.learning_rate_lm))
 logging.info("Options: {}".format(args.option))
 logging.info("Loading the dictionaries")
 test_data_ = args.data_location_lm + 'test_selected.txt.gz'
 data_dict = joblib.load(args.data_location_kge + 'dict_wikifact_selected.pkl')
 print("data_dict loaded")
 test_batcher = Batcher(test_data_, data_dict['word2id'], 1)
 print("Data set loaded")
 load_data = joblib.load(DATA_LOCATION + "Sample_data_lm_24.pkl")
 head_list = load_data["Head"]
 relation_list = load_data["relation"]
 tail_list = load_data["tail"]
 label = np.array(load_data["score"])
 dev_data = joblib.load(DATA_LOCATION + "dev_vec_lm_dict.pkl")
 test_data = joblib.load(DATA_LOCATION + "test_vec_lm_dict.pkl")
 head_vec_dev = dev_data["Head"]
 tail_vec_dev = dev_data["tail"]
 relation_vec_dev = dev_data["relation"]
 y_output_dev = dev_data["score"]
 head_vec_test = test_data["Head"]
 tail_vec_test = test_data["tail"]
 relation_vec_test = test_data["relation"]
Example 7
              LE=LE,
              vocab_size=vocab_size,
              lamb=float(args.lamb),
              LMout=dicts["id2vec"],
              hard=args.hard)

batch_size = 1000 if args.dataset == "figer" else 128

print "Loading the datasets"
if not args.test:
    train_dataset = joblib.load("data/" + d + "/train_" + args.dataset +
                                ".pkl")
    dev_dataset = joblib.load("data/" + d + "/dev_" + args.dataset + ".pkl")
    print "train_size:", train_dataset["data"].shape[0]
    print "dev_size: ", dev_dataset["data"].shape[0]
    train_batcher = Batcher(train_dataset["storage"], train_dataset["data"],
                            batch_size, 10, dicts["id2vec"], vocab_size)
    dev_batcher = Batcher(dev_dataset["storage"], dev_dataset["data"],
                          dev_dataset["data"].shape[0], 10, dicts["id2vec"],
                          vocab_size)

test_dataset = joblib.load("data/" + d + "/test_" + args.dataset + ".pkl")
test_batch_size = test_dataset["data"].shape[0]
if args.cs:
    test_batch_size = 1
print "test_size: ", test_dataset["data"].shape[0]

test_batcher = Batcher(test_dataset["storage"], test_dataset["data"],
                       test_batch_size, 10, dicts["id2vec"], vocab_size)
if args.test:
    # only works for Wiki
    model.load_all("./Models/" + d + "/lamb" + str(args.lamb) + "/model")
Example 8
print "coarse_set"
print model.coarse_set

print "Loading the datasets"
train_dataset = joblib.load("data/" + d + "/train_" + args.dataset + ".pkl")
dev_dataset = joblib.load("data/" + d + "/dev_" + args.dataset + ".pkl")
test_dataset = joblib.load("data/" + d + "/test_" + args.dataset + ".pkl")

print "train_size:", train_dataset["data"].shape[0]
print "dev_size: ", dev_dataset["data"].shape[0]
print "test_size: ", test_dataset["data"].shape[0]

print "Creating batchers"
# batch_size : args.batch_size, context_length : 10
batch_size = args.batch_size
train_batcher = Batcher(train_dataset["storage"], train_dataset["data"],
                        batch_size, 10, dicts["id2vec"])
dev_batcher = Batcher(dev_dataset["storage"], dev_dataset["data"],
                      dev_dataset["data"].shape[0], 10, dicts["id2vec"])
test_batcher = Batcher(test_dataset["storage"], test_dataset["data"],
                       test_dataset["data"].shape[0], 10, dicts["id2vec"])

# print "divide data to labels"
# train_batcher.divide_data(target_dim)
label_hierarchy = train_batcher.get_label_hierarchy(dicts['id2label'],
                                                    target_dim)
if args.path:
    print "transform label to path"
    train_batcher.transform2path(label_hierarchy, target_dim)
    dev_batcher.transform2path(label_hierarchy, target_dim)
    test_batcher.transform2path(label_hierarchy, target_dim)
if args.bags:
Example 9
print "decode_size: ", decode_size

# if decode_size is 0 (i.e. the decode_dataset is empty), the serif-instances file must have been empty
# this can happen if the serif name-list adder did not find any names to add as mentions (e.g. if you are using a very short text)
# in such a case, exit with success
if decode_size == 0:
    print "Exiting decoding since decode dataset is empty!"
    # save an empty output file; this prevents other steps in the CauseEX pipeline from failing
    with open(args.output, "w") as fp:
        pass
    sys.exit(0)

if decode_dataset["data"].shape[0] == 0:
    print "Dataset is empty. Exit"
    sys.exit()

print "Creating batcher..."
test_batcher = Batcher(decode_dataset["storage"], decode_dataset["data"],
                       decode_dataset["data"].shape[0], 10, dicts["id2vec"])

print "Getting bacther.next..."
context_data, mention_representation_data, target_data, feature_data = test_batcher.next()

print "Running decoding..."
scores = model.predict(context_data, mention_representation_data, feature_data)
acc_hook(scores, target_data)
save_predictions(scores, target_data, dicts["id2label"], args.output)

print "Finished decoding! Predicted labels written to: " + args.output
    logging.info("steps per epoch LM: {}".format(args.steps_per_epoch_lm))
    logging.info("steps per epoch KGE: {}".format(args.steps_per_epoch_kge))
    logging.info("learning_rate KGE: {}".format(args.learning_rate_kge))
    logging.info("learning_rate LM: {}".format(args.learning_rate_lm))
    logging.info("Options: {}".format(args.option))
    logging.info("Loading the dictionaries")
    train_data_ = args.data_location_lm + 'train_selected.txt.gz'
    dev_data_ = args.data_location_lm + 'dev_selected.txt.gz'
    test_data_ = args.data_location_lm + 'test_selected.txt.gz'
    data_dict = joblib.load(args.data_location_kge +
                            'dict_wikifact_selected.pkl')
    print("data_dict loaded")
    train_batch = 5
    test_batch = 1
    dev_batch = 1
    train_batcher = Batcher(train_data_, data_dict['word2id'], train_batch)
    print("train object formed")
    dev_batcher = Batcher(dev_data_, data_dict['word2id'], dev_batch)
    print("dev object formed")
    test_batcher = Batcher(test_data_, data_dict['word2id'], test_batch)
    print("Data set loaded")

    load_data = joblib.load(DATA_LOCATION + "Sample_data_lm_24.pkl")
    head_list = load_data["Head"]
    relation_list = load_data["relation"]
    tail_list = load_data["tail"]
    label = np.array(load_data["score"])
    dev_data = joblib.load(DATA_LOCATION + "dev_vec_lm_dict.pkl")
    test_data = joblib.load(DATA_LOCATION + "test_vec_lm_dict.pkl")
    head_vec_dev = dev_data["Head"]
    tail_vec_dev = dev_data["tail"]