def __init__(self, relext_model_name, models_folder="../trainedmodels/"):
    module_location = os.path.abspath(__file__)
    module_location = os.path.dirname(module_location)

    model_params = keras_models.model_params
    max_sent_len = keras_models.model_params['max_sent_len']

    self._embeddings, self._word2idx = embeddings.load(
        keras_models.model_params['wordembeddings'])
    print("Loaded embeddings:", self._embeddings.shape)
    self._idx2word = {v: k for k, v in self._word2idx.items()}

    # The model name doubles as the factory-function name on keras_models.
    self._model = getattr(keras_models, relext_model_name)(
        model_params,
        np.zeros((len(self._word2idx), 50), dtype='float32'),
        max_sent_len,
        len(keras_models.property2idx))
    self._model.load_weights(models_folder + relext_model_name + ".kerasmodel")

    with codecs.open(os.path.join(module_location,
                                  "../../resources/properties-with-labels.txt"),
                     encoding='utf-8') as infile:
        self._property2label = {l.split("\t")[0]: l.split("\t")[1].strip()
                                for l in infile.readlines()}

    # Pick the graph-encoding function that matches the model type.
    self._graphs_to_indices = keras_models.to_indices
    if "Context" in relext_model_name:
        self._graphs_to_indices = keras_models.to_indices_with_extracted_entities
    elif "CNN" in relext_model_name:
        self._graphs_to_indices = keras_models.to_indices_with_relative_positions
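# Minimal runnable sketch of the getattr-based dispatch used above: the model
# name passed to __init__ is looked up as an attribute of keras_models and
# called as a factory. The demo class and function below are illustrative
# stand-ins, not the real keras_models API.
class demo_models:
    @staticmethod
    def model_CNN(params):
        return "CNN({})".format(params)

relext_model_name = "model_CNN"
model_fn = getattr(demo_models, relext_model_name)
print(model_fn({"units": 64}))  # -> CNN({'units': 64})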
def __init__(self, relext_model_name, models_foldes="../trainedmodels/", embeddings_location="glove/glove.6B.50d.txt", resource_folder="../resources/"): with open(models_foldes + relext_model_name + ".property2idx") as f: self._property2idx = ast.literal_eval(f.read()) module_location = os.path.abspath(__file__) module_location = os.path.dirname(module_location) with open(os.path.join(module_location, "../model_params.json")) as f: model_params = json.load(f) self._embeddings, self._word2idx = embeddings.load(embeddings_location) print("Loaded embeddings:", self._embeddings.shape) self._idx2word = {v: k for k, v in self._word2idx.items()} self._model = keras_models.model_ContextWeighted(model_params, np.zeros((len(self._word2idx), 50), dtype='float32'), max_sent_len, len(self._property2idx)) self._model.load_weights(models_foldes + relext_model_name + ".kerasmodel") with open(resource_folder + "properties-with-labels.txt") as infile: self._property2label = {l.split("\t")[0] : l.split("\t")[1].strip() for l in infile.readlines()} self._idx2property = {v: k for k, v in self._property2idx.items()} self._graphs_to_indices = keras_models.to_indices_with_real_entities if "CNN" in relext_model_name: self._graphs_to_indices = keras_models.to_indices_with_relative_positions
def __init__(self, relext_model_name, models_folder="../trainedmodels/"):
    """
    Initialize a new relation parser with the given model type. This class
    simplifies the loading of models and encapsulates encoding sentences
    into the correct format for the given model.

    :param relext_model_name: the name of the model type, which should
           correspond to the correct model class and the name of the model file
    :param models_folder: location of pre-trained model files
    """
    module_location = os.path.abspath(__file__)
    module_location = os.path.dirname(module_location)

    model_params = keras_models.model_params
    max_sent_len = keras_models.model_params['max_sent_len']

    self._embeddings, self._word2idx = embeddings.load(
        keras_models.model_params['wordembeddings'])
    print("Loaded embeddings:", self._embeddings.shape)
    self._idx2word = {v: k for k, v in self._word2idx.items()}

    self._model = getattr(keras_models, relext_model_name)(
        model_params,
        np.zeros((len(self._word2idx), 50), dtype='float32'),
        max_sent_len,
        len(keras_models.property2idx))
    self._model.load_weights(models_folder + relext_model_name + ".kerasmodel")

    with codecs.open(os.path.join(module_location,
                                  "../../resources/properties-with-labels.txt"),
                     encoding='utf-8') as infile:
        self._property2label = {l.split("\t")[0]: l.split("\t")[1].strip()
                                for l in infile.readlines()}

    self._graphs_to_indices = keras_models.to_indices
    if "Context" in relext_model_name:
        self._graphs_to_indices = keras_models.to_indices_with_extracted_entities
    elif "CNN" in relext_model_name:
        self._graphs_to_indices = keras_models.to_indices_with_relative_positions
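# Minimal runnable sketch of the properties-with-labels.txt parsing done
# above: each line is "<Wikidata property id>\t<human-readable label>".
# The sample lines are illustrative, not taken from the resource file.
sample_lines = ["P19\tplace of birth\n", "P106\toccupation\n"]
property2label = {l.split("\t")[0]: l.split("\t")[1].strip() for l in sample_lines}
assert property2label == {"P19": "place of birth", "P106": "occupation"}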
)
parser.add_argument('--property_index')
parser.add_argument('-s', action='store_true',
                    help="Use only a portion of the training and validation sets.")
args = parser.parse_args()

model_name = args.model_name
mode = args.mode

with open(args.model_params) as f:
    model_params = json.load(f)

# Bind the matrix to its own name so the embeddings module is not shadowed.
embedding_matrix, word2idx = embeddings.load(args.word_embeddings)
print("Loaded embeddings:", embedding_matrix.shape)

training_data, _ = io.load_relation_graphs_from_file(args.train_set, load_vertices=True)
val_data, _ = io.load_relation_graphs_from_file(args.val_set, load_vertices=True)

if args.s:
    # Subsample to a third of each set for quick experiments.
    training_data = training_data[:len(training_data) // 3]
    print("Training data size set to: {}".format(len(training_data)))
    val_data = val_data[:len(val_data) // 3]
    print("Validation data size set to: {}".format(len(val_data)))

if mode in ['test', 'train-plus-test']:
if __name__ == "__main__": import argparse parser = argparse.ArgumentParser() parser.add_argument('model_name') parser.add_argument('mode', choices=['train', 'optimize', 'train-continue']) parser.add_argument('train_set') parser.add_argument('val_set') parser.add_argument('--models_folder', default="../trainedmodels/") args = parser.parse_args() model_name = args.model_name mode = args.mode embedding_matrix, word2idx = embeddings.load(keras_models.model_params['wordembeddings']) print("Loaded embeddings:", embedding_matrix.shape) training_data, _ = io.load_relation_graphs_from_file(args.train_set, load_vertices=True) val_data, _ = io.load_relation_graphs_from_file(args.val_set, load_vertices=True) print("Training data size: {}".format(len(training_data))) print("Validation data size: {}".format(len(val_data))) max_sent_len = keras_models.model_params['max_sent_len'] print("Max sentence length set to: {}".format(max_sent_len)) to_one_hot = np_utils.to_categorical graphs_to_indices = keras_models.to_indices if "Context" in model_name: to_one_hot = embeddings.timedistributed_to_one_hot
import os
import numpy as np
import tensorflow as tf
import random
import sys
import json
import _pickle as cPickle

from core import embeddings

embedding_matrix, word2idx = embeddings.load('./glove/glove.6B.50d.txt')
print("Loaded embeddings:", embedding_matrix.shape)

### Preprocess all the Wikipedia data and select the desired data to construct data pairs.
CORPUS_FOLDER = "./enwiki-20160501"

# Wikidata properties that define the relation inventory.
propertylist = [
    'P20', 'P19', 'P551', 'P463', 'P108', 'P157', 'P69', 'P172', 'P140',
    'P26', 'P40', 'P22', 'P25', 'P119', 'P66', 'P27', 'P101', 'P800',
    'P166', 'P39', 'P102', 'P263', 'P184', 'P802', 'P53', 'P553', 'P1344',
    'P1416', 'P103', 'P91', 'P237', 'P411', 'P412', 'P450', 'P97', 'P512',
    'P1303', 'P1399', 'P1412', 'P1429', 'P451', 'P1038', 'P21', 'P734',
    'P735', 'P570', 'P569', 'P1196', 'P106', 'P509'
]

# Integer class id for each property.
relation_num = {
    'P20': 50, 'P19': 1, 'P551': 2, 'P463': 3, 'P108': 4, 'P157': 5,
    'P69': 6, 'P172': 7,
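# Minimal standalone sketch of the token -> vector lookup that word2idx and
# embedding_matrix enable. The index-0 fallback for unknown tokens is an
# assumption here, not necessarily the convention used by embeddings.load.
import numpy as np

demo_matrix = np.zeros((4, 50), dtype="float32")       # stand-in embedding matrix
demo_word2idx = {"berlin": 1, "capital": 2, "germany": 3}
tokens = ["berlin", "is", "capital", "of", "germany"]
ids = [demo_word2idx.get(t, 0) for t in tokens]        # unknown tokens -> row 0
print(demo_matrix[ids].shape)                          # (5, 50)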
# parser.add_argument('val_set')
parser.add_argument('--models_folder', default="./trainedmodels/")
parser.add_argument('--earlystop', action='store_true')
parser.add_argument('--epoch', default=50, type=int)
parser.add_argument('--checkpoint', action='store_true')
parser.add_argument('--tensorboard', action='store_true')
parser.add_argument('--metadata', type=str)
parser.add_argument('--error_out_folder', default='./error_output/')
args = parser.parse_args()

model_name = args.model_name
mode = args.mode
error_out_folder = args.error_out_folder

embedding_matrix, word2idx = embeddings.load(args.word_embedding)
print("embedding_matrix: " + str(embedding_matrix.shape))

if args.exist:
    # Re-use previously materialized train/val/test splits.
    relationMention_ph = args.train_set
    train_data, val_data, test_data = io.load_relation_from_existing_sets(relationMention_ph)
else:
    # Split the raw relation-mention files into train/val/test portions.
    relationMention_files = glob.glob(args.train_set)
    train_data, val_data, test_data = io.load_relation_from_files(
        relationMention_files, val_portion=0.1, test_portion=0.1)
    print("Document number: {}".format(len(relationMention_files)))

if mode == 'create-data-set':
    with open('./data/relationMention/train.relationMention.json', 'w') as f: