def __init__(self, relext_model_name, models_folder="../trainedmodels/"):

        module_location = os.path.dirname(os.path.abspath(__file__))

        model_params = keras_models.model_params
        max_sent_len = keras_models.model_params['max_sent_len']
        self._embeddings, self._word2idx = embeddings.load(
            keras_models.model_params['wordembeddings'])
        print("Loaded embeddings:", self._embeddings.shape)
        self._idx2word = {v: k for k, v in self._word2idx.items()}

        # Instantiate the model class named by relext_model_name; the zero
        # matrix is only a placeholder for the embedding layer, since the
        # trained weights are loaded from file right below.
        self._model = getattr(keras_models, relext_model_name)(
            model_params,
            np.zeros((len(self._word2idx), 50), dtype='float32'),
            max_sent_len,
            len(keras_models.property2idx))

        self._model.load_weights(models_folder + relext_model_name +
                                 ".kerasmodel")

        with codecs.open(os.path.join(
                module_location, "../../resources/properties-with-labels.txt"),
                         encoding='utf-8') as infile:
            self._property2label = {
                l.split("\t")[0]: l.split("\t")[1].strip()
                for l in infile.readlines()
            }

        self._graphs_to_indices = keras_models.to_indices
        if "Context" in relext_model_name:
            self._graphs_to_indices = keras_models.to_indices_with_extracted_entities
        elif "CNN" in relext_model_name:
            self._graphs_to_indices = keras_models.to_indices_with_relative_positions
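The interesting bit above is the encoder dispatch on the model name. A standalone sketch of that logic (the model-name strings other than "model_ContextWeighted" are illustrative, not taken from the snippet):

# Standalone sketch of the encoder dispatch; prints which to_indices variant
# each hypothetical model name would select.
for name in ("model_LSTMbaseline", "model_ContextWeighted", "model_CNN"):
    if "Context" in name:
        encoder = "to_indices_with_extracted_entities"
    elif "CNN" in name:
        encoder = "to_indices_with_relative_positions"
    else:
        encoder = "to_indices"
    print(name, "->", encoder)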
Example #2
    def __init__(self, relext_model_name, models_folder="../trainedmodels/",
                 embeddings_location="glove/glove.6B.50d.txt", resource_folder="../resources/"):

        with open(models_folder + relext_model_name + ".property2idx") as f:
            self._property2idx = ast.literal_eval(f.read())

        module_location = os.path.dirname(os.path.abspath(__file__))

        with open(os.path.join(module_location, "../model_params.json")) as f:
            model_params = json.load(f)

        self._embeddings, self._word2idx = embeddings.load(embeddings_location)
        print("Loaded embeddings:", self._embeddings.shape)
        self._idx2word = {v: k for k, v in self._word2idx.items()}

        self._model = keras_models.model_ContextWeighted(
            model_params,
            np.zeros((len(self._word2idx), 50), dtype='float32'),
            model_params['max_sent_len'],
            len(self._property2idx))

        self._model.load_weights(models_folder + relext_model_name + ".kerasmodel")

        with open(resource_folder + "properties-with-labels.txt", encoding='utf-8') as infile:
            self._property2label = {l.split("\t")[0]: l.split("\t")[1].strip() for l in infile}
        self._idx2property = {v: k for k, v in self._property2idx.items()}

        self._graphs_to_indices = keras_models.to_indices_with_real_entities
        if "CNN" in relext_model_name:
            self._graphs_to_indices = keras_models.to_indices_with_relative_positions
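For context, a hedged sketch of how such a constructor would be called; the class name RelParser is an assumption, and the argument values simply echo the defaults above:

# Hypothetical call site; RelParser is an assumed name for the enclosing class.
relparser = RelParser("model_ContextWeighted",
                      models_folder="../trainedmodels/",
                      embeddings_location="glove/glove.6B.50d.txt",
                      resource_folder="../resources/")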
    def __init__(self, relext_model_name, models_folder="../trainedmodels/"):
        """
        Initialize a new relation parser with the given model type. This class simplifies the loading of models and
        encapsulates encoding sentences into the correct format for the given model.

        :param relext_model_name: The name of the model type; it has to match both a model class in keras_models
        and the file name of the trained model
        :param models_folder: location of pre-trained model files
        """

        module_location = os.path.dirname(os.path.abspath(__file__))

        model_params = keras_models.model_params
        max_sent_len = keras_models.model_params['max_sent_len']
        self._embeddings, self._word2idx = embeddings.load(
            keras_models.model_params['wordembeddings'])
        print("Loaded embeddings:", self._embeddings.shape)
        self._idx2word = {v: k for k, v in self._word2idx.items()}

        # Instantiate the model class named by relext_model_name; the zero
        # matrix is only a placeholder for the embedding layer, since the
        # trained weights are loaded from file right below.
        self._model = getattr(keras_models, relext_model_name)(
            model_params,
            np.zeros((len(self._word2idx), 50), dtype='float32'),
            max_sent_len,
            len(keras_models.property2idx))

        self._model.load_weights(models_folder + relext_model_name +
                                 ".kerasmodel")

        with codecs.open(os.path.join(
                module_location, "../../resources/properties-with-labels.txt"),
                         encoding='utf-8') as infile:
            self._property2label = {
                l.split("\t")[0]: l.split("\t")[1].strip()
                for l in infile.readlines()
            }

        self._graphs_to_indices = keras_models.to_indices
        if "Context" in relext_model_name:
            self._graphs_to_indices = keras_models.to_indices_with_extracted_entities
        elif "CNN" in relext_model_name:
            self._graphs_to_indices = keras_models.to_indices_with_relative_positions
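The properties-with-labels.txt resource read above is a two-column, tab-separated file (property ID, label). A self-contained sketch of the same parsing logic on an inline sample (the two labels are standard Wikidata labels; the file format is inferred from the split("\t") calls):

import io

sample = u"P19\tplace of birth\nP20\tplace of death\n"
property2label = {l.split("\t")[0]: l.split("\t")[1].strip()
                  for l in io.StringIO(sample)}
print(property2label)  # {'P19': 'place of birth', 'P20': 'place of death'}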
    )
    parser.add_argument('--property_index')
    parser.add_argument(
        '-s',
        action='store_true',
        help="Use only a portion of the training and validation sets.")

    args = parser.parse_args()

    model_name = args.model_name
    mode = args.mode

    with open(args.model_params) as f:
        model_params = json.load(f)

    embedding_matrix, word2idx = embeddings.load(args.word_embeddings)
    print("Loaded embeddings:", embedding_matrix.shape)

    training_data, _ = io.load_relation_graphs_from_file(args.train_set,
                                                         load_vertices=True)

    val_data, _ = io.load_relation_graphs_from_file(args.val_set,
                                                    load_vertices=True)

    if args.s:
        training_data = training_data[:len(training_data) // 3]
        print("Training data size set to: {}".format(len(training_data)))
        val_data = val_data[:len(val_data) // 3]
        print("Validation data size set to: {}".format(len(val_data)))

    if mode in ['test', 'train-plus-test']:
Example #5
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('model_name')
    parser.add_argument('mode', choices=['train', 'optimize', 'train-continue'])
    parser.add_argument('train_set')
    parser.add_argument('val_set')
    parser.add_argument('--models_folder', default="../trainedmodels/")

    args = parser.parse_args()

    model_name = args.model_name
    mode = args.mode

    embedding_matrix, word2idx = embeddings.load(keras_models.model_params['wordembeddings'])
    print("Loaded embeddings:", embedding_matrix.shape)

    training_data, _ = io.load_relation_graphs_from_file(args.train_set, load_vertices=True)
    val_data, _ = io.load_relation_graphs_from_file(args.val_set, load_vertices=True)

    print("Training data size: {}".format(len(training_data)))
    print("Validation data size: {}".format(len(val_data)))

    max_sent_len = keras_models.model_params['max_sent_len']
    print("Max sentence length set to: {}".format(max_sent_len))

    to_one_hot = np_utils.to_categorical
    graphs_to_indices = keras_models.to_indices
    if "Context" in model_name:
        to_one_hot = embeddings.timedistributed_to_one_hot
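The default one-hot encoder picked above is the standard Keras utility; a quick toy illustration of what it produces:

import numpy as np
from keras.utils import np_utils

y = np.array([0, 2, 1])
print(np_utils.to_categorical(y, 3))
# [[1. 0. 0.]
#  [0. 0. 1.]
#  [0. 1. 0.]]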
Example #6
import os
import numpy as np
import tensorflow as tf
import random
import sys
import json
import _pickle as cPickle
from core import embeddings

embedding_matrix, word2idx = embeddings.load('./glove/glove.6B.50d.txt')
print("Loaded embeddings:", embedding_matrix.shape)
### preprocess all the wiki data and select desired data to construct data pairs
CORPUS_FOLDER = "./enwiki-20160501"
propertylist = [
    'P20', 'P19', 'P551', 'P463', 'P108', 'P157', 'P69', 'P172', 'P140', 'P26',
    'P40', 'P22', 'P25', 'P119', 'P66', 'P27', 'P101', 'P800', 'P166', 'P39',
    'P102', 'P263', 'P184', 'P802', 'P53', 'P553', 'P1344', 'P1416', 'P103',
    'P91', 'P237', 'P411', 'P412', 'P450', 'P97', 'P512', 'P1303', 'P1399',
    'P1412', 'P1429', 'P451', 'P1038', 'P21', 'P734', 'P735', 'P570', 'P569',
    'P1196', 'P106', 'P509'
]
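# The IDs above are Wikidata properties; for orientation, a few labels
# (quoted from memory, not from the resource file): P19 "place of birth",
# P20 "place of death", P27 "country of citizenship", P106 "occupation",
# P569 "date of birth", P570 "date of death".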

# Class ids follow the order of propertylist (P19 -> 1, P551 -> 2, ...).
# P20 gets 50 instead of 0, presumably to keep class id 0 free.
relation_num = {p: i for i, p in enumerate(propertylist)}
relation_num['P20'] = 50

    # parser.add_argument('val_set')
    parser.add_argument('--models_folder', default="./trainedmodels/")
    parser.add_argument('--earlystop', action='store_true')
    parser.add_argument('--epoch', default=50, type=int)
    parser.add_argument('--checkpoint', action='store_true')
    parser.add_argument('--tensorboard', action='store_true')
    parser.add_argument('--metadata', type=str)
    parser.add_argument('--error_out_folder', default='./error_output/')

    args = parser.parse_args()

    model_name = args.model_name
    mode = args.mode
    error_out_folder = args.error_out_folder

    embedding_matrix, word2idx = embeddings.load(args.word_embedding)
    print("embedding_matrix: " + str(embedding_matrix.shape))

    if args.exist:
        relationMention_ph = args.train_set
        train_data, val_data, test_data = io.load_relation_from_existing_sets(
            relationMention_ph)
    else:
        relationMention_files = glob.glob(args.train_set)
        train_data, val_data, test_data = io.load_relation_from_files(
            relationMention_files, val_portion=0.1, test_portion=0.1)
        print("Document number: {}".format(len(relationMention_files)))

    if mode == 'create-data-set':
        with open('./data/relationMention/train.relationMention.json',
                  'w') as f: