Example #1
    def __processFile(self, addUNK=True):
        """
        The .vec file is the input and the npz file without the extension
        :param inputFile:
        :param outputFile:
        :return:
        """
        assert self.resourcesReady
        dimensions = ModelConfig.Instance().embeddingSize
        # gensim pre-4.0 API: wv.syn0 holds the weight matrix and wv.vocab
        # maps each word to an object carrying its row index
        wikiModel = Word2Vec.load(self.vecFile)
        weightMatrix = wikiModel.wv.syn0

        vocab = {k: v.index for k, v in wikiModel.wv.vocab.items()}
        print len(vocab)
        print weightMatrix.shape
        if addUNK:
            # append two randomly initialised rows: one for unknown tokens
            # and one for unknown predicates
            vocab['UNK_TK'] = len(vocab)
            weightMatrix = np.vstack(
                (weightMatrix, np.random.rand(1, dimensions)))
            vocab['UNK_PRED'] = len(vocab)
            weightMatrix = np.vstack(
                (weightMatrix, np.random.rand(1, dimensions)))
        print len(vocab)
        print weightMatrix.shape

        # the with-block already closes the file; no explicit close() needed
        with open(self.word2idxFile, 'w') as f:
            json.dump(vocab, f)

        np.save(self.npzModel, weightMatrix)
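A minimal sketch of reading the two artifacts back, assuming the paths written above (word2idxFile and npzModel stand in for the instance attributes; the snippet is illustrative, not part of the source):

import json
import numpy as np

with open(word2idxFile) as f:
    word2idx = json.load(f)               # word -> row index
weights = np.load(npzModel + '.npy')      # np.save appended the .npy extension
assert weights.shape[0] == len(word2idx)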
Example #2
def readConfig(file='/srl-config.json'):
    config = Config.Instance()
    config.prepare(Utils.getWorkingDirectory())

    modelConfig = ModelConfig.Instance()
    modelConfig.prepare(config.srlConfig + file)
    return config, modelConfig
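A possible call site (hypothetical usage, relying on the default file argument):

# resolves config.srlConfig + '/srl-config.json' through the singletons
config, modelConfig = readConfig()
print modelConfig.embeddingSize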
Example #3
def prepareEmbeddings(useWiki=False):
    config = Config.Instance()
    modelConfig = ModelConfig.Instance()

    tokens = extractAllTokens(config.convertedCorpusDir+'/propbank_full.csv')
    if useWiki:
        tokens.update(extractAllTokens(config.convertedCorpusDir+'/wiki.csv'))

    print '{} tokens found'.format(len(tokens))

    predicates = extractAllTokens(config.convertedCorpusDir+'/propbank_full.csv', 'predicate')

    w2vFiles = {
        "npzFile":config.embeddingsDir+"/wordEmbeddings.npy",
        "npzModel":config.embeddingsDir+"/wordEmbeddings",
        "vecFile":__getVecFile(config.embeddingsDir, modelConfig.embeddingSize),
        "w2idxFile":config.embeddingsDir+"/vocabulary.json"
    }

    w2v = W2VModel()
    w2v.setResources(w2vFiles)
    loader = EmbeddingLoader(w2v)
    word2idx, idx2word, weights = loader.process()

    if modelConfig.embeddingType == 'w2v':
        # plain w2v shares a single loader for sentences and predicates
        return loader, loader

    sentHybridFiles = {
        "npzFile":config.embeddingsDir+"/sent_hybrid.npy",
        "npzModel":config.embeddingsDir+"/sent_hybrid",
        "w2idxFile":config.embeddingsDir+"/sent_hybrid.json"
    }

    sentHybrid = HybridModel()
    sentHybrid.setResources(sentHybridFiles)
    print 'creating sentence corpus'
    sentHybrid.generateCorpus(tokens, weights, word2idx)
    Hloader = EmbeddingLoader(sentHybrid)
    Hword2idx, Hidx2word, Hweights = Hloader.process()

    predHybridFiles = {
        "npzFile":config.embeddingsDir+"/pred_hybrid.npy",
        "npzModel":config.embeddingsDir+"/pred_hybrid",
        "w2idxFile":config.embeddingsDir+"/pred_hybrid.json"
    }

    predHybrid = HybridModel()
    predHybrid.setResources(predHybridFiles)
    print 'creating predicate corpus'
    predHybrid.generateCorpus(predicates, weights, word2idx)
    Ploader = EmbeddingLoader(predHybrid)
    Pword2idx, Pidx2word, Pweights = Ploader.process()

    return Hloader, Ploader
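Hypothetical usage of the loader pair returned above (both loaders expose word2idx and weights after process(), see Example #4):

sentenceLoader, predicateLoader = prepareEmbeddings(useWiki=True)
print len(sentenceLoader.word2idx), predicateLoader.weights.shape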
Example #4
    def process(self, addPadding=True):
        """
        Loads the embedding model. If necessary adds a padding member (array with zeros) at the end
        :param addPadding:
        :return:
        """
        dimensions = ModelConfig.Instance().embeddingSize
        print 'processing embeddings of dimension {}'.format(dimensions)
        self.model.prepare()
        self.word2idx = self.model.getVocabulary()
        self.weights = self.model.getWeights()
        if addPadding:
            # the zero row becomes index weights.shape[0] - 1; it is not
            # registered in word2idx
            self.weights = np.vstack((self.weights, np.zeros((1, dimensions))))

        self.idx2word = self.__createInvertedIndex(self.word2idx)

        return self.word2idx, self.idx2word, self.weights
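__createInvertedIndex is not shown in this snippet; a minimal sketch of what it presumably does (an assumption about the helper, not the source implementation):

    def __createInvertedIndex(self, word2idx):
        # invert word -> index so a row index maps back to its word
        return dict((idx, word) for word, idx in word2idx.items())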
Example #5
def getEmbeddings():
    config = Config.Instance()
    modelConfig = ModelConfig.Instance()

    if modelConfig.embeddingType == 'w2v':
        w2vFiles = {
            "npzFile":config.embeddingsDir+"/wordEmbeddings.npy",
            "npzModel":config.embeddingsDir+"/wordEmbeddings",
            "vecFile":__getVecFile(config.embeddingsDir, modelConfig.embeddingSize),
            "w2idxFile":config.embeddingsDir+"/vocabulary.json"
        }

        w2v = W2VModel()
        w2v.setResources(w2vFiles)
        loader = EmbeddingLoader(w2v)
        word2idx, idx2word, weights = loader.process()
        return loader, loader
    else:
        sentHybridFiles = {
            "npzFile":config.embeddingsDir+"/sent_hybrid.npy",
            "npzModel":config.embeddingsDir+"/sent_hybrid",
            "w2idxFile":config.embeddingsDir+"/sent_hybrid.json"
        }

        sentHybrid = HybridModel()
        sentHybrid.setResources(sentHybridFiles)
        Hloader = EmbeddingLoader(sentHybrid)
        Hword2idx, Hidx2word, Hweights = Hloader.process()

        predHybridFiles = {
            "npzFile":config.embeddingsDir+"/pred_hybrid.npy",
            "npzModel":config.embeddingsDir+"/pred_hybrid",
            "w2idxFile":config.embeddingsDir+"/pred_hybrid.json"
        }

        predHybrid = HybridModel()
        predHybrid.setResources(predHybridFiles)
        Ploader = EmbeddingLoader(predHybrid)
        Pword2idx, Pidx2word, Pweights = Ploader.process()

        return Hloader, Ploader
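The resource dictionaries in this and the previous examples repeat one three-path pattern; a small helper could remove the duplication (a sketch, the name _hybridFiles is invented):

def _hybridFiles(baseDir, name):
    # builds the .npy / model / vocabulary-JSON paths for one hybrid embedding
    return {
        "npzFile": baseDir + "/" + name + ".npy",
        "npzModel": baseDir + "/" + name,
        "w2idxFile": baseDir + "/" + name + ".json"
    }

# e.g. sentHybridFiles = _hybridFiles(config.embeddingsDir, 'sent_hybrid')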
Example #6
from model.configuration import Config
from model.configuration.model_config import ModelConfig
from utils.function_utils import Utils
from utils import extractFeaturesFromSentence, toNNFormat
from embeddings import getEmbeddings
import pandas as pd

print 'loading configuration'
config = Config.Instance()
config.prepare(Utils.getWorkingDirectory())

modelConfig = ModelConfig.Instance()
modelConfig.prepare(config.srlConfig + '/srl-config.json')
print 'configuration loaded'

# getEmbeddings takes no arguments; it reads the Config and ModelConfig
# singletons itself (see Example #5)
sentenceLoader, predicateLoader = getEmbeddings()

wikiFile = pd.read_csv(config.convertedCorpusDir + '/wiki.csv')

for i in xrange(0, len(wikiFile)):
    predicate = wikiFile['predicate'][i]
    sentence = wikiFile['sentence'][i]
    convertedSentence, convertedPredicate, allCaps, firstCaps, noCaps, context, distance = extractFeaturesFromSentence(
        sentence, predicate, sentenceLoader.word2idx, predicateLoader.word2idx)
    inputSentence, inputPredicate, inputAux = toNNFormat(
        convertedSentence, convertedPredicate, allCaps, firstCaps, noCaps,
        context, distance)

    print inputSentence.shape, inputPredicate.shape, inputAux.shape
    break
Example #7
batcher.addAll(trainingData[0], trainingData[1], trainingData[2],
               trainingData[3])
container = batcher.getBatches()

inference = SRLInference(tagMap, tagList)
evaluator = Evaluator(
    testData, inference, nnUtils,
    config.resultsDir + '/fold_' + str(fold) + '/finalResult.json')
#lrReducer = PatienceBaseLrReducer(modelConfig.trainingEpochs, modelConfig.patience, modelConfig.decayRate)
#lrReducer = FixedBasedLrReducer(modelConfig.trainingEpochs)
#clr = CyclicLearningRate(base_lr=0.00020, max_lr=0.0012, step_size=(204.*3), mode='exp_range', gamma=0.99996)
msaver = ModelEvaluation(fold)
print 'prepared'

print 'creating neural network model'
model = LSTMModel(ModelConfig.Instance())
nn = model.create(sentenceLoader.weights, predicateLoader.weights)
nn.summary()
#lrReducer.setNetwork(nn)
es = EarlyStopper()
evaluator.prepare(
    nn, config.resultsDir + '/fold_' + str(fold) + '/epoch_' + str(1) + '/',
    config.resourceDir + '/srl-eval.pl')
print 'model loaded'

print 'start training'

number_of_epochs = ModelConfig.Instance().trainingEpochs
for epoch in xrange(1, number_of_epochs + 1):
    print "--------- Epoch %d -----------" % (epoch)
    start_time = time.time()
Example #8
print 'creating neural network model'
mp = ModelPersistence()
nn = mp.load(
    Config.Instance().resultsDir + '/model_50.json',
    Config.Instance().resultsDir + '/model_50.h5py',
)
nn.compile(optimizer=modelConfig.optimizer,
           loss=modelConfig.lossFunction,
           metrics=['accuracy'])
nn.summary()
lrReducer.setNetwork(nn)
print 'model loaded'

print 'start training'

number_of_epochs = ModelConfig.Instance().trainingEpochs
for epoch in xrange(50, 50 + number_of_epochs):
    print "--------- Epoch %d -----------" % (epoch + 1)
    start_time = time.time()
    numIterations = len(container)

    indexes = np.arange(len(container))
    #np.random.shuffle(indexes)

    print indexes

    print 'Running in {} batches'.format(numIterations)
    for i in xrange(0, numIterations):
        z = indexes[i]

        sent, pred, aux, label = batcher.open(container[z])
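The listing stops where the batch is unpacked; a sketch of the update step that would typically follow, assuming a Keras model whose three inputs are ordered (sentence, predicate, aux). The input ordering is an assumption, not taken from the source:

        # one gradient step on this batch; input order is an assumption
        loss_and_metrics = nn.train_on_batch([sent, pred, aux], label)
        print 'batch {}/{} loss={:.4f}'.format(i + 1, numIterations,
                                               loss_and_metrics[0])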
Example #9
print 'loading corpus'
csvFiles = [
    config.convertedCorpusDir + '/propbank_training.csv',
    config.convertedCorpusDir + '/propbank_test.csv'
]
converter = CorpusConverter(csvFiles, sentenceLoader, predicateLoader)
data = converter.load(config.resourceDir + '/wiki_feature_file.npy')
tagMap = converter.tagMap
tagList = converter.tagList
nnUtils.setTagList(tagMap, tagList)
print 'loaded'

print 'loading neural network model'
inference = SRLInference(tagMap, tagList)
model = LSTMModel(ModelConfig.Instance())
nn = model.load(Config.Instance().resultsDir + '/best/wiki_model.json',
                Config.Instance().resultsDir + '/best/wiki_model.h5py')
nn.summary()
print 'model loaded'

prediction = Predictor(nn, tagList, inference)

wikiFile = pd.read_csv(config.convertedCorpusDir + '/wiki.csv')

results = []
iterations = len(wikiFile)
for i in xrange(0, iterations):
    try:
        propositionId = wikiFile['propositionId'][i]
        predicate = wikiFile['predicate'][i]