def __processFile(self, addUNK=True):
    """
    Reads the .vec model (self.vecFile), saves the weight matrix to
    self.npzModel (np.save appends the extension) and the vocabulary to
    self.word2idxFile.
    :param addUNK: when True, appends random rows for the UNK_TK and
                   UNK_PRED tokens
    :return:
    """
    assert self.resourcesReady
    dimensions = ModelConfig.Instance().embeddingSize
    wikiModel = Word2Vec.load(self.vecFile)
    weightMatrix = wikiModel.wv.syn0
    vocab = dict([(k, v.index) for k, v in wikiModel.wv.vocab.items()])
    print len(vocab)
    print weightMatrix.shape
    if addUNK:
        vocab['UNK_TK'] = len(vocab)
        weightMatrix = np.vstack((weightMatrix, np.random.rand(1, dimensions)))
        vocab['UNK_PRED'] = len(vocab)
        weightMatrix = np.vstack((weightMatrix, np.random.rand(1, dimensions)))
        print len(vocab)
        print weightMatrix.shape
    # the with block closes the file; no explicit close() needed
    with open(self.word2idxFile, 'w') as f:
        f.write(json.dumps(vocab))
    np.save(self.npzModel, weightMatrix)
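# A minimal sketch (not part of the original module) of reading these
# artifacts back; the 'vocabulary.json' and 'wordEmbeddings.npy' names
# mirror the resource dicts used further down and are assumptions here.
import json
import numpy as np

with open('vocabulary.json') as f:
    word2idx = json.loads(f.read())
weights = np.load('wordEmbeddings.npy')
# one embedding row per vocabulary entry (UNK_TK / UNK_PRED included)
assert weights.shape[0] == len(word2idx)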
def readConfig(file='/srl-config.json'):
    config = Config.Instance()
    config.prepare(Utils.getWorkingDirectory())
    modelConfig = ModelConfig.Instance()
    modelConfig.prepare(config.srlConfig + file)
    return config, modelConfig
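# Usage sketch, assuming srl-config.json lives under config.srlConfig:
config, modelConfig = readConfig()
print modelConfig.embeddingSize, modelConfig.embeddingType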
def prepareEmbeddings(useWiki=False):
    config = Config.Instance()
    modelConfig = ModelConfig.Instance()
    tokens = extractAllTokens(config.convertedCorpusDir + '/propbank_full.csv')
    if useWiki:
        tokens.update(extractAllTokens(config.convertedCorpusDir + '/wiki.csv'))
    print '{} tokens found'.format(len(tokens))
    predicates = extractAllTokens(config.convertedCorpusDir + '/propbank_full.csv', 'predicate')

    w2vFiles = {
        "npzFile": config.embeddingsDir + "/wordEmbeddings.npy",
        "npzModel": config.embeddingsDir + "/wordEmbeddings",
        "vecFile": __getVecFile(config.embeddingsDir, modelConfig.embeddingSize),
        "w2idxFile": config.embeddingsDir + "/vocabulary.json"
    }
    w2v = W2VModel()
    w2v.setResources(w2vFiles)
    loader = EmbeddingLoader(w2v)
    word2idx, idx2word, weights = loader.process()

    if modelConfig.embeddingType == 'w2v':
        return loader, loader

    sentHybridFiles = {
        "npzFile": config.embeddingsDir + "/sent_hybrid.npy",
        "npzModel": config.embeddingsDir + "/sent_hybrid",
        "w2idxFile": config.embeddingsDir + "/sent_hybrid.json"
    }
    sentHybrid = HybridModel()
    sentHybrid.setResources(sentHybridFiles)
    print 'creating sentence corpus'
    sentHybrid.generateCorpus(tokens, weights, word2idx)
    Hloader = EmbeddingLoader(sentHybrid)
    Hword2idx, Hidx2word, Hweights = Hloader.process()

    predHybridFiles = {
        "npzFile": config.embeddingsDir + "/pred_hybrid.npy",
        "npzModel": config.embeddingsDir + "/pred_hybrid",
        "w2idxFile": config.embeddingsDir + "/pred_hybrid.json"
    }
    predHybrid = HybridModel()
    predHybrid.setResources(predHybridFiles)
    print 'creating predicate corpus'
    predHybrid.generateCorpus(predicates, weights, word2idx)
    Ploader = EmbeddingLoader(predHybrid)
    Pword2idx, Pidx2word, Pweights = Ploader.process()

    return Hloader, Ploader
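# Usage sketch: with embeddingType == 'w2v' the same loader comes back
# twice; otherwise the two hybrid loaders are returned.
sentenceLoader, predicateLoader = prepareEmbeddings(useWiki=True)
print len(sentenceLoader.word2idx), sentenceLoader.weights.shape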
def process(self, addPadding=True):
    """
    Loads the embedding model and, if requested, appends a padding row
    (all zeros) at the end of the weight matrix.
    :param addPadding:
    :return:
    """
    dimensions = ModelConfig.Instance().embeddingSize
    print 'processing embeddings : {}'.format(dimensions)
    self.model.prepare()
    self.word2idx = self.model.getVocabulary()
    self.weights = self.model.getWeights()
    if addPadding:
        self.weights = np.vstack((self.weights, np.zeros((1, dimensions))))
    self.idx2word = self.__createInvertedIndex(self.word2idx)
    return self.word2idx, self.idx2word, self.weights
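# Sketch of the padding convention process() establishes: the appended
# zero row sits at index len(word2idx), so that index can serve as the
# padding id when batching (an assumption drawn from the vstack above).
word2idx, idx2word, weights = loader.process()
paddingIdx = len(word2idx)          # index of the all-zeros row
assert not weights[paddingIdx].any()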
def getEmbeddings():
    config = Config.Instance()
    modelConfig = ModelConfig.Instance()

    if modelConfig.embeddingType == 'w2v':
        w2vFiles = {
            "npzFile": config.embeddingsDir + "/wordEmbeddings.npy",
            "npzModel": config.embeddingsDir + "/wordEmbeddings",
            "vecFile": __getVecFile(config.embeddingsDir, modelConfig.embeddingSize),
            "w2idxFile": config.embeddingsDir + "/vocabulary.json"
        }
        w2v = W2VModel()
        w2v.setResources(w2vFiles)
        loader = EmbeddingLoader(w2v)
        word2idx, idx2word, weights = loader.process()
        return loader, loader
    else:
        sentHybridFiles = {
            "npzFile": config.embeddingsDir + "/sent_hybrid.npy",
            "npzModel": config.embeddingsDir + "/sent_hybrid",
            "w2idxFile": config.embeddingsDir + "/sent_hybrid.json"
        }
        sentHybrid = HybridModel()
        sentHybrid.setResources(sentHybridFiles)
        Hloader = EmbeddingLoader(sentHybrid)
        Hword2idx, Hidx2word, Hweights = Hloader.process()

        predHybridFiles = {
            "npzFile": config.embeddingsDir + "/pred_hybrid.npy",
            "npzModel": config.embeddingsDir + "/pred_hybrid",
            "w2idxFile": config.embeddingsDir + "/pred_hybrid.json"
        }
        predHybrid = HybridModel()
        predHybrid.setResources(predHybridFiles)
        Ploader = EmbeddingLoader(predHybrid)
        Pword2idx, Pidx2word, Pweights = Ploader.process()

        return Hloader, Ploader
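# Usage sketch: in the 'w2v' branch both names point at one loader, so
# sentence and predicate lookups share a single vocabulary.
sentenceLoader, predicateLoader = getEmbeddings()
print sentenceLoader is predicateLoader  # True when embeddingType == 'w2v'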
from model.configuration import Config
from model.configuration.model_config import ModelConfig
from utils.function_utils import Utils
from utils import extractFeaturesFromSentence, toNNFormat
from embeddings import getEmbeddings
import pandas as pd

print 'loading configuration'
config = Config.Instance()
config.prepare(Utils.getWorkingDirectory())
modelConfig = ModelConfig.Instance()
modelConfig.prepare(config.srlConfig + '/srl-config.json')
print 'configuration loaded'

# getEmbeddings reads the config singletons itself, so no arguments are passed
sentenceLoader, predicateLoader = getEmbeddings()

wikiFile = pd.read_csv(config.convertedCorpusDir + '/wiki.csv')
for i in xrange(0, len(wikiFile)):
    predicate = wikiFile['predicate'][i]
    sentence = wikiFile['sentence'][i]
    convertedSentence, convertedPredicate, allCaps, firstCaps, noCaps, context, distance = extractFeaturesFromSentence(
        sentence, predicate, sentenceLoader.word2idx, predicateLoader.word2idx)
    inputSentence, inputPredicate, inputAux = toNNFormat(
        convertedSentence, convertedPredicate, allCaps, firstCaps, noCaps, context, distance)
    print inputSentence.shape, inputPredicate.shape, inputAux.shape
    break
batcher.addAll(trainingData[0], trainingData[1], trainingData[2], trainingData[3])
container = batcher.getBatches()

inference = SRLInference(tagMap, tagList)
evaluator = Evaluator(testData, inference, nnUtils,
                      config.resultsDir + '/fold_' + str(fold) + '/finalResult.json')
#lrReducer = PatienceBaseLrReducer(modelConfig.trainingEpochs, modelConfig.patience, modelConfig.decayRate)
#lrReducer = FixedBasedLrReducer(modelConfig.trainingEpochs)
#clr = CyclicLearningRate(base_lr=0.00020, max_lr=0.0012, step_size=(204.*3), mode='exp_range', gamma=0.99996)
msaver = ModelEvaluation(fold)
print 'prepared'

print 'creating neural network model'
model = LSTMModel(ModelConfig.Instance())
nn = model.create(sentenceLoader.weights, predicateLoader.weights)
nn.summary()
#lrReducer.setNetwork(nn)
es = EarlyStopper()
evaluator.prepare(nn,
                  config.resultsDir + '/fold_' + str(fold) + '/epoch_' + str(1) + '/',
                  config.resourceDir + '/srl-eval.pl')
print 'model loaded'

print 'start training'
number_of_epochs = ModelConfig.Instance().trainingEpochs
# inclusive upper bound so all configured epochs actually run
for epoch in xrange(1, number_of_epochs + 1):
    print "--------- Epoch %d -----------" % (epoch)
    start_time = time.time()
print 'creating neural network model'
mp = ModelPersistence()
nn = mp.load(
    Config.Instance().resultsDir + '/model_50.json',
    Config.Instance().resultsDir + '/model_50.h5py',
)
nn.compile(optimizer=modelConfig.optimizer,
           loss=modelConfig.lossFunction,
           metrics=['accuracy'])
nn.summary()
lrReducer.setNetwork(nn)
print 'model loaded'

print 'start training'
number_of_epochs = ModelConfig.Instance().trainingEpochs
for epoch in xrange(50, 50 + number_of_epochs):
    print "--------- Epoch %d -----------" % (epoch + 1)
    start_time = time.time()
    numIterations = len(container)
    indexes = np.arange(len(container))
    #np.random.shuffle(indexes)
    print indexes
    print 'Running in {} batches'.format(numIterations)
    for i in xrange(0, numIterations):
        z = indexes[i]
        sent, pred, aux, label = batcher.open(container[z])
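        # Hedged sketch of how this truncated loop body would typically
        # continue; train_on_batch is a standard Keras call, but the
        # [sent, pred, aux] input ordering is an assumption about this model.
        nn.train_on_batch([sent, pred, aux], label)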
print 'loading corpus'
csvFiles = [
    config.convertedCorpusDir + '/propbank_training.csv',
    config.convertedCorpusDir + '/propbank_test.csv'
]
converter = CorpusConverter(csvFiles, sentenceLoader, predicateLoader)
data = converter.load(config.resourceDir + '/wiki_feature_file.npy')
tagMap = converter.tagMap
tagList = converter.tagList
nnUtils.setTagList(tagMap, tagList)
print 'loaded'

print 'loading neural network model'
inference = SRLInference(tagMap, tagList)
model = LSTMModel(ModelConfig.Instance())
nn = model.load(Config.Instance().resultsDir + '/best/wiki_model.json',
                Config.Instance().resultsDir + '/best/wiki_model.h5py')
nn.summary()
print 'model loaded'

prediction = Predictor(nn, tagList, inference)
wikiFile = pd.read_csv(config.convertedCorpusDir + '/wiki.csv')
results = []
iterations = len(wikiFile)
for i in xrange(0, iterations):
    try:
        propositionId = wikiFile['propositionId'][i]
        predicate = wikiFile['predicate'][i]
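# Hedged sketch (separate from the truncated loop above) of persisting the
# collected predictions once the loop finishes; it assumes each entry
# appended to results is a flat dict, and the output file name is made up.
pd.DataFrame(results).to_csv(config.convertedCorpusDir + '/wiki_predictions.csv', index=False)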