def getEmbeddingFromPersistenceManager(persistenceManager, name):
    """
    Return the embedding vector from the database.

    :type persistenceManager: persistence.PersistentManager.PersistentManager
    :param persistenceManager: manager that gives access to the saved attributes
    :param name: name of the object under which the embedding was saved as an attribute
    :return: the saved embedding
    """
    return EmbeddingLayer.getEmbeddingFromPersistenceManager(persistenceManager, name)
def main(**kwargs): log = logging.getLogger(__name__) log.info(kwargs) if kwargs["seed"] != None: random.seed(kwargs["seed"]) np.random.seed(kwargs["seed"]) filters = [] for filterName in kwargs["filters"]: moduleName, className = filterName.rsplit('.', 1) log.info("Usando o filtro: " + moduleName + " " + className) module_ = importlib.import_module(moduleName) filters.append(getattr(module_, className)()) wordWindowSize = kwargs["word_window_size"] hiddenLayerSize = kwargs["hidden_size"] batchSize = kwargs["batch_size"] startSymbol = kwargs["start_symbol"] endSymbol = kwargs["end_symbol"] numEpochs = kwargs["num_epochs"] lr = kwargs["lr"] tagLexicon = createLexiconUsingFile(kwargs["label_file"]) # _lambda = theano.shared(kwargs["lambda"], "lambda") _lambda = theano.shared(0.0, "lambda") useAdagrad = kwargs["adagrad"] shuffle = kwargs["shuffle"] supHiddenLayerSize = kwargs["hidden_size_supervised_part"] unsupHiddenLayerSize = kwargs["hidden_size_unsupervised_part"] normalization = kwargs["normalization"] activationHiddenExtractor = kwargs["activation_hidden_extractor"] withCharWNN = kwargs["with_charwnn"] convSize = kwargs["conv_size"] charEmbeddingSize = kwargs["char_emb_size"] charWindowSize = kwargs["char_window_size"] startSymbolChar = "</s>" if kwargs["charwnn_with_act"]: charAct = tanh else: charAct = None # TODO: the maximum number of characters of word is fixed in 20. numMaxChar = 20 if kwargs["decay"].lower() == "normal": decay = 0.0 elif kwargs["decay"].lower() == "divide_epoch": decay = 1.0 # Add the lexicon of target domainLexicon = Lexicon() domainLexicon.put("0") domainLexicon.put("1") domainLexicon.stopAdd() log.info("Reading W2v File1") wordEmbedding = EmbeddingFactory().createFromW2V(kwargs["word_embedding"], RandomUnknownStrategy()) log.info("Reading training examples") # Generators inputGenerators = [ WordWindowGenerator(wordWindowSize, wordEmbedding, filters, startSymbol) ] outputGeneratorTag = LabelGenerator(tagLexicon) if withCharWNN: # Create the character embedding charEmbedding = EmbeddingFactory().createRandomEmbedding( charEmbeddingSize) # Insert the padding of the character window charEmbedding.put(startSymbolChar) # Insert the character that will be used to fill the matrix # with a dimension lesser than chosen dimension.This enables that the convolution is performed by a matrix multiplication. artificialChar = "ART_CHAR" charEmbedding.put(artificialChar) inputGenerators.append( CharacterWindowGenerator(charEmbedding, numMaxChar, charWindowSize, wordWindowSize, artificialChar, startSymbolChar, startPaddingWrd=startSymbol, endPaddingWrd=endSymbol)) unsupervisedLabelSource = ConstantLabel(domainLexicon, "0") # Reading supervised and unsupervised data sets. 
trainSupervisedDatasetReader = TokenLabelReader( kwargs["train_source"], kwargs["token_label_separator"]) trainSupervisedBatch = SyncBatchIterator( trainSupervisedDatasetReader, inputGenerators, [outputGeneratorTag, unsupervisedLabelSource], batchSize[0], shuffle=shuffle) # Get Unsupervised Input unsupervisedLabelTarget = ConstantLabel(domainLexicon, "1") trainUnsupervisedDatasetReader = TokenReader(kwargs["train_target"]) trainUnsupervisedDatasetBatch = SyncBatchIterator( trainUnsupervisedDatasetReader, inputGenerators, [unsupervisedLabelTarget], batchSize[1], shuffle=shuffle) # Stopping to add new words, labels and chars wordEmbedding.stopAdd() tagLexicon.stopAdd() domainLexicon.stopAdd() if withCharWNN: charEmbedding.stopAdd() # Printing embedding information dictionarySize = wordEmbedding.getNumberOfVectors() embeddingSize = wordEmbedding.getEmbeddingSize() log.info("Size of word dictionary and word embedding size: %d and %d" % (dictionarySize, embeddingSize)) log.info( "Size of char dictionary and char embedding size: %d and %d" % (charEmbedding.getNumberOfVectors(), charEmbedding.getEmbeddingSize())) # Word Embedding Normalization if normalization == "zscore": wordEmbedding.zscoreNormalization() elif normalization == "minmax": wordEmbedding.minMaxNormalization() elif normalization == "mean": wordEmbedding.meanNormalization() elif normalization == "none" or not normalization: pass else: raise Exception() # Source input wordWindowSource = T.lmatrix(name="windowSource") sourceInput = [wordWindowSource] # Create the layers related with the extractor of features embeddingLayerSrc = EmbeddingLayer(wordWindowSource, wordEmbedding.getEmbeddingMatrix(), trainable=True) flattenSrc = FlattenLayer(embeddingLayerSrc) if withCharWNN: log.info("Using charwnn") # Create the charwn charWindowIdxSrc = T.ltensor4(name="char_window_idx_source") sourceInput.append(charWindowIdxSrc) charEmbeddingConvLayerSrc = EmbeddingConvolutionalLayer( charWindowIdxSrc, charEmbedding.getEmbeddingMatrix(), numMaxChar, convSize, charWindowSize, charEmbeddingSize, charAct) layerBeforeLinearSrc = ConcatenateLayer( [flattenSrc, charEmbeddingConvLayerSrc]) sizeLayerBeforeLinearSrc = wordWindowSize * ( wordEmbedding.getEmbeddingSize() + convSize) else: layerBeforeLinearSrc = flattenSrc sizeLayerBeforeLinearSrc = wordWindowSize * wordEmbedding.getEmbeddingSize( ) if activationHiddenExtractor == "tanh": log.info("Using tanh in the hidden layer of extractor") linear1 = LinearLayer(layerBeforeLinearSrc, sizeLayerBeforeLinearSrc, hiddenLayerSize, weightInitialization=GlorotUniform()) act1 = ActivationLayer(linear1, tanh) elif activationHiddenExtractor == "sigmoid": log.info("Using sigmoid in the hidden layer of extractor") linear1 = LinearLayer(layerBeforeLinearSrc, sizeLayerBeforeLinearSrc, hiddenLayerSize, weightInitialization=SigmoidGenerator()) act1 = ActivationLayer(linear1, sigmoid) else: raise Exception() # Create the layers with the Tagger if supHiddenLayerSize == 0: layerBeforeSupSoftmax = act1 layerSizeBeforeSupSoftmax = hiddenLayerSize log.info("It didn't insert the layer before the supervised softmax.") else: linear2 = LinearLayer(act1, hiddenLayerSize, supHiddenLayerSize, weightInitialization=GlorotUniform()) act2 = ActivationLayer(linear2, tanh) layerBeforeSupSoftmax = act2 layerSizeBeforeSupSoftmax = supHiddenLayerSize log.info("It inserted the layer before the supervised softmax.") supervisedLinear = LinearLayer(layerBeforeSupSoftmax, layerSizeBeforeSupSoftmax, tagLexicon.getLen(), 
weightInitialization=ZeroWeightGenerator()) supervisedSoftmax = ActivationLayer(supervisedLinear, softmax) # Create the layers with the domain classifier gradientReversalSource = GradientReversalLayer(act1, _lambda) if unsupHiddenLayerSize == 0: layerBeforeUnsupSoftmax = gradientReversalSource layerSizeBeforeUnsupSoftmax = hiddenLayerSize log.info("It didn't insert the layer before the unsupervised softmax.") else: unsupervisedSourceLinearBf = LinearLayer( gradientReversalSource, hiddenLayerSize, unsupHiddenLayerSize, weightInitialization=GlorotUniform()) actUnsupervisedSourceBf = ActivationLayer(unsupervisedSourceLinearBf, tanh) layerBeforeUnsupSoftmax = actUnsupervisedSourceBf layerSizeBeforeUnsupSoftmax = unsupHiddenLayerSize log.info("It inserted the layer before the unsupervised softmax.") unsupervisedSourceLinear = LinearLayer( layerBeforeUnsupSoftmax, layerSizeBeforeUnsupSoftmax, domainLexicon.getLen(), weightInitialization=ZeroWeightGenerator()) unsupervisedSourceSoftmax = ActivationLayer(unsupervisedSourceLinear, softmax) ## Target Part windowTarget = T.lmatrix(name="windowTarget") targetInput = [windowTarget] # Create the layers related with the extractor of features embeddingLayerUnsuper1 = EmbeddingLayer( windowTarget, embeddingLayerSrc.getParameters()[0], trainable=True) flattenUnsuper1 = FlattenLayer(embeddingLayerUnsuper1) if withCharWNN: log.info("Using charwnn") # Create the charwn charWindowIdxTgt = T.ltensor4(name="char_window_idx_target") targetInput.append(charWindowIdxTgt) charEmbeddingConvLayerTgt = EmbeddingConvolutionalLayer( charWindowIdxTgt, charEmbeddingConvLayerSrc.getParameters()[0], numMaxChar, convSize, charWindowSize, charEmbeddingSize, charAct, trainable=True) layerBeforeLinearTgt = ConcatenateLayer( [flattenUnsuper1, charEmbeddingConvLayerTgt]) sizeLayerBeforeLinearTgt = wordWindowSize * ( wordEmbedding.getEmbeddingSize() + convSize) else: layerBeforeLinearTgt = flattenUnsuper1 sizeLayerBeforeLinearTgt = wordWindowSize * wordEmbedding.getEmbeddingSize( ) w, b = linear1.getParameters() linearUnsuper1 = LinearLayer(layerBeforeLinearTgt, sizeLayerBeforeLinearTgt, hiddenLayerSize, W=w, b=b, trainable=True) if activationHiddenExtractor == "tanh": log.info("Using tanh in the hidden layer of extractor") actUnsupervised1 = ActivationLayer(linearUnsuper1, tanh) elif activationHiddenExtractor == "sigmoid": log.info("Using sigmoid in the hidden layer of extractor") actUnsupervised1 = ActivationLayer(linearUnsuper1, sigmoid) else: raise Exception() # Create the layers with the domain classifier grandientReversalTarget = GradientReversalLayer(actUnsupervised1, _lambda) if unsupHiddenLayerSize == 0: layerBeforeUnsupSoftmax = grandientReversalTarget layerSizeBeforeUnsupSoftmax = hiddenLayerSize log.info("It didn't insert the layer before the unsupervised softmax.") else: w, b = unsupervisedSourceLinearBf.getParameters() unsupervisedTargetLinearBf = LinearLayer(grandientReversalTarget, hiddenLayerSize, unsupHiddenLayerSize, W=w, b=b, trainable=True) actUnsupervisedTargetLinearBf = ActivationLayer( unsupervisedTargetLinearBf, tanh) layerBeforeUnsupSoftmax = actUnsupervisedTargetLinearBf layerSizeBeforeUnsupSoftmax = unsupHiddenLayerSize log.info("It inserted the layer before the unsupervised softmax.") w, b = unsupervisedSourceLinear.getParameters() unsupervisedTargetLinear = LinearLayer(layerBeforeUnsupSoftmax, layerSizeBeforeUnsupSoftmax, domainLexicon.getLen(), W=w, b=b, trainable=True) unsupervisedTargetSoftmax = ActivationLayer(unsupervisedTargetLinear, softmax) # 
Set loss and prediction, and retrieve all layers.
supervisedLabel = T.lvector("supervisedLabel")
unsupervisedLabelSource = T.lvector("unsupervisedLabelSource")
unsupervisedLabelTarget = T.lvector("unsupervisedLabelTarget")

supervisedOutput = supervisedSoftmax.getOutput()
supervisedPrediction = ArgmaxPrediction(1).predict(supervisedOutput)
supervisedLoss = NegativeLogLikelihood().calculateError(
    supervisedOutput, supervisedPrediction, supervisedLabel)

unsupervisedOutputSource = unsupervisedSourceSoftmax.getOutput()
unsupervisedPredSource = ArgmaxPrediction(1).predict(unsupervisedOutputSource)
unsupervisedLossSource = NegativeLogLikelihood().calculateError(
    unsupervisedOutputSource, None, unsupervisedLabelSource)

unsupervisedOutputTarget = unsupervisedTargetSoftmax.getOutput()
unsupervisedPredTarget = ArgmaxPrediction(1).predict(unsupervisedOutputTarget)
unsupervisedLossTarget = NegativeLogLikelihood().calculateError(
    unsupervisedOutputTarget, None, unsupervisedLabelTarget)

# Create the optimizer.
if useAdagrad:
    log.info("Using ADAGRAD")
    opt = Adagrad(lr=lr, decay=decay)
else:
    log.info("Using SGD")
    opt = SGD(lr=lr, decay=decay)

allLayersSource = supervisedSoftmax.getLayerSet() | unsupervisedSourceSoftmax.getLayerSet()
allLayersTarget = unsupervisedTargetSoftmax.getLayerSet()

# Scale the target loss by the ratio between the sizes of the two training sets.
unsupervisedLossTarget *= float(trainSupervisedBatch.size()) / trainUnsupervisedDatasetBatch.size()

supervisedTrainMetrics = [
    LossMetric("TrainSupervisedLoss", supervisedLoss),
    AccuracyMetric("TrainSupervisedAcc", supervisedLabel, supervisedPrediction),
    LossMetric("TrainUnsupervisedLoss", unsupervisedLossSource),
    AccuracyMetric("TrainUnsupervisedAccuracy", unsupervisedLabelSource, unsupervisedPredSource)
]

unsupervisedTrainMetrics = [
    LossMetric("TrainUnsupervisedLoss", unsupervisedLossTarget),
    AccuracyMetric("TrainUnsupervisedAccuracy", unsupervisedLabelTarget, unsupervisedPredTarget)
]

evalMetrics = [AccuracyMetric("EvalAcc", supervisedLabel, supervisedPrediction)]

testMetrics = [AccuracyMetric("TestAcc", supervisedLabel, supervisedPrediction)]

# TODO: there was no time to test the code after these modifications.
model = GradientReversalModel(sourceInput, targetInput, supervisedLabel,
                              unsupervisedLabelSource, unsupervisedLabelTarget,
                              allLayersSource, allLayersTarget, opt,
                              supervisedPrediction, supervisedLoss,
                              unsupervisedLossSource, unsupervisedLossTarget,
                              supervisedTrainMetrics, unsupervisedTrainMetrics,
                              evalMetrics, testMetrics, mode=None)

# Get dev inputs and output.
log.info("Reading development examples")
devDatasetReader = TokenLabelReader(kwargs["dev"], kwargs["token_label_separator"])
devReader = SyncBatchIterator(devDatasetReader, inputGenerators, [outputGeneratorTag],
                              sys.maxint, shuffle=False)

callbacks = []

# log.info("Using fixed lambda: " + str(_lambda.get_value()))
log.info("Using scheduled lambda. alpha=" + str(kwargs["alpha"]) + " height=" + str(kwargs["height"]))
callbacks.append(ChangeLambda(_lambda, kwargs["alpha"], numEpochs, kwargs["height"]))

if kwargs["additional_dev"]:
    callbacks.append(
        AdditionalDevDataset(model, kwargs["additional_dev"], kwargs["token_label_separator"],
                             inputGenerators, outputGeneratorTag))

# Train the model.
model.train([trainSupervisedBatch, trainUnsupervisedDatasetBatch], numEpochs, devReader,
            callbacks=callbacks)
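
# The ChangeLambda callback above anneals the gradient-reversal coefficient over the
# epochs. Below is a minimal sketch of such a schedule, assuming the usual
# domain-adversarial (DANN-style) curve scaled by `height`; the exact formula lives in
# the ChangeLambda implementation, so treat this helper (lambdaSchedule) as
# illustrative only.
import numpy as np


def lambdaSchedule(epoch, numEpochs, alpha, height):
    """Hypothetical per-epoch value of the gradient-reversal coefficient."""
    p = float(epoch) / numEpochs  # training progress in [0, 1]
    return height * (2.0 / (1.0 + np.exp(-alpha * p)) - 1.0)

# Usage sketch: lambda grows smoothly from 0 towards `height` as training advances.
#   [lambdaSchedule(e, 10, alpha=10.0, height=1.0) for e in range(11)]
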
def main(): full_path = os.path.realpath(__file__) path, filename = os.path.split(full_path) logging.config.fileConfig(os.path.join(path, 'logging.conf'), defaults={}) log = logging.getLogger(__name__) if len(sys.argv) != 2: log.error("Missing argument: <JSON config file>") exit(1) argsDict = JsonArgParser(PARAMETERS).parse(sys.argv[1]) args = dict2obj(argsDict, 'ShortDocArguments') logging.getLogger(__name__).info(argsDict) if args.seed: random.seed(args.seed) np.random.seed(args.seed) lr = args.lr startSymbol = args.start_symbol endSymbol = args.end_symbol numEpochs = args.num_epochs shuffle = args.shuffle normalizeMethod = args.normalization wordWindowSize = args.word_window_size hiddenLayerSize = args.hidden_size convSize = args.conv_size # Load classes for filters. filters = [] for filterName in args.filters: moduleName, className = filterName.rsplit('.', 1) log.info("Filtro: " + moduleName + " " + className) module_ = importlib.import_module(moduleName) filters.append(getattr(module_, className)()) W1 = None b1 = None W2 = None b2 = None wordEmbedding = None if args.word_embedding: log.info("Reading W2v File") (wordLexicon, wordEmbedding) = Embedding.fromWord2Vec(args.word_embedding, unknownSymbol="__UNKNOWN__") wordLexicon.stopAdd() elif args.word_lexicon and args.word_emb_size: wordLexicon = Lexicon.fromTextFile(args.word_lexicon, hasUnknowSymbol=False) wordEmbedding = Embedding(wordLexicon, embeddingSize=args.word_emb_size) wordLexicon.stopAdd() else: log.error("You must provide argument word_embedding or word_lexicon and word_emb_size") # Create the lexicon of labels. labelLexicon = None if args.labels is not None: if args.label_lexicon is not None: log.error("Only one of the parameters label_lexicon and labels can be provided!") exit(1) labelLexicon = Lexicon.fromList(args.labels, hasUnknowSymbol=False) elif args.label_lexicon is not None: labelLexicon = Lexicon.fromTextFile(args.label_lexicon, hasUnknowSymbol=False) else: log.error("One of the parameters label_lexicon or labels must be provided!") exit(1) # # Build the network model (Theano graph). # # TODO: debug # theano.config.compute_test_value = 'warn' # ex = trainIterator.next() # inWords.tag.test_value = ex[0][0] # outLabel.tag.test_value = ex[1][0] # Matriz de entrada. Cada linha representa um token da oferta. Cada token é # representado por uma janela de tokens (token central e alguns tokens # próximos). Cada valor desta matriz corresponde a um índice que representa # um token no embedding. inWords = tensor.lmatrix("inWords") # Categoria correta de uma oferta. outLabel = tensor.lscalar("outLabel") # List of input tensors. One for each input layer. inputTensors = [inWords] # Whether the word embedding will be updated during training. embLayerTrainable = not args.fix_word_embedding if not embLayerTrainable: log.info("Not updating the word embedding!") # Lookup table for word features. embeddingLayer = EmbeddingLayer(inWords, wordEmbedding.getEmbeddingMatrix(), trainable=embLayerTrainable) # if not args.train and args.load_wordEmbedding: # attrs = np.load(args.load_wordEmbedding) # embeddingLayer.load(attrs) # log.info("Loaded word embedding (shape %s) from file %s" % ( # str(attrs[0].shape), args.load_wordEmbedding)) # A saída da lookup table possui 3 dimensões (numTokens, szWindow, szEmbedding). # Esta camada dá um flat nas duas últimas dimensões, produzindo uma saída # com a forma (numTokens, szWindow * szEmbedding). flattenInput = FlattenLayer(embeddingLayer) # Random weight initialization procedure. 
weightInit = GlorotUniform() # Convolution layer. Convolução no texto de uma oferta. convW = None convb = None if not args.train and args.load_conv: convNPY = np.load(args.load_conv) convW = convNPY[0] convb = convNPY[1] log.info("Loaded convolutional layer (shape %s) from file %s" % (str(convW.shape), args.load_conv)) convLinear = LinearLayer(flattenInput, wordWindowSize * wordEmbedding.getEmbeddingSize(), convSize, W=convW, b=convb, weightInitialization=weightInit) if args.conv_act: convOut = ActivationLayer(convLinear, tanh) else: convOut = convLinear # Max pooling layer. maxPooling = MaxPoolingLayer(convOut) # Hidden layer. if not args.train and args.load_hiddenLayer: hiddenNPY = np.load(args.load_hiddenLayer) W1 = hiddenNPY[0] b1 = hiddenNPY[1] log.info("Loaded hidden layer (shape %s) from file %s" % (str(W1.shape), args.load_hiddenLayer)) hiddenLinear = LinearLayer(maxPooling, convSize, hiddenLayerSize, W=W1, b=b1, weightInitialization=weightInit) hiddenAct = ActivationLayer(hiddenLinear, tanh) # Entrada linear da camada softmax. if not args.train and args.load_softmax: hiddenNPY = np.load(args.load_softmax) W2 = hiddenNPY[0] b2 = hiddenNPY[1] log.info("Loaded softmax layer (shape %s) from file %s" % (str(W2.shape), args.load_softmax)) sotmaxLinearInput = LinearLayer(hiddenAct, hiddenLayerSize, labelLexicon.getLen(), W=W2, b=b2, weightInitialization=ZeroWeightGenerator()) # Softmax. # softmaxAct = ReshapeLayer(ActivationLayer(sotmaxLinearInput, softmax), (1, -1)) softmaxAct = ActivationLayer(sotmaxLinearInput, softmax) # Prediction layer (argmax). prediction = ArgmaxPrediction(None).predict(softmaxAct.getOutput()) # Loss function. if args.label_weights is not None and len(args.label_weights) != labelLexicon.getLen(): log.error("Number of label weights (%d) is different from number of labels (%d)!" % ( len(args.label_weights), labelLexicon.getLen())) nlloe = NegativeLogLikelihoodOneExample(weights=args.label_weights) loss = nlloe.calculateError(softmaxAct.getOutput()[0], prediction, outLabel) # Input generators: word window. inputGenerators = [WordWindowGenerator(wordWindowSize, wordLexicon, filters, startSymbol, endSymbol)] # Output generator: generate one label per offer. outputGenerators = [TextLabelGenerator(labelLexicon)] # outputGenerators = [lambda label: labelLexicon.put(label)] evalPerIteration = None if args.train: trainDatasetReader = ShortDocReader(args.train) if args.load_method == "sync": log.info("Reading training examples...") trainIterator = SyncBatchIterator(trainDatasetReader, inputGenerators, outputGenerators, - 1, shuffle=shuffle) wordLexicon.stopAdd() elif args.load_method == "async": log.info("Examples will be asynchronously loaded.") trainIterator = AsyncBatchIterator(trainDatasetReader, inputGenerators, outputGenerators, - 1, shuffle=shuffle, maxqSize=1000) else: log.error("The argument 'load_method' has an invalid value: %s." 
% args.load_method) sys.exit(1) labelLexicon.stopAdd() # Get dev inputs and output dev = args.dev evalPerIteration = args.eval_per_iteration if not dev and evalPerIteration > 0: log.error("Argument eval_per_iteration cannot be used without a dev argument.") sys.exit(1) if dev: log.info("Reading development examples") devReader = ShortDocReader(args.dev) devIterator = SyncBatchIterator(devReader, inputGenerators, outputGenerators, - 1, shuffle=False) else: devIterator = None else: trainIterator = None devIterator = None if normalizeMethod == "minmax": log.info("Normalization: minmax") wordEmbedding.minMaxNormalization() elif normalizeMethod == "mean": log.info("Normalization: mean normalization") wordEmbedding.meanNormalization() elif normalizeMethod == "zscore": log.info("Normalization: zscore normalization") wordEmbedding.zscoreNormalization() elif normalizeMethod: log.error("Normalization: unknown value %s" % normalizeMethod) sys.exit(1) # Decaimento da taxa de aprendizado. decay = None if args.decay == "none": decay = 0.0 elif args.decay == "linear": decay = 1.0 else: log.error("Unknown decay parameter %s." % args.decay) exit(1) # Algoritmo de aprendizado. if args.alg == "adagrad": log.info("Using Adagrad") opt = Adagrad(lr=lr, decay=decay) elif args.alg == "sgd": log.info("Using SGD") opt = SGD(lr=lr, decay=decay) else: log.error("Unknown algorithm: %s." % args.alg) sys.exit(1) # TODO: debug # opt.lr.tag.test_value = 0.05 # Printing embedding information. dictionarySize = wordEmbedding.getNumberOfVectors() embeddingSize = wordEmbedding.getEmbeddingSize() log.info("Dictionary size: %d" % dictionarySize) log.info("Embedding size: %d" % embeddingSize) log.info("Number of categories: %d" % labelLexicon.getLen()) # Train metrics. trainMetrics = None if trainIterator: trainMetrics = [ LossMetric("TrainLoss", loss), AccuracyMetric("TrainAccuracy", outLabel, prediction) ] # Evaluation metrics. evalMetrics = None if devIterator: evalMetrics = [ LossMetric("EvalLoss", loss), AccuracyMetric("EvalAccuracy", outLabel, prediction), FMetric("EvalFMetric", outLabel, prediction, labels=labelLexicon.getLexiconDict().values()) ] # Test metrics. 
testMetrics = None if args.test: testMetrics = [ LossMetric("TestLoss", loss), AccuracyMetric("TestAccuracy", outLabel, prediction), FMetric("TestFMetric", outLabel, prediction, labels=labelLexicon.getLexiconDict().values()) ] # TODO: debug # mode = theano.compile.debugmode.DebugMode(optimizer=None) mode = None model = BasicModel(x=inputTensors, y=[outLabel], allLayers=softmaxAct.getLayerSet(), optimizer=opt, prediction=prediction, loss=loss, trainMetrics=trainMetrics, evalMetrics=evalMetrics, testMetrics=testMetrics, mode=mode) # Training if trainIterator: log.info("Training") model.train(trainIterator, numEpochs, devIterator, evalPerIteration=evalPerIteration) # Saving model after training if args.save_wordEmbedding: embeddingLayer.saveAsW2V(args.save_wordEmbedding, lexicon=wordLexicon) log.info("Saved word to vector to file: %s" % (args.save_wordEmbedding)) if args.save_conv: convLinear.save(args.save_conv) log.info("Saved convolution layer to file: %s" % (args.save_conv)) if args.save_hiddenLayer: hiddenLinear.save(args.save_hiddenLayer) log.info("Saved hidden layer to file: %s" % (args.save_hiddenLayer)) if args.save_softmax: sotmaxLinearInput.save(args.save_softmax) log.info("Saved softmax to file: %s" % (args.save_softmax)) # Testing if args.test: log.info("Reading test examples") testReader = ShortDocReader(args.test) testIterator = SyncBatchIterator(testReader, inputGenerators, outputGenerators, - 1, shuffle=False) log.info("Testing") model.test(testIterator)
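
# The load/save pattern used above stores each linear layer as a single .npy file
# holding the pair [W, b] (the loading code indexes positions 0 and 1). A minimal
# sketch of that convention follows; the helper names are hypothetical and the real
# `save` methods of the layers may differ in details.
import numpy as np


def saveLinearLayerParams(path, W, b):
    """Persist the weight matrix and bias vector of a linear layer as [W, b]."""
    np.save(path, np.array([np.asarray(W), np.asarray(b)], dtype=object))


def loadLinearLayerParams(path):
    """Load the [W, b] pair previously stored by saveLinearLayerParams."""
    params = np.load(path, allow_pickle=True)
    return params[0], params[1]

# Usage sketch:
#   W, b = loadLinearLayerParams("conv_layer.npy")
#   convLinear = LinearLayer(flattenInput, inputSize, convSize, W=W, b=b)
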
def trainNetwork(args, log, trainIterator, devIterator, wordEmbedding, charEmbedding, borrow, labelLexicon): # Build neural network. wordWindow = T.lmatrix("word_window") inputModel = [wordWindow] wordEmbeddingLayer = EmbeddingLayer(wordWindow, wordEmbedding.getEmbeddingMatrix(), borrow=borrow, structGrad=args.struct_grad, trainable=True, name="word_embedding_layer") flatWordEmbedding = FlattenLayer(wordEmbeddingLayer) charWindowIdxs = T.ltensor4(name="char_window_idx") inputModel.append(charWindowIdxs) # # TODO: debug # theano.config.compute_test_value = 'warn' # ex = trainIterator.next() # inWords.tag.test_value = ex[0][0] # outLabel.tag.test_value = ex[1][0] charEmbeddingConvLayer = EmbeddingConvolutionalLayer(charWindowIdxs, charEmbedding.getEmbeddingMatrix(), 20, args.conv_size, args.char_window_size, args.char_emb_size, tanh, structGrad=args.char_struct_grad, name="char_convolution_layer", borrow=borrow) layerBeforeLinear = ConcatenateLayer([flatWordEmbedding, charEmbeddingConvLayer]) sizeLayerBeforeLinear = args.word_window_size * (wordEmbedding.getEmbeddingSize() + args.conv_size) hiddenActFunction = method_name(args.hidden_activation_function) weightInit = SigmoidGlorot() if hiddenActFunction == sigmoid else GlorotUniform() linearHidden = LinearLayer(layerBeforeLinear, sizeLayerBeforeLinear, args.hidden_size, weightInitialization=weightInit, name="linear1") actHidden = ActivationLayer(linearHidden, hiddenActFunction) linearSoftmax = LinearLayer(actHidden, args.hidden_size, labelLexicon.getLen(), weightInitialization=ZeroWeightGenerator(), name="linear_softmax") actSoftmax = ActivationLayer(linearSoftmax, softmax) prediction = ArgmaxPrediction(1).predict(actSoftmax.getOutput()) # Output symbolic tensor variable. y = T.lvector("y") if args.decay.lower() == "normal": decay = 0.0 elif args.decay.lower() == "divide_epoch": decay = 1.0 else: log.error("Unknown decay argument: %s" % args.decay) sys.exit(1) if args.adagrad: log.info("Training algorithm: Adagrad") opt = Adagrad(lr=args.lr, decay=decay) else: log.info("Training algorithm: SGD") opt = SGD(lr=args.lr, decay=decay) # Training loss function. loss = NegativeLogLikelihood().calculateError(actSoftmax.getOutput(), prediction, y) # L2 regularization. if args.l2: loss += args.l2 * (T.sum(T.square(linearHidden.getParameters()[0]))) # # TODO: debug # opt.lr.tag.test_value = 0.02 # Metrics. trainMetrics = [ LossMetric("LossTrain", loss, True), AccuracyMetric("AccTrain", y, prediction) ] evalMetrics = None if args.dev: evalMetrics = [ LossMetric("LossDev", loss, True), AccuracyMetric("AccDev", y, prediction), CustomMetric("CustomMetricDev", y, prediction) ] testMetrics = None if args.test: testMetrics = [ CustomMetric("CustomMetricTest", y, prediction) ] log.info("Compiling the network...") # # TODO: debug # mode = theano.compile.debugmode.DebugMode(optimizer=None) mode = None wnnModel = BasicModel(inputModel, [y], actSoftmax.getLayerSet(), opt, prediction, loss, trainMetrics=trainMetrics, evalMetrics=evalMetrics, testMetrics=testMetrics, mode=mode) log.info("Training...") wnnModel.train(trainIterator, args.num_epochs, devIterator)
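
# The `decay` flag above maps "normal" to 0.0 and "divide_epoch" to 1.0 before being
# handed to SGD/Adagrad. A minimal sketch of the per-epoch learning rate this
# presumably yields; the exact rule lives inside the optimizer classes, so this is
# only an illustration of the two settings.
def decayedLearningRate(baseLr, decay, epoch):
    """lr stays constant when decay == 0.0 and is divided by the epoch count when decay == 1.0."""
    return baseLr / (1.0 + decay * epoch)

# Assumed behaviour (epochs counted from 0):
#   decay = 0.0  ->  lr, lr, lr, ...
#   decay = 1.0  ->  lr, lr/2, lr/3, ...
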
def main(args): log = logging.getLogger(__name__) if args.seed: random.seed(args.seed) np.random.seed(args.seed) lr = args.lr startSymbol = args.start_symbol endSymbol = args.end_symbol numEpochs = args.num_epochs shuffle = args.shuffle normalizeMethod = args.normalization wordWindowSize = args.word_window_size hiddenLayerSize = args.hidden_size convSize = args.conv_size # Load classes for filters. filters = [] for filterName in args.filters: moduleName, className = filterName.rsplit('.', 1) log.info("Filtro: " + moduleName + " " + className) module_ = importlib.import_module(moduleName) filters.append(getattr(module_, className)()) loadPath = args.load_model if loadPath: with codecs.open(loadPath + ".param", "r", encoding="utf-8") as paramsFile: param = json.load(paramsFile, encoding="utf-8") hiddenActFunctionName = param['hiddenActFunction'] hiddenActFunction = method_name(hiddenActFunctionName) # Loading Embedding log.info("Loading Model") wordEmbedding = EmbeddingFactory().createFromW2V( loadPath + ".wv", ChosenUnknownStrategy(param["unknown"])) labelLexicon = Lexicon() for l in param["labels"]: labelLexicon.put(l) labelLexicon.stopAdd() # Loading model labelWeights = np.load(loadPath + ".npy").item(0) W1 = labelWeights["W_Hidden"] b1 = labelWeights["b_Hidden"] W2 = labelWeights["W_Softmax"] b2 = labelWeights["b_Softmax"] hiddenLayerSize = b1.shape[0] else: W1 = None b1 = None W2 = None b2 = None hiddenActFunctionName = args.hidden_activation_function hiddenActFunction = method_name(hiddenActFunctionName) if args.word_embedding: log.info("Reading W2v File") wordEmbedding = EmbeddingFactory().createFromW2V( args.word_embedding, RandomUnknownStrategy()) wordEmbedding.stopAdd() elif args.hash_lex_size: wordEmbedding = RandomEmbedding(args.word_emb_size, RandomUnknownStrategy(), HashLexicon(args.hash_lex_size)) else: wordEmbedding = EmbeddingFactory().createRandomEmbedding( args.word_emb_size) # Get the inputs and output if args.labels: labelLexicon = createLexiconUsingFile(args.labels) else: labelLexicon = Lexicon() if args.load_hidden_layer: # Loading Hidden Layer log.info("Loading Hidden Layer") hl = np.load(args.load_hidden_layer).item(0) W1 = hl["W_Encoder"] b1 = hl["b_Encoder"] hiddenLayerSize = b1.shape[0] # # Build the network model (Theano graph). # # TODO: debug # theano.config.compute_test_value = 'warn' # ex = trainIterator.next() # inWords.tag.test_value = ex[0][0] # outLabel.tag.test_value = ex[1][0] # Matriz de entrada. Cada linha representa um token da oferta. Cada token é # representado por uma janela de tokens (token central e alguns tokens # próximos). Cada valor desta matriz corresponde a um índice que representa # um token no embedding. inWords = T.lmatrix("inWords") # Categoria correta de uma oferta. outLabel = T.lscalar("outLabel") # List of input tensors. One for each input layer. inputTensors = [inWords] # Whether the word embedding will be updated during training. embLayerTrainable = not args.fix_word_embedding if not embLayerTrainable: log.info("Not updating the word embedding!") # Lookup table for word features. embeddingLayer = EmbeddingLayer(inWords, wordEmbedding.getEmbeddingMatrix(), trainable=embLayerTrainable) # A saída da lookup table possui 3 dimensões (numTokens, szWindow, szEmbedding). # Esta camada dá um flat nas duas últimas dimensões, produzindo uma saída # com a forma (numTokens, szWindow * szEmbedding). flattenInput = FlattenLayer(embeddingLayer) # Random weight initialization procedure. 
weightInit = SigmoidGlorot( ) if hiddenActFunction == sigmoid else GlorotUniform() # Convolution layer. Convolução no texto de uma oferta. convLinear = LinearLayer(flattenInput, wordWindowSize * wordEmbedding.getEmbeddingSize(), convSize, W=None, b=None, weightInitialization=weightInit) # Max pooling layer. maxPooling = MaxPoolingLayer(convLinear) # List of input layers (will be concatenated). inputLayers = [maxPooling] # Generate word windows. wordWindowFeatureGenerator = WordWindowGenerator(wordWindowSize, wordEmbedding, filters, startSymbol, endSymbol) # List of input generators. inputGenerators = [ lambda offer: wordWindowFeatureGenerator(offer["tokens"]) ] concatenatedSize = convSize # Additional features. if args.categorical_features is not None: log.info("Using categorical features: %s" % str([ftr[0] for ftr in args.categorical_features])) for ftr in args.categorical_features: concatenatedSize += ftr[2] ftrLexicon = createLexiconUsingFile(ftr[1]) ftrEmbedding = RandomEmbedding( embeddingSize=ftr[2], unknownGenerateStrategy=RandomUnknownStrategy(), lexicon=ftrLexicon, ) ftrInput = T.lscalar("in_" + ftr[0]) ftrLayer = EmbeddingLayer(ftrInput, ftrEmbedding.getEmbeddingMatrix()) inputGenerators.append( lambda offer: ftrLexicon.put(offer[ftr[0]].strip().lower())) inputTensors.append(ftrInput) inputLayers.append(ftrLayer) log.info("Input layers: %s" % str(inputLayers)) # Concatenate all input layers, when there are more thean one input layer. concatenatedInLayers = maxPooling if len( inputLayers) == 1 else ConcatenateLayer(inputLayers, axis=0) if args.include_hidden_layer: # Hidden layer. hiddenLinear = LinearLayer(concatenatedInLayers, concatenatedSize, hiddenLayerSize, W=W1, b=b1, weightInitialization=weightInit) hiddenAct = ActivationLayer(hiddenLinear, hiddenActFunction) else: # Do not use a hidden layer. log.info("Not using hidden layer!") hiddenAct = concatenatedInLayers hiddenLayerSize = concatenatedSize # Entrada linear da camada softmax. sotmaxLinearInput = LinearLayer(hiddenAct, hiddenLayerSize, labelLexicon.getLen(), W=W2, b=b2, weightInitialization=ZeroWeightGenerator()) # Softmax. # softmaxAct = ReshapeLayer(ActivationLayer(sotmaxLinearInput, softmax), (1, -1)) softmaxAct = ActivationLayer(sotmaxLinearInput, softmax) # Prediction layer (argmax). prediction = ArgmaxPrediction(None).predict(softmaxAct.getOutput()) # Class weights. labelWeights = None if args.labels_probs: numLabels = labelLexicon.getLen() labelWeights = np.zeros(numLabels, dtype=theano.config.floatX) if args.labels_probs.startswith("@"): # Load the dictionary from a JSON file. with codecs.open(args.labels_probs[1:], mode="r", encoding="utf8") as f: labelDistribution = json.load(f) else: # The argument value is already a JSON. labelDistribution = json.loads(args.labels_probs) for k, v in labelDistribution.items(): # The weight of a class is inversely-proportional to its frequency. labelWeights[labelLexicon.getLexiconIndex(k)] = 1.0 / v if args.labels_weights_log: # Attenuate weights for highly unbalanced classes. labelWeights = np.log(labelWeights) log.info("Label weights: " + str(labelWeights)) # Loss function. loss = NegativeLogLikelihoodOneExample(labelWeights).calculateError( softmaxAct.getOutput()[0], prediction, outLabel) # Output generator: generate one label per offer. 
outputGenerators = [TextLabelGenerator(labelLexicon)] if args.train: trainDatasetReader = OfertasReader(args.train) if args.load_method == "sync": log.info("Reading training examples...") trainIterator = SyncBatchIterator(trainDatasetReader, intputGenerators, outputGenerators, -1, shuffle=shuffle) wordEmbedding.stopAdd() elif args.load_method == "async": log.info("Examples will be asynchronously loaded.") trainIterator = AsyncBatchIterator(trainDatasetReader, inputGenerators, outputGenerators, -1, shuffle=shuffle, maxqSize=1000) else: log.error("The argument 'load_method' has an invalid value: %s." % args.load_method) sys.exit(1) labelLexicon.stopAdd() # Get dev inputs and output dev = args.dev evalPerIteration = args.eval_per_iteration if not dev and evalPerIteration > 0: log.error( "Argument eval_per_iteration cannot be used without a dev argument." ) sys.exit(1) if dev: log.info("Reading development examples") devReader = OfertasReader(args.dev) devIterator = SyncBatchIterator(devReader, inputGenerators, outputGenerators, -1, shuffle=False) else: devIterator = None else: trainIterator = None devIterator = None if normalizeMethod == "minmax": log.info("Normalization: minmax") wordEmbedding.minMaxNormalization() elif normalizeMethod == "mean": log.info("Normalization: mean normalization") wordEmbedding.meanNormalization() elif normalizeMethod == "zscore": log.info("Normalization: zscore normalization") wordEmbedding.zscoreNormalization() elif normalizeMethod: log.error("Normalization: unknown value %s" % normalizeMethod) sys.exit(1) if normalizeMethod is not None and loadPath is not None: log.warn( "The word embedding of model was normalized. This can change the result of test." ) # if kwargs["lambda"]: # _lambda = kwargs["lambda"] # log.info("Using L2 with lambda= %.2f", _lambda) # loss += _lambda * (T.sum(T.square(hiddenLinear.getParameters()[0]))) # Decaimento da taxa de aprendizado. decay = 0.0 if args.decay == "linear": decay = 1.0 # Algoritmo de aprendizado. if args.alg == "adagrad": log.info("Using Adagrad") opt = Adagrad(lr=lr, decay=decay) elif args.alg == "sgd": log.info("Using SGD") opt = SGD(lr=lr, decay=decay) else: log.error( "Unknown algorithm: %s. Expected values are: adagrad or sgd." % args.alg) sys.exit(1) # TODO: debug # opt.lr.tag.test_value = 0.05 # Printing embedding information. dictionarySize = wordEmbedding.getNumberOfVectors() embeddingSize = wordEmbedding.getEmbeddingSize() log.info("Dictionary size: %d" % dictionarySize) log.info("Embedding size: %d" % embeddingSize) log.info("Number of categories: %d" % labelLexicon.getLen()) # Train metrics. trainMetrics = None if trainIterator: trainMetrics = [ LossMetric("TrainLoss", loss), AccuracyMetric("TrainAccuracy", outLabel, prediction) ] # Evaluation metrics. evalMetrics = None if devIterator: evalMetrics = [ LossMetric("EvalLoss", loss), AccuracyMetric("EvalAccuracy", outLabel, prediction), FMetric("EvalFMetric", outLabel, prediction, labels=labelLexicon.getLexiconDict().values()) ] # Test metrics. testMetrics = None if args.test: testMetrics = [ LossMetric("TestLoss", loss), AccuracyMetric("TestAccuracy", outLabel, prediction), FMetric("TestFMetric", outLabel, prediction, labels=labelLexicon.getLexiconDict().values()) ] if args.test_probs: # Append predicted probabilities for the test set. 
testMetrics.append( PredictedProbabilities("TestProbs", softmaxAct.getOutput())) else: if args.test_probs: log.error( "The option test_probs requires a test dataset (option test).") sys.exit(1) # TODO: debug # mode = theano.compile.debugmode.DebugMode(optimizer=None) mode = None model = Model(x=inputTensors, y=[outLabel], allLayers=softmaxAct.getLayerSet(), optimizer=opt, prediction=prediction, loss=loss, trainMetrics=trainMetrics, evalMetrics=evalMetrics, testMetrics=testMetrics, mode=mode) # Training if trainIterator: callback = [] if args.save_model: savePath = args.save_model modelWriter = OfertasModelWritter(savePath, embeddingLayer, hiddenLinear, sotmaxLinearInput, wordEmbedding, labelLexicon, hiddenActFunctionName) callback.append(SaveModelCallback(modelWriter, "eval_acc", True)) log.info("Training") model.train(trainIterator, numEpochs, devIterator, evalPerIteration=evalPerIteration, callbacks=callback) # Testing if args.test: log.info("Reading test examples") testReader = OfertasReader(args.test) testIterator = SyncBatchIterator(testReader, inputGenerators, outputGenerators, -1, shuffle=False) log.info("Testing") model.test(testIterator)
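
# Sketch of the class-weighting scheme used above: each label weight is inversely
# proportional to the label frequency given in `labels_probs`, optionally attenuated
# with a log for highly unbalanced label sets. The helper below is hypothetical; the
# real code fills a vector indexed through the label lexicon.
import numpy as np


def buildLabelWeights(labelDistribution, labelToIndex, useLog=False):
    """labelDistribution maps label -> relative frequency, e.g. {"yes": 0.9, "no": 0.1}."""
    weights = np.zeros(len(labelToIndex))
    for label, freq in labelDistribution.items():
        weights[labelToIndex[label]] = 1.0 / freq
    if useLog:
        # Attenuate the spread between very frequent and very rare classes.
        weights = np.log(weights)
    return weights

# Example: buildLabelWeights({"yes": 0.9, "no": 0.1}, {"yes": 0, "no": 1})
# returns array([1.111..., 10.0]).
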
def main(): full_path = os.path.realpath(__file__) path, filename = os.path.split(full_path) logging.config.fileConfig(os.path.join(path, 'logging.conf'), defaults={}) log = logging.getLogger(__name__) if len(sys.argv) != 3: log.error("Missing argument: <JSON config file> or/and <Input file>") exit(1) argsDict = JsonArgParser(PARAMETERS).parse(sys.argv[1]) args = dict2obj(argsDict, 'ShortDocArguments') logging.getLogger(__name__).info(argsDict) if args.seed: random.seed(args.seed) np.random.seed(args.seed) lr = args.lr startSymbol = args.start_symbol endSymbol = args.end_symbol numEpochs = args.num_epochs shuffle = args.shuffle normalizeMethod = args.normalization wordWindowSize = args.word_window_size hiddenLayerSize = args.hidden_size convSize = args.conv_size # Load classes for filters. filters = [] for filterName in args.filters: moduleName, className = filterName.rsplit('.', 1) log.info("Filtro: " + moduleName + " " + className) module_ = importlib.import_module(moduleName) filters.append(getattr(module_, className)()) W1 = None b1 = None W2 = None b2 = None wordEmbedding = None if args.word_embedding: log.info("Reading W2v File") (wordLexicon, wordEmbedding) = Embedding.fromWord2Vec(args.word_embedding, unknownSymbol="__UNKNOWN__") wordLexicon.stopAdd() elif args.word_lexicon and args.word_emb_size: wordLexicon = Lexicon.fromTextFile(args.word_lexicon, hasUnknowSymbol=False) wordEmbedding = Embedding(wordLexicon, embeddingSize=args.word_emb_size) wordLexicon.stopAdd() else: log.error( "You must provide argument word_embedding or word_lexicon and word_emb_size" ) # Create the lexicon of labels. labelLexicon = None if args.labels is not None: if args.label_lexicon is not None: log.error( "Only one of the parameters label_lexicon and labels can be provided!" ) exit(1) labelLexicon = Lexicon.fromList(args.labels, hasUnknowSymbol=False) elif args.label_lexicon is not None: labelLexicon = Lexicon.fromTextFile(args.label_lexicon, hasUnknowSymbol=False) else: log.error( "One of the parameters label_lexicon or labels must be provided!") exit(1) # # Build the network model (Theano graph). # # TODO: debug # theano.config.compute_test_value = 'warn' # ex = trainIterator.next() # inWords.tag.test_value = ex[0][0] # outLabel.tag.test_value = ex[1][0] # Matriz de entrada. Cada linha representa um token da oferta. Cada token é # representado por uma janela de tokens (token central e alguns tokens # próximos). Cada valor desta matriz corresponde a um índice que representa # um token no embedding. inWords = tensor.lmatrix("inWords") # Categoria correta de uma oferta. outLabel = tensor.lscalar("outLabel") # List of input tensors. One for each input layer. inputTensors = [inWords] # Whether the word embedding will be updated during training. embLayerTrainable = not args.fix_word_embedding if not embLayerTrainable: log.info("Not updating the word embedding!") # Lookup table for word features. embeddingLayer = EmbeddingLayer(inWords, wordEmbedding.getEmbeddingMatrix(), trainable=embLayerTrainable) # if not args.train and args.load_wordEmbedding: # attrs = np.load(args.load_wordEmbedding) # embeddingLayer.load(attrs) # log.info("Loaded word embedding (shape %s) from file %s" % ( # str(attrs[0].shape), args.load_wordEmbedding)) # A saída da lookup table possui 3 dimensões (numTokens, szWindow, szEmbedding). # Esta camada dá um flat nas duas últimas dimensões, produzindo uma saída # com a forma (numTokens, szWindow * szEmbedding). 
flattenInput = FlattenLayer(embeddingLayer) # Random weight initialization procedure. weightInit = GlorotUniform() # Convolution layer. Convolução no texto de uma oferta. convW = None convb = None if not args.train and args.load_conv: convNPY = np.load(args.load_conv) convW = convNPY[0] convb = convNPY[1] log.info("Loaded convolutional layer (shape %s) from file %s" % (str(convW.shape), args.load_conv)) convLinear = LinearLayer(flattenInput, wordWindowSize * wordEmbedding.getEmbeddingSize(), convSize, W=convW, b=convb, weightInitialization=weightInit) # Max pooling layer. maxPooling = MaxPoolingLayer(convLinear) # Hidden layer. if not args.train and args.load_hiddenLayer: hiddenNPY = np.load(args.load_hiddenLayer) W1 = hiddenNPY[0] b1 = hiddenNPY[1] log.info("Loaded hidden layer (shape %s) from file %s" % (str(W1.shape), args.load_hiddenLayer)) hiddenLinear = LinearLayer(maxPooling, convSize, hiddenLayerSize, W=W1, b=b1, weightInitialization=weightInit) hiddenAct = ActivationLayer(hiddenLinear, tanh) # Entrada linear da camada softmax. if not args.train and args.load_softmax: hiddenNPY = np.load(args.load_softmax) W2 = hiddenNPY[0] b2 = hiddenNPY[1] log.info("Loaded softmax layer (shape %s) from file %s" % (str(W2.shape), args.load_softmax)) sotmaxLinearInput = LinearLayer(hiddenAct, hiddenLayerSize, labelLexicon.getLen(), W=W2, b=b2, weightInitialization=ZeroWeightGenerator()) # Softmax. # softmaxAct = ReshapeLayer(ActivationLayer(sotmaxLinearInput, softmax), (1, -1)) softmaxAct = ActivationLayer(sotmaxLinearInput, softmax) # Prediction layer (argmax). prediction = ArgmaxPrediction(None).predict(softmaxAct.getOutput()) # Loss function. if args.label_weights is not None and len( args.label_weights) != labelLexicon.getLen(): log.error( "Number of label weights (%d) is different from number of labels (%d)!" % (len(args.label_weights), labelLexicon.getLen())) nlloe = NegativeLogLikelihoodOneExample(weights=args.label_weights) loss = nlloe.calculateError(softmaxAct.getOutput()[0], prediction, outLabel) # Input generators: word window. inputGenerators = [ WordWindowGenerator(wordWindowSize, wordLexicon, filters, startSymbol, endSymbol) ] # Output generator: generate one label per offer. outputGenerators = [TextLabelGenerator(labelLexicon)] # outputGenerators = [lambda label: labelLexicon.put(label)] evalPerIteration = None if normalizeMethod == "minmax": log.info("Normalization: minmax") wordEmbedding.minMaxNormalization() elif normalizeMethod == "mean": log.info("Normalization: mean normalization") wordEmbedding.meanNormalization() elif normalizeMethod == "zscore": log.info("Normalization: zscore normalization") wordEmbedding.zscoreNormalization() elif normalizeMethod: log.error("Normalization: unknown value %s" % normalizeMethod) sys.exit(1) # Decaimento da taxa de aprendizado. decay = None if args.decay == "none": decay = 0.0 elif args.decay == "linear": decay = 1.0 else: log.error("Unknown decay parameter %s." % args.decay) exit(1) # Algoritmo de aprendizado. if args.alg == "adagrad": log.info("Using Adagrad") opt = Adagrad(lr=lr, decay=decay) elif args.alg == "sgd": log.info("Using SGD") opt = SGD(lr=lr, decay=decay) else: log.error("Unknown algorithm: %s." % args.alg) sys.exit(1) # TODO: debug # opt.lr.tag.test_value = 0.05 # Printing embedding information. 
dictionarySize = wordEmbedding.getNumberOfVectors()
embeddingSize = wordEmbedding.getEmbeddingSize()

log.info("Dictionary size: %d" % dictionarySize)
log.info("Embedding size: %d" % embeddingSize)
log.info("Number of categories: %d" % labelLexicon.getLen())

# TODO: debug
# mode = theano.compile.debugmode.DebugMode(optimizer=None)
mode = None

model = BasicModel(x=inputTensors,
                   y=[outLabel],
                   allLayers=softmaxAct.getLayerSet(),
                   optimizer=opt,
                   prediction=prediction,
                   loss=loss,
                   mode=mode)

wordWindow = WordWindowGenerator(wordWindowSize, wordLexicon, filters, startSymbol, endSymbol)

# GETS HIDDEN LAYER:
# graph = EmbeddingGraph([inWords], [hiddenAct.getOutput()], wordWindow)

# GRAPH FOR PREDICTION LAYER
graph = EmbeddingGraph(inputTensors, prediction, wordWindow, mode)

lblTxt = ["Sim", "Nao"]

tweets = []
with open(sys.argv[2]) as inputFile:
    content = inputFile.readlines()
    for line in content:
        tweets.append(line.decode('utf-8').encode('utf-8'))

# print tweets

# graph.getResultsFor(t) returns the prediction for a given tweet t.
try:
    output_file = open("Output.txt", "w")
except:
    print "Failed to create the output file\n"

try:
    for t in tweets:
        output_file.write(t.replace('\n', '').replace('\t', '') + "\t " +
                          lblTxt[graph.getResultsFor(t)] + "\n")
    print "Results successfully generated!\n"
except:
    print "Error while generating the results\n"
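
# Sketch of what the WordWindowGenerator used throughout these scripts produces: for
# each token, a window of word indices centred on it, padded with the start/end
# symbols at the sentence borders. The helper is hypothetical, ignores the text
# filters, and uses index 0 as the unknown word; it only illustrates the shape
# (numTokens, windowSize) of the matrices fed into `inWords`.
def buildWordWindows(tokens, tokenToIndex, windowSize, startIdx, endIdx):
    half = windowSize // 2
    padded = [startIdx] * half + [tokenToIndex.get(t, 0) for t in tokens] + [endIdx] * half
    return [padded[i:i + windowSize] for i in range(len(tokens))]

# Example with windowSize = 3:
#   buildWordWindows(["a", "b"], {"a": 1, "b": 2}, 3, startIdx=8, endIdx=9)
#   -> [[8, 1, 2], [1, 2, 9]]
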
class EmbeddingConvolutionalLayer(Layer): """ Convolutional layer of embedding features. The input of this layer is a 4-D tensor whose shape is: (numExs, szWrdWin, numMaxCh, szChWin) where numExs is the number of examples in a training (mini) batch, szWrdWin is the size of the word window (the convolution is independently performed for each index in this dimension), numMaxCh is the number of characters used to represent words (the convolution is performed over this dimension), szChWin is the size of the character window (each input for the convolution filters is composed by this number of features). The value numMaxCh, the number of characters used to represent a word, is fixed for all word to speedup training. For words that are shorter than this value, we extend them with an artificial character. For words that are longer than this value, we use only the last numMaxCh characters in them. Thus, this layer is not really convolutional (with variable-sized inputs), but it is sufficient for many applications and is much faster than an ordinary convolutional layer. """ def __init__(self, input, charEmbedding, numMaxCh, convSize, charWindowSize, charEmbSize, charAct=tanh, structGrad=True, trainable=True, name=None, borrow=True): """ :param input: a layer or theano variable :param charEmbedding: numpy.array or python list that contains the character vectors :param numMaxCh: the number of characters that will be used in a word. If the word size is greater than this parameter, so it'll be used the numMaxCh end characters of the word. If the word size is lesser than this parameter, so it'll be filled (numMaxCh - word_size) characters at the word end. :param convSize: number of convolution filters :param charWindowSize: the size of the character window :param charEmbSize: the size of the character embedding :param charAct: the activation function. If this paramater is None, so no activation function will be used. :param structGrad: whether to use structured gradients or not. When using small batches (online gradient descent, in the limit), the structured gradient is much more efficient because a small fraction of word vectors are used on each iteration. However, when using large batches (ordinary gradient descent, in the limit), ordinary gradients and updates are more efficient because most (or all of the) word vectors are used on each iteration. :param trainable: set if the layer is trainable or not :param name:unique name of the layer. This is use to save the attributes of this object. :param borrow: whether the shared variable of this layer will activate the borrow setting. """ # Input variable for this layer. Its shape is (numExs, szWrdWin, numMaxCh, szChWin) # where numExs is the number of examples in the training batch, # szWrdWin is the size of the word window, # numMaxCh is the number of characters used to represent words, and # szChWin is the size of the character window. self.input = input super(EmbeddingConvolutionalLayer, self).__init__(self.input, trainable, name) self.__output = None self.__charWindowSize = charWindowSize self.__convSize = convSize # This is the fixed size of all words. self.maxLenWord = numMaxCh # Activation function of hidden layer self.charAct = charAct self.__structGrad = structGrad # We use the symbolic shape of the input to perform all dimension # transformations (reshape) necessary for the computation of this layer. shape = T.shape(self.input) numExs = shape[0] szWrdWin = shape[1] numMaxCh = shape[2] szChWin = shape[3] # Character embedding layer. 
self.__embedLayer = EmbeddingLayer(self.input.flatten(2), charEmbedding, borrow=borrow, structGrad=structGrad, trainable=self.isTrainable()) # It chooses, based in the activation function, the way that the weights of liner layer will be initialized. if charAct is tanh: weightInitialization = GlorotUniform() elif charAct is sigmoid: weightInitialization = SigmoidGenerator() elif charAct is None: pass else: raise Exception("Activation function is not supported") # This is the bank of filters. It is an ordinary hidden layer. hidInput = ReshapeLayer( self.__embedLayer, (numExs * szWrdWin * numMaxCh, szChWin * charEmbSize)) self.__linearLayer = LinearLayer( hidInput, charWindowSize * charEmbSize, self.__convSize, weightInitialization=weightInitialization, trainable=self.isTrainable()) if charAct: self.actLayer = ActivationLayer(self.__linearLayer, self.charAct) layerBeforePolling = self.actLayer else: layerBeforePolling = self.__linearLayer # 3-D tensor with shape (numExs * szWrdWin, numMaxCh, convSize). # This tensor is used to perform the max pooling along its 2nd dimension. o = ReshapeLayer(layerBeforePolling, (numExs * szWrdWin, numMaxCh, convSize)) # Max pooling layer. Perform a max op along the character dimension. # The shape of the output is equal to (numExs*szWrdWin, convSize). m = T.max(o.getOutput(), axis=1) # The output is a 2-D tensor with shape (numExs, szWrdWin * convSize). self.__output = m.reshape((numExs, szWrdWin * convSize)) def updateAllCharIndexes(self, charIdxWord): for Idx in xrange(len(charIdxWord)): indexes = [] for charIdx in xrange(len(charIdxWord[Idx])): indexes.append(self.getCharIndexes(charIdx, charIdxWord[Idx])) self.AllCharWindowIndexes.append(indexes) def getParameters(self): return self.__embedLayer.getParameters( ) + self.__linearLayer.getParameters() def getDefaultGradParameters(self): return self.__linearLayer.getDefaultGradParameters( ) + self.__embedLayer.getDefaultGradParameters() def getStructuredParameters(self): return self.__linearLayer.getStructuredParameters( ) + self.__embedLayer.getStructuredParameters() def getUpdates(self, cost, lr, sumSqGrads=None): return self.__embedLayer.getUpdates(cost, lr, sumSqGrads) def getNormalizationUpdates(self, strategy, coef): return self.__embedLayer.getNormalizationUpdate(strategy, coef) def getOutput(self): return self.__output def getAttributes(self): keyValueList = self.__embedLayer.getAttributes().items() keyValueList += self.__linearLayer.getAttributes().items() dict = {} for key, value in keyValueList: dict[key] = value return dict def load(self, attributes): self.__embedLayer.load(attributes) self.__linearLayer.load(attributes) @staticmethod def getEmbeddingFromPersistenceManager(persistenceManager, name): """ Return the embedding vector from the database :type persistenceManager: persistence.PersistentManager.PersistentManager :param persistenceManager: :param name: name of object which the embedding was saved as attribute :return: """ return EmbeddingLayer.getEmbeddingFromPersistenceManager( persistenceManager, name)
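
# A minimal numpy sketch of the forward pass implemented by
# EmbeddingConvolutionalLayer, making the reshape/max-pooling chain described in the
# class docstring explicit. The function name is illustrative, tanh is assumed as the
# activation, and the real layer shares its parameters through
# EmbeddingLayer/LinearLayer instead of receiving them directly.
import numpy as np


def charConvForward(charWindowIdx, charEmb, W, b):
    """
    charWindowIdx: int array (numExs, szWrdWin, numMaxCh, szChWin) of character indices.
    charEmb:       (vocabSize, charEmbSize) character embedding matrix.
    W, b:          filter bank of shapes (szChWin * charEmbSize, convSize) and (convSize,).
    Returns an array of shape (numExs, szWrdWin * convSize).
    """
    numExs, szWrdWin, numMaxCh, szChWin = charWindowIdx.shape
    charEmbSize = charEmb.shape[1]
    convSize = W.shape[1]
    # Lookup: every index becomes a character vector.
    x = charEmb[charWindowIdx]  # (numExs, szWrdWin, numMaxCh, szChWin, charEmbSize)
    x = x.reshape(numExs * szWrdWin * numMaxCh, szChWin * charEmbSize)
    h = np.tanh(x.dot(W) + b)  # filter bank + activation
    h = h.reshape(numExs * szWrdWin, numMaxCh, convSize)
    pooled = h.max(axis=1)  # max pooling over the characters of each word
    return pooled.reshape(numExs, szWrdWin * convSize)
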
def main(args): log = logging.getLogger(__name__) if args.seed: random.seed(args.seed) np.random.seed(args.seed) lr = args.lr startSymbol = args.start_symbol endSymbol = args.end_symbol numEpochs = args.num_epochs shuffle = args.shuffle normalizeMethod = args.wv_normalization wordWindowSize = args.word_window_size hiddenLayerSize = args.hidden_size convSize = args.conv_size # Load classes for filters. filters = [] for filterName in args.filters: moduleName, className = filterName.rsplit('.', 1) log.info("Filtro: " + moduleName + " " + className) module_ = importlib.import_module(moduleName) filters.append(getattr(module_, className)()) W1 = None b1 = None W2 = None b2 = None hiddenActFunction = tanh if args.word_embedding: log.info("Reading W2v File") (lexicon, wordEmbedding) = Embedding.fromWord2Vec(args.word_embedding, unknownSymbol='unknown') lexicon.stopAdd() else: wordEmbedding = EmbeddingFactory().createRandomEmbedding( args.word_emb_size) # Get the inputs and output if args.labels: labelLexicon = Lexicon.fromTextFile(args.labels, hasUnknowSymbol=False) else: labelLexicon = Lexicon() # # Build the network model (Theano graph). # # TODO: debug # theano.config.compute_test_value = 'warn' # ex = trainIterator.next() # inWords.tag.test_value = ex[0][0] # outLabel.tag.test_value = ex[1][0] # Matriz de entrada. Cada linha representa um token da oferta. Cada token é # representado por uma janela de tokens (token central e alguns tokens # próximos). Cada valor desta matriz corresponde a um índice que representa # um token no embedding. inWords = T.lmatrix("inWords") # Categoria correta de uma oferta. outLabel = T.lscalar("outLabel") # List of input tensors. One for each input layer. inputTensors = [inWords] # Whether the word embedding will be updated during training. embLayerTrainable = not args.fix_word_embedding if not embLayerTrainable: log.info("Not updating the word embedding!") # Lookup table for word features. embeddingLayer = EmbeddingLayer(inWords, wordEmbedding.getEmbeddingMatrix(), trainable=embLayerTrainable) # A saída da lookup table possui 3 dimensões (numTokens, szWindow, szEmbedding). # Esta camada dá um flat nas duas últimas dimensões, produzindo uma saída # com a forma (numTokens, szWindow * szEmbedding). flattenInput = FlattenLayer(embeddingLayer) # Random weight initialization procedure. weightInit = SigmoidGlorot( ) if hiddenActFunction == sigmoid else GlorotUniform() # Convolution layer. Convolução no texto de um documento. convLinear = LinearLayer(flattenInput, wordWindowSize * wordEmbedding.getEmbeddingSize(), convSize, W=None, b=None, weightInitialization=weightInit) # Max pooling layer. maxPooling = MaxPoolingLayer(convLinear) # Generate word windows. wordWindowFeatureGenerator = WordWindowGenerator(wordWindowSize, lexicon, filters, startSymbol, endSymbol) # List of input generators. inputGenerators = [wordWindowFeatureGenerator] # Hidden layer. hiddenLinear = LinearLayer(maxPooling, convSize, hiddenLayerSize, W=W1, b=b1, weightInitialization=weightInit) hiddenAct = ActivationLayer(hiddenLinear, hiddenActFunction) # Entrada linear da camada softmax. sotmaxLinearInput = LinearLayer(hiddenAct, hiddenLayerSize, labelLexicon.getLen(), W=W2, b=b2, weightInitialization=ZeroWeightGenerator()) # Softmax. # softmaxAct = ReshapeLayer(ActivationLayer(sotmaxLinearInput, softmax), (1, -1)) softmaxAct = ActivationLayer(sotmaxLinearInput, softmax) # Prediction layer (argmax). prediction = ArgmaxPrediction(None).predict(softmaxAct.getOutput()) # Loss function. 
loss = NegativeLogLikelihoodOneExample().calculateError( softmaxAct.getOutput()[0], prediction, outLabel) # Output generator: generate one label per offer. outputGenerators = [TextLabelGenerator(labelLexicon)] if args.train: trainDatasetReader = DocReader(args.train) log.info("Reading training examples...") trainIterator = SyncBatchIterator(trainDatasetReader, inputGenerators, outputGenerators, -1, shuffle=shuffle) lexicon.stopAdd() labelLexicon.stopAdd() # Get dev inputs and output dev = args.dev evalPerIteration = args.eval_per_iteration if not dev and evalPerIteration > 0: log.error( "Argument eval_per_iteration cannot be used without a dev argument." ) sys.exit(1) if dev: log.info("Reading development examples") devReader = DocReader(args.dev) devIterator = SyncBatchIterator(devReader, inputGenerators, outputGenerators, -1, shuffle=False) else: devIterator = None else: trainIterator = None devIterator = None if normalizeMethod == "minmax": log.info("Normalization: minmax") wordEmbedding.minMaxNormalization() elif normalizeMethod == "mean": log.info("Normalization: mean normalization") wordEmbedding.meanNormalization() elif normalizeMethod == "zscore": log.info("Normalization: zscore normalization") wordEmbedding.zscoreNormalization() elif normalizeMethod: log.error("Normalization: unknown value %s" % normalizeMethod) sys.exit(1) # Decaimento da taxa de aprendizado. if args.decay == "linear": decay = 1.0 elif args.decay == "none": decay = 0.0 else: log.error("Unknown decay strategy %s. Expected: none or linear." % args.decay) sys.exit(1) # Algoritmo de aprendizado. if args.alg == "adagrad": log.info("Using Adagrad") opt = Adagrad(lr=lr, decay=decay) elif args.alg == "sgd": log.info("Using SGD") opt = SGD(lr=lr, decay=decay) else: log.error( "Unknown algorithm: %s. Expected values are: adagrad or sgd." % args.alg) sys.exit(1) # TODO: debug # opt.lr.tag.test_value = 0.05 # Printing embedding information. dictionarySize = wordEmbedding.getNumberOfVectors() embeddingSize = wordEmbedding.getEmbeddingSize() log.info("Dictionary size: %d" % dictionarySize) log.info("Embedding size: %d" % embeddingSize) log.info("Number of categories: %d" % labelLexicon.getLen()) # Train metrics. trainMetrics = None if trainIterator: trainMetrics = [ LossMetric("TrainLoss", loss), AccuracyMetric("TrainAccuracy", outLabel, prediction) ] # Evaluation metrics. evalMetrics = None if devIterator: evalMetrics = [ LossMetric("EvalLoss", loss), AccuracyMetric("EvalAccuracy", outLabel, prediction) ] # Test metrics. testMetrics = None if args.test: testMetrics = [ LossMetric("TestLoss", loss), AccuracyMetric("TestAccuracy", outLabel, prediction) ] # TODO: debug # mode = theano.compile.debugmode.DebugMode(optimizer=None) mode = None model = BasicModel(x=inputTensors, y=[outLabel], allLayers=softmaxAct.getLayerSet(), optimizer=opt, prediction=prediction, loss=loss, trainMetrics=trainMetrics, evalMetrics=evalMetrics, testMetrics=testMetrics, mode=mode) # Training if trainIterator: log.info("Training") model.train(trainIterator, numEpochs, devIterator, evalPerIteration=evalPerIteration) # Testing if args.test: log.info("Reading test examples") testReader = DocReader(args.test) testIterator = SyncBatchIterator(testReader, inputGenerators, outputGenerators, -1, shuffle=False) log.info("Testing") model.test(testIterator)
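# Minimal NumPy sketch of the prediction and the per-example loss used in main() above,
# under the assumption that NegativeLogLikelihoodOneExample computes the standard
# -log p(gold) over a single softmax row. Probabilities below are illustrative.
import numpy as np

def predict(softmaxRow):
    # ArgmaxPrediction: index of the most probable category.
    return int(np.argmax(softmaxRow))

def nllOneExample(softmaxRow, goldLabel):
    # Negative log-likelihood of the correct category for a single offer.
    return -np.log(softmaxRow[goldLabel])

probs = np.array([0.1, 0.7, 0.2])   # softmaxAct.getOutput()[0] for one offer
print(predict(probs))               # 1
print(nllOneExample(probs, 1))      # ~0.357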
def main(**kwargs): log = logging.getLogger(__name__) log.info(kwargs) if kwargs["seed"] != None: random.seed(kwargs["seed"]) np.random.seed(kwargs["seed"]) filters = [] for filterName in kwargs["filters"]: moduleName, className = filterName.rsplit('.', 1) log.info("Usando o filtro: " + moduleName + " " + className) module_ = importlib.import_module(moduleName) filters.append(getattr(module_, className)()) # Get the inputs and output wordWindowSize = kwargs["word_window_size"] hiddenLayerSize = kwargs["hidden_size"] batchSize = kwargs["batch_size"] startSymbol = kwargs["start_symbol"] numEpochs = kwargs["num_epochs"] lr = kwargs["lr"] labelLexicon = createLexiconUsingFile(kwargs["label_file"]) log.info("Reading training examples") log.info("Reading W2v File1") embedding1 = EmbeddingFactory().createFromW2V(kwargs["word_embedding1"], RandomUnknownStrategy()) # Supervised part # Learner1 input1 = T.lmatrix(name="input1") embeddingLayer1 = EmbeddingLayer(input1, embedding1.getEmbeddingMatrix(), trainable=True) flatten1 = FlattenLayer(embeddingLayer1) linear11 = LinearLayer(flatten1, wordWindowSize * embedding1.getEmbeddingSize(), hiddenLayerSize, weightInitialization=GlorotUniform()) act11 = ActivationLayer(linear11, tanh) linear12 = LinearLayer(act11, hiddenLayerSize, labelLexicon.getLen(), weightInitialization=ZeroWeightGenerator()) act12 = ActivationLayer(linear12, softmax) ## Learner2 log.info("Reading W2v File2") embedding2 = EmbeddingFactory().createFromW2V(kwargs["word_embedding2"], RandomUnknownStrategy()) input2 = T.lmatrix(name="input2") embeddingLayer2 = EmbeddingLayer(input2, embedding2.getEmbeddingMatrix(), trainable=True) flatten2 = FlattenLayer(embeddingLayer2) linear21 = LinearLayer(flatten2, wordWindowSize * embedding2.getEmbeddingSize(), hiddenLayerSize, weightInitialization=GlorotUniform()) act21 = ActivationLayer(linear21, tanh) linear22 = LinearLayer(act21, hiddenLayerSize, labelLexicon.getLen(), weightInitialization=ZeroWeightGenerator()) act22 = ActivationLayer(linear22, softmax) y = T.lvector("y") # Set loss and prediction and retrieve all layers output1 = act12.getOutput() prediction1 = ArgmaxPrediction(1).predict(output1) loss1 = NegativeLogLikelihood().calculateError(output1, prediction1, y) if kwargs["l2"][0]: _lambda1 = kwargs["l2"][0] log.info("Using L2 with lambda= %.2f", _lambda1) loss1 += _lambda1 * (T.sum(T.square(linear11.getParameters()[0]))) output2 = act22.getOutput() prediction2 = ArgmaxPrediction(1).predict(output2) loss2 = NegativeLogLikelihood().calculateError(output2, prediction2, y) if kwargs["l2"][1]: _lambda2 = kwargs["l2"][1] log.info("Using L2 with lambda= %.2f", _lambda2) loss2 += _lambda2 * (T.sum(T.square(linear21.getParameters()[0]))) loss = loss1 + loss2 ## CoLearningPrediction output = T.stack([linear12.getOutput(), linear22.getOutput()]) # return T.argmax(output, 2)[T.argmax(T.max(output, 2), 0),T.arange(output.shape[1])] average = T.mean(output, 0) prediction = ArgmaxPrediction(1).predict( ActivationLayer(average, softmax).getOutput()) # prediction = CoLearningWnnPrediction().predict([output1, output2]) supervisedModeUnit = ModelUnit("supervised_wnn", [input1, input2], y, loss, prediction=prediction) # Unsupervised part ## Learner1 inputUnsuper1 = T.lmatrix(name="input_unsupervised_1") embeddingLayerUnsuper1 = EmbeddingLayer(inputUnsuper1, embeddingLayer1.getParameters()[0], trainable=True) flattenUnsuper1 = FlattenLayer(embeddingLayerUnsuper1) w, b = linear11.getParameters() linearUnsuper11 = LinearLayer(flattenUnsuper1, wordWindowSize * 
embedding1.getEmbeddingSize(), hiddenLayerSize, W=w, b=b) actUnsupervised11 = ActivationLayer(linearUnsuper11, tanh) w, b = linear12.getParameters() linearUnsuper12 = LinearLayer(actUnsupervised11, hiddenLayerSize, labelLexicon.getLen(), W=w, b=b) actUnsuper12 = ActivationLayer(linearUnsuper12, softmax) ## Learner2 inputUnsuper2 = T.lmatrix(name="input_unsupervised_2") embeddingLayerUnsuper2 = EmbeddingLayer(inputUnsuper2, embeddingLayer2.getParameters()[0], trainable=True) flattenUnsuper2 = FlattenLayer(embeddingLayerUnsuper2) w, b = linear21.getParameters() linearUnsuper21 = LinearLayer(flattenUnsuper2, wordWindowSize * embedding2.getEmbeddingSize(), hiddenLayerSize, W=w, b=b) actUnsuper21 = ActivationLayer(linearUnsuper21, tanh) w, b = linear22.getParameters() linearUnsuper22 = LinearLayer(actUnsuper21, hiddenLayerSize, labelLexicon.getLen(), W=w, b=b) actUnsuper22 = ActivationLayer(linearUnsuper22, softmax) # Set loss and prediction and retrieve all layers outputUns1 = actUnsuper12.getOutput() predictionUns1 = ArgmaxPrediction(1).predict(outputUns1) outputUns2 = actUnsuper22.getOutput() predictionUns2 = ArgmaxPrediction(1).predict(outputUns2) # # unsupervisedLoss = kwargs["lambda"] * ( # NegativeLogLikelihood().calculateError(outputUns1, predictionUns1, predictionUns2) + # NegativeLogLikelihood().calculateError(outputUns2, predictionUns2, predictionUns1)) _lambdaShared = theano.shared(value=kwargs["lambda"], name='lambda', borrow=True) unsupervisedLoss = _lambdaShared * (NegativeLogLikelihood().calculateError( outputUns1, predictionUns1, predictionUns2) + NegativeLogLikelihood( ).calculateError(outputUns2, predictionUns2, predictionUns1)) unsupervisedUnit = ModelUnit("unsupervised_wnn", [inputUnsuper1, inputUnsuper2], None, unsupervisedLoss, yWillBeReceived=False) # Creates model model = CoLearningModel(kwargs["loss_uns_epoch"]) model.addTrainingModelUnit(supervisedModeUnit, metrics=["loss", "acc"]) model.addTrainingModelUnit(unsupervisedUnit, metrics=["loss"]) model.setEvaluatedModelUnit(supervisedModeUnit, metrics=["acc"]) # Compile Model opt1 = SGD(lr=lr[0], decay=1.0) opt2 = SGD(lr=lr[1], decay=1.0) log.info("Compiling the model") model.compile([(opt1, { supervisedModeUnit: act12.getLayerSet(), unsupervisedUnit: actUnsuper12.getLayerSet() }), (opt2, { supervisedModeUnit: act22.getLayerSet(), unsupervisedUnit: actUnsuper22.getLayerSet() })]) # Generators inputGenerator1 = WordWindowGenerator(wordWindowSize, embedding1, filters, startSymbol) inputGenerator2 = WordWindowGenerator(wordWindowSize, embedding2, filters, startSymbol) outputGenerator = LabelGenerator(labelLexicon) # Reading supervised and unsupervised data sets. 
trainSupervisedDatasetReader = TokenLabelReader( kwargs["train_supervised"], kwargs["token_label_separator"]) trainSupervisedDatasetReader = SyncBatchIterator( trainSupervisedDatasetReader, [inputGenerator1, inputGenerator2], [outputGenerator], batchSize[0]) trainUnsupervisedDataset = TokenReader(kwargs["train_unsupervised"]) trainUnsupervisedDatasetReader = SyncBatchIterator( trainUnsupervisedDataset, [inputGenerator1, inputGenerator2], None, batchSize[1]) embedding1.stopAdd() embedding2.stopAdd() labelLexicon.stopAdd() # Get dev inputs and output log.info("Reading development examples") devDatasetReader = TokenLabelReader(kwargs["dev"], kwargs["token_label_separator"]) devReader = SyncBatchIterator(devDatasetReader, [inputGenerator1, inputGenerator2], [outputGenerator], sys.maxint, shuffle=False) lambdaChange = ChangeLambda(_lambdaShared, kwargs["lambda"], kwargs["loss_uns_epoch"]) lossCallback = LossCallback(loss1, loss2, input1, input2, y) # trainUnsupervisedDatasetReaderAcc = SyncBatchIterator(trainUnsupervisedDataset, # [inputGenerator1, inputGenerator2], # [outputGenerator], sys.maxint) # accCallBack = AccCallBack(prediction1, prediction2, input1, input2, # unsurpervisedDataset=trainUnsupervisedDatasetReaderAcc) # Training Model model.train([trainSupervisedDatasetReader, trainUnsupervisedDatasetReader], numEpochs, devReader, callbacks=[lambdaChange, lossCallback])
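# NumPy sketch of the combined prediction used by the co-learning model above: the two
# learners' linear (pre-softmax) outputs are stacked, averaged, passed through a softmax,
# and the argmax is taken. The score values below are illustrative only.
import numpy as np

def softmax(z):
    e = np.exp(z - z.max(axis=-1, keepdims=True))
    return e / e.sum(axis=-1, keepdims=True)

scores1 = np.array([[2.0, 0.5, 0.1]])   # linear12.getOutput() for one token window
scores2 = np.array([[0.3, 2.2, 0.2]])   # linear22.getOutput() for the same window

average = np.mean(np.stack([scores1, scores2]), axis=0)
prediction = np.argmax(softmax(average), axis=1)
print(prediction)   # label index 1 wins after averaging the two learners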
def mainWnnNegativeSampling(args): # Reading parameters embeddingMatrix = None wordEmbeddingSize = args.word_embedding_size windowSize = args.window_size hiddenLayerSize = args.hidden_size startSymbol = args.start_symbol # endSymbol = args.end_symbol endSymbol = startSymbol noiseRate = args.noise_rate # todo: o algoritmo não suporta mini batch. Somente treinamento estocástico. batchSize = 1 shuffle = args.shuffle lr = args.lr numEpochs = args.num_epochs power = args.power minLr = args.min_lr numExUpdLr = args.num_examples_updt_lr log = logging.getLogger(__name__) log.info(str(args)) if args.seed: random.seed(args.seed) np.random.seed(args.seed) # # if args.decay.lower() == "normal": # decay = 0.0 # elif args.decay.lower() == "divide_epoch": # decay = 1.0 parametersToSaveOrLoad = {"hidden_size", "window_size", "start_symbol"} # Calculate the frequency of each word trainReader = TokenReader(args.train) wordLexicon = Lexicon("UUKNNN", "lexicon") wordLexicon.put(startSymbol, False) totalNumOfTokens = 0 for tokens, labels in trainReader.read(): # we don't count the </s>, because this token is only insert in the sentence to count its frequency. totalNumOfTokens += len(tokens) # Word2vec considers that the number of lines is the frequency of </s> tokens += [startSymbol] for token in tokens: wordLexicon.put(token) # Prune the words with the frequency less than min_count wordLexicon.prune(args.min_count) wordLexicon.stopAdd() # Calculte the unigram distribution frequency = np.power(wordLexicon.getFrequencyOfAllWords(), power) total = float(frequency.sum()) # # Print the distribution of all words # for _ in xrange(len(frequency)): # print "%s\t%d\t%.4f" % (wordLexicon.getLexicon(_), frequency[_],frequency[_]/float(total)) sampler = Sampler(frequency / float(total)) # Create a random embedding for each word wordEmbedding = Embedding(wordLexicon, None, wordEmbeddingSize) log.info("Lexicon size: %d" % (wordLexicon.getLen())) # Create NN x = T.lmatrix("word_window") y = T.lvector("labels") wordEmbeddingLayer = EmbeddingLayer(x, wordEmbedding.getEmbeddingMatrix(), name="embedding") flatten = FlattenLayer(wordEmbeddingLayer) linear1 = LinearLayer(flatten, wordEmbeddingSize * windowSize, hiddenLayerSize, name="linear1") act1 = ActivationLayer(linear1, tanh) # Softmax regression. It's like a logistic regression linear2 = LinearLayer(act1, hiddenLayerSize, 1, weightInitialization=ZeroWeightGenerator(), name="linear_softmax_regresion") act2 = ActivationLayer(linear2, sigmoid) # We clip the output of -sigmoid, because this output can be 0 and ln(0) is infinite, which can cause problems. 
output = T.flatten(T.clip(act2.getOutput(), 10**-5, 1 - 10**-5)) # Loss Functions negativeSamplingLoss = T.nnet.binary_crossentropy(output, y).sum() # Set training inputGenerators = [ WordWindowGenerator(windowSize, wordLexicon, [], startSymbol, endSymbol) ] outputGenerators = [ConstantLabel(labelLexicon=None, label=1)] trainIterator = SyncBatchIterator(trainReader, inputGenerators, outputGenerators, batchSize, shuffle) trainMetrics = [LossMetric("lossTrain", negativeSamplingLoss)] allLayers = act2.getLayerSet() # opt = SGD(lr=lr, decay=decay) opt = SGD(lr=lr) model = NegativeSamplingModel(args.t, noiseRate, sampler, minLr, numExUpdLr, totalNumOfTokens, numEpochs, [x], [y], allLayers, opt, negativeSamplingLoss, trainMetrics) # Save Model if args.save_model: savePath = args.save_model objsToSave = list(act2.getLayerSet()) + [wordLexicon] modelWriter = ModelWriter(savePath, objsToSave, args, parametersToSaveOrLoad) # Training model.train(trainIterator, numEpochs=numEpochs, callbacks=[]) if args.save_model: modelWriter.save()
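# Sketch of the noise distribution built in mainWnnNegativeSampling above: word
# frequencies raised to `power` (0.75 in word2vec) and normalized, then sampled to draw
# negative examples. The counts and noise rate below are illustrative, not real data.
import numpy as np

counts = np.array([50.0, 30.0, 15.0, 5.0])   # frequency of each lexicon entry
power = 0.75
noiseRate = 5

dist = np.power(counts, power)
dist /= dist.sum()

# Draw `noiseRate` noise words for one positive (word window, label=1) example.
noiseWords = np.random.choice(len(counts), size=noiseRate, p=dist)
print(noiseWords)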
def mainWnn(args): ################################################ # Initializing parameters ############################################## log = logging.getLogger(__name__) if args.seed: random.seed(args.seed) np.random.seed(args.seed) parametersToSaveOrLoad = {"word_filters", "suffix_filters", "char_filters", "cap_filters", "alg", "hidden_activation_function", "word_window_size", "char_window_size", "hidden_size", "with_charwnn", "conv_size", "charwnn_with_act", "suffix_size", "use_capitalization", "start_symbol", "end_symbol", "with_hidden"} # Load parameters of the saving model if args.load_model: persistentManager = H5py(args.load_model) savedParameters = json.loads(persistentManager.getAttribute("parameters")) if savedParameters.get("charwnn_filters", None) != None: savedParameters["char_filters"] = savedParameters["charwnn_filters"] savedParameters.pop("charwnn_filters") print savedParameters log.info("Loading parameters of the model") args = args._replace(**savedParameters) log.info(str(args)) # Read the parameters lr = args.lr startSymbol = args.start_symbol endSymbol = args.end_symbol numEpochs = args.num_epochs shuffle = args.shuffle normalizeMethod = args.normalization.lower() if args.normalization is not None else None wordWindowSize = args.word_window_size hiddenLayerSize = args.hidden_size hiddenActFunctionName = args.hidden_activation_function embeddingSize = args.word_emb_size withCharWNN = args.with_charwnn charEmbeddingSize = args.char_emb_size charWindowSize = args.char_window_size startSymbolChar = "</s>" suffixEmbSize = args.suffix_emb_size capEmbSize = args.cap_emb_size useSuffixFeatures = args.suffix_size > 0 useCapFeatures = args.use_capitalization # Insert the character that will be used to fill the matrix # with a dimension lesser than chosen dimension.This enables that the convolution is performed by a matrix multiplication. artificialChar = "ART_CHAR" # TODO: the maximum number of characters of word is fixed in 20. 
    numMaxChar = 20

    if args.alg == "window_stn":
        isSentenceModel = True
    elif args.alg == "window_word":
        isSentenceModel = False
    else:
        raise Exception("The value of model_type isn't valid.")

    batchSize = -1 if isSentenceModel else args.batch_size
    wordFilters = []

    # Reading WNN filters
    log.info("Reading basic filters")
    wordFilters = getFilters(args.word_filters, log)

    # Reading charwnn filters
    log.info("Reading charwnn filters")
    charFilters = getFilters(args.char_filters, log)

    # Reading suffix filters
    log.info("Reading suffix filters")
    suffixFilters = getFilters(args.suffix_filters, log)

    # Reading capitalization filters
    log.info("Reading capitalization filters")
    capFilters = getFilters(args.cap_filters, log)

    ################################################
    # Create the lexicons and exit afterwards
    ################################################
    if args.create_only_lexicon:
        inputGenerators = []
        lexiconsToSave = []

        if args.word_lexicon and not os.path.exists(args.word_lexicon):
            wordLexicon = Lexicon("UUUNKKK", "labelLexicon")
            inputGenerators.append(
                WordWindowGenerator(wordWindowSize, wordLexicon, wordFilters, startSymbol, endSymbol))
            lexiconsToSave.append((wordLexicon, args.word_lexicon))

        if not os.path.exists(args.label_file):
            labelLexicon = Lexicon(None, "labelLexicon")
            outputGenerator = [LabelGenerator(labelLexicon)]
            lexiconsToSave.append((labelLexicon, args.label_file))
        else:
            outputGenerator = None

        if args.char_lexicon and not os.path.exists(args.char_lexicon):
            charLexicon = Lexicon("UUUNKKK", "charLexicon")
            charLexicon.put(startSymbolChar)
            charLexicon.put(artificialChar)
            inputGenerators.append(
                CharacterWindowGenerator(charLexicon, numMaxChar, charWindowSize, wordWindowSize,
                                         artificialChar, startSymbolChar, startPaddingWrd=startSymbol,
                                         endPaddingWrd=endSymbol, filters=charFilters))
            lexiconsToSave.append((charLexicon, args.char_lexicon))

        if args.suffix_lexicon and not os.path.exists(args.suffix_lexicon):
            suffixLexicon = Lexicon("UUUNKKK", "suffixLexicon")
            if args.suffix_size <= 0:
                raise Exception(
                    "Unable to generate the suffix lexicon because the suffix size is less than or equal to 0.")
            inputGenerators.append(
                SuffixFeatureGenerator(args.suffix_size, wordWindowSize, suffixLexicon, suffixFilters))
            lexiconsToSave.append((suffixLexicon, args.suffix_lexicon))

        if args.cap_lexicon and not os.path.exists(args.cap_lexicon):
            capLexicon = Lexicon("UUUNKKK", "capitalizationLexicon")
            inputGenerators.append(CapitalizationFeatureGenerator(wordWindowSize, capLexicon, capFilters))
            lexiconsToSave.append((capLexicon, args.cap_lexicon))

        if len(inputGenerators) == 0:
            inputGenerators = None

        if not (inputGenerators or outputGenerator):
            log.info("All lexicons have been generated.")
            return

        trainDatasetReader = TokenLabelReader(args.train, args.token_label_separator)
        trainReader = SyncBatchIterator(trainDatasetReader, inputGenerators, outputGenerator, batchSize,
                                        shuffle=shuffle)

        for lexicon, pathToSave in lexiconsToSave:
            lexicon.save(pathToSave)

        log.info("Lexicons were generated successfully!")
        return

    ################################################
    # Starting training
    ################################################
    if withCharWNN and (useSuffixFeatures or useCapFeatures):
        raise Exception("Hand-crafted features cannot be used together with charwnn.")

    # Read word lexicon and create word embeddings
    if args.load_model:
        wordLexicon = Lexicon.fromPersistentManager(persistentManager, "word_lexicon")
        vectors = EmbeddingLayer.getEmbeddingFromPersistenceManager(persistentManager, "word_embedding_layer")
        wordEmbedding = 
Embedding(wordLexicon, vectors) elif args.word_embedding: wordLexicon, wordEmbedding = Embedding.fromWord2Vec(args.word_embedding, "UUUNKKK", "word_lexicon") elif args.word_lexicon: wordLexicon = Lexicon.fromTextFile(args.word_lexicon, True, "word_lexicon") wordEmbedding = Embedding(wordLexicon, vectors=None, embeddingSize=embeddingSize) else: log.error("You need to set one of these parameters: load_model, word_embedding or word_lexicon") return # Read char lexicon and create char embeddings if withCharWNN: if args.load_model: charLexicon = Lexicon.fromPersistentManager(persistentManager, "char_lexicon") vectors = EmbeddingConvolutionalLayer.getEmbeddingFromPersistenceManager(persistentManager, "char_convolution_layer") charEmbedding = Embedding(charLexicon, vectors) elif args.char_lexicon: charLexicon = Lexicon.fromTextFile(args.char_lexicon, True, "char_lexicon") charEmbedding = Embedding(charLexicon, vectors=None, embeddingSize=charEmbeddingSize) else: log.error("You need to set one of these parameters: load_model or char_lexicon") return else: # Read suffix lexicon if suffix size is greater than 0 if useSuffixFeatures: if args.load_model: suffixLexicon = Lexicon.fromPersistentManager(persistentManager, "suffix_lexicon") vectors = EmbeddingConvolutionalLayer.getEmbeddingFromPersistenceManager(persistentManager, "suffix_embedding") suffixEmbedding = Embedding(suffixLexicon, vectors) elif args.suffix_lexicon: suffixLexicon = Lexicon.fromTextFile(args.suffix_lexicon, True, "suffix_lexicon") suffixEmbedding = Embedding(suffixLexicon, vectors=None, embeddingSize=suffixEmbSize) else: log.error("You need to set one of these parameters: load_model or suffix_lexicon") return # Read capitalization lexicon if useCapFeatures: if args.load_model: capLexicon = Lexicon.fromPersistentManager(persistentManager, "cap_lexicon") vectors = EmbeddingConvolutionalLayer.getEmbeddingFromPersistenceManager(persistentManager, "cap_embedding") capEmbedding = Embedding(capLexicon, vectors) elif args.cap_lexicon: capLexicon = Lexicon.fromTextFile(args.cap_lexicon, True, "cap_lexicon") capEmbedding = Embedding(capLexicon, vectors=None, embeddingSize=capEmbSize) else: log.error("You need to set one of these parameters: load_model or cap_lexicon") return # Read labels if args.load_model: labelLexicon = Lexicon.fromPersistentManager(persistentManager, "label_lexicon") elif args.label_file: labelLexicon = Lexicon.fromTextFile(args.label_file, False, lexiconName="label_lexicon") else: log.error("You need to set one of these parameters: load_model, word_embedding or word_lexicon") return # Normalize the word embedding if not normalizeMethod: pass elif normalizeMethod == "minmax": log.info("Normalization: minmax") wordEmbedding.minMaxNormalization() elif normalizeMethod == "mean": log.info("Normalization: mean normalization") wordEmbedding.meanNormalization() else: log.error("Unknown normalization method: %s" % normalizeMethod) sys.exit(1) if normalizeMethod is not None and args.load_model is not None: log.warn("The word embedding of model was normalized. 
This can change the result of test.") # Build neural network if isSentenceModel: raise NotImplementedError("Sentence model is not implemented!") else: wordWindow = T.lmatrix("word_window") inputModel = [wordWindow] wordEmbeddingLayer = EmbeddingLayer(wordWindow, wordEmbedding.getEmbeddingMatrix(), trainable=True, name="word_embedding_layer") flatten = FlattenLayer(wordEmbeddingLayer) if withCharWNN: # Use the convolution log.info("Using charwnn") convSize = args.conv_size if args.charwnn_with_act: charAct = tanh else: charAct = None charWindowIdxs = T.ltensor4(name="char_window_idx") inputModel.append(charWindowIdxs) charEmbeddingConvLayer = EmbeddingConvolutionalLayer(charWindowIdxs, charEmbedding.getEmbeddingMatrix(), numMaxChar, convSize, charWindowSize, charEmbeddingSize, charAct, name="char_convolution_layer") layerBeforeLinear = ConcatenateLayer([flatten, charEmbeddingConvLayer]) sizeLayerBeforeLinear = wordWindowSize * (wordEmbedding.getEmbeddingSize() + convSize) elif useSuffixFeatures or useCapFeatures: # Use hand-crafted features concatenateInputs = [flatten] nmFetauresByWord = wordEmbedding.getEmbeddingSize() if useSuffixFeatures: log.info("Using suffix features") suffixInput = T.lmatrix("suffix_input") suffixEmbLayer = EmbeddingLayer(suffixInput, suffixEmbedding.getEmbeddingMatrix(), name="suffix_embedding") suffixFlatten = FlattenLayer(suffixEmbLayer) concatenateInputs.append(suffixFlatten) nmFetauresByWord += suffixEmbedding.getEmbeddingSize() inputModel.append(suffixInput) if useCapFeatures: log.info("Using capitalization features") capInput = T.lmatrix("capitalization_input") capEmbLayer = EmbeddingLayer(capInput, capEmbedding.getEmbeddingMatrix(), name="cap_embedding") capFlatten = FlattenLayer(capEmbLayer) concatenateInputs.append(capFlatten) nmFetauresByWord += capEmbedding.getEmbeddingSize() inputModel.append(capInput) layerBeforeLinear = ConcatenateLayer(concatenateInputs) sizeLayerBeforeLinear = wordWindowSize * nmFetauresByWord else: # Use only the word embeddings layerBeforeLinear = flatten sizeLayerBeforeLinear = wordWindowSize * wordEmbedding.getEmbeddingSize() # The rest of the NN if args.with_hidden: hiddenActFunction = method_name(hiddenActFunctionName) weightInit = SigmoidGlorot() if hiddenActFunction == sigmoid else GlorotUniform() linear1 = LinearLayer(layerBeforeLinear, sizeLayerBeforeLinear, hiddenLayerSize, weightInitialization=weightInit, name="linear1") act1 = ActivationLayer(linear1, hiddenActFunction) layerBeforeSoftmax = act1 sizeLayerBeforeSoftmax = hiddenLayerSize log.info("Using hidden layer") else: layerBeforeSoftmax = layerBeforeLinear sizeLayerBeforeSoftmax = sizeLayerBeforeLinear log.info("Not using hidden layer") linear2 = LinearLayer(layerBeforeSoftmax, sizeLayerBeforeSoftmax, labelLexicon.getLen(), weightInitialization=ZeroWeightGenerator(), name="linear_softmax") act2 = ActivationLayer(linear2, softmax) prediction = ArgmaxPrediction(1).predict(act2.getOutput()) # Load the model if args.load_model: alreadyLoaded = set([wordEmbeddingLayer]) for o in (act2.getLayerSet() - alreadyLoaded): if o.getName(): persistentManager.load(o) # Set the input and output inputGenerators = [WordWindowGenerator(wordWindowSize, wordLexicon, wordFilters, startSymbol, endSymbol)] if withCharWNN: inputGenerators.append( CharacterWindowGenerator(charLexicon, numMaxChar, charWindowSize, wordWindowSize, artificialChar, startSymbolChar, startPaddingWrd=startSymbol, endPaddingWrd=endSymbol, filters=charFilters)) else: if useSuffixFeatures: inputGenerators.append( 
SuffixFeatureGenerator(args.suffix_size, wordWindowSize, suffixLexicon, suffixFilters)) if useCapFeatures: inputGenerators.append(CapitalizationFeatureGenerator(wordWindowSize, capLexicon, capFilters)) outputGenerator = LabelGenerator(labelLexicon) if args.train: log.info("Reading training examples") trainDatasetReader = TokenLabelReader(args.train, args.token_label_separator) trainReader = SyncBatchIterator(trainDatasetReader, inputGenerators, [outputGenerator], batchSize, shuffle=shuffle) # Get dev inputs and output dev = args.dev if dev: log.info("Reading development examples") devDatasetReader = TokenLabelReader(args.dev, args.token_label_separator) devReader = SyncBatchIterator(devDatasetReader, inputGenerators, [outputGenerator], sys.maxint, shuffle=False) else: devReader = None else: trainReader = None devReader = None y = T.lvector("y") if args.decay.lower() == "normal": decay = 0.0 elif args.decay.lower() == "divide_epoch": decay = 1.0 if args.adagrad: log.info("Using Adagrad") opt = Adagrad(lr=lr, decay=decay) else: log.info("Using SGD") opt = SGD(lr=lr, decay=decay) # Printing embedding information dictionarySize = wordEmbedding.getNumberOfVectors() log.info("Size of word dictionary and word embedding size: %d and %d" % (dictionarySize, embeddingSize)) if withCharWNN: log.info("Size of char dictionary and char embedding size: %d and %d" % ( charEmbedding.getNumberOfVectors(), charEmbedding.getEmbeddingSize())) if useSuffixFeatures: log.info("Size of suffix dictionary and suffix embedding size: %d and %d" % ( suffixEmbedding.getNumberOfVectors(), suffixEmbedding.getEmbeddingSize())) if useCapFeatures: log.info("Size of capitalization dictionary and capitalization embedding size: %d and %d" % ( capEmbedding.getNumberOfVectors(), capEmbedding.getEmbeddingSize())) # Compiling loss = NegativeLogLikelihood().calculateError(act2.getOutput(), prediction, y) if args.lambda_L2: _lambda = args.lambda_L2 log.info("Using L2 with lambda= %.2f", _lambda) loss += _lambda * (T.sum(T.square(linear1.getParameters()[0]))) trainMetrics = [ LossMetric("LossTrain", loss, True), AccuracyMetric("AccTrain", y, prediction), ] evalMetrics = [ LossMetric("LossDev", loss, True), AccuracyMetric("AccDev", y, prediction), ] testMetrics = [ LossMetric("LossTest", loss, True), AccuracyMetric("AccTest", y, prediction), ] wnnModel = BasicModel(inputModel, [y], act2.getLayerSet(), opt, prediction, loss, trainMetrics=trainMetrics, evalMetrics=evalMetrics, testMetrics=testMetrics, mode=None) # Training if trainReader: callback = [] if args.save_model: savePath = args.save_model objsToSave = list(act2.getLayerSet()) + [wordLexicon, labelLexicon] if withCharWNN: objsToSave.append(charLexicon) if useSuffixFeatures: objsToSave.append(suffixLexicon) if useCapFeatures: objsToSave.append(capLexicon) modelWriter = ModelWriter(savePath, objsToSave, args, parametersToSaveOrLoad) # Save the model with best acc in dev if args.save_by_acc: callback.append(SaveModelCallback(modelWriter, evalMetrics[1], "accuracy", True)) log.info("Training") wnnModel.train(trainReader, numEpochs, devReader, callbacks=callback) # Save the model at the end of training if args.save_model and not args.save_by_acc: modelWriter.save() # Testing if args.test: log.info("Reading test examples") testDatasetReader = TokenLabelReader(args.test, args.token_label_separator) testReader = SyncBatchIterator(testDatasetReader, inputGenerators, [outputGenerator], sys.maxint, shuffle=False) log.info("Testing") wnnModel.test(testReader) if args.print_prediction: f = 
codecs.open(args.print_prediction, "w", encoding="utf-8") for x, labels in testReader: inputs = x predictions = wnnModel.prediction(inputs) for prediction in predictions: f.write(labelLexicon.getLexicon(prediction)) f.write("\n")
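# NumPy sketch of the lambda_L2 penalty that mainWnn adds to the loss above: an L2 term
# on the weight matrix of the first linear layer. The matrix shape and lambda value are
# illustrative assumptions, not values taken from the scripts.
import numpy as np

def l2Penalty(W, lambdaL2):
    # lambda * sum(W^2), the term added to the negative log-likelihood.
    return lambdaL2 * np.sum(np.square(W))

W1 = np.random.randn(300, 100)   # e.g. (wordWindowSize * embeddingSize, hiddenLayerSize)
print(l2Penalty(W1, 0.01))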