Example #1
def main():
    full_path = os.path.realpath(__file__)
    path, filename = os.path.split(full_path)
    logging.config.fileConfig(os.path.join(path, 'logging.conf'), defaults={})
    log = logging.getLogger(__name__)

    if len(sys.argv) != 2:
        log.error("Missing argument: <JSON config file>")
        exit(1)

    argsDict = JsonArgParser(PARAMETERS).parse(sys.argv[1])
    args = dict2obj(argsDict, 'ShortDocArguments')
    logging.getLogger(__name__).info(argsDict)

    if args.seed:
        random.seed(args.seed)
        np.random.seed(args.seed)

    lr = args.lr
    startSymbol = args.start_symbol
    endSymbol = args.end_symbol
    numEpochs = args.num_epochs
    shuffle = args.shuffle
    normalizeMethod = args.normalization
    wordWindowSize = args.word_window_size
    hiddenLayerSize = args.hidden_size
    convSize = args.conv_size

    # Load classes for filters.
    filters = []
    for filterName in args.filters:
        moduleName, className = filterName.rsplit('.', 1)
        log.info("Filtro: " + moduleName + " " + className)

        module_ = importlib.import_module(moduleName)
        filters.append(getattr(module_, className)())

    W1 = None
    b1 = None
    W2 = None
    b2 = None

    wordEmbedding = None
    if args.word_embedding:
        log.info("Reading W2v File")
        (wordLexicon, wordEmbedding) = Embedding.fromWord2Vec(args.word_embedding, unknownSymbol="__UNKNOWN__")
        wordLexicon.stopAdd()
    elif args.word_lexicon and args.word_emb_size:
        wordLexicon = Lexicon.fromTextFile(args.word_lexicon, hasUnknowSymbol=False)
        wordEmbedding = Embedding(wordLexicon, embeddingSize=args.word_emb_size)
        wordLexicon.stopAdd()
    else:
        log.error("You must provide argument word_embedding or word_lexicon and word_emb_size")

    # Create the lexicon of labels.
    labelLexicon = None
    if args.labels is not None:
        if args.label_lexicon is not None:
            log.error("Only one of the parameters label_lexicon and labels can be provided!")
            exit(1)
        labelLexicon = Lexicon.fromList(args.labels, hasUnknowSymbol=False)
    elif args.label_lexicon is not None:
        labelLexicon = Lexicon.fromTextFile(args.label_lexicon, hasUnknowSymbol=False)
    else:
        log.error("One of the parameters label_lexicon or labels must be provided!")
        exit(1)

    #
    # Build the network model (Theano graph).
    #

    # TODO: debug
    # theano.config.compute_test_value = 'warn'
    # ex = trainIterator.next()
    # inWords.tag.test_value = ex[0][0]
    # outLabel.tag.test_value = ex[1][0]

    # Input matrix. Each row represents one token of the offer. Each token is
    # represented by a window of tokens (the central token and some nearby
    # tokens). Each value in this matrix is an index that represents a token
    # in the embedding.
    inWords = tensor.lmatrix("inWords")
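    # Illustrative example (hypothetical values): with wordWindowSize = 3 and a
    # 3-token offer [t1, t2, t3], the window generator would produce one row
    # per token, e.g. [<s>, t1, t2], [t1, t2, t3], [t2, t3, </s>], where <s>
    # and </s> are the start/end padding symbols and each entry is the token's
    # index in the word lexicon.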

    # Correct category of an offer.
    outLabel = tensor.lscalar("outLabel")

    # List of input tensors. One for each input layer.
    inputTensors = [inWords]

    # Whether the word embedding will be updated during training.
    embLayerTrainable = not args.fix_word_embedding

    if not embLayerTrainable:
        log.info("Not updating the word embedding!")

    # Lookup table for word features.
    embeddingLayer = EmbeddingLayer(inWords, wordEmbedding.getEmbeddingMatrix(), trainable=embLayerTrainable)

    # if not args.train and args.load_wordEmbedding:
    #     attrs = np.load(args.load_wordEmbedding)
    #     embeddingLayer.load(attrs)
    #     log.info("Loaded word embedding (shape %s) from file %s" % (
    #         str(attrs[0].shape), args.load_wordEmbedding))

    # The lookup table output has 3 dimensions (numTokens, szWindow, szEmbedding).
    # This layer flattens the last two dimensions, producing an output with
    # shape (numTokens, szWindow * szEmbedding).
    flattenInput = FlattenLayer(embeddingLayer)
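    # For instance (illustrative numbers): with a 5-token window and
    # 50-dimensional embeddings, the lookup output (numTokens, 5, 50) is
    # flattened to (numTokens, 250).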

    # Random weight initialization procedure.
    weightInit = GlorotUniform()

    # Convolution layer. Convolution over the offer text.
    convW = None
    convb = None

    if not args.train and args.load_conv:
        convNPY = np.load(args.load_conv)
        convW = convNPY[0]
        convb = convNPY[1]
        log.info("Loaded convolutional layer (shape %s) from file %s" % (str(convW.shape), args.load_conv))

    convLinear = LinearLayer(flattenInput,
                             wordWindowSize * wordEmbedding.getEmbeddingSize(),
                             convSize, W=convW, b=convb,
                             weightInitialization=weightInit)

    if args.conv_act:
        convOut = ActivationLayer(convLinear, tanh)
    else:
        convOut = convLinear

    # Max pooling layer.
    maxPooling = MaxPoolingLayer(convOut)
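    # Presumably the pooling takes the element-wise maximum over all token
    # windows, reducing the (numTokens, convSize) convolution output to a
    # single convSize-dimensional vector per offer (assumption about
    # MaxPoolingLayer's behavior).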

    # Hidden layer.
    if not args.train and args.load_hiddenLayer:
        hiddenNPY = np.load(args.load_hiddenLayer)
        W1 = hiddenNPY[0]
        b1 = hiddenNPY[1]
        log.info("Loaded hidden layer (shape %s) from file %s" % (str(W1.shape), args.load_hiddenLayer))

    hiddenLinear = LinearLayer(maxPooling,
                               convSize,
                               hiddenLayerSize,
                               W=W1, b=b1,
                               weightInitialization=weightInit)

    hiddenAct = ActivationLayer(hiddenLinear, tanh)

    # Linear input of the softmax layer.
    if not args.train and args.load_softmax:
        hiddenNPY = np.load(args.load_softmax)
        W2 = hiddenNPY[0]
        b2 = hiddenNPY[1]
        log.info("Loaded softmax layer (shape %s) from file %s" % (str(W2.shape), args.load_softmax))

    sotmaxLinearInput = LinearLayer(hiddenAct,
                                    hiddenLayerSize,
                                    labelLexicon.getLen(),
                                    W=W2, b=b2,
                                    weightInitialization=ZeroWeightGenerator())

    # Softmax.
    # softmaxAct = ReshapeLayer(ActivationLayer(sotmaxLinearInput, softmax), (1, -1))
    softmaxAct = ActivationLayer(sotmaxLinearInput, softmax)

    # Prediction layer (argmax).
    prediction = ArgmaxPrediction(None).predict(softmaxAct.getOutput())

    # Loss function.
    if args.label_weights is not None and len(args.label_weights) != labelLexicon.getLen():
        log.error("Number of label weights (%d) is different from number of labels (%d)!" % (
            len(args.label_weights), labelLexicon.getLen()))
        sys.exit(1)
    nlloe = NegativeLogLikelihoodOneExample(weights=args.label_weights)
    loss = nlloe.calculateError(softmaxAct.getOutput()[0], prediction, outLabel)
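    # Assuming the usual weighted negative log-likelihood for a single example
    # with gold label y: loss = -w_y * log(p_y), where p_y is the softmax
    # probability assigned to y and w_y its (optional) class weight.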

    # Input generators: word window.
    inputGenerators = [WordWindowGenerator(wordWindowSize, wordLexicon, filters, startSymbol, endSymbol)]

    # Output generator: generate one label per offer.
    outputGenerators = [TextLabelGenerator(labelLexicon)]
    # outputGenerators = [lambda label: labelLexicon.put(label)]

    evalPerIteration = None
    if args.train:
        trainDatasetReader = ShortDocReader(args.train)
        if args.load_method == "sync":
            log.info("Reading training examples...")
            trainIterator = SyncBatchIterator(trainDatasetReader,
                                              inputGenerators,
                                              outputGenerators,
                                              -1,
                                              shuffle=shuffle)
            wordLexicon.stopAdd()
        elif args.load_method == "async":
            log.info("Examples will be asynchronously loaded.")
            trainIterator = AsyncBatchIterator(trainDatasetReader,
                                               inputGenerators,
                                               outputGenerators,
                                               -1,
                                               shuffle=shuffle,
                                               maxqSize=1000)
        else:
            log.error("The argument 'load_method' has an invalid value: %s." % args.load_method)
            sys.exit(1)

        labelLexicon.stopAdd()

        # Get dev inputs and output
        dev = args.dev
        evalPerIteration = args.eval_per_iteration
        if not dev and evalPerIteration > 0:
            log.error("Argument eval_per_iteration cannot be used without a dev argument.")
            sys.exit(1)

        if dev:
            log.info("Reading development examples")
            devReader = ShortDocReader(args.dev)
            devIterator = SyncBatchIterator(devReader,
                                            inputGenerators,
                                            outputGenerators,
                                            -1,
                                            shuffle=False)
        else:
            devIterator = None
    else:
        trainIterator = None
        devIterator = None

    if normalizeMethod == "minmax":
        log.info("Normalization: minmax")
        wordEmbedding.minMaxNormalization()
    elif normalizeMethod == "mean":
        log.info("Normalization: mean normalization")
        wordEmbedding.meanNormalization()
    elif normalizeMethod == "zscore":
        log.info("Normalization: zscore normalization")
        wordEmbedding.zscoreNormalization()
    elif normalizeMethod:
        log.error("Normalization: unknown value %s" % normalizeMethod)
        sys.exit(1)

    # Learning rate decay.
    decay = None
    if args.decay == "none":
        decay = 0.0
    elif args.decay == "linear":
        decay = 1.0
    else:
        log.error("Unknown decay parameter %s." % args.decay)
        exit(1)

    # Learning algorithm.
    if args.alg == "adagrad":
        log.info("Using Adagrad")
        opt = Adagrad(lr=lr, decay=decay)
    elif args.alg == "sgd":
        log.info("Using SGD")
        opt = SGD(lr=lr, decay=decay)
    else:
        log.error("Unknown algorithm: %s." % args.alg)
        sys.exit(1)

    # TODO: debug
    # opt.lr.tag.test_value = 0.05

    # Printing embedding information.
    dictionarySize = wordEmbedding.getNumberOfVectors()
    embeddingSize = wordEmbedding.getEmbeddingSize()
    log.info("Dictionary size: %d" % dictionarySize)
    log.info("Embedding size: %d" % embeddingSize)
    log.info("Number of categories: %d" % labelLexicon.getLen())

    # Train metrics.
    trainMetrics = None
    if trainIterator:
        trainMetrics = [
            LossMetric("TrainLoss", loss),
            AccuracyMetric("TrainAccuracy", outLabel, prediction)
        ]

    # Evaluation metrics.
    evalMetrics = None
    if devIterator:
        evalMetrics = [
            LossMetric("EvalLoss", loss),
            AccuracyMetric("EvalAccuracy", outLabel, prediction),
            FMetric("EvalFMetric", outLabel, prediction, labels=labelLexicon.getLexiconDict().values())
        ]

    # Test metrics.
    testMetrics = None
    if args.test:
        testMetrics = [
            LossMetric("TestLoss", loss),
            AccuracyMetric("TestAccuracy", outLabel, prediction),
            FMetric("TestFMetric", outLabel, prediction, labels=labelLexicon.getLexiconDict().values())
        ]

    # TODO: debug
    # mode = theano.compile.debugmode.DebugMode(optimizer=None)
    mode = None
    model = BasicModel(x=inputTensors,
                       y=[outLabel],
                       allLayers=softmaxAct.getLayerSet(),
                       optimizer=opt,
                       prediction=prediction,
                       loss=loss,
                       trainMetrics=trainMetrics,
                       evalMetrics=evalMetrics,
                       testMetrics=testMetrics,
                       mode=mode)

    # Training
    if trainIterator:
        log.info("Training")
        model.train(trainIterator, numEpochs, devIterator, evalPerIteration=evalPerIteration)

        # Saving model after training
        if args.save_wordEmbedding:
            embeddingLayer.saveAsW2V(args.save_wordEmbedding, lexicon=wordLexicon)
            log.info("Saved word to vector to file: %s" % (args.save_wordEmbedding))
        if args.save_conv:
            convLinear.save(args.save_conv)
            log.info("Saved convolution layer to file: %s" % (args.save_conv))
        if args.save_hiddenLayer:
            hiddenLinear.save(args.save_hiddenLayer)
            log.info("Saved hidden layer to file: %s" % (args.save_hiddenLayer))
        if args.save_softmax:
            sotmaxLinearInput.save(args.save_softmax)
            log.info("Saved softmax to file: %s" % (args.save_softmax))

    # Testing
    if args.test:
        log.info("Reading test examples")
        testReader = ShortDocReader(args.test)
        testIterator = SyncBatchIterator(testReader,
                                         inputGenerators,
                                         outputGenerators,
                                         -1,
                                         shuffle=False)

        log.info("Testing")
        model.test(testIterator)
Example #2
def main(**kwargs):
    log = logging.getLogger(__name__)
    log.info(kwargs)

    if kwargs["seed"] != None:
        random.seed(kwargs["seed"])
        np.random.seed(kwargs["seed"])

    filters = []

    for filterName in kwargs["filters"]:
        moduleName, className = filterName.rsplit('.', 1)
        log.info("Usando o filtro: " + moduleName + " " + className)

        module_ = importlib.import_module(moduleName)
        filters.append(getattr(module_, className)())

    wordWindowSize = kwargs["word_window_size"]
    hiddenLayerSize = kwargs["hidden_size"]
    batchSize = kwargs["batch_size"]
    startSymbol = kwargs["start_symbol"]
    endSymbol = kwargs["end_symbol"]
    numEpochs = kwargs["num_epochs"]
    lr = kwargs["lr"]
    tagLexicon = createLexiconUsingFile(kwargs["label_file"])
    # _lambda = theano.shared(kwargs["lambda"], "lambda")
    _lambda = theano.shared(0.0, "lambda")
    useAdagrad = kwargs["adagrad"]
    shuffle = kwargs["shuffle"]
    supHiddenLayerSize = kwargs["hidden_size_supervised_part"]
    unsupHiddenLayerSize = kwargs["hidden_size_unsupervised_part"]
    normalization = kwargs["normalization"]
    activationHiddenExtractor = kwargs["activation_hidden_extractor"]

    withCharWNN = kwargs["with_charwnn"]
    convSize = kwargs["conv_size"]
    charEmbeddingSize = kwargs["char_emb_size"]
    charWindowSize = kwargs["char_window_size"]
    startSymbolChar = "</s>"

    if kwargs["charwnn_with_act"]:
        charAct = tanh
    else:
        charAct = None

    # TODO: the maximum number of characters per word is fixed at 20.
    numMaxChar = 20

    if kwargs["decay"].lower() == "normal":
        decay = 0.0
    elif kwargs["decay"].lower() == "divide_epoch":
        decay = 1.0

    # Add the lexicon of target
    domainLexicon = Lexicon()

    domainLexicon.put("0")
    domainLexicon.put("1")
    domainLexicon.stopAdd()

    log.info("Reading W2v File1")
    wordEmbedding = EmbeddingFactory().createFromW2V(kwargs["word_embedding"],
                                                     RandomUnknownStrategy())

    log.info("Reading training examples")
    # Generators
    inputGenerators = [
        WordWindowGenerator(wordWindowSize, wordEmbedding, filters,
                            startSymbol)
    ]
    outputGeneratorTag = LabelGenerator(tagLexicon)

    if withCharWNN:
        # Create the character embedding
        charEmbedding = EmbeddingFactory().createRandomEmbedding(
            charEmbeddingSize)

        # Insert the padding of the character window
        charEmbedding.put(startSymbolChar)

        # Insert the character used to pad matrices whose dimension is smaller
        # than the chosen dimension. This allows the convolution to be
        # performed as a matrix multiplication.
        artificialChar = "ART_CHAR"
        charEmbedding.put(artificialChar)

        inputGenerators.append(
            CharacterWindowGenerator(charEmbedding,
                                     numMaxChar,
                                     charWindowSize,
                                     wordWindowSize,
                                     artificialChar,
                                     startSymbolChar,
                                     startPaddingWrd=startSymbol,
                                     endPaddingWrd=endSymbol))

    unsupervisedLabelSource = ConstantLabel(domainLexicon, "0")

    # Reading supervised and unsupervised data sets.
    trainSupervisedDatasetReader = TokenLabelReader(
        kwargs["train_source"], kwargs["token_label_separator"])
    trainSupervisedBatch = SyncBatchIterator(
        trainSupervisedDatasetReader,
        inputGenerators, [outputGeneratorTag, unsupervisedLabelSource],
        batchSize[0],
        shuffle=shuffle)

    # Get Unsupervised Input
    unsupervisedLabelTarget = ConstantLabel(domainLexicon, "1")

    trainUnsupervisedDatasetReader = TokenReader(kwargs["train_target"])
    trainUnsupervisedDatasetBatch = SyncBatchIterator(
        trainUnsupervisedDatasetReader,
        inputGenerators, [unsupervisedLabelTarget],
        batchSize[1],
        shuffle=shuffle)

    # Stopping to add new words, labels and chars
    wordEmbedding.stopAdd()
    tagLexicon.stopAdd()
    domainLexicon.stopAdd()

    if withCharWNN:
        charEmbedding.stopAdd()

    # Printing embedding information
    dictionarySize = wordEmbedding.getNumberOfVectors()
    embeddingSize = wordEmbedding.getEmbeddingSize()
    log.info("Size of  word dictionary and word embedding size: %d and %d" %
             (dictionarySize, embeddingSize))
    log.info(
        "Size of  char dictionary and char embedding size: %d and %d" %
        (charEmbedding.getNumberOfVectors(), charEmbedding.getEmbeddingSize()))

    # Word Embedding Normalization
    if normalization == "zscore":
        wordEmbedding.zscoreNormalization()
    elif normalization == "minmax":
        wordEmbedding.minMaxNormalization()
    elif normalization == "mean":
        wordEmbedding.meanNormalization()
    elif normalization == "none" or not normalization:
        pass
    else:
        raise Exception("Unknown normalization method: %s" % normalization)

    # Source input
    wordWindowSource = T.lmatrix(name="windowSource")
    sourceInput = [wordWindowSource]

    # Create the layers related with the extractor of features
    embeddingLayerSrc = EmbeddingLayer(wordWindowSource,
                                       wordEmbedding.getEmbeddingMatrix(),
                                       trainable=True)
    flattenSrc = FlattenLayer(embeddingLayerSrc)

    if withCharWNN:
        log.info("Using charwnn")

        # Create the charwn
        charWindowIdxSrc = T.ltensor4(name="char_window_idx_source")
        sourceInput.append(charWindowIdxSrc)

        charEmbeddingConvLayerSrc = EmbeddingConvolutionalLayer(
            charWindowIdxSrc, charEmbedding.getEmbeddingMatrix(), numMaxChar,
            convSize, charWindowSize, charEmbeddingSize, charAct)
        layerBeforeLinearSrc = ConcatenateLayer(
            [flattenSrc, charEmbeddingConvLayerSrc])
        sizeLayerBeforeLinearSrc = wordWindowSize * (
            wordEmbedding.getEmbeddingSize() + convSize)
    else:
        layerBeforeLinearSrc = flattenSrc
        sizeLayerBeforeLinearSrc = wordWindowSize * wordEmbedding.getEmbeddingSize()

    if activationHiddenExtractor == "tanh":
        log.info("Using tanh in the hidden layer of extractor")

        linear1 = LinearLayer(layerBeforeLinearSrc,
                              sizeLayerBeforeLinearSrc,
                              hiddenLayerSize,
                              weightInitialization=GlorotUniform())
        act1 = ActivationLayer(linear1, tanh)
    elif activationHiddenExtractor == "sigmoid":
        log.info("Using sigmoid in the hidden layer of extractor")

        linear1 = LinearLayer(layerBeforeLinearSrc,
                              sizeLayerBeforeLinearSrc,
                              hiddenLayerSize,
                              weightInitialization=SigmoidGenerator())
        act1 = ActivationLayer(linear1, sigmoid)
    else:
        raise Exception("Unknown hidden activation function: %s" % activationHiddenExtractor)

    # Create the layers with the Tagger
    if supHiddenLayerSize == 0:
        layerBeforeSupSoftmax = act1
        layerSizeBeforeSupSoftmax = hiddenLayerSize
        log.info("It didn't insert the layer before the supervised softmax.")
    else:
        linear2 = LinearLayer(act1,
                              hiddenLayerSize,
                              supHiddenLayerSize,
                              weightInitialization=GlorotUniform())
        act2 = ActivationLayer(linear2, tanh)

        layerBeforeSupSoftmax = act2
        layerSizeBeforeSupSoftmax = supHiddenLayerSize

        log.info("It inserted the layer before the supervised softmax.")

    supervisedLinear = LinearLayer(layerBeforeSupSoftmax,
                                   layerSizeBeforeSupSoftmax,
                                   tagLexicon.getLen(),
                                   weightInitialization=ZeroWeightGenerator())
    supervisedSoftmax = ActivationLayer(supervisedLinear, softmax)

    # Create the layers with the domain classifier
    gradientReversalSource = GradientReversalLayer(act1, _lambda)
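    # In domain-adversarial training, a gradient reversal layer typically acts
    # as the identity in the forward pass and multiplies the gradient by
    # -lambda in the backward pass (assumption about this implementation).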

    if unsupHiddenLayerSize == 0:
        layerBeforeUnsupSoftmax = gradientReversalSource
        layerSizeBeforeUnsupSoftmax = hiddenLayerSize
        log.info("It didn't insert the layer before the unsupervised softmax.")
    else:
        unsupervisedSourceLinearBf = LinearLayer(
            gradientReversalSource,
            hiddenLayerSize,
            unsupHiddenLayerSize,
            weightInitialization=GlorotUniform())
        actUnsupervisedSourceBf = ActivationLayer(unsupervisedSourceLinearBf,
                                                  tanh)

        layerBeforeUnsupSoftmax = actUnsupervisedSourceBf
        layerSizeBeforeUnsupSoftmax = unsupHiddenLayerSize

        log.info("It inserted the layer before the unsupervised softmax.")

    unsupervisedSourceLinear = LinearLayer(
        layerBeforeUnsupSoftmax,
        layerSizeBeforeUnsupSoftmax,
        domainLexicon.getLen(),
        weightInitialization=ZeroWeightGenerator())
    unsupervisedSourceSoftmax = ActivationLayer(unsupervisedSourceLinear,
                                                softmax)

    ## Target Part
    windowTarget = T.lmatrix(name="windowTarget")
    targetInput = [windowTarget]

    # Create the layers related with the extractor of features
    embeddingLayerUnsuper1 = EmbeddingLayer(
        windowTarget, embeddingLayerSrc.getParameters()[0], trainable=True)
    flattenUnsuper1 = FlattenLayer(embeddingLayerUnsuper1)

    if withCharWNN:
        log.info("Using charwnn")

        # Create the charwn
        charWindowIdxTgt = T.ltensor4(name="char_window_idx_target")
        targetInput.append(charWindowIdxTgt)

        charEmbeddingConvLayerTgt = EmbeddingConvolutionalLayer(
            charWindowIdxTgt,
            charEmbeddingConvLayerSrc.getParameters()[0],
            numMaxChar,
            convSize,
            charWindowSize,
            charEmbeddingSize,
            charAct,
            trainable=True)
        layerBeforeLinearTgt = ConcatenateLayer(
            [flattenUnsuper1, charEmbeddingConvLayerTgt])
        sizeLayerBeforeLinearTgt = wordWindowSize * (
            wordEmbedding.getEmbeddingSize() + convSize)
    else:
        layerBeforeLinearTgt = flattenUnsuper1
        sizeLayerBeforeLinearTgt = wordWindowSize * wordEmbedding.getEmbeddingSize()

    w, b = linear1.getParameters()
    linearUnsuper1 = LinearLayer(layerBeforeLinearTgt,
                                 sizeLayerBeforeLinearTgt,
                                 hiddenLayerSize,
                                 W=w,
                                 b=b,
                                 trainable=True)

    if activationHiddenExtractor == "tanh":
        log.info("Using tanh in the hidden layer of extractor")
        actUnsupervised1 = ActivationLayer(linearUnsuper1, tanh)
    elif activationHiddenExtractor == "sigmoid":
        log.info("Using sigmoid in the hidden layer of extractor")
        actUnsupervised1 = ActivationLayer(linearUnsuper1, sigmoid)
    else:
        raise Exception("Unknown hidden activation function: %s" % activationHiddenExtractor)

    # Create the layers with the domain classifier
    grandientReversalTarget = GradientReversalLayer(actUnsupervised1, _lambda)

    if unsupHiddenLayerSize == 0:
        layerBeforeUnsupSoftmax = grandientReversalTarget
        layerSizeBeforeUnsupSoftmax = hiddenLayerSize
        log.info("It didn't insert the layer before the unsupervised softmax.")
    else:
        w, b = unsupervisedSourceLinearBf.getParameters()
        unsupervisedTargetLinearBf = LinearLayer(grandientReversalTarget,
                                                 hiddenLayerSize,
                                                 unsupHiddenLayerSize,
                                                 W=w,
                                                 b=b,
                                                 trainable=True)
        actUnsupervisedTargetLinearBf = ActivationLayer(
            unsupervisedTargetLinearBf, tanh)

        layerBeforeUnsupSoftmax = actUnsupervisedTargetLinearBf
        layerSizeBeforeUnsupSoftmax = unsupHiddenLayerSize

        log.info("It inserted the layer before the unsupervised softmax.")

    w, b = unsupervisedSourceLinear.getParameters()
    unsupervisedTargetLinear = LinearLayer(layerBeforeUnsupSoftmax,
                                           layerSizeBeforeUnsupSoftmax,
                                           domainLexicon.getLen(),
                                           W=w,
                                           b=b,
                                           trainable=True)
    unsupervisedTargetSoftmax = ActivationLayer(unsupervisedTargetLinear,
                                                softmax)

    # Set loss and prediction and retrieve all layers
    supervisedLabel = T.lvector("supervisedLabel")
    unsupervisedLabelSource = T.lvector("unsupervisedLabelSource")
    unsupervisedLabelTarget = T.lvector("unsupervisedLabelTarget")

    supervisedOutput = supervisedSoftmax.getOutput()
    supervisedPrediction = ArgmaxPrediction(1).predict(supervisedOutput)
    supervisedLoss = NegativeLogLikelihood().calculateError(
        supervisedOutput, supervisedPrediction, supervisedLabel)

    unsupervisedOutputSource = unsupervisedSourceSoftmax.getOutput()
    unsupervisedPredSource = ArgmaxPrediction(1).predict(
        unsupervisedOutputSource)
    unsupervisedLossSource = NegativeLogLikelihood().calculateError(
        unsupervisedOutputSource, None, unsupervisedLabelSource)

    unsupervisedOutputTarget = unsupervisedTargetSoftmax.getOutput()
    unsupervisedPredTarget = ArgmaxPrediction(1).predict(
        unsupervisedOutputTarget)
    unsupervisedLossTarget = NegativeLogLikelihood().calculateError(
        unsupervisedOutputTarget, None, unsupervisedLabelTarget)

    # Creates model

    if useAdagrad:
        log.info("Using ADAGRAD")
        opt = Adagrad(lr=lr, decay=decay)
    else:
        log.info("Using SGD")
        opt = SGD(lr=lr, decay=decay)

    allLayersSource = (supervisedSoftmax.getLayerSet()
                       | unsupervisedSourceSoftmax.getLayerSet())
    allLayersTarget = unsupervisedTargetSoftmax.getLayerSet()
    unsupervisedLossTarget *= float(
        trainSupervisedBatch.size()) / trainUnsupervisedDatasetBatch.size()
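    # Illustrative example (hypothetical sizes): with 1,000 supervised batches
    # and 4,000 unsupervised batches, the factor is 0.25, so both objectives
    # contribute on a comparable scale per epoch.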

    supervisedTrainMetrics = [
        LossMetric("TrainSupervisedLoss", supervisedLoss),
        AccuracyMetric("TrainSupervisedAcc", supervisedLabel,
                       supervisedPrediction),
        LossMetric("TrainUnsupervisedLoss", unsupervisedLossSource),
        AccuracyMetric("TrainUnsupervisedAccuracy", unsupervisedLabelSource,
                       unsupervisedPredSource)
    ]
    unsupervisedTrainMetrics = [
        LossMetric("TrainUnsupervisedLoss", unsupervisedLossTarget),
        AccuracyMetric("TrainUnsupervisedAccuracy", unsupervisedLabelTarget,
                       unsupervisedPredTarget)
    ]

    evalMetrics = [
        AccuracyMetric("EvalAcc", supervisedLabel, supervisedPrediction)
    ]

    testMetrics = [
        AccuracyMetric("TestAcc", supervisedLabel, supervisedPrediction)
    ]

    # TODO: I didn't have time to test the code after these modifications.
    model = GradientReversalModel(sourceInput,
                                  targetInput,
                                  supervisedLabel,
                                  unsupervisedLabelSource,
                                  unsupervisedLabelTarget,
                                  allLayersSource,
                                  allLayersTarget,
                                  opt,
                                  supervisedPrediction,
                                  supervisedLoss,
                                  unsupervisedLossSource,
                                  unsupervisedLossTarget,
                                  supervisedTrainMetrics,
                                  unsupervisedTrainMetrics,
                                  evalMetrics,
                                  testMetrics,
                                  mode=None)

    # Get dev inputs and output
    log.info("Reading development examples")
    devDatasetReader = TokenLabelReader(kwargs["dev"],
                                        kwargs["token_label_separator"])
    devReader = SyncBatchIterator(devDatasetReader,
                                  inputGenerators, [outputGeneratorTag],
                                  sys.maxint,
                                  shuffle=False)

    callbacks = []
    # log.info("Usando lambda fixo: " + str(_lambda.get_value()))
    log.info("Usando lambda variado. alpha=" + str(kwargs["alpha"]) +
             " height=" + str(kwargs["height"]))
    callbacks.append(
        ChangeLambda(_lambda, kwargs["alpha"], numEpochs, kwargs["height"]))
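    # A common schedule for lambda in domain-adversarial training is
    # lambda_p = height * (2 / (1 + exp(-alpha * p)) - 1), where p is the
    # training progress in [0, 1]; the exact formula used by ChangeLambda is
    # an assumption here.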

    if kwargs["additional_dev"]:
        callbacks.append(
            AdditionalDevDataset(model, kwargs["additional_dev"],
                                 kwargs["token_label_separator"],
                                 inputGenerators, outputGeneratorTag))

    # Training Model
    model.train([trainSupervisedBatch, trainUnsupervisedDatasetBatch],
                numEpochs,
                devReader,
                callbacks=callbacks)
Example #3
def main(args):
    log = logging.getLogger(__name__)

    if args.seed:
        random.seed(args.seed)
        np.random.seed(args.seed)

    lr = args.lr
    startSymbol = args.start_symbol
    endSymbol = args.end_symbol
    numEpochs = args.num_epochs
    shuffle = args.shuffle
    normalizeMethod = args.normalization
    wordWindowSize = args.word_window_size
    hiddenLayerSize = args.hidden_size
    convSize = args.conv_size

    # Load classes for filters.
    filters = []
    for filterName in args.filters:
        moduleName, className = filterName.rsplit('.', 1)
        log.info("Filtro: " + moduleName + " " + className)

        module_ = importlib.import_module(moduleName)
        filters.append(getattr(module_, className)())

    loadPath = args.load_model

    if loadPath:
        with codecs.open(loadPath + ".param", "r",
                         encoding="utf-8") as paramsFile:
            param = json.load(paramsFile, encoding="utf-8")

        hiddenActFunctionName = param['hiddenActFunction']
        hiddenActFunction = method_name(hiddenActFunctionName)

        # Loading Embedding
        log.info("Loading Model")
        wordEmbedding = EmbeddingFactory().createFromW2V(
            loadPath + ".wv", ChosenUnknownStrategy(param["unknown"]))
        labelLexicon = Lexicon()

        for l in param["labels"]:
            labelLexicon.put(l)

        labelLexicon.stopAdd()

        # Loading model
        labelWeights = np.load(loadPath + ".npy").item(0)

        W1 = labelWeights["W_Hidden"]
        b1 = labelWeights["b_Hidden"]
        W2 = labelWeights["W_Softmax"]
        b2 = labelWeights["b_Softmax"]

        hiddenLayerSize = b1.shape[0]
    else:
        W1 = None
        b1 = None
        W2 = None
        b2 = None
        hiddenActFunctionName = args.hidden_activation_function
        hiddenActFunction = method_name(hiddenActFunctionName)

        if args.word_embedding:
            log.info("Reading W2v File")
            wordEmbedding = EmbeddingFactory().createFromW2V(
                args.word_embedding, RandomUnknownStrategy())
            wordEmbedding.stopAdd()
        elif args.hash_lex_size:
            wordEmbedding = RandomEmbedding(args.word_emb_size,
                                            RandomUnknownStrategy(),
                                            HashLexicon(args.hash_lex_size))
        else:
            wordEmbedding = EmbeddingFactory().createRandomEmbedding(
                args.word_emb_size)

        # Get the inputs and output
        if args.labels:
            labelLexicon = createLexiconUsingFile(args.labels)
        else:
            labelLexicon = Lexicon()

        if args.load_hidden_layer:
            # Loading Hidden Layer
            log.info("Loading Hidden Layer")

            hl = np.load(args.load_hidden_layer).item(0)

            W1 = hl["W_Encoder"]
            b1 = hl["b_Encoder"]

            hiddenLayerSize = b1.shape[0]

    #
    # Build the network model (Theano graph).
    #

    # TODO: debug
    # theano.config.compute_test_value = 'warn'
    # ex = trainIterator.next()
    # inWords.tag.test_value = ex[0][0]
    # outLabel.tag.test_value = ex[1][0]

    # Input matrix. Each row represents one token of the offer. Each token is
    # represented by a window of tokens (the central token and some nearby
    # tokens). Each value in this matrix is an index that represents a token
    # in the embedding.
    inWords = T.lmatrix("inWords")

    # Correct category of an offer.
    outLabel = T.lscalar("outLabel")

    # List of input tensors. One for each input layer.
    inputTensors = [inWords]

    # Whether the word embedding will be updated during training.
    embLayerTrainable = not args.fix_word_embedding

    if not embLayerTrainable:
        log.info("Not updating the word embedding!")

    # Lookup table for word features.
    embeddingLayer = EmbeddingLayer(inWords,
                                    wordEmbedding.getEmbeddingMatrix(),
                                    trainable=embLayerTrainable)

    # The lookup table output has 3 dimensions (numTokens, szWindow, szEmbedding).
    # This layer flattens the last two dimensions, producing an output with
    # shape (numTokens, szWindow * szEmbedding).
    flattenInput = FlattenLayer(embeddingLayer)

    # Random weight initialization procedure.
    weightInit = SigmoidGlorot() if hiddenActFunction == sigmoid else GlorotUniform()

    # Convolution layer. Convolution over the offer text.
    convLinear = LinearLayer(flattenInput,
                             wordWindowSize * wordEmbedding.getEmbeddingSize(),
                             convSize,
                             W=None,
                             b=None,
                             weightInitialization=weightInit)

    # Max pooling layer.
    maxPooling = MaxPoolingLayer(convLinear)

    # List of input layers (will be concatenated).
    inputLayers = [maxPooling]

    # Generate word windows.
    wordWindowFeatureGenerator = WordWindowGenerator(wordWindowSize,
                                                     wordEmbedding, filters,
                                                     startSymbol, endSymbol)

    # List of input generators.
    inputGenerators = [
        lambda offer: wordWindowFeatureGenerator(offer["tokens"])
    ]

    concatenatedSize = convSize

    # Additional features.
    if args.categorical_features is not None:
        log.info("Using categorical features: %s" %
                 str([ftr[0] for ftr in args.categorical_features]))
        for ftr in args.categorical_features:
            concatenatedSize += ftr[2]
            ftrLexicon = createLexiconUsingFile(ftr[1])
            ftrEmbedding = RandomEmbedding(
                embeddingSize=ftr[2],
                unknownGenerateStrategy=RandomUnknownStrategy(),
                lexicon=ftrLexicon,
            )
            ftrInput = T.lscalar("in_" + ftr[0])
            ftrLayer = EmbeddingLayer(ftrInput,
                                      ftrEmbedding.getEmbeddingMatrix())

            # Bind loop vars as defaults (avoids the late-binding closure pitfall).
            inputGenerators.append(
                lambda offer, lex=ftrLexicon, name=ftr[0]:
                lex.put(offer[name].strip().lower()))
            inputTensors.append(ftrInput)
            inputLayers.append(ftrLayer)

    log.info("Input layers: %s" % str(inputLayers))

    # Concatenate all input layers when there is more than one input layer.
    concatenatedInLayers = (maxPooling if len(inputLayers) == 1 else
                            ConcatenateLayer(inputLayers, axis=0))

    if args.include_hidden_layer:
        # Hidden layer.
        hiddenLinear = LinearLayer(concatenatedInLayers,
                                   concatenatedSize,
                                   hiddenLayerSize,
                                   W=W1,
                                   b=b1,
                                   weightInitialization=weightInit)
        hiddenAct = ActivationLayer(hiddenLinear, hiddenActFunction)
    else:
        # Do not use a hidden layer.
        log.info("Not using hidden layer!")
        hiddenAct = concatenatedInLayers
        hiddenLayerSize = concatenatedSize

    # Linear input of the softmax layer.
    sotmaxLinearInput = LinearLayer(hiddenAct,
                                    hiddenLayerSize,
                                    labelLexicon.getLen(),
                                    W=W2,
                                    b=b2,
                                    weightInitialization=ZeroWeightGenerator())
    # Softmax.
    # softmaxAct = ReshapeLayer(ActivationLayer(sotmaxLinearInput, softmax), (1, -1))
    softmaxAct = ActivationLayer(sotmaxLinearInput, softmax)

    # Prediction layer (argmax).
    prediction = ArgmaxPrediction(None).predict(softmaxAct.getOutput())

    # Class weights.
    labelWeights = None
    if args.labels_probs:
        numLabels = labelLexicon.getLen()
        labelWeights = np.zeros(numLabels, dtype=theano.config.floatX)
        if args.labels_probs.startswith("@"):
            # Load the dictionary from a JSON file.
            with codecs.open(args.labels_probs[1:], mode="r",
                             encoding="utf8") as f:
                labelDistribution = json.load(f)
        else:
            # The argument value is already a JSON.
            labelDistribution = json.loads(args.labels_probs)

        for k, v in labelDistribution.items():
            # The weight of a class is inversely-proportional to its frequency.
            labelWeights[labelLexicon.getLexiconIndex(k)] = 1.0 / v

        if args.labels_weights_log:
            # Attenuate weights for highly unbalanced classes.
            labelWeights = np.log(labelWeights)

        log.info("Label weights: " + str(labelWeights))

    # Loss function.
    loss = NegativeLogLikelihoodOneExample(labelWeights).calculateError(
        softmaxAct.getOutput()[0], prediction, outLabel)

    # Output generator: generate one label per offer.
    outputGenerators = [TextLabelGenerator(labelLexicon)]

    if args.train:
        trainDatasetReader = OfertasReader(args.train)
        if args.load_method == "sync":
            log.info("Reading training examples...")
            trainIterator = SyncBatchIterator(trainDatasetReader,
                                              inputGenerators,
                                              outputGenerators,
                                              -1,
                                              shuffle=shuffle)
            wordEmbedding.stopAdd()
        elif args.load_method == "async":
            log.info("Examples will be asynchronously loaded.")
            trainIterator = AsyncBatchIterator(trainDatasetReader,
                                               inputGenerators,
                                               outputGenerators,
                                               -1,
                                               shuffle=shuffle,
                                               maxqSize=1000)
        else:
            log.error("The argument 'load_method' has an invalid value: %s." %
                      args.load_method)
            sys.exit(1)

        labelLexicon.stopAdd()

        # Get dev inputs and output
        dev = args.dev
        evalPerIteration = args.eval_per_iteration
        if not dev and evalPerIteration > 0:
            log.error(
                "Argument eval_per_iteration cannot be used without a dev argument."
            )
            sys.exit(1)

        if dev:
            log.info("Reading development examples")
            devReader = OfertasReader(args.dev)
            devIterator = SyncBatchIterator(devReader,
                                            inputGenerators,
                                            outputGenerators,
                                            -1,
                                            shuffle=False)
        else:
            devIterator = None
    else:
        trainIterator = None
        devIterator = None

    if normalizeMethod == "minmax":
        log.info("Normalization: minmax")
        wordEmbedding.minMaxNormalization()
    elif normalizeMethod == "mean":
        log.info("Normalization: mean normalization")
        wordEmbedding.meanNormalization()
    elif normalizeMethod == "zscore":
        log.info("Normalization: zscore normalization")
        wordEmbedding.zscoreNormalization()
    elif normalizeMethod:
        log.error("Normalization: unknown value %s" % normalizeMethod)
        sys.exit(1)

    if normalizeMethod is not None and loadPath is not None:
        log.warn(
            "The word embedding of the model was normalized. This can change the test results."
        )

    #     if kwargs["lambda"]:
    #         _lambda = kwargs["lambda"]
    #         log.info("Using L2 with lambda= %.2f", _lambda)
    #         loss += _lambda * (T.sum(T.square(hiddenLinear.getParameters()[0])))

    # Learning rate decay.
    decay = 0.0
    if args.decay == "linear":
        decay = 1.0

    # Learning algorithm.
    if args.alg == "adagrad":
        log.info("Using Adagrad")
        opt = Adagrad(lr=lr, decay=decay)
    elif args.alg == "sgd":
        log.info("Using SGD")
        opt = SGD(lr=lr, decay=decay)
    else:
        log.error(
            "Unknown algorithm: %s. Expected values are: adagrad or sgd." %
            args.alg)
        sys.exit(1)

    # TODO: debug
    # opt.lr.tag.test_value = 0.05

    # Printing embedding information.
    dictionarySize = wordEmbedding.getNumberOfVectors()
    embeddingSize = wordEmbedding.getEmbeddingSize()
    log.info("Dictionary size: %d" % dictionarySize)
    log.info("Embedding size: %d" % embeddingSize)
    log.info("Number of categories: %d" % labelLexicon.getLen())

    # Train metrics.
    trainMetrics = None
    if trainIterator:
        trainMetrics = [
            LossMetric("TrainLoss", loss),
            AccuracyMetric("TrainAccuracy", outLabel, prediction)
        ]

    # Evaluation metrics.
    evalMetrics = None
    if devIterator:
        evalMetrics = [
            LossMetric("EvalLoss", loss),
            AccuracyMetric("EvalAccuracy", outLabel, prediction),
            FMetric("EvalFMetric",
                    outLabel,
                    prediction,
                    labels=labelLexicon.getLexiconDict().values())
        ]

    # Test metrics.
    testMetrics = None
    if args.test:
        testMetrics = [
            LossMetric("TestLoss", loss),
            AccuracyMetric("TestAccuracy", outLabel, prediction),
            FMetric("TestFMetric",
                    outLabel,
                    prediction,
                    labels=labelLexicon.getLexiconDict().values())
        ]

        if args.test_probs:
            # Append predicted probabilities for the test set.
            testMetrics.append(
                PredictedProbabilities("TestProbs", softmaxAct.getOutput()))
    else:
        if args.test_probs:
            log.error(
                "The option test_probs requires a test dataset (option test).")
            sys.exit(1)

    # TODO: debug
    # mode = theano.compile.debugmode.DebugMode(optimizer=None)
    mode = None
    model = Model(x=inputTensors,
                  y=[outLabel],
                  allLayers=softmaxAct.getLayerSet(),
                  optimizer=opt,
                  prediction=prediction,
                  loss=loss,
                  trainMetrics=trainMetrics,
                  evalMetrics=evalMetrics,
                  testMetrics=testMetrics,
                  mode=mode)

    # Training
    if trainIterator:
        callback = []

        if args.save_model:
            savePath = args.save_model
            modelWriter = OfertasModelWritter(savePath, embeddingLayer,
                                              hiddenLinear, sotmaxLinearInput,
                                              wordEmbedding, labelLexicon,
                                              hiddenActFunctionName)
            callback.append(SaveModelCallback(modelWriter, "eval_acc", True))

        log.info("Training")
        model.train(trainIterator,
                    numEpochs,
                    devIterator,
                    evalPerIteration=evalPerIteration,
                    callbacks=callback)

    # Testing
    if args.test:
        log.info("Reading test examples")
        testReader = OfertasReader(args.test)
        testIterator = SyncBatchIterator(testReader,
                                         inputGenerators,
                                         outputGenerators,
                                         -1,
                                         shuffle=False)

        log.info("Testing")
        model.test(testIterator)
Example #4
def trainNetwork(args, log, trainIterator, devIterator, wordEmbedding, charEmbedding, borrow, labelLexicon):
    # Build neural network.
    wordWindow = T.lmatrix("word_window")
    inputModel = [wordWindow]

    wordEmbeddingLayer = EmbeddingLayer(wordWindow, wordEmbedding.getEmbeddingMatrix(), borrow=borrow,
                                        structGrad=args.struct_grad, trainable=True, name="word_embedding_layer")
    flatWordEmbedding = FlattenLayer(wordEmbeddingLayer)

    charWindowIdxs = T.ltensor4(name="char_window_idx")
    inputModel.append(charWindowIdxs)

    # # TODO: debug
    # theano.config.compute_test_value = 'warn'
    # ex = trainIterator.next()
    # inWords.tag.test_value = ex[0][0]
    # outLabel.tag.test_value = ex[1][0]

    charEmbeddingConvLayer = EmbeddingConvolutionalLayer(charWindowIdxs, charEmbedding.getEmbeddingMatrix(), 20,
                                                         args.conv_size, args.char_window_size, args.char_emb_size,
                                                         tanh, structGrad=args.char_struct_grad,
                                                         name="char_convolution_layer", borrow=borrow)

    layerBeforeLinear = ConcatenateLayer([flatWordEmbedding, charEmbeddingConvLayer])
    sizeLayerBeforeLinear = args.word_window_size * (wordEmbedding.getEmbeddingSize() + args.conv_size)

    hiddenActFunction = method_name(args.hidden_activation_function)
    weightInit = SigmoidGlorot() if hiddenActFunction == sigmoid else GlorotUniform()
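    # Glorot (Xavier) uniform draws W ~ U[-l, l] with l = sqrt(6 / (fanIn + fanOut));
    # the sigmoid variant commonly scales this limit by 4 (assumed behavior of
    # SigmoidGlorot here).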

    linearHidden = LinearLayer(layerBeforeLinear, sizeLayerBeforeLinear, args.hidden_size,
                               weightInitialization=weightInit, name="linear1")
    actHidden = ActivationLayer(linearHidden, hiddenActFunction)

    linearSoftmax = LinearLayer(actHidden, args.hidden_size, labelLexicon.getLen(),
                                weightInitialization=ZeroWeightGenerator(), name="linear_softmax")
    actSoftmax = ActivationLayer(linearSoftmax, softmax)
    prediction = ArgmaxPrediction(1).predict(actSoftmax.getOutput())

    # Output symbolic tensor variable.
    y = T.lvector("y")

    if args.decay.lower() == "normal":
        decay = 0.0
    elif args.decay.lower() == "divide_epoch":
        decay = 1.0
    else:
        log.error("Unknown decay argument: %s" % args.decay)
        sys.exit(1)

    if args.adagrad:
        log.info("Training algorithm: Adagrad")
        opt = Adagrad(lr=args.lr, decay=decay)
    else:
        log.info("Training algorithm: SGD")
        opt = SGD(lr=args.lr, decay=decay)

    # Training loss function.
    loss = NegativeLogLikelihood().calculateError(actSoftmax.getOutput(), prediction, y)

    # L2 regularization.
    if args.l2:
        loss += args.l2 * (T.sum(T.square(linearHidden.getParameters()[0])))

    # # TODO: debug
    # opt.lr.tag.test_value = 0.02

    # Metrics.
    trainMetrics = [
        LossMetric("LossTrain", loss, True),
        AccuracyMetric("AccTrain", y, prediction)
    ]

    evalMetrics = None
    if args.dev:
        evalMetrics = [
            LossMetric("LossDev", loss, True),
            AccuracyMetric("AccDev", y, prediction),
            CustomMetric("CustomMetricDev", y, prediction)
        ]

    testMetrics = None
    if args.test:
        testMetrics = [
            CustomMetric("CustomMetricTest", y, prediction)
        ]

    log.info("Compiling the network...")
    # # TODO: debug
    # mode = theano.compile.debugmode.DebugMode(optimizer=None)
    mode = None
    wnnModel = BasicModel(inputModel, [y], actSoftmax.getLayerSet(), opt, prediction, loss, trainMetrics=trainMetrics,
                          evalMetrics=evalMetrics, testMetrics=testMetrics, mode=mode)

    log.info("Training...")
    wnnModel.train(trainIterator, args.num_epochs, devIterator)
Example #5
    def __init__(self,
                 input,
                 charEmbedding,
                 numMaxCh,
                 convSize,
                 charWindowSize,
                 charEmbSize,
                 charAct=tanh,
                 structGrad=True,
                 trainable=True,
                 name=None,
                 borrow=True):
        """

        :param input: a layer or theano variable

        :param charEmbedding: numpy.array or python list that contains the character vectors

        :param numMaxCh: the number of characters used to represent a word.
            If the word is longer than this parameter, only its last numMaxCh characters are used.
            If the word is shorter, (numMaxCh - word_size) padding characters are appended
                at the end of the word.

        :param convSize: number of convolution filters

        :param charWindowSize: the size of the character window

        :param charEmbSize:  the size of the character embedding

        :param charAct: the activation function. If this parameter is None, no activation function is applied.

        :param structGrad: whether to use structured gradients or not.
            When using small batches (online gradient descent, in the limit),
            the structured gradient is much more efficient because a small
            fraction of word vectors are used on each iteration.
            However, when using large batches (ordinary gradient descent, in the
            limit), ordinary gradients and updates are more efficient because
            most (or all of the) word vectors are used on each iteration.

        :param trainable: whether the layer is trainable

        :param name: unique name of the layer. This is used to save the attributes of this object.

        :param borrow: whether the shared variables of this layer are created with the borrow setting.
        """

        # Input variable for this layer. Its shape is (numExs, szWrdWin, numMaxCh, szChWin)
        # where numExs is the number of examples in the training batch,
        #       szWrdWin is the size of the word window,
        #       numMaxCh is the number of characters used to represent words, and
        #       szChWin is the size of the character window.
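        # For example (hypothetical sizes): a batch of 32 word windows of size
        # 5, with 20 characters per word and a character window of 3, gives an
        # input of shape (32, 5, 20, 3).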
        self.input = input
        super(EmbeddingConvolutionalLayer,
              self).__init__(self.input, trainable, name)

        self.__output = None
        self.__charWindowSize = charWindowSize
        self.__convSize = convSize

        # This is the fixed size of all words.
        self.maxLenWord = numMaxCh

        # Activation function  of hidden layer
        self.charAct = charAct
        self.__structGrad = structGrad

        # We use the symbolic shape of the input to perform all dimension
        # transformations (reshape) necessary for the computation of this layer.
        shape = T.shape(self.input)
        numExs = shape[0]
        szWrdWin = shape[1]
        numMaxCh = shape[2]
        szChWin = shape[3]

        # Character embedding layer.
        self.__embedLayer = EmbeddingLayer(self.input.flatten(2),
                                           charEmbedding,
                                           borrow=borrow,
                                           structGrad=structGrad,
                                           trainable=self.isTrainable())

        # Choose, based on the activation function, how the weights of the linear layer will be initialized.
        if charAct is tanh:
            weightInitialization = GlorotUniform()
        elif charAct is sigmoid:
            weightInitialization = SigmoidGenerator()
        elif charAct is None:
            # No activation function: fall back to Glorot initialization.
            weightInitialization = GlorotUniform()
        else:
            raise Exception("Activation function is not supported")

        # This is the bank of filters. It is an ordinary hidden layer.
        hidInput = ReshapeLayer(
            self.__embedLayer,
            (numExs * szWrdWin * numMaxCh, szChWin * charEmbSize))

        self.__linearLayer = LinearLayer(
            hidInput,
            charWindowSize * charEmbSize,
            self.__convSize,
            weightInitialization=weightInitialization,
            trainable=self.isTrainable())

        if charAct:
            self.actLayer = ActivationLayer(self.__linearLayer, self.charAct)
            layerBeforePolling = self.actLayer
        else:
            layerBeforePolling = self.__linearLayer

        # 3-D tensor with shape (numExs * szWrdWin, numMaxCh, convSize).
        # This tensor is used to perform the max pooling along its 2nd dimension.
        o = ReshapeLayer(layerBeforePolling,
                         (numExs * szWrdWin, numMaxCh, convSize))

        # Max pooling layer. Perform a max op along the character dimension.
        # The shape of the output is equal to (numExs*szWrdWin, convSize).
        m = T.max(o.getOutput(), axis=1)

        # The output is a 2-D tensor with shape (numExs, szWrdWin * convSize).
        self.__output = m.reshape((numExs, szWrdWin * convSize))
Example #6
def main():
    full_path = os.path.realpath(__file__)
    path, filename = os.path.split(full_path)
    logging.config.fileConfig(os.path.join(path, 'logging.conf'), defaults={})
    log = logging.getLogger(__name__)

    if len(sys.argv) != 3:
        log.error("Missing argument: <JSON config file> or/and <Input file>")
        exit(1)

    argsDict = JsonArgParser(PARAMETERS).parse(sys.argv[1])
    args = dict2obj(argsDict, 'ShortDocArguments')
    logging.getLogger(__name__).info(argsDict)

    if args.seed:
        random.seed(args.seed)
        np.random.seed(args.seed)

    lr = args.lr
    startSymbol = args.start_symbol
    endSymbol = args.end_symbol
    numEpochs = args.num_epochs
    shuffle = args.shuffle
    normalizeMethod = args.normalization
    wordWindowSize = args.word_window_size
    hiddenLayerSize = args.hidden_size
    convSize = args.conv_size

    # Load classes for filters.
    filters = []
    for filterName in args.filters:
        moduleName, className = filterName.rsplit('.', 1)
        log.info("Filtro: " + moduleName + " " + className)

        module_ = importlib.import_module(moduleName)
        filters.append(getattr(module_, className)())

    W1 = None
    b1 = None
    W2 = None
    b2 = None

    wordEmbedding = None
    if args.word_embedding:
        log.info("Reading W2v File")
        (wordLexicon,
         wordEmbedding) = Embedding.fromWord2Vec(args.word_embedding,
                                                 unknownSymbol="__UNKNOWN__")
        wordLexicon.stopAdd()
    elif args.word_lexicon and args.word_emb_size:
        wordLexicon = Lexicon.fromTextFile(args.word_lexicon,
                                           hasUnknowSymbol=False)
        wordEmbedding = Embedding(wordLexicon,
                                  embeddingSize=args.word_emb_size)
        wordLexicon.stopAdd()
    else:
        log.error(
            "You must provide argument word_embedding or word_lexicon and word_emb_size"
        )

    # Create the lexicon of labels.
    labelLexicon = None
    if args.labels is not None:
        if args.label_lexicon is not None:
            log.error(
                "Only one of the parameters label_lexicon and labels can be provided!"
            )
            exit(1)
        labelLexicon = Lexicon.fromList(args.labels, hasUnknowSymbol=False)
    elif args.label_lexicon is not None:
        labelLexicon = Lexicon.fromTextFile(args.label_lexicon,
                                            hasUnknowSymbol=False)
    else:
        log.error(
            "One of the parameters label_lexicon or labels must be provided!")
        exit(1)

    #
    # Build the network model (Theano graph).
    #

    # TODO: debug
    # theano.config.compute_test_value = 'warn'
    # ex = trainIterator.next()
    # inWords.tag.test_value = ex[0][0]
    # outLabel.tag.test_value = ex[1][0]

    # Input matrix. Each row represents a token of the offer. Each token is
    # represented by a window of tokens (the central token and a few neighboring
    # tokens). Each value of this matrix is an index that represents a token in
    # the embedding.
    inWords = tensor.lmatrix("inWords")

    # Correct category of an offer.
    outLabel = tensor.lscalar("outLabel")

    # List of input tensors. One for each input layer.
    inputTensors = [inWords]

    # Whether the word embedding will be updated during training.
    embLayerTrainable = not args.fix_word_embedding

    if not embLayerTrainable:
        log.info("Not updating the word embedding!")

    # Lookup table for word features.
    embeddingLayer = EmbeddingLayer(inWords,
                                    wordEmbedding.getEmbeddingMatrix(),
                                    trainable=embLayerTrainable)

    # if not args.train and args.load_wordEmbedding:
    #     attrs = np.load(args.load_wordEmbedding)
    #     embeddingLayer.load(attrs)
    #     log.info("Loaded word embedding (shape %s) from file %s" % (
    #         str(attrs[0].shape), args.load_wordEmbedding))

    # The output of the lookup table has 3 dimensions (numTokens, szWindow, szEmbedding).
    # This layer flattens the last two dimensions, producing an output with
    # shape (numTokens, szWindow * szEmbedding).
    flattenInput = FlattenLayer(embeddingLayer)

    # Random weight initialization procedure.
    weightInit = GlorotUniform()

    # Convolution layer. Convolution over the text of an offer.
    convW = None
    convb = None

    if not args.train and args.load_conv:
        convNPY = np.load(args.load_conv)
        convW = convNPY[0]
        convb = convNPY[1]
        log.info("Loaded convolutional layer (shape %s) from file %s" %
                 (str(convW.shape), args.load_conv))

    convLinear = LinearLayer(flattenInput,
                             wordWindowSize * wordEmbedding.getEmbeddingSize(),
                             convSize,
                             W=convW,
                             b=convb,
                             weightInitialization=weightInit)

    # Max pooling layer.
    maxPooling = MaxPoolingLayer(convLinear)

    # Hidden layer.
    if not args.train and args.load_hiddenLayer:
        hiddenNPY = np.load(args.load_hiddenLayer)
        W1 = hiddenNPY[0]
        b1 = hiddenNPY[1]
        log.info("Loaded hidden layer (shape %s) from file %s" %
                 (str(W1.shape), args.load_hiddenLayer))

    hiddenLinear = LinearLayer(maxPooling,
                               convSize,
                               hiddenLayerSize,
                               W=W1,
                               b=b1,
                               weightInitialization=weightInit)

    hiddenAct = ActivationLayer(hiddenLinear, tanh)

    # Linear input of the softmax layer.
    if not args.train and args.load_softmax:
        hiddenNPY = np.load(args.load_softmax)
        W2 = hiddenNPY[0]
        b2 = hiddenNPY[1]
        log.info("Loaded softmax layer (shape %s) from file %s" %
                 (str(W2.shape), args.load_softmax))

    softmaxLinearInput = LinearLayer(hiddenAct,
                                     hiddenLayerSize,
                                     labelLexicon.getLen(),
                                     W=W2,
                                     b=b2,
                                     weightInitialization=ZeroWeightGenerator())

    # Softmax.
    # softmaxAct = ReshapeLayer(ActivationLayer(softmaxLinearInput, softmax), (1, -1))
    softmaxAct = ActivationLayer(softmaxLinearInput, softmax)

    # Prediction layer (argmax).
    prediction = ArgmaxPrediction(None).predict(softmaxAct.getOutput())

    # Loss function.
    if args.label_weights is not None and len(
            args.label_weights) != labelLexicon.getLen():
        log.error(
            "Number of label weights (%d) is different from number of labels (%d)!"
            % (len(args.label_weights), labelLexicon.getLen()))
    nlloe = NegativeLogLikelihoodOneExample(weights=args.label_weights)
    loss = nlloe.calculateError(softmaxAct.getOutput()[0], prediction,
                                outLabel)

    # Input generators: word window.
    inputGenerators = [
        WordWindowGenerator(wordWindowSize, wordLexicon, filters, startSymbol,
                            endSymbol)
    ]

    # Output generator: generate one label per offer.
    outputGenerators = [TextLabelGenerator(labelLexicon)]
    # outputGenerators = [lambda label: labelLexicon.put(label)]

    evalPerIteration = None

    if normalizeMethod == "minmax":
        log.info("Normalization: minmax")
        wordEmbedding.minMaxNormalization()
    elif normalizeMethod == "mean":
        log.info("Normalization: mean normalization")
        wordEmbedding.meanNormalization()
    elif normalizeMethod == "zscore":
        log.info("Normalization: zscore normalization")
        wordEmbedding.zscoreNormalization()
    elif normalizeMethod:
        log.error("Normalization: unknown value %s" % normalizeMethod)
        sys.exit(1)

    # Learning rate decay.
    decay = None
    if args.decay == "none":
        decay = 0.0
    elif args.decay == "linear":
        decay = 1.0
    else:
        log.error("Unknown decay parameter %s." % args.decay)
        exit(1)

    # Learning algorithm.
    if args.alg == "adagrad":
        log.info("Using Adagrad")
        opt = Adagrad(lr=lr, decay=decay)
    elif args.alg == "sgd":
        log.info("Using SGD")
        opt = SGD(lr=lr, decay=decay)
    else:
        log.error("Unknown algorithm: %s." % args.alg)
        sys.exit(1)

    # TODO: debug
    # opt.lr.tag.test_value = 0.05

    # Printing embedding information.
    dictionarySize = wordEmbedding.getNumberOfVectors()
    embeddingSize = wordEmbedding.getEmbeddingSize()
    log.info("Dictionary size: %d" % dictionarySize)
    log.info("Embedding size: %d" % embeddingSize)
    log.info("Number of categories: %d" % labelLexicon.getLen())

    # TODO: debug
    # mode = theano.compile.debugmode.DebugMode(optimizer=None)
    mode = None
    model = BasicModel(x=inputTensors,
                       y=[outLabel],
                       allLayers=softmaxAct.getLayerSet(),
                       optimizer=opt,
                       prediction=prediction,
                       loss=loss,
                       mode=mode)

    wordWindow = WordWindowGenerator(wordWindowSize, wordLexicon, filters,
                                     startSymbol, endSymbol)

    # GETS HIDDEN LAYER:
    # graph = EmbeddingGraph([inWords], [hiddenAct.getOutput()], wordWindow)

    # GRAPH FOR PREDICTION LAYER
    graph = EmbeddingGraph(inputTensors, prediction, wordWindow, mode)

    # Text labels for the two classes ("Sim"/"Nao" = yes/no).
    lblTxt = ["Sim", "Nao"]

    tweets = []
    with open(sys.argv[2]) as inputFile:
        content = inputFile.readlines()
    for line in content:
        tweets.append(line.decode('utf-8').encode('utf-8'))
    #print tweets
    # graph.getResultsFor(t) returns the prediction for a given tweet t.
    try:
        output_file = open("Output.txt", "w")
    except IOError:
        print "Falha em criar o arquivo de saida\n"
        sys.exit(1)
    try:
        for t in tweets:
            output_file.write(
                t.replace('\n', '').replace('\t', '') + "\t " +
                lblTxt[graph.getResultsFor(t)] + "\n")
        print "Resultados gerados com sucesso!\n"
    except Exception:
        print "Erro na geração de resultados\n"
    finally:
        output_file.close()
Example 7
class EmbeddingConvolutionalLayer(Layer):
    """
    Convolutional layer of embedding features.
    
    The input of this layer is a 4-D tensor whose shape is:
        (numExs, szWrdWin, numMaxCh, szChWin)
    where
        numExs is the number of examples in a training (mini) batch,
        szWrdWin is the size of the word window
            (the convolution is independently performed for each index in this dimension),
        numMaxCh is the number of characters used to represent words
            (the convolution is performed over this dimension),
        szChWin is the size of the character window
            (each input for the convolution filters is composed by this number of features).
    
    The value numMaxCh, the number of characters used to represent a word,
    is fixed for all words to speed up training. Words that are shorter
    than this value are padded with an artificial character, and for words
    that are longer than this value only the last numMaxCh characters are
    used.
    
    Thus, this layer is not really convolutional (with variable-sized inputs),
    but it is sufficient for many applications and is much faster than an
    ordinary convolutional layer.
    """
    def __init__(self,
                 input,
                 charEmbedding,
                 numMaxCh,
                 convSize,
                 charWindowSize,
                 charEmbSize,
                 charAct=tanh,
                 structGrad=True,
                 trainable=True,
                 name=None,
                 borrow=True):
        """

        :param input: a layer or theano variable

        :param charEmbedding: numpy.array or python list that contains the character vectors

        :param numMaxCh: the number of characters used to represent a word.
            If the word is longer than this parameter, only its last numMaxCh characters are used.
            If the word is shorter than this parameter, (numMaxCh - word_size) artificial characters are
                appended to the end of the word.

        :param convSize: number of convolution filters

        :param charWindowSize: the size of the character window

        :param charEmbSize:  the size of the character embedding

        :param charAct: the activation function. If this parameter is None, no activation function is used.

        :param structGrad: whether to use structured gradients or not.
            When using small batches (online gradient descent, in the limit),
            the structured gradient is much more efficient because a small
            fraction of word vectors are used on each iteration.
            However, when using large batches (ordinary gradient descent, in the
            limit), ordinary gradients and updates are more efficient because
            most (or all of the) word vectors are used on each iteration.

        :param trainable: whether the layer is trainable or not

        :param name: unique name of the layer. This is used to save the attributes of this object.

        :param borrow: whether the shared variables of this layer are created with the borrow flag.
        """

        # Input variable for this layer. Its shape is (numExs, szWrdWin, numMaxCh, szChWin)
        # where numExs is the number of examples in the training batch,
        #       szWrdWin is the size of the word window,
        #       numMaxCh is the number of characters used to represent words, and
        #       szChWin is the size of the character window.
        self.input = input
        super(EmbeddingConvolutionalLayer,
              self).__init__(self.input, trainable, name)

        self.__output = None
        self.__charWindowSize = charWindowSize
        self.__convSize = convSize

        # This is the fixed size of all words.
        self.maxLenWord = numMaxCh

        # Activation function of the hidden layer.
        self.charAct = charAct
        self.__structGrad = structGrad

        # We use the symbolic shape of the input to perform all dimension
        # transformations (reshape) necessary for the computation of this layer.
        shape = T.shape(self.input)
        numExs = shape[0]
        szWrdWin = shape[1]
        numMaxCh = shape[2]
        szChWin = shape[3]

        # Character embedding layer.
        self.__embedLayer = EmbeddingLayer(self.input.flatten(2),
                                           charEmbedding,
                                           borrow=borrow,
                                           structGrad=structGrad,
                                           trainable=self.isTrainable())

        # Choose, based on the activation function, how the weights of the linear layer will be initialized.
        if charAct is tanh:
            weightInitialization = GlorotUniform()
        elif charAct is sigmoid:
            weightInitialization = SigmoidGenerator()
        elif charAct is None:
            # No activation: fall back to Glorot uniform so the linear layer
            # below still receives a defined weight initialization.
            weightInitialization = GlorotUniform()
        else:
            raise Exception("Activation function is not supported")

        # This is the bank of filters. It is an ordinary hidden layer.
        hidInput = ReshapeLayer(
            self.__embedLayer,
            (numExs * szWrdWin * numMaxCh, szChWin * charEmbSize))

        self.__linearLayer = LinearLayer(
            hidInput,
            charWindowSize * charEmbSize,
            self.__convSize,
            weightInitialization=weightInitialization,
            trainable=self.isTrainable())

        if charAct:
            self.actLayer = ActivationLayer(self.__linearLayer, self.charAct)
            layerBeforePooling = self.actLayer
        else:
            layerBeforePooling = self.__linearLayer

        # 3-D tensor with shape (numExs * szWrdWin, numMaxCh, convSize).
        # This tensor is used to perform the max pooling along its 2nd dimension.
        o = ReshapeLayer(layerBeforePooling,
                         (numExs * szWrdWin, numMaxCh, convSize))

        # Max pooling layer. Perform a max op along the character dimension.
        # The shape of the output is equal to (numExs*szWrdWin, convSize).
        m = T.max(o.getOutput(), axis=1)

        # The output is a 2-D tensor with shape (numExs, szWrdWin * convSize).
        self.__output = m.reshape((numExs, szWrdWin * convSize))

    def updateAllCharIndexes(self, charIdxWord):
        for Idx in xrange(len(charIdxWord)):
            indexes = []

            for charIdx in xrange(len(charIdxWord[Idx])):
                indexes.append(self.getCharIndexes(charIdx, charIdxWord[Idx]))

            self.AllCharWindowIndexes.append(indexes)

    def getParameters(self):
        return self.__embedLayer.getParameters() + self.__linearLayer.getParameters()

    def getDefaultGradParameters(self):
        return self.__linearLayer.getDefaultGradParameters() + self.__embedLayer.getDefaultGradParameters()

    def getStructuredParameters(self):
        return self.__linearLayer.getStructuredParameters() + self.__embedLayer.getStructuredParameters()

    def getUpdates(self, cost, lr, sumSqGrads=None):
        return self.__embedLayer.getUpdates(cost, lr, sumSqGrads)

    def getNormalizationUpdates(self, strategy, coef):
        return self.__embedLayer.getNormalizationUpdate(strategy, coef)

    def getOutput(self):
        return self.__output

    def getAttributes(self):
        # Merge the attributes of the embedding layer and the linear layer.
        attributes = dict(self.__embedLayer.getAttributes())
        attributes.update(self.__linearLayer.getAttributes())

        return attributes

    def load(self, attributes):
        self.__embedLayer.load(attributes)
        self.__linearLayer.load(attributes)

    @staticmethod
    def getEmbeddingFromPersistenceManager(persistenceManager, name):
        """
        Return the embedding vector from the database

        :type persistenceManager: persistence.PersistentManager.PersistentManager
        :param persistenceManager:

        :param name: name of the object whose embedding was saved as an attribute

        :return:
        """
        return EmbeddingLayer.getEmbeddingFromPersistenceManager(
            persistenceManager, name)
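The class docstring above fixes the number of characters per word (numMaxCh): short words are padded with an artificial character and long words keep only their last numMaxCh characters, which is what lets the convolution be computed as an ordinary matrix multiplication. A small sketch of that padding/truncation rule in plain Python; the artificial character name comes from Example 11, and this is only an illustration, not the library's CharacterWindowGenerator.

def fixWordLength(word, numMaxCh, artificialChar="ART_CHAR"):
    chars = list(word)
    if len(chars) > numMaxCh:
        # Keep only the last numMaxCh characters of long words.
        return chars[-numMaxCh:]
    # Pad short words with the artificial character.
    return chars + [artificialChar] * (numMaxCh - len(chars))

print fixWordLength("cat", 5)          # ['c', 'a', 't', 'ART_CHAR', 'ART_CHAR']
print fixWordLength("convolution", 5)  # ['u', 't', 'i', 'o', 'n']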
Example 8
def main(args):
    log = logging.getLogger(__name__)

    if args.seed:
        random.seed(args.seed)
        np.random.seed(args.seed)

    lr = args.lr
    startSymbol = args.start_symbol
    endSymbol = args.end_symbol
    numEpochs = args.num_epochs
    shuffle = args.shuffle
    normalizeMethod = args.wv_normalization
    wordWindowSize = args.word_window_size
    hiddenLayerSize = args.hidden_size
    convSize = args.conv_size

    # Load classes for filters.
    filters = []
    for filterName in args.filters:
        moduleName, className = filterName.rsplit('.', 1)
        log.info("Filtro: " + moduleName + " " + className)
        module_ = importlib.import_module(moduleName)
        filters.append(getattr(module_, className)())

    W1 = None
    b1 = None
    W2 = None
    b2 = None
    hiddenActFunction = tanh

    if args.word_embedding:
        log.info("Reading W2v File")
        (lexicon,
         wordEmbedding) = Embedding.fromWord2Vec(args.word_embedding,
                                                 unknownSymbol='unknown')
        lexicon.stopAdd()
    else:
        wordEmbedding = EmbeddingFactory().createRandomEmbedding(
            args.word_emb_size)

    # Get the inputs and output
    if args.labels:
        labelLexicon = Lexicon.fromTextFile(args.labels, hasUnknowSymbol=False)
    else:
        labelLexicon = Lexicon()

    #
    # Build the network model (Theano graph).
    #

    # TODO: debug
    # theano.config.compute_test_value = 'warn'
    # ex = trainIterator.next()
    # inWords.tag.test_value = ex[0][0]
    # outLabel.tag.test_value = ex[1][0]

    # Input matrix. Each row represents a token of the offer. Each token is
    # represented by a window of tokens (the central token and a few neighboring
    # tokens). Each value of this matrix is an index that represents a token in
    # the embedding.
    inWords = T.lmatrix("inWords")

    # Correct category of an offer.
    outLabel = T.lscalar("outLabel")

    # List of input tensors. One for each input layer.
    inputTensors = [inWords]

    # Whether the word embedding will be updated during training.
    embLayerTrainable = not args.fix_word_embedding

    if not embLayerTrainable:
        log.info("Not updating the word embedding!")

    # Lookup table for word features.
    embeddingLayer = EmbeddingLayer(inWords,
                                    wordEmbedding.getEmbeddingMatrix(),
                                    trainable=embLayerTrainable)

    # The output of the lookup table has 3 dimensions (numTokens, szWindow, szEmbedding).
    # This layer flattens the last two dimensions, producing an output with
    # shape (numTokens, szWindow * szEmbedding).
    flattenInput = FlattenLayer(embeddingLayer)

    # Random weight initialization procedure.
    weightInit = SigmoidGlorot() if hiddenActFunction == sigmoid else GlorotUniform()

    # Convolution layer. Convolution over the text of a document.
    convLinear = LinearLayer(flattenInput,
                             wordWindowSize * wordEmbedding.getEmbeddingSize(),
                             convSize,
                             W=None,
                             b=None,
                             weightInitialization=weightInit)

    # Max pooling layer.
    maxPooling = MaxPoolingLayer(convLinear)

    # Generate word windows.
    wordWindowFeatureGenerator = WordWindowGenerator(wordWindowSize, lexicon,
                                                     filters, startSymbol,
                                                     endSymbol)

    # List of input generators.
    inputGenerators = [wordWindowFeatureGenerator]

    # Hidden layer.
    hiddenLinear = LinearLayer(maxPooling,
                               convSize,
                               hiddenLayerSize,
                               W=W1,
                               b=b1,
                               weightInitialization=weightInit)
    hiddenAct = ActivationLayer(hiddenLinear, hiddenActFunction)

    # Linear input of the softmax layer.
    softmaxLinearInput = LinearLayer(hiddenAct,
                                     hiddenLayerSize,
                                     labelLexicon.getLen(),
                                     W=W2,
                                     b=b2,
                                     weightInitialization=ZeroWeightGenerator())
    # Softmax.
    # softmaxAct = ReshapeLayer(ActivationLayer(softmaxLinearInput, softmax), (1, -1))
    softmaxAct = ActivationLayer(softmaxLinearInput, softmax)

    # Prediction layer (argmax).
    prediction = ArgmaxPrediction(None).predict(softmaxAct.getOutput())

    # Loss function.
    loss = NegativeLogLikelihoodOneExample().calculateError(
        softmaxAct.getOutput()[0], prediction, outLabel)

    # Output generator: generate one label per offer.
    outputGenerators = [TextLabelGenerator(labelLexicon)]

    if args.train:
        trainDatasetReader = DocReader(args.train)

        log.info("Reading training examples...")
        trainIterator = SyncBatchIterator(trainDatasetReader,
                                          inputGenerators,
                                          outputGenerators,
                                          -1,
                                          shuffle=shuffle)
        lexicon.stopAdd()
        labelLexicon.stopAdd()

        # Get dev inputs and output
        dev = args.dev
        evalPerIteration = args.eval_per_iteration
        if not dev and evalPerIteration > 0:
            log.error(
                "Argument eval_per_iteration cannot be used without a dev argument."
            )
            sys.exit(1)

        if dev:
            log.info("Reading development examples")
            devReader = DocReader(args.dev)
            devIterator = SyncBatchIterator(devReader,
                                            inputGenerators,
                                            outputGenerators,
                                            -1,
                                            shuffle=False)
        else:
            devIterator = None
    else:
        trainIterator = None
        devIterator = None

    if normalizeMethod == "minmax":
        log.info("Normalization: minmax")
        wordEmbedding.minMaxNormalization()
    elif normalizeMethod == "mean":
        log.info("Normalization: mean normalization")
        wordEmbedding.meanNormalization()
    elif normalizeMethod == "zscore":
        log.info("Normalization: zscore normalization")
        wordEmbedding.zscoreNormalization()
    elif normalizeMethod:
        log.error("Normalization: unknown value %s" % normalizeMethod)
        sys.exit(1)

    # Learning rate decay.
    if args.decay == "linear":
        decay = 1.0
    elif args.decay == "none":
        decay = 0.0
    else:
        log.error("Unknown decay strategy %s. Expected: none or linear." %
                  args.decay)
        sys.exit(1)

    # Learning algorithm.
    if args.alg == "adagrad":
        log.info("Using Adagrad")
        opt = Adagrad(lr=lr, decay=decay)
    elif args.alg == "sgd":
        log.info("Using SGD")
        opt = SGD(lr=lr, decay=decay)
    else:
        log.error(
            "Unknown algorithm: %s. Expected values are: adagrad or sgd." %
            args.alg)
        sys.exit(1)

    # TODO: debug
    # opt.lr.tag.test_value = 0.05

    # Printing embedding information.
    dictionarySize = wordEmbedding.getNumberOfVectors()
    embeddingSize = wordEmbedding.getEmbeddingSize()
    log.info("Dictionary size: %d" % dictionarySize)
    log.info("Embedding size: %d" % embeddingSize)
    log.info("Number of categories: %d" % labelLexicon.getLen())

    # Train metrics.
    trainMetrics = None
    if trainIterator:
        trainMetrics = [
            LossMetric("TrainLoss", loss),
            AccuracyMetric("TrainAccuracy", outLabel, prediction)
        ]

    # Evaluation metrics.
    evalMetrics = None
    if devIterator:
        evalMetrics = [
            LossMetric("EvalLoss", loss),
            AccuracyMetric("EvalAccuracy", outLabel, prediction)
        ]

    # Test metrics.
    testMetrics = None
    if args.test:
        testMetrics = [
            LossMetric("TestLoss", loss),
            AccuracyMetric("TestAccuracy", outLabel, prediction)
        ]

    # TODO: debug
    # mode = theano.compile.debugmode.DebugMode(optimizer=None)
    mode = None
    model = BasicModel(x=inputTensors,
                       y=[outLabel],
                       allLayers=softmaxAct.getLayerSet(),
                       optimizer=opt,
                       prediction=prediction,
                       loss=loss,
                       trainMetrics=trainMetrics,
                       evalMetrics=evalMetrics,
                       testMetrics=testMetrics,
                       mode=mode)

    # Training
    if trainIterator:
        log.info("Training")
        model.train(trainIterator,
                    numEpochs,
                    devIterator,
                    evalPerIteration=evalPerIteration)

    # Testing
    if args.test:
        log.info("Reading test examples")
        testReader = DocReader(args.test)
        testIterator = SyncBatchIterator(testReader,
                                         inputGenerators,
                                         outputGenerators,
                                         -1,
                                         shuffle=False)

        log.info("Testing")
        model.test(testIterator)
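Examples 6 and 8 both allow the word embedding to be normalized before training with one of three methods (minmax, mean or zscore). A rough numpy sketch of what each formula does to an embedding matrix; the library may apply them per embedding dimension rather than over the whole matrix, so treat this only as an illustration.

import numpy as np

emb = np.random.randn(1000, 50)  # a made-up embedding matrix

# minmax: rescale the values into [0, 1].
minmax = (emb - emb.min()) / (emb.max() - emb.min())

# mean normalization: center the values around zero.
meanNorm = emb - emb.mean()

# zscore: center the values and divide by the standard deviation.
zscore = (emb - emb.mean()) / emb.std()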
Example 9
def main(**kwargs):
    log = logging.getLogger(__name__)
    log.info(kwargs)

    if kwargs["seed"] != None:
        random.seed(kwargs["seed"])
        np.random.seed(kwargs["seed"])

    filters = []

    for filterName in kwargs["filters"]:
        moduleName, className = filterName.rsplit('.', 1)
        log.info("Usando o filtro: " + moduleName + " " + className)

        module_ = importlib.import_module(moduleName)
        filters.append(getattr(module_, className)())

    # Get the inputs and output
    wordWindowSize = kwargs["word_window_size"]
    hiddenLayerSize = kwargs["hidden_size"]
    batchSize = kwargs["batch_size"]
    startSymbol = kwargs["start_symbol"]
    numEpochs = kwargs["num_epochs"]
    lr = kwargs["lr"]
    labelLexicon = createLexiconUsingFile(kwargs["label_file"])

    log.info("Reading training examples")

    log.info("Reading W2v File1")
    embedding1 = EmbeddingFactory().createFromW2V(kwargs["word_embedding1"],
                                                  RandomUnknownStrategy())

    # Supervised part
    # Learner1
    input1 = T.lmatrix(name="input1")

    embeddingLayer1 = EmbeddingLayer(input1,
                                     embedding1.getEmbeddingMatrix(),
                                     trainable=True)
    flatten1 = FlattenLayer(embeddingLayer1)

    linear11 = LinearLayer(flatten1,
                           wordWindowSize * embedding1.getEmbeddingSize(),
                           hiddenLayerSize,
                           weightInitialization=GlorotUniform())
    act11 = ActivationLayer(linear11, tanh)

    linear12 = LinearLayer(act11,
                           hiddenLayerSize,
                           labelLexicon.getLen(),
                           weightInitialization=ZeroWeightGenerator())
    act12 = ActivationLayer(linear12, softmax)

    ## Learner2
    log.info("Reading W2v File2")
    embedding2 = EmbeddingFactory().createFromW2V(kwargs["word_embedding2"],
                                                  RandomUnknownStrategy())

    input2 = T.lmatrix(name="input2")

    embeddingLayer2 = EmbeddingLayer(input2,
                                     embedding2.getEmbeddingMatrix(),
                                     trainable=True)
    flatten2 = FlattenLayer(embeddingLayer2)

    linear21 = LinearLayer(flatten2,
                           wordWindowSize * embedding2.getEmbeddingSize(),
                           hiddenLayerSize,
                           weightInitialization=GlorotUniform())
    act21 = ActivationLayer(linear21, tanh)

    linear22 = LinearLayer(act21,
                           hiddenLayerSize,
                           labelLexicon.getLen(),
                           weightInitialization=ZeroWeightGenerator())
    act22 = ActivationLayer(linear22, softmax)

    y = T.lvector("y")

    # Set loss and prediction and retrieve all layers
    output1 = act12.getOutput()
    prediction1 = ArgmaxPrediction(1).predict(output1)
    loss1 = NegativeLogLikelihood().calculateError(output1, prediction1, y)

    if kwargs["l2"][0]:
        _lambda1 = kwargs["l2"][0]
        log.info("Using L2 with lambda= %.2f", _lambda1)
        loss1 += _lambda1 * (T.sum(T.square(linear11.getParameters()[0])))

    output2 = act22.getOutput()
    prediction2 = ArgmaxPrediction(1).predict(output2)
    loss2 = NegativeLogLikelihood().calculateError(output2, prediction2, y)

    if kwargs["l2"][1]:
        _lambda2 = kwargs["l2"][1]
        log.info("Using L2 with lambda= %.2f", _lambda2)
        loss2 += _lambda2 * (T.sum(T.square(linear21.getParameters()[0])))

    loss = loss1 + loss2

    ## CoLearningPrediction
    output = T.stack([linear12.getOutput(), linear22.getOutput()])
    # return T.argmax(output, 2)[T.argmax(T.max(output, 2), 0),T.arange(output.shape[1])]
    average = T.mean(output, 0)
    prediction = ArgmaxPrediction(1).predict(
        ActivationLayer(average, softmax).getOutput())
    # prediction = CoLearningWnnPrediction().predict([output1, output2])

    supervisedModeUnit = ModelUnit("supervised_wnn", [input1, input2],
                                   y,
                                   loss,
                                   prediction=prediction)

    # Unsupervised part

    ## Learner1
    inputUnsuper1 = T.lmatrix(name="input_unsupervised_1")

    embeddingLayerUnsuper1 = EmbeddingLayer(inputUnsuper1,
                                            embeddingLayer1.getParameters()[0],
                                            trainable=True)

    flattenUnsuper1 = FlattenLayer(embeddingLayerUnsuper1)

    w, b = linear11.getParameters()
    linearUnsuper11 = LinearLayer(flattenUnsuper1,
                                  wordWindowSize *
                                  embedding1.getEmbeddingSize(),
                                  hiddenLayerSize,
                                  W=w,
                                  b=b)
    actUnsupervised11 = ActivationLayer(linearUnsuper11, tanh)

    w, b = linear12.getParameters()
    linearUnsuper12 = LinearLayer(actUnsupervised11,
                                  hiddenLayerSize,
                                  labelLexicon.getLen(),
                                  W=w,
                                  b=b)
    actUnsuper12 = ActivationLayer(linearUnsuper12, softmax)

    ## Learner2
    inputUnsuper2 = T.lmatrix(name="input_unsupervised_2")

    embeddingLayerUnsuper2 = EmbeddingLayer(inputUnsuper2,
                                            embeddingLayer2.getParameters()[0],
                                            trainable=True)
    flattenUnsuper2 = FlattenLayer(embeddingLayerUnsuper2)

    w, b = linear21.getParameters()
    linearUnsuper21 = LinearLayer(flattenUnsuper2,
                                  wordWindowSize *
                                  embedding2.getEmbeddingSize(),
                                  hiddenLayerSize,
                                  W=w,
                                  b=b)
    actUnsuper21 = ActivationLayer(linearUnsuper21, tanh)

    w, b = linear22.getParameters()
    linearUnsuper22 = LinearLayer(actUnsuper21,
                                  hiddenLayerSize,
                                  labelLexicon.getLen(),
                                  W=w,
                                  b=b)
    actUnsuper22 = ActivationLayer(linearUnsuper22, softmax)

    # Set loss and prediction and retrieve all layers
    outputUns1 = actUnsuper12.getOutput()
    predictionUns1 = ArgmaxPrediction(1).predict(outputUns1)

    outputUns2 = actUnsuper22.getOutput()
    predictionUns2 = ArgmaxPrediction(1).predict(outputUns2)
    #
    # unsupervisedLoss = kwargs["lambda"] * (
    #         NegativeLogLikelihood().calculateError(outputUns1, predictionUns1, predictionUns2) +
    #         NegativeLogLikelihood().calculateError(outputUns2, predictionUns2, predictionUns1))

    _lambdaShared = theano.shared(value=kwargs["lambda"],
                                  name='lambda',
                                  borrow=True)

    unsupervisedLoss = _lambdaShared * (NegativeLogLikelihood().calculateError(
        outputUns1, predictionUns1, predictionUns2) + NegativeLogLikelihood(
        ).calculateError(outputUns2, predictionUns2, predictionUns1))

    unsupervisedUnit = ModelUnit("unsupervised_wnn",
                                 [inputUnsuper1, inputUnsuper2],
                                 None,
                                 unsupervisedLoss,
                                 yWillBeReceived=False)

    # Creates model
    model = CoLearningModel(kwargs["loss_uns_epoch"])

    model.addTrainingModelUnit(supervisedModeUnit, metrics=["loss", "acc"])
    model.addTrainingModelUnit(unsupervisedUnit, metrics=["loss"])

    model.setEvaluatedModelUnit(supervisedModeUnit, metrics=["acc"])

    # Compile Model
    opt1 = SGD(lr=lr[0], decay=1.0)
    opt2 = SGD(lr=lr[1], decay=1.0)

    log.info("Compiling the model")
    model.compile([(opt1, {
        supervisedModeUnit: act12.getLayerSet(),
        unsupervisedUnit: actUnsuper12.getLayerSet()
    }),
                   (opt2, {
                       supervisedModeUnit: act22.getLayerSet(),
                       unsupervisedUnit: actUnsuper22.getLayerSet()
                   })])

    # Generators
    inputGenerator1 = WordWindowGenerator(wordWindowSize, embedding1, filters,
                                          startSymbol)
    inputGenerator2 = WordWindowGenerator(wordWindowSize, embedding2, filters,
                                          startSymbol)
    outputGenerator = LabelGenerator(labelLexicon)

    # Reading supervised and unsupervised data sets.
    trainSupervisedDatasetReader = TokenLabelReader(
        kwargs["train_supervised"], kwargs["token_label_separator"])
    trainSupervisedDatasetReader = SyncBatchIterator(
        trainSupervisedDatasetReader, [inputGenerator1, inputGenerator2],
        [outputGenerator], batchSize[0])

    trainUnsupervisedDataset = TokenReader(kwargs["train_unsupervised"])
    trainUnsupervisedDatasetReader = SyncBatchIterator(
        trainUnsupervisedDataset, [inputGenerator1, inputGenerator2], None,
        batchSize[1])

    embedding1.stopAdd()
    embedding2.stopAdd()
    labelLexicon.stopAdd()

    # Get dev inputs and output
    log.info("Reading development examples")
    devDatasetReader = TokenLabelReader(kwargs["dev"],
                                        kwargs["token_label_separator"])
    devReader = SyncBatchIterator(devDatasetReader,
                                  [inputGenerator1, inputGenerator2],
                                  [outputGenerator],
                                  sys.maxint,
                                  shuffle=False)

    lambdaChange = ChangeLambda(_lambdaShared, kwargs["lambda"],
                                kwargs["loss_uns_epoch"])
    lossCallback = LossCallback(loss1, loss2, input1, input2, y)

    # trainUnsupervisedDatasetReaderAcc = SyncBatchIterator(trainUnsupervisedDataset,
    #                                                       [inputGenerator1, inputGenerator2],
    #                                                       [outputGenerator], sys.maxint)

    # accCallBack = AccCallBack(prediction1, prediction2, input1, input2,
    #                           unsurpervisedDataset=trainUnsupervisedDatasetReaderAcc)

    # Training Model
    model.train([trainSupervisedDatasetReader, trainUnsupervisedDatasetReader],
                numEpochs,
                devReader,
                callbacks=[lambdaChange, lossCallback])
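The co-learning prediction in Example 9 stacks the pre-softmax outputs of the two learners, averages them, applies a softmax and takes the argmax. A tiny numpy sketch of that combination step with made-up scores; the code above does the same thing symbolically with Theano tensors.

import numpy as np

def softmax(z):
    e = np.exp(z - z.max(axis=1, keepdims=True))
    return e / e.sum(axis=1, keepdims=True)

# Pre-softmax scores of the two learners: 2 examples, 3 labels.
scores1 = np.array([[2.0, 0.5, 0.1], [0.2, 1.5, 0.3]])
scores2 = np.array([[1.0, 0.8, 0.4], [0.1, 0.9, 2.0]])

average = np.mean(np.stack([scores1, scores2]), axis=0)
prediction = softmax(average).argmax(axis=1)
print prediction  # [0 1]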
Example 10
def mainWnnNegativeSampling(args):
    # Reading parameters
    embeddingMatrix = None
    wordEmbeddingSize = args.word_embedding_size
    windowSize = args.window_size
    hiddenLayerSize = args.hidden_size
    startSymbol = args.start_symbol
    # endSymbol = args.end_symbol
    endSymbol = startSymbol
    noiseRate = args.noise_rate

    # TODO: the algorithm does not support mini-batches; only stochastic (online) training.
    batchSize = 1

    shuffle = args.shuffle
    lr = args.lr
    numEpochs = args.num_epochs
    power = args.power

    minLr = args.min_lr
    numExUpdLr = args.num_examples_updt_lr

    log = logging.getLogger(__name__)

    log.info(str(args))

    if args.seed:
        random.seed(args.seed)
        np.random.seed(args.seed)
    #
    # if args.decay.lower() == "normal":
    #     decay = 0.0
    # elif args.decay.lower() == "divide_epoch":
    #     decay = 1.0

    parametersToSaveOrLoad = {"hidden_size", "window_size", "start_symbol"}

    # Calculate the frequency of each word
    trainReader = TokenReader(args.train)
    wordLexicon = Lexicon("UUKNNN", "lexicon")
    wordLexicon.put(startSymbol, False)

    totalNumOfTokens = 0
    for tokens, labels in trainReader.read():
        # We don't count the </s>, because this token is only inserted in the sentence to compute its frequency.
        totalNumOfTokens += len(tokens)

        # Word2vec treats the number of lines as the frequency of </s>.
        tokens += [startSymbol]

        for token in tokens:
            wordLexicon.put(token)

    # Prune words whose frequency is lower than min_count.
    wordLexicon.prune(args.min_count)
    wordLexicon.stopAdd()

    # Calculate the unigram distribution.
    frequency = np.power(wordLexicon.getFrequencyOfAllWords(), power)
    total = float(frequency.sum())

    # # Print the distribution of all words
    # for _ in xrange(len(frequency)):
    #     print "%s\t%d\t%.4f" % (wordLexicon.getLexicon(_), frequency[_],frequency[_]/float(total))

    sampler = Sampler(frequency / float(total))

    # Create a random embedding for each word
    wordEmbedding = Embedding(wordLexicon, None, wordEmbeddingSize)
    log.info("Lexicon size: %d" % (wordLexicon.getLen()))

    # Create NN
    x = T.lmatrix("word_window")
    y = T.lvector("labels")

    wordEmbeddingLayer = EmbeddingLayer(x,
                                        wordEmbedding.getEmbeddingMatrix(),
                                        name="embedding")
    flatten = FlattenLayer(wordEmbeddingLayer)

    linear1 = LinearLayer(flatten,
                          wordEmbeddingSize * windowSize,
                          hiddenLayerSize,
                          name="linear1")
    act1 = ActivationLayer(linear1, tanh)

    # Softmax regression. It's like a logistic regression
    linear2 = LinearLayer(act1,
                          hiddenLayerSize,
                          1,
                          weightInitialization=ZeroWeightGenerator(),
                          name="linear_softmax_regresion")

    act2 = ActivationLayer(linear2, sigmoid)
    # We clip the output of the sigmoid because it can reach 0, and ln(0) is -infinity, which would break the loss.
    output = T.flatten(T.clip(act2.getOutput(), 10**-5, 1 - 10**-5))

    # Loss Functions
    negativeSamplingLoss = T.nnet.binary_crossentropy(output, y).sum()
    # Set training
    inputGenerators = [
        WordWindowGenerator(windowSize, wordLexicon, [], startSymbol,
                            endSymbol)
    ]

    outputGenerators = [ConstantLabel(labelLexicon=None, label=1)]

    trainIterator = SyncBatchIterator(trainReader, inputGenerators,
                                      outputGenerators, batchSize, shuffle)

    trainMetrics = [LossMetric("lossTrain", negativeSamplingLoss)]

    allLayers = act2.getLayerSet()

    # opt = SGD(lr=lr, decay=decay)
    opt = SGD(lr=lr)

    model = NegativeSamplingModel(args.t, noiseRate, sampler, minLr,
                                  numExUpdLr, totalNumOfTokens, numEpochs, [x],
                                  [y], allLayers, opt, negativeSamplingLoss,
                                  trainMetrics)
    # Save Model
    if args.save_model:
        savePath = args.save_model
        objsToSave = list(act2.getLayerSet()) + [wordLexicon]

        modelWriter = ModelWriter(savePath, objsToSave, args,
                                  parametersToSaveOrLoad)

    # Training
    model.train(trainIterator, numEpochs=numEpochs, callbacks=[])

    if args.save_model:
        modelWriter.save()
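Example 10 builds the noise distribution for negative sampling by raising each word's frequency to args.power and normalizing, and then draws negative words from it for every positive example. A minimal numpy sketch of that sampling scheme; the library's Sampler is assumed to behave similarly, but this is only an illustration.

import numpy as np

frequencies = np.array([100.0, 50.0, 10.0, 1.0])  # made-up word counts
power = 0.75                                      # word2vec's usual exponent

unigram = np.power(frequencies, power)
unigram /= unigram.sum()  # noise distribution over the lexicon

noiseRate = 5
negatives = np.random.choice(len(unigram), size=noiseRate, p=unigram)
print negatives  # e.g. [0 1 0 0 2]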
Example 11
def mainWnn(args):
    ################################################
    # Initializing parameters
    ##############################################
    log = logging.getLogger(__name__)

    if args.seed:
        random.seed(args.seed)
        np.random.seed(args.seed)

    parametersToSaveOrLoad = {"word_filters", "suffix_filters", "char_filters", "cap_filters",
                              "alg", "hidden_activation_function", "word_window_size", "char_window_size",
                              "hidden_size", "with_charwnn", "conv_size", "charwnn_with_act", "suffix_size",
                              "use_capitalization", "start_symbol", "end_symbol", "with_hidden"}

    # Load parameters of the saving model
    if args.load_model:
        persistentManager = H5py(args.load_model)
        savedParameters = json.loads(persistentManager.getAttribute("parameters"))

        if savedParameters.get("charwnn_filters", None) is not None:
            savedParameters["char_filters"] = savedParameters["charwnn_filters"]
            savedParameters.pop("charwnn_filters")
            print savedParameters

        log.info("Loading parameters of the model")
        args = args._replace(**savedParameters)

    log.info(str(args))

    # Read the parameters
    lr = args.lr
    startSymbol = args.start_symbol
    endSymbol = args.end_symbol
    numEpochs = args.num_epochs
    shuffle = args.shuffle
    normalizeMethod = args.normalization.lower() if args.normalization is not None else None
    wordWindowSize = args.word_window_size
    hiddenLayerSize = args.hidden_size
    hiddenActFunctionName = args.hidden_activation_function
    embeddingSize = args.word_emb_size

    withCharWNN = args.with_charwnn
    charEmbeddingSize = args.char_emb_size
    charWindowSize = args.char_window_size
    startSymbolChar = "</s>"

    suffixEmbSize = args.suffix_emb_size
    capEmbSize = args.cap_emb_size

    useSuffixFeatures = args.suffix_size > 0
    useCapFeatures = args.use_capitalization

    # Character used to pad words whose length is smaller than the chosen dimension.
    # This padding allows the convolution to be performed as a single matrix multiplication.
    artificialChar = "ART_CHAR"

    # TODO: the maximum number of characters per word is fixed at 20.
    numMaxChar = 20

    if args.alg == "window_stn":
        isSentenceModel = True
    elif args.alg == "window_word":
        isSentenceModel = False
    else:
        raise Exception("The value of model_type isn't valid.")

    batchSize = -1 if isSentenceModel else args.batch_size
    wordFilters = []

    # Reading the WNN filters
    log.info("Lendo filtros básicos")
    wordFilters = getFilters(args.word_filters, log)

    # Reading the charwnn filters
    log.info("Lendo filtros do charwnn")
    charFilters = getFilters(args.char_filters, log)

    # Reading the suffix filters
    log.info("Lendo filtros do sufixo")
    suffixFilters = getFilters(args.suffix_filters, log)

    # Reading the capitalization filters
    log.info("Lendo filtros da capitalização")
    capFilters = getFilters(args.cap_filters, log)

    ################################################
    # Create the lexicons and exit afterwards
    ################################################
    if args.create_only_lexicon:
        inputGenerators = []
        lexiconsToSave = []

        if args.word_lexicon and not os.path.exists(args.word_lexicon):
            wordLexicon = Lexicon("UUUNKKK", "labelLexicon")

            inputGenerators.append(
                WordWindowGenerator(wordWindowSize, wordLexicon, wordFilters, startSymbol, endSymbol))
            lexiconsToSave.append((wordLexicon, args.word_lexicon))

        if not os.path.exists(args.label_file):
            labelLexicon = Lexicon(None, "labelLexicon")
            outputGenerator = [LabelGenerator(labelLexicon)]
            lexiconsToSave.append((labelLexicon, args.label_file))
        else:
            outputGenerator = None

        if args.char_lexicon and not os.path.exists(args.char_lexicon):
            charLexicon = Lexicon("UUUNKKK", "charLexicon")

            charLexicon.put(startSymbolChar)
            charLexicon.put(artificialChar)

            inputGenerators.append(
                CharacterWindowGenerator(charLexicon, numMaxChar, charWindowSize, wordWindowSize, artificialChar,
                                         startSymbolChar, startPaddingWrd=startSymbol, endPaddingWrd=endSymbol,
                                         filters=charFilters))

            lexiconsToSave.append((charLexicon, args.char_lexicon))

        if args.suffix_lexicon and not os.path.exists(args.suffix_lexicon):
            suffixLexicon = Lexicon("UUUNKKK", "suffixLexicon")

            if args.suffix_size <= 0:
                raise Exception(
                    "Unable to generate the suffix lexicon because the suffix is less than or equal to 0.")

            inputGenerators.append(
                SuffixFeatureGenerator(args.suffix_size, wordWindowSize, suffixLexicon, suffixFilters))

            lexiconsToSave.append((suffixLexicon, args.suffix_lexicon))

        if args.cap_lexicon and not os.path.exists(args.cap_lexicon):
            capLexicon = Lexicon("UUUNKKK", "capitalizationLexicon")

            inputGenerators.append(CapitalizationFeatureGenerator(wordWindowSize, capLexicon, capFilters))

            lexiconsToSave.append((capLexicon, args.cap_lexicon))

        if len(inputGenerators) == 0:
            inputGenerators = None

        if not (inputGenerators or outputGenerator):
            log.info("All lexicons have been generated.")
            return

        trainDatasetReader = TokenLabelReader(args.train, args.token_label_separator)
        trainReader = SyncBatchIterator(trainDatasetReader, inputGenerators, outputGenerator, batchSize,
                                        shuffle=shuffle)

        for lexicon, pathToSave in lexiconsToSave:
            lexicon.save(pathToSave)

        log.info("Lexicons were generated with success!")

        return

    ################################################
    # Starting training
    ###########################################

    if withCharWNN and (useSuffixFeatures or useCapFeatures):
        raise Exception("It's impossible to use hand-crafted features with Charwnn.")

    # Read word lexicon and create word embeddings
    if args.load_model:
        wordLexicon = Lexicon.fromPersistentManager(persistentManager, "word_lexicon")
        vectors = EmbeddingLayer.getEmbeddingFromPersistenceManager(persistentManager, "word_embedding_layer")

        wordEmbedding = Embedding(wordLexicon, vectors)

    elif args.word_embedding:
        wordLexicon, wordEmbedding = Embedding.fromWord2Vec(args.word_embedding, "UUUNKKK", "word_lexicon")
    elif args.word_lexicon:
        wordLexicon = Lexicon.fromTextFile(args.word_lexicon, True, "word_lexicon")
        wordEmbedding = Embedding(wordLexicon, vectors=None, embeddingSize=embeddingSize)
    else:
        log.error("You need to set one of these parameters: load_model, word_embedding or word_lexicon")
        return

    # Read char lexicon and create char embeddings
    if withCharWNN:
        if args.load_model:
            charLexicon = Lexicon.fromPersistentManager(persistentManager, "char_lexicon")
            vectors = EmbeddingConvolutionalLayer.getEmbeddingFromPersistenceManager(persistentManager,
                                                                                     "char_convolution_layer")

            charEmbedding = Embedding(charLexicon, vectors)
        elif args.char_lexicon:
            charLexicon = Lexicon.fromTextFile(args.char_lexicon, True, "char_lexicon")
            charEmbedding = Embedding(charLexicon, vectors=None, embeddingSize=charEmbeddingSize)
        else:
            log.error("You need to set one of these parameters: load_model or char_lexicon")
            return
    else:
        # Read suffix lexicon if suffix size is greater than 0
        if useSuffixFeatures:
            if args.load_model:
                suffixLexicon = Lexicon.fromPersistentManager(persistentManager, "suffix_lexicon")
                vectors = EmbeddingConvolutionalLayer.getEmbeddingFromPersistenceManager(persistentManager,
                                                                                         "suffix_embedding")

                suffixEmbedding = Embedding(suffixLexicon, vectors)
            elif args.suffix_lexicon:
                suffixLexicon = Lexicon.fromTextFile(args.suffix_lexicon, True, "suffix_lexicon")
                suffixEmbedding = Embedding(suffixLexicon, vectors=None, embeddingSize=suffixEmbSize)
            else:
                log.error("You need to set one of these parameters: load_model or suffix_lexicon")
                return

        # Read capitalization lexicon
        if useCapFeatures:
            if args.load_model:
                capLexicon = Lexicon.fromPersistentManager(persistentManager, "cap_lexicon")
                vectors = EmbeddingConvolutionalLayer.getEmbeddingFromPersistenceManager(persistentManager,
                                                                                         "cap_embedding")

                capEmbedding = Embedding(capLexicon, vectors)
            elif args.cap_lexicon:
                capLexicon = Lexicon.fromTextFile(args.cap_lexicon, True, "cap_lexicon")
                capEmbedding = Embedding(capLexicon, vectors=None, embeddingSize=capEmbSize)
            else:
                log.error("You need to set one of these parameters: load_model or cap_lexicon")
                return

    # Read labels
    if args.load_model:
        labelLexicon = Lexicon.fromPersistentManager(persistentManager, "label_lexicon")
    elif args.label_file:
        labelLexicon = Lexicon.fromTextFile(args.label_file, False, lexiconName="label_lexicon")
    else:
        log.error("You need to set one of these parameters: load_model, word_embedding or word_lexicon")
        return

    # Normalize the word embedding
    if not normalizeMethod:
        pass
    elif normalizeMethod == "minmax":
        log.info("Normalization: minmax")
        wordEmbedding.minMaxNormalization()
    elif normalizeMethod == "mean":
        log.info("Normalization: mean normalization")
        wordEmbedding.meanNormalization()
    else:
        log.error("Unknown normalization method: %s" % normalizeMethod)
        sys.exit(1)

    if normalizeMethod is not None and args.load_model is not None:
        log.warn("The word embedding of model was normalized. This can change the result of test.")

    # Build neural network
    if isSentenceModel:
        raise NotImplementedError("Sentence model is not implemented!")
    else:
        wordWindow = T.lmatrix("word_window")
        inputModel = [wordWindow]

        wordEmbeddingLayer = EmbeddingLayer(wordWindow, wordEmbedding.getEmbeddingMatrix(), trainable=True,
                                            name="word_embedding_layer")
        flatten = FlattenLayer(wordEmbeddingLayer)

        if withCharWNN:
            # Use the convolution
            log.info("Using charwnn")
            convSize = args.conv_size

            if args.charwnn_with_act:
                charAct = tanh
            else:
                charAct = None

            charWindowIdxs = T.ltensor4(name="char_window_idx")
            inputModel.append(charWindowIdxs)

            charEmbeddingConvLayer = EmbeddingConvolutionalLayer(charWindowIdxs, charEmbedding.getEmbeddingMatrix(),
                                                                 numMaxChar, convSize, charWindowSize,
                                                                 charEmbeddingSize, charAct,
                                                                 name="char_convolution_layer")
            layerBeforeLinear = ConcatenateLayer([flatten, charEmbeddingConvLayer])
            sizeLayerBeforeLinear = wordWindowSize * (wordEmbedding.getEmbeddingSize() + convSize)
        elif useSuffixFeatures or useCapFeatures:
            # Use hand-crafted features
            concatenateInputs = [flatten]
            numFeaturesByWord = wordEmbedding.getEmbeddingSize()

            if useSuffixFeatures:
                log.info("Using suffix features")

                suffixInput = T.lmatrix("suffix_input")
                suffixEmbLayer = EmbeddingLayer(suffixInput, suffixEmbedding.getEmbeddingMatrix(),
                                                name="suffix_embedding")
                suffixFlatten = FlattenLayer(suffixEmbLayer)
                concatenateInputs.append(suffixFlatten)

                numFeaturesByWord += suffixEmbedding.getEmbeddingSize()
                inputModel.append(suffixInput)

            if useCapFeatures:
                log.info("Using capitalization features")

                capInput = T.lmatrix("capitalization_input")
                capEmbLayer = EmbeddingLayer(capInput, capEmbedding.getEmbeddingMatrix(),
                                             name="cap_embedding")
                capFlatten = FlattenLayer(capEmbLayer)
                concatenateInputs.append(capFlatten)

                numFeaturesByWord += capEmbedding.getEmbeddingSize()
                inputModel.append(capInput)

            layerBeforeLinear = ConcatenateLayer(concatenateInputs)
            sizeLayerBeforeLinear = wordWindowSize * numFeaturesByWord
        else:
            # Use only the word embeddings
            layerBeforeLinear = flatten
            sizeLayerBeforeLinear = wordWindowSize * wordEmbedding.getEmbeddingSize()

        # The rest of the NN
        if args.with_hidden:
            hiddenActFunction = method_name(hiddenActFunctionName)
            weightInit = SigmoidGlorot() if hiddenActFunction == sigmoid else GlorotUniform()

            linear1 = LinearLayer(layerBeforeLinear, sizeLayerBeforeLinear, hiddenLayerSize,
                                  weightInitialization=weightInit, name="linear1")
            act1 = ActivationLayer(linear1, hiddenActFunction)

            layerBeforeSoftmax = act1
            sizeLayerBeforeSoftmax = hiddenLayerSize
            log.info("Using hidden layer")
        else:
            layerBeforeSoftmax = layerBeforeLinear
            sizeLayerBeforeSoftmax = sizeLayerBeforeLinear
            log.info("Not using hidden layer")

        linear2 = LinearLayer(layerBeforeSoftmax, sizeLayerBeforeSoftmax, labelLexicon.getLen(),
                              weightInitialization=ZeroWeightGenerator(),
                              name="linear_softmax")
        act2 = ActivationLayer(linear2, softmax)
        prediction = ArgmaxPrediction(1).predict(act2.getOutput())

    # Load the model
    if args.load_model:
        alreadyLoaded = set([wordEmbeddingLayer])

        for o in (act2.getLayerSet() - alreadyLoaded):
            if o.getName():
                persistentManager.load(o)

    # Set the input and output
    inputGenerators = [WordWindowGenerator(wordWindowSize, wordLexicon, wordFilters, startSymbol, endSymbol)]

    if withCharWNN:
        inputGenerators.append(
            CharacterWindowGenerator(charLexicon, numMaxChar, charWindowSize, wordWindowSize, artificialChar,
                                     startSymbolChar, startPaddingWrd=startSymbol, endPaddingWrd=endSymbol,
                                     filters=charFilters))
    else:
        if useSuffixFeatures:
            inputGenerators.append(
                SuffixFeatureGenerator(args.suffix_size, wordWindowSize, suffixLexicon, suffixFilters))

        if useCapFeatures:
            inputGenerators.append(CapitalizationFeatureGenerator(wordWindowSize, capLexicon, capFilters))

    outputGenerator = LabelGenerator(labelLexicon)

    if args.train:
        log.info("Reading training examples")

        trainDatasetReader = TokenLabelReader(args.train, args.token_label_separator)
        trainReader = SyncBatchIterator(trainDatasetReader, inputGenerators, [outputGenerator], batchSize,
                                        shuffle=shuffle)

        # Get dev inputs and output
        dev = args.dev

        if dev:
            log.info("Reading development examples")
            devDatasetReader = TokenLabelReader(args.dev, args.token_label_separator)
            devReader = SyncBatchIterator(devDatasetReader, inputGenerators, [outputGenerator], sys.maxint,
                                          shuffle=False)
        else:
            devReader = None
    else:
        trainReader = None
        devReader = None

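    # Correct labels for a batch of examples.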
    y = T.lvector("y")

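    # Map the decay option to the value expected by the optimizer.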
    if args.decay.lower() == "normal":
        decay = 0.0
    elif args.decay.lower() == "divide_epoch":
        decay = 1.0

    if args.adagrad:
        log.info("Using Adagrad")
        opt = Adagrad(lr=lr, decay=decay)
    else:
        log.info("Using SGD")
        opt = SGD(lr=lr, decay=decay)

    # Printing embedding information
    dictionarySize = wordEmbedding.getNumberOfVectors()

    log.info("Size of  word dictionary and word embedding size: %d and %d" % (dictionarySize, embeddingSize))

    if withCharWNN:
        log.info("Size of  char dictionary and char embedding size: %d and %d" % (
            charEmbedding.getNumberOfVectors(), charEmbedding.getEmbeddingSize()))

    if useSuffixFeatures:
        log.info("Size of  suffix dictionary and suffix embedding size: %d and %d" % (
            suffixEmbedding.getNumberOfVectors(), suffixEmbedding.getEmbeddingSize()))

    if useCapFeatures:
        log.info("Size of  capitalization dictionary and capitalization embedding size: %d and %d" % (
            capEmbedding.getNumberOfVectors(), capEmbedding.getEmbeddingSize()))

    # Compiling
    loss = NegativeLogLikelihood().calculateError(act2.getOutput(), prediction, y)

    if args.lambda_L2:
        _lambda = args.lambda_L2
        log.info("Using L2 with lambda= %.2f", _lambda)
        loss += _lambda * (T.sum(T.square(linear1.getParameters()[0])))

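    # Metrics reported during training, dev evaluation and testing.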
    trainMetrics = [
        LossMetric("LossTrain", loss, True),
        AccuracyMetric("AccTrain", y, prediction),
    ]

    evalMetrics = [
        LossMetric("LossDev", loss, True),
        AccuracyMetric("AccDev", y, prediction),
    ]

    testMetrics = [
        LossMetric("LossTest", loss, True),
        AccuracyMetric("AccTest", y, prediction),
    ]

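    # Assemble the complete model from its inputs, layers, optimizer, prediction and metrics.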
    wnnModel = BasicModel(inputModel, [y], act2.getLayerSet(), opt, prediction, loss, trainMetrics=trainMetrics,
                          evalMetrics=evalMetrics, testMetrics=testMetrics, mode=None)
    # Training
    if trainReader:
        callback = []

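        # Persist the lexicons together with the network parameters so the model can be reloaded later.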
        if args.save_model:
            savePath = args.save_model
            objsToSave = list(act2.getLayerSet()) + [wordLexicon, labelLexicon]

            if withCharWNN:
                objsToSave.append(charLexicon)

            if useSuffixFeatures:
                objsToSave.append(suffixLexicon)

            if useCapFeatures:
                objsToSave.append(capLexicon)

            modelWriter = ModelWriter(savePath, objsToSave, args, parametersToSaveOrLoad)

            # Save the model with the best accuracy on the dev set
            if args.save_by_acc:
                callback.append(SaveModelCallback(modelWriter, evalMetrics[1], "accuracy", True))

        log.info("Training")
        wnnModel.train(trainReader, numEpochs, devReader, callbacks=callback)

        # Save the model at the end of training
        if args.save_model and not args.save_by_acc:
            modelWriter.save()

    # Testing
    if args.test:
        log.info("Reading test examples")
        testDatasetReader = TokenLabelReader(args.test, args.token_label_separator)
        testReader = SyncBatchIterator(testDatasetReader, inputGenerators, [outputGenerator], sys.maxint, shuffle=False)

        log.info("Testing")
        wnnModel.test(testReader)

        if args.print_prediction:
            f = codecs.open(args.print_prediction, "w", encoding="utf-8")

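            # Write one predicted label per line, in the order the test examples are read.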
            for x, labels in testReader:
                inputs = x

                predictions = wnnModel.prediction(inputs)

                for prediction in predictions:
                    f.write(labelLexicon.getLexicon(prediction))
                    f.write("\n")