コード例 #1
0
    def __init__(self, model, sourceDataset, tokenLabelSeparator,
                 inputGenerator, outputGeneratorTag):
        # Get dev inputs and output
        self.log = logging.getLogger(__name__)
        self.log.info("Reading additional dev examples")
        devDatasetReader = TokenLabelReader(sourceDataset, tokenLabelSeparator)
        devReader = SyncBatchIterator(devDatasetReader,
                                      inputGenerator, [outputGeneratorTag],
                                      sys.maxint,
                                      shuffle=False)

        self.devReader = devReader
        self.model = model
コード例 #2
0
def main(**kwargs):
    log = logging.getLogger(__name__)
    log.info(kwargs)

    if kwargs["seed"] != None:
        random.seed(kwargs["seed"])
        np.random.seed(kwargs["seed"])

    filters = []

    for filterName in kwargs["filters"]:
        moduleName, className = filterName.rsplit('.', 1)
        log.info("Usando o filtro: " + moduleName + " " + className)

        module_ = importlib.import_module(moduleName)
        filters.append(getattr(module_, className)())

    wordWindowSize = kwargs["word_window_size"]
    hiddenLayerSize = kwargs["hidden_size"]
    batchSize = kwargs["batch_size"]
    startSymbol = kwargs["start_symbol"]
    endSymbol = kwargs["end_symbol"]
    numEpochs = kwargs["num_epochs"]
    lr = kwargs["lr"]
    tagLexicon = createLexiconUsingFile(kwargs["label_file"])
    # _lambda = theano.shared(kwargs["lambda"], "lambda")
    _lambda = theano.shared(0.0, "lambda")
    useAdagrad = kwargs["adagrad"]
    shuffle = kwargs["shuffle"]
    supHiddenLayerSize = kwargs["hidden_size_supervised_part"]
    unsupHiddenLayerSize = kwargs["hidden_size_unsupervised_part"]
    normalization = kwargs["normalization"]
    activationHiddenExtractor = kwargs["activation_hidden_extractor"]

    withCharWNN = kwargs["with_charwnn"]
    convSize = kwargs["conv_size"]
    charEmbeddingSize = kwargs["char_emb_size"]
    charWindowSize = kwargs["char_window_size"]
    startSymbolChar = "</s>"

    if kwargs["charwnn_with_act"]:
        charAct = tanh
    else:
        charAct = None

    # TODO: the maximum number of characters of word is fixed in 20.
    numMaxChar = 20

    if kwargs["decay"].lower() == "normal":
        decay = 0.0
    elif kwargs["decay"].lower() == "divide_epoch":
        decay = 1.0

    # Add the lexicon of target
    domainLexicon = Lexicon()

    domainLexicon.put("0")
    domainLexicon.put("1")
    domainLexicon.stopAdd()

    log.info("Reading W2v File1")
    wordEmbedding = EmbeddingFactory().createFromW2V(kwargs["word_embedding"],
                                                     RandomUnknownStrategy())

    log.info("Reading training examples")
    # Generators
    inputGenerators = [
        WordWindowGenerator(wordWindowSize, wordEmbedding, filters,
                            startSymbol)
    ]
    outputGeneratorTag = LabelGenerator(tagLexicon)

    if withCharWNN:
        # Create the character embedding
        charEmbedding = EmbeddingFactory().createRandomEmbedding(
            charEmbeddingSize)

        # Insert the padding of the character window
        charEmbedding.put(startSymbolChar)

        # Insert the character that will be used to fill the matrix
        # with a dimension lesser than chosen dimension.This enables that the convolution is performed by a matrix multiplication.
        artificialChar = "ART_CHAR"
        charEmbedding.put(artificialChar)

        inputGenerators.append(
            CharacterWindowGenerator(charEmbedding,
                                     numMaxChar,
                                     charWindowSize,
                                     wordWindowSize,
                                     artificialChar,
                                     startSymbolChar,
                                     startPaddingWrd=startSymbol,
                                     endPaddingWrd=endSymbol))

    unsupervisedLabelSource = ConstantLabel(domainLexicon, "0")

    # Reading supervised and unsupervised data sets.
    trainSupervisedDatasetReader = TokenLabelReader(
        kwargs["train_source"], kwargs["token_label_separator"])
    trainSupervisedBatch = SyncBatchIterator(
        trainSupervisedDatasetReader,
        inputGenerators, [outputGeneratorTag, unsupervisedLabelSource],
        batchSize[0],
        shuffle=shuffle)

    # Get Unsupervised Input
    unsupervisedLabelTarget = ConstantLabel(domainLexicon, "1")

    trainUnsupervisedDatasetReader = TokenReader(kwargs["train_target"])
    trainUnsupervisedDatasetBatch = SyncBatchIterator(
        trainUnsupervisedDatasetReader,
        inputGenerators, [unsupervisedLabelTarget],
        batchSize[1],
        shuffle=shuffle)

    # Stopping to add new words, labels and chars
    wordEmbedding.stopAdd()
    tagLexicon.stopAdd()
    domainLexicon.stopAdd()

    if withCharWNN:
        charEmbedding.stopAdd()

    # Printing embedding information
    dictionarySize = wordEmbedding.getNumberOfVectors()
    embeddingSize = wordEmbedding.getEmbeddingSize()
    log.info("Size of  word dictionary and word embedding size: %d and %d" %
             (dictionarySize, embeddingSize))
    log.info(
        "Size of  char dictionary and char embedding size: %d and %d" %
        (charEmbedding.getNumberOfVectors(), charEmbedding.getEmbeddingSize()))

    # Word Embedding Normalization
    if normalization == "zscore":
        wordEmbedding.zscoreNormalization()
    elif normalization == "minmax":
        wordEmbedding.minMaxNormalization()
    elif normalization == "mean":
        wordEmbedding.meanNormalization()
    elif normalization == "none" or not normalization:
        pass
    else:
        raise Exception()

    # Source input
    wordWindowSource = T.lmatrix(name="windowSource")
    sourceInput = [wordWindowSource]

    # Create the layers related with the extractor of features
    embeddingLayerSrc = EmbeddingLayer(wordWindowSource,
                                       wordEmbedding.getEmbeddingMatrix(),
                                       trainable=True)
    flattenSrc = FlattenLayer(embeddingLayerSrc)

    if withCharWNN:
        log.info("Using charwnn")

        # Create the charwn
        charWindowIdxSrc = T.ltensor4(name="char_window_idx_source")
        sourceInput.append(charWindowIdxSrc)

        charEmbeddingConvLayerSrc = EmbeddingConvolutionalLayer(
            charWindowIdxSrc, charEmbedding.getEmbeddingMatrix(), numMaxChar,
            convSize, charWindowSize, charEmbeddingSize, charAct)
        layerBeforeLinearSrc = ConcatenateLayer(
            [flattenSrc, charEmbeddingConvLayerSrc])
        sizeLayerBeforeLinearSrc = wordWindowSize * (
            wordEmbedding.getEmbeddingSize() + convSize)
    else:
        layerBeforeLinearSrc = flattenSrc
        sizeLayerBeforeLinearSrc = wordWindowSize * wordEmbedding.getEmbeddingSize(
        )

    if activationHiddenExtractor == "tanh":
        log.info("Using tanh in the hidden layer of extractor")

        linear1 = LinearLayer(layerBeforeLinearSrc,
                              sizeLayerBeforeLinearSrc,
                              hiddenLayerSize,
                              weightInitialization=GlorotUniform())
        act1 = ActivationLayer(linear1, tanh)
    elif activationHiddenExtractor == "sigmoid":
        log.info("Using sigmoid in the hidden layer of extractor")

        linear1 = LinearLayer(layerBeforeLinearSrc,
                              sizeLayerBeforeLinearSrc,
                              hiddenLayerSize,
                              weightInitialization=SigmoidGenerator())
        act1 = ActivationLayer(linear1, sigmoid)
    else:
        raise Exception()

    # Create the layers with the Tagger
    if supHiddenLayerSize == 0:
        layerBeforeSupSoftmax = act1
        layerSizeBeforeSupSoftmax = hiddenLayerSize
        log.info("It didn't insert the layer before the supervised softmax.")
    else:
        linear2 = LinearLayer(act1,
                              hiddenLayerSize,
                              supHiddenLayerSize,
                              weightInitialization=GlorotUniform())
        act2 = ActivationLayer(linear2, tanh)

        layerBeforeSupSoftmax = act2
        layerSizeBeforeSupSoftmax = supHiddenLayerSize

        log.info("It inserted the layer before the supervised softmax.")

    supervisedLinear = LinearLayer(layerBeforeSupSoftmax,
                                   layerSizeBeforeSupSoftmax,
                                   tagLexicon.getLen(),
                                   weightInitialization=ZeroWeightGenerator())
    supervisedSoftmax = ActivationLayer(supervisedLinear, softmax)

    # Create the layers with the domain classifier
    gradientReversalSource = GradientReversalLayer(act1, _lambda)

    if unsupHiddenLayerSize == 0:
        layerBeforeUnsupSoftmax = gradientReversalSource
        layerSizeBeforeUnsupSoftmax = hiddenLayerSize
        log.info("It didn't insert the layer before the unsupervised softmax.")
    else:
        unsupervisedSourceLinearBf = LinearLayer(
            gradientReversalSource,
            hiddenLayerSize,
            unsupHiddenLayerSize,
            weightInitialization=GlorotUniform())
        actUnsupervisedSourceBf = ActivationLayer(unsupervisedSourceLinearBf,
                                                  tanh)

        layerBeforeUnsupSoftmax = actUnsupervisedSourceBf
        layerSizeBeforeUnsupSoftmax = unsupHiddenLayerSize

        log.info("It inserted the layer before the unsupervised softmax.")

    unsupervisedSourceLinear = LinearLayer(
        layerBeforeUnsupSoftmax,
        layerSizeBeforeUnsupSoftmax,
        domainLexicon.getLen(),
        weightInitialization=ZeroWeightGenerator())
    unsupervisedSourceSoftmax = ActivationLayer(unsupervisedSourceLinear,
                                                softmax)

    ## Target Part
    windowTarget = T.lmatrix(name="windowTarget")
    targetInput = [windowTarget]

    # Create the layers related with the extractor of features
    embeddingLayerUnsuper1 = EmbeddingLayer(
        windowTarget, embeddingLayerSrc.getParameters()[0], trainable=True)
    flattenUnsuper1 = FlattenLayer(embeddingLayerUnsuper1)

    if withCharWNN:
        log.info("Using charwnn")

        # Create the charwn
        charWindowIdxTgt = T.ltensor4(name="char_window_idx_target")
        targetInput.append(charWindowIdxTgt)

        charEmbeddingConvLayerTgt = EmbeddingConvolutionalLayer(
            charWindowIdxTgt,
            charEmbeddingConvLayerSrc.getParameters()[0],
            numMaxChar,
            convSize,
            charWindowSize,
            charEmbeddingSize,
            charAct,
            trainable=True)
        layerBeforeLinearTgt = ConcatenateLayer(
            [flattenUnsuper1, charEmbeddingConvLayerTgt])
        sizeLayerBeforeLinearTgt = wordWindowSize * (
            wordEmbedding.getEmbeddingSize() + convSize)
    else:
        layerBeforeLinearTgt = flattenUnsuper1
        sizeLayerBeforeLinearTgt = wordWindowSize * wordEmbedding.getEmbeddingSize(
        )

    w, b = linear1.getParameters()
    linearUnsuper1 = LinearLayer(layerBeforeLinearTgt,
                                 sizeLayerBeforeLinearTgt,
                                 hiddenLayerSize,
                                 W=w,
                                 b=b,
                                 trainable=True)

    if activationHiddenExtractor == "tanh":
        log.info("Using tanh in the hidden layer of extractor")
        actUnsupervised1 = ActivationLayer(linearUnsuper1, tanh)
    elif activationHiddenExtractor == "sigmoid":
        log.info("Using sigmoid in the hidden layer of extractor")
        actUnsupervised1 = ActivationLayer(linearUnsuper1, sigmoid)
    else:
        raise Exception()

    # Create the layers with the domain classifier
    grandientReversalTarget = GradientReversalLayer(actUnsupervised1, _lambda)

    if unsupHiddenLayerSize == 0:
        layerBeforeUnsupSoftmax = grandientReversalTarget
        layerSizeBeforeUnsupSoftmax = hiddenLayerSize
        log.info("It didn't insert the layer before the unsupervised softmax.")
    else:
        w, b = unsupervisedSourceLinearBf.getParameters()
        unsupervisedTargetLinearBf = LinearLayer(grandientReversalTarget,
                                                 hiddenLayerSize,
                                                 unsupHiddenLayerSize,
                                                 W=w,
                                                 b=b,
                                                 trainable=True)
        actUnsupervisedTargetLinearBf = ActivationLayer(
            unsupervisedTargetLinearBf, tanh)

        layerBeforeUnsupSoftmax = actUnsupervisedTargetLinearBf
        layerSizeBeforeUnsupSoftmax = unsupHiddenLayerSize

        log.info("It inserted the layer before the unsupervised softmax.")

    w, b = unsupervisedSourceLinear.getParameters()
    unsupervisedTargetLinear = LinearLayer(layerBeforeUnsupSoftmax,
                                           layerSizeBeforeUnsupSoftmax,
                                           domainLexicon.getLen(),
                                           W=w,
                                           b=b,
                                           trainable=True)
    unsupervisedTargetSoftmax = ActivationLayer(unsupervisedTargetLinear,
                                                softmax)

    # Set loss and prediction and retrieve all layers
    supervisedLabel = T.lvector("supervisedLabel")
    unsupervisedLabelSource = T.lvector("unsupervisedLabelSource")
    unsupervisedLabelTarget = T.lvector("unsupervisedLabelTarget")

    supervisedOutput = supervisedSoftmax.getOutput()
    supervisedPrediction = ArgmaxPrediction(1).predict(supervisedOutput)
    supervisedLoss = NegativeLogLikelihood().calculateError(
        supervisedOutput, supervisedPrediction, supervisedLabel)

    unsupervisedOutputSource = unsupervisedSourceSoftmax.getOutput()
    unsupervisedPredSource = ArgmaxPrediction(1).predict(
        unsupervisedOutputSource)
    unsupervisedLossSource = NegativeLogLikelihood().calculateError(
        unsupervisedOutputSource, None, unsupervisedLabelSource)

    unsupervisedOutputTarget = unsupervisedTargetSoftmax.getOutput()
    unsupervisedPredTarget = ArgmaxPrediction(1).predict(
        unsupervisedOutputTarget)
    unsupervisedLossTarget = NegativeLogLikelihood().calculateError(
        unsupervisedOutputTarget, None, unsupervisedLabelTarget)

    # Creates model

    if useAdagrad:
        log.info("Using ADAGRAD")
        opt = Adagrad(lr=lr, decay=decay)
    else:
        log.info("Using SGD")
        opt = SGD(lr=lr, decay=decay)

    allLayersSource = supervisedSoftmax.getLayerSet(
    ) | unsupervisedSourceSoftmax.getLayerSet()
    allLayersTarget = unsupervisedTargetSoftmax.getLayerSet()
    unsupervisedLossTarget *= float(
        trainSupervisedBatch.size()) / trainUnsupervisedDatasetBatch.size()

    supervisedTrainMetrics = [
        LossMetric("TrainSupervisedLoss", supervisedLoss),
        AccuracyMetric("TrainSupervisedAcc", supervisedLabel,
                       supervisedPrediction),
        LossMetric("TrainUnsupervisedLoss", unsupervisedLossSource),
        AccuracyMetric("TrainUnsupervisedAccuracy", unsupervisedLabelSource,
                       unsupervisedPredSource)
    ]
    unsupervisedTrainMetrics = [
        LossMetric("TrainUnsupervisedLoss", unsupervisedLossTarget),
        AccuracyMetric("TrainUnsupervisedAccuracy", unsupervisedLabelTarget,
                       unsupervisedPredTarget)
    ]

    evalMetrics = [
        AccuracyMetric("EvalAcc", supervisedLabel, supervisedPrediction)
    ]

    testMetrics = [
        AccuracyMetric("TestAcc", supervisedLabel, supervisedPrediction)
    ]

    #TODO: Não tive tempo de testar o código depois das modificações
    GradientReversalModel(sourceInput,
                          targetInput,
                          supervisedLabel,
                          unsupervisedLabelSource,
                          unsupervisedLabelTarget,
                          allLayersSource,
                          allLayersTarget,
                          opt,
                          supervisedPrediction,
                          supervisedLoss,
                          unsupervisedLossSource,
                          unsupervisedLossTarget,
                          supervisedTrainMetrics,
                          unsupervisedTrainMetrics,
                          evalMetrics,
                          testMetrics,
                          mode=None)

    # Get dev inputs and output
    log.info("Reading development examples")
    devDatasetReader = TokenLabelReader(kwargs["dev"],
                                        kwargs["token_label_separator"])
    devReader = SyncBatchIterator(devDatasetReader,
                                  inputGenerators, [outputGeneratorTag],
                                  sys.maxint,
                                  shuffle=False)

    callbacks = []
    # log.info("Usando lambda fixo: " + str(_lambda.get_value()))
    log.info("Usando lambda variado. alpha=" + str(kwargs["alpha"]) +
             " height=" + str(kwargs["height"]))
    callbacks.append(
        ChangeLambda(_lambda, kwargs["alpha"], numEpochs, kwargs["height"]))

    if kwargs["additional_dev"]:
        callbacks.append(
            AdditionalDevDataset(model, kwargs["additional_dev"],
                                 kwargs["token_label_separator"],
                                 inputGenerators, outputGeneratorTag))

    # Training Model
    model.train([trainSupervisedBatch, trainUnsupervisedDatasetBatch],
                numEpochs,
                devReader,
                callbacks=callbacks)
コード例 #3
0
def main():
    full_path = os.path.realpath(__file__)
    path, filename = os.path.split(full_path)
    logging.config.fileConfig(os.path.join(path, 'logging.conf'), defaults={})
    log = logging.getLogger(__name__)

    if len(sys.argv) != 2:
        log.error("Missing argument: <JSON config file>")
        exit(1)

    argsDict = JsonArgParser(PARAMETERS).parse(sys.argv[1])
    args = dict2obj(argsDict, 'ShortDocArguments')
    logging.getLogger(__name__).info(argsDict)

    if args.seed:
        random.seed(args.seed)
        np.random.seed(args.seed)

    lr = args.lr
    startSymbol = args.start_symbol
    endSymbol = args.end_symbol
    numEpochs = args.num_epochs
    shuffle = args.shuffle
    normalizeMethod = args.normalization
    wordWindowSize = args.word_window_size
    hiddenLayerSize = args.hidden_size
    convSize = args.conv_size

    # Load classes for filters.
    filters = []
    for filterName in args.filters:
        moduleName, className = filterName.rsplit('.', 1)
        log.info("Filtro: " + moduleName + " " + className)

        module_ = importlib.import_module(moduleName)
        filters.append(getattr(module_, className)())

    W1 = None
    b1 = None
    W2 = None
    b2 = None

    wordEmbedding = None
    if args.word_embedding:
        log.info("Reading W2v File")
        (wordLexicon, wordEmbedding) = Embedding.fromWord2Vec(args.word_embedding, unknownSymbol="__UNKNOWN__")
        wordLexicon.stopAdd()
    elif args.word_lexicon and args.word_emb_size:
        wordLexicon = Lexicon.fromTextFile(args.word_lexicon, hasUnknowSymbol=False)
        wordEmbedding = Embedding(wordLexicon, embeddingSize=args.word_emb_size)
        wordLexicon.stopAdd()
    else:
        log.error("You must provide argument word_embedding or word_lexicon and word_emb_size")

    # Create the lexicon of labels.
    labelLexicon = None
    if args.labels is not None:
        if args.label_lexicon is not None:
            log.error("Only one of the parameters label_lexicon and labels can be provided!")
            exit(1)
        labelLexicon = Lexicon.fromList(args.labels, hasUnknowSymbol=False)
    elif args.label_lexicon is not None:
        labelLexicon = Lexicon.fromTextFile(args.label_lexicon, hasUnknowSymbol=False)
    else:
        log.error("One of the parameters label_lexicon or labels must be provided!")
        exit(1)

    #
    # Build the network model (Theano graph).
    #

    # TODO: debug
    # theano.config.compute_test_value = 'warn'
    # ex = trainIterator.next()
    # inWords.tag.test_value = ex[0][0]
    # outLabel.tag.test_value = ex[1][0]

    # Matriz de entrada. Cada linha representa um token da oferta. Cada token é
    # representado por uma janela de tokens (token central e alguns tokens
    # próximos). Cada valor desta matriz corresponde a um índice que representa
    # um token no embedding.
    inWords = tensor.lmatrix("inWords")

    # Categoria correta de uma oferta.
    outLabel = tensor.lscalar("outLabel")

    # List of input tensors. One for each input layer.
    inputTensors = [inWords]

    # Whether the word embedding will be updated during training.
    embLayerTrainable = not args.fix_word_embedding

    if not embLayerTrainable:
        log.info("Not updating the word embedding!")

    # Lookup table for word features.
    embeddingLayer = EmbeddingLayer(inWords, wordEmbedding.getEmbeddingMatrix(), trainable=embLayerTrainable)

    # if not args.train and args.load_wordEmbedding:
    #     attrs = np.load(args.load_wordEmbedding)
    #     embeddingLayer.load(attrs)
    #     log.info("Loaded word embedding (shape %s) from file %s" % (
    #         str(attrs[0].shape), args.load_wordEmbedding))

    # A saída da lookup table possui 3 dimensões (numTokens, szWindow, szEmbedding).
    # Esta camada dá um flat nas duas últimas dimensões, produzindo uma saída
    # com a forma (numTokens, szWindow * szEmbedding).
    flattenInput = FlattenLayer(embeddingLayer)

    # Random weight initialization procedure.
    weightInit = GlorotUniform()

    # Convolution layer. Convolução no texto de uma oferta.
    convW = None
    convb = None

    if not args.train and args.load_conv:
        convNPY = np.load(args.load_conv)
        convW = convNPY[0]
        convb = convNPY[1]
        log.info("Loaded convolutional layer (shape %s) from file %s" % (str(convW.shape), args.load_conv))

    convLinear = LinearLayer(flattenInput,
                             wordWindowSize * wordEmbedding.getEmbeddingSize(),
                             convSize, W=convW, b=convb,
                             weightInitialization=weightInit)

    if args.conv_act:
        convOut = ActivationLayer(convLinear, tanh)
    else:
        convOut = convLinear

    # Max pooling layer.
    maxPooling = MaxPoolingLayer(convOut)

    # Hidden layer.
    if not args.train and args.load_hiddenLayer:
        hiddenNPY = np.load(args.load_hiddenLayer)
        W1 = hiddenNPY[0]
        b1 = hiddenNPY[1]
        log.info("Loaded hidden layer (shape %s) from file %s" % (str(W1.shape), args.load_hiddenLayer))

    hiddenLinear = LinearLayer(maxPooling,
                               convSize,
                               hiddenLayerSize,
                               W=W1, b=b1,
                               weightInitialization=weightInit)

    hiddenAct = ActivationLayer(hiddenLinear, tanh)

    # Entrada linear da camada softmax.
    if not args.train and args.load_softmax:
        hiddenNPY = np.load(args.load_softmax)
        W2 = hiddenNPY[0]
        b2 = hiddenNPY[1]
        log.info("Loaded softmax layer (shape %s) from file %s" % (str(W2.shape), args.load_softmax))

    sotmaxLinearInput = LinearLayer(hiddenAct,
                                    hiddenLayerSize,
                                    labelLexicon.getLen(),
                                    W=W2, b=b2,
                                    weightInitialization=ZeroWeightGenerator())

    # Softmax.
    # softmaxAct = ReshapeLayer(ActivationLayer(sotmaxLinearInput, softmax), (1, -1))
    softmaxAct = ActivationLayer(sotmaxLinearInput, softmax)

    # Prediction layer (argmax).
    prediction = ArgmaxPrediction(None).predict(softmaxAct.getOutput())

    # Loss function.
    if args.label_weights is not None and len(args.label_weights) != labelLexicon.getLen():
        log.error("Number of label weights (%d) is different from number of labels (%d)!" % (
            len(args.label_weights), labelLexicon.getLen()))
    nlloe = NegativeLogLikelihoodOneExample(weights=args.label_weights)
    loss = nlloe.calculateError(softmaxAct.getOutput()[0], prediction, outLabel)

    # Input generators: word window.
    inputGenerators = [WordWindowGenerator(wordWindowSize, wordLexicon, filters, startSymbol, endSymbol)]

    # Output generator: generate one label per offer.
    outputGenerators = [TextLabelGenerator(labelLexicon)]
    # outputGenerators = [lambda label: labelLexicon.put(label)]

    evalPerIteration = None
    if args.train:
        trainDatasetReader = ShortDocReader(args.train)
        if args.load_method == "sync":
            log.info("Reading training examples...")
            trainIterator = SyncBatchIterator(trainDatasetReader,
                                              inputGenerators,
                                              outputGenerators,
                                              - 1,
                                              shuffle=shuffle)
            wordLexicon.stopAdd()
        elif args.load_method == "async":
            log.info("Examples will be asynchronously loaded.")
            trainIterator = AsyncBatchIterator(trainDatasetReader,
                                               inputGenerators,
                                               outputGenerators,
                                               - 1,
                                               shuffle=shuffle,
                                               maxqSize=1000)
        else:
            log.error("The argument 'load_method' has an invalid value: %s." % args.load_method)
            sys.exit(1)

        labelLexicon.stopAdd()

        # Get dev inputs and output
        dev = args.dev
        evalPerIteration = args.eval_per_iteration
        if not dev and evalPerIteration > 0:
            log.error("Argument eval_per_iteration cannot be used without a dev argument.")
            sys.exit(1)

        if dev:
            log.info("Reading development examples")
            devReader = ShortDocReader(args.dev)
            devIterator = SyncBatchIterator(devReader,
                                            inputGenerators,
                                            outputGenerators,
                                            - 1,
                                            shuffle=False)
        else:
            devIterator = None
    else:
        trainIterator = None
        devIterator = None

    if normalizeMethod == "minmax":
        log.info("Normalization: minmax")
        wordEmbedding.minMaxNormalization()
    elif normalizeMethod == "mean":
        log.info("Normalization: mean normalization")
        wordEmbedding.meanNormalization()
    elif normalizeMethod == "zscore":
        log.info("Normalization: zscore normalization")
        wordEmbedding.zscoreNormalization()
    elif normalizeMethod:
        log.error("Normalization: unknown value %s" % normalizeMethod)
        sys.exit(1)

    # Decaimento da taxa de aprendizado.
    decay = None
    if args.decay == "none":
        decay = 0.0
    elif args.decay == "linear":
        decay = 1.0
    else:
        log.error("Unknown decay parameter %s." % args.decay)
        exit(1)

    # Algoritmo de aprendizado.
    if args.alg == "adagrad":
        log.info("Using Adagrad")
        opt = Adagrad(lr=lr, decay=decay)
    elif args.alg == "sgd":
        log.info("Using SGD")
        opt = SGD(lr=lr, decay=decay)
    else:
        log.error("Unknown algorithm: %s." % args.alg)
        sys.exit(1)

    # TODO: debug
    # opt.lr.tag.test_value = 0.05

    # Printing embedding information.
    dictionarySize = wordEmbedding.getNumberOfVectors()
    embeddingSize = wordEmbedding.getEmbeddingSize()
    log.info("Dictionary size: %d" % dictionarySize)
    log.info("Embedding size: %d" % embeddingSize)
    log.info("Number of categories: %d" % labelLexicon.getLen())

    # Train metrics.
    trainMetrics = None
    if trainIterator:
        trainMetrics = [
            LossMetric("TrainLoss", loss),
            AccuracyMetric("TrainAccuracy", outLabel, prediction)
        ]

    # Evaluation metrics.
    evalMetrics = None
    if devIterator:
        evalMetrics = [
            LossMetric("EvalLoss", loss),
            AccuracyMetric("EvalAccuracy", outLabel, prediction),
            FMetric("EvalFMetric", outLabel, prediction, labels=labelLexicon.getLexiconDict().values())
        ]

    # Test metrics.
    testMetrics = None
    if args.test:
        testMetrics = [
            LossMetric("TestLoss", loss),
            AccuracyMetric("TestAccuracy", outLabel, prediction),
            FMetric("TestFMetric", outLabel, prediction, labels=labelLexicon.getLexiconDict().values())
        ]

    # TODO: debug
    # mode = theano.compile.debugmode.DebugMode(optimizer=None)
    mode = None
    model = BasicModel(x=inputTensors,
                       y=[outLabel],
                       allLayers=softmaxAct.getLayerSet(),
                       optimizer=opt,
                       prediction=prediction,
                       loss=loss,
                       trainMetrics=trainMetrics,
                       evalMetrics=evalMetrics,
                       testMetrics=testMetrics,
                       mode=mode)

    # Training
    if trainIterator:
        log.info("Training")
        model.train(trainIterator, numEpochs, devIterator, evalPerIteration=evalPerIteration)

    # Saving model after training
        if args.save_wordEmbedding:
            embeddingLayer.saveAsW2V(args.save_wordEmbedding, lexicon=wordLexicon)
            log.info("Saved word to vector to file: %s" % (args.save_wordEmbedding))
        if args.save_conv:
            convLinear.save(args.save_conv)
            log.info("Saved convolution layer to file: %s" % (args.save_conv))
        if args.save_hiddenLayer:
            hiddenLinear.save(args.save_hiddenLayer)
            log.info("Saved hidden layer to file: %s" % (args.save_hiddenLayer))
        if args.save_softmax:
            sotmaxLinearInput.save(args.save_softmax)
            log.info("Saved softmax to file: %s" % (args.save_softmax))

    # Testing
    if args.test:
        log.info("Reading test examples")
        testReader = ShortDocReader(args.test)
        testIterator = SyncBatchIterator(testReader,
                                         inputGenerators,
                                         outputGenerators,
                                         - 1,
                                         shuffle=False)

        log.info("Testing")
        model.test(testIterator)
コード例 #4
0
def main(**kwargs):
    log = logging.getLogger(__name__)
    log.info(kwargs)

    lr = kwargs["lr"]
    wordWindowSize = kwargs["word_window_size"]
    startSymbol = kwargs["start_symbol"]
    endSymbol = kwargs["end_symbol"]
    numEpochs = kwargs["num_epochs"]
    encoderSize = kwargs["encoder_size"]
    batchSize = kwargs["batch_size"]
    noiseRate = kwargs["noise_rate"]
    saveModel = kwargs["save_model"]
    sync = kwargs["sync"]

    filters = []

    for filterName in kwargs["filters"]:
        moduleName, className = filterName.rsplit('.', 1)
        log.info("Usando o filtro: " + moduleName + " " + className)

        module_ = importlib.import_module(moduleName)
        filters.append(getattr(module_, className)())

    log.info("Reading W2v File")
    embedding = EmbeddingFactory().createFromW2V(kwargs["word_embedding"],
                                                 RandomUnknownStrategy())
    embedding.minMaxNormalization()

    datasetReader = TokenReader(kwargs["train"])
    inputGenerator = WordWindowGenerator(wordWindowSize, embedding, filters,
                                         startSymbol, endSymbol)

    if sync:
        log.info("Loading e pre-processing train data set")
        trainBatchGenerator = SyncBatchIterator(datasetReader,
                                                [inputGenerator], None,
                                                batchSize)
    else:
        trainBatchGenerator = AsyncBatchIterator(datasetReader,
                                                 [inputGenerator], None,
                                                 batchSize)
        # We can't stop, because the data set is reading in a asynchronous way
        # embedding.stopAdd()

    input = T.lmatrix("window_words")

    # Window of words
    embeddingLayer = EmbeddingLayer(input,
                                    embedding.getEmbeddingMatrix(),
                                    trainable=False)
    flatten = FlattenLayer(embeddingLayer)

    # Input of the hidden layer
    x = flatten.getOutput()

    # Encoder
    linear1 = LinearLayer(flatten,
                          wordWindowSize * embedding.getEmbeddingSize(),
                          encoderSize,
                          weightInitialization=SigmoidGlorot())
    act1 = ActivationLayer(linear1, sigmoid)

    # Decoder
    linear2 = TiedLayer(act1,
                        linear1.getParameters()[0],
                        wordWindowSize * embedding.getEmbeddingSize())
    act2 = ActivationLayer(linear2, sigmoid)

    # Creates the model
    mdaModel = Model(input, x, act2)

    sgd = SGD(lr, decay=0.0)
    encoderW = linear1.getParameters()[0]
    encoderOutput = act1.getOutput()
    loss = MDALoss(MeanSquaredError(), True, noiseRate, encoderW,
                   encoderOutput, x)

    log.info("Compiling the model")
    mdaModel.compile(sgd, loss)

    cbs = []

    if saveModel:
        writter = MDAModelWritter(saveModel, linear1, linear2)
        cbs.append(SaveModelCallback(writter, "loss", False))
    log.info("Traning model")
    mdaModel.train(trainBatchGenerator, numEpochs, callbacks=cbs)
コード例 #5
0
def mainWnnNer(args):
    # Initializing parameters.
    log = logging.getLogger(__name__)

    if args.seed:
        random.seed(args.seed)
        np.random.seed(args.seed)

    log.info({"type": "args", "args": args})

    # GPU configuration.
    log.info({"floatX": str(theano.config.floatX), "device": str(theano.config.device)})

    # Parameters.
    # lr = args.lr
    # startSymbol = args.start_symbol
    # endSymbol = args.end_symbol
    # numEpochs = args.num_epochs
    # shuffle = args.shuffle
    # normalization = args.normalization
    # wordWindowSize = args.word_window_size
    # hiddenLayerSize = args.hidden_size
    # hiddenActFunctionName = args.hidden_activation_function
    # embeddingSize = args.word_emb_size
    # batchSize = args.batch_size
    # structGrad = args.struct_grad
    # charStructGrad = args.char_struct_grad
    #
    # charEmbeddingSize = args.char_emb_size
    # charWindowSize = args.char_window_size
    # charConvSize = args.conv_size

    # Word filters.
    log.info("Loading word filters...")
    wordFilters = getFilters(args.word_filters, log)

    # Loading/creating word lexicon and word embedding.
    if args.word_embedding is not None:
        log.info("Loading word embedding...")
        wordLexicon, wordEmbedding = Embedding.fromWord2Vec(args.word_embedding, "UUUNKKK", "word_lexicon")
    elif args.word_lexicon is not None:
        log.info("Loading word lexicon...")
        wordLexicon = Lexicon.fromTextFile(args.word_lexicon, True, "word_lexicon")
        wordEmbedding = Embedding(wordLexicon, vectors=None, embeddingSize=args.word_emb_size)
    else:
        log.error("You need to set one of these parameters: load_model, word_embedding or word_lexicon")
        sys.exit(1)

    # Loading char lexicon.
    log.info("Loading char lexicon...")
    charLexicon = Lexicon.fromTextFile(args.char_lexicon, True, "char_lexicon")

    # Character embedding.
    charEmbedding = Embedding(charLexicon, vectors=None, embeddingSize=args.char_emb_size)

    # Loading label lexicon.
    log.info("Loading label lexicon...")
    labelLexicon = Lexicon.fromTextFile(args.label_file, False, lexiconName="label_lexicon")

    # Normalize the word embedding
    if args.normalization is not None:
        normFactor = 1
        if args.norm_factor is not None:
            normFactor = args.norm_factor

        if args.normalization == "minmax":
            log.info("Normalizing word embedding: minmax")
            wordEmbedding.minMaxNormalization(norm_coef=normFactor)
        elif args.normalization == "mean":
            log.info("Normalizing word embedding: mean")
            wordEmbedding.meanNormalization(norm_coef=normFactor)
        else:
            log.error("Unknown normalization method: %s" % args.normalization)
            sys.exit(1)
    elif args.normFactor is not None:
        log.error("Parameter norm_factor cannot be present without normalization.")
        sys.exit(1)

    dictionarySize = wordEmbedding.getNumberOfVectors()
    log.info("Size of word lexicon is %d and word embedding size is %d" % (dictionarySize, args.word_emb_size))

    # Setup the input and (golden) output generators (readers).
    inputGenerators = [
        WordWindowGenerator(args.word_window_size, wordLexicon, wordFilters, args.start_symbol, args.end_symbol),
        CharacterWindowGenerator(lexicon=charLexicon, numMaxChar=20, charWindowSize=args.char_window_size,
                                 wrdWindowSize=args.word_window_size, artificialChar="ART_CHAR", startPadding="</s>",
                                 startPaddingWrd=args.start_symbol, endPaddingWrd=args.end_symbol,
                                 filters=getFilters([], log))
    ]
    outputGenerator = LabelGenerator(labelLexicon)

    if args.cv is not None:
        log.info("Reading training examples...")
        trainIterator = SyncBatchIterator(TokenLabelPerLineReader(args.train, labelTknSep='\t'), inputGenerators,
                                          [outputGenerator], args.batch_size, shuffle=args.shuffle,
                                          numCVFolds=args.cv.numFolds)
        cvGenerators = trainIterator.getCVGenerators()
        iFold = 0
        numFolds = len(cvGenerators)
        for train, dev in cvGenerators:
            log.info({"cv": {"fold": iFold, "numFolds": numFolds}})
            trainNetwork(args, log, trainIterator=train, devIterator=dev, wordEmbedding=wordEmbedding,
                         charEmbedding=charEmbedding, borrow=False, labelLexicon=labelLexicon)
    else:
        log.info("Reading training examples...")
        trainIterator = SyncBatchIterator(TokenLabelPerLineReader(args.train, labelTknSep='\t'), inputGenerators,
                                          [outputGenerator], args.batch_size, shuffle=args.shuffle)

        # Get dev inputs and (golden) outputs.
        devIterator = None
        if args.dev is not None:
            log.info("Reading development examples")
            devIterator = SyncBatchIterator(TokenLabelPerLineReader(args.dev, labelTknSep='\t'), inputGenerators,
                                            [outputGenerator], sys.maxint, shuffle=False)

        trainNetwork(args, log, trainIterator, devIterator, wordEmbedding, charEmbedding, borrow=True,
                     labelLexicon=labelLexicon)

    # Testing.
    if args.test:
        log.info("Reading test dataset...")
        testIterator = SyncBatchIterator(TokenLabelPerLineReader(args.test, labelTknSep='\t'), inputGenerators,
                                         [outputGenerator], sys.maxint, shuffle=False)

        log.info("Testing...")
        wnnModel.test(testIterator)

    log.info("Done!")
コード例 #6
0
def main(args):
    log = logging.getLogger(__name__)

    if args.seed:
        random.seed(args.seed)
        np.random.seed(args.seed)

    lr = args.lr
    startSymbol = args.start_symbol
    endSymbol = args.end_symbol
    numEpochs = args.num_epochs
    shuffle = args.shuffle
    normalizeMethod = args.normalization
    wordWindowSize = args.word_window_size
    hiddenLayerSize = args.hidden_size
    convSize = args.conv_size

    # Load classes for filters.
    filters = []
    for filterName in args.filters:
        moduleName, className = filterName.rsplit('.', 1)
        log.info("Filtro: " + moduleName + " " + className)

        module_ = importlib.import_module(moduleName)
        filters.append(getattr(module_, className)())

    loadPath = args.load_model

    if loadPath:
        with codecs.open(loadPath + ".param", "r",
                         encoding="utf-8") as paramsFile:
            param = json.load(paramsFile, encoding="utf-8")

        hiddenActFunctionName = param['hiddenActFunction']
        hiddenActFunction = method_name(hiddenActFunctionName)

        # Loading Embedding
        log.info("Loading Model")
        wordEmbedding = EmbeddingFactory().createFromW2V(
            loadPath + ".wv", ChosenUnknownStrategy(param["unknown"]))
        labelLexicon = Lexicon()

        for l in param["labels"]:
            labelLexicon.put(l)

        labelLexicon.stopAdd()

        # Loading model
        labelWeights = np.load(loadPath + ".npy").item(0)

        W1 = labelWeights["W_Hidden"]
        b1 = labelWeights["b_Hidden"]
        W2 = labelWeights["W_Softmax"]
        b2 = labelWeights["b_Softmax"]

        hiddenLayerSize = b1.shape[0]
    else:
        W1 = None
        b1 = None
        W2 = None
        b2 = None
        hiddenActFunctionName = args.hidden_activation_function
        hiddenActFunction = method_name(hiddenActFunctionName)

        if args.word_embedding:
            log.info("Reading W2v File")
            wordEmbedding = EmbeddingFactory().createFromW2V(
                args.word_embedding, RandomUnknownStrategy())
            wordEmbedding.stopAdd()
        elif args.hash_lex_size:
            wordEmbedding = RandomEmbedding(args.word_emb_size,
                                            RandomUnknownStrategy(),
                                            HashLexicon(args.hash_lex_size))
        else:
            wordEmbedding = EmbeddingFactory().createRandomEmbedding(
                args.word_emb_size)

        # Get the inputs and output
        if args.labels:
            labelLexicon = createLexiconUsingFile(args.labels)
        else:
            labelLexicon = Lexicon()

        if args.load_hidden_layer:
            # Loading Hidden Layer
            log.info("Loading Hidden Layer")

            hl = np.load(args.load_hidden_layer).item(0)

            W1 = hl["W_Encoder"]
            b1 = hl["b_Encoder"]

            hiddenLayerSize = b1.shape[0]

    #
    # Build the network model (Theano graph).
    #

    # TODO: debug
    # theano.config.compute_test_value = 'warn'
    # ex = trainIterator.next()
    # inWords.tag.test_value = ex[0][0]
    # outLabel.tag.test_value = ex[1][0]

    # Matriz de entrada. Cada linha representa um token da oferta. Cada token é
    # representado por uma janela de tokens (token central e alguns tokens
    # próximos). Cada valor desta matriz corresponde a um índice que representa
    # um token no embedding.
    inWords = T.lmatrix("inWords")

    # Categoria correta de uma oferta.
    outLabel = T.lscalar("outLabel")

    # List of input tensors. One for each input layer.
    inputTensors = [inWords]

    # Whether the word embedding will be updated during training.
    embLayerTrainable = not args.fix_word_embedding

    if not embLayerTrainable:
        log.info("Not updating the word embedding!")

    # Lookup table for word features.
    embeddingLayer = EmbeddingLayer(inWords,
                                    wordEmbedding.getEmbeddingMatrix(),
                                    trainable=embLayerTrainable)

    # A saída da lookup table possui 3 dimensões (numTokens, szWindow, szEmbedding).
    # Esta camada dá um flat nas duas últimas dimensões, produzindo uma saída
    # com a forma (numTokens, szWindow * szEmbedding).
    flattenInput = FlattenLayer(embeddingLayer)

    # Random weight initialization procedure.
    weightInit = SigmoidGlorot(
    ) if hiddenActFunction == sigmoid else GlorotUniform()

    # Convolution layer. Convolução no texto de uma oferta.
    convLinear = LinearLayer(flattenInput,
                             wordWindowSize * wordEmbedding.getEmbeddingSize(),
                             convSize,
                             W=None,
                             b=None,
                             weightInitialization=weightInit)

    # Max pooling layer.
    maxPooling = MaxPoolingLayer(convLinear)

    # List of input layers (will be concatenated).
    inputLayers = [maxPooling]

    # Generate word windows.
    wordWindowFeatureGenerator = WordWindowGenerator(wordWindowSize,
                                                     wordEmbedding, filters,
                                                     startSymbol, endSymbol)

    # List of input generators.
    inputGenerators = [
        lambda offer: wordWindowFeatureGenerator(offer["tokens"])
    ]

    concatenatedSize = convSize

    # Additional features.
    if args.categorical_features is not None:
        log.info("Using categorical features: %s" %
                 str([ftr[0] for ftr in args.categorical_features]))
        for ftr in args.categorical_features:
            concatenatedSize += ftr[2]
            ftrLexicon = createLexiconUsingFile(ftr[1])
            ftrEmbedding = RandomEmbedding(
                embeddingSize=ftr[2],
                unknownGenerateStrategy=RandomUnknownStrategy(),
                lexicon=ftrLexicon,
            )
            ftrInput = T.lscalar("in_" + ftr[0])
            ftrLayer = EmbeddingLayer(ftrInput,
                                      ftrEmbedding.getEmbeddingMatrix())

            inputGenerators.append(
                lambda offer: ftrLexicon.put(offer[ftr[0]].strip().lower()))
            inputTensors.append(ftrInput)
            inputLayers.append(ftrLayer)

    log.info("Input layers: %s" % str(inputLayers))

    # Concatenate all input layers, when there are more thean one input layer.
    concatenatedInLayers = maxPooling if len(
        inputLayers) == 1 else ConcatenateLayer(inputLayers, axis=0)

    if args.include_hidden_layer:
        # Hidden layer.
        hiddenLinear = LinearLayer(concatenatedInLayers,
                                   concatenatedSize,
                                   hiddenLayerSize,
                                   W=W1,
                                   b=b1,
                                   weightInitialization=weightInit)
        hiddenAct = ActivationLayer(hiddenLinear, hiddenActFunction)
    else:
        # Do not use a hidden layer.
        log.info("Not using hidden layer!")
        hiddenAct = concatenatedInLayers
        hiddenLayerSize = concatenatedSize

    # Entrada linear da camada softmax.
    sotmaxLinearInput = LinearLayer(hiddenAct,
                                    hiddenLayerSize,
                                    labelLexicon.getLen(),
                                    W=W2,
                                    b=b2,
                                    weightInitialization=ZeroWeightGenerator())
    # Softmax.
    # softmaxAct = ReshapeLayer(ActivationLayer(sotmaxLinearInput, softmax), (1, -1))
    softmaxAct = ActivationLayer(sotmaxLinearInput, softmax)

    # Prediction layer (argmax).
    prediction = ArgmaxPrediction(None).predict(softmaxAct.getOutput())

    # Class weights.
    labelWeights = None
    if args.labels_probs:
        numLabels = labelLexicon.getLen()
        labelWeights = np.zeros(numLabels, dtype=theano.config.floatX)
        if args.labels_probs.startswith("@"):
            # Load the dictionary from a JSON file.
            with codecs.open(args.labels_probs[1:], mode="r",
                             encoding="utf8") as f:
                labelDistribution = json.load(f)
        else:
            # The argument value is already a JSON.
            labelDistribution = json.loads(args.labels_probs)

        for k, v in labelDistribution.items():
            # The weight of a class is inversely-proportional to its frequency.
            labelWeights[labelLexicon.getLexiconIndex(k)] = 1.0 / v

        if args.labels_weights_log:
            # Attenuate weights for highly unbalanced classes.
            labelWeights = np.log(labelWeights)

        log.info("Label weights: " + str(labelWeights))

    # Loss function.
    loss = NegativeLogLikelihoodOneExample(labelWeights).calculateError(
        softmaxAct.getOutput()[0], prediction, outLabel)

    # Output generator: generate one label per offer.
    outputGenerators = [TextLabelGenerator(labelLexicon)]

    if args.train:
        trainDatasetReader = OfertasReader(args.train)
        if args.load_method == "sync":
            log.info("Reading training examples...")
            trainIterator = SyncBatchIterator(trainDatasetReader,
                                              intputGenerators,
                                              outputGenerators,
                                              -1,
                                              shuffle=shuffle)
            wordEmbedding.stopAdd()
        elif args.load_method == "async":
            log.info("Examples will be asynchronously loaded.")
            trainIterator = AsyncBatchIterator(trainDatasetReader,
                                               inputGenerators,
                                               outputGenerators,
                                               -1,
                                               shuffle=shuffle,
                                               maxqSize=1000)
        else:
            log.error("The argument 'load_method' has an invalid value: %s." %
                      args.load_method)
            sys.exit(1)

        labelLexicon.stopAdd()

        # Get dev inputs and output
        dev = args.dev
        evalPerIteration = args.eval_per_iteration
        if not dev and evalPerIteration > 0:
            log.error(
                "Argument eval_per_iteration cannot be used without a dev argument."
            )
            sys.exit(1)

        if dev:
            log.info("Reading development examples")
            devReader = OfertasReader(args.dev)
            devIterator = SyncBatchIterator(devReader,
                                            inputGenerators,
                                            outputGenerators,
                                            -1,
                                            shuffle=False)
        else:
            devIterator = None
    else:
        trainIterator = None
        devIterator = None

    if normalizeMethod == "minmax":
        log.info("Normalization: minmax")
        wordEmbedding.minMaxNormalization()
    elif normalizeMethod == "mean":
        log.info("Normalization: mean normalization")
        wordEmbedding.meanNormalization()
    elif normalizeMethod == "zscore":
        log.info("Normalization: zscore normalization")
        wordEmbedding.zscoreNormalization()
    elif normalizeMethod:
        log.error("Normalization: unknown value %s" % normalizeMethod)
        sys.exit(1)

    if normalizeMethod is not None and loadPath is not None:
        log.warn(
            "The word embedding of model was normalized. This can change the result of test."
        )

    #     if kwargs["lambda"]:
    #         _lambda = kwargs["lambda"]
    #         log.info("Using L2 with lambda= %.2f", _lambda)
    #         loss += _lambda * (T.sum(T.square(hiddenLinear.getParameters()[0])))

    # Decaimento da taxa de aprendizado.
    decay = 0.0
    if args.decay == "linear":
        decay = 1.0

    # Algoritmo de aprendizado.
    if args.alg == "adagrad":
        log.info("Using Adagrad")
        opt = Adagrad(lr=lr, decay=decay)
    elif args.alg == "sgd":
        log.info("Using SGD")
        opt = SGD(lr=lr, decay=decay)
    else:
        log.error(
            "Unknown algorithm: %s. Expected values are: adagrad or sgd." %
            args.alg)
        sys.exit(1)

    # TODO: debug
    # opt.lr.tag.test_value = 0.05

    # Printing embedding information.
    dictionarySize = wordEmbedding.getNumberOfVectors()
    embeddingSize = wordEmbedding.getEmbeddingSize()
    log.info("Dictionary size: %d" % dictionarySize)
    log.info("Embedding size: %d" % embeddingSize)
    log.info("Number of categories: %d" % labelLexicon.getLen())

    # Train metrics.
    trainMetrics = None
    if trainIterator:
        trainMetrics = [
            LossMetric("TrainLoss", loss),
            AccuracyMetric("TrainAccuracy", outLabel, prediction)
        ]

    # Evaluation metrics.
    evalMetrics = None
    if devIterator:
        evalMetrics = [
            LossMetric("EvalLoss", loss),
            AccuracyMetric("EvalAccuracy", outLabel, prediction),
            FMetric("EvalFMetric",
                    outLabel,
                    prediction,
                    labels=labelLexicon.getLexiconDict().values())
        ]

    # Test metrics.
    testMetrics = None
    if args.test:
        testMetrics = [
            LossMetric("TestLoss", loss),
            AccuracyMetric("TestAccuracy", outLabel, prediction),
            FMetric("TestFMetric",
                    outLabel,
                    prediction,
                    labels=labelLexicon.getLexiconDict().values())
        ]

        if args.test_probs:
            # Append predicted probabilities for the test set.
            testMetrics.append(
                PredictedProbabilities("TestProbs", softmaxAct.getOutput()))
    else:
        if args.test_probs:
            log.error(
                "The option test_probs requires a test dataset (option test).")
            sys.exit(1)

    # TODO: debug
    # mode = theano.compile.debugmode.DebugMode(optimizer=None)
    mode = None
    model = Model(x=inputTensors,
                  y=[outLabel],
                  allLayers=softmaxAct.getLayerSet(),
                  optimizer=opt,
                  prediction=prediction,
                  loss=loss,
                  trainMetrics=trainMetrics,
                  evalMetrics=evalMetrics,
                  testMetrics=testMetrics,
                  mode=mode)

    # Training
    if trainIterator:
        callback = []

        if args.save_model:
            savePath = args.save_model
            modelWriter = OfertasModelWritter(savePath, embeddingLayer,
                                              hiddenLinear, sotmaxLinearInput,
                                              wordEmbedding, labelLexicon,
                                              hiddenActFunctionName)
            callback.append(SaveModelCallback(modelWriter, "eval_acc", True))

        log.info("Training")
        model.train(trainIterator,
                    numEpochs,
                    devIterator,
                    evalPerIteration=evalPerIteration,
                    callbacks=callback)

    # Testing
    if args.test:
        log.info("Reading test examples")
        testReader = OfertasReader(args.test)
        testIterator = SyncBatchIterator(testReader,
                                         inputGenerators,
                                         outputGenerators,
                                         -1,
                                         shuffle=False)

        log.info("Testing")
        model.test(testIterator)
コード例 #7
0
def main(args):
    log = logging.getLogger(__name__)

    if args.seed:
        random.seed(args.seed)
        np.random.seed(args.seed)

    lr = args.lr
    startSymbol = args.start_symbol
    endSymbol = args.end_symbol
    numEpochs = args.num_epochs
    shuffle = args.shuffle
    normalizeMethod = args.wv_normalization
    wordWindowSize = args.word_window_size
    hiddenLayerSize = args.hidden_size
    convSize = args.conv_size

    # Load classes for filters.
    filters = []
    for filterName in args.filters:
        moduleName, className = filterName.rsplit('.', 1)
        log.info("Filtro: " + moduleName + " " + className)
        module_ = importlib.import_module(moduleName)
        filters.append(getattr(module_, className)())

    W1 = None
    b1 = None
    W2 = None
    b2 = None
    hiddenActFunction = tanh

    if args.word_embedding:
        log.info("Reading W2v File")
        (lexicon,
         wordEmbedding) = Embedding.fromWord2Vec(args.word_embedding,
                                                 unknownSymbol='unknown')
        lexicon.stopAdd()
    else:
        wordEmbedding = EmbeddingFactory().createRandomEmbedding(
            args.word_emb_size)

    # Get the inputs and output
    if args.labels:
        labelLexicon = Lexicon.fromTextFile(args.labels, hasUnknowSymbol=False)
    else:
        labelLexicon = Lexicon()

    #
    # Build the network model (Theano graph).
    #

    # TODO: debug
    # theano.config.compute_test_value = 'warn'
    # ex = trainIterator.next()
    # inWords.tag.test_value = ex[0][0]
    # outLabel.tag.test_value = ex[1][0]

    # Matriz de entrada. Cada linha representa um token da oferta. Cada token é
    # representado por uma janela de tokens (token central e alguns tokens
    # próximos). Cada valor desta matriz corresponde a um índice que representa
    # um token no embedding.
    inWords = T.lmatrix("inWords")

    # Categoria correta de uma oferta.
    outLabel = T.lscalar("outLabel")

    # List of input tensors. One for each input layer.
    inputTensors = [inWords]

    # Whether the word embedding will be updated during training.
    embLayerTrainable = not args.fix_word_embedding

    if not embLayerTrainable:
        log.info("Not updating the word embedding!")

    # Lookup table for word features.
    embeddingLayer = EmbeddingLayer(inWords,
                                    wordEmbedding.getEmbeddingMatrix(),
                                    trainable=embLayerTrainable)

    # A saída da lookup table possui 3 dimensões (numTokens, szWindow, szEmbedding).
    # Esta camada dá um flat nas duas últimas dimensões, produzindo uma saída
    # com a forma (numTokens, szWindow * szEmbedding).
    flattenInput = FlattenLayer(embeddingLayer)

    # Random weight initialization procedure.
    weightInit = SigmoidGlorot(
    ) if hiddenActFunction == sigmoid else GlorotUniform()

    # Convolution layer. Convolução no texto de um documento.
    convLinear = LinearLayer(flattenInput,
                             wordWindowSize * wordEmbedding.getEmbeddingSize(),
                             convSize,
                             W=None,
                             b=None,
                             weightInitialization=weightInit)

    # Max pooling layer.
    maxPooling = MaxPoolingLayer(convLinear)

    # Generate word windows.
    wordWindowFeatureGenerator = WordWindowGenerator(wordWindowSize, lexicon,
                                                     filters, startSymbol,
                                                     endSymbol)

    # List of input generators.
    inputGenerators = [wordWindowFeatureGenerator]

    # Hidden layer.
    hiddenLinear = LinearLayer(maxPooling,
                               convSize,
                               hiddenLayerSize,
                               W=W1,
                               b=b1,
                               weightInitialization=weightInit)
    hiddenAct = ActivationLayer(hiddenLinear, hiddenActFunction)

    # Entrada linear da camada softmax.
    sotmaxLinearInput = LinearLayer(hiddenAct,
                                    hiddenLayerSize,
                                    labelLexicon.getLen(),
                                    W=W2,
                                    b=b2,
                                    weightInitialization=ZeroWeightGenerator())
    # Softmax.
    # softmaxAct = ReshapeLayer(ActivationLayer(sotmaxLinearInput, softmax), (1, -1))
    softmaxAct = ActivationLayer(sotmaxLinearInput, softmax)

    # Prediction layer (argmax).
    prediction = ArgmaxPrediction(None).predict(softmaxAct.getOutput())

    # Loss function.
    loss = NegativeLogLikelihoodOneExample().calculateError(
        softmaxAct.getOutput()[0], prediction, outLabel)

    # Output generator: generate one label per offer.
    outputGenerators = [TextLabelGenerator(labelLexicon)]

    if args.train:
        trainDatasetReader = DocReader(args.train)

        log.info("Reading training examples...")
        trainIterator = SyncBatchIterator(trainDatasetReader,
                                          inputGenerators,
                                          outputGenerators,
                                          -1,
                                          shuffle=shuffle)
        lexicon.stopAdd()
        labelLexicon.stopAdd()

        # Get dev inputs and output
        dev = args.dev
        evalPerIteration = args.eval_per_iteration
        if not dev and evalPerIteration > 0:
            log.error(
                "Argument eval_per_iteration cannot be used without a dev argument."
            )
            sys.exit(1)

        if dev:
            log.info("Reading development examples")
            devReader = DocReader(args.dev)
            devIterator = SyncBatchIterator(devReader,
                                            inputGenerators,
                                            outputGenerators,
                                            -1,
                                            shuffle=False)
        else:
            devIterator = None
    else:
        trainIterator = None
        devIterator = None

    if normalizeMethod == "minmax":
        log.info("Normalization: minmax")
        wordEmbedding.minMaxNormalization()
    elif normalizeMethod == "mean":
        log.info("Normalization: mean normalization")
        wordEmbedding.meanNormalization()
    elif normalizeMethod == "zscore":
        log.info("Normalization: zscore normalization")
        wordEmbedding.zscoreNormalization()
    elif normalizeMethod:
        log.error("Normalization: unknown value %s" % normalizeMethod)
        sys.exit(1)

    # Decaimento da taxa de aprendizado.
    if args.decay == "linear":
        decay = 1.0
    elif args.decay == "none":
        decay = 0.0
    else:
        log.error("Unknown decay strategy %s. Expected: none or linear." %
                  args.decay)
        sys.exit(1)

    # Algoritmo de aprendizado.
    if args.alg == "adagrad":
        log.info("Using Adagrad")
        opt = Adagrad(lr=lr, decay=decay)
    elif args.alg == "sgd":
        log.info("Using SGD")
        opt = SGD(lr=lr, decay=decay)
    else:
        log.error(
            "Unknown algorithm: %s. Expected values are: adagrad or sgd." %
            args.alg)
        sys.exit(1)

    # TODO: debug
    # opt.lr.tag.test_value = 0.05

    # Printing embedding information.
    dictionarySize = wordEmbedding.getNumberOfVectors()
    embeddingSize = wordEmbedding.getEmbeddingSize()
    log.info("Dictionary size: %d" % dictionarySize)
    log.info("Embedding size: %d" % embeddingSize)
    log.info("Number of categories: %d" % labelLexicon.getLen())

    # Train metrics.
    trainMetrics = None
    if trainIterator:
        trainMetrics = [
            LossMetric("TrainLoss", loss),
            AccuracyMetric("TrainAccuracy", outLabel, prediction)
        ]

    # Evaluation metrics.
    evalMetrics = None
    if devIterator:
        evalMetrics = [
            LossMetric("EvalLoss", loss),
            AccuracyMetric("EvalAccuracy", outLabel, prediction)
        ]

    # Test metrics.
    testMetrics = None
    if args.test:
        testMetrics = [
            LossMetric("TestLoss", loss),
            AccuracyMetric("TestAccuracy", outLabel, prediction)
        ]

    # TODO: debug
    # mode = theano.compile.debugmode.DebugMode(optimizer=None)
    mode = None
    model = BasicModel(x=inputTensors,
                       y=[outLabel],
                       allLayers=softmaxAct.getLayerSet(),
                       optimizer=opt,
                       prediction=prediction,
                       loss=loss,
                       trainMetrics=trainMetrics,
                       evalMetrics=evalMetrics,
                       testMetrics=testMetrics,
                       mode=mode)

    # Training
    if trainIterator:
        log.info("Training")
        model.train(trainIterator,
                    numEpochs,
                    devIterator,
                    evalPerIteration=evalPerIteration)

    # Testing
    if args.test:
        log.info("Reading test examples")
        testReader = DocReader(args.test)
        testIterator = SyncBatchIterator(testReader,
                                         inputGenerators,
                                         outputGenerators,
                                         -1,
                                         shuffle=False)

        log.info("Testing")
        model.test(testIterator)
コード例 #8
0
def main(**kwargs):
    log = logging.getLogger(__name__)
    log.info(kwargs)

    if kwargs["seed"] != None:
        random.seed(kwargs["seed"])
        np.random.seed(kwargs["seed"])

    filters = []

    for filterName in kwargs["filters"]:
        moduleName, className = filterName.rsplit('.', 1)
        log.info("Usando o filtro: " + moduleName + " " + className)

        module_ = importlib.import_module(moduleName)
        filters.append(getattr(module_, className)())

    # Get the inputs and output
    wordWindowSize = kwargs["word_window_size"]
    hiddenLayerSize = kwargs["hidden_size"]
    batchSize = kwargs["batch_size"]
    startSymbol = kwargs["start_symbol"]
    numEpochs = kwargs["num_epochs"]
    lr = kwargs["lr"]
    labelLexicon = createLexiconUsingFile(kwargs["label_file"])

    log.info("Reading training examples")

    log.info("Reading W2v File1")
    embedding1 = EmbeddingFactory().createFromW2V(kwargs["word_embedding1"],
                                                  RandomUnknownStrategy())

    # Supervised part
    # Learner1
    input1 = T.lmatrix(name="input1")

    embeddingLayer1 = EmbeddingLayer(input1,
                                     embedding1.getEmbeddingMatrix(),
                                     trainable=True)
    flatten1 = FlattenLayer(embeddingLayer1)

    linear11 = LinearLayer(flatten1,
                           wordWindowSize * embedding1.getEmbeddingSize(),
                           hiddenLayerSize,
                           weightInitialization=GlorotUniform())
    act11 = ActivationLayer(linear11, tanh)

    linear12 = LinearLayer(act11,
                           hiddenLayerSize,
                           labelLexicon.getLen(),
                           weightInitialization=ZeroWeightGenerator())
    act12 = ActivationLayer(linear12, softmax)

    ## Learner2
    log.info("Reading W2v File2")
    embedding2 = EmbeddingFactory().createFromW2V(kwargs["word_embedding2"],
                                                  RandomUnknownStrategy())

    input2 = T.lmatrix(name="input2")

    embeddingLayer2 = EmbeddingLayer(input2,
                                     embedding2.getEmbeddingMatrix(),
                                     trainable=True)
    flatten2 = FlattenLayer(embeddingLayer2)

    linear21 = LinearLayer(flatten2,
                           wordWindowSize * embedding2.getEmbeddingSize(),
                           hiddenLayerSize,
                           weightInitialization=GlorotUniform())
    act21 = ActivationLayer(linear21, tanh)

    linear22 = LinearLayer(act21,
                           hiddenLayerSize,
                           labelLexicon.getLen(),
                           weightInitialization=ZeroWeightGenerator())
    act22 = ActivationLayer(linear22, softmax)

    y = T.lvector("y")

    # Set loss and prediction and retrieve all layers
    output1 = act12.getOutput()
    prediction1 = ArgmaxPrediction(1).predict(output1)
    loss1 = NegativeLogLikelihood().calculateError(output1, prediction1, y)

    if kwargs["l2"][0]:
        _lambda1 = kwargs["l2"][0]
        log.info("Using L2 with lambda= %.2f", _lambda1)
        loss1 += _lambda1 * (T.sum(T.square(linear11.getParameters()[0])))

    output2 = act22.getOutput()
    prediction2 = ArgmaxPrediction(1).predict(output2)
    loss2 = NegativeLogLikelihood().calculateError(output2, prediction2, y)

    if kwargs["l2"][1]:
        _lambda2 = kwargs["l2"][1]
        log.info("Using L2 with lambda= %.2f", _lambda2)
        loss2 += _lambda2 * (T.sum(T.square(linear21.getParameters()[0])))

    loss = loss1 + loss2

    ## CoLearningPrediction
    output = T.stack([linear12.getOutput(), linear22.getOutput()])
    # return T.argmax(output, 2)[T.argmax(T.max(output, 2), 0),T.arange(output.shape[1])]
    average = T.mean(output, 0)
    prediction = ArgmaxPrediction(1).predict(
        ActivationLayer(average, softmax).getOutput())
    # prediction = CoLearningWnnPrediction().predict([output1, output2])

    supervisedModeUnit = ModelUnit("supervised_wnn", [input1, input2],
                                   y,
                                   loss,
                                   prediction=prediction)

    # Unsupervised part

    ## Learner1
    inputUnsuper1 = T.lmatrix(name="input_unsupervised_1")

    embeddingLayerUnsuper1 = EmbeddingLayer(inputUnsuper1,
                                            embeddingLayer1.getParameters()[0],
                                            trainable=True)

    flattenUnsuper1 = FlattenLayer(embeddingLayerUnsuper1)

    w, b = linear11.getParameters()
    linearUnsuper11 = LinearLayer(flattenUnsuper1,
                                  wordWindowSize *
                                  embedding1.getEmbeddingSize(),
                                  hiddenLayerSize,
                                  W=w,
                                  b=b)
    actUnsupervised11 = ActivationLayer(linearUnsuper11, tanh)

    w, b = linear12.getParameters()
    linearUnsuper12 = LinearLayer(actUnsupervised11,
                                  hiddenLayerSize,
                                  labelLexicon.getLen(),
                                  W=w,
                                  b=b)
    actUnsuper12 = ActivationLayer(linearUnsuper12, softmax)

    ## Learner2
    inputUnsuper2 = T.lmatrix(name="input_unsupervised_2")

    embeddingLayerUnsuper2 = EmbeddingLayer(inputUnsuper2,
                                            embeddingLayer2.getParameters()[0],
                                            trainable=True)
    flattenUnsuper2 = FlattenLayer(embeddingLayerUnsuper2)

    w, b = linear21.getParameters()
    linearUnsuper21 = LinearLayer(flattenUnsuper2,
                                  wordWindowSize *
                                  embedding2.getEmbeddingSize(),
                                  hiddenLayerSize,
                                  W=w,
                                  b=b)
    actUnsuper21 = ActivationLayer(linearUnsuper21, tanh)

    w, b = linear22.getParameters()
    linearUnsuper22 = LinearLayer(actUnsuper21,
                                  hiddenLayerSize,
                                  labelLexicon.getLen(),
                                  W=w,
                                  b=b)
    actUnsuper22 = ActivationLayer(linearUnsuper22, softmax)

    # Set loss and prediction and retrieve all layers
    outputUns1 = actUnsuper12.getOutput()
    predictionUns1 = ArgmaxPrediction(1).predict(outputUns1)

    outputUns2 = actUnsuper22.getOutput()
    predictionUns2 = ArgmaxPrediction(1).predict(outputUns2)
    #
    # unsupervisedLoss = kwargs["lambda"] * (
    #         NegativeLogLikelihood().calculateError(outputUns1, predictionUns1, predictionUns2) +
    #         NegativeLogLikelihood().calculateError(outputUns2, predictionUns2, predictionUns1))

    _lambdaShared = theano.shared(value=kwargs["lambda"],
                                  name='lambda',
                                  borrow=True)

    unsupervisedLoss = _lambdaShared * (NegativeLogLikelihood().calculateError(
        outputUns1, predictionUns1, predictionUns2) + NegativeLogLikelihood(
        ).calculateError(outputUns2, predictionUns2, predictionUns1))

    unsupervisedUnit = ModelUnit("unsupervised_wnn",
                                 [inputUnsuper1, inputUnsuper2],
                                 None,
                                 unsupervisedLoss,
                                 yWillBeReceived=False)

    # Creates model
    model = CoLearningModel(kwargs["loss_uns_epoch"])

    model.addTrainingModelUnit(supervisedModeUnit, metrics=["loss", "acc"])
    model.addTrainingModelUnit(unsupervisedUnit, metrics=["loss"])

    model.setEvaluatedModelUnit(supervisedModeUnit, metrics=["acc"])

    # Compile Model
    opt1 = SGD(lr=lr[0], decay=1.0)
    opt2 = SGD(lr=lr[1], decay=1.0)

    log.info("Compiling the model")
    model.compile([(opt1, {
        supervisedModeUnit: act12.getLayerSet(),
        unsupervisedUnit: actUnsuper12.getLayerSet()
    }),
                   (opt2, {
                       supervisedModeUnit: act22.getLayerSet(),
                       unsupervisedUnit: actUnsuper22.getLayerSet()
                   })])

    # Generators
    inputGenerator1 = WordWindowGenerator(wordWindowSize, embedding1, filters,
                                          startSymbol)
    inputGenerator2 = WordWindowGenerator(wordWindowSize, embedding2, filters,
                                          startSymbol)
    outputGenerator = LabelGenerator(labelLexicon)

    # Reading supervised and unsupervised data sets.
    trainSupervisedDatasetReader = TokenLabelReader(
        kwargs["train_supervised"], kwargs["token_label_separator"])
    trainSupervisedDatasetReader = SyncBatchIterator(
        trainSupervisedDatasetReader, [inputGenerator1, inputGenerator2],
        [outputGenerator], batchSize[0])

    trainUnsupervisedDataset = TokenReader(kwargs["train_unsupervised"])
    trainUnsupervisedDatasetReader = SyncBatchIterator(
        trainUnsupervisedDataset, [inputGenerator1, inputGenerator2], None,
        batchSize[1])

    embedding1.stopAdd()
    embedding2.stopAdd()
    labelLexicon.stopAdd()

    # Get dev inputs and output
    log.info("Reading development examples")
    devDatasetReader = TokenLabelReader(kwargs["dev"],
                                        kwargs["token_label_separator"])
    devReader = SyncBatchIterator(devDatasetReader,
                                  [inputGenerator1, inputGenerator2],
                                  [outputGenerator],
                                  sys.maxint,
                                  shuffle=False)

    lambdaChange = ChangeLambda(_lambdaShared, kwargs["lambda"],
                                kwargs["loss_uns_epoch"])
    lossCallback = LossCallback(loss1, loss2, input1, input2, y)

    # trainUnsupervisedDatasetReaderAcc = SyncBatchIterator(trainUnsupervisedDataset,
    #                                                       [inputGenerator1, inputGenerator2],
    #                                                       [outputGenerator], sys.maxint)

    # accCallBack = AccCallBack(prediction1, prediction2, input1, input2,
    #                           unsurpervisedDataset=trainUnsupervisedDatasetReaderAcc)

    # Training Model
    model.train([trainSupervisedDatasetReader, trainUnsupervisedDatasetReader],
                numEpochs,
                devReader,
                callbacks=[lambdaChange, lossCallback])
コード例 #9
0
def mainWnnNegativeSampling(args):
    # Reading parameters
    embeddingMatrix = None
    wordEmbeddingSize = args.word_embedding_size
    windowSize = args.window_size
    hiddenLayerSize = args.hidden_size
    startSymbol = args.start_symbol
    # endSymbol = args.end_symbol
    endSymbol = startSymbol
    noiseRate = args.noise_rate

    # todo: o algoritmo não suporta mini batch. Somente treinamento estocástico.
    batchSize = 1

    shuffle = args.shuffle
    lr = args.lr
    numEpochs = args.num_epochs
    power = args.power

    minLr = args.min_lr
    numExUpdLr = args.num_examples_updt_lr

    log = logging.getLogger(__name__)

    log.info(str(args))

    if args.seed:
        random.seed(args.seed)
        np.random.seed(args.seed)
    #
    # if args.decay.lower() == "normal":
    #     decay = 0.0
    # elif args.decay.lower() == "divide_epoch":
    #     decay = 1.0

    parametersToSaveOrLoad = {"hidden_size", "window_size", "start_symbol"}

    # Calculate the frequency of each word
    trainReader = TokenReader(args.train)
    wordLexicon = Lexicon("UUKNNN", "lexicon")
    wordLexicon.put(startSymbol, False)

    totalNumOfTokens = 0
    for tokens, labels in trainReader.read():
        # we don't count the </s>, because this token is only insert in the sentence to count its frequency.
        totalNumOfTokens += len(tokens)

        # Word2vec considers that the number of lines is the frequency of </s>
        tokens += [startSymbol]

        for token in tokens:
            wordLexicon.put(token)

    # Prune the words with the frequency less than min_count
    wordLexicon.prune(args.min_count)
    wordLexicon.stopAdd()

    # Calculte the unigram distribution
    frequency = np.power(wordLexicon.getFrequencyOfAllWords(), power)
    total = float(frequency.sum())

    # # Print the distribution of all words
    # for _ in xrange(len(frequency)):
    #     print "%s\t%d\t%.4f" % (wordLexicon.getLexicon(_), frequency[_],frequency[_]/float(total))

    sampler = Sampler(frequency / float(total))

    # Create a random embedding for each word
    wordEmbedding = Embedding(wordLexicon, None, wordEmbeddingSize)
    log.info("Lexicon size: %d" % (wordLexicon.getLen()))

    # Create NN
    x = T.lmatrix("word_window")
    y = T.lvector("labels")

    wordEmbeddingLayer = EmbeddingLayer(x,
                                        wordEmbedding.getEmbeddingMatrix(),
                                        name="embedding")
    flatten = FlattenLayer(wordEmbeddingLayer)

    linear1 = LinearLayer(flatten,
                          wordEmbeddingSize * windowSize,
                          hiddenLayerSize,
                          name="linear1")
    act1 = ActivationLayer(linear1, tanh)

    # Softmax regression. It's like a logistic regression
    linear2 = LinearLayer(act1,
                          hiddenLayerSize,
                          1,
                          weightInitialization=ZeroWeightGenerator(),
                          name="linear_softmax_regresion")

    act2 = ActivationLayer(linear2, sigmoid)
    # We clip the output of -sigmoid, because this output can be 0  and ln(0) is infinite, which can cause problems.
    output = T.flatten(T.clip(act2.getOutput(), 10**-5, 1 - 10**-5))

    # Loss Functions
    negativeSamplingLoss = T.nnet.binary_crossentropy(output, y).sum()
    # Set training
    inputGenerators = [
        WordWindowGenerator(windowSize, wordLexicon, [], startSymbol,
                            endSymbol)
    ]

    outputGenerators = [ConstantLabel(labelLexicon=None, label=1)]

    trainIterator = SyncBatchIterator(trainReader, inputGenerators,
                                      outputGenerators, batchSize, shuffle)

    trainMetrics = [LossMetric("lossTrain", negativeSamplingLoss)]

    allLayers = act2.getLayerSet()

    # opt = SGD(lr=lr, decay=decay)
    opt = SGD(lr=lr)

    model = NegativeSamplingModel(args.t, noiseRate, sampler, minLr,
                                  numExUpdLr, totalNumOfTokens, numEpochs, [x],
                                  [y], allLayers, opt, negativeSamplingLoss,
                                  trainMetrics)
    # Save Model
    if args.save_model:
        savePath = args.save_model
        objsToSave = list(act2.getLayerSet()) + [wordLexicon]

        modelWriter = ModelWriter(savePath, objsToSave, args,
                                  parametersToSaveOrLoad)

    # Training
    model.train(trainIterator, numEpochs=numEpochs, callbacks=[])

    if args.save_model:
        modelWriter.save()
コード例 #10
0
def mainWnn(args):
    ################################################
    # Initializing parameters
    ##############################################
    log = logging.getLogger(__name__)

    if args.seed:
        random.seed(args.seed)
        np.random.seed(args.seed)

    parametersToSaveOrLoad = {"word_filters", "suffix_filters", "char_filters", "cap_filters",
                              "alg", "hidden_activation_function", "word_window_size", "char_window_size",
                              "hidden_size", "with_charwnn", "conv_size", "charwnn_with_act", "suffix_size",
                              "use_capitalization", "start_symbol", "end_symbol", "with_hidden"}

    # Load parameters of the saving model
    if args.load_model:
        persistentManager = H5py(args.load_model)
        savedParameters = json.loads(persistentManager.getAttribute("parameters"))

        if savedParameters.get("charwnn_filters", None) != None:
            savedParameters["char_filters"] = savedParameters["charwnn_filters"]
            savedParameters.pop("charwnn_filters")
            print savedParameters

        log.info("Loading parameters of the model")
        args = args._replace(**savedParameters)

    log.info(str(args))

    # Read the parameters
    lr = args.lr
    startSymbol = args.start_symbol
    endSymbol = args.end_symbol
    numEpochs = args.num_epochs
    shuffle = args.shuffle
    normalizeMethod = args.normalization.lower() if args.normalization is not None else None
    wordWindowSize = args.word_window_size
    hiddenLayerSize = args.hidden_size
    hiddenActFunctionName = args.hidden_activation_function
    embeddingSize = args.word_emb_size

    withCharWNN = args.with_charwnn
    charEmbeddingSize = args.char_emb_size
    charWindowSize = args.char_window_size
    startSymbolChar = "</s>"

    suffixEmbSize = args.suffix_emb_size
    capEmbSize = args.cap_emb_size

    useSuffixFeatures = args.suffix_size > 0
    useCapFeatures = args.use_capitalization

    # Insert the character that will be used to fill the matrix
    # with a dimension lesser than chosen dimension.This enables that the convolution is performed by a matrix multiplication.
    artificialChar = "ART_CHAR"

    # TODO: the maximum number of characters of word is fixed in 20.
    numMaxChar = 20

    if args.alg == "window_stn":
        isSentenceModel = True
    elif args.alg == "window_word":
        isSentenceModel = False
    else:
        raise Exception("The value of model_type isn't valid.")

    batchSize = -1 if isSentenceModel else args.batch_size
    wordFilters = []

    # Lendo Filtros do wnn
    log.info("Lendo filtros básicos")
    wordFilters = getFilters(args.word_filters, log)

    # Lendo Filtros do charwnn
    log.info("Lendo filtros do charwnn")
    charFilters = getFilters(args.char_filters, log)

    # Lendo Filtros do suffix
    log.info("Lendo filtros do sufixo")
    suffixFilters = getFilters(args.suffix_filters, log)

    # Lendo Filtros da capitalização
    log.info("Lendo filtros da capitalização")
    capFilters = getFilters(args.cap_filters, log)

    ################################################
    # Create the lexicon and go out after this
    ################################################
    if args.create_only_lexicon:
        inputGenerators = []
        lexiconsToSave = []

        if args.word_lexicon and not os.path.exists(args.word_lexicon):
            wordLexicon = Lexicon("UUUNKKK", "labelLexicon")

            inputGenerators.append(
                WordWindowGenerator(wordWindowSize, wordLexicon, wordFilters, startSymbol, endSymbol))
            lexiconsToSave.append((wordLexicon, args.word_lexicon))

        if not os.path.exists(args.label_file):
            labelLexicon = Lexicon(None, "labelLexicon")
            outputGenerator = [LabelGenerator(labelLexicon)]
            lexiconsToSave.append((labelLexicon, args.label_file))
        else:
            outputGenerator = None

        if args.char_lexicon and not os.path.exists(args.char_lexicon):
            charLexicon = Lexicon("UUUNKKK", "charLexicon")

            charLexicon.put(startSymbolChar)
            charLexicon.put(artificialChar)

            inputGenerators.append(
                CharacterWindowGenerator(charLexicon, numMaxChar, charWindowSize, wordWindowSize, artificialChar,
                                         startSymbolChar, startPaddingWrd=startSymbol, endPaddingWrd=endSymbol,
                                         filters=charFilters))

            lexiconsToSave.append((charLexicon, args.char_lexicon))

        if args.suffix_lexicon and not os.path.exists(args.suffix_lexicon):
            suffixLexicon = Lexicon("UUUNKKK", "suffixLexicon")

            if args.suffix_size <= 0:
                raise Exception(
                    "Unable to generate the suffix lexicon because the suffix is less than or equal to 0.")

            inputGenerators.append(
                SuffixFeatureGenerator(args.suffix_size, wordWindowSize, suffixLexicon, suffixFilters))

            lexiconsToSave.append((suffixLexicon, args.suffix_lexicon))

        if args.cap_lexicon and not os.path.exists(args.cap_lexicon):
            capLexicon = Lexicon("UUUNKKK", "capitalizationLexicon")

            inputGenerators.append(CapitalizationFeatureGenerator(wordWindowSize, capLexicon, capFilters))

            lexiconsToSave.append((capLexicon, args.cap_lexicon))

        if len(inputGenerators) == 0:
            inputGenerators = None

        if not (inputGenerators or outputGenerator):
            log.info("All lexicons have been generated.")
            return

        trainDatasetReader = TokenLabelReader(args.train, args.token_label_separator)
        trainReader = SyncBatchIterator(trainDatasetReader, inputGenerators, outputGenerator, batchSize,
                                        shuffle=shuffle)

        for lexicon, pathToSave in lexiconsToSave:
            lexicon.save(pathToSave)

        log.info("Lexicons were generated with success!")

        return

    ################################################
    # Starting training
    ###########################################

    if withCharWNN and (useSuffixFeatures or useCapFeatures):
        raise Exception("It's impossible to use hand-crafted features with Charwnn.")

    # Read word lexicon and create word embeddings
    if args.load_model:
        wordLexicon = Lexicon.fromPersistentManager(persistentManager, "word_lexicon")
        vectors = EmbeddingLayer.getEmbeddingFromPersistenceManager(persistentManager, "word_embedding_layer")

        wordEmbedding = Embedding(wordLexicon, vectors)

    elif args.word_embedding:
        wordLexicon, wordEmbedding = Embedding.fromWord2Vec(args.word_embedding, "UUUNKKK", "word_lexicon")
    elif args.word_lexicon:
        wordLexicon = Lexicon.fromTextFile(args.word_lexicon, True, "word_lexicon")
        wordEmbedding = Embedding(wordLexicon, vectors=None, embeddingSize=embeddingSize)
    else:
        log.error("You need to set one of these parameters: load_model, word_embedding or word_lexicon")
        return

    # Read char lexicon and create char embeddings
    if withCharWNN:
        if args.load_model:
            charLexicon = Lexicon.fromPersistentManager(persistentManager, "char_lexicon")
            vectors = EmbeddingConvolutionalLayer.getEmbeddingFromPersistenceManager(persistentManager,
                                                                                     "char_convolution_layer")

            charEmbedding = Embedding(charLexicon, vectors)
        elif args.char_lexicon:
            charLexicon = Lexicon.fromTextFile(args.char_lexicon, True, "char_lexicon")
            charEmbedding = Embedding(charLexicon, vectors=None, embeddingSize=charEmbeddingSize)
        else:
            log.error("You need to set one of these parameters: load_model or char_lexicon")
            return
    else:
        # Read suffix lexicon if suffix size is greater than 0
        if useSuffixFeatures:
            if args.load_model:
                suffixLexicon = Lexicon.fromPersistentManager(persistentManager, "suffix_lexicon")
                vectors = EmbeddingConvolutionalLayer.getEmbeddingFromPersistenceManager(persistentManager,
                                                                                         "suffix_embedding")

                suffixEmbedding = Embedding(suffixLexicon, vectors)
            elif args.suffix_lexicon:
                suffixLexicon = Lexicon.fromTextFile(args.suffix_lexicon, True, "suffix_lexicon")
                suffixEmbedding = Embedding(suffixLexicon, vectors=None, embeddingSize=suffixEmbSize)
            else:
                log.error("You need to set one of these parameters: load_model or suffix_lexicon")
                return

        # Read capitalization lexicon
        if useCapFeatures:
            if args.load_model:
                capLexicon = Lexicon.fromPersistentManager(persistentManager, "cap_lexicon")
                vectors = EmbeddingConvolutionalLayer.getEmbeddingFromPersistenceManager(persistentManager,
                                                                                         "cap_embedding")

                capEmbedding = Embedding(capLexicon, vectors)
            elif args.cap_lexicon:
                capLexicon = Lexicon.fromTextFile(args.cap_lexicon, True, "cap_lexicon")
                capEmbedding = Embedding(capLexicon, vectors=None, embeddingSize=capEmbSize)
            else:
                log.error("You need to set one of these parameters: load_model or cap_lexicon")
                return

    # Read labels
    if args.load_model:
        labelLexicon = Lexicon.fromPersistentManager(persistentManager, "label_lexicon")
    elif args.label_file:
        labelLexicon = Lexicon.fromTextFile(args.label_file, False, lexiconName="label_lexicon")
    else:
        log.error("You need to set one of these parameters: load_model, word_embedding or word_lexicon")
        return

    # Normalize the word embedding
    if not normalizeMethod:
        pass
    elif normalizeMethod == "minmax":
        log.info("Normalization: minmax")
        wordEmbedding.minMaxNormalization()
    elif normalizeMethod == "mean":
        log.info("Normalization: mean normalization")
        wordEmbedding.meanNormalization()
    else:
        log.error("Unknown normalization method: %s" % normalizeMethod)
        sys.exit(1)

    if normalizeMethod is not None and args.load_model is not None:
        log.warn("The word embedding of model was normalized. This can change the result of test.")

    # Build neural network
    if isSentenceModel:
        raise NotImplementedError("Sentence model is not implemented!")
    else:
        wordWindow = T.lmatrix("word_window")
        inputModel = [wordWindow]

        wordEmbeddingLayer = EmbeddingLayer(wordWindow, wordEmbedding.getEmbeddingMatrix(), trainable=True,
                                            name="word_embedding_layer")
        flatten = FlattenLayer(wordEmbeddingLayer)

        if withCharWNN:
            # Use the convolution
            log.info("Using charwnn")
            convSize = args.conv_size

            if args.charwnn_with_act:
                charAct = tanh
            else:
                charAct = None

            charWindowIdxs = T.ltensor4(name="char_window_idx")
            inputModel.append(charWindowIdxs)

            charEmbeddingConvLayer = EmbeddingConvolutionalLayer(charWindowIdxs, charEmbedding.getEmbeddingMatrix(),
                                                                 numMaxChar, convSize, charWindowSize,
                                                                 charEmbeddingSize, charAct,
                                                                 name="char_convolution_layer")
            layerBeforeLinear = ConcatenateLayer([flatten, charEmbeddingConvLayer])
            sizeLayerBeforeLinear = wordWindowSize * (wordEmbedding.getEmbeddingSize() + convSize)
        elif useSuffixFeatures or useCapFeatures:
            # Use hand-crafted features
            concatenateInputs = [flatten]
            nmFetauresByWord = wordEmbedding.getEmbeddingSize()

            if useSuffixFeatures:
                log.info("Using suffix features")

                suffixInput = T.lmatrix("suffix_input")
                suffixEmbLayer = EmbeddingLayer(suffixInput, suffixEmbedding.getEmbeddingMatrix(),
                                                name="suffix_embedding")
                suffixFlatten = FlattenLayer(suffixEmbLayer)
                concatenateInputs.append(suffixFlatten)

                nmFetauresByWord += suffixEmbedding.getEmbeddingSize()
                inputModel.append(suffixInput)

            if useCapFeatures:
                log.info("Using capitalization features")

                capInput = T.lmatrix("capitalization_input")
                capEmbLayer = EmbeddingLayer(capInput, capEmbedding.getEmbeddingMatrix(),
                                             name="cap_embedding")
                capFlatten = FlattenLayer(capEmbLayer)
                concatenateInputs.append(capFlatten)

                nmFetauresByWord += capEmbedding.getEmbeddingSize()
                inputModel.append(capInput)

            layerBeforeLinear = ConcatenateLayer(concatenateInputs)
            sizeLayerBeforeLinear = wordWindowSize * nmFetauresByWord
        else:
            # Use only the word embeddings
            layerBeforeLinear = flatten
            sizeLayerBeforeLinear = wordWindowSize * wordEmbedding.getEmbeddingSize()

        # The rest of the NN
        if args.with_hidden:
            hiddenActFunction = method_name(hiddenActFunctionName)
            weightInit = SigmoidGlorot() if hiddenActFunction == sigmoid else GlorotUniform()

            linear1 = LinearLayer(layerBeforeLinear, sizeLayerBeforeLinear, hiddenLayerSize,
                                  weightInitialization=weightInit, name="linear1")
            act1 = ActivationLayer(linear1, hiddenActFunction)

            layerBeforeSoftmax = act1
            sizeLayerBeforeSoftmax = hiddenLayerSize
            log.info("Using hidden layer")
        else:
            layerBeforeSoftmax = layerBeforeLinear
            sizeLayerBeforeSoftmax = sizeLayerBeforeLinear
            log.info("Not using hidden layer")

        linear2 = LinearLayer(layerBeforeSoftmax, sizeLayerBeforeSoftmax, labelLexicon.getLen(),
                              weightInitialization=ZeroWeightGenerator(),
                              name="linear_softmax")
        act2 = ActivationLayer(linear2, softmax)
        prediction = ArgmaxPrediction(1).predict(act2.getOutput())

    # Load the model
    if args.load_model:
        alreadyLoaded = set([wordEmbeddingLayer])

        for o in (act2.getLayerSet() - alreadyLoaded):
            if o.getName():
                persistentManager.load(o)

    # Set the input and output
    inputGenerators = [WordWindowGenerator(wordWindowSize, wordLexicon, wordFilters, startSymbol, endSymbol)]

    if withCharWNN:
        inputGenerators.append(
            CharacterWindowGenerator(charLexicon, numMaxChar, charWindowSize, wordWindowSize, artificialChar,
                                     startSymbolChar, startPaddingWrd=startSymbol, endPaddingWrd=endSymbol,
                                     filters=charFilters))
    else:
        if useSuffixFeatures:
            inputGenerators.append(
                SuffixFeatureGenerator(args.suffix_size, wordWindowSize, suffixLexicon, suffixFilters))

        if useCapFeatures:
            inputGenerators.append(CapitalizationFeatureGenerator(wordWindowSize, capLexicon, capFilters))

    outputGenerator = LabelGenerator(labelLexicon)

    if args.train:
        log.info("Reading training examples")

        trainDatasetReader = TokenLabelReader(args.train, args.token_label_separator)
        trainReader = SyncBatchIterator(trainDatasetReader, inputGenerators, [outputGenerator], batchSize,
                                        shuffle=shuffle)

        # Get dev inputs and output
        dev = args.dev

        if dev:
            log.info("Reading development examples")
            devDatasetReader = TokenLabelReader(args.dev, args.token_label_separator)
            devReader = SyncBatchIterator(devDatasetReader, inputGenerators, [outputGenerator], sys.maxint,
                                          shuffle=False)
        else:
            devReader = None
    else:
        trainReader = None
        devReader = None

    y = T.lvector("y")

    if args.decay.lower() == "normal":
        decay = 0.0
    elif args.decay.lower() == "divide_epoch":
        decay = 1.0

    if args.adagrad:
        log.info("Using Adagrad")
        opt = Adagrad(lr=lr, decay=decay)
    else:
        log.info("Using SGD")
        opt = SGD(lr=lr, decay=decay)

    # Printing embedding information
    dictionarySize = wordEmbedding.getNumberOfVectors()

    log.info("Size of  word dictionary and word embedding size: %d and %d" % (dictionarySize, embeddingSize))

    if withCharWNN:
        log.info("Size of  char dictionary and char embedding size: %d and %d" % (
            charEmbedding.getNumberOfVectors(), charEmbedding.getEmbeddingSize()))

    if useSuffixFeatures:
        log.info("Size of  suffix dictionary and suffix embedding size: %d and %d" % (
            suffixEmbedding.getNumberOfVectors(), suffixEmbedding.getEmbeddingSize()))

    if useCapFeatures:
        log.info("Size of  capitalization dictionary and capitalization embedding size: %d and %d" % (
            capEmbedding.getNumberOfVectors(), capEmbedding.getEmbeddingSize()))

    # Compiling
    loss = NegativeLogLikelihood().calculateError(act2.getOutput(), prediction, y)

    if args.lambda_L2:
        _lambda = args.lambda_L2
        log.info("Using L2 with lambda= %.2f", _lambda)
        loss += _lambda * (T.sum(T.square(linear1.getParameters()[0])))

    trainMetrics = [
        LossMetric("LossTrain", loss, True),
        AccuracyMetric("AccTrain", y, prediction),
    ]

    evalMetrics = [
        LossMetric("LossDev", loss, True),
        AccuracyMetric("AccDev", y, prediction),
    ]

    testMetrics = [
        LossMetric("LossTest", loss, True),
        AccuracyMetric("AccTest", y, prediction),
    ]

    wnnModel = BasicModel(inputModel, [y], act2.getLayerSet(), opt, prediction, loss, trainMetrics=trainMetrics,
                          evalMetrics=evalMetrics, testMetrics=testMetrics, mode=None)
    # Training
    if trainReader:
        callback = []

        if args.save_model:
            savePath = args.save_model
            objsToSave = list(act2.getLayerSet()) + [wordLexicon, labelLexicon]

            if withCharWNN:
                objsToSave.append(charLexicon)

            if useSuffixFeatures:
                objsToSave.append(suffixLexicon)

            if useCapFeatures:
                objsToSave.append(capLexicon)

            modelWriter = ModelWriter(savePath, objsToSave, args, parametersToSaveOrLoad)

            # Save the model with best acc in dev
            if args.save_by_acc:
                callback.append(SaveModelCallback(modelWriter, evalMetrics[1], "accuracy", True))

        log.info("Training")
        wnnModel.train(trainReader, numEpochs, devReader, callbacks=callback)

        # Save the model at the end of training
        if args.save_model and not args.save_by_acc:
            modelWriter.save()

    # Testing
    if args.test:
        log.info("Reading test examples")
        testDatasetReader = TokenLabelReader(args.test, args.token_label_separator)
        testReader = SyncBatchIterator(testDatasetReader, inputGenerators, [outputGenerator], sys.maxint, shuffle=False)

        log.info("Testing")
        wnnModel.test(testReader)

        if args.print_prediction:
            f = codecs.open(args.print_prediction, "w", encoding="utf-8")

            for x, labels in testReader:
                inputs = x

                predictions = wnnModel.prediction(inputs)

                for prediction in predictions:
                    f.write(labelLexicon.getLexicon(prediction))
                    f.write("\n")