def main(args):
    # Unsupervised step: negative-sampling training configuration.
    jsonNs = JsonArgParser(ns.PARAMETERS).parse(args.negative_sampling_json_base)
    jsonNs["lr"] = args.lr_ns
    jsonNs["noise_rate"] = args.noise_rate
    jsonNs["num_epochs"] = args.num_epochs
    jsonNs["t"] = args.t
    jsonNs["power"] = args.power
    jsonNs["min_count"] = args.min_count
    jsonNs["save_model"] = args.save_model

    # Supervised step: WNN training configuration that uses the saved model.
    jsonWnn = JsonArgParser(wt.WNN_PARAMETERS).parse(args.wnn_with_target_json_base)
    jsonWnn["lr"] = args.lr_wnn
    jsonWnn["target_module"] = args.save_model

    log = logging.getLogger(__name__)

    log.info("Starting unsupervised training")
    ns.mainWnnNegativeSampling(dict2obj(jsonNs))

    log.info("Starting supervised training")
    wt.mainWnn(dict2obj(jsonWnn))
def main():
    nArgs = len(sys.argv)

    if nArgs != 2 and nArgs != 4:
        sys.stderr.write('Syntax error! Expected arguments: <params_file> [<num_folds> <params_dist>]\n')
        sys.stderr.write('\t<params_file>: JSON-formatted file containing parameter values\n')
        sys.stderr.write('\t<num_folds>: number of cross-validation folds used in random search\n')
        sys.stderr.write('\t<params_dist>: JSON-formatted file containing the parameter distributions used in random search\n')
        sys.exit(1)

    full_path = os.path.realpath(__file__)
    path = os.path.split(full_path)[0]
    logging.config.fileConfig(os.path.join(path, 'logging.conf'))

    parameters = dict2obj(JsonArgParser(WNN_PARAMETERS).parse(sys.argv[1]))
    mainWnnNer(parameters)
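# Usage sketch (hypothetical file names): the NER entry point is driven by a
# single JSON parameter file matching WNN_PARAMETERS; the optional
# <num_folds> <params_dist> pair is the random-search form accepted by the
# argument check above.
#
#     python wnn_ner.py ner_params.json
#     python wnn_ner.py ner_params.json 5 ner_params_dist.json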
def main():
    full_path = os.path.realpath(__file__)
    path, filename = os.path.split(full_path)

    logging.config.fileConfig(os.path.join(path, 'logging.conf'), defaults={})
    log = logging.getLogger(__name__)

    if len(sys.argv) != 2:
        log.error("Missing argument: <JSON config file>")
        exit(1)

    argsDict = JsonArgParser(PARAMETERS).parse(sys.argv[1])
    args = dict2obj(argsDict, 'ShortDocArguments')
    logging.getLogger(__name__).info(argsDict)

    if args.seed:
        random.seed(args.seed)
        np.random.seed(args.seed)

    lr = args.lr
    startSymbol = args.start_symbol
    endSymbol = args.end_symbol
    numEpochs = args.num_epochs
    shuffle = args.shuffle
    normalizeMethod = args.normalization
    wordWindowSize = args.word_window_size
    hiddenLayerSize = args.hidden_size
    convSize = args.conv_size

    # Load classes for filters.
    filters = []
    for filterName in args.filters:
        moduleName, className = filterName.rsplit('.', 1)
        log.info("Filter: " + moduleName + " " + className)
        module_ = importlib.import_module(moduleName)
        filters.append(getattr(module_, className)())

    W1 = None
    b1 = None
    W2 = None
    b2 = None

    wordEmbedding = None
    if args.word_embedding:
        log.info("Reading W2v File")
        (wordLexicon, wordEmbedding) = Embedding.fromWord2Vec(args.word_embedding, unknownSymbol="__UNKNOWN__")
        wordLexicon.stopAdd()
    elif args.word_lexicon and args.word_emb_size:
        wordLexicon = Lexicon.fromTextFile(args.word_lexicon, hasUnknowSymbol=False)
        wordEmbedding = Embedding(wordLexicon, embeddingSize=args.word_emb_size)
        wordLexicon.stopAdd()
    else:
        log.error("You must provide argument word_embedding or word_lexicon and word_emb_size")

    # Create the lexicon of labels.
    labelLexicon = None
    if args.labels is not None:
        if args.label_lexicon is not None:
            log.error("Only one of the parameters label_lexicon and labels can be provided!")
            exit(1)
        labelLexicon = Lexicon.fromList(args.labels, hasUnknowSymbol=False)
    elif args.label_lexicon is not None:
        labelLexicon = Lexicon.fromTextFile(args.label_lexicon, hasUnknowSymbol=False)
    else:
        log.error("One of the parameters label_lexicon or labels must be provided!")
        exit(1)

    #
    # Build the network model (Theano graph).
    #

    # TODO: debug
    # theano.config.compute_test_value = 'warn'
    # ex = trainIterator.next()
    # inWords.tag.test_value = ex[0][0]
    # outLabel.tag.test_value = ex[1][0]

    # Input matrix. Each row represents a token of the offer. Each token is
    # represented by a window of tokens (the central token and some nearby
    # tokens). Each value of this matrix is an index that represents a token
    # in the embedding.
    inWords = tensor.lmatrix("inWords")

    # Correct category of an offer.
    outLabel = tensor.lscalar("outLabel")

    # List of input tensors. One for each input layer.
    inputTensors = [inWords]

    # Whether the word embedding will be updated during training.
    embLayerTrainable = not args.fix_word_embedding

    if not embLayerTrainable:
        log.info("Not updating the word embedding!")

    # Lookup table for word features.
    embeddingLayer = EmbeddingLayer(inWords, wordEmbedding.getEmbeddingMatrix(), trainable=embLayerTrainable)

    # if not args.train and args.load_wordEmbedding:
    #     attrs = np.load(args.load_wordEmbedding)
    #     embeddingLayer.load(attrs)
    #     log.info("Loaded word embedding (shape %s) from file %s" % (
    #         str(attrs[0].shape), args.load_wordEmbedding))

    # The lookup table output has 3 dimensions (numTokens, szWindow, szEmbedding).
    # This layer flattens the last two dimensions, producing an output with
    # shape (numTokens, szWindow * szEmbedding).
    flattenInput = FlattenLayer(embeddingLayer)

    # Random weight initialization procedure.
    weightInit = GlorotUniform()

    # Convolution layer. Convolution over the text of an offer.
    convW = None
    convb = None

    if not args.train and args.load_conv:
        convNPY = np.load(args.load_conv)
        convW = convNPY[0]
        convb = convNPY[1]
        log.info("Loaded convolutional layer (shape %s) from file %s" % (str(convW.shape), args.load_conv))

    convLinear = LinearLayer(flattenInput,
                             wordWindowSize * wordEmbedding.getEmbeddingSize(),
                             convSize,
                             W=convW,
                             b=convb,
                             weightInitialization=weightInit)

    if args.conv_act:
        convOut = ActivationLayer(convLinear, tanh)
    else:
        convOut = convLinear

    # Max pooling layer.
    maxPooling = MaxPoolingLayer(convOut)

    # Hidden layer.
    if not args.train and args.load_hiddenLayer:
        hiddenNPY = np.load(args.load_hiddenLayer)
        W1 = hiddenNPY[0]
        b1 = hiddenNPY[1]
        log.info("Loaded hidden layer (shape %s) from file %s" % (str(W1.shape), args.load_hiddenLayer))

    hiddenLinear = LinearLayer(maxPooling,
                               convSize,
                               hiddenLayerSize,
                               W=W1,
                               b=b1,
                               weightInitialization=weightInit)
    hiddenAct = ActivationLayer(hiddenLinear, tanh)

    # Linear input of the softmax layer.
    if not args.train and args.load_softmax:
        hiddenNPY = np.load(args.load_softmax)
        W2 = hiddenNPY[0]
        b2 = hiddenNPY[1]
        log.info("Loaded softmax layer (shape %s) from file %s" % (str(W2.shape), args.load_softmax))

    sotmaxLinearInput = LinearLayer(hiddenAct,
                                    hiddenLayerSize,
                                    labelLexicon.getLen(),
                                    W=W2,
                                    b=b2,
                                    weightInitialization=ZeroWeightGenerator())

    # Softmax.
    # softmaxAct = ReshapeLayer(ActivationLayer(sotmaxLinearInput, softmax), (1, -1))
    softmaxAct = ActivationLayer(sotmaxLinearInput, softmax)

    # Prediction layer (argmax).
    prediction = ArgmaxPrediction(None).predict(softmaxAct.getOutput())

    # Loss function.
    if args.label_weights is not None and len(args.label_weights) != labelLexicon.getLen():
        log.error("Number of label weights (%d) is different from number of labels (%d)!" % (
            len(args.label_weights), labelLexicon.getLen()))
    nlloe = NegativeLogLikelihoodOneExample(weights=args.label_weights)
    loss = nlloe.calculateError(softmaxAct.getOutput()[0], prediction, outLabel)

    # Input generators: word window.
    inputGenerators = [WordWindowGenerator(wordWindowSize, wordLexicon, filters, startSymbol, endSymbol)]

    # Output generator: generate one label per offer.
    outputGenerators = [TextLabelGenerator(labelLexicon)]
    # outputGenerators = [lambda label: labelLexicon.put(label)]

    evalPerIteration = None

    if args.train:
        trainDatasetReader = ShortDocReader(args.train)
        if args.load_method == "sync":
            log.info("Reading training examples...")
            trainIterator = SyncBatchIterator(trainDatasetReader,
                                              inputGenerators,
                                              outputGenerators,
                                              -1,
                                              shuffle=shuffle)
            wordLexicon.stopAdd()
        elif args.load_method == "async":
            log.info("Examples will be asynchronously loaded.")
            trainIterator = AsyncBatchIterator(trainDatasetReader,
                                               inputGenerators,
                                               outputGenerators,
                                               -1,
                                               shuffle=shuffle,
                                               maxqSize=1000)
        else:
            log.error("The argument 'load_method' has an invalid value: %s."
                      % args.load_method)
            sys.exit(1)

        labelLexicon.stopAdd()

        # Get dev inputs and output.
        dev = args.dev
        evalPerIteration = args.eval_per_iteration
        if not dev and evalPerIteration > 0:
            log.error("Argument eval_per_iteration cannot be used without a dev argument.")
            sys.exit(1)

        if dev:
            log.info("Reading development examples")
            devReader = ShortDocReader(args.dev)
            devIterator = SyncBatchIterator(devReader,
                                            inputGenerators,
                                            outputGenerators,
                                            -1,
                                            shuffle=False)
        else:
            devIterator = None
    else:
        trainIterator = None
        devIterator = None

    if normalizeMethod == "minmax":
        log.info("Normalization: minmax")
        wordEmbedding.minMaxNormalization()
    elif normalizeMethod == "mean":
        log.info("Normalization: mean normalization")
        wordEmbedding.meanNormalization()
    elif normalizeMethod == "zscore":
        log.info("Normalization: zscore normalization")
        wordEmbedding.zscoreNormalization()
    elif normalizeMethod:
        log.error("Normalization: unknown value %s" % normalizeMethod)
        sys.exit(1)

    # Learning rate decay.
    decay = None
    if args.decay == "none":
        decay = 0.0
    elif args.decay == "linear":
        decay = 1.0
    else:
        log.error("Unknown decay parameter %s." % args.decay)
        exit(1)

    # Learning algorithm.
    if args.alg == "adagrad":
        log.info("Using Adagrad")
        opt = Adagrad(lr=lr, decay=decay)
    elif args.alg == "sgd":
        log.info("Using SGD")
        opt = SGD(lr=lr, decay=decay)
    else:
        log.error("Unknown algorithm: %s." % args.alg)
        sys.exit(1)

    # TODO: debug
    # opt.lr.tag.test_value = 0.05

    # Printing embedding information.
    dictionarySize = wordEmbedding.getNumberOfVectors()
    embeddingSize = wordEmbedding.getEmbeddingSize()
    log.info("Dictionary size: %d" % dictionarySize)
    log.info("Embedding size: %d" % embeddingSize)
    log.info("Number of categories: %d" % labelLexicon.getLen())

    # Train metrics.
    trainMetrics = None
    if trainIterator:
        trainMetrics = [
            LossMetric("TrainLoss", loss),
            AccuracyMetric("TrainAccuracy", outLabel, prediction)
        ]

    # Evaluation metrics.
    evalMetrics = None
    if devIterator:
        evalMetrics = [
            LossMetric("EvalLoss", loss),
            AccuracyMetric("EvalAccuracy", outLabel, prediction),
            FMetric("EvalFMetric", outLabel, prediction, labels=labelLexicon.getLexiconDict().values())
        ]

    # Test metrics.
    testMetrics = None
    if args.test:
        testMetrics = [
            LossMetric("TestLoss", loss),
            AccuracyMetric("TestAccuracy", outLabel, prediction),
            FMetric("TestFMetric", outLabel, prediction, labels=labelLexicon.getLexiconDict().values())
        ]

    # TODO: debug
    # mode = theano.compile.debugmode.DebugMode(optimizer=None)
    mode = None
    model = BasicModel(x=inputTensors,
                       y=[outLabel],
                       allLayers=softmaxAct.getLayerSet(),
                       optimizer=opt,
                       prediction=prediction,
                       loss=loss,
                       trainMetrics=trainMetrics,
                       evalMetrics=evalMetrics,
                       testMetrics=testMetrics,
                       mode=mode)

    # Training.
    if trainIterator:
        log.info("Training")
        model.train(trainIterator, numEpochs, devIterator, evalPerIteration=evalPerIteration)

    # Saving the model after training.
    if args.save_wordEmbedding:
        embeddingLayer.saveAsW2V(args.save_wordEmbedding, lexicon=wordLexicon)
        log.info("Saved word embedding (word2vec format) to file: %s" % args.save_wordEmbedding)
    if args.save_conv:
        convLinear.save(args.save_conv)
        log.info("Saved convolution layer to file: %s" % args.save_conv)
    if args.save_hiddenLayer:
        hiddenLinear.save(args.save_hiddenLayer)
        log.info("Saved hidden layer to file: %s" % args.save_hiddenLayer)
    if args.save_softmax:
        sotmaxLinearInput.save(args.save_softmax)
        log.info("Saved softmax layer to file: %s" % args.save_softmax)

    # Testing.
    if args.test:
        log.info("Reading test examples")
        testReader = ShortDocReader(args.test)
        testIterator = SyncBatchIterator(testReader,
                                         inputGenerators,
                                         outputGenerators,
                                         -1,
                                         shuffle=False)
        log.info("Testing")
        model.test(testIterator)
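# Note (sketch, hypothetical file names): the files written by save_conv,
# save_hiddenLayer and save_softmax above are the ones consumed by the
# load_conv / load_hiddenLayer / load_softmax branches earlier, which rebuild the
# layers with np.load when no training set is given. A prediction run would point
# to them in its JSON config, e.g.:
#
#     "load_conv": "model/conv.npy",
#     "load_hiddenLayer": "model/hidden.npy",
#     "load_softmax": "model/softmax.npy"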
def method_name(hiddenActFunction):
    if hiddenActFunction == "tanh":
        return tanh
    elif hiddenActFunction == "sigmoid":
        return sigmoid
    else:
        raise Exception("Invalid 'hidden_activation_function' value: %s" % hiddenActFunction)


if __name__ == '__main__':
    full_path = os.path.realpath(__file__)
    path, filename = os.path.split(full_path)

    logging.config.fileConfig(os.path.join(path, 'logging.conf'), defaults={})

    argsDict = JsonArgParser(PARAMETERS).parse(sys.argv[1])
    args = dict2obj(argsDict, 'OfertaArguments')
    logging.getLogger(__name__).info(argsDict)

    main(args)
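# Usage sketch for method_name above: it maps a configuration string to the
# corresponding Theano activation, so the hidden layer could be built from the
# JSON parameters instead of hard-coding tanh ("hidden_activation_function" is
# assumed here to be one of the PARAMETERS keys):
#
#     hiddenActFunction = method_name(args.hidden_activation_function)
#     hiddenAct = ActivationLayer(hiddenLinear, hiddenActFunction)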
jsonNs["lr"] = args.lr_ns jsonNs["noise_rate"] = args.noise_rate jsonNs["num_epochs"] = args.num_epochs jsonNs["t"] = args.t jsonNs["power"] = args.power jsonNs["min_count"] = args.min_count jsonNs["save_model"] = args.save_model jsonWnn = JsonArgParser(wt.WNN_PARAMETERS).parse( args.wnn_with_target_json_base) jsonWnn["lr"] = args.lr_wnn jsonWnn["target_module"] = args.save_model log = logging.getLogger(__name__) log.info("Starting unsupervised training") ns.mainWnnNegativeSampling(dict2obj(jsonNs)) log.info("Starting supervised training") wt.mainWnn(dict2obj(jsonWnn)) if __name__ == '__main__': full_path = os.path.realpath(__file__) path, filename = os.path.split(full_path) logging.config.fileConfig(os.path.join(path, 'logging.conf')) parameters = dict2obj(JsonArgParser(PARAMETERS).parse(sys.argv[1])) main(parameters)
def main():
    full_path = os.path.realpath(__file__)
    path, filename = os.path.split(full_path)

    logging.config.fileConfig(os.path.join(path, 'logging.conf'), defaults={})
    log = logging.getLogger(__name__)

    if len(sys.argv) != 3:
        log.error("Missing arguments: <JSON config file> <input file>")
        exit(1)

    argsDict = JsonArgParser(PARAMETERS).parse(sys.argv[1])
    args = dict2obj(argsDict, 'ShortDocArguments')
    logging.getLogger(__name__).info(argsDict)

    if args.seed:
        random.seed(args.seed)
        np.random.seed(args.seed)

    lr = args.lr
    startSymbol = args.start_symbol
    endSymbol = args.end_symbol
    numEpochs = args.num_epochs
    shuffle = args.shuffle
    normalizeMethod = args.normalization
    wordWindowSize = args.word_window_size
    hiddenLayerSize = args.hidden_size
    convSize = args.conv_size

    # Load classes for filters.
    filters = []
    for filterName in args.filters:
        moduleName, className = filterName.rsplit('.', 1)
        log.info("Filter: " + moduleName + " " + className)
        module_ = importlib.import_module(moduleName)
        filters.append(getattr(module_, className)())

    W1 = None
    b1 = None
    W2 = None
    b2 = None

    wordEmbedding = None
    if args.word_embedding:
        log.info("Reading W2v File")
        (wordLexicon, wordEmbedding) = Embedding.fromWord2Vec(args.word_embedding, unknownSymbol="__UNKNOWN__")
        wordLexicon.stopAdd()
    elif args.word_lexicon and args.word_emb_size:
        wordLexicon = Lexicon.fromTextFile(args.word_lexicon, hasUnknowSymbol=False)
        wordEmbedding = Embedding(wordLexicon, embeddingSize=args.word_emb_size)
        wordLexicon.stopAdd()
    else:
        log.error("You must provide argument word_embedding or word_lexicon and word_emb_size")

    # Create the lexicon of labels.
    labelLexicon = None
    if args.labels is not None:
        if args.label_lexicon is not None:
            log.error("Only one of the parameters label_lexicon and labels can be provided!")
            exit(1)
        labelLexicon = Lexicon.fromList(args.labels, hasUnknowSymbol=False)
    elif args.label_lexicon is not None:
        labelLexicon = Lexicon.fromTextFile(args.label_lexicon, hasUnknowSymbol=False)
    else:
        log.error("One of the parameters label_lexicon or labels must be provided!")
        exit(1)

    #
    # Build the network model (Theano graph).
    #

    # TODO: debug
    # theano.config.compute_test_value = 'warn'
    # ex = trainIterator.next()
    # inWords.tag.test_value = ex[0][0]
    # outLabel.tag.test_value = ex[1][0]

    # Input matrix. Each row represents a token of the offer. Each token is
    # represented by a window of tokens (the central token and some nearby
    # tokens). Each value of this matrix is an index that represents a token
    # in the embedding.
    inWords = tensor.lmatrix("inWords")

    # Correct category of an offer.
    outLabel = tensor.lscalar("outLabel")

    # List of input tensors. One for each input layer.
    inputTensors = [inWords]

    # Whether the word embedding will be updated during training.
    embLayerTrainable = not args.fix_word_embedding

    if not embLayerTrainable:
        log.info("Not updating the word embedding!")

    # Lookup table for word features.
    embeddingLayer = EmbeddingLayer(inWords, wordEmbedding.getEmbeddingMatrix(), trainable=embLayerTrainable)

    # if not args.train and args.load_wordEmbedding:
    #     attrs = np.load(args.load_wordEmbedding)
    #     embeddingLayer.load(attrs)
    #     log.info("Loaded word embedding (shape %s) from file %s" % (
    #         str(attrs[0].shape), args.load_wordEmbedding))

    # The lookup table output has 3 dimensions (numTokens, szWindow, szEmbedding).
    # This layer flattens the last two dimensions, producing an output with
    # shape (numTokens, szWindow * szEmbedding).
    flattenInput = FlattenLayer(embeddingLayer)

    # Random weight initialization procedure.
    weightInit = GlorotUniform()

    # Convolution layer. Convolution over the text of an offer.
    convW = None
    convb = None

    if not args.train and args.load_conv:
        convNPY = np.load(args.load_conv)
        convW = convNPY[0]
        convb = convNPY[1]
        log.info("Loaded convolutional layer (shape %s) from file %s" % (str(convW.shape), args.load_conv))

    convLinear = LinearLayer(flattenInput,
                             wordWindowSize * wordEmbedding.getEmbeddingSize(),
                             convSize,
                             W=convW,
                             b=convb,
                             weightInitialization=weightInit)

    # Max pooling layer.
    maxPooling = MaxPoolingLayer(convLinear)

    # Hidden layer.
    if not args.train and args.load_hiddenLayer:
        hiddenNPY = np.load(args.load_hiddenLayer)
        W1 = hiddenNPY[0]
        b1 = hiddenNPY[1]
        log.info("Loaded hidden layer (shape %s) from file %s" % (str(W1.shape), args.load_hiddenLayer))

    hiddenLinear = LinearLayer(maxPooling,
                               convSize,
                               hiddenLayerSize,
                               W=W1,
                               b=b1,
                               weightInitialization=weightInit)
    hiddenAct = ActivationLayer(hiddenLinear, tanh)

    # Linear input of the softmax layer.
    if not args.train and args.load_softmax:
        hiddenNPY = np.load(args.load_softmax)
        W2 = hiddenNPY[0]
        b2 = hiddenNPY[1]
        log.info("Loaded softmax layer (shape %s) from file %s" % (str(W2.shape), args.load_softmax))

    sotmaxLinearInput = LinearLayer(hiddenAct,
                                    hiddenLayerSize,
                                    labelLexicon.getLen(),
                                    W=W2,
                                    b=b2,
                                    weightInitialization=ZeroWeightGenerator())

    # Softmax.
    # softmaxAct = ReshapeLayer(ActivationLayer(sotmaxLinearInput, softmax), (1, -1))
    softmaxAct = ActivationLayer(sotmaxLinearInput, softmax)

    # Prediction layer (argmax).
    prediction = ArgmaxPrediction(None).predict(softmaxAct.getOutput())

    # Loss function.
    if args.label_weights is not None and len(args.label_weights) != labelLexicon.getLen():
        log.error("Number of label weights (%d) is different from number of labels (%d)!" % (
            len(args.label_weights), labelLexicon.getLen()))
    nlloe = NegativeLogLikelihoodOneExample(weights=args.label_weights)
    loss = nlloe.calculateError(softmaxAct.getOutput()[0], prediction, outLabel)

    # Input generators: word window.
    inputGenerators = [
        WordWindowGenerator(wordWindowSize, wordLexicon, filters, startSymbol, endSymbol)
    ]

    # Output generator: generate one label per offer.
    outputGenerators = [TextLabelGenerator(labelLexicon)]
    # outputGenerators = [lambda label: labelLexicon.put(label)]

    evalPerIteration = None

    if normalizeMethod == "minmax":
        log.info("Normalization: minmax")
        wordEmbedding.minMaxNormalization()
    elif normalizeMethod == "mean":
        log.info("Normalization: mean normalization")
        wordEmbedding.meanNormalization()
    elif normalizeMethod == "zscore":
        log.info("Normalization: zscore normalization")
        wordEmbedding.zscoreNormalization()
    elif normalizeMethod:
        log.error("Normalization: unknown value %s" % normalizeMethod)
        sys.exit(1)

    # Learning rate decay.
    decay = None
    if args.decay == "none":
        decay = 0.0
    elif args.decay == "linear":
        decay = 1.0
    else:
        log.error("Unknown decay parameter %s." % args.decay)
        exit(1)

    # Learning algorithm.
    if args.alg == "adagrad":
        log.info("Using Adagrad")
        opt = Adagrad(lr=lr, decay=decay)
    elif args.alg == "sgd":
        log.info("Using SGD")
        opt = SGD(lr=lr, decay=decay)
    else:
        log.error("Unknown algorithm: %s." % args.alg)
        sys.exit(1)

    # TODO: debug
    # opt.lr.tag.test_value = 0.05

    # Printing embedding information.
    dictionarySize = wordEmbedding.getNumberOfVectors()
    embeddingSize = wordEmbedding.getEmbeddingSize()
    log.info("Dictionary size: %d" % dictionarySize)
    log.info("Embedding size: %d" % embeddingSize)
    log.info("Number of categories: %d" % labelLexicon.getLen())

    # TODO: debug
    # mode = theano.compile.debugmode.DebugMode(optimizer=None)
    mode = None
    model = BasicModel(x=inputTensors,
                       y=[outLabel],
                       allLayers=softmaxAct.getLayerSet(),
                       optimizer=opt,
                       prediction=prediction,
                       loss=loss,
                       mode=mode)

    wordWindow = WordWindowGenerator(wordWindowSize, wordLexicon, filters, startSymbol, endSymbol)

    # Gets the hidden layer:
    # graph = EmbeddingGraph([inWords], [hiddenAct.getOutput()], wordWindow)

    # Graph for the prediction layer.
    graph = EmbeddingGraph(inputTensors, prediction, wordWindow, mode)

    lblTxt = ["Sim", "Nao"]

    tweets = []
    with open(sys.argv[2]) as inputFile:
        content = inputFile.readlines()
        for line in content:
            tweets.append(line.decode('utf-8').encode('utf-8'))

    # print tweets

    # graph.getResultsFor(t) returns the prediction for a given tweet t.
    try:
        output_file = open("Output.txt", "w")
    except:
        print "Failed to create the output file\n"

    try:
        for t in tweets:
            output_file.write(t.replace('\n', '').replace('\t', '') + "\t " + lblTxt[graph.getResultsFor(t)] + "\n")
        print "Results generated successfully!\n"
    except:
        print "Error while generating results\n"
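# I/O sketch: sys.argv[2] is a plain-text file with one tweet per line. For each
# tweet, one line is written to Output.txt in the form
#
#     <tweet without tabs/newlines> + "\t " + ("Sim" or "Nao")
#
# where the label is selected by the class index returned by graph.getResultsFor(t).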
log.info("Reading test examples") testReader = DocReader(args.test) testIterator = SyncBatchIterator(testReader, inputGenerators, outputGenerators, -1, shuffle=False) log.info("Testing") model.test(testIterator) if __name__ == '__main__': # Load logging configuration. full_path = os.path.realpath(__file__) path, filename = os.path.split(full_path) logging.config.fileConfig(os.path.join(path, 'logging.conf'), defaults={}) log = logging.getLogger(__name__) if len(sys.argv) != 2: log.error('Syntax error! Expected JSON arguments file.') sys.exit(1) # Load arguments from JSON input file. argsDict = JsonArgParser(PARAMETERS).parse(sys.argv[1]) args = dict2obj(argsDict, 'DocClassificationArguments') logging.getLogger(__name__).info(argsDict) main(args)