def trainer_helper(configFile,dataSetFile,tempModel): print "Training model on ",configFile,dataSetFile config = get_training_config_from_json(configFile) sentences, vocab, labels = build_data(dataSetFile,True) word_vecs = wordvecs.load_wordvecs(config.word2vec,vocab) trainer = TextCNNModelTrainer(config,word_vecs,sentences,labels) trainer.train(tempModel) print "Succesfully trained model on ",configFile,dataSetFile," and model is at ",tempModel print "Will proceed at testing the model on same data. If everything is correct, you should see the same accuracy" model = cPickle.load(open(tempModel,"rb")) op = model.classify(sentences) os.remove(tempModel)
__author__ = 'devashish.shankar' if __name__ == "__main__": if len(sys.argv) < 5: print "Usage: training.py" print "\t<model config file path>" print "\t<training data file path>" print "\t<file path to store classifier model>" print "\t<true/false(preprocessing flag)>" exit(0) #processing.. config_file = sys.argv[1] train_data_file = sys.argv[2] model_output_file = sys.argv[3] preprocess = sys.argv[4].lower() training_config = config.get_training_config_from_json(config_file) sentences, vocab, labels = datasets.build_data(train_data_file, preprocess) word_vecs = wordvecs.load_wordvecs(training_config.word2vec, vocab) if training_config.mode == "multichannel": nntrainer = MultiChannelTrainer(training_config, word_vecs, sentences, labels) else: nntrainer = TextCNNModelTrainer(training_config, word_vecs, sentences, labels) nntrainer.train(model_output_file)
modelfile = sys.argv[1] outputdir = sys.argv[3] preprocess = sys.argv[4].lower() load_word_vecs = sys.argv[5].lower() == "true" if not os.path.exists(outputdir): print "Output dir ", outputdir, " doesn't exist. Creating it" os.makedirs(outputdir) else: print "Using Output dir ", outputdir, ". Any previous results in this dir on same dataset might get overwritten. " model = cPickle.load(open(modelfile, "rb")) if load_word_vecs: print "Loading word vectors" model.add_global_word_vecs({}) print "Loading word vectors done" sentences, vocab, labels = datasets.build_data(testfile, preprocess) labels = model.get_labels() output = model.classify(sentences) # Free memory del model print "Removed model from memory" # Format the output to earlier format # TODO evaluate function should be changed to accept newer format, which is cleaner data = [] for i in range(len(output[0])): actual_label = sentences[i]["y"] text = sentences[i]["text"] predicted_label = output[0][i] predicted_prob = output[1][i][predicted_label] data.append([predicted_prob, labels[predicted_label], labels[actual_label], text]) evaluate(data, outputdir)
modelfile = sys.argv[1] outputdir = sys.argv[3] preprocess = sys.argv[4].lower() load_word_vecs = sys.argv[5].lower() == "true" if not os.path.exists(outputdir): print "Output dir ", outputdir, " doesn't exist. Creating it" os.makedirs(outputdir) else: print "Using Output dir ", outputdir, ". Any previous results in this dir on same dataset might get overwritten. " model = cPickle.load(open(modelfile, "rb")) if load_word_vecs: print "Loading word vectors" model.add_global_word_vecs({}) print "Loading word vectors done" sentences, vocab, labels = datasets.build_data(testfile, preprocess) labels = model.get_labels() output = model.classify(sentences) #Free memory del model print "Removed model from memory" #Format the output to earlier format #TODO evaluate function should be changed to accept newer format, which is cleaner data = [] for i in range(len(output[0])): actual_label = sentences[i]['y'] text = sentences[i]['text'] predicted_label = output[0][i] predicted_prob = output[1][i][predicted_label] data.append([ predicted_prob, labels[predicted_label], labels[actual_label], text
def test_dataset_reader():
    """Sanity-check build_data on the small SST sample.

    Expects 300 parsed sentences and exactly the two labels
    "pos" and "neg".
    """
    parsed_sentences, parsed_vocab, parsed_labels = build_data("../sample/datasets/sst_small_sample.csv")
    assert len(parsed_sentences) == 300
    assert len(parsed_labels) == 2
    for expected_label in ("neg", "pos"):
        assert expected_label in parsed_labels
if __name__=="__main__": if len(sys.argv)<5: print "Usage: training.py" print "\t<model config file path>" print "\t<training data file path>" print "\t<file path to store classifier model>" print "\t<true/false(preprocessing flag)>" exit(0) #processing.. config_file=sys.argv[1] train_data_file=sys.argv[2] model_output_file=sys.argv[3] preprocess=sys.argv[4].lower() training_config = config.get_training_config_from_json(config_file) sentences, vocab, labels = datasets.build_data(train_data_file,preprocess) print "Dataset loaded" word_vecs = wordvecs.load_wordvecs(training_config.word2vec,vocab) print "Loaded word vecs from file" if training_config.mode=="multichannel": nntrainer = MultiChannelTrainer(training_config,word_vecs,sentences,labels) else: nntrainer = TextCNNModelTrainer(training_config,word_vecs,sentences,labels) nntrainer.train(model_output_file)