Example #1
import os
import cPickle

# Module names taken from Example #2; the module that provides
# TextCNNModelTrainer is not shown in these examples.
from config import get_training_config_from_json
from datasets import build_data
import wordvecs


def trainer_helper(configFile, dataSetFile, tempModel):
    print "Training model on ", configFile, dataSetFile
    config = get_training_config_from_json(configFile)
    sentences, vocab, labels = build_data(dataSetFile, True)
    word_vecs = wordvecs.load_wordvecs(config.word2vec, vocab)
    trainer = TextCNNModelTrainer(config, word_vecs, sentences, labels)
    trainer.train(tempModel)
    print "Successfully trained model on ", configFile, dataSetFile, " and model is at ", tempModel
    print "Will proceed to test the model on the same data. If everything is correct, you should see the same accuracy"
    # Reload the pickled model as a smoke test, then clean up the temporary file
    model = cPickle.load(open(tempModel, "rb"))
    op = model.classify(sentences)  # classifications are discarded; this is only a sanity check
    os.remove(tempModel)
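A minimal sketch of calling trainer_helper, reusing the sample dataset path from Example #4; the config and temp-model paths are hypothetical:

trainer_helper("../sample/config/cnn_config.json",          # hypothetical config path
               "../sample/datasets/sst_small_sample.csv",
               "/tmp/textcnn_tmp.model")                    # hypothetical temp model path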
Example #2
__author__ = 'devashish.shankar'

import sys

# Project modules; the module providing MultiChannelTrainer and
# TextCNNModelTrainer is not shown in these examples.
import config
import datasets
import wordvecs

if __name__ == "__main__":
    if len(sys.argv) < 5:
        print "Usage: training.py"
        print "\t<model config file path>"
        print "\t<training data file path>"
        print "\t<file path to store classifier model>"
        print "\t<true/false(preprocessing flag)>"
        exit(0)

    # Parse command-line arguments
    config_file = sys.argv[1]
    train_data_file = sys.argv[2]
    model_output_file = sys.argv[3]
    preprocess = sys.argv[4].lower()

    training_config = config.get_training_config_from_json(config_file)
    sentences, vocab, labels = datasets.build_data(train_data_file, preprocess)
    word_vecs = wordvecs.load_wordvecs(training_config.word2vec, vocab)

    if training_config.mode == "multichannel":
        nntrainer = MultiChannelTrainer(training_config, word_vecs, sentences,
                                        labels)
    else:
        nntrainer = TextCNNModelTrainer(training_config, word_vecs, sentences,
                                        labels)

    nntrainer.train(model_output_file)
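Only two fields of the JSON training config are visible in these examples; a minimal sketch of writing one, assuming everything else the trainers need is filled in elsewhere:

import json

cfg = {
    "word2vec": "/path/to/word2vec_vectors.bin",  # hypothetical path to pretrained vectors
    "mode": "multichannel",                       # any other value selects TextCNNModelTrainer
}
with open("cnn_config.json", "w") as f:
    json.dump(cfg, f)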
Example #3
    # Fragment of a script's __main__ block; assumes sys, os, cPickle,
    # datasets and evaluate are imported earlier in the file.
    modelfile = sys.argv[1]
    testfile = sys.argv[2]  # not in the original fragment, but implied by its use below
    outputdir = sys.argv[3]
    preprocess = sys.argv[4].lower()
    load_word_vecs = sys.argv[5].lower() == "true"

    if not os.path.exists(outputdir):
        print "Output dir ", outputdir, " doesn't exist. Creating it"
        os.makedirs(outputdir)
    else:
        print "Using Output dir ", outputdir, ". Any previous results in this dir on same dataset might get overwritten. "
    model = cPickle.load(open(modelfile, "rb"))
    if load_word_vecs:
        print "Loading word vectors"
        model.add_global_word_vecs({})
        print "Loading word vectors done"
    sentences, vocab, labels = datasets.build_data(testfile, preprocess)
    labels = model.get_labels()  # use the model's own label ordering, not the dataset's
    output = model.classify(sentences)
    # Free memory
    del model
    print "Removed model from memory"
    # Reformat the output into the older row format expected by evaluate():
    # output[0] holds predicted label indices, output[1] per-class probabilities
    # TODO: change evaluate() to accept the newer format, which is cleaner
    data = []
    for i in range(len(output[0])):
        actual_label = sentences[i]["y"]
        text = sentences[i]["text"]
        predicted_label = output[0][i]
        predicted_prob = output[1][i][predicted_label]
        data.append([predicted_prob, labels[predicted_label], labels[actual_label], text])
    evaluate(data, outputdir)
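The rows handed to evaluate() pair a confidence score with the predicted and actual label names. A hypothetical helper, not part of the project, showing how overall accuracy could be computed from them:

def accuracy(rows):
    # row[1] is the predicted label name, row[2] the actual one (see the loop above)
    correct = sum(1 for row in rows if row[1] == row[2])
    return float(correct) / len(rows)

print "Accuracy: ", accuracy(data)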
Example #4
from datasets import build_data  # module name taken from Example #2


def test_dataset_reader():
    sentences, vocabs, labels = build_data("../sample/datasets/sst_small_sample.csv")
    assert len(sentences) == 300
    assert len(labels) == 2
    assert "neg" in labels and "pos" in labels
Example #5
# Same entry point as Example #2; assumes the same imports (sys, config,
# datasets, wordvecs and the two trainer classes).
if __name__ == "__main__":
    if len(sys.argv) < 5:
        print "Usage: training.py"
        print "\t<model config file path>"
        print "\t<training data file path>"
        print "\t<file path to store classifier model>"
        print "\t<true/false(preprocessing flag)>"
        exit(0)

    # Parse command-line arguments
    config_file = sys.argv[1]
    train_data_file = sys.argv[2]
    model_output_file = sys.argv[3]
    preprocess = sys.argv[4].lower()

    training_config = config.get_training_config_from_json(config_file)
    sentences, vocab, labels = datasets.build_data(train_data_file, preprocess)
    print "Dataset loaded"
    word_vecs = wordvecs.load_wordvecs(training_config.word2vec, vocab)
    print "Loaded word vecs from file"

    if training_config.mode == "multichannel":
        nntrainer = MultiChannelTrainer(training_config, word_vecs, sentences, labels)
    else:
        nntrainer = TextCNNModelTrainer(training_config, word_vecs, sentences, labels)

    nntrainer.train(model_output_file)