Example #1
import csv
import os
import random
import time

import numpy as np
import theano
import theano.tensor as T
import lasagne
from sklearn.metrics import f1_score

# CORPUS, winsize, word2vec_model, mklsts, build_mlp, iterate_minibatches,
# setAnnotations, Pr, Re, f1 and data_utils are defined elsewhere in this
# project and are not part of the snippet.

def main(model='mlp', num_epochs=500):
    files = os.listdir(CORPUS)
    random.seed(20)  # seed before shuffling so the split is reproducible
    random.shuffle(files)
    print len(files)
    fs1 = files[:1200]      # training files
    fs2 = files[1200:1500]  # validation files
    fs3 = files[1500:]      # test files


    # Feature matrices, token lists and labels for the train/validation files
    X_train, words_train, y_train = mklsts(CORPUS, fs1, winsize, word2vec_model)
    X_val, words_val, y_val = mklsts(CORPUS, fs2, winsize, word2vec_model)


    # Symbolic inputs and integer class targets for Theano
    input_var = T.matrix('inputs')
    target_var = T.ivector('targets')
    network = build_mlp(input_var)

    prediction = lasagne.layers.get_output(network)
    loss = lasagne.objectives.categorical_crossentropy(prediction, target_var)

    # Mean cross-entropy plus a small L2 penalty on all network weights
    loss = loss.mean() + 1e-4 * lasagne.regularization.regularize_network_params(
        network, lasagne.regularization.l2)

    params = lasagne.layers.get_all_params(network, trainable=True)
    updates = lasagne.updates.nesterov_momentum(loss, params, learning_rate=0.01, momentum=0.9)


    # deterministic=True disables dropout for evaluation
    eval_prediction = lasagne.layers.get_output(network, deterministic=True)
    eval_loss = lasagne.objectives.categorical_crossentropy(eval_prediction,
                                                            target_var)
    eval_loss = eval_loss.mean()

    eval_acc = T.mean(T.eq(T.argmax(eval_prediction, axis=1), target_var),
                      dtype=theano.config.floatX)


    # Compile one function for a training step and one for validation
    train_fn = theano.function([input_var, target_var], loss, updates=updates)
    val_fn = theano.function([input_var, target_var], [eval_loss, eval_acc])

    for epoch in range(num_epochs):
        # In each epoch, we do a full pass over the training data:
        train_err = 0
        train_batches = 0
        start_time = time.time()
        for batch in iterate_minibatches(X_train, y_train, 100, shuffle=True):
            inputs, targets = batch
            train_err += train_fn(inputs, targets)
            train_batches += 1

        # And a full pass over the validation data:
        val_err = 0
        val_acc = 0
        val_batches = 0
        for batch in iterate_minibatches(X_val, y_val, 100, shuffle=False):
            inputs, targets = batch
            err, acc = val_fn(inputs, targets)
            val_err += err
            val_acc += acc
            val_batches += 1

        # Then we print the results for this epoch:
        print("Epoch {} of {} took {:.3f}s".format(
            epoch + 1, num_epochs, time.time() - start_time))
        print("  training loss:\t\t{:.6f}".format(train_err / train_batches))
        print("  validation loss:\t\t{:.6f}".format(val_err / val_batches))
        print("  validation accuracy:\t\t{:.2f} %".format(
            val_acc / val_batches * 100))

    del X_train, words_train, y_train, X_val, words_val, y_val
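    # Persist the trained weights; they can be restored later with np.load and
    # lasagne.layers.set_all_param_values (see the sketch after this example).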
    np.savez('/home/anna/Documents/News Classifier/model-eng.npz', *lasagne.layers.get_all_param_values(network))

    # Evaluate on the held-out test files
    X_test, words_test, y_test = mklsts(CORPUS, fs3, winsize, word2vec_model)
    test_prediction = lasagne.layers.get_output(network, deterministic=True)
    predict_fn = theano.function([input_var], T.argmax(test_prediction, axis=1))
    pred = list(predict_fn(X_test))

    # Span-level evaluation with lenient matching
    predannotations = setAnnotations(pred, {1: 'Location'}, exactness='lenient')
    clsannotations = setAnnotations(y_test, {1: 'Location'}, exactness='lenient')

    print len(pred)
    print len(y_test)
    score = f1_score(y_test, pred, average=None)  # sklearn expects (y_true, y_pred)
    print score

    print Pr(predannotations, clsannotations)
    print Re(predannotations, clsannotations)
    print f1(predannotations, clsannotations)

    # Repeat with the default (non-lenient) span matching
    predannotations = setAnnotations(pred, {1: 'Location'})
    clsannotations = setAnnotations(y_test, {1: 'Location'})

    print Pr(predannotations, clsannotations)
    print Re(predannotations, clsannotations)
    print f1(predannotations, clsannotations)


    for fname in fs3:
        X_test, words_test, y_test = mklsts(CORPUS, [fname], winsize, word2vec_model)

        pred = list(predict_fn(X_test))
        predannotations = setAnnotations(pred, {1: 'Location'})
        clsannotations = setAnnotations(y_test, {1: 'Location'})

        predlabels = data_utils.setlabels(words_test, predannotations)
        clslabels = data_utils.setlabels(words_test, clsannotations)

        # Append one row per document: text, gold labels, predicted labels
        with open('/home/anna/Documents/News Classifier/eng-toponyms.csv', 'a+') as out:
            writer = csv.writer(out, delimiter='\t')
            writer.writerow([' '.join(words_test).encode('utf-8'),
                             ', '.join(clslabels).encode('utf-8'),
                             ', '.join(predlabels).encode('utf-8')])
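The example relies on two helpers that are not shown. iterate_minibatches below follows the standard Lasagne tutorial implementation this training loop is modeled on; build_mlp is only a sketch, since the real network definition is not part of the snippet (INPUT_DIM, the hidden-layer width and the dropout rates are assumptions).

def iterate_minibatches(inputs, targets, batchsize, shuffle=False):
    # Yield (inputs, targets) slices of size batchsize, as in the Lasagne examples.
    assert len(inputs) == len(targets)
    if shuffle:
        indices = np.arange(len(inputs))
        np.random.shuffle(indices)
    for start_idx in range(0, len(inputs) - batchsize + 1, batchsize):
        if shuffle:
            excerpt = indices[start_idx:start_idx + batchsize]
        else:
            excerpt = slice(start_idx, start_idx + batchsize)
        yield inputs[excerpt], targets[excerpt]

INPUT_DIM = 1500  # hypothetical: embedding size x window width, not given in the snippet

def build_mlp(input_var=None):
    # Minimal two-class MLP with a softmax output, matching how the example
    # uses categorical_crossentropy and argmax over axis 1.
    l_in = lasagne.layers.InputLayer(shape=(None, INPUT_DIM), input_var=input_var)
    l_hid = lasagne.layers.DenseLayer(
        lasagne.layers.DropoutLayer(l_in, p=0.2), num_units=512,
        nonlinearity=lasagne.nonlinearities.rectify)
    return lasagne.layers.DenseLayer(
        lasagne.layers.DropoutLayer(l_hid, p=0.5), num_units=2,
        nonlinearity=lasagne.nonlinearities.softmax)

The saved model-eng.npz can later be restored with the usual Lasagne pattern:

with np.load('/home/anna/Documents/News Classifier/model-eng.npz') as f:
    lasagne.layers.set_all_param_values(
        network, [f['arr_%d' % i] for i in range(len(f.files))])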
Example #2
    # (Snippet from a larger training routine: engine, F, fs2, winsize and the
    # word2vec models are defined in code that is not shown here.)
    engine.fit(engine.trainData, learningrate=0.05, maxEpochs=200,
               file='training.txt', verbose=True, modelfile='network.model', FOLDER=F)

    # Free the training structures before building the test set
    del neurons, target, words, cls
    del engine.trainData

    neurons1, target1, words1, cls1 = data_utils.mklsts(CORPUS, fs2, winsize, word2vec_model, word2vec_gram_model)
    testDS = data_utils.DataSet(neurons1, target1, words1, cls1)


    engine.setDS('testData', testDS)

    print engine.predict(engine.testData)

    for fname in fs2:
        neurons, target, words, cls = data_utils.mklsts(CORPUS, [fname], winsize, word2vec_model, word2vec_gram_model)
        engine.setDS('DS', data_utils.DataSet(neurons, target, words, cls))

        engine.predict(engine.DS)

        predlabels = data_utils.setlabels(words, engine.DS.annotationSets['pred'])
        clslabels = data_utils.setlabels(words, engine.DS.annotationSets['class'])

        # Append one row per document: text | gold labels | predicted labels
        with open(os.path.join(F, 'results.csv'), 'a+') as out:
            writer = csv.writer(out, delimiter='|')
            writer.writerow([' '.join(words).encode('utf-8'),
                             ', '.join(clslabels).encode('utf-8'),
                             ', '.join(predlabels).encode('utf-8')])
        del engine.DS
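A minimal sketch, assuming the same F as above, for reading the pipe-delimited results file back and comparing the gold and predicted label columns:

with open(os.path.join(F, 'results.csv')) as results:
    for text, gold, predicted in csv.reader(results, delimiter='|'):
        gold_set = set(l.strip() for l in gold.split(','))
        pred_set = set(l.strip() for l in predicted.split(','))
        print len(gold_set & pred_set), 'toponyms found in both columns'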
Example #3
File: fires.py Project: AnnPe/maps


if __name__ == "__main__":
    dt = []
    labels = []
    # Columns 9 and 10 hold the news text; column 11 the gold label.
    # 'rb': Python 2's csv module expects binary mode.
    with open(os.path.join(F, "Crisismap - news.csv"), "rb") as infile:
        reader = csv.reader(infile, delimiter=";")
        for row in reader:
            dt.append(row[9].decode("utf-8") + ". " + row[10].decode("utf-8"))
            labels.append(row[11])
    # Tag each article with both the current model and the older one
    engine = data_utils.Engine(model=model)
    old_engine = data_utils.Engine(model=old_model)
    for row, lab in zip(dt, labels):
        words, lemmas, grams = data_utils.lsts(row)
        neurons = list(data_utils.neurons(words, lemmas, grams, winsize, word2vec_model, word2vec_gram_model))
        engine.DS = data_utils.UnsupervisedData(neurons, words)
        old_engine.DS = data_utils.UnsupervisedData(neurons, words)

        engine.predict(engine.DS)
        old_engine.predict(old_engine.DS)
        # DS.setAnnotations('pred', 'pred', {1: 'Location'}, exactness = 'left')
        # Concatenate the location labels predicted by the two models
        predlabels = data_utils.setlabels(words, engine.DS.annotationSets["pred"]) + data_utils.setlabels(
            words, old_engine.DS.annotationSets["pred"]
        )

        with open(os.path.join(F, "crisimap.csv"), "a+") as f:
            writer = csv.writer(f, delimiter="|")
            writer.writerow([" ".join(words).encode("utf-8"), str(lab), ", ".join(predlabels).encode("utf-8")])
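All three examples assume pre-loaded embedding models (word2vec_model, word2vec_gram_model). A typical way to obtain them, assuming they were trained and saved with gensim (the paths are hypothetical):

from gensim.models import Word2Vec

word2vec_model = Word2Vec.load('word2vec.model')            # word embeddings
word2vec_gram_model = Word2Vec.load('word2vec-gram.model')  # grammatical-tag embeddings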