import os
import csv
import time
import random

import numpy as np
import theano
import theano.tensor as T
import lasagne
from sklearn.metrics import f1_score

# CORPUS, winsize, word2vec_model, mklsts, build_mlp, iterate_minibatches,
# setAnnotations, Pr, Re, f1 and data_utils are assumed to be defined or
# imported elsewhere in this project.


def main(model='mlp', num_epochs=500):
    # Split the corpus into train / validation / test file lists.
    # Seed before shuffling so the split is reproducible.
    files = os.listdir(CORPUS)
    random.seed(20)
    random.shuffle(files)
    print(len(files))
    fs1 = files[:1200]       # train
    fs2 = files[1200:1500]   # validation
    fs3 = files[1500:]       # test

    X_train, words_train, y_train = mklsts(CORPUS, fs1, winsize, word2vec_model)
    X_val, words_val, y_val = mklsts(CORPUS, fs2, winsize, word2vec_model)

    # Symbolic inputs and the MLP itself.
    input_var = T.matrix('inputs')
    target_var = T.ivector('targets')
    network = build_mlp(input_var)

    # Training loss: categorical cross-entropy plus a small L2 penalty.
    prediction = lasagne.layers.get_output(network)
    loss = lasagne.objectives.categorical_crossentropy(prediction, target_var)
    loss = loss.mean() + 1e-4 * lasagne.regularization.regularize_network_params(
        network, lasagne.regularization.l2)
    params = lasagne.layers.get_all_params(network, trainable=True)
    updates = lasagne.updates.nesterov_momentum(
        loss, params, learning_rate=0.01, momentum=0.9)

    # Deterministic expressions for validation loss and accuracy.
    eval_prediction = lasagne.layers.get_output(network, deterministic=True)
    eval_loss = lasagne.objectives.categorical_crossentropy(eval_prediction, target_var)
    eval_loss = eval_loss.mean()
    eval_acc = T.mean(T.eq(T.argmax(eval_prediction, axis=1), target_var),
                      dtype=theano.config.floatX)

    train_fn = theano.function([input_var, target_var], loss, updates=updates)
    val_fn = theano.function([input_var, target_var], [eval_loss, eval_acc])

    for epoch in range(num_epochs):
        # In each epoch, we do a full pass over the training data:
        train_err = 0
        train_batches = 0
        start_time = time.time()
        for batch in iterate_minibatches(X_train, y_train, 100, shuffle=True):
            inputs, targets = batch
            train_err += train_fn(inputs, targets)
            train_batches += 1

        # And a full pass over the validation data:
        val_err = 0
        val_acc = 0
        val_batches = 0
        for batch in iterate_minibatches(X_val, y_val, 100, shuffle=False):
            inputs, targets = batch
            err, acc = val_fn(inputs, targets)
            val_err += err
            val_acc += acc
            val_batches += 1

        # Then we print the results for this epoch:
        print("Epoch {} of {} took {:.3f}s".format(
            epoch + 1, num_epochs, time.time() - start_time))
        print(" training loss:\t\t{:.6f}".format(train_err / train_batches))
        print(" validation loss:\t\t{:.6f}".format(val_err / val_batches))
        print(" validation accuracy:\t\t{:.2f} %".format(
            val_acc / val_batches * 100))

    del X_train, words_train, y_train, X_val, words_val, y_val

    # Persist the trained parameters.
    np.savez('/home/anna/Documents/News Classifier/model-eng.npz',
             *lasagne.layers.get_all_param_values(network))

    # Evaluate on the held-out test files.
    X_test, words_test, y_test = mklsts(CORPUS, fs3, winsize, word2vec_model)
    test_prediction = lasagne.layers.get_output(network, deterministic=True)
    predict_fn = theano.function([input_var], T.argmax(test_prediction, axis=1))
    pred = list(predict_fn(X_test))

    # Span-level scores with lenient (overlap-based) matching.
    predannotations = setAnnotations(pred, {1: 'Location'}, exactness='lenient')
    clsannotations = setAnnotations(y_test, {1: 'Location'}, exactness='lenient')
    print(len(pred))
    print(len(y_test))
    score = f1_score(y_test, pred, average=None)  # token-level per-class F1
    print(score)
    print(Pr(predannotations, clsannotations))
    print(Re(predannotations, clsannotations))
    print(f1(predannotations, clsannotations))

    # The same scores with exact span matching.
    predannotations = setAnnotations(pred, {1: 'Location'})
    clsannotations = setAnnotations(y_test, {1: 'Location'})
    print(Pr(predannotations, clsannotations))
    print(Re(predannotations, clsannotations))
    print(f1(predannotations, clsannotations))

    # Dump per-file predictions alongside the gold labels.
    for fname in fs3:
        X_test, words_test, y_test = mklsts(CORPUS, [fname], winsize, word2vec_model)
        pred = list(predict_fn(X_test))
        predannotations = setAnnotations(pred, {1: 'Location'})
        clsannotations = setAnnotations(y_test, {1: 'Location'})
        predlabels = data_utils.setlabels(words_test, predannotations)
        clslabels = data_utils.setlabels(words_test, clsannotations)
        # Append one row per test file: text, gold toponyms, predicted toponyms.
        with open('/home/anna/Documents/News Classifier/eng-toponyms.csv', 'a+') as out:
            writer = csv.writer(out, delimiter='\t')
            writer.writerow([' '.join(words_test).encode('utf-8'),
                             ', '.join(clslabels).encode('utf-8'),
                             ', '.join(predlabels).encode('utf-8')])
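# The training loop in main() relies on an iterate_minibatches() helper that is
# not shown in this file. A minimal sketch follows, modelled on the standard
# Lasagne examples; the project's real helper may differ, so treat this as an
# assumption rather than the actual implementation.


def iterate_minibatches(inputs, targets, batchsize, shuffle=False):
    # Yield (inputs, targets) slices of size `batchsize`; the final partial
    # batch is dropped, matching the usual Lasagne example behaviour.
    assert len(inputs) == len(targets)
    indices = np.arange(len(inputs))
    if shuffle:
        np.random.shuffle(indices)
    for start in range(0, len(inputs) - batchsize + 1, batchsize):
        excerpt = indices[start:start + batchsize]
        yield inputs[excerpt], targets[excerpt]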
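# main() stores the trained weights with np.savez. To reuse the model later the
# parameters can be loaded back into a freshly built network; this is the
# standard Lasagne pattern, sketched here under the assumption that build_mlp()
# reconstructs the same architecture that was trained.


def load_trained_network(npz_path):
    # Rebuild the graph, then overwrite its parameters with the saved values.
    input_var = T.matrix('inputs')
    network = build_mlp(input_var)
    with np.load(npz_path) as data:
        param_values = [data['arr_%d' % i] for i in range(len(data.files))]
    lasagne.layers.set_all_param_values(network, param_values)
    return network, input_var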
# Alternative pipeline: fit a data_utils.Engine model, evaluate it on fs2 and
# dump per-file predictions next to the gold labels.
engine.fit(engine.trainData, learningrate=0.05, maxEpochs=200,
           file='training.txt', verbose=True,
           modelfile='network.model', FOLDER=F)
del neurons, target, words, cls
del engine.trainData

neurons1, target1, words1, cls1 = data_utils.mklsts(
    CORPUS, fs2, winsize, word2vec_model, word2vec_gram_model)
testDS = data_utils.DataSet(neurons1, target1, words1, cls1)
engine.setDS('testData', testDS)
print(engine.predict(engine.testData))

for f in fs2:
    neurons, target, words, cls = data_utils.mklsts(
        CORPUS, [f], winsize, word2vec_model, word2vec_gram_model)
    engine.setDS('DS', data_utils.DataSet(neurons, target, words, cls))
    engine.predict(engine.DS)
    predlabels = data_utils.setlabels(words, engine.DS.annotationSets['pred'])
    clslabels = data_utils.setlabels(words, engine.DS.annotationSets['class'])
    # One row per file: text | gold toponyms | predicted toponyms.
    with open(os.path.join(F, 'results.csv'), 'a+') as out:
        writer = csv.writer(out, delimiter='|')
        writer.writerow([' '.join(words).encode('utf-8'),
                         ', '.join(clslabels).encode('utf-8'),
                         ', '.join(predlabels).encode('utf-8')])
    del engine.DS
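# results.csv grows one '|'-separated row per article (text, gold labels,
# predicted labels). A small sketch for reading it back and counting how often
# the predicted toponym list matches the gold list exactly; the file name and
# delimiter come from the code above, everything else here is illustrative.


def summarise_results(path):
    total = matches = 0
    with open(path) as res:
        for text, gold, predicted in csv.reader(res, delimiter='|'):
            total += 1
            if gold.strip() == predicted.strip():
                matches += 1
    return matches, total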
if __name__ == "__main__":
    # Score the crisis-map news dump with both the current and the previous
    # model (`model` and `old_model` are expected to be defined at module level).
    dt = []
    labels = []
    with open(os.path.join(F, "Crisismap - news.csv"), "r") as infile:
        reader = csv.reader(infile, delimiter=";")
        for row in reader:
            # The article text is built from columns 9 and 10; the label is column 11.
            dt.append(row[9].decode("utf-8") + ". " + row[10].decode("utf-8"))
            labels.append(row[11])

    engine = data_utils.Engine(model=model)
    old_engine = data_utils.Engine(model=old_model)

    for row, lab in zip(dt, labels):
        words, lemmas, grams = data_utils.lsts(row)
        neurons = list(data_utils.neurons(words, lemmas, grams, winsize,
                                          word2vec_model, word2vec_gram_model))
        engine.DS = data_utils.UnsupervisedData(neurons, words)
        old_engine.DS = data_utils.UnsupervisedData(neurons, words)
        engine.predict(engine.DS)
        old_engine.predict(old_engine.DS)
        # DS.setAnnotations('pred', 'pred', {1: 'Location'}, exactness='left')
        predlabels = (data_utils.setlabels(words, engine.DS.annotationSets["pred"])
                      + data_utils.setlabels(words, old_engine.DS.annotationSets["pred"]))
        with open(os.path.join(F, "crisimap.csv"), "a+") as out:
            writer = csv.writer(out, delimiter="|")
            writer.writerow([" ".join(words).encode("utf-8"),
                             str(lab),
                             ", ".join(predlabels).encode("utf-8")])
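# data_utils.neurons() turns each token into the feature vector fed to the
# classifier. A minimal sketch of one plausible scheme follows: concatenating
# the word2vec embedding of each token with those of its neighbours in a
# symmetric window of `winsize`. The function name, the zero-padding for
# out-of-vocabulary words and the exact layout are assumptions for
# illustration, not the project's actual implementation.


def window_features(words, vectors, winsize, dim):
    # `vectors` is any dict-like mapping from token to a `dim`-sized embedding.
    def vec(i):
        if 0 <= i < len(words) and words[i] in vectors:
            return np.asarray(vectors[words[i]], dtype='float32')
        return np.zeros(dim, dtype='float32')

    rows = []
    for i in range(len(words)):
        # Concatenate the embeddings of the token and its winsize neighbours
        # on each side; positions outside the sentence contribute zeros.
        window = [vec(i + offset) for offset in range(-winsize, winsize + 1)]
        rows.append(np.concatenate(window))
    return np.vstack(rows)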