def evaluate(machine, valid_problem, info_suffix):
    """Run *machine* over the validation problem and print SRL eval results.

    Writes four files, each suffixed with *info_suffix*:
      - test/pred label files in the pretty format consumed by eval_srl()
      - test/pred "raw" label files: alternating lines of sentence text and
        tab-separated SRL type names
    Finally calls eval_srl() on the pretty files and prints the result.
    """
    test_label_file_path = "test_label_" + str(info_suffix) + ".txt"
    pred_label_file_path = "pred_label_" + str(info_suffix) + ".txt"
    test_raw_label_file_path = "test_raw_label_" + str(info_suffix) + ".txt"
    pred_raw_label_file_path = "pred_raw_label_" + str(info_suffix) + ".txt"

    test_types = []
    pred_types = []
    # `with` guarantees the handles are closed even if prediction raises
    # (the original left them open on exception).
    with open(test_label_file_path, "w") as test_label_file, \
         open(pred_label_file_path, "w") as pred_label_file, \
         open(test_raw_label_file_path, "w") as test_raw_label_file, \
         open(pred_raw_label_file_path, "w") as pred_raw_label_file:
        for valid_sentence in valid_problem.sentences():
            test_labels = []
            pred_labels = []
            sentence_str = " ".join(
                [word.content for word in valid_sentence.words()])
            srl_x, srl_y = valid_problem.get_dataset_for_sentence(valid_sentence)
            pred_y = machine.predict(srl_x.astype(theano.config.floatX))
            test_types.append(sentence_str)
            test_types.append(
                "\t".join([SrlTypes.LABEL_SRLTYPE_MAP[l] for l in srl_y]))
            pred_types.append(sentence_str)
            pred_types.append(
                "\t".join([SrlTypes.LABEL_SRLTYPE_MAP[l] for l in pred_y]))
            test_labels.append(srl_y)
            pred_labels.append(pred_y)
            test_label_str = valid_problem.pretty_srl_test_label(
                valid_sentence, test_labels)
            pred_label_str = valid_problem.pretty_srl_predict_label(
                valid_sentence, pred_labels)
            test_label_file.write(test_label_str)
            pred_label_file.write(pred_label_str)
        # NOTE(review): in the collapsed original it is ambiguous whether the
        # raw-file writes happened once per sentence (re-writing the growing
        # lists) or once after the loop; writing once here yields the same
        # final file contents either way.
        test_raw_label_file.write("\n".join(test_types))
        pred_raw_label_file.write("\n".join(pred_types))

    valid_result = eval_srl(test_label_file_path, pred_label_file_path)
    valid_info = 'validation info {0}% '.format(valid_result)
    print(valid_info)
def test_srl_label_formatter(data_file_path):
    """Smoke-test the SRL label pretty-printers with random predictions.

    Loads a CoNLL-05 corpus, and for each sentence writes gold labels and
    randomly chosen predicted labels through the pretty-printers, then feeds
    the two files to eval_srl(). If eval_srl() fails, dumps the raw and
    formatted predicted labels so the bad formatting can be inspected.
    """
    conll05corpora = Conll05Corpora()
    conll05corpora.load(data_file_path)
    print('load done')
    test_label_file_path = "test_label.txt"
    pred_label_file_path = "pred_label.txt"
    srl_problem = SRLProblem(conll05corpora)
    for valid_sentence in srl_problem.sentences():
        test_labels = []
        pred_labels = []
        # Files are rewritten per sentence so eval_srl checks one sentence at
        # a time; `with` ensures they are closed before eval_srl reads them,
        # even if a pretty-printer raises (original leaked handles on error).
        with open(test_label_file_path, "w") as test_label_file, \
             open(pred_label_file_path, "w") as pred_label_file:
            for srl_x, srl_y in srl_problem.get_dataset_for_sentence(valid_sentence):
                # list() keeps random.choice working on Python 3 dict views
                # too; on Python 2 .keys() was already a list, so no change.
                pred_y = [random.choice(list(SrlTypes.LABEL_SRLTYPE_MAP.keys()))
                          for i in range(srl_y.size)]
                test_labels.append(srl_y)
                pred_labels.append(pred_y)
            test_label_str = srl_problem.pretty_srl_test_label(
                valid_sentence, test_labels)
            pred_label_str = srl_problem.pretty_srl_predict_label(
                valid_sentence, pred_labels)
            test_label_file.write(test_label_str)
            pred_label_file.write(pred_label_str)
        try:
            valid_result = eval_srl(test_label_file_path, pred_label_file_path)
        except Exception:
            # Was a bare `except:` — that also swallowed KeyboardInterrupt
            # and SystemExit; Exception keeps the intended debug dump while
            # letting the interpreter-level signals through.
            print("label = ")
            print("\n".join(
                [" ".join([SrlTypes.LABEL_SRLTYPE_MAP[x] for x in label])
                 for label in pred_labels]))
            print("formatted_label = ")
            print(pred_label_str)
def train_srl_neural_model(train_problem, valid_problem, nn_architecture,
                           hyper_param, model_path=None, model_tag=None):
    """Train an SRL neural network with minibatch SGD.

    Parameters
    ----------
    train_problem, valid_problem : problem objects providing get_data_batch(),
        sentences(), get_dataset_for_sentence() and pretty_srl_label().
    nn_architecture : architecture description passed to SRLNetwork.
    hyper_param : object with learning_rate, l1_reg, l2_reg, n_epochs,
        learning_rate_decay_ratio and learning_rate_lowerbound attributes;
        its learning_rate is mutated in place by the per-epoch decay.
    model_path, model_tag : optional checkpoint to warm-start from.

    Every `validation_frequency` minibatches the model is evaluated on the
    validation set by pretty-printing gold/predicted labels to files and
    running eval_srl() on them.
    """
    problem_character = train_problem.get_problem_property()
    trans_mat_prior = train_problem.get_trans_mat_prior()
    srl_nn = SRLNetwork(problem_character, nn_architecture, trans_mat_prior)
    if model_path is not None:  # identity test instead of `!= None`
        srl_nn.load_model(model_path, model_tag)

    # The original declared an (entirely commented-out) early-stopping
    # scheme; only best_validation_loss survives for the final report.
    best_validation_loss = np.inf
    validation_frequency = 10000
    total_minibatch = 0
    epoch = 0
    done_looping = False

    train_func = get_train_func(srl_nn, hyper_param.learning_rate,
                                hyper_param.l1_reg, hyper_param.l2_reg)
    # get_test_func compiles the validation graph; the handle is unused here
    # but the call is kept in case compilation has needed side effects.
    valid_func = get_test_func(srl_nn)
    pred_func = get_pred_func(srl_nn)

    while (epoch < hyper_param.n_epochs) and (not done_looping):
        epoch += 1
        minibatch = 0
        for X, y in train_problem.get_data_batch():
            # Skip degenerate batches — presumably inputs shorter than the
            # context window; TODO confirm meaning of X[0][0].
            if X[0][0] < 3:
                continue
            # time.clock() is deprecated on Python 3; kept for parity with
            # the rest of this Python 2 codebase.
            start_time = time.clock()
            minibatch_avg_cost = train_func(X.astype("float32"),
                                            y.astype('int32'))
            end_time = time.clock()
            minibatch += 1
            total_minibatch += 1
            if minibatch % 100 == 0:
                debug_info = 'epoch {0}.{1}, cost = {2}, time = {3}'.format(
                    epoch, minibatch, minibatch_avg_cost,
                    end_time - start_time)
                print(debug_info)
            if total_minibatch % validation_frequency == 0:
                # Floor division keeps the checkpoint id an int on Python 3
                # too (was `/`, which would produce e.g. "test_label_1.0.txt").
                checkpoint = total_minibatch // validation_frequency
                test_label_file_path = "test_label_" + str(checkpoint) + ".txt"
                pred_label_file_path = "pred_label_" + str(checkpoint) + ".txt"
                # `with` closes the files even if prediction raises
                # (the original leaked both handles on error).
                with open(test_label_file_path, "w") as test_label_file, \
                     open(pred_label_file_path, "w") as pred_label_file:
                    for sentence in valid_problem.sentences():
                        test_labels = []
                        pred_labels = []
                        for srl_x, srl_y in valid_problem.get_dataset_for_sentence(sentence):
                            test_labels.append(srl_y)
                            pred_labels.append(
                                pred_func(srl_x.astype("float32")))
                        test_label_file.write(
                            valid_problem.pretty_srl_label(sentence, test_labels))
                        pred_label_file.write(
                            valid_problem.pretty_srl_label(sentence, pred_labels))
                valid_result = eval_srl(test_label_file_path,
                                        pred_label_file_path)
                valid_info = 'minibatch {0}, validation info {1}% '.format(
                    total_minibatch, valid_result)
                print(valid_info)
        # Decay the learning rate once per epoch, clamped at the lower bound.
        hyper_param.learning_rate *= hyper_param.learning_rate_decay_ratio
        if hyper_param.learning_rate <= hyper_param.learning_rate_lowerbound:
            hyper_param.learning_rate = hyper_param.learning_rate_lowerbound

    print(('Optimization complete. Best validation score of %f %% '
           'obtained at iteration %i.') % (best_validation_loss * 100., epoch))