def test():
    """Evaluate each tagger on the test data and print a report per model.

    The HMM, MEMM and CRF models are built from the inline configs below and
    run through ``prep_data().shuffle(...).split(0).train()``. The RNN is
    instead built with ``prep_model()`` and loaded from ``rnn_model.bin``.
    For every model, ``TestModel`` yields a confusion matrix and a per-stage
    distance table, both of which are printed.
    """
    data = LoadTestData()

    hmm_cfg = {'ngram': 3, 'est': 'add-delta', 'delta': 0.3}
    memm_cfg = {
        'ftrs': ('IS_FIRST', 'IS_LAST', 'VAL', 'PRV_VAL', 'NXT_VAL',
                 'FRST_VAL', 'LST_VAL', 'SCND_VAL', 'SCND_LST_VAL')
    }
    crf_cfg = {
        'ftrs': ('IS_FIRST', 'IS_LAST', 'IDX', 'VAL', 'PRV_VAL', 'NXT_VAL',
                 'FRST_VAL', 'LST_VAL', 'SCND_VAL', 'SCND_LST_VAL')
    }
    # (model class, config, label template) for every model trained here.
    specs = [
        (HMM, hmm_cfg, 'HMM. config: {}'),
        (MEMM, memm_cfg, 'MEMM. config: {}'),
        (CRF_WORD, crf_cfg, 'CRF. config: {}'),
    ]

    # Phase 1: construct every model before any of them is trained.
    untrained = [(cls(cfg), template.format(cfg))
                 for cls, cfg, template in specs]

    # Phase 2: train them in the same order, with a fixed shuffle seed.
    trained = []
    for candidate, label in untrained:
        prepared = candidate.prep_data()
        fitted = prepared.shuffle(0xfab1e).split(0).train()
        trained.append((fitted, label))

    # The RNN is not trained here; it is loaded from a saved file.
    rnn_cfg = {
        'n_layers': 3,
        'hidden_dim': 32,
        'embedding': 'mds',
        'win_len': 4,
        'device': 'cpu',
    }
    rnn = RNN(rnn_cfg)
    restored = rnn.prep_model().load('rnn_model.bin')
    trained.append((restored, 'RNN. config: {}'.format(rnn_cfg)))

    thin_rule = '-' * 50
    for fitted, label in trained:
        conf_mat, dist = TestModel(fitted, data)
        print('\n')
        print(label)
        print('=' * 80)
        print('Vowel metrics:')
        print(thin_rule)
        PrintConfMat(conf_mat)
        print(thin_rule)
        print('Edit distance:')
        print(thin_rule)
        for stage in (1, 2, 3):
            print('Stage = {}:'.format(stage_names[stage]))
            # Entries 0..3 are printed as average, median, min and max.
            stats = dist[stage]
            print(' Average = {}\n Median = {}\n Min = {}\n Max = {}'.format(
                stats[0], stats[1], stats[2], stats[3]))
all_tag = [] # for t in tag1: # if t not in punctuations: # all_tag.append(t) all_tag = list(set(tag)) param = [0 for i in range(24)] # print 'Profiling started' # prof = cProfile.Profile() # prof.enable() memm = MEMM( data, tag, all_tag, param, wsj.tagged_sents()[:no_of_sentences], [f1, f2, f3, f4, f5, f6, f7, f8, f9, f10, f11, f12, f13, f14], 0, ) memm.train() print "For", memm.num_calls_cost, "calls to cost total time taken is", memm.tot_time_cost print "Per call avg time taken is", memm.tot_time_cost / memm.num_calls_cost print "For", memm.num_calls_gradient, "calls to gradient total time taken is", memm.tot_time_gradient print "Per call avg time taken is", memm.tot_time_gradient / memm.num_calls_gradient # dt1 = datetime.datetime.now() # print 'before training: ', dt1 # memm.cost(param) # dt2 = datetime.datetime.now()
# Excerpt of a Python 2 evaluation script: converts the feature matrices with
# sparsify(), fits a MEMM on the training tokens and predicts the test
# sequences. Helpers such as sparsify, get_observations, MEMM and fe are
# defined outside this excerpt.
print "testing set feats:", test_feat.shape
print "moving to a sparse representation..."
# Both names are rebound to the sparsified versions, so everything below works
# on the sparse form.
train_feat = sparsify(train_feat)
test_feat = sparsify(test_feat)
# Collect the labels of every training sequence, then flatten them into one
# token-level list.
train_labl = [s.labels for s in train_sequences]
train_labl = [item for sublist in train_labl for item in sublist]
# Gold labels of the test set in three shapes: per sequence (obsr_labl), one
# space-joined string per sequence (test_seq) and a flat list (test_tok).
obsr_labl = [s.labels for s in test_sequences]
test_seq = [" ".join(x) for x in obsr_labl]
test_tok = [item for sublist in obsr_labl for item in sublist]
obsr_list = get_observations(test_feat, test_sequences)
# NOTE(review): the meaning of the two constructor arguments is not visible
# here -- confirm against the MEMM class.
memm = MEMM(10, 0.0001)
memm.fit(train_feat, train_labl, fe.num_feats)
# predict_sequences returns a pair, unpacked here as token-level and
# sequence-level predictions (going by the names).
pred_tok, pred_seq = memm.predict_sequences(obsr_list)
# Structured Perceptron using Viterbi in the inference step.
# The alternative below is disabled by wrapping it in a bare string literal,
# which has no runtime effect.
"""percep = StructuredPerceptron(10, fe, 0.1) percep.fit(train_sequences) pred_tok, pred_seq = percep.predict_sequences(test_sequences) obsr_labl = [s.labels for s in test_sequences] test_seq = [" ".join(x) for x in obsr_labl] test_tok = [item for sublist in obsr_labl for item in sublist]"""
def f9(x, y):
    """Return 0 or 1 at random, ignoring both arguments.

    ``random.random()`` is rounded to the nearest integer, so the result is
    0 or 1 (a float under Python 2, an int under Python 3). ``x`` and ``y``
    are accepted but unused, presumably to match the signature of the other
    feature functions.
    """
    return round(random.random())


def f10(x, y):
    """Return 0 or 1 at random, ignoring both arguments.

    Same behaviour as ``f9``: a rounded ``random.random()`` draw that does not
    depend on ``x`` or ``y``.
    """
    return round(random.random())


if __name__ == '__main__':
    data, tag = create_dataset()
    # Unique tags; they pass through a set, so their order is arbitrary.
    tag1 = list(set(tag))
    punctuations = ['.', ',', ':', ';', '\"', '\'', '``', '\'\'']
    # Keep every tag that is not one of the punctuation tags.
    all_tag = [t for t in tag1 if t not in punctuations]
    # Ten zeros, the same count as the feature functions passed below
    # (presumably one weight per feature; confirm against MEMM).
    param = [0] * 10
    # Single-argument parenthesised print: same output under Python 2, and
    # valid syntax under Python 3. The profiler itself is disabled below.
    print('Profiling started')
    # prof = cProfile.Profile()
    # prof.enable()
    memm = MEMM(data, tag, all_tag, param,
                [f1, f2, f3, f4, f5, f6, f7, f8, f9, f10], 0)
    memm.train()
    # prof.disable()
    # s = StringIO.StringIO()
    # sortby = 'cumulative'
    # ps = pstats.Stats(prof, stream=s).sort_stats(sortby)
    # ps.print_stats()
    # print s.getvalue()
# Excerpt of a Python 2 evaluation script: converts the feature matrices with
# sparsify(), fits a MEMM on the training tokens, predicts the test sequences
# and prints token-level metrics. Helpers such as sparsify, get_observations,
# MEMM, fe and metrics are defined or imported outside this excerpt.
print "testing set feats:", test_feat.shape
print "moving to a sparse representation..."
# Both names are rebound to the sparsified versions, so everything below works
# on the sparse form.
train_feat = sparsify(train_feat)
test_feat = sparsify(test_feat)
# Collect the labels of every training sequence, then flatten them into one
# token-level list.
train_labl = [s.labels for s in train_sequences]
train_labl = [item for sublist in train_labl for item in sublist]
# Gold labels of the test set in three shapes: per sequence (obsr_labl), one
# space-joined string per sequence (test_seq) and a flat list (test_tok).
obsr_labl = [s.labels for s in test_sequences]
test_seq = [" ".join(x) for x in obsr_labl]
test_tok = [item for sublist in obsr_labl for item in sublist]
obsr_list = get_observations(test_feat, test_sequences)
# NOTE(review): the meaning of the two constructor arguments is not visible
# here -- confirm against the MEMM class.
memm = MEMM(10, 0.0001)
memm.fit(train_feat, train_labl, fe.num_feats)
# predict_sequences returns a pair, unpacked here as token-level and
# sequence-level predictions (going by the names).
pred_tok, pred_seq = memm.predict_sequences(obsr_list)
# Structured Perceptron using Viterbi in the inference step.
# The alternative below is disabled by wrapping it in a bare string literal,
# which has no runtime effect.
'''percep = StructuredPerceptron(10, fe, 0.1) percep.fit(train_sequences) pred_tok, pred_seq = percep.predict_sequences(test_sequences) obsr_labl = [s.labels for s in test_sequences] test_seq = [" ".join(x) for x in obsr_labl] test_tok = [item for sublist in obsr_labl for item in sublist]'''
# print results
# Token-level comparison of gold labels (test_tok) against predictions
# (pred_tok). `metrics` is presumably sklearn.metrics -- confirm at the import.
print metrics.classification_report(test_tok, pred_tok)
print metrics.accuracy_score(test_tok, pred_tok)