def train(self, train_file, iteration):
    """Train the multi-class perceptron POS tagger on *train_file*.

    Extracts (feature, gold-tag) instances once, then runs *iteration*
    epochs of the standard mistake-driven perceptron update. After each
    epoch the model is dumped to ``dumps\\model_<epoch>.dump`` so the
    tagger can later evaluate every intermediate model.

    :param train_file: path to the training corpus (.col format)
    :param iteration: number of training epochs to run
    """
    print("________________________________________________________________________________________________train starts")
    model = multi_class_perceptron()
    helper = DataHelper()
    # Pre-extract all training instances once so each epoch iterates over
    # in-memory pairs instead of re-reading and re-featurizing the corpus.
    instances = []
    sentence_count = 0
    for sentence in helper.read_sentence(train_file):
        sentence_count += 1
        for token in sentence:
            feature = token.feature_extracter(model.feature_constructor)
            instances.append((feature, model.pos_constructor(token.gold_pos)))
    weights_statistics = model.weights_constructor()
    self.table_statistics.append([
        str(weights_statistics[1]),
        str(weights_statistics[0]),
        str(len(instances)),
        str(sentence_count),
    ])
    print(AsciiTable(self.table_statistics).table)
    for iter_round in range(iteration):
        start = time.time()
        for feature, pos_tag in instances:
            score = model.weight_scores(feature)
            predicted_tag = model.predict(score)
            # Perceptron rule: update weights only when the prediction is wrong.
            if predicted_tag != pos_tag:
                model.update(feature, pos_tag, predicted_tag)
        elapsed = time.time() - start
        print('Iteration\t%d\tdone.  runs at: %s seconds' % (iter_round + 1, elapsed))
        # NOTE(review): literal backslash path is deliberate -- the tagger
        # methods below look the dumps up under the exact same name.
        model_file = 'dumps\\model_' + str(iter_round + 1) + '.dump'
        model.save(model_file)
    print("________________________________________________________________________________________________train ends")
def tagger(self, filename, iteration):
    """Tag *filename* with each saved perceptron model and evaluate it.

    For every epoch's dump (``dumps\\model_<n>.dump``) the corpus is
    re-tagged, written to ``dumps\\dev-predicted.col`` and scored against
    the gold standard ``dataset\\test.col`` (micro/macro F-measure plus a
    per-tag breakdown).

    :param filename: path to the corpus to tag (.col format)
    :param iteration: number of model dumps to load and evaluate
    """
    print("________________________________________________________________________________________________Perceptron tagger starts")
    for iter_round in range(iteration):
        model_file = 'dumps\\model_' + str(iter_round + 1) + '.dump'
        print('Reading from file\t' + model_file.split('\\')[1])
        model = multi_class_perceptron(model_file)
        helper = DataHelper()
        # "with" guarantees the prediction file is flushed and closed
        # before the evaluation below re-reads it.
        with open('dumps\\dev-predicted.col', 'w') as output:
            for sentence in helper.read_sentence(filename):
                for token in sentence:
                    feature = token.feature_extracter(model.return_features)
                    score = model.weight_scores(feature)
                    predicted_tag = model.predict(score)
                    output.write('%s\t%s\n' %
                                 (token.word, model.return_pos_reverse(predicted_tag)))
                output.write('\n')  # blank line terminates each sentence
        gold_helper = DataHelper("dataset\\test.col")
        gold_word_tags = gold_helper.Tokenize(gold_helper)
        pred_helper = DataHelper("dumps\\dev-predicted.col")
        pred_word_tags = pred_helper.Tokenize(pred_helper)
        tag_helper = DataHelper("dataset\\test.col")
        tag_set = tag_helper.tagSet(tag_helper)
        # Overall scores; "evaluator" renamed so we don't shadow builtin eval().
        evaluator = Evaluation()
        per_tag = False
        f_measure = evaluator.Evaluate(per_tag, gold_word_tags, pred_word_tags, tag_set)
        print('F-Measure Micro:\t' + f_measure[0])
        print('F-Measure Macro:\t' + f_measure[1])
        print('')
        # Per-tag precision/recall/F breakdown table.
        final_eval = Evaluation()
        f_per_tag = True
        per_tag_table = final_eval.Evaluate(f_per_tag, gold_word_tags, pred_word_tags, tag_set)
        print(per_tag_table)
    print("________________________________________________________________________________________________Perceptron tagger ends")
def viterbi_tagger(self, test_file):
    """Tag *test_file* with smoothed Viterbi decoding and evaluate.

    Loads the pre-computed emission/transition matrices from their gzip
    dumps, decodes every sentence via ``self.viterbi_smoothing``, writes
    the predictions to ``dumps\\dev-predicted-viterbi.col`` and prints
    micro/macro F-measures plus a per-tag table against
    ``dataset\\test.col``.

    :param test_file: path to the corpus to tag (.col format)
    """
    print("________________________________________________________________________________________________viterbi_tagger starts")
    helper = DataHelper("dataset\\train.col")
    # gzip.open is a context manager, so the dump streams are closed even
    # if unpickling raises.
    with gzip.open("dumps\\emission_matrix.dump", 'rb') as stream:
        emission_matrix = cPickle.load(stream)
    with gzip.open("dumps\\transition_matrix.dump", 'rb') as stream:
        transition_matrix = cPickle.load(stream)
    # Debug dump of the strongest transitions (probability > 0.2).
    for x in transition_matrix:
        for p in transition_matrix[x]:
            if transition_matrix[x][p] > 0.2:
                print('%s %s %s' % (p, x, transition_matrix[x][p]))
    sentence_count = 0
    word_count = 0
    with open('dumps\\dev-predicted-viterbi.col', 'w') as output:
        for sentence in helper.read_sentence(test_file):
            observation = sentence.word_list()
            sentence_count += 1
            word_count += len(observation)
            # Candidate state set: the distinct tags observed in this
            # sentence (deduplicated before decoding).
            states = list(set(sentence.tag_list()))
            prediction = self.viterbi_smoothing(observation, states,
                                                emission_matrix,
                                                transition_matrix)
            # prediction[0] holds the tag sequence, prediction[1] the words.
            for i in range(len(prediction[0])):
                output.write('%s\t%s\n' % (prediction[1][i], prediction[0][i]))
            output.write('\n')  # blank line terminates each sentence
    tag_helper = DataHelper("dataset\\test.col")
    tag_set = tag_helper.tagSet(tag_helper)
    self.table_statistics.append([
        str(len(emission_matrix)),
        str(len(tag_set)),
        str(word_count),
        str(sentence_count),
    ])
    print(AsciiTable(self.table_statistics).table)
    print("________________________________________________________________________________________________viterbi_tagger ends")
    gold_helper = DataHelper("dataset\\test.col")
    gold_word_tags = gold_helper.Tokenize(gold_helper)
    pred_helper = DataHelper("dumps\\dev-predicted-viterbi.col")
    pred_word_tags = pred_helper.Tokenize(pred_helper)
    # Overall scores; "evaluator" renamed so we don't shadow builtin eval().
    evaluator = Evaluation()
    per_tag = False
    f_measure = evaluator.Evaluate(per_tag, gold_word_tags, pred_word_tags, tag_set)
    print('F-Measure Micro:\t' + f_measure[0])
    print('F-Measure Macro:\t' + f_measure[1])
    print('')
    # Per-tag precision/recall/F breakdown table.
    final_eval = Evaluation()
    f_per_tag = True
    per_tag_table = final_eval.Evaluate(f_per_tag, gold_word_tags, pred_word_tags, tag_set)
    print(per_tag_table)