def train_model(lang, training_set='train'):
    """Train a transition parser for ``lang``, save it, and evaluate it.

    At most 200 parsed sentences from the ``training_set`` split are used
    for training (a seeded random sample when more are available).  The
    trained model is saved as ``'<lang>.model'``.  The evaluation scores are
    appended to ``results.txt`` and printed.

    For every language except ``'english'`` the ``'test'`` split is used for
    evaluation.  For English, evaluation sentences are drawn from the part
    of the training split that was not used for training.

    Args:
        lang: Language name handed to ``get_data``; also used to build the
            model file name.
        training_set: Name of the dataset split handed to ``get_data``.

    Returns:
        The second value returned by ``DependencyEvaluator.eval()`` (the one
        this function reports as LAS).

    Raises:
        ValueError: For English, when no sentences are left over for
            evaluation after the training sample has been taken.
    """
    sample_size = 200

    # load and sample data
    data = get_data(lang, dataset=training_set).parsed_sents()
    if len(data) > sample_size:
        random.seed(1234)  # fixed seed keeps the training sample reproducible
        subdata = random.sample(data, sample_size)
    else:
        subdata = data

    # train model and save
    tp = TransitionParser(Transition, FeatureExtractor)
    tp.train(subdata)
    tp.save('{0}.model'.format(lang))

    # test performance on new data
    if lang != 'english':
        testdata = get_data(lang, dataset='test').parsed_sents()
    else:
        # english test data not available, so evaluate on a subset of the
        # training split that is disjoint from the sentences trained on
        not_in_training = [sent for sent in data if sent not in subdata]
        if not not_in_training:
            raise ValueError(
                'no held-out sentences left to evaluate the {0} model'.format(lang))
        # Never request more sentences than are available; random.sample
        # raises ValueError when the sample exceeds the population.
        testdata = random.sample(
            not_in_training, min(sample_size, len(not_in_training)))

    parsed = tp.parse(testdata)
    ev = DependencyEvaluator(testdata, parsed)
    scores = ev.eval()  # evaluate once and reuse for file, console and return

    # store and print results
    header = '{0} model:\n'.format(lang)
    summary = "UAS: {} \nLAS: {}\n".format(*scores)
    with open('results.txt', 'a') as results_file:
        results_file.write(header)
        results_file.write(summary)
    print(header)
    print(summary)
    return scores[1]
# Train a fresh transition parser on the prepared training sentences.
# The model is not saved in this configuration; to keep it, call
# tp.save(...) with 'swedish.model', 'english.model' or 'danish.model'.
tp = TransitionParser(Transition, FeatureExtractor)
tp.train(traindata)

# English development data: the labelled corpus is the gold standard and the
# blind corpus is what gets parsed.  For Swedish or Danish, swap in
# dataset.get_swedish_dev_corpus() / dataset.get_swedish_dev_blind_corpus()
# or dataset.get_danish_dev_corpus() / dataset.get_danish_dev_blind_corpus().
labeleddata = dataset.get_english_dev_corpus().parsed_sents()
blinddata = dataset.get_english_dev_blind_corpus().parsed_sents()

# A previously saved model can be used instead:
#   tp = TransitionParser.load('badfeatures.model')
parsed = tp.parse(blinddata)

# Dump every parsed sentence in 10-column CoNLL form, one '\n' after each.
with open('test.conll', 'w') as conll_out:
    for graph in parsed:
        conll_text = graph.to_conll(10).encode('utf-8')
        conll_out.write(conll_text)
        conll_out.write('\n')

# Score the parses against the labelled sentences.
ev = DependencyEvaluator(labeleddata, parsed)
scores = ev.eval()
print("UAS: {} \nLAS: {}".format(*scores))

# Parsing an arbitrary English sentence with a saved model:
#   sentence = DependencyGraph.from_sentence('Hi, this is a test')
#   tp = TransitionParser.load('english.model')
#   parsed = tp.parse([sentence])
#   print parsed[0].to_conll(10).encode('utf-8')
# Train a fresh transition parser and store it as the English model.
tp = TransitionParser(Transition, FeatureExtractor)
tp.train(traindata)
tp.save('english.model')

# Swedish variant (disabled):
#   tp.save('swedish.model')
#   labeleddata = dataset.get_swedish_dev_corpus().parsed_sents()
#   blinddata = dataset.get_swedish_dev_blind_corpus().parsed_sents()

# English development data: labelled sentences are the gold standard,
# blind sentences are the parser input.
labeleddata = dataset.get_english_dev_corpus().parsed_sents()
blinddata = dataset.get_english_dev_blind_corpus().parsed_sents()

# Alternatives (disabled):
#   tp = TransitionParser.load('badfeatures.model')
#   parsed = tp.parse(labeleddata)
parsed = tp.parse(blinddata)

# Dump every parsed sentence in 10-column CoNLL form, one '\n' after each.
with open('test.conll', 'w') as conll_out:
    for graph in parsed:
        conll_text = graph.to_conll(10).encode('utf-8')
        conll_out.write(conll_text)
        conll_out.write('\n')

# Score the parses against the labelled sentences.
ev = DependencyEvaluator(labeleddata, parsed)
scores = ev.eval()
print("UAS: {} \nLAS: {}".format(*scores))

# Parsing an arbitrary English sentence with a saved model:
#   sentence = DependencyGraph.from_sentence('Hi, this is a test')
#   tp = TransitionParser.load('english.model')
#   parsed = tp.parse([sentence])
#   print parsed[0].to_conll(10).encode('utf-8')
# Swedish run (disabled):
# tp.train(SE_subdata)
# tp.save('swedish.model')
# SE_testdata = dataset.get_swedish_test_corpus().parsed_sents()
# SE_tp = TransitionParser.load('swedish.model')
# SE_parsed = SE_tp.parse(SE_testdata)

# DK: train a Danish model, save it, and parse the Danish test corpus.
tp = TransitionParser(Transition, FeatureExtractor)
print('Training...')
tp.train(DK_subdata)
print('Ok. Saving the model...')
tp.save('danish.model')
print('Ok. Parsing the test corpus...')
DK_testdata = dataset.get_danish_test_corpus().parsed_sents()
# The in-memory parser is used directly; reloading is disabled:
# DK_tp = TransitionParser.load('danish.model')
DK_parsed = tp.parse(DK_testdata)
print('Ok.')

# English output and evaluation (disabled):
# with open('english.conll', 'w') as f:
#     for p in EN_parsed:
#         f.write(p.to_conll(10).encode('utf-8'))
#         f.write('\n')
#
# ev = DependencyEvaluator(EN_testdata, EN_parsed)
# print('Evaluating EN model...')
# print "LAS: {} \nUAS: {}".format(*ev.eval())

# Write the Danish parses in 10-column CoNLL form.
with open('danish.conll', 'w') as f:
    for p in DK_parsed:
        f.write(p.to_conll(10).encode('utf-8'))
        # Separate consecutive sentences with a newline, as the other
        # output loops in this file do; without it the sentences run together.
        f.write('\n')
if __name__ == '__main__': data = dataset.get_swedish_train_corpus().parsed_sents() random.seed(1234) subdata = random.sample(data, 200) try: tp = TransitionParser(Transition, FeatureExtractor) tp.train(subdata) tp.save('swedish.model') testdata = dataset.get_swedish_test_corpus().parsed_sents() tp = TransitionParser.load('swedish.model') parsed = tp.parse(testdata) with open('test.conll', 'w') as f: for p in parsed: f.write(p.to_conll(10).encode('utf-8')) f.write('\n') ev = DependencyEvaluator(testdata, parsed) print "UAS: {} \nLAS: {}".format(*ev.eval()) # parsing arbitrary sentences (swedish): # sentence = DependencyGraph.from_sentence('Hi, this is a test') # tp = TransitionParser.load('swedish.model') # parsed = tp.parse([sentence]) # print parsed[0].to_conll(10).encode('utf-8')
from transition import Transition if __name__ == '__main__': data = dataset.get_english_train_corpus().parsed_sents() random.seed(1234) subdata = random.sample(data, 200) try: tp = TransitionParser(Transition, FeatureExtractor) tp.train(subdata) tp.save('english.model') testdata = dataset.get_english_dev_corpus().parsed_sents() #tp = TransitionParser.load('badfeatures.model') parsed = tp.parse(testdata) with open('test.conll', 'w') as f: for p in parsed: f.write(p.to_conll(10).encode('utf-8')) f.write('\n') ev = DependencyEvaluator(testdata, parsed) print "UAS: {} \nLAS: {}".format(*ev.eval()) # parsing arbitrary sentences (english): # sentence = DependencyGraph.from_sentence('Hi, this is a test') # tp = TransitionParser.load('english.model') # parsed = tp.parse([sentence]) # print parsed[0].to_conll(10).encode('utf-8')
subdata = random.sample(data, 200) try: tp = TransitionParser(Transition, FeatureExtractor) tp.train(subdata) # tp.save('swedish.model') # tp.save('english.model') tp.save('danish.model') # testdata = dataset.get_swedish_test_corpus().parsed_sents() testdata = dataset.get_danish_test_corpus().parsed_sents() # tp = TransitionParser.load('badfeatures.model') # testdata = dataset.get_english_test_corpus().parsed_sents() # tp = TransitionParser.load('english.model') parsed = tp.parse(testdata) with open('test.conll', 'w') as f: for p in parsed: f.write(p.to_conll(10).encode('utf-8')) f.write('\n') ev = DependencyEvaluator(testdata, parsed) print "LAS: {} \nUAS: {}".format(*ev.eval()) # parsing arbitrary sentences (english): sentence = DependencyGraph.from_sentence('Hi, this is a test') tp = TransitionParser.load('english.model') parsed = tp.parse([sentence]) print parsed[0].to_conll(10).encode('utf-8')
# Report the scores and elapsed time of the preceding "bad features" run.
ev = DependencyEvaluator(testdata, parsed)
print("Bad Features Results")
bad_scores = ev.eval()
print("UAS: {} \nLAS: {}".format(*bad_scores))
t1 = time.time()
bad_elapsed = t1 - t0
print("Time: " + str(bad_elapsed) + '\n')

# SWEDISH FEATURE MODELS
# Train on the sampled sentences, save the model, then reload it from disk
# before parsing the Swedish test corpus.
print('Starting Swedish')
tp_s = TransitionParser(Transition, FeatureExtractor)
tp_s.train(subdata)
tp_s.save('swedish.model')

testdata = dataset.get_swedish_test_corpus().parsed_sents()
tp_s = TransitionParser.load('swedish.model')
parsed = tp_s.parse(testdata)

# Dump every parsed sentence in 10-column CoNLL form, one '\n' after each.
with open('swedish.conll', 'w') as conll_out:
    for graph in parsed:
        conll_text = graph.to_conll(10).encode('utf-8')
        conll_out.write(conll_text)
        conll_out.write('\n')

# Score the Swedish parses and report the time this stage took.
ev = DependencyEvaluator(testdata, parsed)
print("Swedish Results")
swedish_scores = ev.eval()
print("UAS: {} \nLAS: {}".format(*swedish_scores))
t2 = time.time()
swedish_elapsed = t2 - t1
print("Time: " + str(swedish_elapsed) + "\n")

# TODO: Danish and English still need to be run; the steps are practically
# the same as for Swedish.

# ENGLISH FEATURE MODELS
# Swedish run (disabled):
#   tp.train(SE_subdata)
#   tp.save('swedish.model')
#   SE_testdata = dataset.get_swedish_test_corpus().parsed_sents()
#   SE_tp = TransitionParser.load('swedish.model')
#   SE_parsed = SE_tp.parse(SE_testdata)

# DK: train a Danish model, save it, and parse the Danish test corpus,
# printing a progress message around each step.
tp = TransitionParser(Transition, FeatureExtractor)

print('Training...')
tp.train(DK_subdata)

print('Ok. Saving the model...')
tp.save('danish.model')

print('Ok. Parsing the test corpus...')
DK_testdata = dataset.get_danish_test_corpus().parsed_sents()
# The in-memory parser is used directly; reloading is disabled:
#   DK_tp = TransitionParser.load('danish.model')
DK_parsed = tp.parse(DK_testdata)
print('Ok.')

# English output and evaluation (disabled):
#   with open('english.conll', 'w') as f:
#       for p in EN_parsed:
#           f.write(p.to_conll(10).encode('utf-8'))
#           f.write('\n')
#
#   ev = DependencyEvaluator(EN_testdata, EN_parsed)
#   print('Evaluating EN model...')
#   print "LAS: {} \nUAS: {}".format(*ev.eval())

# Write the Danish parses in 10-column CoNLL form, one '\n' after each.
with open('danish.conll', 'w') as conll_out:
    for graph in DK_parsed:
        conll_text = graph.to_conll(10).encode('utf-8')
        conll_out.write(conll_text)
        conll_out.write('\n')
danishsubdata = random.sample(danishdata, 235) try: #SWEDISH TESTING tp = TransitionParser(Transition, FeatureExtractor) tp.train(swedishsubdata) tp.save('swedish.model') #badfeatures.model...don't use for real testing #tp = TransitionParser.load('badfeatures.model') testdata = dataset.get_swedish_test_corpus().parsed_sents() parsed = tp.parse(testdata) #to write output...for badfeatures.model ''' with open('test.conll', 'w') as f: for p in parsed: f.write(p.to_conll(10).encode('utf-8')) f.write('\n') ''' ev = DependencyEvaluator(testdata, parsed) print "SWEDISH UAS: {} \nLAS: {}".format(*ev.eval()) #DANISH TESTING tp.train(danishsubdata)
try: model_path = sys.argv[1] # print 'ModelPath', model_path except IndexError as ie: print 'Model Path Not Specified! Exiting...', ie sys.exit(-1) try: tp = TransitionParser(Transition, FeatureExtractor) tp = TransitionParser.load(model_path) # load the trained model for parsing. for line in sys.stdin: # print 'Processing:', line sentence = DependencyGraph.from_sentence(line) parsed = tp.parse([sentence]) # parse the input line print parsed[0].to_conll(10).encode('utf-8') # with open('test.conll', 'w') as f: # for p in parsed: # f.write(p.to_conll(10).encode('utf-8')) # f.write('\n') # parsing arbitrary sentences (english): # sentence = DependencyGraph.from_sentence('Hi, this is a test') # tp = TransitionParser.load('english.model') # parsed = tp.parse([sentence]) # print parsed[0].to_conll(10).encode('utf-8') except Exception as e:
random.seed() danish_subdata = random.sample(danish_data, 200) try: print 'training swedish' # swedish tp = TransitionParser(Transition, FeatureExtractor) tp.train(swedish_subdata) tp.save('swedish.model') testdata = dataset.get_swedish_test_corpus().parsed_sents() tp = TransitionParser.load('swedish.model') print 'testing swedish' parsed = tp.parse(testdata) with open('test.conll', 'w') as f: for p in parsed: f.write(p.to_conll(10).encode('utf-8')) f.write('\n') ev = DependencyEvaluator(testdata, parsed) print 'Swedish results' print "UAS: {} \nLAS: {}".format(*ev.eval()) # english print '\n----------------------\n' print 'Training english' tpe = TransitionParser(Transition, FeatureExtractor)
# Report the scores and elapsed time of the preceding "bad features" run.
ev = DependencyEvaluator(testdata, parsed)
print("Bad Features Results")
bad_scores = ev.eval()
print("UAS: {} \nLAS: {}".format(*bad_scores))
t1 = time.time()
bad_elapsed = t1 - t0
print("Time: " + str(bad_elapsed) + '\n')

# SWEDISH FEATURE MODELS
# Train on the sampled sentences, save the model, then reload it from disk
# before parsing the Swedish test corpus.
print('Starting Swedish')
tp_s = TransitionParser(Transition, FeatureExtractor)
tp_s.train(subdata)
tp_s.save('swedish.model')

testdata = dataset.get_swedish_test_corpus().parsed_sents()
tp_s = TransitionParser.load('swedish.model')
parsed = tp_s.parse(testdata)

# Dump every parsed sentence in 10-column CoNLL form, one '\n' after each.
with open('swedish.conll', 'w') as conll_out:
    for graph in parsed:
        conll_text = graph.to_conll(10).encode('utf-8')
        conll_out.write(conll_text)
        conll_out.write('\n')

# Score the Swedish parses and report the time this stage took.
ev = DependencyEvaluator(testdata, parsed)
print("Swedish Results")
swedish_scores = ev.eval()
print("UAS: {} \nLAS: {}".format(*swedish_scores))
t2 = time.time()
swedish_elapsed = t2 - t1
print("Time: " + str(swedish_elapsed) + "\n")

# TODO: Danish and English still need to be run; the steps are practically
# the same as for Swedish.

# ENGLISH FEATURE MODELS
'swedish' : ('swedish.model', dataset.get_swedish_train_corpus, dataset.get_swedish_test_corpus) } for model_type, model_tuple in model_dict.iteritems(): model, data, testdata = model_tuple[0], model_tuple[1]().parsed_sents(), model_tuple[2]().parsed_sents() random.seed(1234) subdata = random.sample(data, 200) # 200 randomly selected DependencyGraphs(sentences) for model training. try: tp = TransitionParser(Transition, FeatureExtractor) tp.train(subdata) # train with 200 randomly selected dependency graphs(sentences). tp.save(model) # save the trained model. tp = TransitionParser.load(model) # load the trained model for parsing. parsed = tp.parse(testdata) # parse the test data with open('test.conll', 'w') as f: for p in parsed: f.write(p.to_conll(10).encode('utf-8')) f.write('\n') # evaluate the test parse result here... ev = DependencyEvaluator(testdata, parsed) print 'Model: {}'.format(model_type) # LAS: labeled attachment score - percentage of scoring tokens for which the parsing system has predicted the # correct head and dependency label. # UAS: print "LAS: {} \nUAS: {}".format(*ev.eval()) # parsing arbitrary sentences (english):