def main(): lang_train_list = [] if len(sys.argv) == 1: lang_train_list = ['swedish', 'danish', 'english'] else: lang_train_list = sys.argv[1:] random.seed(1126) for lang in lang_train_list: whole_data = get_train_data_from_lang(lang) subdata = random.sample(whole_data, 200) tp = TransitionParser(Transition, FeatureExtractor) print '\n===== Start training {} data ====='.format(lang) tp.train(subdata) tp.save(lang + '.model') print '===== Sucessfully generating models ====='
def train_model(lang,training_set='train'): # load and sample data data = get_data(lang,dataset=training_set).parsed_sents() if len(data) >200: random.seed(1234) subdata = random.sample(data, 200) else: subdata = data # train model and save tp = TransitionParser(Transition, FeatureExtractor) tp.train(subdata) tp.save('{0}.model'.format(lang)) # test performance on new data if lang != 'english': testdata = get_data(lang,dataset='test').parsed_sents() # english test data not available # so find a subset of training data # that is disjoint from data used for training else: not_in_training = [sent for sent in data if sent not in subdata] testdata = random.sample(not_in_training,200) parsed = tp.parse(testdata) ev = DependencyEvaluator(testdata, parsed) # store and print results with open('results.txt','a') as results_file: results_file.write('{0} model:\n'.format(lang)) results_file.write("UAS: {} \nLAS: {}\n".format(*ev.eval())) print '{0} model:\n'.format(lang) print "UAS: {} \nLAS: {}\n".format(*ev.eval()) return ev.eval()[1]
from providedcode import dataset
from providedcode.transitionparser import TransitionParser
from providedcode.evaluate import DependencyEvaluator
from featureextractor import FeatureExtractor
from transition import Transition

if __name__ == '__main__':
    # The language is selected by (un)commenting the matching corpus
    # loaders; English is currently active.
    #traindata = dataset.get_swedish_train_corpus().parsed_sents()
    traindata = dataset.get_english_train_corpus().parsed_sents()
    #traindata = dataset.get_danish_train_corpus().parsed_sents()
    try:
        # Train a transition-based parser on the full training corpus.
        tp = TransitionParser(Transition, FeatureExtractor)
        tp.train(traindata)
        #tp.save('swedish.model')
        #tp.save('english.model')
        ### tp.save('danish.model')
        #labeleddata = dataset.get_swedish_dev_corpus().parsed_sents()
        labeleddata = dataset.get_english_dev_corpus().parsed_sents()
        #labeleddata = dataset.get_danish_dev_corpus().parsed_sents()
        #blinddata = dataset.get_swedish_dev_blind_corpus().parsed_sents()
        blinddata = dataset.get_english_dev_blind_corpus().parsed_sents()
        #blinddata = dataset.get_danish_dev_blind_corpus().parsed_sents()
        #tp = TransitionParser.load('badfeatures.model')
        # Parse the blind dev set with the freshly trained model.
        parsed = tp.parse(blinddata)
        # NOTE(review): the except/finally for this try (and any use of
        # 'labeleddata') lies outside this view.
from providedcode import dataset
from providedcode.transitionparser import TransitionParser
from providedcode.evaluate import DependencyEvaluator
from featureextractor import FeatureExtractor
from transition import Transition

if __name__ == '__main__':
    # English training run; the Swedish loaders are kept commented out
    # for easy switching.
    # traindata = dataset.get_swedish_train_corpus().parsed_sents()
    traindata = dataset.get_english_train_corpus().parsed_sents()
    try:
        # Train on the full English training corpus and save the model.
        tp = TransitionParser(Transition, FeatureExtractor)
        tp.train(traindata)
        # tp.save('swedish.model')
        # labeleddata = dataset.get_swedish_dev_corpus().parsed_sents()
        # blinddata = dataset.get_swedish_dev_blind_corpus().parsed_sents()
        tp.save('english.model')
        labeleddata = dataset.get_english_dev_corpus().parsed_sents()
        blinddata = dataset.get_english_dev_blind_corpus().parsed_sents()
        #tp = TransitionParser.load('badfeatures.model')
        # parsed = tp.parse(labeleddata)
        parsed = tp.parse(blinddata)
        # Dump each parse in 10-column CoNLL format.
        with open('test.conll', 'w') as f:
            for p in parsed:
                f.write(p.to_conll(10).encode('utf-8'))
        # NOTE(review): the except clause for this try lies outside this view.
# EN_tp = TransitionParser.load('english.model') # EN_parsed = EN_tp.parse(EN_testdata) # print('Ok') # # SE # tp = TransitionParser(Transition, FeatureExtractor) # tp.train(SE_subdata) # tp.save('swedish.model') # SE_testdata = dataset.get_swedish_test_corpus().parsed_sents() # SE_tp = TransitionParser.load('swedish.model') # SE_parsed = SE_tp.parse(SE_testdata) # # DK tp = TransitionParser(Transition, FeatureExtractor) print('Training...') tp.train(DK_subdata) print('Ok. Saving the model...') tp.save('danish.model') print('Ok. Parsing the test corpus...') DK_testdata = dataset.get_danish_test_corpus().parsed_sents() #DK_tp = TransitionParser.load('danish.model') DK_parsed = tp.parse(DK_testdata) print('Ok.') # with open('english.conll', 'w') as f: # for p in EN_parsed: # f.write(p.to_conll(10).encode('utf-8')) # f.write('\n') # # ev = DependencyEvaluator(EN_testdata, EN_parsed)
from providedcode import dataset
from providedcode.transitionparser import TransitionParser
from providedcode.evaluate import DependencyEvaluator
from featureextractor import FeatureExtractor
from transition import Transition

# NOTE(review): 'random' is used below but never imported in this view —
# this will raise NameError unless 'import random' appears elsewhere.

if __name__ == '__main__':
    # Sample 200 Swedish training sentences with a fixed seed so the
    # training subset is reproducible.
    data = dataset.get_swedish_train_corpus().parsed_sents()
    random.seed(1234)
    subdata = random.sample(data, 200)
    try:
        tp = TransitionParser(Transition, FeatureExtractor)
        tp.train(subdata)
        tp.save('swedish.model')
        testdata = dataset.get_swedish_test_corpus().parsed_sents()
        # Round-trip through disk to exercise save/load before parsing.
        tp = TransitionParser.load('swedish.model')
        parsed = tp.parse(testdata)
        # Dump parses in 10-column CoNLL format, blank line between sentences.
        with open('test.conll', 'w') as f:
            for p in parsed:
                f.write(p.to_conll(10).encode('utf-8'))
                f.write('\n')
        # Unlabeled / labeled attachment scores.
        ev = DependencyEvaluator(testdata, parsed)
        print "UAS: {} \nLAS: {}".format(*ev.eval())
        # NOTE(review): the except clause for this try lies outside this view.
import random

from providedcode import dataset
from providedcode.transitionparser import TransitionParser
from providedcode.evaluate import DependencyEvaluator
from featureextractor import FeatureExtractor
from transition import Transition

if __name__ == '__main__':
    # Sample 200 English training sentences with a fixed seed so the
    # training subset is reproducible.
    data = dataset.get_english_train_corpus().parsed_sents()
    random.seed(1234)
    subdata = random.sample(data, 200)
    try:
        tp = TransitionParser(Transition, FeatureExtractor)
        tp.train(subdata)
        tp.save('english.model')
        # Evaluate against the dev corpus (no blind set used here).
        testdata = dataset.get_english_dev_corpus().parsed_sents()
        #tp = TransitionParser.load('badfeatures.model')
        parsed = tp.parse(testdata)
        # Dump parses in 10-column CoNLL format, blank line between sentences.
        with open('test.conll', 'w') as f:
            for p in parsed:
                f.write(p.to_conll(10).encode('utf-8'))
                f.write('\n')
        # Unlabeled / labeled attachment scores.
        ev = DependencyEvaluator(testdata, parsed)
        print "UAS: {} \nLAS: {}".format(*ev.eval())
        # parsing arbitrary sentences (english):
        # NOTE(review): continuation (and the except clause) is outside this view.
# Continuation of a longer script: 'parsed', 'testdata', 'subdata', and
# 't0' are all defined earlier, outside this view.
# Dump the bad-features parses in 10-column CoNLL format.
with open('test.conll', 'w') as f:
    for p in parsed:
        f.write(p.to_conll(10).encode('utf-8'))
        f.write('\n')
ev = DependencyEvaluator(testdata, parsed)
print "Bad Features Results"
print "UAS: {} \nLAS: {}".format(*ev.eval())
# Elapsed wall-clock time since t0 (set before this fragment).
t1 = time.time()
print "Time: " + str(t1 - t0) + '\n'

# SWEDISH FEATURE MODELS
print 'Starting Swedish'
tp_s = TransitionParser(Transition, FeatureExtractor)
# NOTE(review): 'subdata' presumably holds the Swedish training sample —
# confirm upstream.
tp_s.train(subdata)
tp_s.save('swedish.model')
testdata = dataset.get_swedish_test_corpus().parsed_sents()
# Round-trip through disk to exercise save/load before parsing.
tp_s = TransitionParser.load('swedish.model')
parsed = tp_s.parse(testdata)
with open('swedish.conll', 'w') as f:
    for p in parsed:
        f.write(p.to_conll(10).encode('utf-8'))
        f.write('\n')
ev = DependencyEvaluator(testdata, parsed)
print "Swedish Results"
print "UAS: {} \nLAS: {}".format(*ev.eval())
# EN_tp = TransitionParser.load('english.model') # EN_parsed = EN_tp.parse(EN_testdata) # print('Ok') # # SE # tp = TransitionParser(Transition, FeatureExtractor) # tp.train(SE_subdata) # tp.save('swedish.model') # SE_testdata = dataset.get_swedish_test_corpus().parsed_sents() # SE_tp = TransitionParser.load('swedish.model') # SE_parsed = SE_tp.parse(SE_testdata) # # DK tp = TransitionParser(Transition, FeatureExtractor) print('Training...') tp.train(DK_subdata) print('Ok. Saving the model...') tp.save('danish.model') print('Ok. Parsing the test corpus...') DK_testdata = dataset.get_danish_test_corpus().parsed_sents() #DK_tp = TransitionParser.load('danish.model') DK_parsed = tp.parse(DK_testdata) print('Ok.') # with open('english.conll', 'w') as f: # for p in EN_parsed: # f.write(p.to_conll(10).encode('utf-8')) # f.write('\n') # # ev = DependencyEvaluator(EN_testdata, EN_parsed) # print('Evaluating EN model...')
#get korean training data koreandata = dataset.get_korean_train_corpus().parsed_sents() random.seed(1234) koreansubdata = random.sample(koreandata, 200) #get danish training data danishdata = dataset.get_danish_train_corpus().parsed_sents() random.seed(1234) danishsubdata = random.sample(danishdata, 235) try: #SWEDISH TESTING tp = TransitionParser(Transition, FeatureExtractor) tp.train(swedishsubdata) tp.save('swedish.model') #badfeatures.model...don't use for real testing #tp = TransitionParser.load('badfeatures.model') testdata = dataset.get_swedish_test_corpus().parsed_sents() parsed = tp.parse(testdata) #to write output...for badfeatures.model ''' with open('test.conll', 'w') as f: for p in parsed: f.write(p.to_conll(10).encode('utf-8'))
# load test set in english and get 200 random sentences english_data = dataset.get_english_train_corpus().parsed_sents() random.seed() english_subdata = random.sample(english_data, 200) # load test set in danish and get 200 random sentences danish_data = dataset.get_danish_train_corpus().parsed_sents() random.seed() danish_subdata = random.sample(danish_data, 200) try: print 'training swedish' # swedish tp = TransitionParser(Transition, FeatureExtractor) tp.train(swedish_subdata) tp.save('swedish.model') testdata = dataset.get_swedish_test_corpus().parsed_sents() tp = TransitionParser.load('swedish.model') print 'testing swedish' parsed = tp.parse(testdata) with open('test.conll', 'w') as f: for p in parsed: f.write(p.to_conll(10).encode('utf-8')) f.write('\n') ev = DependencyEvaluator(testdata, parsed) print 'Swedish results'
# Continuation of a longer script: 'data_dan' and 'subdata' are defined
# earlier, outside this view.
subdata_dan = random.sample(data_dan, 200)

try:
    # BAD MODEL ###########################################################
    # Sanity-check a deliberately weak pre-built model on the Swedish test set.
    tp = TransitionParser.load('badfeatures.model')
    testdata = dataset.get_swedish_test_corpus().parsed_sents()
    parsed = tp.parse(testdata)
    ev = DependencyEvaluator(testdata, parsed)
    print "Bad Features Model"
    print "UAS: {} \nLAS: {}".format(*ev.eval())

    # SWEDISH #############################################################
    # Train the real Swedish model and evaluate it on the same test set.
    tp = TransitionParser(Transition, FeatureExtractor)
    tp.train(subdata)
    tp.save('swedish.model')
    testdata = dataset.get_swedish_test_corpus().parsed_sents()
    # tp = TransitionParser.load('badfeatures.model')
    parsed = tp.parse(testdata)
    # Dump parses in 10-column CoNLL format, blank line between sentences.
    with open('swedish_test.conll', 'w') as f:
        for p in parsed:
            f.write(p.to_conll(10).encode('utf-8'))
            f.write('\n')
    ev = DependencyEvaluator(testdata, parsed)
    print "Swedish"
    print "UAS: {} \nLAS: {}".format(*ev.eval())
    # NOTE(review): the rest of this try (and its except) lies outside this view.
# Continuation of a longer script: 'parsed', 'testdata', 'subdata', and
# 't0' are all defined earlier, outside this view.
# Dump the bad-features parses in 10-column CoNLL format.
with open('test.conll', 'w') as f:
    for p in parsed:
        f.write(p.to_conll(10).encode('utf-8'))
        f.write('\n')
ev = DependencyEvaluator(testdata, parsed)
print "Bad Features Results"
print "UAS: {} \nLAS: {}".format(*ev.eval())
# Elapsed wall-clock time since t0 (set before this fragment).
t1 = time.time()
print "Time: "+str(t1 - t0) + '\n'

# SWEDISH FEATURE MODELS
print 'Starting Swedish'
tp_s = TransitionParser(Transition, FeatureExtractor)
# NOTE(review): 'subdata' presumably holds the Swedish training sample —
# confirm upstream.
tp_s.train(subdata)
tp_s.save('swedish.model')
testdata = dataset.get_swedish_test_corpus().parsed_sents()
# Round-trip through disk to exercise save/load before parsing.
tp_s = TransitionParser.load('swedish.model')
parsed = tp_s.parse(testdata)
with open('swedish.conll', 'w') as f:
    for p in parsed:
        f.write(p.to_conll(10).encode('utf-8'))
        f.write('\n')
ev = DependencyEvaluator(testdata, parsed)
print "Swedish Results"
print "UAS: {} \nLAS: {}".format(*ev.eval())
if __name__ == '__main__':
    # 'data' is parsed sentences converted into Dependency Graph objects.
    # Map each language to (model filename, train-corpus loader, test-corpus loader).
    model_dict = {
        'english' : ('english.model', dataset.get_english_train_corpus, dataset.get_english_test_corpus),
        'danish' : ('danish.model', dataset.get_danish_train_corpus, dataset.get_danish_test_corpus),
        'swedish' : ('swedish.model', dataset.get_swedish_train_corpus, dataset.get_swedish_test_corpus)
    }
    # Python 2 dict iteration; train and evaluate one model per language.
    for model_type, model_tuple in model_dict.iteritems():
        model, data, testdata = model_tuple[0], model_tuple[1]().parsed_sents(), model_tuple[2]().parsed_sents()
        # Fixed seed so each language's sample is reproducible.
        random.seed(1234)
        subdata = random.sample(data, 200)  # 200 randomly selected DependencyGraphs(sentences) for model training.
        try:
            tp = TransitionParser(Transition, FeatureExtractor)
            tp.train(subdata)  # train with 200 randomly selected dependency graphs(sentences).
            tp.save(model)  # save the trained model.
            tp = TransitionParser.load(model)  # load the trained model for parsing.
            parsed = tp.parse(testdata)  # parse the test data
            # NOTE(review): 'test.conll' is overwritten on every loop
            # iteration, so only the last language's parses survive.
            with open('test.conll', 'w') as f:
                for p in parsed:
                    f.write(p.to_conll(10).encode('utf-8'))
                    f.write('\n')
            # evaluate the test parse result here...
            ev = DependencyEvaluator(testdata, parsed)
            print 'Model: {}'.format(model_type)
            # LAS: labeled attachment score - percentage of scoring tokens for which the parsing system has predicted the
            # NOTE(review): the remainder of the loop body (and the except
            # clause) lies outside this view.