def main(): lang_train_list = [] if len(sys.argv) == 1: lang_train_list = ['swedish', 'danish', 'english'] else: lang_train_list = sys.argv[1:] random.seed(1126) for lang in lang_train_list: whole_data = get_train_data_from_lang(lang) subdata = random.sample(whole_data, 200) tp = TransitionParser(Transition, FeatureExtractor) print '\n===== Start training {} data ====='.format(lang) tp.train(subdata) tp.save(lang + '.model') print '===== Sucessfully generating models ====='
def train_model(lang,training_set='train'): # load and sample data data = get_data(lang,dataset=training_set).parsed_sents() if len(data) >200: random.seed(1234) subdata = random.sample(data, 200) else: subdata = data # train model and save tp = TransitionParser(Transition, FeatureExtractor) tp.train(subdata) tp.save('{0}.model'.format(lang)) # test performance on new data if lang != 'english': testdata = get_data(lang,dataset='test').parsed_sents() # english test data not available # so find a subset of training data # that is disjoint from data used for training else: not_in_training = [sent for sent in data if sent not in subdata] testdata = random.sample(not_in_training,200) parsed = tp.parse(testdata) ev = DependencyEvaluator(testdata, parsed) # store and print results with open('results.txt','a') as results_file: results_file.write('{0} model:\n'.format(lang)) results_file.write("UAS: {} \nLAS: {}\n".format(*ev.eval())) print '{0} model:\n'.format(lang) print "UAS: {} \nLAS: {}\n".format(*ev.eval()) return ev.eval()[1]
import random
from providedcode import dataset
from providedcode.transitionparser import TransitionParser
from providedcode.evaluate import DependencyEvaluator
from featureextractor import FeatureExtractor
from transition import Transition

if __name__ == '__main__':
    # Sample 200 English training sentences reproducibly.
    data = dataset.get_english_train_corpus().parsed_sents()
    random.seed(1234)
    subdata = random.sample(data, 200)
    try:
        # NOTE(review): chunk is truncated — the except/finally matching this
        # try is not visible in this view.
        tp = TransitionParser(Transition, FeatureExtractor)
        tp.train(subdata)
        tp.save('english.model')
        testdata = dataset.get_english_dev_corpus().parsed_sents()
        #tp = TransitionParser.load('badfeatures.model')
        parsed = tp.parse(testdata)
        # Dump the parsed dev sentences in 10-column CoNLL format.
        with open('test.conll', 'w') as f:
            for p in parsed:
                f.write(p.to_conll(10).encode('utf-8'))
                f.write('\n')
        ev = DependencyEvaluator(testdata, parsed)
        print "UAS: {} \nLAS: {}".format(*ev.eval())
        # parsing arbitrary sentences (english):
print "Invalid argument: " + fo exit(1) set_feature_option(feature_options) if language is 'swedish': traindata = dataset.get_swedish_train_corpus().parsed_sents() else: traindata = dataset.get_english_train_corpus().parsed_sents() try: time.clock() tp = TransitionParser(Transition, FeatureExtractor) tp.train(traindata) fname = language + '.' + arg_fo tp.save(fname + '.model') # tp.save('swedish.model') if language is 'swedish': labeleddata = dataset.get_swedish_dev_corpus().parsed_sents() blinddata = dataset.get_swedish_dev_blind_corpus().parsed_sents() else: labeleddata = dataset.get_english_dev_corpus().parsed_sents() blinddata = dataset.get_english_dev_blind_corpus().parsed_sents() # tp = TransitionParser.load('badfeatures.model') parsed = tp.parse(blinddata) with open(fname + '.conll', 'w') as f: for p in parsed:
# NOTE(review): truncated fragment — 'modelfile', 'conllfile' and
# F_TRAIN_SWEDISH are bound earlier, outside this view.
print time.ctime( ), "-------DONE----- BADMODEL", modelfile, conllfile

if F_TRAIN_SWEDISH == True:
    # Train a Swedish model, reload it from disk, parse the blind dev set,
    # score it against the labeled dev set, and dump the parses as CoNLL.
    print time.ctime(), "START TRAIN SWEDISH"
    traindata = dataset.get_swedish_train_corpus().parsed_sents()
    labeleddata = dataset.get_swedish_dev_corpus().parsed_sents()
    blinddata = dataset.get_swedish_dev_blind_corpus().parsed_sents()
    modelfile = 'swedish.model'
    conllfile = 'swedish.conll'
    tp = TransitionParser(Transition, FeatureExtractor)
    tp.train(traindata)
    tp.save(modelfile)
    # load model for testing
    tp = TransitionParser.load(modelfile)
    parsed = tp.parse(blinddata)
    ev = DependencyEvaluator(labeleddata, parsed)
    print "UAS: {} \nLAS: {}".format(*ev.eval())
    # Write the blind-set parses in 10-column CoNLL format.
    with open(conllfile, 'w') as f:
        for p in parsed:
            f.write(p.to_conll(10).encode('utf-8'))
            f.write('\n')
    print time.ctime( ), "-------DONE----- TESTING SWEDISH ", modelfile, conllfile
from transition import Transition

if __name__ == '__main__':
    # Alternate corpora kept commented out for quick language switching.
    #data = dataset.get_swedish_train_corpus().parsed_sents()
    #data = dataset.get_korean_train_corpus().parsed_sents()
    data = dataset.get_danish_train_corpus().parsed_sents()
    # Fixed seed so the 200-sentence training sample is reproducible.
    random.seed(1234)
    subdata = random.sample(data, 200)
    try:
        # NOTE(review): chunk is truncated — the except matching this try is
        # outside this view.
        tp = TransitionParser(Transition, FeatureExtractor)
        tp.train(subdata)
        #tp.save('swedish.model')
        #tp.save('korean.model')
        tp.save('danish.model')
        #testdata = dataset.get_swedish_test_corpus().parsed_sents()
        #testdata = dataset.get_korean_test_corpus().parsed_sents()
        testdata = dataset.get_danish_test_corpus().parsed_sents()
        # Reload the model that was just saved before parsing.
        #tp = TransitionParser.load('swedish.model')
        #tp = TransitionParser.load('korean.model')
        tp = TransitionParser.load('danish.model')
        parsed = tp.parse(testdata)
        # Dump the test parses in 10-column CoNLL format.
        with open('test.conll', 'w') as f:
            for p in parsed:
                f.write(p.to_conll(10).encode('utf-8'))
                f.write('\n')
if __name__ == '__main__': data = dataset.get_swedish_train_corpus().parsed_sents() # data = dataset.get_english_test_corpus().parsed_sents() # data = dataset.get_danish_train_corpus().parsed_sents() random.seed(1234) subdata = random.sample(data, 200) try: tp = TransitionParser(Transition, FeatureExtractor) tp.train(subdata) tp.save('swedish.model') # tp.save('english.model') # tp.save('danish.model') testdata = dataset.get_swedish_test_corpus().parsed_sents() #tp = TransitionParser.load('badfeatures.model') parsed = tp.parse(testdata) with open('test.conll', 'w') as f: for p in parsed: f.write(p.to_conll(10).encode('utf-8')) f.write('\n') ev = DependencyEvaluator(testdata, parsed) print "UAS: {} \nLAS: {}".format(*ev.eval())
# 'data' is parsed sentences converted into Dependency Graph objects. model_dict = { 'english' : ('english.model', dataset.get_english_train_corpus, dataset.get_english_test_corpus), 'danish' : ('danish.model', dataset.get_danish_train_corpus, dataset.get_danish_test_corpus), 'swedish' : ('swedish.model', dataset.get_swedish_train_corpus, dataset.get_swedish_test_corpus) } for model_type, model_tuple in model_dict.iteritems(): model, data, testdata = model_tuple[0], model_tuple[1]().parsed_sents(), model_tuple[2]().parsed_sents() random.seed(1234) subdata = random.sample(data, 200) # 200 randomly selected DependencyGraphs(sentences) for model training. try: tp = TransitionParser(Transition, FeatureExtractor) tp.train(subdata) # train with 200 randomly selected dependency graphs(sentences). tp.save(model) # save the trained model. tp = TransitionParser.load(model) # load the trained model for parsing. parsed = tp.parse(testdata) # parse the test data with open('test.conll', 'w') as f: for p in parsed: f.write(p.to_conll(10).encode('utf-8')) f.write('\n') # evaluate the test parse result here... ev = DependencyEvaluator(testdata, parsed) print 'Model: {}'.format(model_type) # LAS: labeled attachment score - percentage of scoring tokens for which the parsing system has predicted the # correct head and dependency label.
from transition import Transition

if __name__ == '__main__':
    # traindata = dataset.get_swedish_train_corpus().parsed_sents()
    traindata = dataset.get_english_train_corpus().parsed_sents()
    try:
        # NOTE(review): chunk is truncated — the except matching this try is
        # outside this view.  Also note this trains on the FULL corpus (no
        # 200-sentence sample), unlike the sibling scripts — confirm intended.
        tp = TransitionParser(Transition, FeatureExtractor)
        tp.train(traindata)
        # tp.save('swedish.model')
        # labeleddata = dataset.get_swedish_dev_corpus().parsed_sents()
        # blinddata = dataset.get_swedish_dev_blind_corpus().parsed_sents()
        tp.save('english.model')
        labeleddata = dataset.get_english_dev_corpus().parsed_sents()
        blinddata = dataset.get_english_dev_blind_corpus().parsed_sents()
        #tp = TransitionParser.load('badfeatures.model')
        # parsed = tp.parse(labeleddata)
        parsed = tp.parse(blinddata)
        # Dump the blind-set parses in 10-column CoNLL format.
        with open('test.conll', 'w') as f:
            for p in parsed:
                f.write(p.to_conll(10).encode('utf-8'))
                f.write('\n')
        # Score the blind parses against the labeled dev corpus.
        ev = DependencyEvaluator(labeleddata, parsed)
        print "UAS: {} \nLAS: {}".format(*ev.eval())
# NOTE(review): truncated fragment — 'parsed', 'testdata', 'subdata' and 't0'
# are bound earlier, outside this view.
with open('test.conll', 'w') as f:
    for p in parsed:
        f.write(p.to_conll(10).encode('utf-8'))
        f.write('\n')

ev = DependencyEvaluator(testdata, parsed)
print "Bad Features Results"
print "UAS: {} \nLAS: {}".format(*ev.eval())
t1 = time.time()
print "Time: " + str(t1 - t0) + '\n'

# SWEDISH FEATURE MODELS
print 'Starting Swedish'
tp_s = TransitionParser(Transition, FeatureExtractor)
# NOTE(review): 'subdata' is the sample prepared earlier (not visible here) —
# confirm it holds Swedish training sentences.
tp_s.train(subdata)
tp_s.save('swedish.model')
testdata = dataset.get_swedish_test_corpus().parsed_sents()
# Reload the model that was just saved before parsing.
tp_s = TransitionParser.load('swedish.model')
parsed = tp_s.parse(testdata)
with open('swedish.conll', 'w') as f:
    for p in parsed:
        f.write(p.to_conll(10).encode('utf-8'))
        f.write('\n')
ev = DependencyEvaluator(testdata, parsed)
print "Swedish Results"
print "UAS: {} \nLAS: {}".format(*ev.eval())
t2 = time.time()
from featureextractor import FeatureExtractor
from transition import Transition

if __name__ == '__main__':
    # One fixed seed shared by all three language samples below.
    random.seed(1234)

    # Train the english model
    print "--->\t Load the english corpus"
    data = dataset.get_english_train_corpus().parsed_sents()
    #data = dataset.get_english_test_corpus().parsed_sents()
    tp = TransitionParser(Transition, FeatureExtractor)
    subdata = random.sample(data, 200)
    #subdata = data
    print "--->\t Train english corpus model"
    tp.train(subdata)
    tp.save('english.model')

    # Train the danish model
    print "--->\t Load the danish corpus"
    data = dataset.get_danish_train_corpus().parsed_sents()
    #data = dataset.get_danish_test_corpus().parsed_sents()
    tp = TransitionParser(Transition, FeatureExtractor)
    subdata = random.sample(data, 200)
    #subdata = data
    print "--->\t Train danish corpus model"
    tp.train(subdata)
    tp.save('danish.model')

    # Train the swedish model
    # NOTE(review): chunk is truncated — the Swedish section continues
    # outside this view.
    print "--->\t Load the swedish corpus"
    data = dataset.get_swedish_train_corpus().parsed_sents()
# Sample 200 Korean training sentences reproducibly.
koreandata = dataset.get_korean_train_corpus().parsed_sents()
random.seed(1234)
koreansubdata = random.sample(koreandata, 200)

#get danish training data
danishdata = dataset.get_danish_train_corpus().parsed_sents()
random.seed(1234)
# NOTE(review): Danish samples 235 sentences while Korean samples 200 —
# confirm the difference is intentional.
danishsubdata = random.sample(danishdata, 235)

try:
    # NOTE(review): truncated — the except matching this try is outside this
    # view, and 'swedishsubdata' is bound earlier, outside this view.
    #SWEDISH TESTING
    tp = TransitionParser(Transition, FeatureExtractor)
    tp.train(swedishsubdata)
    tp.save('swedish.model')

    #badfeatures.model...don't use for real testing
    #tp = TransitionParser.load('badfeatures.model')

    testdata = dataset.get_swedish_test_corpus().parsed_sents()
    parsed = tp.parse(testdata)

    #to write output...for badfeatures.model
    # NOTE(review): the triple-quoted string below disables the CoNLL writer;
    # its closing quotes are beyond this view.
    '''
    with open('test.conll', 'w') as f:
        for p in parsed:
            f.write(p.to_conll(10).encode('utf-8'))
            f.write('\n')
# NOTE(review): truncated fragment — this is the tail of a weights dict; the
# 'tests' and 'scoreWeight' tables begin outside this view.
    'english': 50.}

totalPoints = 0
for testName in tests.keys():
    data = tests[testName]().parsed_sents()
    # Split the corpus: first half for training, second half for testing.
    # NOTE(review): in Python 2 '/' is integer division here; also the [:-1]
    # slice drops the corpus's final sentence — confirm that is intended.
    data_1h = data[0:(len(data)/2)]
    data_2h = data[(len(data)/2):-1]
    random.seed(99999)
    traindata = random.sample(data_1h, 200)
    testdata = random.sample(data_2h, 800)
    try:
        # NOTE(review): truncated — the except matching this try is outside
        # this view.
        print "Training {0} model...".format(testName)
        tp = TransitionParser(Transition, MyFeatureExtractor)
        tp.train(traindata)
        tp.save(testName + ".model")
        print "Testing {0} model...".format(testName)
        parsed = tp.parse(testdata)
        # with open('test.conll', 'w') as f:
        #     for p in parsed:
        #         f.write(p.to_conll(10).encode('utf-8'))
        #         f.write('\n')
        ev = DependencyEvaluator(testdata, parsed)
        print "Test Results For: {0}".format(testName)
        (uas, las) = ev.eval()
        # Quadratic credit curve, capped at full credit when LAS >= 0.7.
        points = scoreWeight[testName] * (min(0.7, las)/0.7)**2
        totalPoints += points
        print "UAS: {0} \nLAS: {1}".format(uas, las)
# Load the English training set and take 200 random sentences.
english_data = dataset.get_english_train_corpus().parsed_sents()
# NOTE(review): random.seed() with no argument seeds from system entropy, so
# these samples are NOT reproducible — sibling scripts use seed(1234).
random.seed()
english_subdata = random.sample(english_data, 200)

# Load the Danish TRAINING set (the original comment said "test set", but
# this calls get_danish_train_corpus) and take 200 random sentences.
danish_data = dataset.get_danish_train_corpus().parsed_sents()
random.seed()
danish_subdata = random.sample(danish_data, 200)

try:
    # NOTE(review): truncated — the except matching this try is outside this
    # view, and 'swedish_subdata' is bound earlier, outside this view.
    print 'training swedish'
    # swedish
    tp = TransitionParser(Transition, FeatureExtractor)
    tp.train(swedish_subdata)
    tp.save('swedish.model')
    testdata = dataset.get_swedish_test_corpus().parsed_sents()
    # Reload the model that was just saved before parsing.
    tp = TransitionParser.load('swedish.model')
    print 'testing swedish'
    parsed = tp.parse(testdata)
    with open('test.conll', 'w') as f:
        for p in parsed:
            f.write(p.to_conll(10).encode('utf-8'))
            f.write('\n')
    ev = DependencyEvaluator(testdata, parsed)
    print 'Swedish results'
    print "UAS: {} \nLAS: {}".format(*ev.eval())
# NOTE(review): truncated fragment — 'parsed', 'testdata', 'subdata' and 't0'
# are bound earlier, outside this view.  This chunk is a near-duplicate of an
# earlier one; consider deduplicating.
with open('test.conll', 'w') as f:
    for p in parsed:
        f.write(p.to_conll(10).encode('utf-8'))
        f.write('\n')

ev = DependencyEvaluator(testdata, parsed)
print "Bad Features Results"
print "UAS: {} \nLAS: {}".format(*ev.eval())
t1 = time.time()
print "Time: "+str(t1 - t0) + '\n'

# SWEDISH FEATURE MODELS
print 'Starting Swedish'
tp_s = TransitionParser(Transition, FeatureExtractor)
# NOTE(review): 'subdata' is the sample prepared earlier (not visible here) —
# confirm it holds Swedish training sentences.
tp_s.train(subdata)
tp_s.save('swedish.model')
testdata = dataset.get_swedish_test_corpus().parsed_sents()
tp_s = TransitionParser.load('swedish.model')
parsed = tp_s.parse(testdata)
with open('swedish.conll', 'w') as f:
    for p in parsed:
        f.write(p.to_conll(10).encode('utf-8'))
        f.write('\n')
ev = DependencyEvaluator(testdata, parsed)
print "Swedish Results"
print "UAS: {} \nLAS: {}".format(*ev.eval())
t2 = time.time()
from providedcode.transitionparser import TransitionParser
from providedcode.evaluate import DependencyEvaluator
from featureextractor import FeatureExtractor
from transition import Transition

if __name__ == '__main__':
    # NOTE(review): 'dataset' and 'random' are used below but their imports
    # are outside this view.
    data = dataset.get_swedish_train_corpus().parsed_sents()
    random.seed(1234)  # reproducible 200-sentence sample
    subdata = random.sample(data, 200)
    try:
        # NOTE(review): chunk is truncated — the except matching this try is
        # outside this view.
        tp = TransitionParser(Transition, FeatureExtractor)
        tp.train(subdata)
        tp.save('swedish.model')
        testdata = dataset.get_swedish_test_corpus().parsed_sents()
        # Reload the model that was just saved before parsing.
        tp = TransitionParser.load('swedish.model')
        parsed = tp.parse(testdata)
        # Dump the test parses in 10-column CoNLL format.
        with open('test.conll', 'w') as f:
            for p in parsed:
                f.write(p.to_conll(10).encode('utf-8'))
                f.write('\n')
        ev = DependencyEvaluator(testdata, parsed)
        print "UAS: {} \nLAS: {}".format(*ev.eval())
        # parsing arbitrary sentences (swedish):
# print('Ok') # # SE # tp = TransitionParser(Transition, FeatureExtractor) # tp.train(SE_subdata) # tp.save('swedish.model') # SE_testdata = dataset.get_swedish_test_corpus().parsed_sents() # SE_tp = TransitionParser.load('swedish.model') # SE_parsed = SE_tp.parse(SE_testdata) # # DK tp = TransitionParser(Transition, FeatureExtractor) print('Training...') tp.train(DK_subdata) print('Ok. Saving the model...') tp.save('danish.model') print('Ok. Parsing the test corpus...') DK_testdata = dataset.get_danish_test_corpus().parsed_sents() #DK_tp = TransitionParser.load('danish.model') DK_parsed = tp.parse(DK_testdata) print('Ok.') # with open('english.conll', 'w') as f: # for p in EN_parsed: # f.write(p.to_conll(10).encode('utf-8')) # f.write('\n') # # ev = DependencyEvaluator(EN_testdata, EN_parsed) # print('Evaluating EN model...') # print "LAS: {} \nUAS: {}".format(*ev.eval())
from featureextractor import FeatureExtractor
from transition import Transition

if __name__ == '__main__':
    # Alternate corpora kept commented out for quick language switching.
    #data = dataset.get_swedish_train_corpus().parsed_sents()
    data = dataset.get_korean_train_corpus().parsed_sents()
    #data = dataset.get_danish_train_corpus().parsed_sents()
    random.seed(1234)  # reproducible 200-sentence sample
    subdata = random.sample(data, 200)
    try:
        # NOTE(review): chunk is truncated — the except matching this try and
        # the rest of the write loop are outside this view.
        tp = TransitionParser(Transition, FeatureExtractor)
        tp.train(subdata)
        #tp.save('swedish.model')
        tp.save('korean.model')
        #tp.save('danish.model')
        #testdata = dataset.get_swedish_test_corpus().parsed_sents()
        testdata = dataset.get_korean_test_corpus().parsed_sents()
        #testdata = dataset.get_danish_test_corpus().parsed_sents()
        # Reload the model that was just saved before parsing.
        #tp = TransitionParser.load('swedish.model')
        tp = TransitionParser.load('korean.model')
        #tp = TransitionParser.load('danish.model')
        parsed = tp.parse(testdata)
        with open('test.conll', 'w') as f:
            for p in parsed:
                f.write(p.to_conll(10).encode('utf-8'))
# NOTE(review): truncated fragment — 'f', 'modelfile', 'conllfile' and the
# F_TRAIN_* flags are bound earlier, outside this view; the first write below
# is the tail of a loop that starts before this view.
        f.write('\n')
print time.ctime(), "-------DONE----- BADMODEL", modelfile, conllfile

if F_TRAIN_SWEDISH == True:
    # Train a Swedish model, reload it from disk, parse the blind dev set,
    # score it against the labeled dev set, and dump the parses as CoNLL.
    print time.ctime(), "START TRAIN SWEDISH"
    traindata = dataset.get_swedish_train_corpus().parsed_sents()
    labeleddata = dataset.get_swedish_dev_corpus().parsed_sents()
    blinddata = dataset.get_swedish_dev_blind_corpus().parsed_sents()
    modelfile = 'swedish.model'
    conllfile = 'swedish.conll'
    tp = TransitionParser(Transition, FeatureExtractor)
    tp.train(traindata)
    tp.save(modelfile)
    # load model for testing
    tp = TransitionParser.load(modelfile)
    parsed = tp.parse(blinddata)
    ev = DependencyEvaluator(labeleddata, parsed)
    print "UAS: {} \nLAS: {}".format(*ev.eval())
    with open(conllfile, 'w') as f:
        for p in parsed:
            f.write(p.to_conll(10).encode('utf-8'))
            f.write('\n')
    print time.ctime(), "-------DONE----- TESTING SWEDISH ", modelfile, conllfile

if F_TRAIN_ENGLISH == True:
    # NOTE(review): truncated — the English branch continues outside this view.
from transition import Transition if __name__ == "__main__": # data = dataset.get_swedish_train_corpus().parsed_sents() data = dataset.get_english_train_corpus().parsed_sents() # data = dataset.get_korean_train_corpus().parsed_sents() # data = dataset.get_danish_train_corpus().parsed_sents() # random.seed(1234) subdata = random.sample(data, 200) try: tp = TransitionParser(Transition, FeatureExtractor) tp.train(subdata) # tp.save('swedish.model') tp.save("english.model") # tp.save('korean.model') # tp.save('danish.model') # testdata = dataset.get_swedish_test_corpus().parsed_sents() # testdata = dataset.get_english_dev_corpus().parsed_sents() # testdata = dataset.get_korean_test_corpus().parsed_sents() # testdata = dataset.get_danish_test_corpus().parsed_sents() # tp = TransitionParser.load('swedish.model') tp = TransitionParser.load("english.model") # tp = TransitionParser.load('korean.model') # tp = TransitionParser.load('danish.model') parsed = tp.parse(testdata) with open("test.conll", "w") as f: for p in parsed: