def main():
    """Train and save one transition-parser model per requested language.

    The languages are read from the command-line arguments; when none are
    given, swedish, danish and english are used.  For every language the
    training data is fetched with ``get_train_data_from_lang``, a random
    subset of at most 200 items is drawn (the RNG is seeded with 1126 once,
    before the loop), a new ``TransitionParser`` is trained on that subset,
    and the result is saved as ``<lang>.model``.
    """
    default_langs = ['swedish', 'danish', 'english']
    sample_size = 200

    # sys.argv[0] is the script itself; everything after it names a language.
    lang_train_list = sys.argv[1:] or default_langs

    random.seed(1126)
    for lang in lang_train_list:
        whole_data = get_train_data_from_lang(lang)
        # random.sample raises ValueError when asked for more items than the
        # population contains, so never request more than the corpus holds.
        subdata = random.sample(whole_data, min(sample_size, len(whole_data)))

        tp = TransitionParser(Transition, FeatureExtractor)
        # Single-argument print(...) prints the same text as the Python 2
        # print statement the rest of the file uses.
        print('\n===== Start training {} data ====='.format(lang))
        tp.train(subdata)
        tp.save(lang + '.model')

    print('===== Sucessfully generating models =====')
from providedcode import dataset from providedcode.transitionparser import TransitionParser from providedcode.evaluate import DependencyEvaluator from featureextractor import FeatureExtractor from transition import Transition if __name__ == '__main__': # traindata = dataset.get_swedish_train_corpus().parsed_sents() traindata = dataset.get_english_train_corpus().parsed_sents() try: tp = TransitionParser(Transition, FeatureExtractor) tp.train(traindata) # tp.save('swedish.model') # labeleddata = dataset.get_swedish_dev_corpus().parsed_sents() # blinddata = dataset.get_swedish_dev_blind_corpus().parsed_sents() tp.save('english.model') labeleddata = dataset.get_english_dev_corpus().parsed_sents() blinddata = dataset.get_english_dev_blind_corpus().parsed_sents() #tp = TransitionParser.load('badfeatures.model') # parsed = tp.parse(labeleddata) parsed = tp.parse(blinddata) with open('test.conll', 'w') as f: for p in parsed: f.write(p.to_conll(10).encode('utf-8'))
# Parse the test sentences with the current parser and dump the result to
# disk, one to_conll(10) rendering per sentence followed by a newline.
parsed = tp.parse(testdata)
with open('test.conll', 'w') as f:
    for p in parsed:
        f.write(p.to_conll(10).encode('utf-8') + '\n')

# Score the parses against the test sentences and report the elapsed time
# since t0 (set earlier in the script).
ev = DependencyEvaluator(testdata, parsed)
print("Bad Features Results")
print("UAS: {} \nLAS: {}".format(*ev.eval()))
t1 = time.time()
print("Time: " + str(t1 - t0) + '\n')

# SWEDISH FEATURE MODELS
print('Starting Swedish')

# Train a fresh parser on the sampled training data and persist it.
tp_s = TransitionParser(Transition, FeatureExtractor)
tp_s.train(subdata)
tp_s.save('swedish.model')

# Evaluate using the model as read back from disk, not the in-memory one.
testdata = dataset.get_swedish_test_corpus().parsed_sents()
tp_s = TransitionParser.load('swedish.model')
parsed = tp_s.parse(testdata)

with open('swedish.conll', 'w') as f:
    for p in parsed:
        f.write(p.to_conll(10).encode('utf-8') + '\n')

ev = DependencyEvaluator(testdata, parsed)
print("Swedish Results")