def main():
    lang_train_list = []
    if len(sys.argv) == 1:
        lang_train_list = ['swedish', 'danish', 'english']
    else:
        lang_train_list = sys.argv[1:]

    random.seed(1126)

    for lang in lang_train_list:
        whole_data = get_train_data_from_lang(lang)
        subdata = random.sample(whole_data, 200)
        tp = TransitionParser(Transition, FeatureExtractor)
        print '\n===== Start training {} data ====='.format(lang)
        tp.train(subdata)
        tp.save(lang + '.model')

    print '===== Successfully generated models ====='
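
# Hedged sketch of the helper used above (not part of the original snippet).
# It assumes the providedcode.dataset getters that the later examples import.
def get_train_data_from_lang(lang):
    corpus_getters = {
        'swedish': dataset.get_swedish_train_corpus,
        'danish': dataset.get_danish_train_corpus,
        'english': dataset.get_english_train_corpus,
    }
    # return the parsed sentences (DependencyGraphs) for the requested language
    return corpus_getters[lang]().parsed_sents()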
Example #2
def train_model(lang,training_set='train'):
	# load and sample data
	data = get_data(lang,dataset=training_set).parsed_sents()
	if len(data) >200:
		random.seed(1234)
		subdata = random.sample(data, 200)
	else:
		subdata = data

	# train model and save
	tp = TransitionParser(Transition, FeatureExtractor)
	tp.train(subdata)
	tp.save('{0}.model'.format(lang))


	# test performance on new data
	if lang != 'english':
		testdata = get_data(lang,dataset='test').parsed_sents()
	
	# english test data not available
	# so find a subset of training data 
	# that is disjoint from data used for training 
	else:
		not_in_training = [sent for sent in data if sent not in subdata]
		testdata = random.sample(not_in_training,200)

	parsed = tp.parse(testdata)

	ev = DependencyEvaluator(testdata, parsed)

	# store and print results
	with open('results.txt','a') as results_file:
		results_file.write('{0} model:\n'.format(lang))
		results_file.write("UAS: {} \nLAS: {}\n".format(*ev.eval()))
	print '{0} model:\n'.format(lang)
	print "UAS: {} \nLAS: {}\n".format(*ev.eval())
	return ev.eval()[1]
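
# Hedged usage sketch (not in the original snippet): train one model per language
# with the helper above; assumes the same module provides get_data and the
# providedcode imports used elsewhere in these examples.
if __name__ == '__main__':
    for lang in ['english', 'danish', 'swedish']:
        train_model(lang)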
Example #3
import random
from providedcode import dataset
from providedcode.transitionparser import TransitionParser
from providedcode.evaluate import DependencyEvaluator
from featureextractor import FeatureExtractor
from transition import Transition

if __name__ == '__main__':
    data = dataset.get_english_train_corpus().parsed_sents()
    random.seed(1234)
    subdata = random.sample(data, 200)

    try:
        tp = TransitionParser(Transition, FeatureExtractor)
        tp.train(subdata)
        tp.save('english.model')

        testdata = dataset.get_english_dev_corpus().parsed_sents()
        #tp = TransitionParser.load('badfeatures.model')

        parsed = tp.parse(testdata)

        with open('test.conll', 'w') as f:
            for p in parsed:
                f.write(p.to_conll(10).encode('utf-8'))
                f.write('\n')

        ev = DependencyEvaluator(testdata, parsed)
        print "UAS: {} \nLAS: {}".format(*ev.eval())

        # parsing arbitrary sentences (english):
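        # hedged sketch: DependencyGraph.from_sentence is assumed to be exposed by
        # providedcode.dependencygraph (the import is not in this snippet's header)
        from providedcode.dependencygraph import DependencyGraph
        sentence = DependencyGraph.from_sentence('Hi, this is a test')
        parsed_sentence = tp.parse([sentence])
        print parsed_sentence[0].to_conll(10).encode('utf-8')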
Example #4
            print "Invalid argument: " + fo
            exit(1)
    set_feature_option(feature_options)

    if language == 'swedish':
        traindata = dataset.get_swedish_train_corpus().parsed_sents()
    else:
        traindata = dataset.get_english_train_corpus().parsed_sents()

    try:
        time.clock()
        tp = TransitionParser(Transition, FeatureExtractor)
        tp.train(traindata)

        fname = language + '.' + arg_fo
        tp.save(fname + '.model')
        # tp.save('swedish.model')

        if language == 'swedish':
            labeleddata = dataset.get_swedish_dev_corpus().parsed_sents()
            blinddata = dataset.get_swedish_dev_blind_corpus().parsed_sents()
        else:
            labeleddata = dataset.get_english_dev_corpus().parsed_sents()
            blinddata = dataset.get_english_dev_blind_corpus().parsed_sents()

        # tp = TransitionParser.load('badfeatures.model')

        parsed = tp.parse(blinddata)

        with open(fname + '.conll', 'w') as f:
            for p in parsed:
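                # likely continuation, mirroring the write loop in the other examples (hedged):
                f.write(p.to_conll(10).encode('utf-8'))
                f.write('\n')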
Example #5
            print time.ctime(), "-------DONE----- BADMODEL", modelfile, conllfile

        if F_TRAIN_SWEDISH:
            print time.ctime(), "START TRAIN SWEDISH"
            traindata = dataset.get_swedish_train_corpus().parsed_sents()
            labeleddata = dataset.get_swedish_dev_corpus().parsed_sents()
            blinddata = dataset.get_swedish_dev_blind_corpus().parsed_sents()

            modelfile = 'swedish.model'
            conllfile = 'swedish.conll'

            tp = TransitionParser(Transition, FeatureExtractor)
            tp.train(traindata)
            tp.save(modelfile)

            # load model for testing
            tp = TransitionParser.load(modelfile)
            parsed = tp.parse(blinddata)

            ev = DependencyEvaluator(labeleddata, parsed)
            print "UAS: {} \nLAS: {}".format(*ev.eval())

            with open(conllfile, 'w') as f:
                for p in parsed:
                    f.write(p.to_conll(10).encode('utf-8'))
                    f.write('\n')
            print time.ctime(), "-------DONE----- TESTING SWEDISH ", modelfile, conllfile
Example #6
from transition import Transition

if __name__ == '__main__':
    #data = dataset.get_swedish_train_corpus().parsed_sents()
    #data = dataset.get_korean_train_corpus().parsed_sents()
    data = dataset.get_danish_train_corpus().parsed_sents()

    random.seed(1234)
    subdata = random.sample(data, 200)

    try:
        tp = TransitionParser(Transition, FeatureExtractor)
        tp.train(subdata)
        #tp.save('swedish.model')
        #tp.save('korean.model')
        tp.save('danish.model')

        #testdata = dataset.get_swedish_test_corpus().parsed_sents()
        #testdata = dataset.get_korean_test_corpus().parsed_sents()
        testdata = dataset.get_danish_test_corpus().parsed_sents()

        #tp = TransitionParser.load('swedish.model')
        #tp = TransitionParser.load('korean.model')
        tp = TransitionParser.load('danish.model')

        parsed = tp.parse(testdata)

        with open('test.conll', 'w') as f:
            for p in parsed:
                f.write(p.to_conll(10).encode('utf-8'))
                f.write('\n')
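        # likely continuation, mirroring the evaluation step in the other examples (hedged):
        ev = DependencyEvaluator(testdata, parsed)
        print "UAS: {} \nLAS: {}".format(*ev.eval())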
Example #7
if __name__ == '__main__':
    data = dataset.get_swedish_train_corpus().parsed_sents()

    # data = dataset.get_english_test_corpus().parsed_sents()
    # data = dataset.get_danish_train_corpus().parsed_sents()

    random.seed(1234)
    subdata = random.sample(data, 200)

    try:
        tp = TransitionParser(Transition, FeatureExtractor)
        tp.train(subdata)
        tp.save('swedish.model')
        # tp.save('english.model')
        # tp.save('danish.model')

        testdata = dataset.get_swedish_test_corpus().parsed_sents()
        #tp = TransitionParser.load('badfeatures.model')

        parsed = tp.parse(testdata)

        with open('test.conll', 'w') as f:
            for p in parsed:
                f.write(p.to_conll(10).encode('utf-8'))
                f.write('\n')

        ev = DependencyEvaluator(testdata, parsed)
        print "UAS: {} \nLAS: {}".format(*ev.eval())
Example #8
    # 'data' is parsed sentences converted into Dependency Graph objects.
    model_dict = {
            'english' : ('english.model', dataset.get_english_train_corpus, dataset.get_english_test_corpus),
            'danish' : ('danish.model', dataset.get_danish_train_corpus, dataset.get_danish_test_corpus),
            'swedish' : ('swedish.model', dataset.get_swedish_train_corpus, dataset.get_swedish_test_corpus)
    }
    for model_type, model_tuple in model_dict.iteritems():
        model, data, testdata = model_tuple[0], model_tuple[1]().parsed_sents(), model_tuple[2]().parsed_sents()

        random.seed(1234)
        subdata = random.sample(data, 200)  # 200 randomly selected DependencyGraphs(sentences) for model training.

        try:
            tp = TransitionParser(Transition, FeatureExtractor)
            tp.train(subdata)   # train with 200 randomly selected dependency graphs(sentences).
            tp.save(model)  # save the trained model.

            tp = TransitionParser.load(model)   # load the trained model for parsing.

            parsed = tp.parse(testdata) # parse the test data

            with open('test.conll', 'w') as f:
                for p in parsed:
                    f.write(p.to_conll(10).encode('utf-8'))
                    f.write('\n')

            # evaluate the test parse result here...
            ev = DependencyEvaluator(testdata, parsed)
            print 'Model: {}'.format(model_type)
            # LAS: labeled attachment score - percentage of scoring tokens for which the parsing system has predicted the
            #    correct head and dependency label.
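            # UAS: unlabeled attachment score - percentage of scoring tokens for which the parsing system
            #    has predicted the correct head (dependency label ignored).
            # likely continuation, mirroring the other examples (hedged):
            print "UAS: {} \nLAS: {}".format(*ev.eval())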
Example #9
from transition import Transition

if __name__ == '__main__':
    # traindata = dataset.get_swedish_train_corpus().parsed_sents()
    traindata = dataset.get_english_train_corpus().parsed_sents()

    try:

        tp = TransitionParser(Transition, FeatureExtractor)
        tp.train(traindata)

        # tp.save('swedish.model')
        # labeleddata = dataset.get_swedish_dev_corpus().parsed_sents()
        # blinddata = dataset.get_swedish_dev_blind_corpus().parsed_sents()

        tp.save('english.model')
        labeleddata = dataset.get_english_dev_corpus().parsed_sents()
        blinddata = dataset.get_english_dev_blind_corpus().parsed_sents()

        #tp = TransitionParser.load('badfeatures.model')

        # parsed = tp.parse(labeleddata)
        parsed = tp.parse(blinddata)

        with open('test.conll', 'w') as f:
            for p in parsed:
                f.write(p.to_conll(10).encode('utf-8'))
                f.write('\n')

        ev = DependencyEvaluator(labeleddata, parsed)
        print "UAS: {} \nLAS: {}".format(*ev.eval())
Example #10
        with open('test.conll', 'w') as f:
            for p in parsed:
                f.write(p.to_conll(10).encode('utf-8'))
                f.write('\n')

        ev = DependencyEvaluator(testdata, parsed)
        print "Bad Features Results"
        print "UAS: {} \nLAS: {}".format(*ev.eval())
        t1 = time.time()
        print "Time: " + str(t1 - t0) + '\n'

        # SWEDISH FEATURE MODELS
        print 'Starting Swedish'
        tp_s = TransitionParser(Transition, FeatureExtractor)
        tp_s.train(subdata)
        tp_s.save('swedish.model')

        testdata = dataset.get_swedish_test_corpus().parsed_sents()
        tp_s = TransitionParser.load('swedish.model')

        parsed = tp_s.parse(testdata)

        with open('swedish.conll', 'w') as f:
            for p in parsed:
                f.write(p.to_conll(10).encode('utf-8'))
                f.write('\n')

        ev = DependencyEvaluator(testdata, parsed)
        print "Swedish Results"
        print "UAS: {} \nLAS: {}".format(*ev.eval())
        t2 = time.time()
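        # likely continuation, mirroring the timing print earlier in this fragment (hedged):
        print "Time: " + str(t2 - t1) + '\n'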
Example #11
from featureextractor import FeatureExtractor
from transition import Transition

if __name__ == '__main__':
    random.seed(1234)

    # Train the english model
    print "--->\t Load the english corpus"
    data = dataset.get_english_train_corpus().parsed_sents()
    #data = dataset.get_english_test_corpus().parsed_sents()
    tp = TransitionParser(Transition, FeatureExtractor)
    subdata = random.sample(data, 200)
    #subdata = data
    print "--->\t Train english corpus model"
    tp.train(subdata)
    tp.save('english.model')

    # Train the danish model
    print "--->\t Load the danish corpus"
    data = dataset.get_danish_train_corpus().parsed_sents()
    #data = dataset.get_danish_test_corpus().parsed_sents()
    tp = TransitionParser(Transition, FeatureExtractor)
    subdata = random.sample(data, 200)
    #subdata = data
    print "--->\t Train danish corpus model"
    tp.train(subdata)
    tp.save('danish.model')

    # Train the swedish model
    print "--->\t Load the swedish corpus"
    data = dataset.get_swedish_train_corpus().parsed_sents()
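    # likely continuation, mirroring the english and danish blocks above (hedged sketch):
    tp = TransitionParser(Transition, FeatureExtractor)
    subdata = random.sample(data, 200)
    print "--->\t Train swedish corpus model"
    tp.train(subdata)
    tp.save('swedish.model')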
Example #12
    koreandata = dataset.get_korean_train_corpus().parsed_sents()
    random.seed(1234)
    koreansubdata = random.sample(koreandata, 200)
    

    #get danish training data
    danishdata = dataset.get_danish_train_corpus().parsed_sents()
    random.seed(1234)
    danishsubdata = random.sample(danishdata, 235)

    try:
        
        #SWEDISH TESTING
        tp = TransitionParser(Transition, FeatureExtractor)
        tp.train(swedishsubdata)
        tp.save('swedish.model')
        
        
        #badfeatures.model...don't use for real testing
        #tp = TransitionParser.load('badfeatures.model')

 
        testdata = dataset.get_swedish_test_corpus().parsed_sents()
        parsed = tp.parse(testdata)
        
        #to write output...for badfeatures.model
        '''
        with open('test.conll', 'w') as f:
            for p in parsed:
                f.write(p.to_conll(10).encode('utf-8'))
                f.write('\n')
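        '''
        # likely continuation (hedged): close the quoted-out block and evaluate,
        # as in the other examples
        ev = DependencyEvaluator(testdata, parsed)
        print "UAS: {} \nLAS: {}".format(*ev.eval())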
Example #13
                   'english': 50.}
    totalPoints = 0
    for testName in tests.keys():
        data = tests[testName]().parsed_sents()
        data_1h = data[0:(len(data)/2)]
        data_2h = data[(len(data)/2):-1]

        random.seed(99999)
        traindata = random.sample(data_1h, 200)
        testdata = random.sample(data_2h, 800)

        try:
            print "Training {0} model...".format(testName)
            tp = TransitionParser(Transition, MyFeatureExtractor)
            tp.train(traindata)
            tp.save(testName + ".model")

            print "Testing {0} model...".format(testName)
            parsed = tp.parse(testdata)

#            with open('test.conll', 'w') as f:
#                for p in parsed:
#                    f.write(p.to_conll(10).encode('utf-8'))
#                    f.write('\n')

            ev = DependencyEvaluator(testdata, parsed)
            print "Test Results For: {0}".format(testName)
            (uas, las) = ev.eval()
            points = scoreWeight[testName] * (min(0.7, las)/0.7)**2
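            # e.g. an LAS of 0.35 earns (0.35/0.7)**2 = 0.25 of the section weight; LAS >= 0.7 earns full credit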
            totalPoints += points
            print "UAS: {0} \nLAS: {1}".format(uas, las)
Example #14
    english_data = dataset.get_english_train_corpus().parsed_sents()
    random.seed()
    english_subdata = random.sample(english_data, 200)

    # load test set in danish and get 200 random sentences
    danish_data = dataset.get_danish_train_corpus().parsed_sents()
    random.seed()
    danish_subdata = random.sample(danish_data, 200)

    try:
        print 'training swedish'

        # swedish
        tp = TransitionParser(Transition, FeatureExtractor)
        tp.train(swedish_subdata)
        tp.save('swedish.model')

        testdata = dataset.get_swedish_test_corpus().parsed_sents()
        tp = TransitionParser.load('swedish.model')

        print 'testing swedish'
        parsed = tp.parse(testdata)

        with open('test.conll', 'w') as f:
            for p in parsed:
                f.write(p.to_conll(10).encode('utf-8'))
                f.write('\n')

        ev = DependencyEvaluator(testdata, parsed)
        print 'Swedish results'
        print "UAS: {} \nLAS: {}".format(*ev.eval())
Example #15
        with open('test.conll', 'w') as f:
            for p in parsed:
                f.write(p.to_conll(10).encode('utf-8'))
                f.write('\n')

        ev = DependencyEvaluator(testdata, parsed)
        print "Bad Features Results"
        print "UAS: {} \nLAS: {}".format(*ev.eval())
        t1 = time.time()
        print "Time: "+str(t1 - t0) + '\n'

        # SWEDISH FEATURE MODELS
        print 'Starting Swedish'
        tp_s = TransitionParser(Transition, FeatureExtractor)
        tp_s.train(subdata)
        tp_s.save('swedish.model')

        testdata = dataset.get_swedish_test_corpus().parsed_sents()
        tp_s = TransitionParser.load('swedish.model')

        parsed = tp_s.parse(testdata)

        with open('swedish.conll', 'w') as f:
            for p in parsed:
                f.write(p.to_conll(10).encode('utf-8'))
                f.write('\n')

        ev = DependencyEvaluator(testdata, parsed)
        print "Swedish Results"
        print "UAS: {} \nLAS: {}".format(*ev.eval())
        t2 = time.time()
Example #16
from providedcode.transitionparser import TransitionParser
from providedcode.evaluate import DependencyEvaluator
from featureextractor import FeatureExtractor
from transition import Transition

if __name__ == '__main__':
    data = dataset.get_swedish_train_corpus().parsed_sents()
    random.seed(1234)
    subdata = random.sample(data, 200)

    try:

        tp = TransitionParser(Transition, FeatureExtractor)

        tp.train(subdata)
        tp.save('swedish.model')

        testdata = dataset.get_swedish_test_corpus().parsed_sents()
        tp = TransitionParser.load('swedish.model')

        parsed = tp.parse(testdata)

        with open('test.conll', 'w') as f:
            for p in parsed:
                f.write(p.to_conll(10).encode('utf-8'))
                f.write('\n')

        ev = DependencyEvaluator(testdata, parsed)
        print "UAS: {} \nLAS: {}".format(*ev.eval())

        # parsing arbitrary sentences (swedish):
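        # hedged sketch, analogous to the english example earlier
        # (DependencyGraph.from_sentence is assumed to exist in providedcode.dependencygraph):
        from providedcode.dependencygraph import DependencyGraph
        sentence = DependencyGraph.from_sentence('Hej, det har ar ett test')
        print tp.parse([sentence])[0].to_conll(10).encode('utf-8')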
Example #17
        # print('Ok')

        # # SE
        # tp = TransitionParser(Transition, FeatureExtractor)
        # tp.train(SE_subdata)
        # tp.save('swedish.model')
        # SE_testdata = dataset.get_swedish_test_corpus().parsed_sents()
        # SE_tp = TransitionParser.load('swedish.model')
        # SE_parsed = SE_tp.parse(SE_testdata)
        #
        # DK
        tp = TransitionParser(Transition, FeatureExtractor)
        print('Training...')
        tp.train(DK_subdata)
        print('Ok. Saving the model...')
        tp.save('danish.model')
        print('Ok. Parsing the test corpus...')
        DK_testdata = dataset.get_danish_test_corpus().parsed_sents()
        #DK_tp = TransitionParser.load('danish.model')
        DK_parsed = tp.parse(DK_testdata)
        print('Ok.')


        # with open('english.conll', 'w') as f:
        #     for p in EN_parsed:
        #         f.write(p.to_conll(10).encode('utf-8'))
        #         f.write('\n')
        #
        # ev = DependencyEvaluator(EN_testdata, EN_parsed)
        # print('Evaluating EN model...')
        # print "LAS: {} \nUAS: {}".format(*ev.eval())
Example #18
from featureextractor import FeatureExtractor
from transition import Transition

if __name__ == '__main__':
    #data = dataset.get_swedish_train_corpus().parsed_sents()
    data = dataset.get_korean_train_corpus().parsed_sents()
    #data = dataset.get_danish_train_corpus().parsed_sents()

    random.seed(1234)
    subdata = random.sample(data, 200)

    try:
        tp = TransitionParser(Transition, FeatureExtractor)
        tp.train(subdata)
        #tp.save('swedish.model')
        tp.save('korean.model')
        #tp.save('danish.model')

        #testdata = dataset.get_swedish_test_corpus().parsed_sents()
        testdata = dataset.get_korean_test_corpus().parsed_sents()
        #testdata = dataset.get_danish_test_corpus().parsed_sents()

        #tp = TransitionParser.load('swedish.model')
        tp = TransitionParser.load('korean.model')
        #tp = TransitionParser.load('danish.model')

        parsed = tp.parse(testdata)

        with open('test.conll', 'w') as f:
            for p in parsed:
                f.write(p.to_conll(10).encode('utf-8'))
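                f.write('\n')  # likely continuation: newline after each CoNLL block, as in the other examples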
Example #19
                    f.write('\n')

            print time.ctime(), "-------DONE----- BADMODEL", modelfile, conllfile

        if F_TRAIN_SWEDISH:
            print time.ctime(), "START TRAIN SWEDISH"
            traindata = dataset.get_swedish_train_corpus().parsed_sents()
            labeleddata = dataset.get_swedish_dev_corpus().parsed_sents()
            blinddata = dataset.get_swedish_dev_blind_corpus().parsed_sents()

            modelfile = 'swedish.model'
            conllfile = 'swedish.conll'

            tp = TransitionParser(Transition, FeatureExtractor)
            tp.train(traindata)
            tp.save(modelfile)

            # load model for testing
            tp = TransitionParser.load(modelfile)
            parsed = tp.parse(blinddata)

            ev = DependencyEvaluator(labeleddata, parsed)
            print "UAS: {} \nLAS: {}".format(*ev.eval())

            with open(conllfile, 'w') as f:
                for p in parsed:
                    f.write(p.to_conll(10).encode('utf-8'))
                    f.write('\n')
            print time.ctime(), "-------DONE----- TESTING SWEDISH ", modelfile, conllfile

        if F_TRAIN_ENGLISH:
Example #20
from transition import Transition

if __name__ == "__main__":
    #    data = dataset.get_swedish_train_corpus().parsed_sents()
    data = dataset.get_english_train_corpus().parsed_sents()
    #    data = dataset.get_korean_train_corpus().parsed_sents()
    #    data = dataset.get_danish_train_corpus().parsed_sents()
    #
    random.seed(1234)
    subdata = random.sample(data, 200)

    try:
        tp = TransitionParser(Transition, FeatureExtractor)
        tp.train(subdata)
        #        tp.save('swedish.model')
        tp.save("english.model")
        #        tp.save('korean.model')
        #        tp.save('danish.model')
        #        testdata = dataset.get_swedish_test_corpus().parsed_sents()
        #
        testdata = dataset.get_english_dev_corpus().parsed_sents()
        # 	testdata = dataset.get_korean_test_corpus().parsed_sents()
        # 	testdata = dataset.get_danish_test_corpus().parsed_sents()
        #        tp = TransitionParser.load('swedish.model')
        tp = TransitionParser.load("english.model")
        #        tp = TransitionParser.load('korean.model')
        # 	tp = TransitionParser.load('danish.model')

        parsed = tp.parse(testdata)
        with open("test.conll", "w") as f:
            for p in parsed:
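                # likely continuation, mirroring the write loop in the other examples (hedged):
                f.write(p.to_conll(10).encode("utf-8"))
                f.write("\n")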