def main(): if len(sys.argv) < 4: print """ Usage: python parse.py in.model > out.conll Input can be provided manually via the command prompt or piped directly to the script using cat. """ # END if if sys.stdin.isatty(): rawtext = [raw_input("Please type a sentence!")] else: rawtext = sys.stdin.read() # END if out_filename = sys.argv[3] model_filename = sys.argv[1] try: tp = TransitionParser.load(model_filename) parsed = tp.parse(rawtext) with open(out_filename, 'w') as f: for p in parsed: f.write(p.to_conll(10).encode('utf-8')) f.write('\n') # END for # END with except Exception: "Error."
def main():
    """Read sentences from stdin, parse them with the model file named in
    sys.argv[1], and print CoNLL(10) rows to stdout (viewable with MaltEval)."""
    try:
        sentences = sys.stdin.readlines()
        model_file = sys.argv[1]
    except IndexError:
        # FIX: was a bare `except:`, which also swallowed KeyboardInterrupt
        # and SystemExit; only a missing argv[1] should produce the usage.
        raise ValueError('''Usage: cat <file of sentences> | python parse.py <model_file> or, python parse.py <model_file>, type sentences and hit Ctrl+d''')
    if not os.path.isfile(model_file):
        raise ValueError('cant find the model file')
    # scrub list / remove line breaks
    sentences = [sent.rstrip() for sent in sentences]
    # generate dependency graph object from sentences
    depgraphs = [DependencyGraph.from_sentence(sent) for sent in sentences]
    # load model and parse
    tp = TransitionParser.load(model_file)
    parsed = tp.parse(depgraphs)
    # print to stdout; can cat this to a conll file for viewing with MaltEval
    for p in parsed:
        print(p.to_conll(10).encode('utf-8'))
    return
def evaluate_parse(partIdx): if partIdx == 3: print 'Evaluating your swedish model ... ' testdata = dataset.get_swedish_test_corpus().parsed_sents() if not os.path.exists('./swedish.model'): print 'No model. Please save your model as swedish.model at current directory before submission.' sys.exit(0) tp = TransitionParser.load('swedish.model') parsed = tp.parse(testdata) ev = DependencyEvaluator(testdata, parsed) uas, las = ev.eval() print 'UAS:',uas print 'LAS:',las swed_score = (min(las, 0.7) / 0.7) ** 2 return swed_score if partIdx == 1: print 'Evaluating your english model ... ' testdata = dataset.get_english_test_corpus().parsed_sents() if not os.path.exists('./english.model'): print 'No model. Please save your model as english.model at current directory before submission.' sys.exit(0) tp = TransitionParser.load('english.model') parsed = tp.parse(testdata) ev = DependencyEvaluator(testdata, parsed) uas, las = ev.eval() print 'UAS:',uas print 'LAS:',las eng_score = (min(las, 0.7) / 0.7) ** 2 return eng_score if partIdx == 2: print 'Evaluating your danish model ... ' testdata = dataset.get_danish_test_corpus().parsed_sents() if not os.path.exists('./danish.model'): print 'No model. Please save your model danish.model at current directory before submission.' sys.exit(0) tp = TransitionParser.load('danish.model') parsed = tp.parse(testdata) ev = DependencyEvaluator(testdata, parsed) uas, las = ev.eval() print 'UAS:',uas print 'LAS:',las dan_score = (min(las, 0.7) / 0.7) ** 2 return dan_score
def evaluate_parse(partIdx): if partIdx == 3: print 'Evaluating your swedish model ... ' testdata = dataset.get_swedish_test_corpus().parsed_sents() if not os.path.exists('./swedish.model'): print 'No model. Please save your model as swedish.model at current directory before submission.' sys.exit(0) tp = TransitionParser.load('swedish.model') parsed = tp.parse(testdata) ev = DependencyEvaluator(testdata, parsed) uas, las = ev.eval() print 'UAS:', uas print 'LAS:', las swed_score = (min(las, 0.7) / 0.7)**2 return swed_score if partIdx == 1: print 'Evaluating your english model ... ' testdata = dataset.get_english_test_corpus().parsed_sents() if not os.path.exists('./english.model'): print 'No model. Please save your model as english.model at current directory before submission.' sys.exit(0) tp = TransitionParser.load('english.model') parsed = tp.parse(testdata) ev = DependencyEvaluator(testdata, parsed) uas, las = ev.eval() print 'UAS:', uas print 'LAS:', las eng_score = (min(las, 0.7) / 0.7)**2 return eng_score if partIdx == 2: print 'Evaluating your danish model ... ' testdata = dataset.get_danish_test_corpus().parsed_sents() if not os.path.exists('./danish.model'): print 'No model. Please save your model danish.model at current directory before submission.' sys.exit(0) tp = TransitionParser.load('danish.model') parsed = tp.parse(testdata) ev = DependencyEvaluator(testdata, parsed) uas, las = ev.eval() print 'UAS:', uas print 'LAS:', las dan_score = (min(las, 0.7) / 0.7)**2 return dan_score
def main(): file_to_parse = sys.stdin sentences_list = [s for s in file_to_parse] file_to_parse.close() lang_model = sys.argv[1] tp = TransitionParser.load(lang_model) sentences = [DependencyGraph.from_sentence(s) for s in sentences_list] parsed = tp.parse(sentences) for p in parsed: print p.to_conll(10).encode('utf-8')
def verify_lang_data(model, conll_output): try: lang = extract_lang_from_model_name(model) testdata = get_data_from_lang(lang) tp = TransitionParser.load(model) parsed = tp.parse(testdata) with open(conll_output, 'w') as f: for p in parsed: f.write(p.to_conll(10).encode('utf-8')) f.write('\n') ev = DependencyEvaluator(testdata, parsed) uas, las = ev.eval() print "\n=====Prediction of {}.model===== \nUAS: {} \nLAS: {}".format(lang, uas, las) return las pass except ValueError as e: print(e)
# NOTE(review): truncated fragment -- the trailing `print """` opens a
# triple-quoted message whose remainder is outside this chunk, and the
# commented lines' original layout is partly inferred. Documentation only.
def parse(argv):
    # Usage guard: exactly one model-file argument is required.
    if len(argv) != 2:
        sys.exit( "python parse.py language.model")
    # data = dataset.get_english_train_corpus().parsed_sents()
    # random.seed(1234)
    # subdata = random.sample(data, 200)
    language_model = argv[1]
    try:
        sentences = sys.stdin.readlines()
        for i,sentence in enumerate(sentences):
            dg = DependencyGraph.from_sentence(sentence)
            # NOTE(review): the model is re-loaded for every sentence --
            # presumably hoistable out of the loop; left as-is here.
            tp = TransitionParser.load(language_model)
            parsed = tp.parse([dg])
            print parsed[0].to_conll(10).encode('utf-8')
            # tp = TransitionParser(Transition, FeatureExtractor)
            # tp.train(subdata)
            # tp.save('english.model')
            # testdata = dataset.get_swedish_test_corpus().parsed_sents()
            # tp = TransitionParser.load('english.model')
            # parsed = tp.parse(testdata)
            #open new file for write on first sentence
            if i == 0:
                with open('test.conll', 'w') as f:
                    for p in parsed:
                        f.write(p.to_conll(10).encode('utf-8'))
                        f.write('\n')
            #append for rest sentences
            else:
                with open('test.conll', 'a') as f:
                    for p in parsed:
                        f.write(p.to_conll(10).encode('utf-8'))
                        f.write('\n')
        # ev = DependencyEvaluator(testdata, parsed)
        # print "UAS: {} \nLAS: {}".format(*ev.eval())
    except NotImplementedError:
        print """
def main(): lang_train_list = [] if len(sys.argv) == 1: lang_train_list = ['swedish', 'danish', 'english'] else: lang_train_list = sys.argv[1:] random.seed(1126) for lang in lang_train_list: whole_data = get_train_data_from_lang(lang) subdata = random.sample(whole_data, 200) tp = TransitionParser(Transition, FeatureExtractor) print '\n===== Start training {} data ====='.format(lang) tp.train(subdata) tp.save(lang + '.model') print '===== Sucessfully generating models ====='
def train_model(lang,training_set='train'): # load and sample data data = get_data(lang,dataset=training_set).parsed_sents() if len(data) >200: random.seed(1234) subdata = random.sample(data, 200) else: subdata = data # train model and save tp = TransitionParser(Transition, FeatureExtractor) tp.train(subdata) tp.save('{0}.model'.format(lang)) # test performance on new data if lang != 'english': testdata = get_data(lang,dataset='test').parsed_sents() # english test data not available # so find a subset of training data # that is disjoint from data used for training else: not_in_training = [sent for sent in data if sent not in subdata] testdata = random.sample(not_in_training,200) parsed = tp.parse(testdata) ev = DependencyEvaluator(testdata, parsed) # store and print results with open('results.txt','a') as results_file: results_file.write('{0} model:\n'.format(lang)) results_file.write("UAS: {} \nLAS: {}\n".format(*ev.eval())) print '{0} model:\n'.format(lang) print "UAS: {} \nLAS: {}\n".format(*ev.eval()) return ev.eval()[1]
# NOTE(review): truncated fragment -- the chunk ends inside the
# `with open(...)` block (`for p in parsed:` has no body here) and the
# `try:` has no visible `except`. Documentation only; no code changes.
from transition import Transition
if __name__ == '__main__':
    # Train on English (other corpora left commented by the author).
    #data = dataset.get_swedish_train_corpus().parsed_sents()
    data = dataset.get_english_train_corpus().parsed_sents()
    #data = dataset.get_danish_train_corpus().parsed_sents()
    random.seed(1234)
    subdata = random.sample(data, 200)
    # For Swedish to get 200 projectives
    #subdata = random.sample(data, 223)
    # For Danish to get 200 projectives
    #subdata = random.sample(data, 236)
    try:
        tp = TransitionParser(Transition, FeatureExtractor)
        tp.train(subdata)
        #tp.save('swedish.model')
        #tp.save('english.model')
        #tp.save('danish.model')
        #testdata = dataset.get_swedish_test_corpus().parsed_sents()
        testdata = dataset.get_english_dev_corpus().parsed_sents()
        #testdata = dataset.get_danish_test_corpus().parsed_sents()
        #tp = TransitionParser.load('badfeatures.model')
        parsed = tp.parse(testdata)
        with open('test.conll', 'w') as f:
            for p in parsed:
__author__ = 'johnfulgoni' import sys from providedcode.dependencygraph import DependencyGraph from providedcode.transitionparser import TransitionParser # DON'T PRINT ANYTHING! OR ELSE IT MESSES THINGS UP if __name__ == '__main__': argc = len(sys.argv) if argc == 2: #print sys.argv[1] # just to see sentence_list = [] for sent in sys.stdin: # get the sentences from the englishfile sentence = DependencyGraph.from_sentence(sent) sentence_list.append(sentence) my_model = sys.argv[1] # should be 'english.model' tp = TransitionParser.load(my_model) parsed = tp.parse(sentence_list) # following the example in test.py # but we're not writing it to a file for p in parsed: print p.to_conll(10).encode('utf-8') print '\n' else: print "Need two arguments" exit(1)
# NOTE(review): truncated fragment -- the `try:` block has no visible
# `except` (it continues beyond this chunk). Documentation only.
from providedcode.evaluate import DependencyEvaluator
from featureextractor import FeatureExtractor
from transition import Transition
if __name__ == '__main__':
    # Sample 200 Swedish training sentences (training itself is commented
    # out; this run only evaluates the provided badfeatures.model).
    data = dataset.get_swedish_train_corpus().parsed_sents()
    random.seed(1234)
    subdata = random.sample(data, 200)
    try:
        # tp = TransitionParser(Transition, FeatureExtractor)
        # tp.train(subdata)
        # tp.save('swedish.model')
        testdata = dataset.get_swedish_test_corpus().parsed_sents()
        tp = TransitionParser.load('badfeatures.model')
        parsed = tp.parse(testdata)
        with open('test.conll', 'w') as f:
            for p in parsed:
                f.write(p.to_conll(10).encode('utf-8'))
                f.write('\n')
        ev = DependencyEvaluator(testdata, parsed)
        print "UAS: {} \nLAS: {}".format(*ev.eval())
        # parsing arbitrary sentences (english):
        # sentence = DependencyGraph.from_sentence('Hi, this is a test')
        # tp = TransitionParser.load('english.model')
subdata = random.sample(data, 200) # use this subdata for bad features and swedish # NEED DANISH AND ENGLISH data_e = dataset.get_english_train_corpus().parsed_sents() random.seed(1234) subdata_e = random.sample(data_e, 200) data_d = dataset.get_danish_train_corpus().parsed_sents() random.seed(1234) subdata_d = random.sample(data_d, 200) try: # BAD FEATURES MODEL (SWEDISH DATA) print "Starting Bad Features" testdata = dataset.get_swedish_test_corpus().parsed_sents() tp = TransitionParser.load('badfeatures.model') parsed = tp.parse(testdata) with open('test.conll', 'w') as f: for p in parsed: f.write(p.to_conll(10).encode('utf-8')) f.write('\n') ev = DependencyEvaluator(testdata, parsed) print "Bad Features Results" print "UAS: {} \nLAS: {}".format(*ev.eval()) t1 = time.time() print "Time: "+str(t1 - t0) + '\n' # SWEDISH FEATURE MODELS
# NOTE(review): truncated fragment -- ends inside `with open(...)`
# (`for p in parsed:` has no body) and the `try:` has no visible
# `except`. Documentation only.
from providedcode.transitionparser import TransitionParser
from providedcode.evaluate import DependencyEvaluator
from featureextractor import FeatureExtractor
from transition import Transition
from providedcode.dependencygraph import DependencyGraph
if __name__ == '__main__':
    # Danish is the active corpus; the other languages are commented out.
    # data = dataset.get_swedish_train_corpus().parsed_sents()
    # data = dataset.get_english_train_corpus().parsed_sents()
    # data = dataset.get_dutch_train_corpus().parsed_sents()
    data = dataset.get_danish_train_corpus().parsed_sents()
    random.seed(1234)
    subdata = random.sample(data, 200)
    try:
        tp = TransitionParser(Transition, FeatureExtractor)
        tp.train(subdata)
        # tp.save('swedish.model')
        # tp.save('english.model')
        tp.save('danish.model')
        # testdata = dataset.get_swedish_test_corpus().parsed_sents()
        testdata = dataset.get_danish_test_corpus().parsed_sents()
        # tp = TransitionParser.load('badfeatures.model')
        # testdata = dataset.get_english_test_corpus().parsed_sents()
        # tp = TransitionParser.load('english.model')
        parsed = tp.parse(testdata)
        with open('test.conll', 'w') as f:
            for p in parsed:
def handle_input(input_file, model_file): tp = TransitionParser.load(model_file) for line in input_file: sentence = DependencyGraph.from_sentence(line) parsed = tp.parse([sentence]) print parsed[0].to_conll(10).encode('utf-8')
import providedcode from providedcode.transitionparser import TransitionParser from providedcode.dependencygraph import DependencyGraph from providedcode.evaluate import DependencyEvaluator import sys tp = TransitionParser.load('english.model') for line in sys.stdin: sentence = DependencyGraph.from_sentence(line) parsed = tp.parse([sentence]) print parsed[0].to_conll(10).encode('utf-8')
#get korean training data koreandata = dataset.get_korean_train_corpus().parsed_sents() random.seed(1234) koreansubdata = random.sample(koreandata, 200) #get danish training data danishdata = dataset.get_danish_train_corpus().parsed_sents() random.seed(1234) danishsubdata = random.sample(danishdata, 235) try: #SWEDISH TESTING tp = TransitionParser(Transition, FeatureExtractor) tp.train(swedishsubdata) tp.save('swedish.model') #badfeatures.model...don't use for real testing #tp = TransitionParser.load('badfeatures.model') testdata = dataset.get_swedish_test_corpus().parsed_sents() parsed = tp.parse(testdata) #to write output...for badfeatures.model ''' with open('test.conll', 'w') as f: for p in parsed:
# NOTE(review): truncated fragment -- the `try:` has no visible `except`
# (the chunk ends mid-block in commented-out code). Documentation only.
import nltk
from providedcode import dataset
from providedcode.transitionparser import TransitionParser
from providedcode.evaluate import DependencyEvaluator
from featureextractor import FeatureExtractor
from transition import Transition
from providedcode.dependencygraph import DependencyGraph
if __name__ == '__main__':
    # Train an English model on a reproducible 200-sentence sample.
    data = dataset.get_english_train_corpus().parsed_sents()
    #data = nltk.corpus.dependency_treebank.parsed_sents()
    random.seed(1234)
    subdata = random.sample(data, 200)
    try:
        tp = TransitionParser(Transition, FeatureExtractor)
        tp.train(subdata)
        tp.save('english.model')
        # testdata = dataset.get_english_test_corpus().parsed_sents()
        # tp = TransitionParser.load('badfeatures.model')
        # tp = TransitionParser.load('english.model')
        # sentence = DependencyGraph.from_sentence('Hi, this is a test')
        # testdata.append(sentence)
        # parsed = tp.parse([sentence])
        # parsed = tp.parse(testdata)
        # with open('test.conll', 'w') as f:
        # for p in parsed:
# NOTE(review): truncated fragment -- `tests` and `MyFeatureExtractor`
# are defined outside this chunk and the `try:` has no visible `except`.
# Also note data_2h = data[half:-1] silently drops the final sentence --
# presumably unintended; confirm before relying on the test split.
scoreWeight = {'swedish': 25., 'danish': 25., 'english': 50.}
totalPoints = 0
for testName in tests.keys():
    # Split each corpus in half: train from the first half, test from the
    # second.
    data = tests[testName]().parsed_sents()
    data_1h = data[0:(len(data)/2)]
    data_2h = data[(len(data)/2):-1]
    random.seed(99999)
    traindata = random.sample(data_1h, 200)
    testdata = random.sample(data_2h, 800)
    try:
        print "Training {0} model...".format(testName)
        tp = TransitionParser(Transition, MyFeatureExtractor)
        tp.train(traindata)
        tp.save(testName + ".model")
        print "Testing {0} model...".format(testName)
        parsed = tp.parse(testdata)
        # with open('test.conll', 'w') as f:
        # for p in parsed:
        # f.write(p.to_conll(10).encode('utf-8'))
        # f.write('\n')
        ev = DependencyEvaluator(testdata, parsed)
        print "Test Results For: {0}".format(testName)
        (uas, las) = ev.eval()
        # Score saturates at LAS 0.7 and is weighted per language.
        points = scoreWeight[testName] * (min(0.7, las)/0.7)**2
from providedcode import dataset from providedcode.transitionparser import TransitionParser from providedcode.evaluate import DependencyEvaluator from featureextractor import FeatureExtractor from transition import Transition from providedcode.dependencygraph import DependencyGraph import sys tp = TransitionParser.load('english.model') sentences = [] for sentence in sys.stdin: sentence = sentence.strip() sentence = DependencyGraph.from_sentence(sentence) sentences.append(sentence) parsed = tp.parse(sentences) for parse in parsed: print parse.to_conll(10).encode('utf-8')
# NOTE(review): truncated fragment -- the second `try:` has no visible
# `except` (it continues beyond this chunk). Documentation only.
from providedcode.dependencygraph import DependencyGraph
from providedcode.transitionparser import TransitionParser
from transition import Transition
if __name__ == '__main__':
    # print 'NLP Parse Program..'
    try:
        model_path = sys.argv[1]
        # print 'ModelPath', model_path
    except IndexError as ie:
        print 'Model Path Not Specified! Exiting...', ie
        sys.exit(-1)
    try:
        # NOTE(review): this first TransitionParser is immediately
        # overwritten by the load() below -- dead construction.
        tp = TransitionParser(Transition, FeatureExtractor)
        tp = TransitionParser.load(model_path)  # load the trained model for parsing.
        for line in sys.stdin:
            # print 'Processing:', line
            sentence = DependencyGraph.from_sentence(line)
            parsed = tp.parse([sentence])  # parse the input line
            print parsed[0].to_conll(10).encode('utf-8')
        # with open('test.conll', 'w') as f:
        # for p in parsed:
        # f.write(p.to_conll(10).encode('utf-8'))
        # f.write('\n')
        # parsing arbitrary sentences (english):
        # sentence = DependencyGraph.from_sentence('Hi, this is a test')
#os.chdir("/home/sidvash/NLP_coursera/Assignment1/code/") import random from providedcode import dataset from providedcode.transitionparser import TransitionParser from providedcode.evaluate import DependencyEvaluator from featureextractor import FeatureExtractor from transition import Transition if __name__ == '__main__': data = dataset.get_swedish_train_corpus().parsed_sents() random.seed(1234) subdata = random.sample(data, 200) try: tp = TransitionParser(Transition, FeatureExtractor) tp.train(subdata) tp.save('swedish.model') testdata = dataset.get_swedish_test_corpus().parsed_sents() tp = TransitionParser.load('swedish.model') parsed = tp.parse(testdata) with open('test.conll', 'w') as f: for p in parsed: f.write(p.to_conll(10).encode('utf-8')) f.write('\n') ev = DependencyEvaluator(testdata, parsed)
# NOTE(review): truncated fragment -- the `try:` has no visible `except`
# (the chunk ends right after the first print). Documentation only.
if __name__ == '__main__':
    # 'data' is parsed sentences converted into Dependency Graph objects.
    # language -> (model filename, train-corpus loader, test-corpus loader)
    model_dict = {
        'english' : ('english.model', dataset.get_english_train_corpus, dataset.get_english_test_corpus),
        'danish' : ('danish.model', dataset.get_danish_train_corpus, dataset.get_danish_test_corpus),
        'swedish' : ('swedish.model', dataset.get_swedish_train_corpus, dataset.get_swedish_test_corpus)
    }
    for model_type, model_tuple in model_dict.iteritems():
        model, data, testdata = model_tuple[0], model_tuple[1]().parsed_sents(), model_tuple[2]().parsed_sents()
        random.seed(1234)
        subdata = random.sample(data, 200)  # 200 randomly selected DependencyGraphs(sentences) for model training.
        try:
            tp = TransitionParser(Transition, FeatureExtractor)
            tp.train(subdata)  # train with 200 randomly selected dependency graphs(sentences).
            tp.save(model)  # save the trained model.
            tp = TransitionParser.load(model)  # load the trained model for parsing.
            parsed = tp.parse(testdata)  # parse the test data
            # NOTE(review): test.conll is rewritten ('w') on every language
            # iteration, so only the last language's parse survives.
            with open('test.conll', 'w') as f:
                for p in parsed:
                    f.write(p.to_conll(10).encode('utf-8'))
                    f.write('\n')
            # evaluate the test parse result here...
            ev = DependencyEvaluator(testdata, parsed)
            print 'Model: {}'.format(model_type)
# print('Parsing dev corpus...') # EN_testdata = dataset.get_english_dev_corpus().parsed_sents() # EN_tp = TransitionParser.load('english.model') # EN_parsed = EN_tp.parse(EN_testdata) # print('Ok') # # SE # tp = TransitionParser(Transition, FeatureExtractor) # tp.train(SE_subdata) # tp.save('swedish.model') # SE_testdata = dataset.get_swedish_test_corpus().parsed_sents() # SE_tp = TransitionParser.load('swedish.model') # SE_parsed = SE_tp.parse(SE_testdata) # # DK tp = TransitionParser(Transition, FeatureExtractor) print('Training...') tp.train(DK_subdata) print('Ok. Saving the model...') tp.save('danish.model') print('Ok. Parsing the test corpus...') DK_testdata = dataset.get_danish_test_corpus().parsed_sents() #DK_tp = TransitionParser.load('danish.model') DK_parsed = tp.parse(DK_testdata) print('Ok.') # with open('english.conll', 'w') as f: # for p in EN_parsed: # f.write(p.to_conll(10).encode('utf-8')) # f.write('\n') #
# load test set in english and get 200 random sentences english_data = dataset.get_english_train_corpus().parsed_sents() random.seed() english_subdata = random.sample(english_data, 200) # load test set in danish and get 200 random sentences danish_data = dataset.get_danish_train_corpus().parsed_sents() random.seed() danish_subdata = random.sample(danish_data, 200) try: print 'training swedish' # swedish tp = TransitionParser(Transition, FeatureExtractor) tp.train(swedish_subdata) tp.save('swedish.model') testdata = dataset.get_swedish_test_corpus().parsed_sents() tp = TransitionParser.load('swedish.model') print 'testing swedish' parsed = tp.parse(testdata) with open('test.conll', 'w') as f: for p in parsed: f.write(p.to_conll(10).encode('utf-8')) f.write('\n') ev = DependencyEvaluator(testdata, parsed)
# NOTE(review): truncated fragment -- the `try:` has no visible `except`
# (it continues beyond this chunk). Documentation only.
import random
import nltk
from providedcode import dataset
from providedcode.transitionparser import TransitionParser
from providedcode.evaluate import DependencyEvaluator
from providedcode.dependencygraph import DependencyGraph
from featureextractor import FeatureExtractor
from transition import Transition
import sys
if __name__ == '__main__':
    try:
        model = sys.argv[1]
        tp = TransitionParser.load(model)
        for line in sys.stdin:
            # temp = line.strip()
            # temp = str(temp)
            # parsing arbitrary sentences (english):
            # print "[" + temp + "]"
            temp = line
            # temp = "Hi, this is a test."
            sentence = DependencyGraph.from_sentence(temp)
            # from_sentence() leaves ctag as the Penn Treebank tag; map each
            # node's ctag to the universal tagset before parsing.
            for key, dct in sentence.nodes.items():
                dct['ctag'] = nltk.tag.mapping.map_tag("en-ptb", "universal", dct['ctag'])
            parsed = tp.parse([sentence])
            print parsed[0].to_conll(10).encode('utf-8')
# NOTE(review): this chunk begins mid-statement (the head of the
# `random.sample(...)` call is outside the visible region) and the `try:`
# has no visible `except`. Documentation only; no code changes.
data, 200) # use this subdata for bad features and swedish
# NEED DANISH AND ENGLISH
data_e = dataset.get_english_train_corpus().parsed_sents()
random.seed(1234)
subdata_e = random.sample(data_e, 200)
data_d = dataset.get_danish_train_corpus().parsed_sents()
random.seed(1234)
subdata_d = random.sample(data_d, 200)
try:
    # BAD FEATURES MODEL (SWEDISH DATA)
    print "Starting Bad Features"
    testdata = dataset.get_swedish_test_corpus().parsed_sents()
    tp = TransitionParser.load('badfeatures.model')
    parsed = tp.parse(testdata)
    with open('test.conll', 'w') as f:
        for p in parsed:
            f.write(p.to_conll(10).encode('utf-8'))
            f.write('\n')
    ev = DependencyEvaluator(testdata, parsed)
    print "Bad Features Results"
    print "UAS: {} \nLAS: {}".format(*ev.eval())
    # t0 is presumably set before this chunk -- confirm.
    t1 = time.time()
    print "Time: " + str(t1 - t0) + '\n'
    # SWEDISH FEATURE MODELS
# NOTE(review): truncated fragment -- the `try:` has no visible `except`,
# and the collapsed "### tp.save('danish.model')" is ambiguous (it may
# have been a commented line or live code); reconstructed here as a
# comment -- confirm against the original file. Documentation only.
from providedcode import dataset
from providedcode.transitionparser import TransitionParser
from providedcode.evaluate import DependencyEvaluator
from featureextractor import FeatureExtractor
from transition import Transition
if __name__ == '__main__':
    # English is the active corpus; note the FULL training corpus is used
    # (no 200-sentence subsample here).
    #traindata = dataset.get_swedish_train_corpus().parsed_sents()
    traindata = dataset.get_english_train_corpus().parsed_sents()
    #traindata = dataset.get_danish_train_corpus().parsed_sents()
    try:
        tp = TransitionParser(Transition, FeatureExtractor)
        tp.train(traindata)
        #tp.save('swedish.model')
        #tp.save('english.model')
        ### tp.save('danish.model')
        #labeleddata = dataset.get_swedish_dev_corpus().parsed_sents()
        labeleddata = dataset.get_english_dev_corpus().parsed_sents()
        #labeleddata = dataset.get_danish_dev_corpus().parsed_sents()
        #blinddata = dataset.get_swedish_dev_blind_corpus().parsed_sents()
        blinddata = dataset.get_english_dev_blind_corpus().parsed_sents()
        #blinddata = dataset.get_danish_dev_blind_corpus().parsed_sents()
        #tp = TransitionParser.load('badfeatures.model')
        parsed = tp.parse(blinddata)
# NOTE(review): truncated fragment -- `F_TEST_BADMODEL` and
# `F_TRAIN_SWEDISH` are defined before this chunk, and the `try:` has no
# visible `except`. Documentation only.
F_TRAIN_ENGLISH = True
F_TRAIN_DANISH = True
F_TRAIN_KOREAN = False
#traindata = dataset.get_swedish_train_corpus().parsed_sents()
try:
    if F_TEST_BADMODEL == True:
        # Evaluate the provided badfeatures.model on the Swedish dev data
        # and dump the parse to test.conll.
        print time.ctime(), "START BADMODEL"
        traindata = dataset.get_swedish_train_corpus().parsed_sents()
        labeleddata = dataset.get_swedish_dev_corpus().parsed_sents()
        blinddata = dataset.get_swedish_dev_blind_corpus().parsed_sents()
        modelfile = 'badfeatures.model'
        tp = TransitionParser.load(modelfile)
        parsed = tp.parse(blinddata)
        ev = DependencyEvaluator(labeleddata, parsed)
        print "UAS: {} \nLAS: {}".format(*ev.eval())
        conllfile = 'test.conll'
        with open(conllfile, 'w') as f:
            for p in parsed:
                f.write(p.to_conll(10).encode('utf-8'))
                f.write('\n')
        print time.ctime(), "-------DONE----- BADMODEL", modelfile, conllfile
    if F_TRAIN_SWEDISH == True:
        print time.ctime(), "START TRAIN SWEDISH"
# NOTE(review): truncated fragment -- the `try:` has no visible `except`
# (the chunk ends in commented-out code). Documentation only.
import random
from providedcode import dataset
from providedcode.transitionparser import TransitionParser
from providedcode.evaluate import DependencyEvaluator
from featureextractor import FeatureExtractor
from transition import Transition
if __name__ == '__main__':
    # Train a Swedish model on a reproducible 200-sentence sample, reload
    # it, parse the test corpus to test.conll and print UAS/LAS.
    data = dataset.get_swedish_train_corpus().parsed_sents()
    random.seed(1234)
    subdata = random.sample(data, 200)
    try:
        # removed commenting from following three lines, should generate saved models
        tp = TransitionParser(Transition, FeatureExtractor)
        tp.train(subdata)
        tp.save('swedish.model')
        testdata = dataset.get_swedish_test_corpus().parsed_sents()
        tp = TransitionParser.load('swedish.model')
        parsed = tp.parse(testdata)
        with open('test.conll', 'w') as f:
            for p in parsed:
                f.write(p.to_conll(10).encode('utf-8'))
                f.write('\n')
        ev = DependencyEvaluator(testdata, parsed)
        print "UAS: {} \nLAS: {}".format(*ev.eval())
        # parsing arbitrary sentences (swedish):
# NOTE(review): truncated fragment -- the `try:` has no visible `except`
# (it continues beyond this chunk). Documentation only.
from transition import Transition
if __name__ == '__main__':
    # Swedish is the active corpus; other languages left commented.
    data = dataset.get_swedish_train_corpus().parsed_sents()
    # data = dataset.get_english_test_corpus().parsed_sents()
    # data = dataset.get_danish_train_corpus().parsed_sents()
    random.seed(1234)
    subdata = random.sample(data, 200)
    try:
        tp = TransitionParser(Transition, FeatureExtractor)
        tp.train(subdata)
        tp.save('swedish.model')
        # tp.save('english.model')
        # tp.save('danish.model')
        testdata = dataset.get_swedish_test_corpus().parsed_sents()
        #tp = TransitionParser.load('badfeatures.model')
        parsed = tp.parse(testdata)
        with open('test.conll', 'w') as f:
            for p in parsed:
                f.write(p.to_conll(10).encode('utf-8'))
                f.write('\n')
# NOTE(review): truncated fragment -- `F_TEST_BADMODEL` is defined before
# this chunk, the final `if` has no body here, and the `try:` has no
# visible `except`. Documentation only.
F_TRAIN_SWEDISH = True
F_TRAIN_ENGLISH = True
F_TRAIN_DANISH = True
F_TRAIN_KOREAN = False
#traindata = dataset.get_swedish_train_corpus().parsed_sents()
try:
    if F_TEST_BADMODEL == True:
        # Evaluate the provided badfeatures.model on the Swedish dev data
        # and dump the parse to test.conll.
        print time.ctime(), "START BADMODEL"
        traindata = dataset.get_swedish_train_corpus().parsed_sents()
        labeleddata = dataset.get_swedish_dev_corpus().parsed_sents()
        blinddata = dataset.get_swedish_dev_blind_corpus().parsed_sents()
        modelfile = 'badfeatures.model'
        tp = TransitionParser.load(modelfile)
        parsed = tp.parse(blinddata)
        ev = DependencyEvaluator(labeleddata, parsed)
        print "UAS: {} \nLAS: {}".format(*ev.eval())
        conllfile = 'test.conll'
        with open(conllfile, 'w') as f:
            for p in parsed:
                f.write(p.to_conll(10).encode('utf-8'))
                f.write('\n')
        print time.ctime( ), "-------DONE----- BADMODEL", modelfile, conllfile
    if F_TRAIN_SWEDISH == True:
# NOTE(review): truncated fragment -- the `try:` has no visible `except`.
# Also: other blocks in this file unpack ev.eval() as (uas, las), so the
# "LAS: ... UAS: ..." labels below look swapped -- confirm before trusting
# the printed numbers. Documentation only.
from providedcode.evaluate import DependencyEvaluator
from featureextractor import FeatureExtractor
from transition import Transition
if __name__ == "__main__":
    # Training is commented out; this run only evaluates badfeatures.model
    # on the Swedish test corpus.
    data = dataset.get_swedish_train_corpus().parsed_sents()
    random.seed(1234)
    subdata = random.sample(data, 200)
    try:
        # tp = TransitionParser(Transition, FeatureExtractor)
        # tp.train(subdata)
        # tp.save('swedish.model')
        testdata = dataset.get_swedish_test_corpus().parsed_sents()
        tp = TransitionParser.load("badfeatures.model")
        parsed = tp.parse(testdata)
        with open("test.conll", "w") as f:
            for p in parsed:
                f.write(p.to_conll(10).encode("utf-8"))
                f.write("\n")
        ev = DependencyEvaluator(testdata, parsed)
        print "LAS: {} \nUAS: {}".format(*ev.eval())
        # parsing arbitrary sentences (english):
        # sentence = DependencyGraph.from_sentence('Hi, this is a test')
        # tp = TransitionParser.load('english.model')
from providedcode.transitionparser import TransitionParser from providedcode.dependencygraph import DependencyGraph from nltk.tag import mapping import sys # check number of inputs if len(sys.argv) != 2: raise ValueError("Invalid arguments. Usage: python parse.py <modelfile>") # validate & load modelfile from commandline arg modelfile = sys.argv[1] tp = TransitionParser.load(modelfile) # read each line as a sentence from stdin for line in sys.stdin: line = line.strip() if len(line) == 0: continue # convert to DependencyGraph and replace CPOS features as they are not # originally part of from_sentence(). Piazza sentence = DependencyGraph.from_sentence(line) for node in sentence.nodes: tag = sentence.nodes[node]['tag'] ctag = mapping.map_tag('wsj', 'universal', tag) sentence.nodes[node]['ctag'] = ctag parsed = tp.parse([sentence]) print parsed[0].to_conll(10).encode('utf-8') #print ">>",line,"<<"
# NOTE(review): truncated fragment -- the `try:` has no visible `except`
# (the chunk ends in commented-out code). Documentation only.
import random
from providedcode import dataset
from providedcode.transitionparser import TransitionParser
from providedcode.evaluate import DependencyEvaluator
from featureextractor import FeatureExtractor
from transition import Transition
if __name__ == '__main__':
    # Train an English model on a reproducible 200-sentence sample, parse
    # the dev corpus to test.conll and print UAS/LAS.
    data = dataset.get_english_train_corpus().parsed_sents()
    random.seed(1234)
    subdata = random.sample(data, 200)
    try:
        tp = TransitionParser(Transition, FeatureExtractor)
        tp.train(subdata)
        tp.save('english.model')
        testdata = dataset.get_english_dev_corpus().parsed_sents()
        #tp = TransitionParser.load('badfeatures.model')
        parsed = tp.parse(testdata)
        with open('test.conll', 'w') as f:
            for p in parsed:
                f.write(p.to_conll(10).encode('utf-8'))
                f.write('\n')
        ev = DependencyEvaluator(testdata, parsed)
        print "UAS: {} \nLAS: {}".format(*ev.eval())
        # parsing arbitrary sentences (english):
import sys from providedcode.transitionparser import TransitionParser from providedcode.dependencygraph import DependencyGraph if __name__ == '__main__': # the raw sentences read from englishfile lines = sys.stdin # if no sentences read or not enough parameters retrieved, exit the program. if not lines or not sys.argv.__len__() == 2: exit() # put the raw sentences in to dependency graphs and form a list of these graphs. sentences = [DependencyGraph.from_sentence(line) for line in lines] model_name = sys.argv[1] # load the trained model tp = TransitionParser.load(model_name) # parse the sentences with the model parsed = tp.parse(sentences) # write the parsed sentences into the output file conll supported format. for parsed_line in parsed: print parsed_line.to_conll(10).encode('utf-8') #sentence = DependencyGraph.from_sentence('Hi, this is a test') #tp = TransitionParser.load('english.model') #parsed = tp.parse([sentence]) #print parsed[0].to_conll(10).encode('utf-8')
# print('Parsing dev corpus...') # EN_testdata = dataset.get_english_dev_corpus().parsed_sents() # EN_tp = TransitionParser.load('english.model') # EN_parsed = EN_tp.parse(EN_testdata) # print('Ok') # # SE # tp = TransitionParser(Transition, FeatureExtractor) # tp.train(SE_subdata) # tp.save('swedish.model') # SE_testdata = dataset.get_swedish_test_corpus().parsed_sents() # SE_tp = TransitionParser.load('swedish.model') # SE_parsed = SE_tp.parse(SE_testdata) # # DK tp = TransitionParser(Transition, FeatureExtractor) print('Training...') tp.train(DK_subdata) print('Ok. Saving the model...') tp.save('danish.model') print('Ok. Parsing the test corpus...') DK_testdata = dataset.get_danish_test_corpus().parsed_sents() #DK_tp = TransitionParser.load('danish.model') DK_parsed = tp.parse(DK_testdata) print('Ok.') # with open('english.conll', 'w') as f: # for p in EN_parsed: # f.write(p.to_conll(10).encode('utf-8')) # f.write('\n')
from providedcode.transitionparser import TransitionParser from providedcode.dependencygraph import DependencyGraph import fileinput import sys # parsing arbitrary sentences (english): import nltk from nltk.tag import map_tag if __name__ == '__main__': if (len(sys.argv) != 2): print "need 1 argument for model!" exit(1) tp = TransitionParser.load(sys.argv[1]) line = sys.stdin.readline() while line: sentence = DependencyGraph.from_sentence(line) for (index, node) in enumerate(sentence.nodes): sentence.nodes[index]['ctag'] = map_tag( 'en-ptb', 'universal', sentence.nodes[index]['ctag']) parsed = tp.parse([sentence]) print parsed[0].to_conll(10).encode('utf-8') line = sys.stdin.readline()
# NOTE(review): truncated fragment -- ends mid `with open(...)` body and
# the `try:` has no visible `except`. Documentation only.
from providedcode import dataset
from providedcode.transitionparser import TransitionParser
from providedcode.evaluate import DependencyEvaluator
from featureextractor import FeatureExtractor
from transition import Transition
if __name__ == '__main__':
    # Train on the FULL English training corpus (no subsample), then parse
    # the blind dev corpus into test.conll.
    # traindata = dataset.get_swedish_train_corpus().parsed_sents()
    traindata = dataset.get_english_train_corpus().parsed_sents()
    try:
        tp = TransitionParser(Transition, FeatureExtractor)
        tp.train(traindata)
        # tp.save('swedish.model')
        # labeleddata = dataset.get_swedish_dev_corpus().parsed_sents()
        # blinddata = dataset.get_swedish_dev_blind_corpus().parsed_sents()
        tp.save('english.model')
        labeleddata = dataset.get_english_dev_corpus().parsed_sents()
        blinddata = dataset.get_english_dev_blind_corpus().parsed_sents()
        #tp = TransitionParser.load('badfeatures.model')
        # parsed = tp.parse(labeleddata)
        parsed = tp.parse(blinddata)
        with open('test.conll', 'w') as f:
            for p in parsed:
                f.write(p.to_conll(10).encode('utf-8'))
import sys
from providedcode.transitionparser import TransitionParser
from providedcode.dependencygraph import DependencyGraph

if __name__ == '__main__':
    # Load the trained model named on the command line, then turn every
    # stdin line into a dependency graph and print its CoNLL-10 rendering.
    raw_lines = sys.stdin.readlines()
    parser = TransitionParser.load(sys.argv[1])
    for raw in raw_lines:
        graph = DependencyGraph.from_sentence(raw)
        result = parser.parse([graph])
        print(result[0].to_conll(10).encode('utf-8'))
from providedcode.dependencygraph import DependencyGraph
from providedcode import dataset
from providedcode.transitionparser import TransitionParser
from providedcode.evaluate import DependencyEvaluator
from featureextractor import FeatureExtractor
from transition import Transition
import sys

if __name__ == "__main__":
    try:
        # parsing arbitrary sentences (english):
        # Join all of stdin into a single string, build one dependency graph
        # from it, and parse it with the saved English model.
        fromInput = "".join(sys.stdin.readlines())
        # print fromInput
        sentence = DependencyGraph.from_sentence(fromInput)
        tp = TransitionParser.load("english.model")
        parsed = tp.parse([sentence])
        print parsed[0].to_conll(10).encode("utf-8")
    except NotImplementedError:
        # Friendly explanation printed when transition.py/featureextractor.py
        # are still unimplemented (the course starter-code state).
        # NOTE(review): the message below is truncated in this excerpt — its
        # closing triple quote lies beyond what is visible; the internal line
        # breaks shown here are a best-effort reconstruction.
        print """
        This file is currently broken! We removed the implementation of Transition
        (in transition.py), which tells the transitionparser how to go from one
        Configuration to another Configuration. This is an essential part of the
        arc-eager dependency parsing algorithm, so you should probably fix that :)
        The algorithm is described in great detail here:
            http://aclweb.org/anthology//C/C12/C12-1059.pdf
        We also haven't actually implemented most of the features for
        for the support vector machine (in featureextractor.py), so
        as you might expect the evaluator is going to give you somewhat bad results...
# Draw 200-sentence samples from the Swedish, English and Danish training
# corpora; re-seeding before each draw makes every sample reproducible.
# NOTE(review): fragment — `random`, `dataset`, `TransitionParser`, etc. are
# imported earlier in the original file, and the `try:` below is closed
# beyond this excerpt.
data = dataset.get_swedish_train_corpus().parsed_sents()
random.seed(1234)
subdata = random.sample(data, 200)

data_eng = dataset.get_english_train_corpus().parsed_sents()
random.seed(1234)
subdata_eng = random.sample(data_eng, 200)

data_dan = dataset.get_danish_train_corpus().parsed_sents()
random.seed(1234)
subdata_dan = random.sample(data_dan, 200)

try:
    # BAD MODEL ###########################################################
    # Score the deliberately-bad provided model against the Swedish test set.
    tp = TransitionParser.load('badfeatures.model')
    testdata = dataset.get_swedish_test_corpus().parsed_sents()
    parsed = tp.parse(testdata)
    ev = DependencyEvaluator(testdata, parsed)
    print "Bad Features Model"
    print "UAS: {} \nLAS: {}".format(*ev.eval())

    # SWEDISH #############################################################
    # Train on the 200-sentence Swedish sample, save it, reload test data.
    tp = TransitionParser(Transition, FeatureExtractor)
    tp.train(subdata)
    tp.save('swedish.model')
    testdata = dataset.get_swedish_test_corpus().parsed_sents()
    # tp = TransitionParser.load('badfeatures.model')
from providedcode.transitionparser import TransitionParser
from providedcode.evaluate import DependencyEvaluator
from featureextractor import FeatureExtractor
from transition import Transition

if __name__ == "__main__":
    # Train a 200-sentence English model, save it, and reload it from disk;
    # the commented alternatives retarget the same pipeline to other languages.
    # NOTE(review): `dataset` and `random` must be imported earlier in the
    # original file, and this `try:` is closed beyond this excerpt.
    # data = dataset.get_swedish_train_corpus().parsed_sents()
    data = dataset.get_english_train_corpus().parsed_sents()
    # data = dataset.get_korean_train_corpus().parsed_sents()
    # data = dataset.get_danish_train_corpus().parsed_sents()
    # random.seed(1234)
    subdata = random.sample(data, 200)  # seed is commented out: sample differs per run
    try:
        tp = TransitionParser(Transition, FeatureExtractor)
        tp.train(subdata)
        # tp.save('swedish.model')
        tp.save("english.model")
        # tp.save('korean.model')
        # tp.save('danish.model')
        # testdata = dataset.get_swedish_test_corpus().parsed_sents()
        # testdata = dataset.get_english_dev_corpus().parsed_sents()
        # testdata = dataset.get_korean_test_corpus().parsed_sents()
        # testdata = dataset.get_danish_test_corpus().parsed_sents()
        # tp = TransitionParser.load('swedish.model')
        tp = TransitionParser.load("english.model")
        # tp = TransitionParser.load('korean.model')
        # tp = TransitionParser.load('danish.model')