Python tbankparser Examples, depTree.tbankparser Python Examples

Example #1

0

Show file

File: depTreeCompare.py Project: Tomaat/grammarCorrector

def main2():  # run this
    """Score several treebank parsers on 150 random CoNLL-14 sentences.

    Reads the preprocessed ``.m2`` file, shuffles its sentence blocks, parses
    the first 150 sentences with ``dts.tbankparser`` to obtain the target
    trees, and then calls ``main`` with treebank types 0, 1, 4 and 5 on the
    same ``(inputs, targets)`` pair.
    """
    filename = '../release3.2/data/conll14st-preprocessed.m2'
    print("Load data from %s" % filename)
    # The context manager closes the file even when reading fails.
    with open(filename, 'r') as f:
        blocks = f.read().split('\n\n')
    # Blocks are separated by blank lines.  The first line of a block holds
    # the sentence (its first two characters are dropped -- presumably the
    # M2 "S " marker, TODO confirm); every further line is split on '|||'.
    # The last block is discarded, exactly as the original slice did.
    sentence_tuples = [
        (lines[0][2:], [tuple(errors.split('|||')) for errors in lines[1:]])
        for lines in (block.split('\n') for block in blocks[:-1])
    ]
    random.shuffle(sentence_tuples)
    sents = sentence_tuples[:150]  # only a sample of 150 sentences is scored
    tbank_s = dts.tbankparser()
    targets = [tbank_s.parse(t[0]) for t in sents]
    inputs = [t[0] for t in sents]
    print("main 0")
    main(0, None, (inputs, targets))
    print("main 1")
    main(1, None, (inputs, targets))
    print("main 4")
    main(4, None, (inputs, targets))
    # Python 2 idiom: reload(sys) makes setdefaultencoding reachable again so
    # the default encoding can be switched to UTF-8 before the last run.
    reload(sys)
    sys.setdefaultencoding('utf8')
    print("main 5")
    main(5, None, (inputs, targets))

Example #2

0

Show file

File: depTreeCompare.py Project: Tomaat/grammarCorrector

def main2(): 
	"""Score several treebank parsers on 150 random CoNLL-14 sentences.

	Reads the preprocessed ``.m2`` file, shuffles its sentence blocks, parses
	the first 150 sentences with ``dts.tbankparser`` to obtain the target
	trees, and then calls ``main`` with treebank types 0, 1, 4 and 5 on the
	same ``(inputs, targets)`` pair.
	"""
	# preprocess the data
	filename = '../release3.2/data/conll14st-preprocessed.m2'
	print("Load data from %s" % filename)
	# The context manager closes the file even when reading fails.
	with open(filename, 'r') as f:
		blocks = f.read().split('\n\n')
	# Blocks are separated by blank lines.  The first line of a block holds
	# the sentence (its first two characters are dropped -- presumably the
	# M2 "S " marker, TODO confirm); every further line is split on '|||'.
	# The last block is discarded, exactly as the original slice did.
	sentence_tuples = [
		(lines[0][2:], [tuple(errors.split('|||')) for errors in lines[1:]])
		for lines in (block.split('\n') for block in blocks[:-1])
	]

	random.shuffle(sentence_tuples)

	sents = sentence_tuples[:150]  # select 150 sentences for testing
	tbank_s = dts.tbankparser()
	targets = [tbank_s.parse(t[0]) for t in sents]
	inputs = [t[0] for t in sents]

	main(0, None, (inputs, targets))

	main(1, None, (inputs, targets))

	main(4, None, (inputs, targets))
	# Python 2 idiom: reload(sys) makes setdefaultencoding reachable again so
	# the default encoding can be switched to UTF-8 before the last run.
	reload(sys)
	sys.setdefaultencoding('utf8')

	main(5, None, (inputs, targets))

Example #3

0

Show file

File: pipeline.py Project: Tomaat/grammarCorrector

def main(history=1,tiny='.tiny',tbank = None):
	"""Run the whole pipeline: load data, train the perceptron, validate.

	Args:
		history: value forwarded to ``dp.process``, ``flaws`` and
			``sp.train_perceptron``; must be at least 1.
		tiny: suffix inserted into the name of the saved weights file
			(``'weights' + str(history) + tiny + '.npy'``).
		tbank: parser to use; when None a ``dts.tbankparser()`` is created.

	Returns:
		The tuple ``(feature_dict, weights)``.
	"""
	assert history >= 1, "use at least some history"
	t1 = time()
	TRAIN_FILE = '../release3.2/final_data/train-data.pre'
	VAL_FILE =   '../release3.2/final_data/validate-data.pre'
	print('loading tree bank')
	if tbank is None:
		tbank = dts.tbankparser()
	# Sample t2 only after the treebank exists; sampling it before (as the
	# code used to) reported ~0 sec and booked the load under t3.
	t2 = time()-t1
	print('loading sentences')
	dp._init_(tbank)
	all_sentences, feature_dict = dp.process(TRAIN_FILE,history)
	val_sentences, _val_feat = dp.process(VAL_FILE,history)
	t3 = time()-t1-t2  # time spent processing the two sentence files
	print("features has been made")
	print("init perceptron")
	sp._init_(len(feature_dict),dts, False)
	print("end init")
	# Baseline results before training (no weights passed to flaws).
	out( ('SSE random weights, only Ne-tags',flaws(dts,val_sentences,feature_dict,tbank,history,with_tags=False)) )
	print("SSE random weights, only Ne-tags")
	out( ( 'SSE random weights',flaws(dts,val_sentences,feature_dict,tbank,history) ) )
	print("SSE random weight")
	t4 = time()
	print("learning")
	weights = sp.train_perceptron(all_sentences, feature_dict, tbank, history)
	np.save('weights'+str(history)+tiny+'.npy',weights)
	t4 = time()-t4  # training time, including saving the weights
	print(weights.shape)
	t1 = time()-t1  # total so far; the validation below is not included
	print("validating")
	out( ( 'after %d sentences, only Ne-tags'%(len(all_sentences)), flaws(dts, val_sentences,feature_dict,tbank,history,weights,False) ) )
	out( ( 'after %d sentences'%(len(all_sentences)), flaws(dts, val_sentences,feature_dict,tbank,history,weights) ) )
	# The closing parenthesis was missing from this summary line.
	out( ( 'total %f sec (loading: %f, %f; training: %f)'%(t1,t2,t3,t4) ) )
	return feature_dict,weights

Example #4

0

Show file

File: pipeline.py Project: Tomaat/grammarCorrector

def main(history=1, tiny='.tiny', tbank=None):
    """Run the whole pipeline: load data, train the perceptron, validate.

    Args:
        history: value forwarded to ``dp.process``, ``flaws`` and
            ``sp.train_perceptron``; must be at least 1.
        tiny: suffix inserted into the name of the saved weights file
            (``'weights' + str(history) + tiny + '.npy'``).
        tbank: parser to use; when None a ``dts.tbankparser()`` is created.

    Returns:
        The tuple ``(feature_dict, weights)``.
    """
    # The message used to start with four quotes, which left a stray '"'
    # at the front of the assertion text.
    assert history >= 1, "use at least some history"
    t1 = time()

    TRAIN_FILE = '../release3.2/final_data/train-data.pre'
    VAL_FILE = '../release3.2/final_data/validate-data.pre'

    print('loading tree bank')
    if tbank is None:
        tbank = dts.tbankparser()
    # Sample t2 only after the treebank exists; sampling it before (as the
    # code used to) reported ~0 sec and booked the load under t3.
    t2 = time() - t1
    print('loading sentences')

    dp._init_(tbank)
    all_sentences, feature_dict = dp.process(TRAIN_FILE, history)
    val_sentences, _val_feat = dp.process(VAL_FILE, history)
    t3 = time() - t1 - t2  # time spent processing the two sentence files

    print("features has been made")
    print("init perceptron")
    sp._init_(len(feature_dict), dts, False)

    print("end init")
    # Baseline results before training (no weights passed to flaws).
    out(('SSE random weights, only Ne-tags',
         flaws(dts,
               val_sentences,
               feature_dict,
               tbank,
               history,
               with_tags=False)))
    print("SSE random weights, only Ne-tags")
    out(('SSE random weights',
         flaws(dts, val_sentences, feature_dict, tbank, history)))

    print("SSE random weight")
    t4 = time()
    print("learning")
    weights = sp.train_perceptron(all_sentences, feature_dict, tbank, history)
    np.save('weights' + str(history) + tiny + '.npy', weights)
    t4 = time() - t4  # training time, including saving the weights
    print(weights.shape)

    t1 = time() - t1  # total so far; the validation below is not included
    print("validating")
    out(('after %d sentences, only Ne-tags' % (len(all_sentences)),
         flaws(dts, val_sentences, feature_dict, tbank, history, weights,
               False)))
    out(('after %d sentences' % (len(all_sentences)),
         flaws(dts, val_sentences, feature_dict, tbank, history, weights)))
    # The closing parenthesis was missing from this summary line.
    out(('total %f sec (loading: %f, %f; training: %f)' % (t1, t2, t3, t4)))
    return feature_dict, weights

Example #5

0

Show file

File: preprocessNgrams.py Project: Tomaat/grammarCorrector

		sentence.insert(0,parw)
		sentence.insert(0,parw)
		sentence.insert(0,parw)
		return sentence
	else:
		parw = parent.orth_
		current_word = parent
		sentence.insert(0,parw)
		return recursive_tree_climb(current_word, sentence)

if __name__ == '__main__':
	
	print 'start'
	TRAIN_FILE = 'test_data/test_linear.txt' #'../release3.2/data/test.txt'
	all_sentences, feature_dict = dp.process(TRAIN_FILE,1)
	tbank = dts.tbankparser()
	text_file = open("preprocessed-4gram-sentences2.txt", "w")
	print "start looping through sentece"
	
	for sentence in all_sentences:
		try:
			seen_mistakes = []
			parsed_sentence = tbank.parse(sentence.raw_sentence)
			context_tags = [word_tag[1] for word_tag in sentence.words_tags]
			for i in range(0,len(sentence.raw_sentence.split(' '))):
				if context_tags[i] != "Ne":
					cur = parsed_sentence[i]
					sentence_array = []
					sentence_array.insert(0,cur.orth_)
					result = recursive_tree_climb(cur, sentence_array)
					four_gram = result[len(result)-4:]

Example #6

0

Show file

File: depTreeCompare.py Project: Tomaat/grammarCorrector

def main(xin=0, tbank=None, train_test=None):
	"""Load (or accept) a treebank parser, score it and time the run.

	Args:
		xin: treebank type, used only when ``tbank`` is None.  0 builds a
			``dts.tbankparser``; 1 builds a ``dto.tbankparser`` without
			noise; 2-5 build a ``dto.tbankparser`` with one of four
			``add_noise`` flag combinations.
		tbank: ready-made parser; when given, ``xin`` is ignored and the run
			is labelled "user".
		train_test: optional ``(inputs, targets)`` pair.  When None,
			sentences ``X:Z`` of ``nltk.corpus.dependency_treebank`` are used.

	Returns:
		The result of ``score(tbank, inputs, targets)``, which is also
		written to a ``.npy`` file named after the run and the current time.

	Raises:
		ValueError: if ``tbank`` is None and ``xin`` is not one of 0..5.
	"""
	# Label and the 2nd/3rd add_noise arguments for the noisy variants.
	# The labels (including the "ntlk" spelling) end up in the saved file
	# name, so they are kept exactly as they were.
	noisy_variants = {
		2: ("nltk random noise", True, False),
		3: ("ntlk flaws noise", True, True),
		4: ("nltk only random noise", False, False),
		5: ("ntlk only flaws noise", False, True),
	}
	# x is the type of treebank
	if tbank is None:
		x = xin
		# Fail early: an unknown type used to leave both `name` and `tbank`
		# unset and only blew up later with an unrelated error.
		if x not in (0, 1, 2, 3, 4, 5):
			raise ValueError("unknown treebank type: %r" % (xin,))
	else:
		x = -1
		name = "user"
	# X is the amount of train-trees for nltk-based tbank
	# Y is the amount of added flaws to the nltk-based tbank
	# slice X:Z are the sentences tested on
	X,Y,Z = 3750,15000,3900

	out( 'making targets')
	tt = time()
	if train_test is None:
		data = nltk.corpus.dependency_treebank
		testing_targets = [t.tree() for t in data.parsed_sents()[X:Z]]
		testing_inputs = data.sents()[X:Z]
	else:
		testing_targets = train_test[1]
		testing_inputs = train_test[0]
	tt = time()-tt
	out( 'in',tt,'sec')

	out( "loading tbank")

	tl = time()

	if x == 0:
		name = "spacy"
		tbank = dts.tbankparser()
	elif x == 1:
		name = "ntlk no noise"
		tbank = dto.tbankparser()
		tbank.getParser(X)
	elif x in noisy_variants:
		name, noise_arg2, noise_arg3 = noisy_variants[x]
		tbank = dto.tbankparser()
		tbank.truncate(X)
		tbank.add_noise(Y,noise_arg2,noise_arg3)
		tbank.getParser()
	tl = time()-tl

	out( "scoring...")
	ts = time()
	s = score(tbank,testing_inputs,testing_targets)
	ts = time()-ts

	out("%s loaded in %f sec. Scored %f on %d targets in %f sec."%(name,tl,s.sum(),len(testing_targets),ts))
	np.save(name+str(time())+'data.npy',s)
	return s

Example #7

0

Show file

File: depTreeCompare.py Project: Tomaat/grammarCorrector

def main(xin=0, tbank=None, train_test=None):
    """Load (or accept) a treebank parser, score it and time the run.

    Args:
        xin: treebank type, used only when ``tbank`` is None.  0 builds a
            ``dts.tbankparser``; 1 builds a ``dto.tbankparser`` without
            noise; 2-5 build a ``dto.tbankparser`` with one of four
            ``add_noise`` flag combinations.
        tbank: ready-made parser; when given, ``xin`` is ignored and the run
            is labelled "user".
        train_test: optional ``(inputs, targets)`` pair.  When None,
            sentences ``X:Z`` of ``nltk.corpus.dependency_treebank`` are used.

    Returns:
        The result of ``score(tbank, inputs, targets)``, which is also
        written to a ``.npy`` file named after the run and the current time.

    Raises:
        ValueError: if ``tbank`` is None and ``xin`` is not one of 0..5.
    """
    # Label and the 2nd/3rd add_noise arguments for the noisy variants.
    # The labels (including the "ntlk" spelling) end up in the saved file
    # name, so they are kept exactly as they were.
    noisy_variants = {
        2: ("nltk random noise", True, False),
        3: ("ntlk flaws noise", True, True),
        4: ("nltk only random noise", False, False),
        5: ("ntlk only flaws noise", False, True),
    }
    # x is the type of treebank
    if tbank is None:
        x = xin
        # Fail early: an unknown type used to leave both `name` and `tbank`
        # unset and only blew up later with an unrelated error.
        if x not in (0, 1, 2, 3, 4, 5):
            raise ValueError("unknown treebank type: %r" % (xin,))
    else:
        x = -1
        name = "user"
    # X is the amount of train-trees for nltk-based tbank
    # Y is the amount of added flaws to the nltk-based tbank
    # slice X:Z are the sentences tested on
    X, Y, Z = 3750, 15000, 3900

    out('making targets')
    tt = time()
    if train_test is None:
        data = nltk.corpus.dependency_treebank
        testing_targets = [t.tree() for t in data.parsed_sents()[X:Z]]
        testing_inputs = data.sents()[X:Z]
    else:
        testing_targets = train_test[1]
        testing_inputs = train_test[0]
    tt = time() - tt
    out('in', tt, 'sec')

    out("loading tbank")

    tl = time()

    if x == 0:
        name = "spacy"
        tbank = dts.tbankparser()
    elif x == 1:
        name = "ntlk no noise"
        tbank = dto.tbankparser()
        tbank.getParser(X)
    elif x in noisy_variants:
        name, noise_arg2, noise_arg3 = noisy_variants[x]
        tbank = dto.tbankparser()
        tbank.truncate(X)
        tbank.add_noise(Y, noise_arg2, noise_arg3)
        tbank.getParser()
    tl = time() - tl

    out("scoring...")
    ts = time()
    s = score(tbank, testing_inputs, testing_targets)
    ts = time() - ts

    out("%s loaded in %f sec. Scored %f on %d targets in %f sec." %
        (name, tl, s.sum(), len(testing_targets), ts))
    np.save(name + str(time()) + 'data.npy', s)
    return s

Example #8

0

Show file

File: preprocessNgrams.py Project: Tomaat/grammarCorrector

        sentence.insert(0, parw)
        sentence.insert(0, parw)
        return sentence
    else:
        parw = parent.orth_
        current_word = parent
        sentence.insert(0, parw)
        return recursive_tree_climb(current_word, sentence)


if __name__ == '__main__':

    print 'start'
    TRAIN_FILE = 'test_data/test_linear.txt'  #'../release3.2/data/test.txt'
    all_sentences, feature_dict = dp.process(TRAIN_FILE, 1)
    tbank = dts.tbankparser()
    text_file = open("preprocessed-4gram-sentences2.txt", "w")
    print "start looping through sentece"

    for sentence in all_sentences:
        try:
            seen_mistakes = []
            parsed_sentence = tbank.parse(sentence.raw_sentence)
            context_tags = [word_tag[1] for word_tag in sentence.words_tags]
            for i in range(0, len(sentence.raw_sentence.split(' '))):
                if context_tags[i] != "Ne":
                    cur = parsed_sentence[i]
                    sentence_array = []
                    sentence_array.insert(0, cur.orth_)
                    result = recursive_tree_climb(cur, sentence_array)
                    four_gram = result[len(result) - 4:]

Example #9

0

Show file

File: correction.py Project: Tomaat/grammarCorrector


if __name__ == '__main__':
	# Script entry point: build the n-gram tables with prepare(), then run
	# the corrector that matches parse_type on the corresponding test file.

	use_spacy = False
	parse_type = "linear"  # alternative: "dep"
	filename_prep = "test.txt"  # alternative: "preprocessed-BrownCorpus.txt"
	quatrogram_dict, trigram_dict, bigram_dict, unigram_dict = prepare(parse_type, filename_prep)

	# The spacy-backed parser is only created when use_spacy is switched on.
	if use_spacy:
		print("Starting with the spacy stuff..")
		from spacy.en import LOCAL_DATA_DIR, English
		tbank = dt.tbankparser()

	print("Finding corrections...")
	# Pick the corrector and its input file from the parse type.
	# (alternative "dep" input: "preprocessed-4gram-sentences.txt")
	if parse_type == "dep":
		corrector, filename = correct_dep, "test_correction.txt"
	else:
		corrector, filename = correct, 'test_linear.txt'

	ngram_tables = (quatrogram_dict, trigram_dict, bigram_dict, unigram_dict)
	# The spacy pipeline (tbank.nlp) is passed as an extra trailing argument.
	if use_spacy:
		corrector(*(ngram_tables + (filename, tbank.nlp)))
	else:
		corrector(*(ngram_tables + (filename,)))

Example #10

0

Show file

    return quatrogram_dict, trigram_dict, bigram_dict, unigram_dict


if __name__ == '__main__':

    use_spacy = False
    parse_type = "linear"
    filename_prep = "test_data/test.txt"
    quatrogram_dict, trigram_dict, bigram_dict, unigram_dict = prepare(
        parse_type, filename_prep)

    # comment these three lines out if you don't want spacy, uncomment if you do
    if use_spacy:
        print "Starting with the spacy stuff.."
        from spacy.en import LOCAL_DATA_DIR, English
        tbank = dt.tbankparser()

    print "Finding corrections..."
    if parse_type == "dep":
        filename = "test_data/test_correction.txt"
        if use_spacy:
            correct_dep(quatrogram_dict, trigram_dict, bigram_dict,
                        unigram_dict, filename, tbank.nlp)
        else:
            correct_dep(quatrogram_dict, trigram_dict, bigram_dict,
                        unigram_dict, filename)
    else:
        filename = 'test_data/test_linear.txt'
        if use_spacy:
            correct(quatrogram_dict, trigram_dict, bigram_dict, unigram_dict,
                    filename, tbank.nlp)