def main2():  # run this
    """Score several treebank set-ups on 150 random sentences of the M2 file.

    Reads the preprocessed CoNLL-14 M2 file, shuffles its sentences, keeps
    the first 150, parses them with ``dts.tbankparser`` to obtain the
    targets, and then calls ``main`` with first argument 0, 1, 4 and 5 on
    that same (inputs, targets) sample.
    """
    filename = '../release3.2/data/conll14st-preprocessed.m2'
    # Single-argument print call: same output as the print statement and
    # also valid when print is a function.
    print("Load data from %s" % filename)
    # The context manager closes the file even if reading raises.
    with open(filename, 'r') as f:
        # Records are separated by an empty line.
        data_raw = [p.split('\n') for p in f.read().split('\n\n')]
    # The last record is dropped, exactly as before (presumably the empty
    # piece left by the file's trailing blank line -- TODO confirm).
    # Per record: the first line minus its two leading characters
    # (presumably the "S " marker) and the remaining lines split on '|||'.
    sentence_tuples = [
        (sentence[0][2:],
         [tuple(errors.split('|||')) for errors in sentence[1:]])
        for sentence in data_raw[:-1]
    ]
    random.shuffle(sentence_tuples)
    sents = sentence_tuples[:150]  # the random sample used by every run
    tbank_s = dts.tbankparser()
    targets = [tbank_s.parse(t[0]) for t in sents]
    inputs = [t[0] for t in sents]
    print("main 0")
    main(0, None, (inputs, targets))
    print("main 1")
    main(1, None, (inputs, targets))
    print("main 4")
    main(4, None, (inputs, targets))
    # Kept from the original: the default encoding is switched to UTF-8
    # only before the last run.
    reload(sys)
    sys.setdefaultencoding('utf8')
    print("main 5")
    main(5, None, (inputs, targets))
def _read_m2_sentences(path):
    """Return ``(sentence, annotations)`` tuples read from the M2 file `path`."""
    # The context manager guarantees the handle is released on any error.
    with open(path, 'r') as handle:
        records = handle.read().split('\n\n')
    parsed = []
    # The final record is skipped, as the original code did.
    for record in records[:-1]:
        lines = record.split('\n')
        annotations = [tuple(line.split('|||')) for line in lines[1:]]
        # The first line loses its two leading characters (presumably the
        # "S " marker of the M2 format -- TODO confirm).
        parsed.append((lines[0][2:], annotations))
    return parsed


def main2():
    """Run ``main`` for treebank arguments 0, 1, 4 and 5 on one random sample.

    The sample is 150 sentences drawn (by shuffling) from the preprocessed
    M2 file; the targets are those sentences parsed by ``dts.tbankparser``.
    """
    # preprocess the data
    filename = '../release3.2/data/conll14st-preprocessed.m2'
    print("Load data from %s" % filename)
    sentence_tuples = _read_m2_sentences(filename)

    random.shuffle(sentence_tuples)

    sents = sentence_tuples[:150]  # select 150 sentences for testing
    tbank_s = dts.tbankparser()
    inputs = [text for text, _annotations in sents]
    targets = [tbank_s.parse(text) for text in inputs]

    main(0, None, (inputs, targets))

    main(1, None, (inputs, targets))

    main(4, None, (inputs, targets))
    # Kept from the original: UTF-8 becomes the default encoding only
    # before the final run.
    reload(sys)
    sys.setdefaultencoding('utf8')

    main(5, None, (inputs, targets))
Example #3
0
def main(history=1, tiny='.tiny', tbank=None):
    """Train the perceptron, save its weights and report validation results.

    Steps: load the treebank parser (unless one is passed in), build the
    training and validation sentences with ``dp.process``, report the
    ``flaws`` result on the validation set for the freshly initialised
    perceptron, train, save the weights to
    ``'weights' + str(history) + tiny + '.npy'`` and report ``flaws`` again
    with the trained weights.

    Args:
        history: forwarded to ``dp.process``, ``sp.train_perceptron`` and
            ``flaws``; must be at least 1.
        tiny: suffix inserted in the weights file name.
        tbank: treebank parser; ``dts.tbankparser()`` is created when None.

    Returns:
        The tuple ``(feature_dict, weights)``.

    Raises:
        AssertionError: if ``history`` is smaller than 1.
    """
    assert history >= 1, "use at least some history"
    started = time()
    TRAIN_FILE = '../release3.2/final_data/train-data.pre'
    VAL_FILE = '../release3.2/final_data/validate-data.pre'

    print('loading tree bank')
    if tbank is None:
        tbank = dts.tbankparser()
    # Measured after the parser is available; taking it before the load
    # (as the code used to) reported ~0 s and charged the treebank load to
    # the sentence-loading figure instead.
    tbank_secs = time() - started

    print('loading sentences')
    dp._init_(tbank)
    all_sentences, feature_dict = dp.process(TRAIN_FILE, history)
    val_sentences, _val_feat = dp.process(VAL_FILE, history)
    sentence_secs = time() - started - tbank_secs
    print("features has been made")

    print("init perceptron")
    sp._init_(len(feature_dict), dts, False)
    print("end init")

    # Baseline: validation result before any training.
    untrained_no_tags = flaws(dts, val_sentences, feature_dict, tbank,
                              history, with_tags=False)
    out(('SSE random weights, only Ne-tags', untrained_no_tags))
    print("SSE random weights, only Ne-tags")
    untrained = flaws(dts, val_sentences, feature_dict, tbank, history)
    out(('SSE random weights', untrained))
    print("SSE random weight")

    train_started = time()
    print("learning")
    weights = sp.train_perceptron(all_sentences, feature_dict, tbank, history)
    np.save('weights' + str(history) + tiny + '.npy', weights)
    # As before, the training figure includes writing the weights file.
    train_secs = time() - train_started
    print(weights.shape)
    total_secs = time() - started

    print("validating")
    trained_count = len(all_sentences)
    out(('after %d sentences, only Ne-tags' % trained_count,
         flaws(dts, val_sentences, feature_dict, tbank, history, weights,
               False)))
    out(('after %d sentences' % trained_count,
         flaws(dts, val_sentences, feature_dict, tbank, history, weights)))
    out('total %f sec (loading: %f, %f; training: %f'
        % (total_secs, tbank_secs, sentence_secs, train_secs))
    return feature_dict, weights
Example #4
0
def main(history=1, tiny='.tiny', tbank=None):
    """Run the whole process: load data, train the perceptron, validate.

    The validation result of ``flaws`` is reported through ``out`` twice
    before training (with ``with_tags=False`` and with the default) and
    twice after training with the learned weights. The weights are saved
    with ``np.save`` under ``'weights' + str(history) + tiny + '.npy'``.

    Args:
        history: forwarded to ``dp.process``, ``sp.train_perceptron`` and
            ``flaws``; must be at least 1.
        tiny: suffix used in the name of the saved weights file.
        tbank: treebank parser; created with ``dts.tbankparser()`` if None.

    Returns:
        ``(feature_dict, weights)``.

    Raises:
        AssertionError: if ``history`` is below 1.
    """
    # The message used to open with four quotes, which put a stray leading
    # '"' character into the assertion text.
    assert history >= 1, "use at least some history"
    t1 = time()

    TRAIN_FILE = '../release3.2/final_data/train-data.pre'
    VAL_FILE = '../release3.2/final_data/validate-data.pre'

    print('loading tree bank')
    if tbank is None:
        tbank = dts.tbankparser()
    # t2 is the treebank loading time, so it has to be taken after the
    # load; previously it was taken before and was always close to zero.
    t2 = time() - t1

    print('loading sentences')
    dp._init_(tbank)
    all_sentences, feature_dict = dp.process(TRAIN_FILE, history)
    val_sentences, _val_feat = dp.process(VAL_FILE, history)
    # Time spent on the sentences only (total so far minus treebank load).
    t3 = time() - t1 - t2

    print("features has been made")
    print("init perceptron")
    sp._init_(len(feature_dict), dts, False)

    print("end init")
    out(('SSE random weights, only Ne-tags',
         flaws(dts,
               val_sentences,
               feature_dict,
               tbank,
               history,
               with_tags=False)))
    print("SSE random weights, only Ne-tags")
    out(('SSE random weights',
         flaws(dts, val_sentences, feature_dict, tbank, history)))

    print("SSE random weight")
    t4 = time()
    print("learning")
    weights = sp.train_perceptron(all_sentences, feature_dict, tbank, history)
    np.save('weights' + str(history) + tiny + '.npy', weights)
    # Training time, including the save above (unchanged behaviour).
    t4 = time() - t4
    print(weights.shape)

    # From here on t1 holds the total elapsed time.
    t1 = time() - t1
    print("validating")
    out(('after %d sentences, only Ne-tags' % (len(all_sentences)),
         flaws(dts, val_sentences, feature_dict, tbank, history, weights,
               False)))
    out(('after %d sentences' % (len(all_sentences)),
         flaws(dts, val_sentences, feature_dict, tbank, history, weights)))
    out(('total %f sec (loading: %f, %f; training: %f' % (t1, t2, t3, t4)))
    return feature_dict, weights
		sentence.insert(0,parw)
		sentence.insert(0,parw)
		sentence.insert(0,parw)
		return sentence
	else:
		parw = parent.orth_
		current_word = parent
		sentence.insert(0,parw)
		return recursive_tree_climb(current_word, sentence)

if __name__ == '__main__':
	
	print 'start'
	TRAIN_FILE = 'test_data/test_linear.txt' #'../release3.2/data/test.txt'
	all_sentences, feature_dict = dp.process(TRAIN_FILE,1)
	tbank = dts.tbankparser()
	text_file = open("preprocessed-4gram-sentences2.txt", "w")
	print "start looping through sentece"
	
	for sentence in all_sentences:
		try:
			seen_mistakes = []
			parsed_sentence = tbank.parse(sentence.raw_sentence)
			context_tags = [word_tag[1] for word_tag in sentence.words_tags]
			for i in range(0,len(sentence.raw_sentence.split(' '))):
				if context_tags[i] != "Ne":
					cur = parsed_sentence[i]
					sentence_array = []
					sentence_array.insert(0,cur.orth_)
					result = recursive_tree_climb(cur, sentence_array)
					four_gram = result[len(result)-4:]
# Display name per treebank type. The name is also the prefix of the saved
# score file, so the historical "ntlk" spellings are kept verbatim.
_TBANK_NAMES = {
    0: "spacy",
    1: "ntlk no noise",
    2: "nltk random noise",
    3: "ntlk flaws noise",
    4: "nltk only random noise",
    5: "ntlk only flaws noise",
}

# The two flag arguments handed to add_noise() for the noisy treebank types.
_TBANK_NOISE_FLAGS = {
    2: (True, False),
    3: (True, True),
    4: (False, False),
    5: (False, True),
}


def _load_tbank(kind, train_size, noise_size):
    """Build the treebank parser for an already validated `kind` (0-5)."""
    if kind == 0:
        return dts.tbankparser()
    parser = dto.tbankparser()
    if kind == 1:
        parser.getParser(train_size)
        return parser
    parser.truncate(train_size)
    parser.add_noise(noise_size, *_TBANK_NOISE_FLAGS[kind])
    parser.getParser()
    return parser


def main(xin=0, tbank=None, train_test=None):
    """Load a given treebank, score its accuracy and time the run.

    Args:
        xin: treebank type (0-5, see ``_TBANK_NAMES``); only used when
            `tbank` is None.
        tbank: ready-made treebank parser; when given it is scored as-is
            under the name "user".
        train_test: optional ``(inputs, targets)`` pair to score on. When
            None, sentences X:Z of ``nltk.corpus.dependency_treebank`` are
            used.

    Returns:
        The result of ``score``; it is also written with ``np.save`` to
        ``name + str(time()) + 'data.npy'``.

    Raises:
        ValueError: if `tbank` is None and `xin` is not a known type.
            Previously this case only failed later with a NameError, after
            ``score`` had been called with no treebank.
    """
    # Validate before doing any work so a bad type fails fast and clearly.
    if tbank is None and xin not in _TBANK_NAMES:
        raise ValueError("unknown treebank type: %r" % (xin,))

    # X is the amount of train-trees for nltk-based tbank
    # Y is the amount of added flaws to the nltk-based tbank
    # slice X:Z are the sentences tested on
    X, Y, Z = 3750, 15000, 3900

    out('making targets')
    tt = time()
    if train_test is None:
        data = nltk.corpus.dependency_treebank
        testing_targets = [t.tree() for t in data.parsed_sents()[X:Z]]
        testing_inputs = data.sents()[X:Z]
    else:
        testing_inputs, testing_targets = train_test[0], train_test[1]
    tt = time() - tt
    out('in', tt, 'sec')

    out("loading tbank")
    tl = time()
    if tbank is None:
        name = _TBANK_NAMES[xin]
        tbank = _load_tbank(xin, X, Y)
    else:
        # A caller-supplied treebank needs no loading.
        name = "user"
    tl = time() - tl

    out("scoring...")
    ts = time()
    s = score(tbank, testing_inputs, testing_targets)
    ts = time() - ts

    out("%s loaded in %f sec. Scored %f on %d targets in %f sec."
        % (name, tl, s.sum(), len(testing_targets), ts))
    np.save(name + str(time()) + 'data.npy', s)
    return s
def main(xin=0, tbank=None, train_test=None):
    """Load a given treebank, score its accuracy and time the run.

    Args:
        xin: treebank type, one of 0-5 (see the branches below); ignored
            when `tbank` is passed.
        tbank: optional ready-made treebank parser, reported as "user".
        train_test: optional ``(inputs, targets)`` pair; when None the
            sentences X:Z of ``nltk.corpus.dependency_treebank`` are used.

    Returns:
        Whatever ``score`` returns; the same object is saved with
        ``np.save`` under ``name + str(time()) + 'data.npy'``.

    Raises:
        ValueError: if no `tbank` is given and `xin` is not in 0-5. Without
            this check the run ended in a NameError on ``name`` only after
            scoring had been attempted with ``tbank=None``.
    """
    # x is the type of treebank
    if tbank is None:
        if xin not in (0, 1, 2, 3, 4, 5):
            raise ValueError("unknown treebank type: %r" % (xin,))
        x = xin
    else:
        x = -1
        name = "user"
    # X is the amount of train-trees for nltk-based tbank
    # Y is the amount of added flaws to the nltk-based tbank
    # slice X:Z are the sentences tested on
    X, Y, Z = 3750, 15000, 3900

    out('making targets')
    tt = time()
    if train_test is None:
        data = nltk.corpus.dependency_treebank
        testing_targets = [t.tree() for t in data.parsed_sents()[X:Z]]
        testing_inputs = data.sents()[X:Z]
    else:
        testing_targets = train_test[1]
        testing_inputs = train_test[0]
    tt = time() - tt
    out('in', tt, 'sec')

    out("loading tbank")

    tl = time()

    # The names double as the prefix of the saved file, so their spelling
    # ("ntlk" included) is intentionally left untouched.
    if x == 0:
        name = "spacy"
        tbank = dts.tbankparser()
    elif x == 1:
        name = "ntlk no noise"
        tbank = dto.tbankparser()
        tbank.getParser(X)
    elif x == 2:
        name = "nltk random noise"
        tbank = dto.tbankparser()
        tbank.truncate(X)
        tbank.add_noise(Y, True, False)
        tbank.getParser()
    elif x == 3:
        name = "ntlk flaws noise"
        tbank = dto.tbankparser()
        tbank.truncate(X)
        tbank.add_noise(Y, True, True)
        tbank.getParser()
    elif x == 4:
        name = "nltk only random noise"
        tbank = dto.tbankparser()
        tbank.truncate(X)
        tbank.add_noise(Y, False, False)
        tbank.getParser()
    elif x == 5:
        name = "ntlk only flaws noise"
        tbank = dto.tbankparser()
        tbank.truncate(X)
        tbank.add_noise(Y, False, True)
        tbank.getParser()
    # x == -1 (caller-supplied tbank) needs no loading at all.
    tl = time() - tl

    out("scoring...")
    ts = time()
    s = score(tbank, testing_inputs, testing_targets)
    ts = time() - ts

    out("%s loaded in %f sec. Scored %f on %d targets in %f sec." %
        (name, tl, s.sum(), len(testing_targets), ts))
    np.save(name + str(time()) + 'data.npy', s)
    return s
        sentence.insert(0, parw)
        sentence.insert(0, parw)
        return sentence
    else:
        parw = parent.orth_
        current_word = parent
        sentence.insert(0, parw)
        return recursive_tree_climb(current_word, sentence)


if __name__ == '__main__':

    print 'start'
    TRAIN_FILE = 'test_data/test_linear.txt'  #'../release3.2/data/test.txt'
    all_sentences, feature_dict = dp.process(TRAIN_FILE, 1)
    tbank = dts.tbankparser()
    text_file = open("preprocessed-4gram-sentences2.txt", "w")
    print "start looping through sentece"

    for sentence in all_sentences:
        try:
            seen_mistakes = []
            parsed_sentence = tbank.parse(sentence.raw_sentence)
            context_tags = [word_tag[1] for word_tag in sentence.words_tags]
            for i in range(0, len(sentence.raw_sentence.split(' '))):
                if context_tags[i] != "Ne":
                    cur = parsed_sentence[i]
                    sentence_array = []
                    sentence_array.insert(0, cur.orth_)
                    result = recursive_tree_climb(cur, sentence_array)
                    four_gram = result[len(result) - 4:]
Example #9
0

if __name__ == '__main__':
    # Script entry point: build the n-gram dictionaries from the
    # preparation file, then run the corrector that matches parse_type.

    use_spacy = False
    parse_type = "linear"  # the alternative is "dep" (uses correct_dep)
    filename_prep = "test.txt"
    quatrogram_dict, trigram_dict, bigram_dict, unigram_dict = prepare(
        parse_type, filename_prep)

    # The spacy-backed parser is only set up when use_spacy is switched on.
    if use_spacy:
        print("Starting with the spacy stuff..")
        from spacy.en import LOCAL_DATA_DIR, English
        tbank = dt.tbankparser()

    print("Finding corrections...")
    # Pick the corrector and its input file once, then make a single call.
    if parse_type == "dep":
        filename = "test_correction.txt"
        corrector = correct_dep
    else:
        filename = 'test_linear.txt'
        corrector = correct

    corrector_args = [quatrogram_dict, trigram_dict, bigram_dict,
                      unigram_dict, filename]
    if use_spacy:
        # The spacy pipeline is passed as an extra trailing argument.
        corrector_args.append(tbank.nlp)
    corrector(*corrector_args)
Example #10
0
    return quatrogram_dict, trigram_dict, bigram_dict, unigram_dict


if __name__ == '__main__':

    use_spacy = False
    parse_type = "linear"
    filename_prep = "test_data/test.txt"
    quatrogram_dict, trigram_dict, bigram_dict, unigram_dict = prepare(
        parse_type, filename_prep)

    # comment these three lines out if you don't want spacy, uncomment if you do
    if use_spacy:
        print "Starting with the spacy stuff.."
        from spacy.en import LOCAL_DATA_DIR, English
        tbank = dt.tbankparser()

    print "Finding corrections..."
    if parse_type == "dep":
        filename = "test_data/test_correction.txt"
        if use_spacy:
            correct_dep(quatrogram_dict, trigram_dict, bigram_dict,
                        unigram_dict, filename, tbank.nlp)
        else:
            correct_dep(quatrogram_dict, trigram_dict, bigram_dict,
                        unigram_dict, filename)
    else:
        filename = 'test_data/test_linear.txt'
        if use_spacy:
            correct(quatrogram_dict, trigram_dict, bigram_dict, unigram_dict,
                    filename, tbank.nlp)