Example #1
def multi_once(sentence_tuple):
    # build a Sentence object; on failure, log the offending tuple and return None
    ans = None
    try:
        ans = Sentence(sentence_tuple)
    except Exception:
        pipeline.log('init_mul', sentence_tuple)
    return ans
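A minimal usage sketch, assuming Sentence and pipeline come from this project and that a sentence tuple pairs a raw sentence with its '|||'-split annotation tuples (the shape process() builds below). The annotation fields here are made up for illustration:

sentence_tuple = ('He go to school .',
                  [('2', '3', 'Vform', 'goes')])  # hypothetical annotation
sent = multi_once(sentence_tuple)
if sent is not None:
    print 'parsed:', sent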
Example #3
def process(filename, history):
	"""
	Make a Sentence object for every sentence in the dataset and a feature dict.
	rtype: list of Sentence objects and a feature dictionary
	"""
	reload(sys)
	sys.setdefaultencoding('utf8')
	processed_sentences = []
	with open(filename) as datafile:
		data_lines = datafile.readlines()
		data_raw = [p.split('\n') for p in ''.join(data_lines).replace('\r', '').split('\n\n')]
		sentence_tuples = [(sentence[0], [tuple(errors.split('|||')) for errors in sentence[1:]]) for sentence in data_raw]

	print "parsing sentences"
	for sentence_tuple in sentence_tuples:
		if len(sentence_tuple[0]) < 1:
			continue
		try:
			processed_sentences.append(Sentence(sentence_tuple))
		except Exception:
			pipeline.log('init', sentence_tuple)
	print "make feature vectors"
	feature_dictionary = makeFeatureDict(processed_sentences, history)

	return processed_sentences, feature_dictionary
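A sketch of the input layout process() expects, inferred from the parsing expressions above: blank-line-separated blocks, each holding a sentence followed by '|||'-separated annotation lines. The concrete field values are assumptions, and an "if e" guard is added here to skip the trailing empty line:

demo = ('He go to school .\n'
        '2|||3|||Vform|||goes\n'
        '\n'
        'She like apples .\n'
        '1|||2|||SVA|||likes\n')
blocks = [p.split('\n') for p in demo.replace('\r', '').split('\n\n')]
tuples = [(s[0], [tuple(e.split('|||')) for e in s[1:] if e]) for s in blocks]
print tuples[0]
# ('He go to school .', [('2', '3', 'Vform', 'goes')])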
Example #4
def process(filename, history):
    reload(sys)
    sys.setdefaultencoding('utf8')  # hack for some encoding problems in the sentences
    processed_sentences = []
    with open(filename) as datafile:  # import sgml data-file
        data_lines = datafile.readlines()
        data_raw = [
            p.split('\n')
            for p in ''.join(data_lines).replace('\r', '').split('\n\n')
        ]
        sentence_tuples = [
            (sentence[0],
             [tuple(errors.split('|||')) for errors in sentence[1:]])
            for sentence in data_raw
        ]

    print "parsing sentences"
    for sentence_tuple in sentence_tuples:  # something still goes wrong with the first sentence; check whether that happens more often?
        if len(sentence_tuple[0]) < 1:
            continue
        try:
            processed_sentences.append(Sentence(sentence_tuple))
        except Exception:
            pipeline.log('init', sentence_tuple)
    print "make feature vectors"
    feature_dictionary = makeFeatureDict(processed_sentences, history)

    return processed_sentences, feature_dictionary
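The reload(sys) / sys.setdefaultencoding('utf8') hack exists only in Python 2. A sketch of the same read done with an explicit codec instead, assuming the data files are UTF-8; the rest of the function would be unchanged:

import io

def read_blocks(filename):
    # decode explicitly rather than patching the interpreter-wide default
    with io.open(filename, encoding='utf8') as datafile:
        text = datafile.read().replace('\r', '')
    return [p.split('\n') for p in text.split('\n\n')]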
Example #7
def train_perceptron(all_sentences, feature_dict, tbank, history):
    weight_matrix = init_weights(len(feature_dict))
    pre_pros = []
    t1 = time()
    current_sen = 1
    for sentence in all_sentences:
        print "train sentence: " + str(current_sen)
        current_sen += 1
        try:
            parsed_tree = tbank.parse(sentence.raw_sentence)
            # loop over all sentences so the weights get updated for each one
            # ==== comment 0
            # This is how to walk the tree in the right way (it depends on the
            # global boolean golinear and on the iterator function iterloop,
            # which comes from depTree). There is probably a cleaner way to fit
            # this into the other code than copy-pasting, but that is an option.
            histories = []
            target_feature_vectors = []
            if golinear:
                context_words = [w.orth_ for w in iterloop(parsed_tree)]
                context_pos_tags = [w.tag_ for w in iterloop(parsed_tree)]
                context_tags = [
                    sentence.words_tags[dt.sen_idx(sentence.raw_sentence,
                                                   wrd)][1]
                    for wrd in iterloop(parsed_tree)
                ]
                for i, wrd in enumerate(context_words):
                    if i < history:
                        history_tags = tuple(['-TAGSTART-'] +
                                             context_tags[0:i])
                        history_words = ['-START-'] + context_words[0:i]
                        history_pos_tags = ['-POSTAGSTART-'
                                            ] + context_pos_tags[0:i]
                    else:
                        history_tags = context_tags[i - history:i]
                        history_words = context_words[i - history:i]
                        history_pos_tags = context_pos_tags[i - history:i]
                    history_vectors = ('ph', [history_tags])
                    cur_idx = i
                    prev_idx = cur_idx - 1
                    distance = 0
                    if prev_idx >= 0:
                        distance = parsed_tree[cur_idx].similarity(
                            parsed_tree[prev_idx])
                    target_feature_vectors.append(
                        dp.construct_feature_vector(wrd, context_tags[i],
                                                    feature_dict,
                                                    history_words, history,
                                                    history_vectors,
                                                    history_pos_tags,
                                                    distance))
                    histories.append(
                        (prev_idx, history_words, history_pos_tags, distance))
            else:
                for i, wrd in enumerate(iterloop(parsed_tree)):
                    cur = wrd
                    history_words = []
                    history_tags = []
                    history_pos_tags = []
                    for j in range(history):
                        par = cur.head
                        if cur == par:
                            parw = '-START-'
                            idx = -1
                            tag = '-TAGSTART-'
                            pos = '-POSTAGSTART-'
                            history_tags.insert(0, tag)
                            history_words.insert(0, parw)
                            history_pos_tags.insert(0, pos)
                            break
                        else:
                            parw = par.orth_
                            idx = dt.sen_idx(sentence.raw_sentence, par)
                            tag = sentence.words_tags[idx][1]
                            pos = par.tag_
                            cur = par
                            history_tags.insert(0, tag)
                            history_words.insert(0, parw)
                            history_pos_tags.insert(0, pos)
                    history_vectors = ('ph', [history_tags])
                    cur_idx = dt.sen_idx(sentence.raw_sentence, wrd)

                    for prev_idx, w in enumerate(iterloop(parsed_tree)):
                        if w == wrd.head:
                            break
                    if wrd.head == wrd:
                        prev_idx = -1

                    distance = 0
                    if prev_idx >= 0:
                        distance = parsed_tree[cur_idx].similarity(
                            parsed_tree[prev_idx])
                    cur_tag = sentence.words_tags[cur_idx][1]  # gold tag of the current word
                    target_feature_vectors.append(
                        dp.construct_feature_vector(wrd.orth_, cur_tag,
                                                    feature_dict,
                                                    history_words, history,
                                                    history_vectors,
                                                    history_pos_tags,
                                                    distance))
                    histories.append(
                        (prev_idx, history_words, history_pos_tags, distance))
            # /==== end comment 0
            dict_target_feature_vectors = [
                v2d(target_feature_vector[0][0])
                for target_feature_vector in target_feature_vectors
            ]
            pre_pros.append(
                (parsed_tree, dict_target_feature_vectors, histories))
        except Exception as ex:
            print "error"
            pipeline.log('train', sentence)

    print 'pre_pros', time() - t1
    t2 = time()
    print len(pre_pros)

    for i in range(iters):
        iter_time = time()
        print "at iter", i
        cum_weights = i * weight_matrix  # running sum of the previous average
        for parsed_tree, dict_target_feature_vectors, histories in pre_pros:
            target_feature_vectors = [
                d2v(dict_target_feature_vector)
                for dict_target_feature_vector in dict_target_feature_vectors
            ]
            weight_matrix = train_perceptron_once(parsed_tree,
                                                  target_feature_vectors,
                                                  feature_dict, history,
                                                  weight_matrix, histories)
        weight_matrix = (cum_weights + weight_matrix) / (i + 1)
        print "one iter: ", time() - iter_time
    print 'train', time() - t2
    return weight_matrix
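The closing loop keeps a running average of the weights: with avg_0 = w_0 and avg_i = (i * avg_{i-1} + w_i) / (i + 1), after n iterations avg_n is the plain mean of the per-iteration weight matrices. A small numpy sketch of just that arithmetic (init_weights above is assumed to return a numpy array; note that in the real loop each iteration also resumes training from the previous average, so the iterates are coupled):

import numpy as np

def running_average(weight_iterates):
    # weight_iterates: one weight vector per training iteration
    avg = np.zeros_like(weight_iterates[0])
    for i, w in enumerate(weight_iterates):
        avg = (i * avg + w) / (i + 1)  # same update as the loop above
    return avg

ws = [np.array([1.0, 2.0]), np.array([3.0, 4.0]), np.array([5.0, 6.0])]
assert np.allclose(running_average(ws), np.mean(ws, axis=0))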
Example #8
def makeFeatureDict(processed_sentences, history):
	feature_dictionary = {} # dict mapping feature name to feature index
	feature_dictionary['i tag+-TAGSTART-'] = 0
	index = 1
	for tag in sp.all_tags:
		feature_dictionary['i tag+'+ tag] = index
		index += 1
	for p in range(history):
		for tag in sp.all_tags:
			feature_dictionary['i-'+str(p+1)+' tag+'+ tag] = index
			index += 1


	for sentence in processed_sentences:
		try:
			if golinear:
				# ==== comment 2
				# Here the code still walks through the sentence the old way;
				# this should be done the new way (see comment 0 in structured_perceptron).
				context_words = [word_tag[0] for word_tag in sentence.words_tags]
				context_tags = [word_tag[1] for word_tag in sentence.words_tags]
				context_pos_tags = [pos_tag_tuple[1] for pos_tag_tuple in sentence.pos_tags_sentence]

				for i, tagTouple in enumerate(sentence.words_tags):
					history_words = ['-START-']+ context_words[:i]
					history_tags = ['-TAGSTART-']+ context_tags[:i]
					history_pos_tags = ['-POSTAGSTART-'] + context_pos_tags[:i]
					
					if len(history_words) > history:
						history_words = context_words[i-history:i]
						history_tags = context_tags[i-history:i]
						history_pos_tags = context_pos_tags[i-history:i]

					distance = nlp(unicode(normalize(history_words[-1:][0]))).similarity(nlp(unicode(normalize(context_words[i]))))
					features = makeFeatures(context_words[i], history_words, history_tags, history_pos_tags, distance)
					for feature in features:
						for tag in sp.all_tags:
							tagged_feature = feature + '+' + tag
							if tagged_feature not in feature_dictionary:
								feature_dictionary[tagged_feature] = index
								index += 1
			else:
				parsed_tree = nlp(unicode(sentence.raw_sentence))
				for i,wrd in enumerate(iterloop(parsed_tree)):
					cur = wrd
					history_words = []
					history_tags = []
					history_pos_tags = []
					for j in range(history):
						par = cur.head
						if cur == par:
							parw = '-START-'
							idx = -1
							tag = '-TAGSTART-'
							pos = '-POSTAGSTART-'
							history_tags.insert(0,tag)
							history_words.insert(0,parw)
							history_pos_tags.insert(0,pos)
							break
						else:
							parw = par.orth_
							idx = dt.sen_idx(sentence.raw_sentence,par)
							tag = sentence.words_tags[idx][1]
							pos = par.tag_
							cur = par
							history_tags.insert(0,tag)
							history_words.insert(0,parw)
							history_pos_tags.insert(0,pos)
					history_vectors = ('ph',[history_tags] )
					cur_idx = dt.sen_idx(sentence.raw_sentence,wrd)
					
					for prev_idx,w in enumerate(iterloop(parsed_tree)):
						if w == wrd.head:
							break
					if wrd.head == wrd:
						prev_idx = -1

					distance = 0
					if prev_idx >= 0:
						distance = parsed_tree[cur_idx].similarity(parsed_tree[prev_idx])
					cur_tag = sentence.words_tags[cur_idx][1]  # gold tag of the current word
					features = makeFeatures(wrd.orth_, history_words, history_tags, history_pos_tags, distance, cur_tag)
					# /==== end comment 2
					for feature in features:
						if feature not in feature_dictionary:
							feature_dictionary[feature] = index
							index += 1
		except Exception:
			pipeline.log('feat', sentence)

	return feature_dictionary
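The keys follow a 'position tag+VALUE' scheme: 'i tag+X' for the current tag and 'i-1 tag+Y', 'i-2 tag+Z', and so on for history tags. A sketch of the fixed part of the dictionary, with a toy tag set standing in for sp.all_tags:

all_tags = ['OK', 'Vform']  # toy stand-in for sp.all_tags

feature_dictionary = {'i tag+-TAGSTART-': 0}
index = 1
for tag in all_tags:  # one feature per possible current tag
    feature_dictionary['i tag+' + tag] = index
    index += 1
for p in range(2):  # history = 2
    for tag in all_tags:  # one feature per (history offset, tag) pair
        feature_dictionary['i-' + str(p + 1) + ' tag+' + tag] = index
        index += 1
print sorted(feature_dictionary.items(), key=lambda kv: kv[1])
# [('i tag+-TAGSTART-', 0), ('i tag+OK', 1), ('i tag+Vform', 2),
#  ('i-1 tag+OK', 3), ('i-1 tag+Vform', 4), ('i-2 tag+OK', 5), ('i-2 tag+Vform', 6)]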
Example #10
def makeFeatureDict(processed_sentences, history):
    """
		 make a dictionary with all the features found in the dataset
		 rtype: dictionary
	"""

    feature_dictionary = {
    }  # this will be a dict with key the feature name as key
    feature_dictionary['i tag+-TAGSTART-'] = 0
    index = 1
    # make a feature for every possible tag for a word
    for tag in sp.all_tags:
        feature_dictionary['i tag+' + tag] = index
        index += 1
    # make a feature for every possible history tag and its index
    for p in range(history):
        for tag in sp.all_tags:
            feature_dictionary['i-' + str(p + 1) + ' tag+' + tag] = index
            index += 1

    # make features for every word in each sentence; if parsing linearly, make different features based on that type of parsing
    for sentence in processed_sentences:
        try:
            if golinear:
                # make linear features
                context_words = [
                    word_tag[0] for word_tag in sentence.words_tags
                ]
                context_tags = [
                    word_tag[1] for word_tag in sentence.words_tags
                ]
                context_pos_tags = [
                    pos_tag_tuple[1]
                    for pos_tag_tuple in sentence.pos_tags_sentence
                ]

                for i, tagTouple in enumerate(sentence.words_tags):
                    history_words = ['-START-'] + context_words[:i]
                    history_tags = ['-TAGSTART-'] + context_tags[:i]
                    history_pos_tags = ['-POSTAGSTART-'] + context_pos_tags[:i]

                    if len(history_words) > history:
                        history_words = context_words[i - history:i]
                        history_tags = context_tags[i - history:i]
                        history_pos_tags = context_pos_tags[i - history:i]

                    distance = nlp(unicode(normalize(
                        history_words[-1:][0]))).similarity(
                            nlp(unicode(normalize(context_words[i]))))
                    features = makeFeatures(context_words[i], history_words,
                                            history_tags, history_pos_tags,
                                            distance)
                    for feature in features:
                        for tag in sp.all_tags:
                            tagged_feature = feature + '+' + tag
                            if tagged_feature not in feature_dictionary:
                                feature_dictionary[tagged_feature] = index
                                index += 1
            else:
                # dependency-wise parsing features
                parsed_tree = nlp(unicode(sentence.raw_sentence))
                for i, wrd in enumerate(iterloop(parsed_tree)):
                    cur = wrd
                    history_words = []
                    history_tags = []
                    history_pos_tags = []
                    for j in range(history):
                        par = cur.head
                        if cur == par:
                            parw = '-START-'
                            idx = -1
                            tag = '-TAGSTART-'
                            pos = '-POSTAGSTART-'
                            history_tags.insert(0, tag)
                            history_words.insert(0, parw)
                            history_pos_tags.insert(0, pos)
                            break
                        else:
                            parw = par.orth_
                            idx = dt.sen_idx(sentence.raw_sentence, par)
                            tag = sentence.words_tags[idx][1]
                            pos = par.tag_
                            cur = par
                            history_tags.insert(0, tag)
                            history_words.insert(0, parw)
                            history_pos_tags.insert(0, pos)
                    history_vectors = ('ph', [history_tags])
                    cur_idx = dt.sen_idx(sentence.raw_sentence, wrd)

                    for prev_idx, w in enumerate(iterloop(parsed_tree)):
                        if w == wrd.head:
                            break
                    if wrd.head == wrd:
                        prev_idx = -1

                    distance = 0
                    if prev_idx >= 0:
                        distance = parsed_tree[cur_idx].similarity(
                            parsed_tree[prev_idx])
                    cur_tag = sentence.words_tags[cur_idx][1]  # gold tag of the current word
                    features = makeFeatures(wrd.orth_, history_words,
                                            history_tags, history_pos_tags,
                                            distance, cur_tag)
                    for feature in features:
                        if feature not in feature_dictionary:
                            feature_dictionary[feature] = index
                            index += 1
        except Exception:
            pipeline.log('feat', sentence)

    return feature_dictionary
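The linear branch pads the history with -START- style markers until enough context exists, then falls back to a plain sliding window over the last `history` positions. The same windowing in isolation, with toy data and history = 2:

context_words = ['He', 'go', 'to', 'school', '.']
history = 2
for i, word in enumerate(context_words):
    history_words = ['-START-'] + context_words[:i]
    if len(history_words) > history:
        history_words = context_words[i - history:i]  # plain sliding window
    print i, word, history_words
# 0 He ['-START-']
# 1 go ['-START-', 'He']
# 2 to ['He', 'go']
# 3 school ['go', 'to']
# 4 . ['to', 'school']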
Example #11
def train_perceptron(all_sentences, feature_dict, tbank, history):
	weight_matrix = init_weights(len(feature_dict))
	pre_pros = []
	t1 = time()
	current_sen = 1
	for sentence in all_sentences:
		print "train sentence: "+str(current_sen)
		current_sen += 1
		try:
			parsed_tree = tbank.parse(sentence.raw_sentence)
			# loop over all sentences so the weights get updated for each one
		
			histories = []
			target_feature_vectors = []
			if golinear:
				context_words = [w.orth_ for w in iterloop(parsed_tree) ]
				context_pos_tags = [w.tag_ for w in iterloop(parsed_tree) ]
				context_tags = [sentence.words_tags[dt.sen_idx(sentence.raw_sentence, wrd)][1] for wrd in iterloop(parsed_tree)]
				for i,wrd in enumerate(context_words):
					if i < history:
						history_tags = tuple(['-TAGSTART-']+context_tags[0:i])
						history_words = ['-START-']+context_words[0:i]
						history_pos_tags = ['-POSTAGSTART-']+context_pos_tags[0:i]
					else:
						history_tags = context_tags[i-history:i]
						history_words = context_words[i-history:i]
						history_pos_tags = context_pos_tags[i-history:i]
					history_vectors = ('ph', [history_tags] )
					cur_idx = i
					prev_idx = cur_idx-1
					distance = 0
					if prev_idx >= 0:
						distance = parsed_tree[cur_idx].similarity(parsed_tree[prev_idx])
					target_feature_vectors.append( dp.construct_feature_vector(wrd, context_tags[i], 
							feature_dict, history_words, history, history_vectors, history_pos_tags, distance) )
					histories.append((prev_idx,history_words,history_pos_tags,distance))
			else:
				for i,wrd in enumerate(iterloop(parsed_tree)):
					cur = wrd
					history_words = []
					history_tags = []
					history_pos_tags = []
					for j in range(history):
						par = cur.head
						if cur == par:
							parw = '-START-'
							idx = -1
							tag = '-TAGSTART-'
							pos = '-POSTAGSTART-'
							history_tags.insert(0,tag)
							history_words.insert(0,parw)
							history_pos_tags.insert(0,pos)
							break
						else:
							parw = par.orth_
							idx = dt.sen_idx(sentence.raw_sentence,par)
							tag = sentence.words_tags[idx][1]
							pos = par.tag_
							cur = par
							history_tags.insert(0,tag)
							history_words.insert(0,parw)
							history_pos_tags.insert(0,pos)
					history_vectors = ('ph',[history_tags] )
					cur_idx = dt.sen_idx(sentence.raw_sentence,wrd)
					
					for prev_idx,w in enumerate(iterloop(parsed_tree)):
						if w == wrd.head:
							break
					if wrd.head == wrd:
						prev_idx = -1

					distance = 0
					if prev_idx >= 0:
						distance = parsed_tree[cur_idx].similarity(parsed_tree[prev_idx])
					
					cur_tag = sentence.words_tags[cur_idx][1]  # gold tag of the current word
					target_feature_vectors.append( dp.construct_feature_vector(wrd.orth_, cur_tag, feature_dict,
									history_words, history, history_vectors, history_pos_tags, distance) )
					histories.append((prev_idx,history_words,history_pos_tags,distance))
			
			dict_target_feature_vectors = [v2d(target_feature_vector[0][0]) for target_feature_vector in target_feature_vectors]
			pre_pros.append((parsed_tree,dict_target_feature_vectors,histories))
		except Exception as ex:
			print "error"
			pipeline.log('train',sentence)
	
	print 'pre_pros',time()-t1
	t2 = time()
	print len(pre_pros)

	for i in range(iters):
		iter_time = time()
		print "at iter",i
		cum_weights = i * weight_matrix  # running sum of the previous average
		for parsed_tree,dict_target_feature_vectors,histories in pre_pros:
			target_feature_vectors = [d2v(dict_target_feature_vector) for dict_target_feature_vector in dict_target_feature_vectors]
			weight_matrix = train_perceptron_once(parsed_tree, target_feature_vectors, feature_dict, 
							history, weight_matrix, histories)
		weight_matrix = (cum_weights + weight_matrix)/(i+1)
		print "one iter: ", time() - iter_time
	print 'train',time()-t2
	return weight_matrix
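pre_pros caches each target feature vector as a dict (v2d) and restores it with d2v inside the training loop, presumably to keep the cached copies sparse. A minimal sketch of such a round trip, assuming the vectors are 1-D numpy arrays; the project's own v2d/d2v may differ (for instance, d2v here needs the length passed in):

import numpy as np

def v2d(vec):
    # keep only the nonzero entries as index -> value
    return {i: x for i, x in enumerate(vec) if x != 0}

def d2v(d, length):
    vec = np.zeros(length)
    for i, x in d.items():
        vec[i] = x
    return vec

v = np.array([0.0, 2.0, 0.0, 1.0])
assert np.allclose(d2v(v2d(v), len(v)), v)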