def multi_once(sentence_tuple):
    # Build one Sentence object; return None (and log) on failure so callers
    # can skip sentences that fail to parse.
    ans = None
    try:
        ans = Sentence(sentence_tuple)
    except Exception:
        pipeline.log('init_mul', sentence_tuple)
    return ans
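# multi_once is shaped like a per-sentence worker for a multiprocessing pool:
# it logs failures instead of raising, so one bad sentence cannot kill the
# pool. A minimal hedged sketch of that assumed usage (the pool itself is not
# part of this module; _example_parallel_init is illustrative):

def _example_parallel_init(sentence_tuples, workers=4):
    from multiprocessing import Pool
    pool = Pool(workers)
    try:
        results = pool.map(multi_once, sentence_tuples)  # one Sentence or None each
    finally:
        pool.close()
        pool.join()
    return [s for s in results if s is not None]  # drop sentences that failed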
def process(filename, history):
    """
    Make objects for every sentence in the dataset and a feature dict.
    rtype: list with sentence objects and feature dictionary
    """
    reload(sys)
    sys.setdefaultencoding('utf8')  # hack for some encoding problems in the sentences
    processed_sentences = []
    with open(filename) as datafile:  # import sgml data-file
        data_lines = datafile.readlines()
    # Blank-line-separated blocks: the first line of each block is the raw
    # sentence, the remaining lines are '|||'-delimited error annotations.
    data_raw = [p.split('\n')
                for p in ''.join(data_lines).replace('\r', '').split('\n\n')]
    sentence_tuples = [(sentence[0],
                        [tuple(errors.split('|||')) for errors in sentence[1:]])
                       for sentence in data_raw]
    print "parsing sentences"
    for sentence_tuple in sentence_tuples:
        # NOTE: something still goes wrong with the first sentence; check
        # whether that happens more often.
        if len(sentence_tuple[0]) < 1:
            continue
        try:
            processed_sentences.append(Sentence(sentence_tuple))
        except Exception:
            pipeline.log('init', sentence_tuple)
    print "make feature vectors"
    feature_dictionary = makeFeatureDict(processed_sentences, history)
    return processed_sentences, feature_dictionary
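# The parsing in process() implies the input layout: blank-line-separated
# blocks whose first line is the raw sentence and whose remaining lines are
# '|||'-delimited error annotations. A hedged sketch of such a block (the
# annotation fields shown are illustrative, not taken from the real data):
#
#     He go to school every days .
#     1|||Vform|||goes
#     5|||Nn|||day
#
#     The next sentence starts after a blank line .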
def train_perceptron(all_sentences, feature_dict, tbank, history):
    weight_matrix = init_weights(len(feature_dict))
    pre_pros = []
    t1 = time()
    current_sen = 1
    # First pass: preprocess every sentence into (tree, target feature
    # vectors, histories); the weights are updated in the second phase below.
    for sentence in all_sentences:
        print "train sentence: " + str(current_sen)
        current_sen += 1
        try:
            parsed_tree = tbank.parse(sentence.raw_sentence)
            # ==== comment 0
            # This is how to walk the tree in the correct order, depending on
            # the global boolean golinear and the iterator function iterloop
            # (which comes from depTree). There is probably a neater way to
            # share this with the other code than copy-pasting, but that is
            # an option.
            histories = []
            target_feature_vectors = []
            if golinear:
                context_words = [w.orth_ for w in iterloop(parsed_tree)]
                context_pos_tags = [w.tag_ for w in iterloop(parsed_tree)]
                context_tags = [sentence.words_tags[dt.sen_idx(sentence.raw_sentence, wrd)][1]
                                for wrd in iterloop(parsed_tree)]
                for i, wrd in enumerate(context_words):
                    if i < history:
                        history_tags = tuple(['-TAGSTART-'] + context_tags[0:i])
                        history_words = ['-START-'] + context_words[0:i]
                        history_pos_tags = ['-POSTAGSTART-'] + context_pos_tags[0:i]
                    else:
                        history_tags = context_tags[i - history:i]
                        history_words = context_words[i - history:i]
                        history_pos_tags = context_pos_tags[i - history:i]
                    history_vectors = ('ph', [history_tags])
                    cur_idx = i
                    prev_idx = cur_idx - 1
                    distance = 0
                    if prev_idx >= 0:
                        distance = parsed_tree[cur_idx].similarity(parsed_tree[prev_idx])
                    target_feature_vectors.append(
                        dp.construct_feature_vector(wrd, context_tags[i], feature_dict,
                                                    history_words, history, history_vectors,
                                                    history_pos_tags, distance))
                    histories.append((prev_idx, history_words, history_pos_tags, distance))
            else:
                for i, wrd in enumerate(iterloop(parsed_tree)):
                    # Walk up the dependency tree: collect up to `history`
                    # ancestors as the history context.
                    cur = wrd
                    history_words = []
                    history_tags = []
                    history_pos_tags = []
                    for j in range(history):
                        par = cur.head
                        if cur == par:  # the root is its own head
                            parw = '-START-'
                            idx = -1
                            tag = '-TAGSTART-'
                            pos = '-POSTAGSTART-'
                            history_tags.insert(0, tag)
                            history_words.insert(0, parw)
                            history_pos_tags.insert(0, pos)
                            break
                        else:
                            parw = par.orth_
                            idx = dt.sen_idx(sentence.raw_sentence, par)
                            tag = sentence.words_tags[idx][1]
                            pos = par.tag_
                            cur = par
                            history_tags.insert(0, tag)
                            history_words.insert(0, parw)
                            history_pos_tags.insert(0, pos)
                    history_vectors = ('ph', [history_tags])
                    cur_idx = dt.sen_idx(sentence.raw_sentence, wrd)
                    for prev_idx, w in enumerate(iterloop(parsed_tree)):
                        if w == wrd.head:
                            break
                    if wrd.head == wrd:
                        prev_idx = -1
                    distance = 0
                    if prev_idx >= 0:
                        distance = parsed_tree[cur_idx].similarity(parsed_tree[prev_idx])
                    # The original indexed words_tags with `idx` (the last
                    # ancestor visited above); the current word's gold tag
                    # lives at cur_idx.
                    cur_tag = sentence.words_tags[cur_idx][1]
                    target_feature_vectors.append(
                        dp.construct_feature_vector(wrd.orth_, cur_tag, feature_dict,
                                                    history_words, history, history_vectors,
                                                    history_pos_tags, distance))
                    histories.append((prev_idx, history_words, history_pos_tags, distance))
            # /==== end comment 0
            dict_target_feature_vectors = [v2d(target_feature_vector[0][0])
                                           for target_feature_vector in target_feature_vectors]
            pre_pros.append((parsed_tree, dict_target_feature_vectors, histories))
        except Exception:
            print "error"
            pipeline.log('train', sentence)
    print 'pre_pros', time() - t1

    t2 = time()
    print len(pre_pros)
    for i in range(iters):
        iter_time = time()
        print "at iter", i
        cum_weights = i * weight_matrix
        for parsed_tree, dict_target_feature_vectors, histories in pre_pros:
            target_feature_vectors = [d2v(dict_target_feature_vector)
                                      for dict_target_feature_vector in dict_target_feature_vectors]
            weight_matrix = train_perceptron_once(parsed_tree, target_feature_vectors,
                                                  feature_dict, history, weight_matrix,
                                                  histories)
        # Running average of the end-of-pass weights (averaged perceptron).
        weight_matrix = (cum_weights + weight_matrix) / (i + 1)
        print "one iter: ", time() - iter_time
    print 'train', time() - t2
    return weight_matrix
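# The weight update above keeps a running average of the end-of-pass weights
# (each pass restarts from the current average): with w_k the weights after
# pass k, the code computes avg_k = ((k - 1) * avg_{k-1} + w_k) / k, which
# equals mean(w_1, ..., w_k). A minimal self-contained check of that identity
# (assumes numpy, which the matrix arithmetic above already relies on):

def _check_running_average(n_passes=4, dim=5):
    import numpy as np
    passes = [np.random.rand(dim) for _ in range(n_passes)]
    avg = np.zeros(dim)
    for k, w in enumerate(passes, start=1):
        avg = ((k - 1) * avg + w) / k  # same form as the train_perceptron update
    assert np.allclose(avg, np.mean(passes, axis=0))
    return avg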
def makeFeatureDict(processed_sentences, history):
    """
    Make a dictionary with all the features found in the dataset.
    rtype: dictionary
    """
    feature_dictionary = {}  # maps feature name -> feature index
    feature_dictionary['i tag+-TAGSTART-'] = 0
    index = 1
    # A feature for every possible tag of the current word.
    for tag in sp.all_tags:
        feature_dictionary['i tag+' + tag] = index
        index += 1
    # A feature for every possible history tag at each history position.
    for p in range(history):
        for tag in sp.all_tags:
            feature_dictionary['i-' + str(p + 1) + ' tag+' + tag] = index
            index += 1
    # Features for every word in every sentence; linear parsing gets
    # different features than the dependency-based traversal.
    for sentence in processed_sentences:
        try:
            if golinear:
                # Linear features.
                # NOTE: this still walks the sentence the old (linear) way;
                # it should switch to the new traversal (see comment 0 in
                # structured_perceptron).
                context_words = [word_tag[0] for word_tag in sentence.words_tags]
                context_tags = [word_tag[1] for word_tag in sentence.words_tags]
                context_pos_tags = [pos_tag_tuple[1]
                                    for pos_tag_tuple in sentence.pos_tags_sentence]
                for i, tag_tuple in enumerate(sentence.words_tags):
                    history_words = ['-START-'] + context_words[:i]
                    history_tags = ['-TAGSTART-'] + context_tags[:i]
                    history_pos_tags = ['-POSTAGSTART-'] + context_pos_tags[:i]
                    if len(history_words) > history:
                        history_words = context_words[i - history:i]
                        history_tags = context_tags[i - history:i]
                        history_pos_tags = context_pos_tags[i - history:i]
                    distance = nlp(unicode(normalize(history_words[-1]))).similarity(
                        nlp(unicode(normalize(context_words[i]))))
                    features = makeFeatures(context_words[i], history_words,
                                            history_tags, history_pos_tags, distance)
                    for feature in features:
                        for tag in sp.all_tags:
                            # The original re-bound `feature` here, so the tag
                            # suffixes accumulated; use a fresh name instead.
                            tagged_feature = feature + '+' + tag
                            if tagged_feature not in feature_dictionary:
                                feature_dictionary[tagged_feature] = index
                                index += 1
            else:
                # Dependency-based features.
                parsed_tree = nlp(unicode(sentence.raw_sentence))
                for i, wrd in enumerate(iterloop(parsed_tree)):
                    cur = wrd
                    history_words = []
                    history_tags = []
                    history_pos_tags = []
                    for j in range(history):
                        par = cur.head
                        if cur == par:  # the root is its own head
                            parw = '-START-'
                            idx = -1
                            tag = '-TAGSTART-'
                            pos = '-POSTAGSTART-'
                            history_tags.insert(0, tag)
                            history_words.insert(0, parw)
                            history_pos_tags.insert(0, pos)
                            break
                        else:
                            parw = par.orth_
                            idx = dt.sen_idx(sentence.raw_sentence, par)
                            tag = sentence.words_tags[idx][1]
                            pos = par.tag_
                            cur = par
                            history_tags.insert(0, tag)
                            history_words.insert(0, parw)
                            history_pos_tags.insert(0, pos)
                    history_vectors = ('ph', [history_tags])
                    cur_idx = dt.sen_idx(sentence.raw_sentence, wrd)
                    for prev_idx, w in enumerate(iterloop(parsed_tree)):
                        if w == wrd.head:
                            break
                    if wrd.head == wrd:
                        prev_idx = -1
                    distance = 0
                    if prev_idx >= 0:
                        distance = parsed_tree[cur_idx].similarity(parsed_tree[prev_idx])
                    # The original passed cur_tag without ever assigning it
                    # (a NameError); the current word's gold tag is at cur_idx.
                    cur_tag = sentence.words_tags[cur_idx][1]
                    features = makeFeatures(wrd.orth_, history_words, history_tags,
                                            history_pos_tags, distance, cur_tag)
                    for feature in features:
                        if feature not in feature_dictionary:
                            feature_dictionary[feature] = index
                            index += 1
        except Exception:
            pipeline.log('feat', sentence)
    return feature_dictionary
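# The keys built above follow a small naming scheme: 'i tag+<TAG>' for the
# current tag and 'i-<p> tag+<TAG>' for the tag p positions back in the
# history. A hedged sketch of the pre-seeded entries for history=1 and a toy
# two-tag set (illustrative only; sp.all_tags holds the real tags):
#
#     {'i tag+-TAGSTART-': 0,
#      'i tag+Vform': 1, 'i tag+ArtOrDet': 2,
#      'i-1 tag+Vform': 3, 'i-1 tag+ArtOrDet': 4}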
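# End-to-end, the functions above compose as sketched below. This is a hedged
# illustration: the filename and history values are placeholders, and tbank is
# only assumed to expose the parse() used in train_perceptron.

def _example_pipeline(tbank, filename='train.data', history=2):
    # Read the annotated corpus, wrap each sentence, and build the feature space.
    sentences, feature_dict = process(filename, history)
    # Train the averaged structured perceptron over those features.
    return train_perceptron(sentences, feature_dict, tbank, history)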