def lemmacategoryvector(lemma):
    """Binary feature vector: one slot per category in DV.ALL_CATEGORIES.

    Each slot is truth(True) when *lemma* is a member of that category,
    truth(False) otherwise.
    """
    return [truth(lemma in category) for category in DV.ALL_CATEGORIES]
def lemmavector(lemma):
    """One-hot-style vector over the known VPE trigger lemmas.

    One slot per entry of DV.VPE_TRIGGERS_IN_WSJ; a slot is truth(True)
    exactly when *lemma* equals that trigger lemma.
    """
    return [truth(lemma == lemma_type) for lemma_type in DV.VPE_TRIGGERS_IN_WSJ]
def antecedent_description(trig_sentdict, ant_sentdict, ant, POS_TAGS):
    """Feature vector describing an antecedent candidate *ant*.

    Returns a flat list: a subtree/word-count agreement flag, the antecedent
    length, a dominance flag, one POS-count slot per tag in POS_TAGS, and
    three flags for whether the antecedent head is an auxiliary/verb/adjective.
    Note: trig_sentdict is accepted but never read in this implementation.
    """
    vector = []
    ant_words = ant.get_words()
    subtree = ant.get_subtree()
    # Feature 1.  Does the candidate subtree's leaf count match the word count?
    vector.append(truth(len(subtree.leaves()) == len(ant_words)))
    # Feature 2.  Antecedent length in words.
    vector.append(len(ant_words))
    # Feature 3.  Dominance relation between the antecedent and its trigger's subtree.
    vector.append(truth(NT.dominates(subtree.root(), subtree, ant.get_trigger().get_subtree())))
    # Features 4.  Count each POS tag occurring in the antecedent span.
    pos_tags_dict = {}
    for tag in POS_TAGS:
        pos_tags_dict[tag] = 0
    idx = get_antecedent_head_index(ant.get_context(), ant)
    # NOTE(review): slice end is len(ant_words), not idx + len(ant_words) --
    # confirm that is intended.  Also, a tag not present in POS_TAGS would
    # raise KeyError on the increment below.
    for tag in ant.get_context()['pos'][idx:len(ant_words)]:
        pos_tags_dict[tag] += 1
    # Feature order here follows dict iteration order.
    vector += [pos_tags_dict[tag] for tag in pos_tags_dict]
    # Feature 5: if the antecedent starts with an auxiliary, verb, adj.
    vector.append(truth(DV.isauxiliary(ant_sentdict, idx)))
    vector.append(truth(DV.isverb(ant_sentdict['pos'][idx])))
    vector.append(truth(DV.isadj(ant_sentdict['pos'][idx])))
    return vector
def antecedent_description(trig_sentdict, ant_sentdict, ant, POS_TAGS):
    """Feature vector describing an antecedent candidate *ant*.

    The features are: subtree/word-count agreement, antecedent length,
    a dominance flag, per-tag POS counts over the antecedent span, and
    three flags testing whether the head is an auxiliary/verb/adjective.
    Note: trig_sentdict is accepted but never read here.
    """
    vector = []
    ant_words = ant.get_words()
    subtree = ant.get_subtree()
    # Feature 1.  Subtree leaf count vs. antecedent word count.
    vector.append(truth(len(subtree.leaves()) == len(ant_words)))
    # Feature 2.  Antecedent length in words.
    vector.append(len(ant_words))
    # Feature 3.  Dominance relation between antecedent and trigger subtrees.
    vector.append(
        truth(
            NT.dominates(subtree.root(), subtree,
                         ant.get_trigger().get_subtree())))
    # Features 4.  Per-tag POS counts over the antecedent span.
    pos_tags_dict = {}
    for tag in POS_TAGS:
        pos_tags_dict[tag] = 0
    idx = get_antecedent_head_index(ant.get_context(), ant)
    # NOTE(review): slice end is len(ant_words), not idx + len(ant_words) --
    # verify intent.  A tag missing from POS_TAGS would raise KeyError here.
    for tag in ant.get_context()['pos'][idx:len(ant_words)]:
        pos_tags_dict[tag] += 1
    # Feature order follows dict iteration order.
    vector += [pos_tags_dict[tag] for tag in pos_tags_dict]
    # Feature 5: if the antecedent starts with an auxiliary, verb, adj.
    vector.append(truth(DV.isauxiliary(ant_sentdict, idx)))
    vector.append(truth(DV.isverb(ant_sentdict['pos'][idx])))
    vector.append(truth(DV.isadj(ant_sentdict['pos'][idx])))
    return vector
def auxwordvector(word, all_auxs):
    """Binary vector with one slot per auxiliary in *all_auxs*.

    A slot is truth(True) exactly where the auxiliary equals *word*.
    """
    return [truth(word == aux) for aux in all_auxs]
def addauxs(self, mrgmatrix, gsdict, gs_sent_list, make_file=False):
    """Walk every auxiliary in *mrgmatrix* and append a gold-standard truth
    flag for each one to self.aux_bools.

    A recovery pass ("RULE 1") re-checks the previous sentence when a
    VPE-bearing sentence produced no matched auxiliary, patching flags in
    place and counting misses in self.missed_sentences.

    NOTE(review): indentation was reconstructed from a whitespace-mangled
    source -- verify the nesting of the recovery branch before relying on it.
    """
    crt_sentnum,crt_auxidx = 0,-1
    found_aux = False
    sent_has_vpe = False
    #print gsdict
    #print gs_sent_list
    while crt_sentnum < len(mrgmatrix):
        try:
            old_sentnum = crt_sentnum
            # Reassign the values for the next auxiliary, recursively.
            crt_sentnum,crt_auxidx = nextaux(mrgmatrix, crt_sentnum, crt_auxidx+1)
            if make_file:
                self.auxiliary_names.append(mrgmatrix[crt_sentnum]['words'][crt_auxidx].lower())
            # This is to check if we missed a GS aux by accident.
            if old_sentnum+1 == crt_sentnum:
                if sent_has_vpe and not found_aux:
                    auxs = getauxs(mrgmatrix[old_sentnum])
                    # Index of the first of this sentence's auxs within aux_bools.
                    crt_auxnum_out_of_total = len(self.aux_bools) - len(auxs)
                    # NOTE(review): assumes getauxs yields (idx, aux) pairs -- confirm.
                    for idx,aux in auxs:
                        if auxindict(aux, idx, gsdict):
                            self.aux_bools[crt_auxnum_out_of_total] = truth(True)
                            found_aux = True
                            print 'RULE 1 Added sentence.\n'
                        crt_auxnum_out_of_total += 1
                    if not found_aux:
                        print 'We missed the sentence below.'#sentence: %d'%old_sentnum
                        printsent(mrgmatrix, old_sentnum)
                        print dict(zip(gs_sent_list,gsdict))
                        print '*',
                        self.missed_sentences += 1
                # Reset the per-sentence state once we move to a new sentence.
                found_aux = False
                sent_has_vpe = False
        except TypeError:
            # nextaux presumably returns a non-unpackable value (e.g. None)
            # when no auxiliaries remain -- TODO confirm.
            return
        self.nth_aux += 1
        if crt_sentnum in gs_sent_list:
            sent_has_vpe = True
            if auxandidxindict(mrgmatrix[crt_sentnum]['words'][crt_auxidx], crt_auxidx, gsdict): #idxindict(crt_auxidx, gsdict, gs_sent_list, crt_sentnum):
                found_aux = True
                self.aux_bools.append(truth(True))
            else:
                self.aux_bools.append(truth(False))
        else:
            self.aux_bools.append(truth(False))
def trigger_description(trig_sentdict, ant_sentdict, trigger, POS_TAGS, AUX_WORDS):
    """Feature vector describing a candidate VPE trigger.

    Features: subtree/word-count agreement, trigger length, per-tag POS counts
    over the trigger span, then lemma-category / lemma / surface-word vectors
    for the trigger's auxiliary (via VC helpers).
    Note: ant_sentdict is accepted but never read here.
    """
    vector = []
    trig_words = trigger.get_words()
    subtree = trigger.get_subtree()
    # Locate the trigger word's offset within its context word list.
    context_idx = 0
    for w in trigger.get_context()['words']:
        if w == trig_sentdict['words'][trigger.get_idx()]:
            break
        context_idx += 1
    # Features 1,2
    vector.append(truth(len(subtree.leaves()) == len(trig_words)))
    vector.append(len(trig_words))
    # Feature set 3.  Per-tag POS counts over the trigger span.
    pos_tags_dict = {}
    for tag in POS_TAGS:
        pos_tags_dict[tag] = 0
    # NOTE(review): slice end is len(trig_words), not context_idx + len(trig_words)
    # -- confirm intent.  A tag missing from POS_TAGS raises KeyError below.
    for tag in trigger.get_context()['pos'][context_idx:len(trig_words)]:
        pos_tags_dict[tag] += 1
    vector += [pos_tags_dict[tag] for tag in pos_tags_dict]
    # Feature sets 4,5,6. Description of the auxiliary.
    vector += VC.lemmacategoryvector(
        trig_sentdict['lemmas'][trigger.get_idx()])
    vector += VC.lemmavector(trig_sentdict['lemmas'][trigger.get_idx()])
    vector += VC.auxwordvector(trig_sentdict['words'][trigger.get_idx()],
                               AUX_WORDS)
    return vector
def trigger_description(trig_sentdict, ant_sentdict, trigger, POS_TAGS, AUX_WORDS):
    """Feature vector describing a candidate VPE trigger.

    Combines subtree/word-count agreement, trigger length, per-tag POS counts,
    and the VC lemma-category / lemma / surface-word vectors for the trigger's
    auxiliary.  Note: ant_sentdict is accepted but never read.
    """
    vector = []
    trig_words = trigger.get_words()
    subtree = trigger.get_subtree()
    # Offset of the trigger word inside its context word list.
    context_idx = 0
    for w in trigger.get_context()['words']:
        if w == trig_sentdict['words'][trigger.get_idx()]:
            break
        context_idx += 1
    # Features 1,2
    vector.append(truth(len(subtree.leaves()) == len(trig_words)))
    vector.append(len(trig_words))
    # Feature set 3.  Per-tag POS counts over the trigger span.
    pos_tags_dict = {}
    for tag in POS_TAGS:
        pos_tags_dict[tag] = 0
    # NOTE(review): slice end is len(trig_words), not context_idx + len(trig_words)
    # -- verify.  KeyError possible for tags absent from POS_TAGS.
    for tag in trigger.get_context()['pos'][context_idx:len(trig_words)]:
        pos_tags_dict[tag] += 1
    vector += [pos_tags_dict[tag] for tag in pos_tags_dict]
    # Feature sets 4,5,6. Description of the auxiliary.
    vector += VC.lemmacategoryvector(trig_sentdict['lemmas'][trigger.get_idx()])
    vector += VC.lemmavector(trig_sentdict['lemmas'][trigger.get_idx()])
    vector += VC.auxwordvector(trig_sentdict['words'][trigger.get_idx()], AUX_WORDS)
    return vector
def verblocativevector(sentdict, auxidx):
    """Locative features for the auxiliary at *auxidx* relative to the verbs
    in its sentence.

    Returns, in order:
      1. distance to the closest verb in either direction, or truth(False)
         if the sentence has no other verb;
      2. distance to the closest verb *before* the auxiliary (truth(False) if none);
      3. distance to the closest verb *after* the auxiliary (truth(False) if none);
      4. number of verbs in the sentence (excluding the auxiliary position);
      5. number of VPE-trigger auxiliaries in the sentence.
    """
    vector = []
    verb_locations = []
    num_auxiliaries = 0
    # 99 is the "no verb seen" sentinel; assumes sentences shorter than 99 tokens.
    closest = 99
    for i in range(0, len(sentdict["pos"])):
        if DV.isverb(sentdict["pos"][i]) and not i == auxidx:
            verb_locations.append(i)
            closest = min(closest, abs(auxidx - i))
        if sentdict["lemmas"][i] in DV.VPE_TRIGGERS_IN_WSJ:
            num_auxiliaries += 1
    # The first feature is the distance between the Auxiliary and the closest verb.
    if closest != 99:
        vector.append(closest)
    else:
        vector.append(truth(False))
    # Distance between auxiliary and closest previous verb.
    closest = 99
    for idx in verb_locations:
        if idx < auxidx:
            # Bug fix: previously computed abs(auxidx - i) with the stale loop
            # variable `i` left over from the first loop; use `idx` instead.
            closest = min(closest, abs(auxidx - idx))
    if closest != 99:
        vector.append(closest)
    else:
        vector.append(truth(False))
    # Distance between auxiliary and closest following verb.
    closest = 99
    for idx in verb_locations:
        if idx > auxidx:
            # Bug fix: same stale-variable error as above.
            closest = min(closest, abs(auxidx - idx))
    if closest != 99:
        vector.append(closest)
    else:
        vector.append(truth(False))
    # This next feature is the number of verbs in the auxiliary's sentence.
    vector.append(len(verb_locations))
    # This feature is the number of auxiliary's in the sentence.
    vector.append(num_auxiliaries)
    return vector
def makestructtypevector(sentdict, idx, key_in_sentdict, lst):
    """One-hot vector over *lst* for the value sentdict[key_in_sentdict][idx].

    Word values are lowercased before comparison.  An out-of-range *idx*
    (sentinel '-~NONE~-') yields an all-false vector.  At most one slot is
    truth(True): only the first matching entry of *lst* is marked.
    """
    vector = []
    try:
        test_val = sentdict[key_in_sentdict][idx]
        if key_in_sentdict == 'words':
            test_val = test_val.lower()
    except IndexError:
        test_val = '-~NONE~-'
    if test_val == '-~NONE~-':
        for val in lst:
            vector.append(truth(False))
    else:
        got = False
        for val in lst:
            if not got and val == test_val:
                vector.append(truth(True))
                # Bug fix: `got` was never set, so a duplicated value in lst
                # produced multiple True slots instead of a one-hot vector.
                got = True
            else:
                vector.append(truth(False))
    return vector
def verblocativevector(sentdict, auxidx):
    """Locative features for the auxiliary at *auxidx* relative to the verbs
    in its sentence.

    Returns, in order: distance to the closest verb overall, to the closest
    preceding verb, and to the closest following verb (each truth(False) when
    absent), then the verb count and the VPE-trigger auxiliary count for the
    sentence.
    """
    vector = []
    verb_locations = []
    num_auxiliaries = 0
    # 99 is the "no verb seen" sentinel; assumes sentences shorter than 99 tokens.
    closest = 99
    for i in range(0, len(sentdict['pos'])):
        if DV.isverb(sentdict['pos'][i]) and not i == auxidx:
            verb_locations.append(i)
            closest = min(closest, abs(auxidx - i))
        if sentdict['lemmas'][i] in DV.VPE_TRIGGERS_IN_WSJ:
            num_auxiliaries += 1
    # The first feature is the distance between the Auxiliary and the closest verb.
    if closest != 99:
        vector.append(closest)
    else:
        vector.append(truth(False))
    # Distance between auxiliary and closest previous verb.
    closest = 99
    for idx in verb_locations:
        if idx < auxidx:
            # Bug fix: previously computed abs(auxidx - i) with the stale loop
            # variable `i` from the first loop; use `idx` instead.
            closest = min(closest, abs(auxidx - idx))
    if closest != 99:
        vector.append(closest)
    else:
        vector.append(truth(False))
    # Distance between auxiliary and closest following verb.
    closest = 99
    for idx in verb_locations:
        if idx > auxidx:
            # Bug fix: same stale-variable error as above.
            closest = min(closest, abs(auxidx - idx))
    if closest != 99:
        vector.append(closest)
    else:
        vector.append(truth(False))
    # This next feature is the number of verbs in the auxiliary's sentence.
    vector.append(len(verb_locations))
    # This feature is the number of auxiliary's in the sentence.
    vector.append(num_auxiliaries)
    return vector
def compare(self, gs, my_alg, end_training_set, multiplier=1, verbose=False):
    """Compare a gold-standard boolean vector *gs* against *my_alg* and print
    a confusion matrix plus F1-derived scores.

    *end_training_set* maps test indices back onto the full aux list so that
    verbose output can print the offending sentence.  *multiplier* undoes
    oversampling by scaling down tp/fn counts.

    NOTE(review): under Python 2, results[k] /= multiplier is integer floor
    division for int counts -- confirm that is acceptable.
    """
    results = {'tp': 0, 'fp': 0, 'fn': 0, 'tn': 0}
    if len(gs) != len(my_alg):
        print 'Error -> the vectors are not the same size!'
        print 'GS length %d, comparison length %d' % (len(gs), len(my_alg))
        quit()
    try:
        training_data_length = len(self.getgsdata(-1, end_training_set))
    except KeyError:
        training_data_length = 0
    for i in range(0, len(gs)):
        # print '%dv%d'%(gs[i],my_alg[i]),
        # Offset into the full auxiliary list (training data precedes test data).
        mapped_index = i + training_data_length
        if gs[i] == truth(True) and my_alg[i] == truth(True):
            results['tp'] += 1
            # Deliberately disabled ('and False'): true positives are not printed.
            if verbose and False:
                print '\nTrue positive: %s' % self.gold_standard_auxs.auxiliary_names[
                    mapped_index], self.each_sentence.printsentence(
                        self.auxnum_to_sent_map[mapped_index])
        elif gs[i] == truth(True) and my_alg[i] == truth(False):
            results['fn'] += 1
            if verbose:
                print '\nFalse negative: %s' % self.gold_standard_auxs.auxiliary_names[
                    mapped_index], self.each_sentence.printsentence(
                        self.auxnum_to_sent_map[mapped_index])
        elif gs[i] == truth(False) and my_alg[i] == truth(True):
            results['fp'] += 1
            if verbose:
                print '\nFalse positive: %s' % self.gold_standard_auxs.auxiliary_names[
                    mapped_index], self.each_sentence.printsentence(
                        self.auxnum_to_sent_map[mapped_index])
        else:
            results['tn'] += 1
    # Undo oversampling: only positives (tp/fn) were multiplied.
    for k in results:
        if k in ['tp', 'fn']:
            results[k] /= multiplier
    print results
    scores = f1(results)
    for k in scores:
        print k.capitalize() + ' : %0.2f' % scores[k]
def oversample(self, start_section, end_section, multiplier): new_feature_vector,new_gs_bools = [],[] print 'Adding x%d oversample vectors...'%multiplier for i in range(0, len(self.getgsdata(start_section, end_section))): if self.gold_standard_auxs.aux_bools[i] == truth(True): for k in range(0, multiplier): new_feature_vector.append(self.getfeaturevector(i)) new_gs_bools.append(self.getgsentry(i)) else: new_feature_vector.append(self.getfeaturevector(i)) new_gs_bools.append(self.getgsentry(i)) return new_feature_vector,new_gs_bools
def oversample(self, start_section, end_section, multiplier): new_feature_vector, new_gs_bools = [], [] print 'Adding x%d oversample vectors...' % multiplier for i in range(0, len(self.getgsdata(start_section, end_section))): if self.gold_standard_auxs.aux_bools[i] == truth(True): for k in range(0, multiplier): new_feature_vector.append(self.getfeaturevector(i)) new_gs_bools.append(self.getgsentry(i)) else: new_feature_vector.append(self.getfeaturevector(i)) new_gs_bools.append(self.getgsentry(i)) return new_feature_vector, new_gs_bools
def makestructtypevector(sentdict, idx, key_in_sentdict, lst):
    """One-hot vector over *lst* for the value sentdict[key_in_sentdict][idx].

    Word values are lowercased before comparison.  An out-of-range *idx*
    (sentinel "-~NONE~-") yields an all-false vector.  At most one slot is
    truth(True): only the first match in *lst* is marked.
    """
    vector = []
    try:
        test_val = sentdict[key_in_sentdict][idx]
        if key_in_sentdict == "words":
            test_val = test_val.lower()
    except IndexError:
        test_val = "-~NONE~-"
    if test_val == "-~NONE~-":
        for val in lst:
            vector.append(truth(False))
    else:
        got = False
        for val in lst:
            if not got and val == test_val:
                vector.append(truth(True))
                # Bug fix: `got` was tested but never set, so duplicate values
                # in lst yielded multiple True slots instead of a one-hot vector.
                got = True
            else:
                vector.append(truth(False))
    return vector
def call_graph(self,graph_type,params):
    """Parse a start/end timestamp out of *params* and render the requested
    graph type ("occupancy", "average", "weighted", "joined", or "heat").

    Timestamps arrive as "dd-mm-yyyy/hh:mm:ss" strings in params[4][1] and
    params[5][1].  Increments self.fig so each call draws on a fresh figure.
    """
    self.m = mongo('10.0.0.1', 27017,'rktest','sensors')
    self.a = analysis()
    self.p = plotting()
    self.t = truth('test.txt')
    self.z = zone('zone.txt')
    print params
    # Start timestamp: date part before '/', time part after.
    dates = params[4][1].split("/")[0].split("-")
    times = params[4][1].split("/")[1].split(":")
    start = datetime.datetime(int(dates[2]),int(dates[1]),int(dates[0]),int(times[0]),int(times[1]),int(times[2]));
    # End timestamp, same format.
    dates = params[5][1].split("/")[0].split("-")
    times = params[5][1].split("/")[1].split(":")
    end = datetime.datetime(int(dates[2]),int(dates[1]),int(dates[0]),int(times[0]),int(times[1]),int(times[2]));
    # Convert to epoch seconds, then shift by 36000s (10h) -- presumably a
    # local-time-to-UTC adjustment; TODO confirm the timezone assumption.
    start = time.mktime(start.timetuple())-36000
    end = time.mktime(end.timetuple())-36000
    self.start = start
    self.end = end
    print start
    print end
    self.m_time = self.m.get_array_time(start, end,self.m.get_array())
    if(graph_type == "occupancy"):
        self.p.new(self.fig)
        # NOTE(review): bare except silently ignores any failure in the
        # occupancy computation -- the empty plot is still shown.
        try:
            c = self.a.room_occupency(self.m, 0,start,end)
            self.p.add_line(c[0], c[1], 'b')
            self.p.show_legend()
        except:
            a = 1
        self.p.show()
    if(graph_type == "average"):
        self.p.new(self.fig)
        self.average(params)
        self.p.show_legend()
        self.p.show()
    if(graph_type == "weighted"):
        self.p.new(self.fig)
        self.weighted_vote(params)
        self.p.show_legend()
        self.p.show()
    if(graph_type == "joined"):
        # "joined" overlays the average and weighted-vote curves on one plot.
        self.p.new(self.fig)
        self.average(params)
        self.weighted_vote(params)
        self.p.show_legend()
        self.p.show()
    if(graph_type == "heat"):
        self.heatmap(params)
    # Advance to the next figure index for the following call.
    self.fig = self.fig+1
def testmyrules(classifier, section_start, section_end):
    """Run the hand-written per-auxiliary VPE rules over the given section
    range and compare the predictions against the gold standard.

    Auxiliaries are counted globally across all sentences; only those whose
    running count falls in (aux_start, aux_end] are classified.
    """
    gs_vector = classifier.getgsdata(section_start, section_end)
    aux_start, aux_end = classifier.section_split[
        section_start], classifier.section_split[section_end]
    my_rules_return_vector = []
    count = 0
    for sentdict in classifier.each_sentence.sentences:
        for i in range(0, len(sentdict['lemmas'])):
            word = sentdict['lemmas'][i]
            if isauxiliary(sentdict, i):
                count += 1
                if aux_start < count <= aux_end:
                    tree = NT.maketree(sentdict['tree'][0])
                    subtree_positions = NT.get_smallest_subtree_positions(tree)
                    # Dispatch to the rule matching the auxiliary's class.
                    if word in MODALS:
                        my_rules_return_vector.append(
                            truth(
                                modalcheck(sentdict, i, tree, subtree_positions))
                        )  #Todo: I modified these b/c they were incorrectly written.
                    elif word in BE:
                        my_rules_return_vector.append(
                            truth(becheck(sentdict, i, tree, subtree_positions)))
                    elif word in HAVE:
                        my_rules_return_vector.append(
                            truth(
                                havecheck(sentdict, i, tree, subtree_positions)))
                    elif word in DO:
                        my_rules_return_vector.append(
                            truth(docheck(sentdict, i, tree, subtree_positions)))
                    elif word in TO:
                        my_rules_return_vector.append(
                            truth(tocheck(sentdict, i, tree, subtree_positions)))
                    elif word in SO:
                        my_rules_return_vector.append(
                            truth(socheck(sentdict, i, tree, subtree_positions)))
    classifier.compare(gs_vector, my_rules_return_vector, section_start - 1, verbose=False)
def compare(self, gs, my_alg, end_training_set, multiplier=1, verbose=False):
    """Print a confusion matrix and F1-derived scores for *my_alg* against the
    gold-standard vector *gs*.

    *end_training_set* is used to map test indices back onto the full aux
    list for verbose sentence printing; *multiplier* rescales tp/fn to undo
    oversampling.

    NOTE(review): under Python 2, results[k] /= multiplier floor-divides for
    int counts -- confirm this is intended.
    """
    results = {'tp': 0, 'fp': 0, 'fn': 0, 'tn':0}
    if len(gs) != len(my_alg):
        print 'Error -> the vectors are not the same size!'
        print 'GS length %d, comparison length %d'%(len(gs), len(my_alg))
        quit()
    try:
        training_data_length = len(self.getgsdata(-1,end_training_set))
    except KeyError:
        training_data_length = 0
    for i in range(0, len(gs)):
        # print '%dv%d'%(gs[i],my_alg[i]),
        # Offset of this test example within the full auxiliary list.
        mapped_index = i+training_data_length
        if gs[i] == truth(True) and my_alg[i] == truth(True):
            results['tp'] += 1
            # Intentionally disabled ('and False'): true positives not printed.
            if verbose and False:
                print '\nTrue positive: %s'%self.gold_standard_auxs.auxiliary_names[mapped_index], self.each_sentence.printsentence(self.auxnum_to_sent_map[mapped_index])
        elif gs[i] == truth(True) and my_alg[i] == truth(False):
            results['fn'] += 1
            if verbose:
                print '\nFalse negative: %s'%self.gold_standard_auxs.auxiliary_names[mapped_index], self.each_sentence.printsentence(self.auxnum_to_sent_map[mapped_index])
        elif gs[i] == truth(False) and my_alg[i] == truth(True):
            results['fp'] += 1
            if verbose:
                print '\nFalse positive: %s'%self.gold_standard_auxs.auxiliary_names[mapped_index], self.each_sentence.printsentence(self.auxnum_to_sent_map[mapped_index])
        else:
            results['tn'] += 1
    # Only positives (tp/fn) were oversampled, so only they are scaled back.
    for k in results:
        if k in ['tp', 'fn']:
            results[k] /= multiplier
    print results
    scores = f1(results)
    for k in scores:
        print k.capitalize()+' : %0.2f' %scores[k]
def makeposbigramsvector(sentdict, auxidx, postags, combine=False):
    """Binary features for the POS bigrams in a 6-token window around *auxidx*.

    With combine=False: for each window position, one slot per (tag1, tag2)
    pair -- truth(True) on the first matching pair, truth(False) elsewhere
    (all-false for out-of-range positions).  With combine=True: a single
    bigram histogram is emitted instead, each slot scaled by how many window
    positions produced that bigram.

    NOTE(review): a negative window index silently wraps to the end of the
    sentence (Python negative indexing) rather than raising IndexError --
    confirm this is intended.
    """
    vector = []
    true_idxs = []
    for i in range(auxidx - 3, auxidx + 3):
        try:
            crtpos = sentdict["pos"][i]
            nextpos = sentdict["pos"][i + 1]
        except IndexError:
            # Out-of-range window position: emit an all-false row (non-combine
            # mode only) and move to the next position.
            for p1 in postags:
                for p2 in postags:
                    if not combine:
                        vector.append(truth(False))
            continue
        got = False
        count = 0
        for k in range(0, len(postags)):
            for j in range(0, len(postags)):
                # count is incremented BEFORE the match test, so recorded
                # indices are 1-based.  NOTE(review): the combine loop below
                # iterates 0-based up to (len(postags)-1)**2, which does not
                # line up with these 1-based indices (max len(postags)**2) --
                # verify before trusting combine=True output.
                count += 1
                if not got:
                    if crtpos == postags[k] and nextpos == postags[j]:
                        if not combine:
                            vector.append(truth(True))
                        else:
                            true_idxs.append(count)
                        got = True
                    else:
                        if not combine:
                            vector.append(truth(False))
                else:
                    if not combine:
                        vector.append(truth(False))
    if combine:
        length_of_bigrams_set = (len(postags) - 1) ** 2
        for i in range(0, length_of_bigrams_set):
            if i in true_idxs:
                # Scale by multiplicity: a bigram seen at m window positions
                # contributes truth(True) * m.
                vector.append(truth(True) * true_idxs.count(i))
            else:
                vector.append(truth(False))
    return vector
def makeposbigramsvector(sentdict, auxidx, postags, combine=False):
    """Binary features for POS bigrams in a 6-token window around *auxidx*.

    combine=False emits one (tag1, tag2) indicator row per window position;
    combine=True emits a single multiplicity-scaled bigram histogram.

    NOTE(review): negative window indices wrap to the sentence end (Python
    negative indexing) instead of hitting the IndexError branch -- verify.
    """
    vector = []
    true_idxs = []
    for i in range(auxidx - 3, auxidx + 3):
        try:
            crtpos = sentdict['pos'][i]
            nextpos = sentdict['pos'][i + 1]
        except IndexError:
            # Out-of-range position: all-false row (non-combine mode), skip.
            for p1 in postags:
                for p2 in postags:
                    if not combine:
                        vector.append(truth(False))
            continue
        got = False
        count = 0
        for k in range(0, len(postags)):
            for j in range(0, len(postags)):
                # count increments BEFORE the test, so stored indices are
                # 1-based.  NOTE(review): the combine loop below is 0-based
                # with bound (len(postags)-1)**2, which does not match --
                # verify combine=True output before use.
                count += 1
                if not got:
                    if crtpos == postags[k] and nextpos == postags[j]:
                        if not combine:
                            vector.append(truth(True))
                        else:
                            true_idxs.append(count)
                        got = True
                    else:
                        if not combine:
                            vector.append(truth(False))
                else:
                    if not combine:
                        vector.append(truth(False))
    if combine:
        length_of_bigrams_set = (len(postags) - 1)**2
        for i in range(0, length_of_bigrams_set):
            if i in true_idxs:
                # Multiplicity-scaled indicator.
                vector.append(truth(True) * true_idxs.count(i))
            else:
                vector.append(truth(False))
    return vector
def testmyrules(classifier, section_start, section_end):
    """Apply the hand-written per-auxiliary VPE rules over the section range
    and compare the predictions against the gold standard.

    Auxiliaries are counted globally; only those whose running count falls in
    (aux_start, aux_end] are classified.
    """
    gs_vector = classifier.getgsdata(section_start, section_end)
    aux_start,aux_end = classifier.section_split[section_start], classifier.section_split[section_end]
    my_rules_return_vector = []
    count = 0
    for sentdict in classifier.each_sentence.sentences:
        for i in range(0,len(sentdict['lemmas'])):
            word = sentdict['lemmas'][i]
            if isauxiliary(sentdict, i):
                count += 1
                if aux_start < count <= aux_end:
                    tree = NT.maketree(sentdict['tree'][0])
                    subtree_positions = NT.get_smallest_subtree_positions(tree)
                    # Dispatch on the auxiliary's lexical class.
                    if word in MODALS:
                        my_rules_return_vector.append(truth(modalcheck(sentdict, i, tree, subtree_positions))) #Todo: I modified these b/c they were incorrectly written.
                    elif word in BE:
                        my_rules_return_vector.append(truth(becheck(sentdict, i, tree, subtree_positions)))
                    elif word in HAVE:
                        my_rules_return_vector.append(truth(havecheck(sentdict, i, tree, subtree_positions)))
                    elif word in DO:
                        my_rules_return_vector.append(truth(docheck(sentdict, i, tree, subtree_positions)))
                    elif word in TO:
                        my_rules_return_vector.append(truth(tocheck(sentdict, i, tree, subtree_positions)))
                    elif word in SO:
                        my_rules_return_vector.append(truth(socheck(sentdict, i, tree, subtree_positions)))
    classifier.compare(gs_vector, my_rules_return_vector, section_start-1, verbose=False)
def alignment_comparison(trig_sentdict, ant_sentdict, ant, trigger, word2vec_dict):
    """Feature vector comparing an antecedent candidate with its trigger.

    Combines: shared-auxiliary flags, relative positions, context overlap
    ratios, word/sentence distance, word2vec NP vectors and the angles
    between them, a sentential-complement check, and per-phrase-type counts
    between the two subtrees.
    """
    vector = []
    ant_context_sentdict = ant.get_context()
    trig_context_sentdict = trigger.get_context()
    ant_head_idx = get_antecedent_head_index(ant_sentdict, ant)
    # Feature 1.  Does any auxiliary lemma of the antecedent sentence also
    # occur among the trigger sentence's lemmas?
    ant_auxs = []
    for i in range(0, len(ant_sentdict['words'])):
        if DV.isauxiliary(ant_sentdict, i):
            ant_auxs.append(ant_sentdict['lemmas'][i])
    found = False
    for aux in ant_auxs:
        if aux in trig_sentdict['lemmas']:
            vector.append(truth(True))
            found = True
            break
    if not found:
        vector.append(truth(False))
    # Feature 2.  Relative order of antecedent head and trigger (same sentence only).
    if ant.get_sentnum() == trigger.get_sentnum():
        vector.append(truth(ant_head_idx > trigger.get_idx()))
        vector.append(truth(ant_head_idx == trigger.get_idx()))
        vector.append(truth(ant_head_idx < trigger.get_idx()))
    else:
        vector += [0, 0, 0]
    # Features 3,4,5.  Context overlap (count and Dice-style ratio) for each layer.
    for k in ['words', 'lemmas', 'pos']:
        total = len(ant_context_sentdict[k]) + len(trig_context_sentdict[k])
        common = len(
            set(ant_context_sentdict[k]).intersection(
                trig_context_sentdict[k]))
        vector.append(common)
        vector.append((2.0 * float(common)) / float(total))
    # Feature 6 - number of words between trigger and antecedent.
    vector.append(ant.get_sentnum() - trigger.get_sentnum())
    if ant.get_sentnum() == trigger.get_sentnum():
        vector.append(ant_head_idx - trigger.get_idx())
    else:
        crt_sentnum = trigger.get_sentnum()
        distance = ant_head_idx
        # NOTE(review): every intermediate sentence is assumed to have the
        # trigger sentence's length -- confirm this approximation is intended.
        while crt_sentnum < ant.get_sentnum():
            distance += len(trig_sentdict['words'])
            crt_sentnum += 1
        vector.append(distance)
    # Feature 7.
    # First we get the vecs from the Ant NP and average them.
    blank_np = False
    ant_np_word2vec = []
    ant_np_location = ant.get_context()['np']
    if ant_np_location != (-1, -1):
        ant_np_word2vec = get_average_np_vec(word2vec_dict, ant_sentdict,
                                             ant_np_location[0],
                                             ant_np_location[1])
    else:
        blank_np = True
    # Next we do the same for the Trigger NP.
    trig_np_word2vec = []
    trig_np_location = trigger.get_context()['np']
    if trig_np_location != (-1, -1):
        trig_np_word2vec = get_average_np_vec(word2vec_dict, trig_sentdict,
                                              trig_np_location[0],
                                              trig_np_location[1])
    else:
        blank_np = True
    # Adding the angle of the vector between the trigger NP and antecedent NP.
    if not blank_np:
        ant_length = vector_length(ant_np_word2vec)
        trig_length = vector_length(trig_np_word2vec)
        try:
            angle = angle_btwn_vectors(ant_np_word2vec,
                                       trig_np_word2vec,
                                       v1_length=ant_length,
                                       v2_length=trig_length)
        except ValueError:
            # Degenerate vectors: fall back to an uninformative right angle.
            angle = 90.0
        vector.append(angle)
        vector.append(truth(angle == 0.0))
    else:
        # Missing NP on either side: emit the neutral 90-degree default.
        vector.append(90.0)
        vector.append(truth(90.0 == 0.0))
    # Raw averaged NP vectors (zero-padded when missing).
    if not ant_np_word2vec:
        vector += [0 for _ in range(0, WORD2VEC_LENGTH)]
    else:
        vector += ant_np_word2vec
    if not trig_np_word2vec:
        vector += [0 for _ in range(0, WORD2VEC_LENGTH)]
    else:
        vector += trig_np_word2vec
    # Now for what comes after the head.
    ant_head_idx = get_antecedent_head_index(ant_sentdict, ant)
    ant_post_head_w2vec = get_average_np_vec(word2vec_dict, ant_sentdict,
                                             ant_head_idx,
                                             len(ant_sentdict['words']))
    # if not ant_post_head_w2vec: vector += [0 for i in range(0,WORD2VEC_LENGTH)]
    # else: vector += ant_post_head_w2vec
    # Average the trigger's following words up to the first punctuation mark.
    stop_idx = len(trig_sentdict['words'])
    for i in range(trigger.get_idx(), len(trig_sentdict['words'])):
        if DV.ispunctuation(trig_sentdict['lemmas'][i]):
            stop_idx = i
            break
    post_trig_w2vec = get_average_np_vec(word2vec_dict, trig_sentdict,
                                         trigger.get_idx(), stop_idx)
    # if not post_trig_w2vec: vector += [0 for i in range(0,WORD2VEC_LENGTH)]
    # else: vector += post_trig_w2vec
    if ant_post_head_w2vec and post_trig_w2vec:
        try:
            post_angle = angle_btwn_vectors(ant_post_head_w2vec,
                                            post_trig_w2vec)
        except ValueError:
            post_angle = 90.0
        vector.append(post_angle)
        vector.append(truth(post_angle == 0.0))
    else:
        vector.append(90.0)
        vector.append(truth(90.0 == 0.0))
    # Sentenial complement check.
    tree = NT.maketree(ant_sentdict['tree'][0])
    if NT.dominates(tree, ant.get_subtree(), trigger.get_subtree()):
        vector.append(
            truth(
                NT.has_phrases_between_trees(
                    ant.get_subtree(), trigger.get_subtree(),
                    NIELSON_SENTENIAL_COMPLEMENT_PHRASES)))
    else:
        vector.append(truth(False))
    # Features to account for the number of each phrase type between the antecedent and trigger.
    phrases_between = [0 for _ in ALL_PHRASES]
    if ant.get_sentnum() == trigger.get_sentnum():
        for i in range(0, len(phrases_between)):
            if NT.has_phrases_between_trees(ant.get_subtree(),
                                            trigger.get_subtree(),
                                            [ALL_PHRASES[i]]):
                phrases_between[i] += 1
    vector += phrases_between
    vector.append(sum(phrases_between))
    return vector
from mongo import *
from truth import *
from analysis import *
from plotting import *
import pylab

# Ad-hoc analysis script: pulls the last 800 seconds of sensor readings from
# mongo, parses the ground-truth table, then per node computes a collective
# average and a weighted vote.
# NOTE(review): indentation reconstructed from a whitespace-mangled source;
# the for-loop body below (and whether pos/neg belong inside it) should be
# verified -- the loop body also appears to continue beyond this chunk.
t = truth('test.txt')
m = mongo('10.0.0.1', 27017,'rktest','sensors')
a = analysis()
p = plotting()
master = m.get_array()
#print m.get_nodes()
# Restrict the working set to the trailing 800-second window.
end_time = a.get_time_bounds(master)[1]
start_time = end_time-800
master = m.get_array_time(start_time,end_time,master)
#print m.get_array_time(0,end_time,master)
#print m.get_sensortype(master)
#print m.get_sensortype_activation('VC',master)
#print m.get_sensortype_activation('ACCL',master)
#print t.get_table()
tmaster = t.parse_raw(master)
i = 0
for d in m.get_nodes():
    print d
    # Per-node scores over just this node's readings.
    col = a.collective_average(m,t,{d:master[d]})
    wv = a.weighted_vote(m,t,{d:master[d]})
    #print col
    #print wv
    pos = []
    neg = []
def alignment_comparison(trig_sentdict, ant_sentdict, ant, trigger, word2vec_dict):
    """Feature vector comparing an antecedent candidate with its trigger:
    shared-auxiliary flags, relative positions, context overlap, distances,
    word2vec NP vectors/angles, a sentential-complement check, and phrase
    counts between the two subtrees.
    """
    vector = []
    ant_context_sentdict = ant.get_context()
    trig_context_sentdict = trigger.get_context()
    ant_head_idx = get_antecedent_head_index(ant_sentdict, ant)
    # Feature 1.  Any antecedent-sentence auxiliary lemma shared with the
    # trigger sentence?
    ant_auxs = []
    for i in range(0,len(ant_sentdict['words'])):
        if DV.isauxiliary(ant_sentdict, i):
            ant_auxs.append(ant_sentdict['lemmas'][i])
    found = False
    for aux in ant_auxs:
        if aux in trig_sentdict['lemmas']:
            vector.append(truth(True))
            found = True
            break
    if not found:
        vector.append(truth(False))
    # Feature 2.  Relative order of antecedent head vs trigger (same sentence only).
    if ant.get_sentnum() == trigger.get_sentnum():
        vector.append(truth(ant_head_idx > trigger.get_idx()))
        vector.append(truth(ant_head_idx == trigger.get_idx()))
        vector.append(truth(ant_head_idx < trigger.get_idx()))
    else:
        vector += [0,0,0]
    # Features 3,4,5.  Context overlap count and Dice-style ratio per layer.
    for k in ['words','lemmas','pos']:
        total = len(ant_context_sentdict[k])+len(trig_context_sentdict[k])
        common = len(set(ant_context_sentdict[k]).intersection(trig_context_sentdict[k]))
        vector.append(common)
        vector.append((2.0*float(common))/float(total))
    # Feature 6 - number of words between trigger and antecedent.
    vector.append(ant.get_sentnum()-trigger.get_sentnum())
    if ant.get_sentnum() == trigger.get_sentnum():
        vector.append(ant_head_idx - trigger.get_idx())
    else:
        crt_sentnum = trigger.get_sentnum()
        distance = ant_head_idx
        # NOTE(review): intermediate sentences are all assumed to have the
        # trigger sentence's length -- confirm this approximation is intended.
        while crt_sentnum < ant.get_sentnum():
            distance += len(trig_sentdict['words'])
            crt_sentnum += 1
        vector.append(distance)
    # Feature 7.
    # First we get the vecs from the Ant NP and average them.
    blank_np = False
    ant_np_word2vec = []
    ant_np_location = ant.get_context()['np']
    if ant_np_location != (-1,-1):
        ant_np_word2vec = get_average_np_vec(word2vec_dict, ant_sentdict, ant_np_location[0], ant_np_location[1])
    else:
        blank_np = True
    # Next we do the same for the Trigger NP.
    trig_np_word2vec = []
    trig_np_location = trigger.get_context()['np']
    if trig_np_location != (-1,-1):
        trig_np_word2vec = get_average_np_vec(word2vec_dict, trig_sentdict, trig_np_location[0], trig_np_location[1])
    else:
        blank_np = True
    # Adding the angle of the vector between the trigger NP and antecedent NP.
    if not blank_np:
        ant_length = vector_length(ant_np_word2vec)
        trig_length = vector_length(trig_np_word2vec)
        try:
            angle = angle_btwn_vectors(ant_np_word2vec, trig_np_word2vec, v1_length=ant_length, v2_length=trig_length)
        except ValueError:
            # Degenerate vectors: neutral right-angle fallback.
            angle = 90.0
        vector.append(angle)
        vector.append(truth(angle == 0.0))
    else:
        vector.append(90.0)
        vector.append(truth(90.0 == 0.0))
    # Raw averaged NP vectors, zero-padded when missing.
    if not ant_np_word2vec:
        vector += [0 for _ in range(0,WORD2VEC_LENGTH)]
    else:
        vector += ant_np_word2vec
    if not trig_np_word2vec:
        vector += [0 for _ in range(0,WORD2VEC_LENGTH)]
    else:
        vector += trig_np_word2vec
    # Now for what comes after the head.
    ant_head_idx = get_antecedent_head_index(ant_sentdict, ant)
    ant_post_head_w2vec = get_average_np_vec(word2vec_dict, ant_sentdict, ant_head_idx, len(ant_sentdict['words']))
    # if not ant_post_head_w2vec: vector += [0 for i in range(0,WORD2VEC_LENGTH)]
    # else: vector += ant_post_head_w2vec
    # Average trigger's following words, stopping at the first punctuation mark.
    stop_idx = len(trig_sentdict['words'])
    for i in range(trigger.get_idx(), len(trig_sentdict['words'])):
        if DV.ispunctuation(trig_sentdict['lemmas'][i]):
            stop_idx = i
            break
    post_trig_w2vec = get_average_np_vec(word2vec_dict, trig_sentdict, trigger.get_idx(), stop_idx)
    # if not post_trig_w2vec: vector += [0 for i in range(0,WORD2VEC_LENGTH)]
    # else: vector += post_trig_w2vec
    if ant_post_head_w2vec and post_trig_w2vec:
        try:
            post_angle = angle_btwn_vectors(ant_post_head_w2vec, post_trig_w2vec)
        except ValueError:
            post_angle = 90.0
        vector.append(post_angle)
        vector.append(truth(post_angle == 0.0))
    else:
        vector.append(90.0)
        vector.append(truth(90.0 == 0.0))
    # Sentenial complement check.
    tree = NT.maketree(ant_sentdict['tree'][0])
    if NT.dominates(tree, ant.get_subtree(), trigger.get_subtree()):
        vector.append(truth(
            NT.has_phrases_between_trees(ant.get_subtree(), trigger.get_subtree(), NIELSON_SENTENIAL_COMPLEMENT_PHRASES)))
    else:
        vector.append(truth(False))
    # Features to account for the number of each phrase type between the antecedent and trigger.
    phrases_between = [0 for _ in ALL_PHRASES]
    if ant.get_sentnum() == trigger.get_sentnum():
        for i in range(0,len(phrases_between)):
            if NT.has_phrases_between_trees(ant.get_subtree(), trigger.get_subtree(), [ALL_PHRASES[i]]):
                phrases_between[i] += 1
    vector += phrases_between
    vector.append(sum(phrases_between))
    return vector
def myfeaturesvector(sentdict, idx, features):
    """Rule-based feature vector for the auxiliary at *idx* in *sentdict*.

    *features* is a collection of switches: 'my_features' adds c-command and
    surface-context tests, 'my_rules' adds per-auxiliary-class rule results,
    'square_rules' adds all pairwise conjunctions of the features so far,
    and 'combine_aux_type' crosses every feature with the auxiliary's class.
    """
    vector = []
    tree = NT.maketree(sentdict['tree'][0])
    subtrees = NT.getsmallestsubtrees(tree)
    subtree_positions = NT.get_smallest_subtree_positions(
        tree, subtree_list=subtrees)
    aux = sentdict['lemmas'][idx]
    if 'my_features' in features:
        # Structural (c-command) and surface-context tests around the auxiliary.
        vector.append(
            truth(DV.auxccommandsverb(sentdict, idx, tree, subtree_positions)))
        vector.append(
            truth(
                DV.auxccommandsverbthatcomesafter(sentdict, idx, tree,
                                                  subtree_positions)))
        vector.append(
            truth(
                DV.auxisccommandedbyverb(sentdict, idx, tree,
                                         subtree_positions)))
        vector.append(
            truth(
                DV.auxislocallyccommandedbyverb(sentdict, idx, tree,
                                                subtree_positions)))
        vector.append(
            truth(
                DV.auxlocallyccommandsverb(sentdict, idx, tree,
                                           subtree_positions)))
        vector.append(
            truth(
                DV.isccommandedbycontinuationword(sentdict, idx, tree,
                                                  subtree_positions)))
        vector.append(
            truth(DV.nexttopunct(sentdict, idx, tree, subtree_positions)))
        vector.append(truth(DV.isfollowedbypunct(sentdict, idx, end=['.'])))
        vector.append(
            truth(DV.previouswordisasorsoorthan(sentdict['words'], idx)))
        vector.append(truth(DV.thesamecheck(sentdict['words'], idx)))
        vector.append(truth(DV.toprecedesaux(sentdict, idx)))
        vector.append(truth(DV.verbfollowsaux(sentdict, idx)))
        # TODO: added this new feature!
        vector.append(truth(DV.nextwordistoo(sentdict, idx)))
    if 'my_rules' in features:
        # Each rule fires only when the auxiliary belongs to that lexical class.
        vector.append(
            truth(aux in DV.MODALS
                  and DV.modalcheck(sentdict, idx, tree, subtree_positions)))
        vector.append(
            truth(aux in DV.BE
                  and DV.becheck(sentdict, idx, tree, subtree_positions)))
        vector.append(
            truth(aux in DV.HAVE
                  and DV.havecheck(sentdict, idx, tree, subtree_positions)))
        vector.append(
            truth(aux in DV.DO
                  and DV.docheck(sentdict, idx, tree, subtree_positions)))
        vector.append(
            truth(aux in DV.TO
                  and DV.tocheck(sentdict, idx, tree, subtree_positions)))
        vector.append(
            truth(aux in DV.SO
                  and DV.socheck(sentdict, idx, tree, subtree_positions)))
    # This adds a new layer of features by combining all of the ones I had.
    if 'square_rules' in features:
        # Pairwise conjunctions of every distinct pair of existing features.
        size = len(vector)
        for i in range(0, size):
            for j in range(0, size):
                if i != j:
                    vector.append(
                        truth(untruth(vector[i]) and untruth(vector[j])))
    if 'combine_aux_type' in features:
        # Cross every accumulated feature with the auxiliary's class membership.
        bools = [
            aux in DV.MODALS, aux in DV.BE, aux in DV.HAVE, aux in DV.DO,
            aux in DV.TO, aux in DV.SO
        ]
        # Snapshot first so the appends below don't extend the iteration.
        vec = [v for v in vector]
        for v in vec:
            for b in bools:
                vector.append(truth(untruth(v) and b))
    return vector
def addsentences(self, gsvpelist, size):
    """Record one truth flag per sentence index in [0, size).

    A flag is truth(True) exactly when the index appears in *gsvpelist*
    (the gold-standard VPE sentence list).
    """
    self.sentence_booleans.extend(
        truth(sent_idx in gsvpelist) for sent_idx in range(0, size))
def myfeaturesvector(sentdict, idx, features):
    """Rule-based feature vector for the auxiliary at *idx* in *sentdict*.

    *features* switches: "my_features" adds c-command and surface-context
    tests, "my_rules" adds per-auxiliary-class rule results, "square_rules"
    adds pairwise conjunctions of everything so far, and "combine_aux_type"
    crosses each feature with the auxiliary's lexical class.
    """
    vector = []
    tree = NT.maketree(sentdict["tree"][0])
    subtrees = NT.getsmallestsubtrees(tree)
    subtree_positions = NT.get_smallest_subtree_positions(tree, subtree_list=subtrees)
    aux = sentdict["lemmas"][idx]
    if "my_features" in features:
        # Structural (c-command) and surface-context tests around the auxiliary.
        vector.append(truth(DV.auxccommandsverb(sentdict, idx, tree, subtree_positions)))
        vector.append(truth(DV.auxccommandsverbthatcomesafter(sentdict, idx, tree, subtree_positions)))
        vector.append(truth(DV.auxisccommandedbyverb(sentdict, idx, tree, subtree_positions)))
        vector.append(truth(DV.auxislocallyccommandedbyverb(sentdict, idx, tree, subtree_positions)))
        vector.append(truth(DV.auxlocallyccommandsverb(sentdict, idx, tree, subtree_positions)))
        vector.append(truth(DV.isccommandedbycontinuationword(sentdict, idx, tree, subtree_positions)))
        vector.append(truth(DV.nexttopunct(sentdict, idx, tree, subtree_positions)))
        vector.append(truth(DV.isfollowedbypunct(sentdict, idx, end=["."])))
        vector.append(truth(DV.previouswordisasorsoorthan(sentdict["words"], idx)))
        vector.append(truth(DV.thesamecheck(sentdict["words"], idx)))
        vector.append(truth(DV.toprecedesaux(sentdict, idx)))
        vector.append(truth(DV.verbfollowsaux(sentdict, idx)))
        # TODO: added this new feature!
        vector.append(truth(DV.nextwordistoo(sentdict, idx)))
    if "my_rules" in features:
        # Each rule fires only for auxiliaries of the matching lexical class.
        vector.append(truth(aux in DV.MODALS and DV.modalcheck(sentdict, idx, tree, subtree_positions)))
        vector.append(truth(aux in DV.BE and DV.becheck(sentdict, idx, tree, subtree_positions)))
        vector.append(truth(aux in DV.HAVE and DV.havecheck(sentdict, idx, tree, subtree_positions)))
        vector.append(truth(aux in DV.DO and DV.docheck(sentdict, idx, tree, subtree_positions)))
        vector.append(truth(aux in DV.TO and DV.tocheck(sentdict, idx, tree, subtree_positions)))
        vector.append(truth(aux in DV.SO and DV.socheck(sentdict, idx, tree, subtree_positions)))
    # This adds a new layer of features by combining all of the ones I had.
    if "square_rules" in features:
        # Pairwise conjunctions of every distinct pair of existing features.
        size = len(vector)
        for i in range(0, size):
            for j in range(0, size):
                if i != j:
                    vector.append(truth(untruth(vector[i]) and untruth(vector[j])))
    if "combine_aux_type" in features:
        # Cross every accumulated feature with the auxiliary's class membership.
        bools = [aux in DV.MODALS, aux in DV.BE, aux in DV.HAVE, aux in DV.DO, aux in DV.TO, aux in DV.SO]
        # Snapshot first so the appends below don't extend the iteration.
        vec = [v for v in vector]
        for v in vec:
            for b in bools:
                vector.append(truth(untruth(v) and b))
    return vector
def counttruth(vector):
    """Return how many entries of *vector* equal the positive truth value."""
    return sum(1 for val in vector if val == truth(True))
def addauxs(self, mrgmatrix, gsdict, gs_sent_list, make_file=False):
    """Iterate every auxiliary in *mrgmatrix*, appending a gold-standard truth
    flag per auxiliary to self.aux_bools.

    A "RULE 1" recovery pass re-examines the previous sentence whenever a
    VPE-bearing sentence yielded no matched auxiliary, patching flags in
    place; unrecoverable cases increment self.missed_sentences.

    NOTE(review): indentation reconstructed from a whitespace-mangled source
    -- verify the nesting of the recovery branch.
    """
    crt_sentnum, crt_auxidx = 0, -1
    found_aux = False
    sent_has_vpe = False
    #print gsdict
    #print gs_sent_list
    while crt_sentnum < len(mrgmatrix):
        try:
            old_sentnum = crt_sentnum
            # Reassign the values for the next auxiliary, recursively.
            crt_sentnum, crt_auxidx = nextaux(mrgmatrix, crt_sentnum,
                                              crt_auxidx + 1)
            if make_file:
                self.auxiliary_names.append(
                    mrgmatrix[crt_sentnum]['words'][crt_auxidx].lower())
            # This is to check if we missed a GS aux by accident.
            if old_sentnum + 1 == crt_sentnum:
                if sent_has_vpe and not found_aux:
                    auxs = getauxs(mrgmatrix[old_sentnum])
                    # Index of this sentence's first aux within aux_bools.
                    crt_auxnum_out_of_total = len(
                        self.aux_bools) - len(auxs)
                    # NOTE(review): assumes getauxs yields (idx, aux) pairs -- confirm.
                    for idx, aux in auxs:
                        if auxindict(aux, idx, gsdict):
                            self.aux_bools[
                                crt_auxnum_out_of_total] = truth(True)
                            found_aux = True
                            print 'RULE 1 Added sentence.\n'
                        crt_auxnum_out_of_total += 1
                    if not found_aux:
                        print 'We missed the sentence below.' #sentence: %d'%old_sentnum
                        printsent(mrgmatrix, old_sentnum)
                        print dict(zip(gs_sent_list, gsdict))
                        print '*',
                        self.missed_sentences += 1
                # Reset the per-sentence state for the new sentence.
                found_aux = False
                sent_has_vpe = False
        except TypeError:
            # nextaux presumably returns a non-unpackable value (e.g. None)
            # once no auxiliaries remain -- TODO confirm.
            return
        self.nth_aux += 1
        if crt_sentnum in gs_sent_list:
            sent_has_vpe = True
            if auxandidxindict(
                    mrgmatrix[crt_sentnum]['words'][crt_auxidx], crt_auxidx,
                    gsdict
            ):  #idxindict(crt_auxidx, gsdict, gs_sent_list, crt_sentnum):
                found_aux = True
                self.aux_bools.append(truth(True))
            else:
                self.aux_bools.append(truth(False))
        else:
            self.aux_bools.append(truth(False))
def number_of_positive_auxs(self):
    """Count the auxiliaries flagged positive in self.aux_bools."""
    return sum(1 for aux_bool in self.aux_bools if aux_bool == truth(True))
def number_of_positive_auxs(self):
    """Number of entries in self.aux_bools equal to the positive truth value."""
    total = 0
    for flag in self.aux_bools:
        if flag == truth(True):
            total += 1
    return total