def set_possible_ants(self, trigger, pos_tests):
    """Generate candidate antecedents for ``trigger`` and register them on it.

    Scans every sentence from ``trigger.sentnum - SENTENCE_SEARCH_DISTANCE``
    (clamped at 0) up to and including the trigger's own sentence. Each token
    whose POS tag is accepted by one of the callable entries of ``pos_tests``
    becomes a start index ``i``; the end index ``j`` then runs from ``i`` to
    ``i + phrase_length`` (bounded by the sentence length), where
    ``phrase_length`` comes from ``nt.get_nearest_phrase`` /
    ``nt.get_phrase_length``.

    A ``(i, j)`` pair is skipped when ``ant_after_trigger`` reports it, or when
    the tag at ``j - 1`` is a preposition, punctuation or determiner. Remaining
    pairs are turned into antecedents with ``self.idxs_to_ant`` and added to
    the trigger if their ``sub_sentdict`` is non-empty.

    Args:
        trigger: object exposing ``sentnum`` and ``add_possible_ant``.
        pos_tests: collection of POS tests. Its callable entries are applied
            to each tag here; the whole collection is also forwarded to
            ``nt.get_nearest_phrase``.

    Returns:
        None. Candidates are attached to ``trigger`` as a side effect.
    """
    # The callable filter does not depend on the sentence, so it is built once
    # here instead of once per sentence.
    functions = [f for f in pos_tests if hasattr(f, '__call__')]
    end_tag_checks = (wc.is_preposition, wc.is_punctuation, wc.is_determiner)

    first_sentnum = max(0, trigger.sentnum - SENTENCE_SEARCH_DISTANCE)
    for sentnum in range(first_sentnum, trigger.sentnum + 1):
        sentence = self.sentences[sentnum]
        for i in range(len(sentence)):
            tag = sentence.pos[i]

            # TODO: ADDED SECOND CLAUSE TO THIS IF TO LOWER NUMBER OF CANDIDATES GENERATED
            # (the disabled clause was: and not wc.is_aux_lemma(sentence.lemmas[i]))
            # Membership in `True` is kept on purpose: only results that
            # compare equal to True count as a match.
            if True not in (f(tag) for f in functions):
                continue

            phrase = nt.get_nearest_phrase(
                nt.maketree(sentence['tree'][0]), i, pos_tests)
            phrase_length = nt.get_phrase_length(phrase)

            for j in range(i, min(i + phrase_length + 1, len(sentence))):
                if ant_after_trigger(sentnum, i, j, trigger):
                    continue

                # NOTE(review): when j == 0 this index is -1; if `pos` is a
                # plain sequence that reads the sentence's LAST tag. Confirm
                # this is intended.
                end_tag = sentence.pos[j - 1]
                if any(check(end_tag) for check in end_tag_checks):
                    continue

                ant = self.idxs_to_ant(sentnum, i, j, trigger)
                if len(ant.sub_sentdict) > 0:
                    trigger.add_possible_ant(ant)
def myfeaturesvector(sentdict, idx, features):
    """Build the feature vector for the auxiliary at position ``idx``.

    ``features`` selects which groups are emitted, in this order:

    * ``"my_features"``: thirteen hand-written checks from ``DV``.
    * ``"my_rules"``: one entry per auxiliary class, true only when the lemma
      belongs to that class and the class-specific check passes.
    * ``"square_rules"``: the conjunction of every ordered pair of distinct
      entries already in the vector.
    * ``"combine_aux_type"``: the conjunction of every entry already in the
      vector with each of the six auxiliary-class membership flags.

    Every value is passed through ``truth`` before being stored; ``untruth``
    is used to read stored values back when combining them.

    Args:
        sentdict: sentence mapping; ``"tree"``, ``"lemmas"`` and ``"words"``
            are read here.
        idx: index of the auxiliary inside the sentence.
        features: container of feature-group names.

    Returns:
        list: the assembled feature vector.
    """
    tree = NT.maketree(sentdict["tree"][0])
    subtree_positions = NT.get_smallest_subtree_positions(
        tree, subtree_list=NT.getsmallestsubtrees(tree))
    aux = sentdict["lemmas"][idx]
    vector = []

    if "my_features" in features:
        # Checks that share the (sentdict, idx, tree, subtree_positions) signature.
        tree_checks = (
            DV.auxccommandsverb,
            DV.auxccommandsverbthatcomesafter,
            DV.auxisccommandedbyverb,
            DV.auxislocallyccommandedbyverb,
            DV.auxlocallyccommandsverb,
            DV.isccommandedbycontinuationword,
            DV.nexttopunct,
        )
        for check in tree_checks:
            vector.append(truth(check(sentdict, idx, tree, subtree_positions)))

        vector.append(truth(DV.isfollowedbypunct(sentdict, idx, end=["."])))

        # Checks that only look at the surface words.
        for check in (DV.previouswordisasorsoorthan, DV.thesamecheck):
            vector.append(truth(check(sentdict["words"], idx)))

        # Checks that take just (sentdict, idx).
        # TODO: added this new feature! (nextwordistoo)
        for check in (DV.toprecedesaux, DV.verbfollowsaux, DV.nextwordistoo):
            vector.append(truth(check(sentdict, idx)))

    if "my_rules" in features:
        rule_table = (
            (DV.MODALS, DV.modalcheck),
            (DV.BE, DV.becheck),
            (DV.HAVE, DV.havecheck),
            (DV.DO, DV.docheck),
            (DV.TO, DV.tocheck),
            (DV.SO, DV.socheck),
        )
        for lemma_set, rule in rule_table:
            # `and` short-circuits, so a rule only runs for its own class.
            vector.append(
                truth(aux in lemma_set
                      and rule(sentdict, idx, tree, subtree_positions)))

    # This adds a new layer of features by combining all of the ones I had.
    if "square_rules" in features:
        base = list(vector)
        vector.extend(
            truth(untruth(left) and untruth(right))
            for i, left in enumerate(base)
            for j, right in enumerate(base)
            if i != j)

    if "combine_aux_type" in features:
        aux_type_flags = [
            aux in DV.MODALS, aux in DV.BE, aux in DV.HAVE,
            aux in DV.DO, aux in DV.TO, aux in DV.SO,
        ]
        snapshot = list(vector)
        vector.extend(
            truth(untruth(value) and flag)
            for value in snapshot
            for flag in aux_type_flags)

    return vector
def testmyrules(classifier, section_start, section_end):
    """Run the hand-written auxiliary rules over a section and score them.

    Auxiliaries are counted across all of ``classifier.each_sentence.sentences``.
    Those whose running count falls in ``(aux_start, aux_end]`` (the bounds
    come from ``classifier.section_split``) are evaluated with the rule
    matching their lemma class. The resulting predictions are handed to
    ``classifier.compare`` together with the gold-standard vector from
    ``classifier.getgsdata``.

    Args:
        classifier: object providing ``getgsdata``, ``section_split``,
            ``each_sentence.sentences`` and ``compare``.
        section_start: index of the first section (used to look up the lower bound).
        section_end: index of the last section (used to look up the upper bound).

    Returns:
        None. The comparison is delegated to ``classifier.compare``.
    """
    gs_vector = classifier.getgsdata(section_start, section_end)
    aux_start = classifier.section_split[section_start]
    aux_end = classifier.section_split[section_end]

    # Todo: I modified these b/c they were incorrectly written.
    # Ordered like the former elif chain: the first class containing the lemma
    # decides which rule runs; a lemma in no class contributes nothing.
    rule_table = (
        (MODALS, modalcheck),
        (BE, becheck),
        (HAVE, havecheck),
        (DO, docheck),
        (TO, tocheck),
        (SO, socheck),
    )

    my_rules_return_vector = []
    count = 0
    for sentdict in classifier.each_sentence.sentences:
        for i, word in enumerate(sentdict['lemmas']):
            if not isauxiliary(sentdict, i):
                continue

            count += 1
            if not (aux_start < count <= aux_end):
                continue

            tree = NT.maketree(sentdict['tree'][0])
            subtree_positions = NT.get_smallest_subtree_positions(tree)
            for lemma_set, rule in rule_table:
                if word in lemma_set:
                    my_rules_return_vector.append(
                        truth(rule(sentdict, i, tree, subtree_positions)))
                    break

    classifier.compare(gs_vector, my_rules_return_vector, section_start - 1,
                       verbose=False)
def set_possible_ants(self, trigger, pos_tests):
    """Collect candidate antecedents for ``trigger`` and add them to it.

    The search window is the trigger's sentence plus up to
    ``SENTENCE_SEARCH_DISTANCE`` sentences before it (never below index 0).
    Within each sentence, a token is a candidate start ``i`` when at least one
    callable entry of ``pos_tests`` returns ``True`` for its POS tag. For such
    a token the nearest phrase is looked up via ``nt.get_nearest_phrase`` and
    end indices ``j`` in ``[i, min(i + phrase_length + 1, len(sentence)))`` are
    tried.

    A pair is rejected if ``ant_after_trigger(sentnum, i, j, trigger)`` is
    true, or if the tag at ``j - 1`` is a preposition, punctuation mark or
    determiner. Otherwise ``self.idxs_to_ant`` builds the antecedent, which is
    added when its ``sub_sentdict`` is non-empty.

    Args:
        trigger: object exposing ``sentnum`` and ``add_possible_ant``.
        pos_tests: collection of POS tests; callables among them are applied
            to tags, and the full collection is passed on to
            ``nt.get_nearest_phrase``.

    Returns:
        None. The trigger is updated in place.
    """
    # Loop-invariant: previously rebuilt for every sentence in the window.
    functions = [f for f in pos_tests if hasattr(f, '__call__')]
    rejected_end_tags = (wc.is_preposition, wc.is_punctuation,
                         wc.is_determiner)

    window_start = max(0, trigger.sentnum - SENTENCE_SEARCH_DISTANCE)
    for sentnum in range(window_start, trigger.sentnum + 1):
        sentence = self.sentences[sentnum]
        for i in range(len(sentence)):
            tag = sentence.pos[i]

            # TODO: ADDED SECOND CLAUSE TO THIS IF TO LOWER NUMBER OF CANDIDATES GENERATED
            # (the disabled clause was: and not wc.is_aux_lemma(sentence.lemmas[i]))
            # `True in (...)` is preserved so that only results equal to True
            # qualify, exactly as before.
            if True not in (f(tag) for f in functions):
                continue

            phrase = nt.get_nearest_phrase(
                nt.maketree(sentence['tree'][0]), i, pos_tests)
            phrase_length = nt.get_phrase_length(phrase)

            last_end = min(i + phrase_length + 1, len(sentence))
            for j in range(i, last_end):
                if ant_after_trigger(sentnum, i, j, trigger):
                    continue

                # NOTE(review): j - 1 is -1 when j == 0, which on a plain
                # sequence selects the final tag of the sentence. Verify that
                # this wrap-around is acceptable.
                end_tag = sentence.pos[j - 1]
                if any(is_bad(end_tag) for is_bad in rejected_end_tags):
                    continue

                ant = self.idxs_to_ant(sentnum, i, j, trigger)
                if len(ant.sub_sentdict) > 0:
                    trigger.add_possible_ant(ant)
def testmyrules(classifier, section_start, section_end):
    """Evaluate the rule-based auxiliary checks against the gold standard.

    Walks every lemma of every sentence in
    ``classifier.each_sentence.sentences`` and keeps a running count of the
    auxiliaries found. For auxiliaries whose count lies in
    ``(section_split[section_start], section_split[section_end]]`` the rule of
    the first lemma class containing the word is applied and its ``truth``
    value recorded. The predictions and the gold vector are then passed to
    ``classifier.compare``.

    Args:
        classifier: object providing ``getgsdata``, ``section_split``,
            ``each_sentence.sentences`` and ``compare``.
        section_start: section index giving the exclusive lower count bound.
        section_end: section index giving the inclusive upper count bound.

    Returns:
        None.
    """
    gs_vector = classifier.getgsdata(section_start, section_end)
    lower_bound = classifier.section_split[section_start]
    upper_bound = classifier.section_split[section_end]

    # Todo: I modified these b/c they were incorrectly written.
    # Same precedence as the original elif chain; only the first hit is used.
    checks_by_class = [
        (MODALS, modalcheck),
        (BE, becheck),
        (HAVE, havecheck),
        (DO, docheck),
        (TO, tocheck),
        (SO, socheck),
    ]

    predictions = []
    seen_auxiliaries = 0
    for sentdict in classifier.each_sentence.sentences:
        lemmas = sentdict['lemmas']
        for position in range(len(lemmas)):
            if not isauxiliary(sentdict, position):
                continue

            seen_auxiliaries += 1
            in_section = lower_bound < seen_auxiliaries <= upper_bound
            if not in_section:
                continue

            lemma = lemmas[position]
            tree = NT.maketree(sentdict['tree'][0])
            subtree_positions = NT.get_smallest_subtree_positions(tree)
            for lemma_class, check in checks_by_class:
                if lemma in lemma_class:
                    outcome = check(sentdict, position, tree,
                                    subtree_positions)
                    predictions.append(truth(outcome))
                    break

    classifier.compare(gs_vector, predictions, section_start - 1,
                       verbose=False)
def get_nltk_tree(self):
    """Return the result of ``nt.maketree`` on the first entry of ``self.tree_text``."""
    tree_source = self.tree_text[0]
    return nt.maketree(tree_source)
def get_sentence_tree(self, i):
    """Return the result of ``nt.maketree`` for sentence ``i``.

    The tree source is the first entry stored under the ``'tree'`` key of
    ``self.sentences[i]``.
    """
    tree_source = self.sentences[i]['tree'][0]
    return nt.maketree(tree_source)
def alignment_comparison(trig_sentdict, ant_sentdict, ant, trigger,
                         word2vec_dict):
    """Build the alignment feature vector for one (antecedent, trigger) pair.

    Features are appended in this fixed order:

    1. Whether any auxiliary lemma of the antecedent sentence also occurs in
       the trigger sentence's lemmas.
    2. Three flags for head-after / head-at / head-before the trigger index
       (all ``0`` when the two are in different sentences).
    3. For each of ``'words'``, ``'lemmas'`` and ``'pos'``: the number of
       distinct shared context items, then ``2 * common / total``.
    4. Sentence-number difference and a word-distance value.
    5. Angle between the averaged word2vec vectors of the two context NPs
       (``90.0`` when unavailable), a zero-angle flag, then both vectors
       (zero-filled to ``WORD2VEC_LENGTH`` when empty).
    6. Angle and zero-angle flag for the text after the antecedent head versus
       the text after the trigger up to the next punctuation lemma.
    7. Sentential-complement flag.
    8. One 0/1 entry per phrase type in ``ALL_PHRASES`` found between the two
       subtrees (same sentence only), followed by their sum.

    Args:
        trig_sentdict: sentence mapping of the trigger (``'words'`` and
            ``'lemmas'`` are read).
        ant_sentdict: sentence mapping of the antecedent (``'words'``,
            ``'lemmas'`` and ``'tree'`` are read).
        ant: antecedent exposing ``get_context``, ``get_sentnum`` and
            ``get_subtree``.
        trigger: trigger exposing ``get_context``, ``get_sentnum``,
            ``get_idx`` and ``get_subtree``.
        word2vec_dict: embedding lookup forwarded to ``get_average_np_vec``.

    Returns:
        list: the feature vector.
    """
    vector = []
    ant_context_sentdict = ant.get_context()
    trig_context_sentdict = trigger.get_context()
    ant_head_idx = get_antecedent_head_index(ant_sentdict, ant)

    ant_sentnum = ant.get_sentnum()
    trig_sentnum = trigger.get_sentnum()
    trig_idx = trigger.get_idx()
    same_sentence = ant_sentnum == trig_sentnum

    # Feature 1.
    trig_lemmas = trig_sentdict['lemmas']
    shares_auxiliary = any(
        ant_sentdict['lemmas'][i] in trig_lemmas
        for i in range(len(ant_sentdict['words']))
        if DV.isauxiliary(ant_sentdict, i))
    vector.append(truth(shares_auxiliary))

    # Feature 2.
    if same_sentence:
        vector.append(truth(ant_head_idx > trig_idx))
        vector.append(truth(ant_head_idx == trig_idx))
        vector.append(truth(ant_head_idx < trig_idx))
    else:
        vector += [0, 0, 0]

    # Features 3,4,5.
    for k in ['words', 'lemmas', 'pos']:
        ant_items = ant_context_sentdict[k]
        trig_items = trig_context_sentdict[k]
        total = len(ant_items) + len(trig_items)
        common = len(set(ant_items).intersection(trig_items))
        vector.append(common)
        # Two empty contexts used to raise ZeroDivisionError here; with
        # nothing to compare the overlap ratio is reported as 0.0.
        vector.append((2.0 * float(common)) / float(total) if total else 0.0)

    # Feature 6 - number of words between trigger and antecedent.
    vector.append(ant_sentnum - trig_sentnum)
    if same_sentence:
        vector.append(ant_head_idx - trig_idx)
    else:
        # NOTE(review): the sentence-length offset is only added while the
        # trigger's sentence number is below the antecedent's; when the
        # antecedent comes first this is just ant_head_idx. Kept as is so
        # feature values do not shift - confirm whether that is intended.
        distance = ant_head_idx
        crt_sentnum = trig_sentnum
        while crt_sentnum < ant_sentnum:
            distance += len(trig_sentdict['words'])
            crt_sentnum += 1
        vector.append(distance)

    # Feature 7.
    # First we get the vecs from the Ant NP and average them.
    blank_np = False
    ant_np_word2vec = []
    ant_np_location = ant_context_sentdict['np']
    if ant_np_location != (-1, -1):
        ant_np_word2vec = get_average_np_vec(
            word2vec_dict, ant_sentdict,
            ant_np_location[0], ant_np_location[1])
    else:
        blank_np = True

    # Next we do the same for the Trigger NP.
    trig_np_word2vec = []
    trig_np_location = trig_context_sentdict['np']
    if trig_np_location != (-1, -1):
        trig_np_word2vec = get_average_np_vec(
            word2vec_dict, trig_sentdict,
            trig_np_location[0], trig_np_location[1])
    else:
        blank_np = True

    # Adding the angle of the vector between the trigger NP and antecedent NP.
    angle = 90.0  # default when either NP is missing
    if not blank_np:
        ant_length = vector_length(ant_np_word2vec)
        trig_length = vector_length(trig_np_word2vec)
        try:
            angle = angle_btwn_vectors(ant_np_word2vec, trig_np_word2vec,
                                       v1_length=ant_length,
                                       v2_length=trig_length)
        except ValueError:
            angle = 90.0
    vector.append(angle)
    vector.append(truth(angle == 0.0))

    vector += ant_np_word2vec if ant_np_word2vec else [0] * WORD2VEC_LENGTH
    vector += trig_np_word2vec if trig_np_word2vec else [0] * WORD2VEC_LENGTH

    # Now for what comes after the head. The head index computed at the top
    # is reused instead of being looked up a second time.
    ant_post_head_w2vec = get_average_np_vec(
        word2vec_dict, ant_sentdict, ant_head_idx, len(ant_sentdict['words']))

    # The trigger side stops at the first punctuation lemma at or after the
    # trigger, or at the end of the sentence if there is none.
    stop_idx = len(trig_sentdict['words'])
    for i in range(trig_idx, len(trig_sentdict['words'])):
        if DV.ispunctuation(trig_sentdict['lemmas'][i]):
            stop_idx = i
            break
    post_trig_w2vec = get_average_np_vec(
        word2vec_dict, trig_sentdict, trig_idx, stop_idx)

    # The averaged post-head vectors themselves are intentionally not added
    # to the feature vector (that code was disabled); only their angle is.
    post_angle = 90.0
    if ant_post_head_w2vec and post_trig_w2vec:
        try:
            post_angle = angle_btwn_vectors(ant_post_head_w2vec,
                                            post_trig_w2vec)
        except ValueError:
            post_angle = 90.0
    vector.append(post_angle)
    vector.append(truth(post_angle == 0.0))

    # Sentenial complement check.
    tree = NT.maketree(ant_sentdict['tree'][0])
    ant_subtree = ant.get_subtree()
    trig_subtree = trigger.get_subtree()
    if NT.dominates(tree, ant_subtree, trig_subtree):
        vector.append(
            truth(
                NT.has_phrases_between_trees(
                    ant_subtree, trig_subtree,
                    NIELSON_SENTENIAL_COMPLEMENT_PHRASES)))
    else:
        vector.append(truth(False))

    # Features to account for the number of each phrase type between the antecedent and trigger.
    if same_sentence:
        phrases_between = [
            1 if NT.has_phrases_between_trees(ant_subtree, trig_subtree,
                                              [phrase]) else 0
            for phrase in ALL_PHRASES
        ]
    else:
        phrases_between = [0 for _ in ALL_PHRASES]

    vector += phrases_between
    vector.append(sum(phrases_between))

    return vector
def alignment_comparison(trig_sentdict, ant_sentdict, ant, trigger, word2vec_dict):
    """Compute alignment features between an antecedent and a trigger.

    The returned list contains, in order:

    1. A flag: does any auxiliary lemma of the antecedent sentence appear in
       the trigger sentence's lemmas?
    2. Three flags comparing the antecedent head index with the trigger index
       (``0, 0, 0`` when they are in different sentences).
    3. For ``'words'``, ``'lemmas'`` and ``'pos'``: count of distinct shared
       context items and the ratio ``2 * common / total``.
    4. Sentence-number difference and a word-distance value.
    5. Angle between the averaged word2vec vectors of the two context NPs
       (``90.0`` if either NP is absent), a zero-angle flag, and both vectors
       (zero-filled to ``WORD2VEC_LENGTH`` when empty).
    6. Angle and zero-angle flag comparing the text after the antecedent head
       with the text after the trigger up to the next punctuation lemma.
    7. Sentential-complement flag.
    8. A 0/1 entry per phrase type in ``ALL_PHRASES`` occurring between the
       two subtrees (same sentence only) and the sum of those entries.

    Args:
        trig_sentdict: trigger sentence mapping (``'words'``, ``'lemmas'``).
        ant_sentdict: antecedent sentence mapping (``'words'``, ``'lemmas'``,
            ``'tree'``).
        ant: antecedent exposing ``get_context``, ``get_sentnum`` and
            ``get_subtree``.
        trigger: trigger exposing ``get_context``, ``get_sentnum``,
            ``get_idx`` and ``get_subtree``.
        word2vec_dict: embedding lookup forwarded to ``get_average_np_vec``.

    Returns:
        list: the feature vector.
    """
    vector = []
    ant_context_sentdict = ant.get_context()
    trig_context_sentdict = trigger.get_context()
    ant_head_idx = get_antecedent_head_index(ant_sentdict, ant)

    ant_sentnum = ant.get_sentnum()
    trig_sentnum = trigger.get_sentnum()
    trig_idx = trigger.get_idx()
    same_sentence = ant_sentnum == trig_sentnum
    trig_word_count = len(trig_sentdict['words'])

    # Feature 1.
    ant_auxs = [ant_sentdict['lemmas'][i]
                for i in range(len(ant_sentdict['words']))
                if DV.isauxiliary(ant_sentdict, i)]
    vector.append(truth(any(aux in trig_sentdict['lemmas'] for aux in ant_auxs)))

    # Feature 2.
    if same_sentence:
        vector.append(truth(ant_head_idx > trig_idx))
        vector.append(truth(ant_head_idx == trig_idx))
        vector.append(truth(ant_head_idx < trig_idx))
    else:
        vector += [0, 0, 0]

    # Features 3,4,5.
    for k in ['words', 'lemmas', 'pos']:
        total = len(ant_context_sentdict[k]) + len(trig_context_sentdict[k])
        common = len(set(ant_context_sentdict[k]).intersection(trig_context_sentdict[k]))
        vector.append(common)
        if total:
            vector.append((2.0 * float(common)) / float(total))
        else:
            # Both contexts are empty for this key: dividing would raise
            # ZeroDivisionError, so report no overlap instead.
            vector.append(0.0)

    # Feature 6 - number of words between trigger and antecedent.
    vector.append(ant_sentnum - trig_sentnum)
    if same_sentence:
        vector.append(ant_head_idx - trig_idx)
    else:
        # NOTE(review): the per-sentence offset is only accumulated while the
        # trigger's sentence number is lower than the antecedent's, so for an
        # antecedent in an earlier sentence the value is ant_head_idx alone.
        # Left unchanged to keep feature values stable - please confirm.
        crt_sentnum = trig_sentnum
        distance = ant_head_idx
        while crt_sentnum < ant_sentnum:
            distance += trig_word_count
            crt_sentnum += 1
        vector.append(distance)

    # Feature 7.
    # First we get the vecs from the Ant NP and average them.
    blank_np = False
    ant_np_word2vec = []
    ant_np_location = ant_context_sentdict['np']
    if ant_np_location != (-1, -1):
        ant_np_word2vec = get_average_np_vec(word2vec_dict, ant_sentdict,
                                             ant_np_location[0], ant_np_location[1])
    else:
        blank_np = True

    # Next we do the same for the Trigger NP.
    trig_np_word2vec = []
    trig_np_location = trig_context_sentdict['np']
    if trig_np_location != (-1, -1):
        trig_np_word2vec = get_average_np_vec(word2vec_dict, trig_sentdict,
                                              trig_np_location[0], trig_np_location[1])
    else:
        blank_np = True

    # Adding the angle of the vector between the trigger NP and antecedent NP.
    if blank_np:
        angle = 90.0
    else:
        ant_length = vector_length(ant_np_word2vec)
        trig_length = vector_length(trig_np_word2vec)
        try:
            angle = angle_btwn_vectors(ant_np_word2vec, trig_np_word2vec,
                                       v1_length=ant_length, v2_length=trig_length)
        except ValueError:
            angle = 90.0
    vector.append(angle)
    vector.append(truth(angle == 0.0))

    if not ant_np_word2vec:
        vector += [0] * WORD2VEC_LENGTH
    else:
        vector += ant_np_word2vec

    if not trig_np_word2vec:
        vector += [0] * WORD2VEC_LENGTH
    else:
        vector += trig_np_word2vec

    # Now for what comes after the head. ant_head_idx from the top of the
    # function is reused; it was previously recomputed here.
    ant_post_head_w2vec = get_average_np_vec(word2vec_dict, ant_sentdict,
                                             ant_head_idx, len(ant_sentdict['words']))

    # Stop the trigger-side span at the first punctuation lemma at or after
    # the trigger; without one it runs to the end of the sentence.
    stop_idx = trig_word_count
    for i in range(trig_idx, trig_word_count):
        if DV.ispunctuation(trig_sentdict['lemmas'][i]):
            stop_idx = i
            break
    post_trig_w2vec = get_average_np_vec(word2vec_dict, trig_sentdict, trig_idx, stop_idx)

    # Only the angle between the two post-head vectors is used as a feature;
    # appending the vectors themselves was disabled in the original code.
    if ant_post_head_w2vec and post_trig_w2vec:
        try:
            post_angle = angle_btwn_vectors(ant_post_head_w2vec, post_trig_w2vec)
        except ValueError:
            post_angle = 90.0
    else:
        post_angle = 90.0
    vector.append(post_angle)
    vector.append(truth(post_angle == 0.0))

    # Sentenial complement check.
    tree = NT.maketree(ant_sentdict['tree'][0])
    ant_subtree = ant.get_subtree()
    trig_subtree = trigger.get_subtree()
    if NT.dominates(tree, ant_subtree, trig_subtree):
        vector.append(truth(
            NT.has_phrases_between_trees(ant_subtree, trig_subtree,
                                         NIELSON_SENTENIAL_COMPLEMENT_PHRASES)))
    else:
        vector.append(truth(False))

    # Features to account for the number of each phrase type between the antecedent and trigger.
    phrases_between = [0 for _ in ALL_PHRASES]
    if same_sentence:
        for i, phrase in enumerate(ALL_PHRASES):
            if NT.has_phrases_between_trees(ant_subtree, trig_subtree, [phrase]):
                phrases_between[i] += 1

    vector += phrases_between
    vector.append(sum(phrases_between))

    return vector
def myfeaturesvector(sentdict, idx, features):
    """Assemble the feature vector for the auxiliary at index ``idx``.

    Groups are emitted in a fixed order, each only if its name is present in
    ``features``:

    * ``'my_features'``: thirteen individual checks from ``DV``.
    * ``'my_rules'``: six entries, one per auxiliary class; each is true only
      if the lemma is in that class and the class check succeeds.
    * ``'square_rules'``: for every ordered pair of distinct positions in the
      vector built so far, the conjunction of the two stored values.
    * ``'combine_aux_type'``: for every value in the vector built so far, its
      conjunction with each of the six class-membership flags.

    Values are stored via ``truth`` and read back via ``untruth``.

    Args:
        sentdict: sentence mapping; ``'tree'``, ``'lemmas'`` and ``'words'``
            are read here.
        idx: position of the auxiliary in the sentence.
        features: container of feature-group names.

    Returns:
        list: the assembled feature vector.
    """
    tree = NT.maketree(sentdict['tree'][0])
    subtrees = NT.getsmallestsubtrees(tree)
    subtree_positions = NT.get_smallest_subtree_positions(
        tree, subtree_list=subtrees)
    aux = sentdict['lemmas'][idx]

    def tree_feature(check):
        # Shared call shape for checks that need the parse tree.
        return truth(check(sentdict, idx, tree, subtree_positions))

    vector = []

    if 'my_features' in features:
        vector.extend(
            tree_feature(check) for check in (
                DV.auxccommandsverb,
                DV.auxccommandsverbthatcomesafter,
                DV.auxisccommandedbyverb,
                DV.auxislocallyccommandedbyverb,
                DV.auxlocallyccommandsverb,
                DV.isccommandedbycontinuationword,
                DV.nexttopunct,
            ))
        vector.append(truth(DV.isfollowedbypunct(sentdict, idx, end=['.'])))
        vector.append(
            truth(DV.previouswordisasorsoorthan(sentdict['words'], idx)))
        vector.append(truth(DV.thesamecheck(sentdict['words'], idx)))
        vector.append(truth(DV.toprecedesaux(sentdict, idx)))
        vector.append(truth(DV.verbfollowsaux(sentdict, idx)))
        # TODO: added this new feature!
        vector.append(truth(DV.nextwordistoo(sentdict, idx)))

    if 'my_rules' in features:
        rules_by_class = (
            (DV.MODALS, DV.modalcheck),
            (DV.BE, DV.becheck),
            (DV.HAVE, DV.havecheck),
            (DV.DO, DV.docheck),
            (DV.TO, DV.tocheck),
            (DV.SO, DV.socheck),
        )
        for lemma_class, rule in rules_by_class:
            # The rule is only invoked when the lemma belongs to its class.
            vector.append(
                truth(aux in lemma_class
                      and rule(sentdict, idx, tree, subtree_positions)))

    # This adds a new layer of features by combining all of the ones I had.
    if 'square_rules' in features:
        existing = list(vector)
        for i, first in enumerate(existing):
            for j, second in enumerate(existing):
                if i == j:
                    continue
                vector.append(truth(untruth(first) and untruth(second)))

    if 'combine_aux_type' in features:
        bools = [
            aux in DV.MODALS, aux in DV.BE, aux in DV.HAVE, aux in DV.DO,
            aux in DV.TO, aux in DV.SO
        ]
        for value in list(vector):
            vector.extend(truth(untruth(value) and flag) for flag in bools)

    return vector