def nearest_trig_np(trig, sentences, all_tags, get_words=False):
    """
    Describe the noun phrase closest to a trigger within the trigger's sentence.

    Each NP / NP-PRD subtree of the sentence tree is scored by the absolute
    distance, in word positions, between its last word and the trigger word.
    The first subtree with the smallest distance (strictly below 99) is
    selected; if none qualifies, the whole sentence tree is used instead.

    @type trig: vpe_objects.Auxiliary
    @type sentences: vpe_objects.AllSentences
    @param all_tags: forwarded unchanged to encode_pos_tag_frequencies.
    @param get_words: when true, return the leaves of the selected subtree
        (or [] if it has no leaves() method) instead of the encoded tags.
    @return: closest_np.leaves() when get_words is true, otherwise the result
        of encode_pos_tag_frequencies over the labels of the selected
        subtree's smallest subtrees.
    """
    t = sentences.get_sentence_tree(trig.sentnum)
    tree_tuples = nt.pos_word_tuples(t)
    all_nps = nt.find_subtree_phrases(t, ['NP', 'NP-PRD'])

    # NOTE: list.index returns the first match, so a repeated (pos, word) pair
    # resolves to its earliest position in the sentence.
    trig_tup_idx = tree_tuples.index((trig.pos, trig.word))

    closest_np_value = 99
    closest_np = None
    for NP in all_nps:
        last_np_word_idx = tree_tuples.index(nt.pos_word_tuples(NP)[-1])
        distance = abs(trig_tup_idx - last_np_word_idx)
        if distance < closest_np_value:
            # Keep the absolute distance. Storing the signed difference made
            # the threshold negative for an NP ending after the trigger, after
            # which no later (possibly closer) NP could ever be selected.
            closest_np_value = distance
            closest_np = NP

    if closest_np is None:
        closest_np = t

    if get_words:
        try:
            return closest_np.leaves()
        except AttributeError:
            return []

    np_pos = [subtree.label() for subtree in nt.getsmallestsubtrees(closest_np)]
    return encode_pos_tag_frequencies(np_pos, all_tags)
def nexttopunct(sentdict, auxidx, t, word_positions_in_tree):
    """
    Tell whether the auxiliary at auxidx is followed by punctuation.

    Returns True when the POS tag right after the auxiliary is a period,
    comma, dash or colon, or when the next lemma is 'not' and the tag after
    it is such punctuation. Returns False otherwise, including when either
    lookahead position lies past the end of the sentence.
    """
    # The local structure is still built, although the c-command filter that
    # used it is disabled and the result is not consulted.
    aux_subtree = t[word_positions_in_tree[auxidx-1]]
    nltktree.getsmallestsubtrees(
        nltktree.generate_local_structure_from_subtree(t, aux_subtree))

    def is_boundary(tag):
        return isperiod(tag) or iscomma(tag) or isdashorcolon(tag)

    # Auxiliary directly followed by punctuation.
    try:
        following_tag = sentdict['pos'][auxidx+1]
        if is_boundary(following_tag):
            return True
    except IndexError:
        return False

    # Auxiliary followed by 'not' and then punctuation.
    try:
        second_tag = sentdict['pos'][auxidx+2]
        negated = sentdict['lemmas'][auxidx+1] == 'not'
        if negated and is_boundary(second_tag):
            return True
    except IndexError:
        return False

    return False
def nearest_trig_np(trig, sentences, all_tags, get_words=False):
    """
    Describe the noun phrase closest to a trigger within the trigger's sentence.

    Each NP / NP-PRD subtree of the sentence tree is scored by the absolute
    distance, in word positions, between its last word and the trigger word.
    The first subtree with the smallest distance (strictly below 99) is
    selected; if none qualifies, the whole sentence tree is used instead.

    @type trig: vpe_objects.Auxiliary
    @type sentences: vpe_objects.AllSentences
    @param all_tags: forwarded unchanged to encode_pos_tag_frequencies.
    @param get_words: when true, return the leaves of the selected subtree
        (or [] if it has no leaves() method) instead of the encoded tags.
    @return: closest_np.leaves() when get_words is true, otherwise the result
        of encode_pos_tag_frequencies over the labels of the selected
        subtree's smallest subtrees.
    """
    t = sentences.get_sentence_tree(trig.sentnum)
    tree_tuples = nt.pos_word_tuples(t)
    all_nps = nt.find_subtree_phrases(t, ['NP', 'NP-PRD'])

    # NOTE: list.index returns the first match, so a repeated (pos, word) pair
    # resolves to its earliest position in the sentence.
    trig_tup_idx = tree_tuples.index((trig.pos, trig.word))

    closest_np_value = 99
    closest_np = None
    for NP in all_nps:
        last_np_word_idx = tree_tuples.index(nt.pos_word_tuples(NP)[-1])
        distance = abs(trig_tup_idx - last_np_word_idx)
        if distance < closest_np_value:
            # Keep the absolute distance. Storing the signed difference made
            # the threshold negative for an NP ending after the trigger, after
            # which no later (possibly closer) NP could ever be selected.
            closest_np_value = distance
            closest_np = NP

    if closest_np is None:
        closest_np = t

    if get_words:
        try:
            return closest_np.leaves()
        except AttributeError:
            return []

    np_pos = [subtree.label() for subtree in nt.getsmallestsubtrees(closest_np)]
    return encode_pos_tag_frequencies(np_pos, all_tags)
def nearest_ant_np(ant, sentences, all_tags, get_words=False):
    """
    Describe the noun phrase closest to an antecedent within its sentence.

    The antecedent is located through the (pos, word) pair in the middle of
    its sub_sentdict. When the antecedent and its trigger are in different
    sentences, the reference position is the end of the sentence instead.
    Each NP / NP-PRD subtree is scored by the absolute distance, in word
    positions, between its last word and that reference position; the first
    subtree with the smallest distance (strictly below 99) is selected.

    @type ant: vpe_objects.Antecedent
    @type sentences: vpe_objects.AllSentences
    @param all_tags: forwarded unchanged to encode_pos_tag_frequencies.
    @param get_words: when true, return the leaves of the selected subtree,
        or [] when nothing was selected.
    @return: closest_np.leaves() when get_words is true, otherwise the result
        of encode_pos_tag_frequencies over the labels of the selected
        subtree's smallest subtrees (an empty label list if none was found).
    """
    t = sentences.get_sentence_tree(ant.sentnum)
    tree_tuples = nt.pos_word_tuples(t)
    all_nps = nt.find_subtree_phrases(t, ['NP', 'NP-PRD'])

    # Floor division keeps the index an int; plain `/` yields a float under
    # Python 3 and cannot be used to index a sequence.
    middle = len(ant.sub_sentdict) // 2
    ant_tup = (ant.sub_sentdict.pos[middle], ant.sub_sentdict.words[middle])

    if ant.sentnum != ant.trigger.sentnum:
        ant_tup_idx = len(tree_tuples)
    else:
        # NOTE: list.index returns the first match, so a repeated (pos, word)
        # pair resolves to its earliest position in the sentence.
        ant_tup_idx = tree_tuples.index(ant_tup)

    closest_np_value = 99
    closest_np = None
    for NP in all_nps:
        last_np_word_idx = tree_tuples.index(nt.pos_word_tuples(NP)[-1])
        distance = abs(ant_tup_idx - last_np_word_idx)
        if distance < closest_np_value:
            # Keep the absolute distance. Storing the signed difference made
            # the threshold negative for an NP ending after the antecedent,
            # after which no later (possibly closer) NP could be selected.
            closest_np_value = distance
            closest_np = NP

    if get_words:
        try:
            return closest_np.leaves()
        except AttributeError:
            # closest_np is None when no noun phrase qualified.
            return []

    try:
        np_pos = [subtree.label() for subtree in nt.getsmallestsubtrees(closest_np)]
    except AttributeError:
        np_pos = []

    return encode_pos_tag_frequencies(np_pos, all_tags)
def do_rule(sentdict, aux, tree, word_positions_in_tree):
    """
    Rule-based decision for the auxiliary `aux`; returns True or False.

    NOTE(review): this block was reconstructed from a source whose indentation
    was lost. Everything from the `to_precedes_aux` test down to the
    preposition test is assumed to sit inside the
    `if not aux_locally_ccommanded_by_verb(...)` branch -- confirm against the
    original layout.

    Checks, in order:
      1. True when the lemma right after the auxiliary is 'that'.
      2. Only when `aux_locally_ccommanded_by_verb` is false:
         - False when `to_precedes_aux` holds;
         - True when the next POS tag is a period, comma, dash or colon and
           no other verb in the local structure c-commands the auxiliary
           (per `nt.ccommands`);
         - True when the next lemma is 'not', the tag after it is such
           punctuation, and no verb in the local structure c-commands the
           auxiliary;
         - True when `is_ccommanded_by_continuation_word` holds;
         - False when `verb_follows_aux` holds;
         - True when the next POS tag is a preposition and the auxiliary's
           word is not 'done'.
      3. False otherwise.

    An IndexError raised inside any of the guarded lookahead checks is
    swallowed and that check is skipped.
    """
    auxidx = aux.wordnum

    try:
        if sentdict.lemmas[auxidx+1] == 'that':
            return True
    except IndexError:
        pass

    if not aux_locally_ccommanded_by_verb(sentdict, aux, tree, word_positions_in_tree):
        if to_precedes_aux(sentdict, aux):
            return False

        # Local structure around the auxiliary's subtree and its word-level subtrees.
        localt = nt.generate_local_structure_from_subtree(tree, tree[word_positions_in_tree[auxidx-1]])
        local_word_subtrees = nt.getsmallestsubtrees(localt)

        # Auxiliary directly followed by punctuation.
        try:
            checkpuncttag = sentdict.pos[auxidx+1]
            if is_period(checkpuncttag) or is_comma(checkpuncttag) or is_dash_or_colon(checkpuncttag):
                endbool = True
                # A different verb c-commanding the auxiliary vetoes the rule.
                for subtree in local_word_subtrees:
                    if is_verb(subtree.label()) and subtree != tree[word_positions_in_tree[auxidx-1]]:
                        if nt.ccommands(localt, subtree, tree[word_positions_in_tree[auxidx-1]]):
                            endbool = False
                            break
                if endbool:
                    return endbool
        except IndexError:
            pass

        # Don't at the end of sentence: auxiliary, then 'not', then punctuation.
        try:
            checkpuncttag = sentdict.pos[auxidx+2]
            if sentdict.lemmas[auxidx+1] == 'not' and (is_period(checkpuncttag) or is_comma(checkpuncttag) or is_dash_or_colon(checkpuncttag)):
                endbool = True
                # Unlike the check above, the auxiliary's own subtree is not excluded here.
                for subtree in local_word_subtrees:
                    if is_verb(subtree.label()):
                        if nt.ccommands(localt, subtree, tree[word_positions_in_tree[auxidx-1]]):
                            endbool = False
                            break
                if endbool:
                    return endbool
        except IndexError:
            pass

        if is_ccommanded_by_continuation_word(sentdict ,aux, tree, word_positions_in_tree):
            return True

        if verb_follows_aux(sentdict, aux):
            return False

        try:
            if is_preposition(sentdict.pos[auxidx+1]) and sentdict.words[auxidx] != 'done':
                return True
        except IndexError:
            pass

    return False
def auxccommandsverb(sentdict, auxidx, t, word_positions_in_tree):
    """
    Return True if the auxiliary's subtree c-commands some verb subtree of t.

    Every smallest subtree of t whose label passes isverb is tested with
    nltktree.ccommands(t, <aux subtree>, <verb subtree>). An IndexError raised
    while testing one candidate just skips that candidate.
    """
    def commanded_by_aux(candidate):
        try:
            return bool(nltktree.ccommands(t, t[word_positions_in_tree[auxidx-1]], candidate))
        except IndexError:
            return False

    return any(
        isverb(leaf.label()) and commanded_by_aux(leaf)
        for leaf in nltktree.getsmallestsubtrees(t)
    )
def aux_ccommanded_by_verb(sentdict, aux, tree, word_positions_in_tree):
    """
    Return True if some verb subtree of tree c-commands the auxiliary's subtree.

    Every smallest subtree whose label passes is_verb is tested with
    nt.ccommands(tree, <verb subtree>, <aux subtree>). An IndexError raised
    while testing one candidate just skips that candidate.
    """
    for candidate in nt.getsmallestsubtrees(tree):
        if not is_verb(candidate.label()):
            continue
        try:
            commands_aux = nt.ccommands(tree, candidate, tree[word_positions_in_tree[aux.wordnum-1]])
        except IndexError:
            continue
        if commands_aux:
            return True
    return False
def auxlocallyccommandsverb(sentdict, auxidx, t, word_positions_in_tree):
    """
    Return True if, within the local structure around the auxiliary, the
    auxiliary c-commands a verb that does not c-command it back.

    Any IndexError raised while building or scanning the local structure
    results in False.
    """
    def aux_node():
        # Looked up afresh on every use, as the lookup itself may raise.
        return t[word_positions_in_tree[auxidx-1]]

    try:
        localt = nltktree.generate_local_structure_from_subtree(t, aux_node())
        for leaf in nltktree.getsmallestsubtrees(localt):
            if not isverb(leaf.label()):
                continue
            if not nltktree.ccommands(localt, aux_node(), leaf):
                continue
            if not nltktree.ccommands(localt, leaf, aux_node()):
                return True
    except IndexError:
        pass
    return False
def aux_locally_ccommands_verb(sentdict, aux, tree, word_positions_in_tree):
    """
    Return True if, within the local structure around the auxiliary, the
    auxiliary c-commands a verb that does not c-command it back.

    Any IndexError raised while building or scanning the local structure
    results in False.
    """
    try:
        localt = nt.generate_local_structure_from_subtree(
            tree, tree[word_positions_in_tree[aux.wordnum-1]])
        return any(
            is_verb(leaf.label())
            and nt.ccommands(localt, tree[word_positions_in_tree[aux.wordnum-1]], leaf)
            and not nt.ccommands(localt, leaf, tree[word_positions_in_tree[aux.wordnum-1]])
            for leaf in nt.getsmallestsubtrees(localt)
        )
    except IndexError:
        return False
def myfeaturesvector(sentdict, idx, features):
    """
    Build the hand-crafted feature vector for the auxiliary at position idx.

    `features` selects which groups are produced, always in this order:
    "my_features" (individual DV predicates), "my_rules" (one per-auxiliary
    rule per auxiliary class), "square_rules" (conjunction of every ordered
    pair of distinct entries produced so far) and "combine_aux_type"
    (conjunction of every entry produced so far with each auxiliary-class
    membership flag). Every appended entry is passed through truth().
    """
    tree = NT.maketree(sentdict["tree"][0])
    subtree_positions = NT.get_smallest_subtree_positions(
        tree, subtree_list=NT.getsmallestsubtrees(tree))
    aux = sentdict["lemmas"][idx]
    tree_args = (sentdict, idx, tree, subtree_positions)

    vector = []

    def add(value):
        vector.append(truth(value))

    if "my_features" in features:
        add(DV.auxccommandsverb(*tree_args))
        add(DV.auxccommandsverbthatcomesafter(*tree_args))
        add(DV.auxisccommandedbyverb(*tree_args))
        add(DV.auxislocallyccommandedbyverb(*tree_args))
        add(DV.auxlocallyccommandsverb(*tree_args))
        add(DV.isccommandedbycontinuationword(*tree_args))
        add(DV.nexttopunct(*tree_args))
        add(DV.isfollowedbypunct(sentdict, idx, end=["."]))
        add(DV.previouswordisasorsoorthan(sentdict["words"], idx))
        add(DV.thesamecheck(sentdict["words"], idx))
        add(DV.toprecedesaux(sentdict, idx))
        add(DV.verbfollowsaux(sentdict, idx))
        # TODO: added this new feature!
        add(DV.nextwordistoo(sentdict, idx))

    if "my_rules" in features:
        # A rule is only evaluated when the lemma belongs to its class.
        add(aux in DV.MODALS and DV.modalcheck(*tree_args))
        add(aux in DV.BE and DV.becheck(*tree_args))
        add(aux in DV.HAVE and DV.havecheck(*tree_args))
        add(aux in DV.DO and DV.docheck(*tree_args))
        add(aux in DV.TO and DV.tocheck(*tree_args))
        add(aux in DV.SO and DV.socheck(*tree_args))

    # This adds a new layer of features by combining all of the ones I had.
    if "square_rules" in features:
        existing = list(vector)
        for i, left in enumerate(existing):
            for j, right in enumerate(existing):
                if i == j:
                    continue
                add(untruth(left) and untruth(right))

    if "combine_aux_type" in features:
        aux_flags = [aux in DV.MODALS, aux in DV.BE, aux in DV.HAVE,
                     aux in DV.DO, aux in DV.TO, aux in DV.SO]
        # Snapshot first: the vector grows while the combinations are added.
        for value in list(vector):
            for flag in aux_flags:
                add(untruth(value) and flag)

    return vector
def nearest_ant_np(ant, sentences, all_tags, get_words=False):
    """
    Describe the noun phrase closest to an antecedent within its sentence.

    The antecedent is located through the (pos, word) pair in the middle of
    its sub_sentdict. When the antecedent and its trigger are in different
    sentences, the reference position is the end of the sentence instead.
    Each NP / NP-PRD subtree is scored by the absolute distance, in word
    positions, between its last word and that reference position; the first
    subtree with the smallest distance (strictly below 99) is selected.

    @type ant: vpe_objects.Antecedent
    @type sentences: vpe_objects.AllSentences
    @param all_tags: forwarded unchanged to encode_pos_tag_frequencies.
    @param get_words: when true, return the leaves of the selected subtree,
        or [] when nothing was selected.
    @return: closest_np.leaves() when get_words is true, otherwise the result
        of encode_pos_tag_frequencies over the labels of the selected
        subtree's smallest subtrees (an empty label list if none was found).
    """
    t = sentences.get_sentence_tree(ant.sentnum)
    tree_tuples = nt.pos_word_tuples(t)
    all_nps = nt.find_subtree_phrases(t, ['NP', 'NP-PRD'])

    # Floor division keeps the index an int; plain `/` yields a float under
    # Python 3 and cannot be used to index a sequence.
    middle = len(ant.sub_sentdict) // 2
    ant_tup = (ant.sub_sentdict.pos[middle], ant.sub_sentdict.words[middle])

    if ant.sentnum != ant.trigger.sentnum:
        ant_tup_idx = len(tree_tuples)
    else:
        # NOTE: list.index returns the first match, so a repeated (pos, word)
        # pair resolves to its earliest position in the sentence.
        ant_tup_idx = tree_tuples.index(ant_tup)

    closest_np_value = 99
    closest_np = None
    for NP in all_nps:
        last_np_word_idx = tree_tuples.index(nt.pos_word_tuples(NP)[-1])
        distance = abs(ant_tup_idx - last_np_word_idx)
        if distance < closest_np_value:
            # Keep the absolute distance. Storing the signed difference made
            # the threshold negative for an NP ending after the antecedent,
            # after which no later (possibly closer) NP could be selected.
            closest_np_value = distance
            closest_np = NP

    if get_words:
        try:
            return closest_np.leaves()
        except AttributeError:
            # closest_np is None when no noun phrase qualified.
            return []

    try:
        np_pos = [subtree.label() for subtree in nt.getsmallestsubtrees(closest_np)]
    except AttributeError:
        np_pos = []

    return encode_pos_tag_frequencies(np_pos, all_tags)
def myfeaturesvector(sentdict, idx, features):
    """
    Build the hand-crafted feature vector for the auxiliary at position idx.

    `features` selects which groups are produced, always in this order:
    'my_features' (individual DV predicates), 'my_rules' (one per-auxiliary
    rule per auxiliary class), 'square_rules' (conjunction of every ordered
    pair of distinct entries produced so far) and 'combine_aux_type'
    (conjunction of every entry produced so far with each auxiliary-class
    membership flag). Every appended entry is passed through truth().
    """
    tree = NT.maketree(sentdict['tree'][0])
    smallest = NT.getsmallestsubtrees(tree)
    subtree_positions = NT.get_smallest_subtree_positions(
        tree, subtree_list=smallest)
    aux = sentdict['lemmas'][idx]
    words = sentdict['words']

    vector = []

    if 'my_features' in features:
        vector.extend([
            truth(DV.auxccommandsverb(sentdict, idx, tree, subtree_positions)),
            truth(DV.auxccommandsverbthatcomesafter(sentdict, idx, tree, subtree_positions)),
            truth(DV.auxisccommandedbyverb(sentdict, idx, tree, subtree_positions)),
            truth(DV.auxislocallyccommandedbyverb(sentdict, idx, tree, subtree_positions)),
            truth(DV.auxlocallyccommandsverb(sentdict, idx, tree, subtree_positions)),
            truth(DV.isccommandedbycontinuationword(sentdict, idx, tree, subtree_positions)),
            truth(DV.nexttopunct(sentdict, idx, tree, subtree_positions)),
            truth(DV.isfollowedbypunct(sentdict, idx, end=['.'])),
            truth(DV.previouswordisasorsoorthan(words, idx)),
            truth(DV.thesamecheck(words, idx)),
            truth(DV.toprecedesaux(sentdict, idx)),
            truth(DV.verbfollowsaux(sentdict, idx)),
            # TODO: added this new feature!
            truth(DV.nextwordistoo(sentdict, idx)),
        ])

    if 'my_rules' in features:
        # A rule is only evaluated when the lemma belongs to its class.
        vector.extend([
            truth(aux in DV.MODALS and DV.modalcheck(sentdict, idx, tree, subtree_positions)),
            truth(aux in DV.BE and DV.becheck(sentdict, idx, tree, subtree_positions)),
            truth(aux in DV.HAVE and DV.havecheck(sentdict, idx, tree, subtree_positions)),
            truth(aux in DV.DO and DV.docheck(sentdict, idx, tree, subtree_positions)),
            truth(aux in DV.TO and DV.tocheck(sentdict, idx, tree, subtree_positions)),
            truth(aux in DV.SO and DV.socheck(sentdict, idx, tree, subtree_positions)),
        ])

    # This adds a new layer of features by combining all of the ones I had.
    if 'square_rules' in features:
        size = len(vector)
        # The pairs are fully computed before the vector is extended.
        vector.extend([
            truth(untruth(vector[i]) and untruth(vector[j]))
            for i in range(size)
            for j in range(size)
            if i != j
        ])

    if 'combine_aux_type' in features:
        bools = [
            aux in DV.MODALS, aux in DV.BE, aux in DV.HAVE,
            aux in DV.DO, aux in DV.TO, aux in DV.SO,
        ]
        vector.extend([
            truth(untruth(v) and b)
            for v in list(vector)
            for b in bools
        ])

    return vector
def docheck(sentdict, auxidx, t, word_positions_in_tree, verbose=False):
    """
    Rule-based check for the 'do' auxiliary at position auxidx; returns a bool.

    NOTE(review): this block was reconstructed from a source whose indentation
    was lost. Everything from the `toprecedesaux` test down to the preposition
    test is assumed to sit inside the
    `if not auxislocallyccommandedbyverb(...)` branch -- confirm against the
    original layout.

    Checks, in order:
      1. True when the lemma right after the auxiliary is 'that'.
      2. Only when `auxislocallyccommandedbyverb` is false:
         - False when `toprecedesaux` holds;
         - True when the next POS tag is a period, comma, dash or colon and
           no other verb in the local structure c-commands the auxiliary
           (per `nltktree.ccommands`);
         - True when the next lemma is 'not', the tag after it is such
           punctuation, and no verb in the local structure c-commands the
           auxiliary;
         - True when `isccommandedbycontinuationword` holds;
         - False when `verbfollowsaux` holds;
         - True when the next POS tag is a preposition and the auxiliary's
           word is not 'done'.
      3. False otherwise.

    sentdict is indexed by the keys 'lemmas', 'pos' and 'words'.
    verbose is accepted but not used.
    """
    # We DO NOT want to consider 'do so' or 'do the same' sentences here!
    # NOTE: an explicit filter for those cases (and for 'do' next to another
    # 'do', with or without 'not') used to sit here and is disabled.

    try:
        if sentdict['lemmas'][auxidx+1] == 'that':
            return True
    except IndexError:
        pass

    if not auxislocallyccommandedbyverb(sentdict, auxidx, t, word_positions_in_tree):
        # NOTE: a disabled earlier variant returned False when 'do' both
        # locally c-commanded a verb and was locally c-commanded by one.
        if toprecedesaux(sentdict, auxidx):
            return False

        localt = nltktree.generate_local_structure_from_subtree(t, t[word_positions_in_tree[auxidx-1]])
        local_word_subtrees = nltktree.getsmallestsubtrees(localt)

        # The auxiliary may be the last token. Read the following tag once,
        # guarded, instead of letting the lookups below raise IndexError.
        try:
            next_tag = sentdict['pos'][auxidx+1]
        except IndexError:
            next_tag = None

        # Do at the end of sentence.
        if next_tag is not None and (isperiod(next_tag) or iscomma(next_tag) or isdashorcolon(next_tag)):
            endbool = True
            # A different verb c-commanding the auxiliary vetoes the rule.
            for subtree in local_word_subtrees:
                if isverb(subtree.label()) and subtree != t[word_positions_in_tree[auxidx-1]]:
                    if nltktree.ccommands(localt, subtree, t[word_positions_in_tree[auxidx-1]]):
                        endbool = False
                        break
            if endbool:
                return endbool

        # Don't at the end of sentence.
        try:
            checkpuncttag = sentdict['pos'][auxidx+2]
            if sentdict['lemmas'][auxidx+1] == 'not' and (isperiod(checkpuncttag) or iscomma(checkpuncttag) or isdashorcolon(checkpuncttag)):
                endbool = True
                # Unlike the check above, the auxiliary's own subtree is not excluded here.
                for subtree in local_word_subtrees:
                    if isverb(subtree.label()):
                        if nltktree.ccommands(localt, subtree, t[word_positions_in_tree[auxidx-1]]):
                            endbool = False
                            break
                if endbool:
                    return endbool
        except IndexError:
            pass

        # NOTE: a disabled experiment returned True when exactly one verb,
        # noun or preposition occurred in the local structure: small increase
        # in recall, decrease in precision.

        if isccommandedbycontinuationword(sentdict, auxidx, t, word_positions_in_tree):
            # NOTE: disabled extra conditions here traded 8% recall for 4%
            # precision.
            return True

        if verbfollowsaux(sentdict, auxidx):
            return False

        if next_tag is not None and isprep(next_tag) and sentdict['words'][auxidx] != 'done':
            return True

    return False