Ejemplo n.º 1
0
    def gold_analysis(self):
        print 'Triggers:',
        freq = {}
        for trig in self.train_triggers + self.test_triggers + self.val_triggers:
            if not freq.has_key(trig.type):
                freq[trig.type] = 1
            else:
                freq[trig.type] += 1
        print freq
        print 'Total:', sum(freq.values())

        d = {'starts_with_aux': [], 'ends_with_aux': []}
        all_pos = set()
        for s in self.sentences:
            for p in s.pos:
                all_pos.add(p)

        ant_start_pos = set()
        for trig in self.train_triggers:
            if wc.is_aux_lemma(trig.gold_ant.sub_sentdict.words[0]):
                d['starts_with_aux'].append(trig.gold_ant)
            ant_start_pos.add(trig.gold_ant.sub_sentdict.pos[0])

        ant_end_pos = set()
        for trig in self.train_triggers:
            if wc.is_aux_lemma(trig.gold_ant.sub_sentdict.words[-1]):
                d['ends_with_aux'].append(trig.gold_ant)
            ant_start_pos.add(trig.gold_ant.sub_sentdict.pos[-1])

        print 'Ants never start with these tags: ', all_pos - ant_start_pos
        print 'Percent of ants that start with auxs: ', len(d['starts_with_aux']) / float(len(self.train_triggers))

        print 'Ants never END with these tags: ', all_pos - ant_end_pos
        print 'Percent of ants that END with auxs: ', len(d['ends_with_aux']) / float(len(self.train_triggers))
Ejemplo n.º 2
0
    def get_head(self, idx=False, idx_in_subsentdict=False):
        """
            @type return: str
        """
        for i in range(len(self.sub_sentdict)):
            if wc.is_verb(self.sub_sentdict.pos[i]) and not wc.is_aux_lemma(self.sub_sentdict.lemmas[i]):
                if idx:
                    if idx_in_subsentdict:
                        return i
                    else:
                        return self.start + i
                else:
                    return self.sub_sentdict.words[i]

        if idx and not idx_in_subsentdict:
            return self.start
        elif idx_in_subsentdict:
            return 0
        else:
            try:
                return self.sub_sentdict.words[0]
            except IndexError:
                return ''
Ejemplo n.º 3
0
    def get_head(self, idx=False, idx_in_subsentdict=False):
        """
            @type return: str
        """
        sd = self.sub_sentdict
        # Locate the first non-auxiliary verb; that word is the head.
        head_i = next((j for j in range(len(sd))
                       if wc.is_verb(sd.pos[j])
                       and not wc.is_aux_lemma(sd.lemmas[j])),
                      None)

        if head_i is not None:
            if idx:
                return head_i if idx_in_subsentdict else self.start + head_i
            return sd.words[head_i]

        # Fallbacks when no suitable verb exists.
        if idx and not idx_in_subsentdict:
            return self.start
        if idx_in_subsentdict:
            return 0
        try:
            return sd.words[0]
        except IndexError:
            return ''
Ejemplo n.º 4
0
def _first_quote_span(lemmas):
    """Return (start, end) indices of the first pair of quote marks in
    *lemmas*, or (None, None) when no complete pair exists."""
    start, end = None, None
    for i, w in enumerate(lemmas):
        if w == "\"":
            if start is None:
                start = i
            else:
                end = i
                break
    return start, end


def hardt_features(ant, trig, sentences, pos_tags):
    """
    This exists to add features that are somewhat based on what Hardt did in 1997.
        @type ant: vpe_objects.Antecedent
        @type trig: vpe_objects.Auxiliary
        @type sentences: vpe_objects.AllSentences
        @type pos_tags: list
        @return: list of numeric feature values
    """
    v = []
    sent_tree = sentences.get_sentence_tree(ant.sentnum)
    ant_sent = sentences.get_sentence(ant.sentnum)
    trig_sent = sentences.get_sentence(trig.sentnum)

    vp = sentences.nearest_vp(trig)
    vp_head = vp.get_head()
    vp_head_idx = vp.get_head(idx=True)

    ant_head = ant.get_head()
    ant_head_idx = ant.get_head(idx=True)

    # Alignment between the antecedent and the trigger's nearest VP.
    v.append(1.0 if ant == vp else 0.0)
    v.append(1.0 if ant_head == vp_head else 0.0)
    v.append(1.0 if vp.start <= ant_head_idx <= vp.end else 0.0)
    v.append(1.0 if ant.start <= vp_head_idx <= ant.end else 0.0)
    v.append(ant.sentnum - vp.sentnum)
    v.append(ant.start - vp.start)
    v.append(ant.end - vp.end)

    # be-do form
    try:
        v.append(1.0 if wc.is_be(ant_sent.lemmas[ant.start - 1])
                 or wc.is_be(ant_sent.lemmas[ant.start]) else 0.0)
        v.append(1.0 if trig.type == 'do' and v[-1] == 1.0 else 0.0)
    except IndexError:
        v += [0.0, 0.0]

    # Quotation features. Indices are compared against None explicitly:
    # a quote mark at position 0 is a valid span start, but the original
    # truthiness tests (`if not quote_start`) silently discarded it.
    quote_start_trig, quote_end_trig = _first_quote_span(trig_sent.lemmas)

    trig_in_quotes = False
    if quote_start_trig is not None and quote_end_trig is not None:
        trig_in_quotes = quote_start_trig <= trig.wordnum <= quote_end_trig
        v.append(1.0 if trig_in_quotes else 0.0)
    else:
        v.append(0.0)

    quote_start_ant, quote_end_ant = _first_quote_span(ant_sent.lemmas)

    ant_in_quotes = False
    if quote_start_ant is not None and quote_end_ant is not None:
        ant_in_quotes = (quote_start_ant <= ant.start <= quote_end_ant
                         and quote_start_ant <= ant.end <= quote_end_ant)
        v.append(1.0 if quote_start_ant <= ant.start <= quote_end_ant else 0.0)
        v.append(1.0 if quote_start_ant <= ant.end <= quote_end_ant else 0.0)
    else:
        v += [0.0, 0.0]

    v.append(1.0 if trig_in_quotes and ant_in_quotes else 0.0)

    # Nielsen features
    v.append(1.0 if wc.is_aux_lemma(ant.sub_sentdict.lemmas[0]) else 0.0)
    v.append(1.0 if wc.is_aux_lemma(ant.sub_sentdict.lemmas[ant.get_head(
        idx=True, idx_in_subsentdict=True)]) else 0.0)
    ant_len = float(len(ant.sub_sentdict))
    for tag in pos_tags:
        # Sparse encoding of the pos tag of the first/last word in ant,
        # plus the frequency of the given tag within the ant.
        v.append(1.0 if tag == ant.sub_sentdict.pos[0] else 0.0)
        v.append(1.0 if tag == ant.sub_sentdict.pos[-1] else 0.0)
        v.append(ant.sub_sentdict.pos.count(tag) / ant_len)

    # NOTE(review): wc.is_adverb appears twice in this list; it is kept
    # as-is so the feature-vector length matches the original encoding.
    for fun in [
            wc.is_adverb, wc.is_verb, wc.is_adverb, wc.is_noun,
            wc.is_preposition, wc.is_punctuation, wc.is_predicative
    ]:
        # Sparse encoding of the identity of the first/last word in ant.
        v.append(1.0 if fun(ant.sub_sentdict.pos[0]) else 0.0)
        v.append(1.0 if fun(ant.sub_sentdict.pos[-1]) else 0.0)
        # Fraction of ant words matching the predicate. The original used
        # len(map(fun, ...)), which always equals the ant length regardless
        # of the predicate (and raises TypeError on Python 3); count the
        # matching elements instead.
        v.append(sum(1.0 for p in ant.sub_sentdict.pos if fun(p)) / ant_len)

    sent_phrases = get_phrases(sent_tree)
    ant_phrases = lowest_common_subtree_phrases(sent_tree, ant.get_words())

    v.append(float(len(ant_phrases)) / len(sent_phrases))
    for phrase in ['NP', 'VP', 'S', 'SINV', 'ADVP', 'ADJP', 'PP']:
        # Proportion of phrases carrying the given label prefix. As above,
        # the original len(map(...)) counted every phrase, not the matches.
        v.append(sum(1.0 for s in ant_phrases if s.startswith(phrase)) /
                 float(len(ant_phrases)))
        v.append(sum(1.0 for s in sent_phrases if s.startswith(phrase)) /
                 float(len(sent_phrases)))

    # Comparative-continuation cues between the ant and the trigger, only
    # meaningful when they share a sentence.
    continuation_words = ['than', 'as', 'so']
    if ant.sentnum == trig.sentnum:
        v.append(1.0)
        for word in continuation_words:
            v.append(1.0 if word in
                     ant_sent.words[ant.end:trig.wordnum] else 0.0)
    else:
        v.append(0.0)
        for _ in continuation_words:
            v.append(0.0)

    # Does the word immediately preceding the ant match the trigger?
    try:
        v.append(1.0 if ant_sent.words[ant.start - 1] == trig.word else 0.0)
        v.append(1.0 if ant_sent.lemmas[ant.start - 1] == trig.lemma else 0.0)
        v.append(1.0 if ant_sent.lemmas[ant.start - 1] == trig.type else 0.0)
        v.append(1.0 if ant_sent.pos[ant.start - 1] == trig.pos else 0.0)
    except IndexError:
        v += [0.0, 0.0, 0.0, 0.0]

    # Theoretical linguistics features (c-command relations), only defined
    # when the ant and the trigger occur in the same sentence.
    if ant.sentnum == trig.sentnum:
        word_positions = getwordtreepositions(sent_tree)

        v.append(1.0)
        v.append(1.0 if wc.ccommands(ant.start, trig.wordnum, sent_tree,
                                     word_positions) else 0.0)
        v.append(1.0 if wc.ccommands(trig.wordnum, ant.start, sent_tree,
                                     word_positions) else 0.0)
        v.append(1.0 if wc.ccommands(ant.end, trig.wordnum, sent_tree,
                                     word_positions) else 0.0)
        v.append(1.0 if wc.ccommands(trig.wordnum, ant.end, sent_tree,
                                     word_positions) else 0.0)

        # Check if a word in the antecedent c-commands the trig and vice versa.
        ant_word_ccommands, trig_ccommands = False, False
        for word_idx in range(ant.start, ant.end + 1):
            if wc.ccommands(word_idx, trig.wordnum, sent_tree, word_positions):
                v.append(1.0)
                ant_word_ccommands = True

            if wc.ccommands(trig.wordnum, word_idx, sent_tree, word_positions):
                v.append(1.0)
                trig_ccommands = True

            if ant_word_ccommands and trig_ccommands:  # both found; stop early
                break

        if not ant_word_ccommands:
            v.append(0.0)

        if not trig_ccommands:
            v.append(0.0)
    else:
        v += [0.0 for _ in range(7)]

    return v