Example #1
    def _f_debug(self, fmap, transition):
        nsents = len(self.doc.sents)
        head = s2i(transition[0], end=nsents)
        tail = s2i(transition[1], end=nsents)

        # Mark the debug feature only when this transition sits at its
        # gold position, i.e. the head sentence directly follows the tail.
        if transition.position == head and tail + 1 == head:
            fmap['DEBUG'] = 1
Example #2
def bigram_acc(transitions):
    """
    Compute the bigram overlap (accuracy) for a list of predicted
    Transitions.

    transitions -- A list of discourse.hypergraph.Transition objects.

    returns bigram overlap (accuracy)
    """
    ntrans = len(transitions)
    # Get predicted bigrams.
    pred_bg = set([(s2i(t.sentences[1]), s2i(t.sentences[0], end='end'))
                   for t in recover_order(transitions)])

    # Create gold bigrams.
    gold = set([(i, i+1) for i in range(-1, ntrans - 2)])
    gold.add((ntrans - 2, 'end'))

    # If either set is empty, return None.
    if len(pred_bg) == 0 or len(gold) == 0:
        return None

    nbigrams = len(gold)
    acc = len(pred_bg & gold) / float(nbigrams)
    return acc
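For intuition, a quick check of the gold-bigram construction above (a standalone sketch: -1 stands for the START position and 'end' for the terminal position, matching how s2i is used here):

    ntrans = 3  # e.g. START -> s0, s0 -> s1, s1 -> END for a 2-sentence doc
    gold = set([(i, i + 1) for i in range(-1, ntrans - 2)])
    gold.add((ntrans - 2, 'end'))
    assert gold == set([(-1, 0), (0, 1), (1, 'end')])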
Example #3
    def _f_sentiment(self, fmap, transition):
        if transition[1] == u'START':
            sentiment1 = u'START'
            s1_value = u'0'
        else:
            idx = s2i(transition[1])
            sentiment1 = self.doc[idx].sentiment
            s1_value = self.doc[idx].sentiment_value
        if transition[0] == u'END':
            sentiment0 = u'END'
            s0_value = u'0'
        else:
            idx = s2i(transition[0])
            sentiment0 = self.doc[idx].sentiment
            s0_value = self.doc[idx].sentiment_value

        fstr1 = u'SENTIMENT {}:{} --> {}:{}'.format(sentiment1, s1_value,
                                                    sentiment0, s0_value)
        fmap[fstr1] = 1
        
        fstr2 = u'SENTIMENT {} --> {}'.format(sentiment1,
                                              sentiment0)
        fmap[fstr2] = 1

        fstr3 = u'SENTIMENT {} --> __'.format(sentiment1)
        fmap[fstr3] = 1

        fstr4 = u'SENTIMENT __ --> {}'.format(sentiment0)
        fmap[fstr4] = 1
Example #4
    def feature_map(self, transition):
        nsents = len(self)
        idxs = [s2i(s, end=nsents)
                for s in transition if s2i(s, end=nsents) != -1000]
        
        for i, idx in enumerate(idxs):
            if i+1 < len(idxs):
                if idx - 1 != idxs[i+1]:
                    return {'GOLD':0}

        return {'GOLD': 1}
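The loop above walks idxs head-first, so a gold transition's indices must decrease by exactly one at each step. A minimal sketch of that adjacency test with hypothetical index lists (the helper name is illustrative):

    def _is_gold(idxs):
        return all(idxs[i] - 1 == idxs[i + 1] for i in range(len(idxs) - 1))

    assert _is_gold([3, 2])       # head sentence 3 directly follows tail 2
    assert not _is_gold([3, 1])   # sentence 2 was skipped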
Example #5
    def _f_ne_types(self, fmap, transition):
        """ Mark NE type transitions and counts.
        E.g. NE Counts ORG_3 --> DATE_1

        Parameters
        ----------

        fmap : dict (string -> int)
            A dict mapping feature names to feature values
            for this transition. This function mutates this dict.

        transition : Transition
            the graph transition, for which this function
            extracts features.
        """

        if transition.sentences[1] == u'START':
            sent1 = set([(u'START', 1)])
            idx = s2i(transition.sentences[0])
            sent0 = self._ne_counts(self.doc[idx])
        elif transition.sentences[0] == u'END':
            sent0 = set([(u'END', 1)])
            idx = s2i(transition.sentences[1])
            sent1 = self._ne_counts(self.doc[idx])
        else:
            idx1 = s2i(transition.sentences[1])
            sent1 = self._ne_counts(self.doc[idx1])
            idx0 = s2i(transition.sentences[0])
            sent0 = self._ne_counts(self.doc[idx0])

        if len(sent1) == 0:
            sent1.add((u'X', 1))

        if len(sent0) == 0:
            sent0.add((u'X', 1))

        for ne1 in sent1:
            for ne0 in sent0:

                count0 = ne0[1] if ne0[1] < 4 else '>=4'
                count1 = ne1[1] if ne1[1] < 4 else '>=4'
                fstr1 = u'NE Counts {}_{} --> {}_{}'.format(ne1[0], count1,
                                                            ne0[0], count0)
                fmap[fstr1] = 1

                fstr2 = u'NE Counts {} --> {}'.format(ne1[0], ne0[0])
                fmap[fstr2] = 1

                fstr3 = u'NE Counts __ --> {}'.format(ne0[0])
                fmap[fstr3] = 1

                fstr4 = u'NE Counts {} --> __'.format(ne1[0])
                fmap[fstr4] = 1
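As a rough illustration (hypothetical NE counts, not from the source): a tail sentence with two ORG mentions and a head sentence with one DATE produces the fully specified feature plus the three backoffs, as in this sketch:

    fmap = {}
    sent1 = set([(u'ORG', 2)])   # tail NE counts
    sent0 = set([(u'DATE', 1)])  # head NE counts
    for ne1 in sent1:
        for ne0 in sent0:
            fmap[u'NE Counts {}_{} --> {}_{}'.format(ne1[0], ne1[1],
                                                     ne0[0], ne0[1])] = 1
            fmap[u'NE Counts {} --> {}'.format(ne1[0], ne0[0])] = 1
            fmap[u'NE Counts __ --> {}'.format(ne0[0])] = 1
            fmap[u'NE Counts {} --> __'.format(ne1[0])] = 1
    assert u'NE Counts ORG_2 --> DATE_1' in fmap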
Example #6
    def _f_topics(self, fmap, transition):
        if transition[1] == u'START':
            topic1 = u'START'
        else:
            idx = s2i(transition[1])
            topic1 = self._topic_map[idx]
        if transition[0] == u'END':
            topic0 = u'END'
        else:
            idx = s2i(transition[0])
            topic0 = self._topic_map[idx]

        fstr = u'TOPIC {} --> {}'.format(topic1, topic0)
        fmap[fstr] = 1
Example #7
    def _f_discourse_new(self, fmap, transition):
        """ Marks feature map if the head sentence contains the first
        occurrence of a salient entity, that is, a discourse new
        entity.

        Parameters
        ----------

        fmap : dict (string -> int)
            A dict mapping feature names to feature values
            for this transition. This function mutates this dict.

        transition : Transition
            the graph transition, for which this function
            extracts features.

        """

        if transition.sentences[0] != u'END':
            idx = s2i(transition.sentences[0])

            s2e = self.sent2ents
            num_new = 0
            for ent in s2e[idx]:
                if ent not in transition.previous_entities:
                    num_new += 1

            if num_new > 0:
                fmap[u'Discourse New'] = num_new
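A minimal sketch of the count above with hypothetical entities: only entities unseen in earlier transitions are discourse new.

    sent_ents = set([u'obama', u'senate'])   # entities in the head sentence
    previous_entities = set([u'senate'])     # entities seen so far
    num_new = sum(1 for ent in sent_ents if ent not in previous_entities)
    assert num_new == 1                      # u'obama' is discourse new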
Example #8
    def _f_topics_rewrite(self, fmap, transition):
        if transition[0] == u'END':
            topic0 = u'END'
        else:
            idx = s2i(transition[0])
            topic0 = self._topic_map[idx]

        # In Python 2, items() returns a list snapshot, so it is safe
        # to add the rewritten features to fmap while iterating.
        for feat, val in fmap.items():
            fstr = u'(TPC {}) {}'.format(topic0, feat)
            fmap[fstr] = val
Example #9
def oso_acc(transitions):
    """
    Compute the perfect-ordering accuracy for a list of predicted
    Transitions: 1 if the predicted ordering exactly recovers the
    gold sentence order, and 0 otherwise.

    transitions -- A list of discourse.hypergraph.Transition objects.
    """
    ntrans = len(transitions)
    # Get the predicted sentence ordering.
    pred = [s2i(t.sentences[0], end=ntrans - 1)
            for t in recover_order(transitions)]
    if tuple(pred) == tuple(range(ntrans)):
        return 1
    else:
        return 0
Example #10
from collections import OrderedDict

def _position2transition_map(transitions):
    """
    Return a dict mapping transition tail sentence indices to
    transitions.

    transitions -- A list of discourse.hypergraph.Transition objects.
    """
    m = OrderedDict()
    for t in transitions:
        m[s2i(t.sents[1])] = t
    return m
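A hedged usage sketch (assuming, as in the other examples here, that s2i maps u'START' to -1): the map answers "which transition leaves sentence i?".

    ordered = recover_order(transitions)   # put predictions in traversal order
    p2t = _position2transition_map(ordered)
    first_step = p2t[-1]                   # the transition leaving START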
Example #11
    def _f_syntax_lev(self, fmap, transition, depth):
        """ Marks the non-terminal sequence transition in the
        feature map. E.g. S , NP VP . ---> NP VP .

        Parameters
        ----------
        fmap : dict (string -> int)
            A dict mapping feature names to feature values
            for this transition. This function mutates this dict.

        transition : Transition
            the graph transition, for which this function
            extracts features.

        depth : int
            The depth of the sequence to extract from the parse
            tree.
        """

        # Extract syntax sequence for the tail sentence.
        if transition.sentences[1] == 'START':
            seq1 = 'START'
        else:
            idx = s2i(transition[1])
            seq1_parse = self.doc[idx].parse
            seq1 = syn_sequence(seq1_parse, depth)

        # Extract syntax sequence for the head sentence.
        if transition.sentences[0] == 'END':
            seq0 = 'END'
        else:
            idx = s2i(transition[0])
            seq0_parse = self.doc[idx].parse
            seq0 = syn_sequence(seq0_parse, depth)

        # Assign feature value.
        fmap['{} -sl{}-> {}'.format(seq1, depth, seq0)] = 1

        # Smoothed features.
        fmap['__ -sl{}-> {}'.format(depth, seq0)] = 1
        fmap['{} -sl{}-> __'.format(seq1, depth)] = 1
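syn_sequence itself is not shown in these examples; a plausible sketch of what it computes (an assumption, using nltk.tree.Tree purely for illustration) is the left-to-right sequence of non-terminal labels at the given depth of the parse:

    from nltk.tree import Tree

    def syn_sequence_sketch(parse, depth):
        # Collect the non-terminals `depth` levels below the root and
        # return their labels left to right.
        level = [parse]
        for _ in range(depth):
            level = [child for node in level
                     for child in node if isinstance(child, Tree)]
        return u' '.join(node.label() for node in level)

    t = Tree.fromstring(u'(S (NP (DT the) (NN dog)) (VP (VBZ barks)) (. .))')
    assert syn_sequence_sketch(t, 1) == u'NP VP .'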
Example #12
    def _f_verbs(self, fmap, transition):
        if transition[1] == u'START':
            verbs1 = set([u'START'])
        else:
            idx = s2i(transition[1])
            verbs1 = set([t.lem.lower() 
                          for t in self.doc[idx] if u'VB' in t.pos])
        if transition[0] == u'END':
            verbs0 = set([u'END'])
        else:
            idx = s2i(transition[0])
            verbs0 = set([t.lem.lower() 
                          for t in self.doc[idx] if u'VB' in t.pos])

        for v1 in verbs1:
            for v0 in verbs0:
                fstr1 = u'VBZ: {} --> {}'.format(v1, v0)
                fmap[fstr1] = 1
                
                fstr2 = u'VBZ: __ --> {}'.format(v0)
                fmap[fstr2] = 1
 
                fstr3 = u'VBZ: {} --> __'.format(v1)
                fmap[fstr3] = 1
Example #13
def kendalls_tau(transitions):
    """
    Compute Kendall's tau and pvalue for a list of
    discourse.hypergraph.Transition objects.

    transitions -- A list of discourse.hypergraph.Transition objects.

    returns (kt, pval)
    """
    # Get the list of sentence indices implied by the transition set.
    indices = [s2i(t.sentences[0]) for t in recover_order(transitions)[:-1]]
    # Get gold indices.
    gold = [i for i in range(len(indices))]
    # Compute Kendall's tau for these two sequences.
    kt, pval = sp.stats.kendalltau(indices, gold)
    return kt, pval
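For example, with a nearly sorted prediction (independent of this codebase):

    import scipy.stats
    kt, pval = scipy.stats.kendalltau([0, 2, 1, 3], [0, 1, 2, 3])
    # kt == 0.666..., since exactly one of the six pairs is discordant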
Example #14
    def _f_discourse_connectives(self, fmap, transition):
        if transition.sentences[0] == u'END':
            dcon = u'END'
        else:
            idx = s2i(transition.sentences[0])
            dcon = self.sent2dcon[idx]

        prev_dcon = transition.previous_dcon
        if dcon == u'':
            dcon = prev_dcon

        fstr1 = u'Discourse Connective {} -> {}'.format(prev_dcon, dcon)
        fmap[fstr1] = 1

        fstr2 = u'Discourse Connective {} -> __'.format(prev_dcon)
        fmap[fstr2] = 1

        fstr3 = u'Discourse Connective __ -> {}'.format(dcon)
        fmap[fstr3] = 1

        is_match = u'MATCH' if dcon == prev_dcon else u'not a match'
        fstr4 = u'Discourse Connective {}'.format(is_match)
        fmap[fstr4] = 1
Example #15
def eval_against_baseline(testX, baselineY, newY, baseline_model, new_model,
                          base_feats, new_feats,
                          baseline_pred_trainY=None,
                          new_pred_trainY=None):
    """
    Evaluate the differences between two models. Prints a per-instance
    analysis of the transitions predicted by the baseline and new models.

    testX -- A list of corenlp.Document objects to evaluate on.

    baselineY -- A list of lists of discourse.hypergraph.Transition
        objects predicted by the baseline model for the documents
        in testX.

    newY -- A list of lists of discourse.hypergraph.Transition
        objects predicted by the new model for the documents
        in testX.

    baseline_model -- A discourse.perceptron.Perceptron object trained
        on the features in base_feats.

    new_model -- A discourse.perceptron.Perceptron object trained
        on the features in new_feats.

    base_feats -- A dict of feature names to boolean values,
        indicating the features active in the baseline model.

    new_feats -- A dict of feature names to boolean values,
        indicating the features active in the new model.
    """

    # Limit text output to 80 chars and wrap nicely.
    wrapper = textwrap.TextWrapper(subsequent_indent='\t')

    print u'OVERALL STATS FOR TEST DOCUMENTS'

    # Print macro-averaged Kendall's tau and p-values for the baseline
    # and new models.
    bl_avg_kt, bl_avg_pval = avg_kendalls_tau(baselineY)
    new_avg_kt, new_avg_pval = avg_kendalls_tau(newY)
        
    print u'\t     | BASELINE      | NEW'
    print u'{:14} {:.3f} ({:.3f}) | {:.3f} ({:.3f})\n'.format(u'Kendalls Tau',
                                                              bl_avg_kt,
                                                              bl_avg_pval,
                                                              new_avg_kt,
                                                              new_avg_pval)

    # Print bigram gold sequence overlap (accuracy) for baseline and
    # new model.
    bl_bg_acc = mac_avg_bigram_acc(baselineY)
    new_bg_acc = mac_avg_bigram_acc(newY)
    print u'\t     | BASELINE      | NEW'
    print u'{:12} | {:.3f}         | {:.3f} \n'.format(u'bigram acc',
                                                       bl_bg_acc,
                                                       new_bg_acc)

    if baseline_pred_trainY is not None or new_pred_trainY is not None:
         
        if baseline_pred_trainY is not None:
            bl_avg_kt_train, bl_avg_pval_train = avg_kendalls_tau(
                baseline_pred_trainY)

            bl_bg_acc_train = mac_avg_bigram_acc(baseline_pred_trainY)
        
        else: 
            bl_avg_kt_train = float('nan')
            bl_avg_pval_train = float('nan')
            bl_bg_acc_train = float('nan')

        if new_pred_trainY is not None:
            new_avg_kt_train, new_avg_pval_train = avg_kendalls_tau(
                new_pred_trainY)

            new_bg_acc_train = mac_avg_bigram_acc(new_pred_trainY)

        else: 
            new_avg_kt_train = float('nan')
            new_avg_pval_train = float('nan')
            new_bg_acc_train = float('nan')

        print u'OVERALL STATS FOR TRAINING DOCUMENTS'
        print u'\t     | BASELINE      | NEW'
        print u'{:14} {:.3f} ({:.3f}) | {:.3f} ({:.3f})\n'.format(
            u'Kendalls Tau',
            bl_avg_kt_train,
            bl_avg_pval_train,
            new_avg_kt_train,
            new_avg_pval_train)
        print u'\t     | BASELINE      | NEW'
        print u'{:12} | {:.3f}         | {:.3f} \n'.format(u'bigram acc',
                                                           bl_bg_acc_train,
                                                           new_bg_acc_train)

    # Print stats for individual test instances.
    for test_idx, datum in enumerate(izip(testX, baselineY, newY), 1):
        testx, baseliney, newy = datum
        print u'TEST NO. {:4}\n=============\n'.format(test_idx)

        # Print Kendalls Tau and pvalue for baseline and new model
        # for this test instance.
        bl_kt, bl_pval = kendalls_tau(baseliney)
        new_kt, new_pval = kendalls_tau(newy)
        print u'\t     | BASELINE      | NEW'
        print u'{:14} {:.3f} ({:.3f}) | {:.3f} ({:.3f})\n'.format(u'K. Tau',
                                                                  bl_kt,
                                                                  bl_pval,
                                                                  new_kt,
                                                                  new_pval)

        # Print bigram gold sequence overlap (accuracy) for baseline
        # and new model.
        bl_acc = bigram_acc(baseliney)
        new_acc = bigram_acc(newy)
        print u'\t     | BASELINE      | NEW'
        print u'{:12} | {:.3f}         | {:.3f} \n'.format(u'bigram acc',
                                                           bl_acc,
                                                           new_acc)

        # Print document sentences in correct order.
        print u'GOLD TEXT\n=========\n'
        for i, s in enumerate(testx):
            print wrapper.fill(u'({:3}) {}'.format(i, unicode(s)))
        print u'\n\n'

        # Print document sentences in baseline order.
        print u'BASELINE TEXT\n=========\n'
        indices = [s2i(t.sents[0]) for t in recover_order(baseliney)[:-1]]
        for i in indices:
            print wrapper.fill(u'({}) {}'.format(i, unicode(testx[i])))
        print u'\n\n'

        # Print document sentences in new model order.
        print u'NEW MODEL TEXT\n=========\n'
        indices = [s2i(t.sents[0]) for t in recover_order(newy)[:-1]]
        for i in indices:
            print wrapper.fill(u'({}) {}'.format(i, unicode(testx[i])))
        print u'\n\n'

        # Get predicted transitions in order for both models.
        # NOTE: The predict function of the Perceptron object returns
        # the predicted transitions in no particular order.
        # When in doubt, use recover_order on any predicted output
        # if you want to iterate over it as if you were traversing the
        # graph of sentence transitions.
        baseline_trans = discourse.hypergraph.recover_order(baseliney)
        new_trans = discourse.hypergraph.recover_order(newy)

        # Map tail sentence of a transition to the transition.
        p2t_baseline = _position2transition_map(baseline_trans)
        p2t_new = _position2transition_map(new_trans)

        # For each transition leaving the same sentence, if the models
        # disagree on what the next sentence is, print analysis of
        # the model features.
        for pos, t_bl in p2t_baseline.items():
            if p2t_new[pos].sents[0] != t_bl.sents[0]:
                t_new = p2t_new[pos]

                # Print tail sentence.
                if pos > -1:
                    pos_str = unicode(testx[pos])
                else:
                    pos_str = u'START'
                print u'=' * 80
                print wrapper.fill(u'({:3}) {}'.format(pos, pos_str))
                print (u'-' * 80)
                print u'  |\n  V'

                # Print baseline head sentence
                if s2i(t_bl.sents[0]) is not None:
                    bl_str = unicode(testx[s2i(t_bl.sents[0])])
                else:
                    bl_str = u'END'
                print wrapper.fill(u'(OLD) {}\n'.format(bl_str)) + u'\n'

                # Print baseline model features for the predicted
                # baseline transition.
                explain(t_bl, baseline_model, new_model, testx,
                        base_feats, new_feats)

                # Print new model head sentence.
                if s2i(t_new.sents[0]) is not None:
                    new_str = unicode(testx[s2i(t_new.sents[0])])
                else:
                    new_str = u'END'
                print wrapper.fill(u'(NEW) {}\n'.format(new_str)) + u'\n'

                # Print new model features for the predicted new
                # model transition.
                explain(t_new, baseline_model, new_model, testx,
                        base_feats, new_feats)

                # Print gold head sentence, that is, the sentence the
                # models should have selected.
                if pos + 1 < len(testx):
                    gstr = u'(GLD) {}\n'.format(unicode(testx[pos + 1]))
                    print wrapper.fill(gstr) + u'\n'

                if pos + 1 == s2i(t_bl.sents[0], end=len(testx)):
                    print 'OLD MODEL IS CORRECT\n'
                if pos + 1 == s2i(t_new.sents[0], end=len(testx)):
                    print 'NEW MODEL IS CORRECT\n'
                print
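A hedged usage sketch of the function above (baseline_trainY and new_trainY are hypothetical lists of predicted training transitions; all other arguments are as described in the docstring):

    eval_against_baseline(testX, baselineY, newY,
                          baseline_model, new_model,
                          base_feats, new_feats,
                          baseline_pred_trainY=baseline_trainY,
                          new_pred_trainY=new_trainY)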
Example #16
    def _f_personal_pronoun_res(self, fmap, transition):
        
        ecnts = self.entity_counts
        
        if transition[0] == u'END':
            prn0 = set([u'END'])
            per0 = set([u'END'])
        else:
            idx0 = s2i(transition[0])
            prn0 = self._extract_personal_pronouns(self.doc[idx0])
            per0 = self._extract_persons(self.doc[idx0])

        if transition[1] == u'START':
            prn1 = set([u'START'])
            per1 = set([u'START'])
        else:
            idx1 = s2i(transition[1])
            prn1 = self._extract_personal_pronouns(self.doc[idx1])
            per1 = self._extract_persons(self.doc[idx1])
        
        for prn in prn0:
            if len(per1) > 0:
                for per in per1:
                    if per != u'START' and ecnts[per] > 1:
                        is_salient = u'SALIENT'
                    else: 
                        is_salient = u'not salient'
     
                    fstr1 = u'Per. Prn.: PERSON --> {}'.format(per) 
                    fmap[fstr1] = 1

                    fstr2 = u'Per. Prn.: PERSON --> {} {}'.format(per,
                                                                  is_salient) 
                    fmap[fstr2] = 1
                    
                    fstr3 = u'Per. Prn.: PERSON --> prn'
                    fmap[fstr3] = 1
                    
                    fstr4 = u'Per. Prn.: PERSON --> prn {}'.format(is_salient)
                    fmap[fstr4] = 1

            else: 
                fstr1 = u'Per. Prn.: X --> {}'.format(prn)
                fmap[fstr1] = 1
                fstr2 = u'Per. Prn.: X --> prn'
                fmap[fstr2] = 1

        for prn in prn1:
            if len(per0) > 0:
                for per in per0:
                    if per != u'END' and ecnts[per] > 1:
                        is_salient = u'SALIENT'
                    else: 
                        is_salient = u'not salient'

                    fstr1 = u'Per. Prn.: PERSON <-- {}'.format(per) 
                    fmap[fstr1] = 1

                    fstr2 = u'Per. Prn.: PERSON <-- {} {}'.format(per,
                                                                  is_salient) 
                    fmap[fstr2] = 1
                    
                    fstr3 = u'Per. Prn.: PERSON <-- prn'
                    fmap[fstr3] = 1
                    
                    fstr4 = u'Per. Prn.: PERSON <-- prn {}'.format(is_salient)
                    fmap[fstr4] = 1

            else: 
                fstr1 = u'Per. Prn.: X <-- {}'.format(prn)
                fmap[fstr1] = 1
                fstr2 = u'Per. Prn.: X <-- prn'
                fmap[fstr2] = 1
Example #17
    def _f_role_match(self, fmap, transition):
        """ This feature counts noun phrase head matches across
        sentences. The feature takes the form of the dependency
        relation for the entity in each sentence, and whether or not
        the entity in question is a salient entity.
        E.g. 'nsubj --> dobj SALIENT'. Start and end role transitions
        are similarly captured, e.g. 'START -> other' and
        'iobj -> END'.

        Parameters
        ----------

        fmap : dict (string -> int)
            A dict mapping feature names to feature values
            for this transition. This function mutates this dict.

        transition : Transition
            the graph transition, for which this function
            extracts features.
        """

        # If the tail sentence is START, create a START role for each entity
        # that occurs in the head sentence.
        if transition[1] == u'START':
            idx0 = s2i(transition[0])
            s0_ents = self._entity_roles(self.doc[idx0])
            s1_ents = [(token, u'START') for token, role in s0_ents]

        # If the head sentence is END, create an END role for each entity
        # that occurs in the tail sentence.
        elif transition[0] == u'END':
            idx1 = s2i(transition[1])
            s1_ents = self._entity_roles(self.doc[idx1])
            s0_ents = [(token, u'END') for token, role in s1_ents]

        # Default behavior, extract entity role tuples for each sentence.
        else:
            idx0 = s2i(transition[0])
            idx1 = s2i(transition[1])
            s0_ents = self._entity_roles(self.doc[idx0])
            s1_ents = self._entity_roles(self.doc[idx1])

        # Entity counts
        ecnts = self.entity_counts

        # This set records entities matched in the head sentence.
        # For entities in the head sentence that are NOT matched, this set
        # makes it possible to create a feature of the form "X -> role"
        # where the X indicates that the entity did not appear in the tail.
        used_ents = set()

        # This default dict is used to build the feature counts that will be
        # added to fmap.
        role_matches = defaultdict(int)

        # Find matching entities across sentences, and mark them as features.
        for ent1 in s1_ents:
            lem1 = ent1[0].lem.lower()
            is_salient = u'SALIENT' if ecnts[lem1] > 2 else u'not salient'

            #ne1 = ent1[0].ne
            no_match = True
            for ent0 in s0_ents:
                lem0 = ent0[0].lem.lower()
                if lem0 == lem1:
                    no_match = False
                    used_ents.add(lem0)

                    fstr1 = u'Role Trans: {} --> {}'.format(ent1[1], ent0[1])
                    role_matches[fstr1] += 1

                    sfstr1 = fstr1 + u' {}'.format(is_salient)
                    role_matches[sfstr1] += 1

                    # Backoff features with generic __ symbol
                    fstr2 = u'Role Trans: __ --> {}'.format(ent0[1])
                    role_matches[fstr2] += 1

                    sfstr2 = fstr2 + u' {}'.format(is_salient)
                    role_matches[sfstr2] += 1

                    fstr3 = u'Role Trans: {} --> __'.format(ent1[1])
                    role_matches[fstr3] += 1

                    sfstr3 = fstr3 + u' {}'.format(is_salient)
                    role_matches[sfstr3] += 1

            if no_match:
                fstr1 = u'Role Trans: {} --> X'.format(ent1[1])
                role_matches[fstr1] += 1
                sfstr1 = fstr1 + u' {}'.format(is_salient)
                role_matches[sfstr1] += 1

        for ent, role in s0_ents:
            lem = ent.lem.lower()
            if lem not in used_ents:
                is_salient = u'SALIENT' if ecnts[lem] > 2 else u'not salient'
                fstr1 = u'Role Trans: X --> {}'.format(role)
                role_matches[fstr1] += 1
                sfstr1 = fstr1 + u' {}'.format(is_salient)
                role_matches[sfstr1] += 1

        for feature, val in role_matches.items():
            if 'SALIENT' in feature:
                fmap[feature] = val
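A rough illustration of the final filter (hypothetical roles): the substring test is case-sensitive, so u'SALIENT' does not match u'not salient' and only the explicitly salient variants reach fmap.

    from collections import defaultdict

    role_matches = defaultdict(int)
    role_matches[u'Role Trans: nsubj --> dobj'] += 1
    role_matches[u'Role Trans: nsubj --> dobj SALIENT'] += 1

    fmap = {}
    for feature, val in role_matches.items():
        if u'SALIENT' in feature:
            fmap[feature] = val
    assert list(fmap) == [u'Role Trans: nsubj --> dobj SALIENT']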
Example #18
    def _f_first_word(self, fmap, transition):
        """ Marks the sequence first words of the sentences selected
        by the graph edge *transition*.
        E.g. 'a' ---> 'the' .

        Parameters
        ----------
        fmap : dict (string -> int)
            A dict mapping feature names to feature values
            for this transition. This function mutates this dict.

        transition : Transition
            The graph edge, from which this function
            extracts features.
        """

        # Extract first word from tail sentence.
        if transition[1] == u'START':
            word1 = u'START'
            ne1 = u'START'
        else:
            idx = s2i(transition[1])
            sent1 = self.doc[idx]
            token1 = sent1.tokens[0]

            word1 = token1.lem.lower()
            ne1 = token1.ne

        # Extract first word from head sentence.
        if transition[0] == u'END':
            word0 = u'END'
            ne0 = u'END'
        else:
            idx = s2i(transition[0])
            sent0 = self.doc[idx]
            token0 = sent0.tokens[0]
            word0 = token0.lem.lower()
            ne0 = token0.ne

        # Mark the feature
        fstr1 = u'First Word Trans: {} --> {}'.format(word1, word0)
        fmap[fstr1] = 1

        # Mark smoothed versions of this feature.
        fstr2 = u'First Word Trans: __ --> {}'.format(unicode(word0))
        fmap[fstr2] = 1

        fstr3 = u'First Word Trans: {} --> __'.format(unicode(word1))
        fmap[fstr3] = 1

        fstr4 = u'First Word Trans: {} --> {}'.format(ne1, ne0)
        fmap[fstr4] = 1

        fstr5 = u'First Word Trans: {} --> {}'.format(ne1, word0)
        fmap[fstr5] = 1

        fstr6 = u'First Word Trans: {} --> {}'.format(word1, ne0)
        fmap[fstr6] = 1

        fstr7 = u'First Word Trans: {} --> __'.format(ne1)
        fmap[fstr7] = 1

        fstr8 = u'First Word Trans: __ --> {}'.format(ne0)
        fmap[fstr8] = 1
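As a rough illustration (hypothetical tokens, with CoreNLP-style NE tags assumed): a tail sentence starting with u'Obama' (PERSON) and a head starting with u'he' (O, i.e. no entity) yields the lexical transition plus the NE crosses:

    word1, ne1 = u'obama', u'PERSON'   # tail first token (lemma, NE tag)
    word0, ne0 = u'he', u'O'           # head first token
    feats = [u'First Word Trans: {} --> {}'.format(word1, word0),
             u'First Word Trans: {} --> {}'.format(ne1, ne0),
             u'First Word Trans: {} --> {}'.format(ne1, word0),
             u'First Word Trans: {} --> {}'.format(word1, ne0)]
    # Each of these, plus the __-smoothed variants, gets value 1 in fmap.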
Example #19
    testx, gtesty, ptesty = datum
    kt, pval = evaluation.kendalls_tau(ptesty)
    print u'Kendall\'s Tau : {:.3f} (pval {:.3f})'.format(kt, pval)
    print u'Bigram Acc.    : {:.3f}'.format(evaluation.bigram_acc(ptesty))
    print

    print u'GOLD ORDERING\n==================\n'
    print unicode(testx.trans2str(gtesty))
    print 

    for t in hypergraph.recover_order(gtesty):
        
        print u'TRANSITION: {}'.format(unicode(t))
        print u'=' * 79

        idx1 = hypergraph.s2i(t.sents[1])
        sent1 = testx[idx1] if idx1 > -1 else u'START'
        idx2 = hypergraph.s2i(t.sents[0])
        sent2 = testx[idx2] if idx2 is not None else u'END'

        print textwrap.fill(u'({:3}) {}'.format(idx1, unicode(sent1)))
        print u' |\n V'
        # idx2 is None for the final (END) transition; convert with !s so
        # the width spec does not raise on NoneType.
        print textwrap.fill(u'({!s:3}) {}\n'.format(idx2, unicode(sent2)))
        evaluation.explain_transition(t, model, testx)
        print 

    print u'PREDICTED ORDERING\n==================\n'
    print unicode(testx.trans2str(ptesty))
    print
    
    for t in hypergraph.recover_order(ptesty):