def _f_debug(self, fmap, transition):
    nsents = len(self.doc.sents)
    head = s2i(transition[0], end=nsents)
    tail = s2i(transition[1], end=nsents)
    # Fire only when this transition is consistent with the gold
    # ordering: it sits at its own head position and the head
    # sentence immediately follows the tail sentence.
    if transition.position == head and tail + 1 == head:
        fmap['DEBUG'] = 1
def bigram_acc(transitions):
    """
    Compute the bigram overlap (accuracy) for a list of predicted
    Transitions.

    transitions -- A list of discourse.hypergraph.Transition objects.

    returns bigram overlap (accuracy)
    """
    ntrans = len(transitions)

    # Get predicted bigrams.
    pred_bg = set([(s2i(t.sentences[1]), s2i(t.sentences[0], end='end'))
                   for t in recover_order(transitions)])

    # Create gold bigrams.
    gold = set([(i, i + 1) for i in range(-1, ntrans - 2)])
    gold.add((ntrans - 2, 'end'))

    # If either set is empty, return None.
    if len(pred_bg) == 0 or len(gold) == 0:
        return None

    nbigrams = len(gold)
    acc = len(pred_bg & gold) / float(nbigrams)
    return acc
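# Minimal usage sketch (not part of the module): the gold bigram set
# that bigram_acc builds for a three-sentence document, which implies
# ntrans == 4 transitions (START->s0, s0->s1, s1->s2, s2->END). The
# index -1 stands in for START and 'end' for END.
ntrans = 4
gold = set([(i, i + 1) for i in range(-1, ntrans - 2)])
gold.add((ntrans - 2, 'end'))
print gold  # contains (-1, 0), (0, 1), (1, 2), (2, 'end')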
def _f_sentiment(self, fmap, transition):
    if transition[1] == u'START':
        sentiment1 = u'START'
        s1_value = u'0'
    else:
        idx = s2i(transition[1])
        sentiment1 = self.doc[idx].sentiment
        s1_value = self.doc[idx].sentiment_value
    if transition[0] == u'END':
        sentiment0 = u'END'
        s0_value = u'0'
    else:
        idx = s2i(transition[0])
        sentiment0 = self.doc[idx].sentiment
        s0_value = self.doc[idx].sentiment_value

    fstr1 = u'SENTIMENT {}:{} --> {}:{}'.format(sentiment1, s1_value,
                                                sentiment0, s0_value)
    fmap[fstr1] = 1
    fstr2 = u'SENTIMENT {} --> {}'.format(sentiment1, sentiment0)
    fmap[fstr2] = 1
    fstr3 = u'SENTIMENT {} --> __'.format(sentiment1)
    fmap[fstr3] = 1
    fstr4 = u'SENTIMENT __ --> {}'.format(sentiment0)
    fmap[fstr4] = 1
def feature_map(self, transition):
    nsents = len(self)
    idxs = [s2i(s, end=nsents) for s in transition
            if s2i(s, end=nsents) != -1000]
    for i, idx in enumerate(idxs):
        if i + 1 < len(idxs):
            if idx - 1 != idxs[i + 1]:
                return {'GOLD': 0}
    return {'GOLD': 1}
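# Minimal sketch (not part of the module) of the adjacency test in
# feature_map: a transition is 'GOLD' only when each extracted index
# is exactly one greater than the index that follows it, i.e. the
# head sentence immediately follows the tail sentence.
idxs = [3, 2]  # head index 3, tail index 2: 3 - 1 == 2, so gold.
is_gold = all(idxs[i] - 1 == idxs[i + 1] for i in range(len(idxs) - 1))
print {'GOLD': 1 if is_gold else 0}  # {'GOLD': 1}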
def _f_ne_types(self, fmap, transition):
    """
    Mark NE type transitions and counts.
    E.g. NE Counts ORG_3 --> DATE_1

    Parameters
    ----------
    fmap : dict (string -> int)
        A dict mapping feature names to feature values for this
        transition. This function mutates this dict.

    transition : Transition
        the graph transition, for which this function extracts
        features.
    """
    if transition.sentences[1] == u'START':
        sent1 = set([(u'START', 1)])
        idx = s2i(transition.sentences[0])
        sent0 = self._ne_counts(self.doc[idx])
    elif transition.sentences[0] == u'END':
        sent0 = set([(u'END', 1)])
        idx = s2i(transition.sentences[1])
        sent1 = self._ne_counts(self.doc[idx])
    else:
        idx1 = s2i(transition.sentences[1])
        sent1 = self._ne_counts(self.doc[idx1])
        idx0 = s2i(transition.sentences[0])
        sent0 = self._ne_counts(self.doc[idx0])

    if len(sent1) == 0:
        sent1.add((u'X', 1))
    if len(sent0) == 0:
        sent0.add((u'X', 1))

    for ne1 in sent1:
        for ne0 in sent0:
            count0 = ne0[1] if ne0[1] < 4 else '>=4'
            count1 = ne1[1] if ne1[1] < 4 else '>=4'
            fstr1 = u'NE Counts {}_{} --> {}_{}'.format(ne1[0], count1,
                                                        ne0[0], count0)
            fmap[fstr1] = 1
            fstr2 = u'NE Counts {} --> {}'.format(ne1[0], ne0[0])
            fmap[fstr2] = 1
            fstr3 = u'NE Counts __ --> {}'.format(ne0[0])
            fmap[fstr3] = 1
            fstr4 = u'NE Counts {} --> __'.format(ne1[0])
            fmap[fstr4] = 1
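# Minimal sketch (not part of the module) of the count bucketing used
# in _f_ne_types: entity counts of four or more collapse into a
# single '>=4' bucket, keeping the feature space bounded.
for count in (1, 3, 4, 7):
    print count if count < 4 else '>=4'  # 1, 3, >=4, >=4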
def _f_topics(self, fmap, transition):
    if transition[1] == u'START':
        topic1 = u'START'
    else:
        idx = s2i(transition[1])
        topic1 = self._topic_map[idx]
    if transition[0] == u'END':
        topic0 = u'END'
    else:
        idx = s2i(transition[0])
        topic0 = self._topic_map[idx]
    fstr = u'TOPIC {} --> {}'.format(topic1, topic0)
    fmap[fstr] = 1
def _f_discourse_new(self, fmap, transition):
    """
    Marks the feature map if the head sentence contains the first
    occurrence of a salient entity, that is, a discourse-new entity.

    Parameters
    ----------
    fmap : dict (string -> int)
        A dict mapping feature names to feature values for this
        transition. This function mutates this dict.

    transition : Transition
        the graph transition, for which this function extracts
        features.
    """
    if transition.sentences[0] != u'END':
        idx = s2i(transition.sentences[0])
        s2e = self.sent2ents
        num_new = 0
        for ent in s2e[idx]:
            if ent not in transition.previous_entities:
                num_new += 1
        if num_new > 0:
            fmap[u'Discourse New'] = num_new
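# Minimal sketch (not part of the module) of the discourse-new count.
# The hypothetical entity sets below stand in for self.sent2ents[idx]
# and transition.previous_entities.
sent_ents = set([u'obama', u'senate'])
previous_entities = set([u'senate'])
num_new = len(sent_ents - previous_entities)
print num_new  # 1, so fmap would get u'Discourse New': 1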
def _f_topics_rewrite(self, fmap, transition):
    if transition[0] == u'END':
        topic0 = u'END'
    else:
        idx = s2i(transition[0])
        topic0 = self._topic_map[idx]
    # fmap.items() returns a list in Python 2, so it is safe to add
    # the rewritten features to fmap while iterating over it.
    for feat, val in fmap.items():
        fstr = u'(TPC {}) {}'.format(topic0, feat)
        fmap[fstr] = val
def oso_acc(transitions):
    """
    Compute exact-order accuracy: 1 if the predicted ordering is
    exactly the gold ordering, else 0.

    transitions -- A list of discourse.hypergraph.Transition objects.
    """
    ntrans = len(transitions)

    # Get predicted head sentence indices in recovered order.
    pred = [s2i(t.sentences[0], end=ntrans - 1)
            for t in recover_order(transitions)]

    if tuple(pred) == tuple([i for i in range(ntrans)]):
        return 1
    else:
        return 0
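# Minimal sketch (not part of the module): oso_acc returns 1 only
# when the recovered head indices form the identity permutation.
pred = [0, 1, 2, 3]
print 1 if tuple(pred) == tuple(range(len(pred))) else 0  # 1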
def _position2transition_map(transitions):
    """
    Return a dict mapping transition tail sentence indices to
    transitions.

    transitions -- A list of discourse.hypergraph.Transition objects.
    """
    m = OrderedDict()
    for t in transitions:
        m[s2i(t.sents[1])] = t
    return m
def _f_syntax_lev(self, fmap, transition, depth):
    """
    Marks the non-terminal sequence transition in the feature map.
    E.g. S , NP VP . ---> NP VP .

    Parameters
    ----------
    fmap : dict (string -> int)
        A dict mapping feature names to feature values for this
        transition. This function mutates this dict.

    transition : Transition
        the graph transition, for which this function extracts
        features.

    depth : int
        The depth of the sequence to extract from the parse tree.
    """
    # Extract syntax sequence for the tail sentence.
    if transition.sentences[1] == 'START':
        seq1 = 'START'
    else:
        idx = s2i(transition[1])
        seq1_parse = self.doc[idx].parse
        seq1 = syn_sequence(seq1_parse, depth)

    # Extract syntax sequence for the head sentence.
    if transition.sentences[0] == 'END':
        seq0 = 'END'
    else:
        idx = s2i(transition[0])
        seq0_parse = self.doc[idx].parse
        seq0 = syn_sequence(seq0_parse, depth)

    # Assign feature value.
    fmap['{} -sl{}-> {}'.format(seq1, depth, seq0)] = 1

    # Smoothed features.
    fmap['__ -sl{}-> {}'.format(depth, seq0)] = 1
    fmap['{} -sl{}-> __'.format(seq1, depth)] = 1
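# Minimal sketch (not part of the module) of the feature strings
# _f_syntax_lev emits, assuming syn_sequence returns the depth-1
# non-terminal sequences shown below for the two sentences.
seq1, seq0, depth = 'S , NP VP .', 'NP VP .', 1
print '{} -sl{}-> {}'.format(seq1, depth, seq0)  # S , NP VP . -sl1-> NP VP .
print '__ -sl{}-> {}'.format(depth, seq0)        # smoothed tail
print '{} -sl{}-> __'.format(seq1, depth)        # smoothed head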
def _f_verbs(self, fmap, transition):
    if transition[1] == u'START':
        verbs1 = set([u'START'])
    else:
        idx = s2i(transition[1])
        verbs1 = set([t.lem.lower() for t in self.doc[idx]
                      if u'VB' in t.pos])
    if transition[0] == u'END':
        verbs0 = set([u'END'])
    else:
        idx = s2i(transition[0])
        verbs0 = set([t.lem.lower() for t in self.doc[idx]
                      if u'VB' in t.pos])
    for v1 in verbs1:
        for v0 in verbs0:
            fstr1 = u'VBZ: {} --> {}'.format(v1, v0)
            fmap[fstr1] = 1
            fstr2 = u'VBZ: __ --> {}'.format(v0)
            fmap[fstr2] = 1
            fstr3 = u'VBZ: {} --> __'.format(v1)
            fmap[fstr3] = 1
def kendalls_tau(transitions):
    """
    Compute Kendall's tau and p-value for a list of
    discourse.hypergraph.Transition objects.

    transitions -- A list of discourse.hypergraph.Transition objects.

    returns (kt, pval)
    """
    # Get the list of sentence indices implied by the transition set.
    indices = [s2i(t.sentences[0])
               for t in recover_order(transitions)[:-1]]

    # Get gold indices.
    gold = [i for i in range(len(indices))]

    # Compute Kendall's tau for these two sequences.
    kt, pval = sp.stats.kendalltau(indices, gold)
    return kt, pval
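# Minimal sketch (not part of the module) of the Kendall's tau call,
# assuming scipy is imported as sp, as elsewhere in this module. A
# perfectly ordered prediction yields tau = 1.0.
import scipy as sp
import scipy.stats
indices = [0, 1, 2, 3]
gold = [i for i in range(len(indices))]
kt, pval = sp.stats.kendalltau(indices, gold)
print u'{:.3f}'.format(kt)  # 1.000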
def _f_discourse_connectives(self, fmap, transition):
    if transition.sentences[0] == u'END':
        dcon = u'END'
    else:
        idx = s2i(transition.sentences[0])
        dcon = self.sent2dcon[idx]
    prev_dcon = transition.previous_dcon
    # If the head sentence has no connective, carry the previous one
    # forward.
    if dcon == u'':
        dcon = prev_dcon
    fstr1 = u'Discourse Connective {} -> {}'.format(prev_dcon, dcon)
    fmap[fstr1] = 1
    fstr2 = u'Discourse Connective {} -> __'.format(prev_dcon)
    fmap[fstr2] = 1
    fstr3 = u'Discourse Connective __ -> {}'.format(dcon)
    fmap[fstr3] = 1
    is_match = u'MATCH' if dcon == prev_dcon else u'not a match'
    fstr4 = u'Discourse Connective {}'.format(is_match)
    fmap[fstr4] = 1
def eval_against_baseline(testX, baselineY, newY, baseline_model,
                          new_model, base_feats, new_feats,
                          baseline_pred_trainY=None,
                          new_pred_trainY=None):
    """
    Evaluate differences in two models. Prints out per instance
    analysis of transitions predicted by baseline and new models.

    testX -- A list of corenlp.Document objects to evaluate on.

    baselineY -- A list of lists of discourse.hypergraph.Transition
        objects predicted by the baseline model for the documents in
        testX.

    newY -- A list of lists of discourse.hypergraph.Transition
        objects predicted by the new model for the documents in
        testX.

    baseline_model -- A discourse.perceptron.Perceptron object
        trained on the features in base_feats.

    new_model -- A discourse.perceptron.Perceptron object trained on
        the features in new_feats.

    base_feats -- A dict of feature names to boolean values,
        indicating the features active in the baseline model.

    new_feats -- A dict of feature names to boolean values,
        indicating the features active in the new model.

    baseline_pred_trainY -- Optional list of lists of Transition
        objects predicted by the baseline model on the training
        documents.

    new_pred_trainY -- Optional list of lists of Transition objects
        predicted by the new model on the training documents.
    """
    # Limit text output to 80 chars and wrap nicely.
    wrapper = textwrap.TextWrapper(subsequent_indent='\t')

    print u'OVERALL STATS FOR TEST DOCUMENTS'

    # Print macro-averaged Kendall's tau and p-values for baseline
    # and new model.
    bl_avg_kt, bl_avg_pval = avg_kendalls_tau(baselineY)
    new_avg_kt, new_avg_pval = avg_kendalls_tau(newY)
    print u'\t | BASELINE | NEW'
    print u'{:14} {:.3f} ({:.3f}) | {:.3f} ({:.3f})\n'.format(
        u'Kendalls Tau', bl_avg_kt, bl_avg_pval, new_avg_kt, new_avg_pval)

    # Print bigram gold sequence overlap (accuracy) for baseline and
    # new model.
    bl_bg_acc = mac_avg_bigram_acc(baselineY)
    new_bg_acc = mac_avg_bigram_acc(newY)
    print u'\t | BASELINE | NEW'
    print u'{:12} | {:.3f} | {:.3f} \n'.format(u'bigram acc',
                                               bl_bg_acc, new_bg_acc)

    if baseline_pred_trainY is not None or new_pred_trainY is not None:
        if baseline_pred_trainY is not None:
            bl_avg_kt_train, bl_avg_pval_train = avg_kendalls_tau(
                baseline_pred_trainY)
            bl_bg_acc_train = mac_avg_bigram_acc(baseline_pred_trainY)
        else:
            bl_avg_kt_train = float('nan')
            bl_avg_pval_train = float('nan')
            bl_bg_acc_train = float('nan')

        if new_pred_trainY is not None:
            new_avg_kt_train, new_avg_pval_train = avg_kendalls_tau(
                new_pred_trainY)
            new_bg_acc_train = mac_avg_bigram_acc(new_pred_trainY)
        else:
            new_avg_kt_train = float('nan')
            new_avg_pval_train = float('nan')
            new_bg_acc_train = float('nan')

        print u'OVERALL STATS FOR TRAINING DOCUMENTS'
        print u'\t | BASELINE | NEW'
        print u'{:14} {:.3f} ({:.3f}) | {:.3f} ({:.3f})\n'.format(
            u'Kendalls Tau', bl_avg_kt_train, bl_avg_pval_train,
            new_avg_kt_train, new_avg_pval_train)
        print u'\t | BASELINE | NEW'
        print u'{:12} | {:.3f} | {:.3f} \n'.format(
            u'bigram acc', bl_bg_acc_train, new_bg_acc_train)

    # Print stats for individual test instances.
    for test_idx, datum in enumerate(izip(testX, baselineY, newY), 1):
        testx, baseliney, newy = datum
        print u'TEST NO. {:4}\n=============\n'.format(test_idx)

        # Print Kendall's tau and p-value for baseline and new model
        # for this test instance.
        bl_kt, bl_pval = kendalls_tau(baseliney)
        new_kt, new_pval = kendalls_tau(newy)
        print u'\t | BASELINE | NEW'
        print u'{:14} {:.3f} ({:.3f}) | {:.3f} ({:.3f})\n'.format(
            u'K. Tau', bl_kt, bl_pval, new_kt, new_pval)

        # Print bigram gold sequence overlap (accuracy) for baseline
        # and new model.
        bl_acc = bigram_acc(baseliney)
        new_acc = bigram_acc(newy)
        print u'\t | BASELINE | NEW'
        print u'{:12} | {:.3f} | {:.3f} \n'.format(u'bigram acc',
                                                   bl_acc, new_acc)

        # Print document sentences in correct order.
        print u'GOLD TEXT\n=========\n'
        for i, s in enumerate(testx):
            print wrapper.fill(u'({:3}) {}'.format(i, unicode(s)))
        print u'\n\n'

        # Print document sentences in baseline order.
        print u'BASELINE TEXT\n=========\n'
        indices = [s2i(t.sents[0])
                   for t in recover_order(baseliney)[:-1]]
        for i in indices:
            print wrapper.fill(u'({}) {}'.format(i, unicode(testx[i])))
        print u'\n\n'

        # Print document sentences in new model order.
        print u'NEW MODEL TEXT\n=========\n'
        indices = [s2i(t.sents[0]) for t in recover_order(newy)[:-1]]
        for i in indices:
            print wrapper.fill(u'({}) {}'.format(i, unicode(testx[i])))
        print u'\n\n'

        # Get predicted transitions in order for both models.
        # NOTE: The predict function of the Perceptron object returns
        # the predicted transitions in no particular order.
        # When in doubt, use recover_order on any predicted output
        # if you want to iterate over it as if you were traversing
        # the graph of sentence transitions.
        baseline_trans = discourse.hypergraph.recover_order(baseliney)
        new_trans = discourse.hypergraph.recover_order(newy)

        # Map the tail sentence of a transition to the transition.
        p2t_baseline = _position2transition_map(baseline_trans)
        p2t_new = _position2transition_map(new_trans)

        # For each transition leaving the same sentence, if the models
        # disagree on what the next sentence is, print an analysis of
        # the model features.
        for pos, t_bl in p2t_baseline.items():
            if p2t_new[pos].sents[0] != t_bl.sents[0]:
                t_new = p2t_new[pos]

                # Print tail sentence.
                if pos > -1:
                    pos_str = unicode(testx[pos])
                else:
                    pos_str = u'START'
                print u'=' * 80
                print wrapper.fill(u'({:3}) {}'.format(pos, pos_str))
                print (u'-' * 80)
                print u' |\n V'

                # Print baseline head sentence.
                if s2i(t_bl.sents[0]) is not None:
                    bl_str = unicode(testx[s2i(t_bl.sents[0])])
                else:
                    bl_str = u'END'
                print wrapper.fill(u'(OLD) {}\n'.format(bl_str)) + u'\n'

                # Print baseline model features for the predicted
                # baseline transition.
                explain(t_bl, baseline_model, new_model, testx,
                        base_feats, new_feats)

                # Print new model head sentence.
                if s2i(t_new.sents[0]) is not None:
                    new_str = unicode(testx[s2i(t_new.sents[0])])
                else:
                    new_str = u'END'
                print wrapper.fill(u'(NEW) {}\n'.format(new_str)) + u'\n'

                # Print new model features for the predicted new
                # model transition.
                explain(t_new, baseline_model, new_model, testx,
                        base_feats, new_feats)

                # Print the gold head sentence, that is, the sentence
                # the models should have selected.
                if pos + 1 < len(testx):
                    gstr = u'(GLD) {}\n'.format(unicode(testx[pos + 1]))
                    print wrapper.fill(gstr) + u'\n'
                if pos + 1 == s2i(t_bl.sents[0], end=len(testx)):
                    print 'OLD MODEL IS CORRECT\n'
                if pos + 1 == s2i(t_new.sents[0], end=len(testx)):
                    print 'NEW MODEL IS CORRECT\n'
                print
def _f_personal_pronoun_res(self, fmap, transition):
    ecnts = self.entity_counts
    if transition[0] == u'END':
        prn0 = set([u'END'])
        per0 = set([u'END'])
    else:
        idx0 = s2i(transition[0])
        prn0 = self._extract_personal_pronouns(self.doc[idx0])
        per0 = self._extract_persons(self.doc[idx0])
    if transition[1] == u'START':
        prn1 = set([u'START'])
        per1 = set([u'START'])
    else:
        idx1 = s2i(transition[1])
        prn1 = self._extract_personal_pronouns(self.doc[idx1])
        per1 = self._extract_persons(self.doc[idx1])

    for prn in prn0:
        if len(per1) > 0:
            for per in per1:
                if per != u'START' and ecnts[per] > 1:
                    is_salient = u'SALIENT'
                else:
                    is_salient = u'not salient'
                fstr1 = u'Per. Prn.: PERSON --> {}'.format(per)
                fmap[fstr1] = 1
                fstr2 = u'Per. Prn.: PERSON --> {} {}'.format(per,
                                                              is_salient)
                fmap[fstr2] = 1
                fstr3 = u'Per. Prn.: PERSON --> prn'
                fmap[fstr3] = 1
                fstr4 = u'Per. Prn.: PERSON --> prn {}'.format(is_salient)
                fmap[fstr4] = 1
        else:
            fstr1 = u'Per. Prn.: X --> {}'.format(prn)
            fmap[fstr1] = 1
            fstr2 = u'Per. Prn.: X --> prn'
            fmap[fstr2] = 1

    for prn in prn1:
        if len(per0) > 0:
            for per in per0:
                if per != u'END' and ecnts[per] > 1:
                    is_salient = u'SALIENT'
                else:
                    is_salient = u'not salient'
                fstr1 = u'Per. Prn.: PERSON <-- {}'.format(per)
                fmap[fstr1] = 1
                fstr2 = u'Per. Prn.: PERSON <-- {} {}'.format(per,
                                                              is_salient)
                fmap[fstr2] = 1
                fstr3 = u'Per. Prn.: PERSON <-- prn'
                fmap[fstr3] = 1
                fstr4 = u'Per. Prn.: PERSON <-- prn {}'.format(is_salient)
                fmap[fstr4] = 1
        else:
            fstr1 = u'Personal Prn: X <-- {}'.format(prn)
            fmap[fstr1] = 1
            fstr2 = u'Personal Prn: X <-- prn'
            fmap[fstr2] = 1
def _f_role_match(self, fmap, transition):
    """
    This feature counts noun phrase head matches across sentences.
    The feature takes the form of the dependency relation for the
    entity in each sentence, and whether or not the entity in
    question is a salient entity. E.g. 'nsubj --> dobj SALIENT'.
    Start and end role transitions are similarly captured, e.g.
    'START -> other' and 'iobj -> END'.

    Parameters
    ----------
    fmap : dict (string -> int)
        A dict mapping feature names to feature values for this
        transition. This function mutates this dict.

    transition : Transition
        the graph transition, for which this function extracts
        features.
    """
    # If the tail sentence is START, create a START role for each
    # entity that occurs in the head sentence.
    if transition[1] == u'START':
        idx0 = s2i(transition[0])
        s0_ents = self._entity_roles(self.doc[idx0])
        s1_ents = [(token, u'START') for token, role in s0_ents]

    # If the head sentence is END, create an END role for each entity
    # that occurs in the tail sentence.
    elif transition[0] == u'END':
        idx1 = s2i(transition[1])
        s1_ents = self._entity_roles(self.doc[idx1])
        s0_ents = [(token, u'END') for token, role in s1_ents]

    # Default behavior: extract entity role tuples for each sentence.
    else:
        idx0 = s2i(transition[0])
        idx1 = s2i(transition[1])
        s0_ents = self._entity_roles(self.doc[idx0])
        s1_ents = self._entity_roles(self.doc[idx1])

    # Entity counts.
    ecnts = self.entity_counts

    # This set records entities matched in the head sentence. For
    # entities in the head sentence that are NOT matched, this set
    # makes it possible to create a feature of the form "X -> role",
    # where the X indicates that the entity did not appear in the
    # tail.
    used_ents = set()

    # This defaultdict is used to build the feature counts that will
    # be added to fmap.
    role_matches = defaultdict(int)

    # Find matching entities across sentences, and mark them as
    # features.
    for ent1 in s1_ents:
        lem1 = ent1[0].lem.lower()
        is_salient = u'SALIENT' if ecnts[lem1] > 2 else u'not salient'
        #ne1 = ent1[0].ne
        no_match = True
        for ent0 in s0_ents:
            lem0 = ent0[0].lem.lower()
            if lem0 == lem1:
                no_match = False
                used_ents.add(lem0)
                fstr1 = u'Role Trans: {} --> {}'.format(ent1[1], ent0[1])
                role_matches[fstr1] += 1
                sfstr1 = fstr1 + u' {}'.format(is_salient)
                role_matches[sfstr1] += 1

                # Backoff features with generic __ symbol.
                fstr2 = u'Role Trans: __ --> {}'.format(ent0[1])
                role_matches[fstr2] += 1
                sfstr2 = fstr2 + u' {}'.format(is_salient)
                role_matches[sfstr2] += 1
                fstr3 = u'Role Trans: {} --> __'.format(ent1[1])
                role_matches[fstr3] += 1
                sfstr3 = fstr3 + u' {}'.format(is_salient)
                role_matches[sfstr3] += 1
        if no_match:
            fstr1 = u'Role Trans: {} --> X'.format(ent1[1])
            role_matches[fstr1] += 1
            sfstr1 = fstr1 + u' {}'.format(is_salient)
            role_matches[sfstr1] += 1

    for ent, role in s0_ents:
        lem = ent.lem.lower()
        if lem not in used_ents:
            is_salient = u'SALIENT' if ecnts[lem] > 2 else u'not salient'
            fstr1 = u'Role Trans: X --> {}'.format(role)
            role_matches[fstr1] += 1
            sfstr1 = fstr1 + u' {}'.format(is_salient)
            role_matches[sfstr1] += 1

    # Note: the membership test below is case sensitive, so only the
    # SALIENT-marked variants are copied into fmap; names ending in
    # the lowercase 'not salient' do not match.
    for feature, val in role_matches.items():
        if 'SALIENT' in feature:
            fmap[feature] = val
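# Minimal sketch (not part of the module) of the case-sensitive
# filter at the end of _f_role_match, using hypothetical feature
# names: only the SALIENT-marked variants survive.
role_matches = {u'Role Trans: nsubj --> dobj SALIENT': 2,
                u'Role Trans: nsubj --> dobj not salient': 2}
fmap = {}
for feature, val in role_matches.items():
    if 'SALIENT' in feature:
        fmap[feature] = val
print fmap.keys()  # [u'Role Trans: nsubj --> dobj SALIENT']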
def _f_first_word(self, fmap, transition):
    """
    Marks the sequence of first words of the sentences selected by
    the graph edge *transition*. E.g. 'a' ---> 'the'.

    Parameters
    ----------
    fmap : dict (string -> int)
        A dict mapping feature names to feature values for this
        transition. This function mutates this dict.

    transition : Transition
        The graph edge, from which this function extracts features.
    """
    # Extract first word from tail sentence.
    if transition[1] == u'START':
        word1 = u'START'
        ne1 = u'START'
    else:
        idx = s2i(transition[1])
        sent1 = self.doc[idx]
        token1 = sent1.tokens[0]
        word1 = token1.lem.lower()
        ne1 = token1.ne

    # Extract first word from head sentence.
    if transition[0] == u'END':
        word0 = u'END'
        ne0 = u'END'
    else:
        idx = s2i(transition[0])
        sent0 = self.doc[idx]
        token0 = sent0.tokens[0]
        word0 = token0.lem.lower()
        ne0 = token0.ne

    # Mark the feature.
    fstr1 = u'First Word Trans: {} --> {}'.format(word1, word0)
    fmap[fstr1] = 1

    # Mark smoothed versions of this feature.
    fstr2 = u'First Word Trans: __ --> {}'.format(unicode(word0))
    fmap[fstr2] = 1
    fstr3 = u'First Word Trans: {} --> __'.format(unicode(word1))
    fmap[fstr3] = 1
    fstr4 = u'First Word Trans: {} --> {}'.format(ne1, ne0)
    fmap[fstr4] = 1
    fstr5 = u'First Word Trans: {} --> {}'.format(ne1, word0)
    fmap[fstr5] = 1
    fstr6 = u'First Word Trans: {} --> {}'.format(word1, ne0)
    fmap[fstr6] = 1
    fstr7 = u'First Word Trans: {} --> __'.format(ne1)
    fmap[fstr7] = 1
    fstr8 = u'First Word Trans: __ --> {}'.format(ne0)
    fmap[fstr8] = 1
testx, gtesty, ptesty = datum

kt, pval = evaluation.kendalls_tau(ptesty)
print u'Kendall\'s Tau : {:.3f} (pval {:.3f})'.format(kt, pval)
print u'Bigram Acc. : {:.3f}'.format(evaluation.bigram_acc(ptesty))
print

print u'GOLD ORDERING\n==================\n'
print unicode(testx.trans2str(gtesty))
print

for t in hypergraph.recover_order(gtesty):
    print u'TRANSITION: {}'.format(unicode(t))
    print u'=' * 79
    idx1 = hypergraph.s2i(t.sents[1])
    sent1 = testx[idx1] if idx1 > -1 else 'START'
    idx2 = hypergraph.s2i(t.sents[0])
    sent2 = testx[idx2] if idx2 is not None else 'END'
    print textwrap.fill(u'({:3}) {}'.format(idx1, unicode(sent1)))
    print u' |\n V'
    print textwrap.fill(u'({:3}) {}\n'.format(idx2, unicode(sent2)))
    evaluation.explain_transition(t, model, testx)
    print

print u'PREDICTED ORDERING\n==================\n'
print unicode(testx.trans2str(ptesty))
print

for t in hypergraph.recover_order(ptesty):