def avg_metric_by_label(t, metric_fn, label_attachment):
    """Average `metric_fn` over every non-sentence subtree, grouped by label.

    Each result key is the subtree label with `label_attachment` appended;
    labels whose average is unavailable (avg_vals_fn returns None) are omitted.

    >>> from nltk.tree import Tree
    >>> t_0 = Tree.fromstring("(A (B b))")
    >>> t_1 = Tree.fromstring("(A a)")
    >>> fn = lambda t: t.height()
    >>> t = Tree("sentences", [Tree("id: 0", [t_0]), Tree("id: 1", [t_1])])
    >>> avg_heights = avg_metric_by_label(t, fn, '_avg_height')
    >>> sorted(avg_heights.keys())
    ['A_avg_height', 'B_avg_height']
    >>> [avg_heights[l] for l in sorted(avg_heights.keys())]
    [2.5, 2.0]
    """
    def keep(subtree):
        # Skip the artificial "sentences" / "id: N" wrapper nodes.
        return not lime_utils.is_sentence_label(subtree.label())

    by_label = nltk.ConditionalFreqDist(
        (node.label(), metric_fn(node))
        for node in t.subtrees(filter=keep))
    average_of = avg_vals_fn(by_label)

    averages = nltk.FreqDist()
    for label in by_label:
        avg = average_of(label)
        if avg is not None:
            averages[label + label_attachment] = avg
    return averages
def dep_metrics(t, fn_map, acc=None):
    """Collect per-dependency-label metrics over every subtree of `t`.

    :param t: an nltk Tree (sentence/word wrapper nodes are skipped)
    :param fn_map: dict mapping metric name -> function(subtree) -> value
    :param acc: accumulator dict used internally by the recursion
    :return: dict mapping '<label>_<metric>' -> list of values, one entry
        per occurrence of that label in `t`

    >>> from nltk.tree import Tree
    >>> word = lambda w, i: 'word: %s, index: %s' % (w, i)
    >>> sent_tree = lambda i, t: Tree('id: %s' % i, [t])
    >>> sent_1 = sent_tree(1, Tree('dobj', [word('b', 1)]))
    >>> sent_2 = sent_tree(2, Tree('dobj',
    ...     [Tree('nsubj', [word('c', 1)]), word('d', 2)]))
    >>> t = Tree('sentences', [sent_1, sent_2])
    >>> width_fn = lambda t: len(t.leaves())
    >>> height_fn = lambda t: t.height()
    >>> m = dep_metrics(t, {'width': width_fn, 'height': height_fn})
    >>> m == {'dobj_width': [1, 2], 'dobj_height': [2, 3],
    ...       'nsubj_height': [2], 'nsubj_width': [1]}
    True
    """
    if acc is None:
        acc = dict()
    if not tree_utils.is_tree(t):
        # Leaves are plain strings, not Trees; nothing to measure.
        return acc
    label = t.label()
    is_word_label = "word: " in label
    is_tag = not (lime_utils.is_sentence_label(label) or is_word_label)
    if is_tag:
        for name, fn in fn_map.items():
            key = label + "_" + name
            # Append in place rather than rebuilding the list each time.
            acc.setdefault(key, []).append(fn(t))
    for child in t:
        dep_metrics(child, fn_map, acc)
    return acc
def tag_counts(t):
    """Count occurrences of each tag label in `t`, ignoring sentence wrappers.

    >>> from nltk.tree import Tree
    >>> s = "(S (NP (DT The) (NN cat)) (VP (VBD ate) (NP (DT the) (NN mouse))))"
    >>> t = Tree.fromstring(s)
    >>> p = tag_counts(t)
    >>> sorted(p.keys())
    ['DT', 'NN', 'NP', 'S', 'VBD', 'VP']
    >>> [p[k] for k in sorted(p.keys())]
    [2, 2, 2, 1, 1, 1]
    >>> t = Tree("sentences", [Tree('id: 0', [t]), Tree('id: 1', [t])])
    >>> p = tag_counts(t)
    >>> sorted(p.keys())
    ['DT', 'NN', 'NP', 'S', 'VBD', 'VP']
    >>> [p[k] for k in sorted(p.keys())]
    [4, 4, 4, 2, 2, 2]
    """
    labels = (node.label() for node in t.subtrees())
    return nltk.FreqDist(
        label for label in labels
        if not lime_utils.is_sentence_label(label))
def phrase_yngve_depths(t):
    """Average Yngve depth per phrase label across all sentences in `t`.

    Result keys are '<label>_avg_yngve_depth'.

    >>> from nltk.tree import Tree
    >>> t = Tree.fromstring('(A (B b) (C c (B b)))')
    >>> d = phrase_yngve_depths(t)
    >>> expected = {'A_avg_yngve_depth': 0, 'B_avg_yngve_depth': .5, 'C_avg_yngve_depth': 0}
    >>> assert(expected == d)
    """
    def not_sentence(node):
        # Descend past the artificial "sentences" / "id: N" wrapper nodes.
        return not lime_utils.is_sentence_label(node.label())

    depths_by_label = {}
    for phrase in tree_utils.below_condition(t, not_sentence):
        for label, depths in tree_utils.yngve_depth(phrase).items():
            depths_by_label.setdefault(label, []).extend(depths)
    return {label + '_avg_yngve_depth': np.average(depths)
            for label, depths in depths_by_label.items()}
def phrase_sentence_cover(t, coeff=1.0, covers=None):
    """Fraction of each sentence covered by each phrase label, averaged
    over all sentences.

    Each node receives its parent's coefficient divided evenly among
    siblings, so a label's total is the share of the sentence it spans.

    >>> from nltk.tree import Tree
    >>> s = "(A (B (C c) (D d)) (E e))"
    >>> t = Tree.fromstring(s)
    >>> c = phrase_sentence_cover(t)
    >>> sorted(c.keys())
    ['A', 'B', 'C', 'D', 'E']
    >>> [c[k] for k in sorted(c.keys())]
    [1.0, 0.5, 0.25, 0.25, 0.5]
    >>> s_small = "(A (B (C c) (D d)))"
    >>> t_small = Tree.fromstring(s_small)
    >>> t = Tree("sentences", [Tree("id: 0", [t]), Tree("id: 1", [t_small])])
    >>> c = phrase_sentence_cover(t)
    >>> sorted(c.keys())
    ['A', 'B', 'C', 'D', 'E']
    >>> [c[k] for k in sorted(c.keys())]
    [1.0, 0.75, 0.375, 0.375, 0.5]
    """
    covers = {} if covers is None else covers
    if not tree_utils.is_tree(t):
        # Leaf (a plain word string): nothing to record.
        return covers

    node_label = t.label()
    if lime_utils.is_sentence_label(node_label):
        # Compute covers independently per sentence, then average them.
        per_sentence = [phrase_sentence_cover(child, coeff, dict())
                        for child in t]
        return tree_utils.avg_dicts(per_sentence)

    covers[node_label] = covers.get(node_label, 0) + coeff
    child_coeff = coeff / len(t)
    for child in t:
        phrase_sentence_cover(child, child_coeff, covers)
    return covers