# Beispiel #1 (Example 1)
def avg_metric_by_label(t, metric_fn, label_attachment):
    """
    Average ``metric_fn`` over all subtrees of *t* sharing a label.

    Sentence-level labels are skipped; every other label ``L`` produces an
    entry keyed ``L + label_attachment`` whose value is the mean metric of
    all subtrees labelled ``L``.

    >>> from nltk.tree import Tree
    >>> t_0 = Tree.fromstring("(A (B b))")
    >>> t_1 = Tree.fromstring("(A a)")
    >>> fn = lambda t: t.height()
    >>> t = Tree("sentences", [Tree("id: 0", [t_0]), Tree("id: 1", [t_1])])
    >>> avg_heights = avg_metric_by_label(t, fn, '_avg_height')
    >>> sorted(avg_heights.keys())
    ['A_avg_height', 'B_avg_height']
    >>> [avg_heights[l] for l in sorted(avg_heights.keys())]
    [2.5, 2.0]
    """
    def keep(subtree):
        # Only non-sentence nodes contribute to the statistics.
        return not lime_utils.is_sentence_label(subtree.label())

    metric_by_label = nltk.ConditionalFreqDist(
        (sub.label(), metric_fn(sub))
        for sub in t.subtrees(filter=keep))

    avg_of = avg_vals_fn(metric_by_label)

    result = nltk.FreqDist()
    for label in metric_by_label:
        avg = avg_of(label)
        # avg_of may signal "no data" with None; skip those labels.
        if avg is not None:
            result[label + label_attachment] = avg
    return result
def dep_metrics(t, fn_map, acc=None):
    """
    Collect per-dependency-label metrics over every tagged subtree of *t*.

    :param t: an ``nltk.Tree`` (or a leaf) rooted at a sentence collection.
    :param fn_map: mapping of metric name -> function taking a subtree.
    :param acc: optional dict accumulator; a fresh dict is created when None
        (avoids the mutable-default-argument pitfall).
    :return: dict mapping ``"<label>_<metric>"`` to the list of metric
        values, one per matching subtree, in depth-first traversal order.

    >>> from nltk.tree import Tree
    >>> word = lambda w, i: 'word: %s, index: %s' % (w, i)
    >>> sent_tree = lambda i, t: Tree('id: %s' % i, [t])
    >>> sent_1 = sent_tree(1, Tree('dobj', [word('b', 1)]))
    >>> sent_2 = sent_tree(2, Tree('dobj',
    ...    [Tree('nsubj', [word('c', 1)]), word('d', 2)]))
    >>> t = Tree('sentences', [sent_1, sent_2])
    >>> width_fn = lambda t: len(t.leaves())
    >>> height_fn = lambda t: t.height()
    >>> m = dep_metrics(t, {'width': width_fn, 'height': height_fn})
    >>> sorted(m.items())
    [('dobj_height', [2, 3]), ('dobj_width', [1, 2]), ('nsubj_height', [2]), ('nsubj_width', [1])]
    """
    if acc is None:
        acc = dict()

    # Leaves (plain strings) carry no label and contribute nothing.
    if not tree_utils.is_tree(t):
        return acc

    label = t.label()
    is_word_label = "word: " in label
    # A "tag" is any node that is neither a sentence wrapper nor a word leaf.
    is_tag = not (lime_utils.is_sentence_label(label) or is_word_label)
    if is_tag:
        for name, fn in fn_map.items():
            # setdefault+append: O(1) amortized, instead of rebuilding the
            # whole list via ``acc.get(k, []) + [...]`` on every hit.
            acc.setdefault(label + "_" + name, []).append(fn(t))

    for child in t:
        dep_metrics(child, fn_map, acc)

    return acc
# Beispiel #3 (Example 3)
def tag_counts(t):
    """
    Count how often each non-sentence label occurs among the subtrees of *t*.

    >>> from nltk.tree import Tree
    >>> s = "(S (NP (DT The) (NN cat)) (VP (VBD ate) (NP (DT the) (NN mouse))))"
    >>> t = Tree.fromstring(s)
    >>> p = tag_counts(t)
    >>> sorted(p.keys())
    ['DT', 'NN', 'NP', 'S', 'VBD', 'VP']
    >>> [p[k] for k in sorted(p.keys())]
    [2, 2, 2, 1, 1, 1]
    >>> t = Tree("sentences", [Tree('id: 0', [t]), Tree('id: 1', [t])])
    >>> p = tag_counts(t)
    >>> sorted(p.keys())
    ['DT', 'NN', 'NP', 'S', 'VBD', 'VP']
    >>> [p[k] for k in sorted(p.keys())]
    [4, 4, 4, 2, 2, 2]
    """
    labels = (sub.label() for sub in t.subtrees())
    return nltk.FreqDist(
        label for label in labels
        if not lime_utils.is_sentence_label(label))
# Beispiel #4 (Example 4)
def phrase_yngve_depths(t):
    """
    Average Yngve depth per phrase label across all non-sentence subtrees.

    Result keys carry an ``_avg_yngve_depth`` suffix.

    >>> from nltk.tree import Tree
    >>> t = Tree.fromstring('(A (B b) (C c (B b)))')
    >>> d = phrase_yngve_depths(t)
    >>> expected = {'A_avg_yngve_depth': 0, 'B_avg_yngve_depth': .5, 'C_avg_yngve_depth': 0}
    >>> assert(expected == d)
    """
    def not_sentence(subtree):
        return not lime_utils.is_sentence_label(subtree.label())

    # Merge the per-phrase depth lists label by label.
    collected = dict()
    for phrase in tree_utils.below_condition(t, not_sentence):
        phrase_depths = tree_utils.yngve_depth(phrase)
        for label in phrase_depths:
            collected.setdefault(label, []).extend(phrase_depths[label])

    return {label + '_avg_yngve_depth': np.average(depths)
            for label, depths in collected.items()}
# Beispiel #5 (Example 5)
def phrase_sentence_cover(t, coeff=1.0, covers=None):
    """
    Fraction of a sentence covered by each phrase label.

    The root gets the full coefficient; each child inherits an equal share
    of its parent's coefficient. For a multi-sentence tree the per-sentence
    covers are averaged via ``tree_utils.avg_dicts``.

    >>> from nltk.tree import Tree
    >>> s = "(A (B (C c) (D d)) (E e))"
    >>> t = Tree.fromstring(s)
    >>> c = phrase_sentence_cover(t)
    >>> sorted(c.keys())
    ['A', 'B', 'C', 'D', 'E']
    >>> [c[k] for k in sorted(c.keys())]
    [1.0, 0.5, 0.25, 0.25, 0.5]
    >>> s_small = "(A (B (C c) (D d)))"
    >>> t_small = Tree.fromstring(s_small)
    >>> t = Tree("sentences", [Tree("id: 0", [t]), Tree("id: 1", [t_small])])
    >>> c = phrase_sentence_cover(t)
    >>> sorted(c.keys())
    ['A', 'B', 'C', 'D', 'E']
    >>> [c[k] for k in sorted(c.keys())]
    [1.0, 0.75, 0.375, 0.375, 0.5]
    """
    if covers is None:
        covers = dict()

    # Leaves (plain strings) do not add coverage.
    if not tree_utils.is_tree(t):
        return covers

    node_label = t.label()

    if lime_utils.is_sentence_label(node_label):
        # Compute each sentence independently, then average the dicts.
        per_sentence = [phrase_sentence_cover(child, coeff, dict())
                        for child in t]
        return tree_utils.avg_dicts(per_sentence)

    covers[node_label] = covers.get(node_label, 0) + coeff

    # Children split the parent's coefficient evenly.
    share = coeff / len(t)
    for child in t:
        phrase_sentence_cover(child, share, covers)

    return covers