def token_freq(trees):
    """
    Count how often each leaf token occurs across a collection of trees.

    Param:
    ------
    trees: iterable of trees; each one is handed to
        ``get_leaves_with_labels``, which is expected to yield
        ``(token, label)`` pairs for the leaves of that tree.

    Return:
    ------
    Counter mapping every leaf token to its total number of occurrences
    over all trees (labels are ignored). Empty when ``trees`` is empty.

    >>> trees = [(1, (1, (2, 'a'), (3, 'b')), (1, (1, 'c'), (2, 'd'))),
    ...          (1, (1, (2, 'b'), (3, 'b')), (1, (1, 'c'), (2, 'a')))]
    >>> token_freq(trees)
    Counter({'b': 3, 'a': 2, 'c': 2, 'd': 1})
    """
    counter = Counter()
    for tree in trees:
        leaves = get_leaves_with_labels(tree)
        # Feed the tokens straight into the running counter; this avoids
        # building a temporary list and a temporary Counter per tree.
        counter.update(token for token, _label in leaves)

    return counter
Beispiel #2
0
def token_freq(trees):
    """
    Collect token frequency statistics from trees.

    Param:
    ------
    trees: iterable of trees; the leaves of each tree are obtained through
        ``get_leaves_with_labels`` as ``(token, label)`` pairs.

    Return:
    ------
    Counter whose keys are the leaf tokens and whose values are the number
    of times each token appears in all the trees combined. The labels do
    not take part in the count.

    >>> trees = [(1, (1, (2, 'a'), (3, 'b')), (1, (1, 'c'), (2, 'd'))),
    ...          (1, (1, (2, 'b'), (3, 'b')), (1, (1, 'c'), (2, 'a')))]
    >>> token_freq(trees)
    Counter({'b': 3, 'a': 2, 'c': 2, 'd': 1})
    """
    counter = Counter()
    for tree in trees:
        # Counter.update adds the counts in place, so no intermediate
        # list or per-tree Counter has to be created and merged.
        counter.update(
            token for token, _label in get_leaves_with_labels(tree)
        )

    return counter
Beispiel #3
0
def collect_nodes(trees):
    """
    Gather one record per distinct node token of the trees, bottom-up.

    The trees are processed in rounds. In each round the current leaves of
    every tree are read with ``ptb.get_leaves_with_labels``; each leaf token
    that has not been recorded before yields one record. The tree is then
    replaced by ``merge_leaves(tree)`` for the next round, until
    ``merge_leaves`` raises ``CannotMergeAnyMoreException``.

    Param:
    ------

    trees: list of tree

    Return:
    ------
    list of tuple, (token, left child token, right child token, label).
    For a token that is a tuple the children are its first two items;
    for any other token (a single word) both children are None.

    Notes:
    ------
    - Tokens are de-duplicated by token only, across all trees, so a token
      is reported with the label of the occurrence that was seen first.
    - Leaves pass through a ``set`` before being recorded, so the order of
      the records produced by one tree in one round follows set iteration
      order.

    >>> from ptb import parse
    >>> t1 = parse("(4 (4 (2 A) (4 (3 (3 warm) (2 ,)) (3 funny))) (3 (2 ,) (3 (4 (4 engaging) (2 film)) (2 .))))")
    >>> t2 = parse("(0 (0 (2 A) (0 (0 (0 boring) (2 ,)) (0 bad))) (1 (2 ,) (1 (1 (1 unsatisfactory) (2 film)) (2 .))))")
    >>> t3 = parse("(2 film)") # some repetition
    >>> data = collect_nodes([t1, t2, t3])
    >>> len(data)
    24
    >>> data[-1]
    ((('A', (('boring', ','), 'bad')), (',', (('unsatisfactory', 'film'), '.'))), ('A', (('boring', ','), 'bad')), (',', (('unsatisfactory', 'film'), '.')), 0)
    >>> data[0]
    ('funny', None, None, 3)
    >>> nodes = collect_nodes([t1])
    >>> len(nodes)
    14
    >>> nodes
    [('funny', None, None, 3), (',', None, None, 2), ('.', None, None, 2), ('engaging', None, None, 4), ('film', None, None, 2), ('warm', None, None, 3), ('A', None, None, 2), (('warm', ','), 'warm', ',', 3), (('engaging', 'film'), 'engaging', 'film', 4), ((('warm', ','), 'funny'), ('warm', ','), 'funny', 4), ((('engaging', 'film'), '.'), ('engaging', 'film'), '.', 3), (('A', (('warm', ','), 'funny')), 'A', (('warm', ','), 'funny'), 4), ((',', (('engaging', 'film'), '.')), ',', (('engaging', 'film'), '.'), 3), ((('A', (('warm', ','), 'funny')), (',', (('engaging', 'film'), '.'))), ('A', (('warm', ','), 'funny')), (',', (('engaging', 'film'), '.')), 4)]
    """
    records = []  # accumulated (token, left, right, label) tuples
    seen = set()  # every token that already has a record

    while len(trees) > 0:
        next_round = []

        for tree in trees:
            # Keep only the leaves whose token has no record yet.
            fresh = [
                (tok, lab)
                for tok, lab in set(ptb.get_leaves_with_labels(tree))
                if tok not in seen
            ]

            # NOTE(review): a tree that contributes nothing new is dropped
            # here without being merged, so it takes no part in later
            # rounds -- confirm this is the intended behaviour.
            if not fresh:
                continue

            for tok, lab in fresh:
                if isinstance(tok, tuple):
                    # merged node: its two components are the children
                    left, right = tok[0], tok[1]
                else:
                    # single word: no children
                    left, right = None, None
                records.append((tok, left, right, lab))

            # Mark the tokens as recorded only after the whole batch is
            # emitted, so membership tests above see the pre-batch state.
            seen.update(tok for tok, _ in fresh)

            try:
                next_round.append(merge_leaves(tree))
            except CannotMergeAnyMoreException:
                # this tree is fully collapsed; nothing left to process
                pass

        # continue with the shallower trees produced in this round
        trees = next_round

    return records