def test_construct_ptree():
    """Check ``construct_ptree`` against a small hand-built fixture.

    Six label sets are passed in order. In this fixture, entry ``i`` of the
    expected result lists the indices of the earlier label sets that share at
    least one label with set ``i`` (so the sets ``{B}`` and ``{D}``-style
    singletons exercise the "one parent" and "no parent" cases).
    """
    A, B, C, D = 'A', 'B', 'C', 'D'
    # Single-label groups need the trailing comma: ``(B)`` is just the string
    # ``B``, and ``set('B')`` only yields ``{'B'}`` because the label happens
    # to be one character long.
    label_groups = [(A, B, C),
                    (B,),
                    (A, C),
                    (D,),
                    (B, C),
                    (A,)]
    # Build a real list rather than ``map(...)``: on Python 3 ``map`` returns
    # a one-shot iterator, which breaks if the callee indexes, measures or
    # re-iterates its argument.
    labelset_list = [set(group) for group in label_groups]
    actual_tree = construct_ptree(labelset_list)
    expected_tree = [[], [0], [0], [], [0, 1, 2], [0, 2]]
    # NOTE(review): edge-list form left by the original author; it does not
    # list (0, 4) or (0, 5), so it may describe a reduced variant -- confirm.
    # [(0, 1), (0, 2),
    #                   (1, 4), (2, 4), (2, 5)]
    assert_equal(actual_tree, expected_tree)
def main():
    """Script entry point: build a ptree from the most frequently tagged articles.

    Steps visible in this function:
      1. count tag frequencies over all articles,
      2. keep the ``N_TOP_TAGs`` most common tags,
      3. compact the articles into edge records restricted to those tags,
      4. print a few diagnostics,
      5. order the records by ``publish_time`` and pass their tag sets to
         ``construct_ptree``.
    """
    # Number of most frequent tags to keep.
    N_TOP_TAGs = 50
    
    # Imported locally so the database connection is only needed when the
    # script is actually run.
    # NOTE(review): presumably a MongoDB connection/collection -- confirm in ``db``.
    from db import conn
    articles = conn['bloomberg'].articles
    # ``tag_freq`` must expose ``most_common`` (presumably a collections.Counter
    # -- verify against ``count_tags``).
    tag_freq = count_tags(articles.find())
    target_tags = set([k for k, _ in tag_freq.most_common(N_TOP_TAGs)])
    # A fresh ``find()`` is issued because the first query result was already
    # handed to ``count_tags``.
    compact_edges = compactize_edges_by_tags(articles.find(), target_tags)
    
    # Diagnostics: a sample of records, how many records carry more than one
    # tag, and the total number of records.
    pprint(compact_edges[:10])
    print(sum([1 for e in compact_edges if len(e['tags']) > 1]))
    pprint(len(compact_edges))
    
    # Order the records by their 'publish_time' value (ascending) so the tag
    # sets reach ``construct_ptree`` in that order.
    sorted_compact_edges = sorted(compact_edges, key=lambda item: item['publish_time'])
    tagset_list = [e['tags'] for e in sorted_compact_edges]
    tree = construct_ptree(tagset_list)