Ejemplo n.º 1
0
def _fp_tree(db, min_support):
    """
    FP-tree construction ([1] § 2.1, Algorithm 1).

    If frequent items in db are determined to be less than threshold,
    "bucketing" [2] is used instead.

    Returns
    -------
    tuple
        (FP-tree, None) or (None, list of frequent itemsets with support)
    """
    if not isinstance(db, list): db = list(db)

    if not db:
        return _FP_TREE_EMPTY

    # Used to count item support so it can be reported when generating itemsets
    item_support = defaultdict(int)
    # Used for ordering transactions' items for "optimally" "compressed" tree
    node_support = defaultdict(int)
    for count, transaction in db:
        for item in transaction:
            item_support[item] += count
            node_support[item] += 1
    # Only ever consider items that have min_support
    frequent_items = {
        item
        for item, support in item_support.items() if support >= min_support
    }

    # Short-circuit, if possible
    n_items = len(frequent_items)
    if 0 == n_items:
        return _FP_TREE_EMPTY
    if 1 == n_items:
        item = frequent_items.pop()
        return None, ((frozenset({item}), item_support[item]), )
    if n_items <= _BUCKETING_FEW_ITEMS:
        return None, ((frozenset(itemset), support)
                      for itemset, support in _bucketing_count(
                          db, frequent_items, min_support))

    # "The items [...] should be ordered in the frequency descending order of
    # node occurrence of each item instead of its support" ([1], p. 12, bottom)
    sort_index = {
        item: i
        for i, item in enumerate(
            sorted(frequent_items, key=node_support.__getitem__, reverse=True))
    }.__getitem__
    # Only retain frequent items and sort them
    db = ((count,
           sorted(frequent_items.intersection(transaction), key=sort_index))
          for count, transaction in db)

    root = _Node()
    node_links = defaultdict(list)
    for count, transaction in db:
        T = root
        for item in transaction:
            T = _fp_tree_insert(item, T, node_links, count)
    # Sorted support-descending (in reverse because popping from the back for efficiency)
    root.node_links = sorted(node_links.items(),
                             key=lambda i: -sort_index(i[0]))
    return root, None
Ejemplo n.º 2
0
def _fp_tree(db, min_support):
    """
    FP-tree construction ([1] § 2.1, Algorithm 1).

    If frequent items in db are determined to be less than threshold,
    "bucketing" [2] is used instead.

    Returns
    -------
    tuple
        (FP-tree, None) or (None, list of frequent itemsets with support)
    """
    if not isinstance(db, list): db = list(db)

    if not db:
        return _FP_TREE_EMPTY

    # Used to count item support so it can be reported when generating itemsets
    item_support = defaultdict(int)
    # Used for ordering transactions' items for "optimally" "compressed" tree
    node_support = defaultdict(int)
    for count, transaction in db:
        for item in transaction:
            item_support[item] += count
            node_support[item] += 1
    # Only ever consider items that have min_support
    frequent_items = {item
                      for item, support in item_support.items()
                      if support >= min_support}

    # Short-circuit, if possible
    n_items = len(frequent_items)
    if 0 == n_items:
        return _FP_TREE_EMPTY
    if 1 == n_items:
        item = frequent_items.pop()
        return None, ((frozenset({item}), item_support[item]),)
    if n_items <= _BUCKETING_FEW_ITEMS:
        return None, ((frozenset(itemset), support)
                      for itemset, support in _bucketing_count(db, frequent_items, min_support))

    # "The items [...] should be ordered in the frequency descending order of
    # node occurrence of each item instead of its support" ([1], p. 12, bottom)
    sort_index = {item: i
                  for i, item in
                      enumerate(sorted(frequent_items,
                                       key=node_support.__getitem__,
                                       reverse=True))}.__getitem__
    # Only retain frequent items and sort them
    db = ((count, sorted(frequent_items.intersection(transaction),
                         key=sort_index))
          for count, transaction in db)

    root = _Node()
    node_links = defaultdict(list)
    for count, transaction in db:
        T = root
        for item in transaction:
            T = _fp_tree_insert(item, T, node_links, count)
    # Sorted support-descending (in reverse because popping from the back for efficiency)
    root.node_links = sorted(node_links.items(), key=lambda i: -sort_index(i[0]))
    return root, None