Example #1
    def buildTree(self,transactionDatabase):

        master = FPTree()
        for transaction in transactionDatabase:
            # print(transaction)
            master.add(transaction)

        return master
Example #2
def build_tree(window, item_count):
    path_len_sum = 0
    path_count = 0
    tree = FPTree()
    for bucket in window:
        for (transaction, count) in bucket.tree:
            sorted_transaction = sort_transaction(transaction, item_count)
            path_len_sum += count * len(sorted_transaction)
            path_count += count
            tree.insert(sorted_transaction, count)
    avg_path_len = path_len_sum / path_count
    return (tree, avg_path_len)
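A hypothetical invocation of build_tree, for context. Bucket, Item, and sort_transaction are assumptions here, taken to behave as in Examples #8 and #14 on this page; the item_count type is also assumed.

# Usage sketch only; Bucket, Item, and sort_transaction are assumed from
# the other examples on this page, and item_count is assumed to be a
# Counter mapping items to frequencies.
from collections import Counter

window = [Bucket([Item("a"), Item("b")]), Bucket([Item("b"), Item("c")])]
item_count = Counter({Item("b"): 2, Item("a"): 1, Item("c"): 1})
tree, avg_path_len = build_tree(window, item_count)
print(tree.num_transactions, avg_path_len)  # expect 2 transactions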
Example #3
def test_tree_iter():
    tree = FPTree()
    (item_count, _) = count_item_frequency_in(test_transactions)
    expected = Counter()
    for transaction in test_transactions:
        # sort_transaction returns the sorted list (cf. Example #2), so keep it
        transaction = sort_transaction(transaction, item_count)
        tree.insert(transaction)
        expected[frozenset(transaction)] += 1
    observed = Counter()
    for (transaction, count) in tree:
        observed[frozenset(transaction)] += count
    assert (expected == observed)
Example #4
def test_tree_iter():
    tree = FPTree()
    item_count = count_item_frequency_in(test_transactions)
    expected = Counter()
    for transaction in [list(map(Item, t)) for t in test_transactions]:
        # sort_transaction returns the sorted list (cf. Example #2), so keep it
        transaction = sort_transaction(transaction, item_count)
        tree.insert(transaction)
        expected[frozenset(transaction)] += 1
    stored_transactions = set()
    observed = Counter()
    for (transaction, count) in tree:
        observed[frozenset(transaction)] += count
    assert(expected == observed)
Example #5
    def __init__(self, mid = 1):

        self.mid       = mid
        self.contexts  = open('../data/%s.%s.all.nouns.contexts.utf-8' % \
                              (config.PREFIX_CONTEXT, mid))
        self.features  = open('../data/%s.%s.all.features.utf-8' % \
                              (config.PREFIX_FEATURE, mid), 'w')

        self.results   = open('../data/%s.%s.frequent.features.utf-8' % \
                              (config.PREFIX_FEATURE, mid), 'w')

        self.scores    = {}

        self.preTree   = FPTree()
        self.postTree  = FPTree()

        self.wordCount = self._genWordCount()
Example #6
    def __init__(self, debug=False):
        self.fp_tree = FPTree()
        self._maxGain_ = 0.0
        self._bestPattern = None
        self._bestPatterns = []
        #######################
        self._label = None
        self.debug = debug
Example #7
def conditional_tree_from_paths(paths, minimum_support):
    """Builds a conditional FP-tree from the given prefix paths."""
    tree = FPTree()
    condition_item = None
    items = set()

    # Import the nodes in the paths into the new tree. Only the counts of the
    # leaf nodes matter; the remaining counts will be reconstructed from the
    # leaf counts.
    for path in paths:
        if condition_item is None:
            condition_item = path[-1].item

        point = tree.root
        for node in path:
            next_point = point.search(node.item)
            if not next_point:
                # Add a new node to the tree.
                items.add(node.item)
                count = node.count if node.item == condition_item else 0
                next_point = FPNode(tree, node.item, count)
                point.add(next_point)
                tree._update_route(next_point)
            point = next_point

    assert condition_item is not None

    # Calculate the counts of the non-leaf nodes.
    for path in tree.prefix_paths(condition_item):
        count = None
        for node in reversed(path):
            if count is not None:
                node._count += count
            count = node.count

    # Eliminate the nodes for any items that are no longer frequent.
    for item in items:
        support = sum(n.count for n in tree.nodes(item))
        if support < minimum_support:
            # Doesn't make the cut anymore
            for node in tree.nodes(item):
                if node.parent is not None:
                    node.parent.remove(node)

    # Finally, remove the nodes corresponding to the item for which this
    # conditional tree was generated.
    for node in tree.nodes(condition_item):
        if node.parent is not None:  # the node might already be an orphan
            node.parent.remove(node)

    return tree
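A minimal usage sketch for the function above. The prefix_paths method and the surrounding mining loop are assumptions, borrowed from the find_frequent_itemsets example further down this page.

# Hypothetical fragment: build the conditional tree for one item during
# FP-growth. `tree` is an FPTree of frequency-sorted transactions, and
# `item` / `minimum_support` come from the surrounding mining loop.
paths = tree.prefix_paths(item)              # all paths ending in `item`
cond_tree = conditional_tree_from_paths(paths, minimum_support)
# cond_tree can now be mined recursively for itemsets ending in `item`.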
Example #8
class Bucket:
    def __init__(self, transaction=None):
        self.tree = FPTree()
        self.sorting_counter = None
        if transaction is not None:
            self.add(transaction)

    def add(self, transaction):
        self.tree.insert(sort_transaction(transaction, self.sorting_counter))

    def __len__(self):
        return self.tree.num_transactions

    def append(self, other_bucket):
        for (transaction, count) in other_bucket.tree:
            self.tree.insert(
                sort_transaction(transaction, self.sorting_counter), count)
        self.tree.sort()  # TODO: Is this necessary?
        self.sorting_counter = self.tree.item_count.copy()

    def __str__(self):
        # Bucket has no `transactions` attribute; delegate to the tree.
        return str(self.tree)
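A hypothetical exercise of the class above; Item and sort_transaction are assumed to behave as in the other examples on this page.

# Sketch only: merge one bucket into another and check its size.
b1 = Bucket([Item("a"), Item("b")])
b2 = Bucket([Item("b"), Item("c")])
b1.append(b2)  # replays b2's transactions into b1's tree, then re-sorts
assert len(b1) == 2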
Example #9
def generate_association_rules(patterns, confidence_threshold):
    # Assumed scaffold: the original excerpt began mid-function. The names
    # `patterns`, `upper_support`, `rules`, and `confidence_threshold` are
    # reconstructed from how the surviving lines use them.
    rules = {}
    for itemset in patterns.keys():
        upper_support = patterns[itemset]
        for i in range(1, len(itemset)):
            for antecedent in itertools.combinations(itemset, i):
                antecedent = tuple(sorted(antecedent))
                consequent = tuple(sorted(set(itemset) - set(antecedent)))

                if antecedent in patterns:
                    lower_support = patterns[antecedent]
                    confidence = float(upper_support) / lower_support

                    if confidence >= confidence_threshold:
                        rules[antecedent] = (consequent, confidence)

    return rules


if __name__ == "__main__":
    big_dataset = load_transactions("data/Transactions.csv")

    support_threshold = int(len(big_dataset) * 0.05)
    tree = FPTree(big_dataset, support_threshold, None, None)
    patterns = tree.mine_patterns(support_threshold)

    print "Frequent patterns:", patterns
    print "Patterns found:", len(patterns)

    # Generate association rules from the frequent itemsets.
    min_confidence = 0.5
    rules = generate_association_rules(patterns, min_confidence)
    for rule in rules.keys():
        print rule, "=>", rules[rule]
    print "Number of rules found:", len(rules)
def find_frequent_itemsets(transactions,
                           minimum_support,
                           include_support=False):
    """
    Find frequent itemsets in the given transactions using FP-growth. This
    function returns a generator instead of an eagerly-populated list of items.

    The `transactions` parameter can be any iterable of iterables of items.
    `minimum_support` should be an integer specifying the minimum number of
    occurrences of an itemset for it to be accepted.

    Each item must be hashable (i.e., it must be valid as a member of a
    dictionary or a set).

    If `include_support` is true, yield (itemset, support) pairs instead of
    just the itemsets.
    """
    items = defaultdict(lambda: 0)  # mapping from items to their supports
    processed_transactions = []

    # Load the passed-in transactions and count the support that individual
    # items have.
    for transaction in transactions:
        processed = []
        for item in transaction:
            items[item] += 1
            processed.append(item)
        processed_transactions.append(processed)

    # Remove infrequent items from the item support dictionary.
    items = {item: support for item, support in items.items()
             if support >= minimum_support}

    # Build our FP-tree. Before any transactions can be added to the tree, they
    # must be stripped of infrequent items and their surviving items must be
    # sorted in decreasing order of frequency.
    def clean_transaction(transaction):
        # filter() returns an iterator in Python 3, so build a list here
        # so that the in-place sort below works.
        transaction = [v for v in transaction if v in items]
        transaction.sort(key=lambda v: items[v], reverse=True)
        return transaction

    master = FPTree()
    for transaction in map(clean_transaction, processed_transactions):
        master.add(transaction)

    def find_with_suffix(tree, suffix):
        for item, nodes in tree.items():
            support = sum(n.count for n in nodes)
            if support >= minimum_support and item not in suffix:
                # New winner!
                found_set = [item] + suffix
                yield (found_set, support) if include_support else found_set

                # Build a conditional tree and recursively search for frequent
                # itemsets within it.
                cond_tree = conditional_tree_from_paths(
                    tree.prefix_paths(item), minimum_support)
                for s in find_with_suffix(cond_tree, found_set):
                    yield s  # pass along the good news to our caller

    # Search for frequent itemsets, and yield the results we find.
    for itemset in find_with_suffix(master, []):
        yield itemset
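A minimal driver for the function above, based only on its docstring; the transactions and printed output are illustrative.

# Usage sketch: any iterable of iterables of hashable items works, and
# minimum_support is an absolute occurrence count.
transactions = [
    ["beer", "nuts"],
    ["beer", "cheese"],
    ["beer", "nuts", "cheese"],
    ["nuts"],
]
for itemset, support in find_frequent_itemsets(transactions, 2,
                                               include_support=True):
    print(itemset, support)  # e.g. ['beer'] 3, ['nuts'] 3, ...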
Example #11
class Featrues:

    def __init__(self, mid = 1):

        self.mid       = mid
        self.contexts  = open('../data/%s.%s.all.nouns.contexts.utf-8' % \
                              (config.PREFIX_CONTEXT, mid))
        self.features  = open('../data/%s.%s.all.features.utf-8' % \
                              (config.PREFIX_FEATURE, mid), 'w')

        self.results   = open('../data/%s.%s.frequent.features.utf-8' % \
                              (config.PREFIX_FEATURE, mid), 'w')

        self.scores    = {}

        self.preTree   = FPTree()
        self.postTree  = FPTree()

        self.wordCount = self._genWordCount()

    def _genWordCount(self):

        nouns = open('../data/%s.%s.all.nouns.utf-8' % (config.PREFIX_WORD, self.mid))
        wc    = {}

        for n in nouns:
            w = n.strip().split(' ')[0]
            c = n.strip().split(' ')[1]
            wc[w.split('#')[0]] = c

        nouns.close()

        return wc

    def save(self):

        self.contexts.close()
        self.features.close()
        self.results.close()

    def outputFeatures(self):

        sortedFeatures = sorted(self.scores.items(), key=lambda k: k[1], reverse=True)

        for f in sortedFeatures:
            # print(f[1])
            if f[1] > 10:
                self.results.write(f[0] + '\n')

    def outputAllFeatures(self):

        preFP  = self.preTree.getFP()
        postFP = self.postTree.getFP()

        preFrequence  = self._toOneWord(preFP)
        postFrequence = self._toOneWord(postFP)
        # self.preTree.printTree()
        # self.postTree.printTree()

        sortedPre  = sorted(preFrequence.items(), key=lambda k: k[1][0], reverse=True)

        for s in sortedPre:
            key = s[0]
            if key in postFrequence and \
               preFrequence[key][1].strip() != 'empty' and \
               postFrequence[key][1].strip() != 'empty':

                if key in self.scores:
                    self.scores[key] += config.WORD_COUNT_FACTOR * float(self.wordCount[key]) + \
                                        config.PRE_CONTEXT_FACTOR * float(s[1][0]) + \
                                        config.POST_CONTEXT_FACTOR * float(postFrequence[key][0])
                else:
                    self.scores[key]  = config.WORD_COUNT_FACTOR * float(self.wordCount[key]) + \
                                        config.PRE_CONTEXT_FACTOR * float(s[1][0]) + \
                                        config.POST_CONTEXT_FACTOR * float(postFrequence[key][0])

                self.features.write('%s %s %s %s %s %s\n' % \
                                    (s[1][0], \
                                     postFrequence[key][0], \
                                     self.wordCount[key], \
                                     preFrequence[key][1], \
                                     postFrequence[key][1], \
                                     key))

    def _toOneWord(self, fp):

        f = {}

        for key in fp:
            for k in key.split(' '):
                f[k]  = fp[key]

        return f


    def genFeatrues(self):

        for c in self.contexts:
            
            if c.find('>>>') < 0:
                continue

            word = c.strip().split('>>>')[0].split('#')[0]
            cs   = c.strip().split('>>>')[1].split('&&&')
            # Use a distinct name so the outer loop variable `c` is not shadowed.
            for ctx in cs:
                pre         = ctx.split('---')[0].split(' ')
                post        = ctx.split('---')[1].split(' ')

                pre.reverse()
                
                transaction = self._genTransaction(word, pre)
                self.preTree.insertTransaction(transaction)
                
                transaction = self._genTransaction(word, post)
                self.postTree.insertTransaction(transaction)

    def _genTransaction(self, word, contexts):

        transaction = []
        length      = len(contexts)

        for i in range(length):
            c = contexts[length - i - 1]
            if c.find('#') < 0:
                transaction.append(c)
            else:
                transaction.append(c.split('#')[1])

        transaction.append(word)
        return transaction
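A hypothetical end-to-end run of the class above. The data files and the config constants referenced in __init__ are assumptions carried over from the snippet itself.

# Sketch only; paths and config values are taken on faith from __init__.
f = Featrues(mid=1)
f.genFeatrues()        # fill preTree/postTree from the contexts file
f.outputAllFeatures()  # score words frequent in both trees
f.outputFeatures()     # write words whose score exceeds the threshold (10)
f.save()               # close all file handles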
Example #12
def test_tree_sorting():

    expected_tree = construct_initial_tree(test_transactions)
    assert(expected_tree.is_sorted())

    tree = FPTree()
    for transaction in test_transactions:
        # Insert reversed, since lexicographical order is already decreasing
        # frequency order in this example.
        tree.insert(list(map(Item, reversed(transaction))))
    assert(str(expected_tree) != str(tree))
    tree.sort()
    assert(tree.is_sorted())
    assert(str(expected_tree) == str(tree))

    datasets = [
        "datasets/UCI-zoo.csv",
        "datasets/mushroom.csv",
        # "datasets/BMS-POS.csv",
        # "datasets/kosarak.csv",
    ]

    for csvFilePath in datasets:
        print("Loading FPTree for {}".format(csvFilePath))
        start = time.time()
        tree = FPTree()
        with open(csvFilePath, newline='') as csvfile:
            for line in list(csv.reader(csvfile)):
                # Insert sorted lexicographically
                transaction = sorted(map(Item, line))
                tree.insert(transaction)
        duration = time.time() - start
        print("Loaded in {:.2f} seconds".format(duration))
        print("Sorting...")
        start = time.time()
        tree.sort()
        duration = time.time() - start
        print("Sorting took {:.2f} seconds".format(duration))
        assert(tree.is_sorted())
Example #13
    def __init__(self, transaction=None):
        self.tree = FPTree()
        self.sorting_counter = None
        if transaction is not None:
            self.add(transaction)
Example #14
def mine_cp_tree_stream(transactions, min_support, sort_interval, window_size):
    # Yields (window_start_index, window_length, patterns)
    tree = FPTree()
    sliding_window = deque()
    frequency = None
    num_transactions = 0
    for transaction in transactions:
        num_transactions += 1
        transaction = sort_transaction(map(Item, transaction), frequency)
        tree.insert(transaction)
        sliding_window.append(transaction)
        if len(sliding_window) > window_size:
            transaction = sliding_window.popleft()
            transaction = sort_transaction(transaction, frequency)
            tree.remove(transaction, 1)
            assert (len(sliding_window) == window_size)
            assert (tree.num_transactions == window_size)
        if (num_transactions % sort_interval) == 0:
            tree.sort()
            frequency = tree.item_count.copy()
        if (num_transactions % window_size) == 0:
            if (num_transactions % sort_interval) != 0:
                # We won't have sorted due to the previous check, so we
                # need to sort before mining.
                tree.sort()
                frequency = tree.item_count.copy()
            assert (tree.num_transactions == len(sliding_window))
            assert (len(sliding_window) == window_size)
            min_count = min_support * tree.num_transactions
            patterns = fp_growth(tree, min_count, [])
            yield (num_transactions - len(sliding_window), len(sliding_window),
                   patterns)
    else:
        # This for/else clause always runs (there is no `break` above). If
        # the stream didn't end exactly on a window boundary, mine the
        # remaining transactions now so their data isn't missed.
        if (num_transactions % window_size) != 0:
            if (num_transactions % sort_interval) != 0:
                tree.sort()
                frequency = tree.item_count.copy()
            min_count = min_support * tree.num_transactions
            patterns = fp_growth(tree, min_count, [])
            yield (num_transactions - len(sliding_window), len(sliding_window),
                   patterns)
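A hypothetical consumer of this generator, following the yield comment at the top of the function; `transactions` stands for any iterable of item lists, and the parameter values are illustrative.

# Sketch: stream transactions through the miner and report each window.
for start, length, patterns in mine_cp_tree_stream(
        transactions, min_support=0.05, sort_interval=1000, window_size=5000):
    print("window [{}, {}): {} patterns".format(start, start + length,
                                                len(patterns)))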
Example #15
#     # Just some placeholder data
#     miner = DDPMine(debug=False)
#     miner.mine()

if __name__ == "__main__":

    database = TransactionDatabase.loadFromFile("./data/train_adt.csv", ['97'],
                                                100)
    data = TransactionDatabase.loadFromFile("./data/train_adt.csv", ['97'], 1)
    data1 = TransactionDatabase.loadFromFile("./data/test_adt.csv", ['97'], 1)
    # database.cleanAndPrune(2)
    # print ("Cleaned database:")
    # for transaction in database.transactions:
    #     print(str(transaction.label))
    # print ("\nItems in FP tree and corresponding nodes:")
    tree = FPTree()
    for t in database:
        tree.add(t)

    # print(str(tree))
    miner = DDPMine(debug=True)
    start = time.perf_counter()  # time.clock() was removed in Python 3.8
    Pt = miner.mine(database, 100)
    elapsed = time.perf_counter() - start
    print("Time Total:%f" % elapsed)
    print(Pt)
    for row in Pt:
        print("Pattern:%s  label:%s" % (row[0], row[1]))

    for row in Pt:
        lb1 = 0