Esempio n. 1
0
class Bucket:
    """A group of transactions stored in an FPTree.

    Transactions are ordered with ``sort_transaction`` before they are
    inserted, using the item counts captured by the last ``append`` call
    (or ``None`` if ``append`` has not been called yet).
    """

    def __init__(self, transaction=None):
        """Create an empty bucket, optionally seeded with one transaction.

        Args:
            transaction: Optional first transaction to add to the bucket.
        """
        self.tree = FPTree()
        # Snapshot of the tree's item counts taken at the end of the last
        # append(); handed to sort_transaction() as the ordering reference.
        self.sorting_counter = None
        if transaction is not None:
            self.add(transaction)

    def add(self, transaction):
        """Order ``transaction`` and insert it into the bucket's tree."""
        self.tree.insert(sort_transaction(transaction, self.sorting_counter))

    def __len__(self):
        """Return the number of transactions held by the tree."""
        return self.tree.num_transactions

    def append(self, other_bucket):
        """Merge every transaction of ``other_bucket`` into this bucket.

        Iterating the other bucket's tree yields ``(transaction, count)``
        pairs; each one is re-ordered with this bucket's current sorting
        counter and inserted with its count. Afterwards the tree is sorted
        and the sorting counter is refreshed from the tree's item counts.

        Args:
            other_bucket: The bucket whose transactions are merged in. It is
                only read, not modified, by this method.
        """
        for transaction, count in other_bucket.tree:
            self.tree.insert(
                sort_transaction(transaction, self.sorting_counter), count)
        self.tree.sort()  # TODO: Is this necessary?
        self.sorting_counter = self.tree.item_count.copy()

    def __str__(self):
        """Return the string form of the underlying tree."""
        # The bucket has no ``transactions`` attribute; the tree is the only
        # place the transactions are stored, so render that instead.
        return str(self.tree)
Esempio n. 2
0
def test_tree_sorting():
    """Check that ``FPTree.sort()`` produces a sorted tree.

    First, a tree built from ``test_transactions`` in reversed item order
    must differ from the reference tree until it is sorted, and must match
    it afterwards. Second, each CSV dataset is loaded with lexicographically
    ordered transactions, sorted, and checked with ``is_sorted()``; load and
    sort durations are printed.
    """
    expected_tree = construct_initial_tree(test_transactions)
    assert expected_tree.is_sorted()

    tree = FPTree()
    for transaction in test_transactions:
        # Insert reversed, since lexicographical order is already decreasing
        # frequency order in this example.
        tree.insert(map(Item, reversed(transaction)))
    assert str(expected_tree) != str(tree)
    tree.sort()
    assert tree.is_sorted()
    assert str(expected_tree) == str(tree)

    datasets = [
        "datasets/UCI-zoo.csv",
        "datasets/mushroom.csv",
        # "datasets/BMS-POS.csv",
        # "datasets/kosarak.csv",
    ]

    for csv_path in datasets:
        print("Loading FPTree for {}".format(csv_path))
        # perf_counter() is monotonic, so the measured durations cannot be
        # skewed by system clock adjustments the way time.time() can.
        start = time.perf_counter()
        tree = FPTree()
        with open(csv_path, newline='') as csvfile:
            # Stream the rows; there is no need to hold the whole file in
            # memory just to iterate over it once.
            for line in csv.reader(csvfile):
                # Insert sorted lexicographically
                tree.insert(sorted(map(Item, line)))
        duration = time.perf_counter() - start
        print("Loaded in {:.2f} seconds".format(duration))
        print("Sorting...")
        start = time.perf_counter()
        tree.sort()
        duration = time.perf_counter() - start
        print("Sorting took {:.2f} seconds".format(duration))
        assert tree.is_sorted()
Esempio n. 3
0
def mine_cp_tree_stream(transactions, min_support, sort_interval, window_size):
    """Mine patterns from a stream of transactions over a sliding window.

    Every transaction is wrapped item-by-item in ``Item``, ordered with
    ``sort_transaction`` and inserted into a single ``FPTree``. Once more
    than ``window_size`` transactions are held, the oldest one is removed
    from the tree again. The tree is re-sorted every ``sort_interval``
    transactions, and mined with ``fp_growth`` every ``window_size``
    transactions. If the stream ends between two mining points, the
    remaining window is mined once more so that no data is missed.

    Args:
        transactions: Iterable of transactions, each an iterable of raw items.
        min_support: Multiplied by the number of transactions in the tree to
            obtain the minimum count passed to ``fp_growth``.
        sort_interval: Number of transactions between two tree re-sorts.
        window_size: Maximum number of transactions kept in the window.

    Yields:
        Tuples ``(window_start_index, window_length, patterns)`` where
        ``patterns`` is the value returned by ``fp_growth``.
    """
    tree = FPTree()
    sliding_window = deque()
    frequency = None
    num_transactions = 0

    def mine_window():
        # Build the result tuple for the window currently held in the tree.
        min_count = min_support * tree.num_transactions
        patterns = fp_growth(tree, min_count, [])
        window_length = len(sliding_window)
        return (num_transactions - window_length, window_length, patterns)

    for transaction in transactions:
        num_transactions += 1
        transaction = sort_transaction(map(Item, transaction), frequency)
        tree.insert(transaction)
        sliding_window.append(transaction)
        if len(sliding_window) > window_size:
            # The stored transaction may have been ordered under an older
            # frequency snapshot; re-order it with the current one before
            # removing it from the tree.
            expired = sort_transaction(sliding_window.popleft(), frequency)
            tree.remove(expired, 1)
            assert len(sliding_window) == window_size
            assert tree.num_transactions == window_size

        sorted_this_step = (num_transactions % sort_interval) == 0
        if sorted_this_step:
            tree.sort()
            frequency = tree.item_count.copy()

        if (num_transactions % window_size) == 0:
            if not sorted_this_step:
                # We won't have sorted due to the previous check, so we
                # need to sort before mining.
                tree.sort()
                frequency = tree.item_count.copy()
            assert tree.num_transactions == len(sliding_window)
            assert len(sliding_window) == window_size
            yield mine_window()

    # The loop never breaks, so this always runs after the stream ends.
    # If we didn't just mine on the last transaction, we need to mine now,
    # else we'll miss data.
    if (num_transactions % window_size) != 0:
        if (num_transactions % sort_interval) != 0:
            # No further insertions follow, so the frequency snapshot does
            # not need to be refreshed here.
            tree.sort()
        yield mine_window()