def build_tree(window, item_count):
    """Merge every bucket in ``window`` into a single new FPTree.

    Each ``(transaction, count)`` pair produced by iterating a bucket's
    ``tree`` is re-ordered with ``sort_transaction(transaction, item_count)``
    and inserted into the new tree with that count.

    Args:
        window: Iterable of buckets; each must expose a ``tree`` attribute
            that iterates as ``(transaction, count)`` pairs.
        item_count: Ordering argument forwarded to ``sort_transaction``.

    Returns:
        ``(tree, avg_path_len)`` where ``avg_path_len`` is the count-weighted
        mean length of the inserted transactions, or ``0.0`` when nothing
        was inserted.
    """
    path_len_sum = 0
    path_count = 0
    tree = FPTree()
    for bucket in window:
        for (transaction, count) in bucket.tree:
            sorted_transaction = sort_transaction(transaction, item_count)
            path_len_sum += count * len(sorted_transaction)
            path_count += count
            tree.insert(sorted_transaction, count)
    # An empty window (or only empty buckets) would otherwise divide by zero.
    avg_path_len = path_len_sum / path_count if path_count else 0.0
    return (tree, avg_path_len)
def test_tree_iter():
    """Check that iterating an FPTree yields every inserted transaction.

    Builds a tree from ``test_transactions`` and compares the multiset of
    inserted item sets against the multiset reported by iterating the tree
    (each yielded transaction weighted by its count).
    """
    fp_tree = FPTree()
    item_count, _ = count_item_frequency_in(test_transactions)

    inserted = Counter()
    for txn in test_transactions:
        # NOTE(review): the return value is discarded here; other callers of
        # sort_transaction use its result. Confirm whether it sorts in place.
        sort_transaction(txn, item_count)
        fp_tree.insert(txn)
        inserted[frozenset(txn)] += 1

    seen = Counter()
    for path, weight in fp_tree:
        seen[frozenset(path)] += weight

    assert inserted == seen
def test_tree_iter():
    """Check that iterating an FPTree yields every inserted transaction.

    Each entry of ``test_transactions`` is converted to a list of ``Item``
    objects and inserted into a fresh tree. The multiset of inserted item
    sets must equal the multiset obtained by iterating the tree, with each
    yielded transaction weighted by its count.
    """
    tree = FPTree()
    item_count = count_item_frequency_in(test_transactions)
    expected = Counter()
    for transaction in [list(map(Item, t)) for t in test_transactions]:
        # NOTE(review): the return value is discarded here; other callers of
        # sort_transaction use its result. Confirm whether it sorts in place.
        sort_transaction(transaction, item_count)
        tree.insert(transaction)
        expected[frozenset(transaction)] += 1

    observed = Counter()
    for (transaction, count) in tree:
        observed[frozenset(transaction)] += count
    assert expected == observed
def _mine_window(tree, min_support, window_len, num_seen):
    """Run fp_growth on ``tree`` and package the result as one stream record."""
    min_count = min_support * tree.num_transactions
    patterns = fp_growth(tree, min_count, [])
    return (num_seen - window_len, window_len, patterns)


def mine_cp_tree_stream(transactions, min_support, sort_interval, window_size):
    """Mine a transaction stream through a sliding window held in an FPTree.

    Every incoming transaction is mapped to ``Item`` objects, ordered with
    ``sort_transaction`` using the most recent frequency snapshot, and
    inserted into the tree. Once the window holds more than ``window_size``
    transactions the oldest one is removed from the tree again. The tree is
    re-sorted every ``sort_interval`` transactions and always right before
    it is mined.

    Args:
        transactions: Iterable of transactions (iterables of raw items).
        min_support: Multiplied by the tree's transaction count to get the
            minimum count passed to ``fp_growth``.
        sort_interval: Re-sort the tree after every this many transactions.
        window_size: Maximum number of transactions kept in the window;
            the tree is mined each time this many more have been consumed.

    Yields:
        ``(window_start_index, window_length, patterns)`` once per
        ``window_size`` transactions, plus one final record when the stream
        ends part-way through a window.
    """
    tree = FPTree()
    sliding_window = deque()
    frequency = None
    num_transactions = 0
    for transaction in transactions:
        num_transactions += 1
        transaction = sort_transaction(map(Item, transaction), frequency)
        tree.insert(transaction)
        sliding_window.append(transaction)
        if len(sliding_window) > window_size:
            # Re-order the expired transaction with the current snapshot
            # before removing it from the tree.
            expired = sliding_window.popleft()
            expired = sort_transaction(expired, frequency)
            tree.remove(expired, 1)
            assert len(sliding_window) == window_size
            assert tree.num_transactions == window_size

        at_sort_point = (num_transactions % sort_interval) == 0
        at_window_end = (num_transactions % window_size) == 0
        # Sort exactly once when either a periodic sort is due or we are
        # about to mine; mining needs a sorted tree.
        if at_sort_point or at_window_end:
            tree.sort()
            frequency = tree.item_count.copy()
        if at_window_end:
            assert tree.num_transactions == len(sliding_window)
            assert len(sliding_window) == window_size
            yield _mine_window(
                tree, min_support, len(sliding_window), num_transactions)

    # The loop never breaks, so this always runs once the stream is
    # exhausted. If the last transaction did not land on a window boundary
    # it has not been mined yet; mine now, else we'd miss data.
    if (num_transactions % window_size) != 0:
        if (num_transactions % sort_interval) != 0:
            tree.sort()
        yield _mine_window(
            tree, min_support, len(sliding_window), num_transactions)
def test_tree_sorting():
    """Check that ``FPTree.sort`` restores the sorted tree shape.

    First inserts ``test_transactions`` in reversed item order, verifies the
    resulting tree differs from the tree built by ``construct_initial_tree``,
    then sorts it and verifies both trees render identically. Afterwards
    loads each CSV dataset with lexicographically sorted rows, sorts the
    tree, and asserts ``is_sorted()``; load and sort timings are printed.
    """
    expected_tree = construct_initial_tree(test_transactions)
    assert expected_tree.is_sorted()

    tree = FPTree()
    for transaction in test_transactions:
        # Insert reversed, since lexicographical order is already decreasing
        # frequency order in this example.
        tree.insert(map(Item, reversed(transaction)))
    assert str(expected_tree) != str(tree)
    tree.sort()
    assert tree.is_sorted()
    assert str(expected_tree) == str(tree)

    datasets = [
        "datasets/UCI-zoo.csv",
        "datasets/mushroom.csv",
        # "datasets/BMS-POS.csv",
        # "datasets/kosarak.csv",
    ]

    for csvFilePath in datasets:
        print("Loading FPTree for {}".format(csvFilePath))
        start = time.time()
        tree = FPTree()
        with open(csvFilePath, newline='') as csvfile:
            # Stream rows straight from the reader; no need to hold the
            # whole file in a list first.
            for line in csv.reader(csvfile):
                # Insert sorted lexicographically
                transaction = sorted(map(Item, line))
                tree.insert(transaction)
        duration = time.time() - start
        print("Loaded in {:.2f} seconds".format(duration))
        print("Sorting...")
        start = time.time()
        tree.sort()
        duration = time.time() - start
        print("Sorting took {:.2f} seconds".format(duration))
        assert tree.is_sorted()
class Bucket:
    """A group of transactions stored in its own FPTree."""

    def __init__(self, transaction=None):
        """Create an empty bucket, optionally seeded with one transaction."""
        self.tree = FPTree()
        # Ordering passed to sort_transaction; None until the first append(),
        # after which it is a copy of the tree's item counts.
        self.sorting_counter = None
        if transaction is not None:
            self.add(transaction)

    def add(self, transaction):
        """Insert ``transaction`` after ordering it with the current counter."""
        self.tree.insert(sort_transaction(transaction, self.sorting_counter))

    def __len__(self):
        """Return the number of transactions recorded by the tree."""
        return self.tree.num_transactions

    def append(self, other_bucket):
        """Merge every transaction of ``other_bucket`` into this bucket.

        Afterwards the tree is re-sorted and the sorting counter is
        refreshed from the tree's item counts.
        """
        for (transaction, count) in other_bucket.tree:
            self.tree.insert(
                sort_transaction(transaction, self.sorting_counter), count)
        self.tree.sort()  # TODO: Is this necessary?
        self.sorting_counter = self.tree.item_count.copy()

    def __str__(self):
        """Return the string form of the underlying tree."""
        # The bucket never defines a ``transactions`` attribute; its contents
        # live in ``self.tree``, so render that instead.
        return str(self.tree)