def buildTree(self, transactionDatabase):
    master = FPTree()
    for transaction in transactionDatabase:
        # print(transaction)
        master.add(transaction)
    return master
def build_tree(window, item_count):
    # Merge every bucket in the window into a single FP-tree, tracking the
    # average path length of the inserted (sorted) transactions.
    path_len_sum = 0
    path_count = 0
    tree = FPTree()
    for bucket in window:
        for (transaction, count) in bucket.tree:
            sorted_transaction = sort_transaction(transaction, item_count)
            path_len_sum += count * len(sorted_transaction)
            path_count += count
            tree.insert(sorted_transaction, count)
    avg_path_len = path_len_sum / path_count
    return (tree, avg_path_len)
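# Hedged usage sketch for build_tree (not from the original project): it
# assumes the Bucket and Item helpers defined later in this file, and that
# tree.item_count is a collections.Counter, as the .copy() calls elsewhere
# suggest. The transactions are made up.
from collections import Counter

buckets = [Bucket(list(map(Item, t))) for t in (["a", "b"], ["a", "c"])]
combined_count = sum((b.tree.item_count for b in buckets), Counter())
(tree, avg_path_len) = build_tree(buckets, combined_count)
print(tree.num_transactions, avg_path_len)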
def test_tree_iter():
    tree = FPTree()
    (item_count, _) = count_item_frequency_in(test_transactions)
    expected = Counter()
    for transaction in test_transactions:
        # sort_transaction returns the sorted transaction (cf. build_tree
        # above); the original discarded the return value.
        transaction = sort_transaction(transaction, item_count)
        tree.insert(transaction)
        expected[frozenset(transaction)] += 1
    observed = Counter()
    for (transaction, count) in tree:
        observed[frozenset(transaction)] += count
    assert expected == observed
def test_tree_iter():
    tree = FPTree()
    item_count = count_item_frequency_in(test_transactions)
    expected = Counter()
    for transaction in [list(map(Item, t)) for t in test_transactions]:
        transaction = sort_transaction(transaction, item_count)
        tree.insert(transaction)
        expected[frozenset(transaction)] += 1
    observed = Counter()
    for (transaction, count) in tree:
        observed[frozenset(transaction)] += count
    assert expected == observed
def __init__(self, debug=False):
    self.fp_tree = FPTree()
    self._maxGain_ = 0.0
    self._bestPattern = None
    self._bestPatterns = []
    self._label = None
    self.debug = debug
def conditional_tree_from_paths(paths, minimum_support):
    """Builds a conditional FP-tree from the given prefix paths."""
    tree = FPTree()
    condition_item = None
    items = set()

    # Import the nodes in the paths into the new tree. Only the counts of the
    # leaf nodes matter; the remaining counts will be reconstructed from the
    # leaf counts.
    for path in paths:
        if condition_item is None:
            condition_item = path[-1].item

        point = tree.root
        for node in path:
            next_point = point.search(node.item)
            if not next_point:
                # Add a new node to the tree.
                items.add(node.item)
                count = node.count if node.item == condition_item else 0
                next_point = FPNode(tree, node.item, count)
                point.add(next_point)
                tree._update_route(next_point)
            point = next_point

    assert condition_item is not None

    # Calculate the counts of the non-leaf nodes.
    for path in tree.prefix_paths(condition_item):
        count = None
        for node in reversed(path):
            if count is not None:
                node._count += count
            count = node.count

    # Eliminate the nodes for any items that are no longer frequent.
    for item in items:
        support = sum(n.count for n in tree.nodes(item))
        if support < minimum_support:
            # Doesn't make the cut anymore.
            for node in tree.nodes(item):
                if node.parent is not None:
                    node.parent.remove(node)

    # Finally, remove the nodes corresponding to the item for which this
    # conditional tree was generated.
    for node in tree.nodes(condition_item):
        if node.parent is not None:  # the node might already be an orphan
            node.parent.remove(node)

    return tree
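# The non-leaf count reconstruction above can be shown in isolation: walking a
# root-to-leaf path backwards, each node inherits the accumulated count of the
# nodes below it. A minimal standalone sketch, with plain integers standing in
# for FPNode counts (not the module's actual node objects):
path_counts = [0, 0, 3]  # root-to-leaf order; only the leaf count is known
running = None
rebuilt = []              # collects counts in leaf-to-root order
for c in reversed(path_counts):
    if running is not None:
        c += running
    rebuilt.append(c)
    running = c
assert rebuilt == [3, 3, 3]  # every node on the path supports the leaf's count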
class Bucket:
    def __init__(self, transaction=None):
        self.tree = FPTree()
        self.sorting_counter = None
        if transaction is not None:
            self.add(transaction)

    def add(self, transaction):
        self.tree.insert(sort_transaction(transaction, self.sorting_counter))

    def __len__(self):
        return self.tree.num_transactions

    def append(self, other_bucket):
        for (transaction, count) in other_bucket.tree:
            self.tree.insert(
                sort_transaction(transaction, self.sorting_counter), count)
        self.tree.sort()  # TODO: Is this necessary?
        self.sorting_counter = self.tree.item_count.copy()

    def __str__(self):
        # The original returned str(self.transactions), but Bucket never sets
        # that attribute; delegating to the tree avoids the AttributeError.
        return str(self.tree)
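# Hedged usage sketch for Bucket (assumes the Item helper used elsewhere in
# this file; the data is made up):
b1 = Bucket(list(map(Item, ["a", "b"])))
b2 = Bucket(list(map(Item, ["a"])))
b1.append(b2)        # merge b2's transactions into b1, then re-sort b1's tree
assert len(b1) == 2  # __len__ reports the number of stored transactions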
# Excerpt: the tail of generate_association_rules. For each frequent itemset,
# every proper subset of size i is tried as an antecedent; a rule is kept when
# its confidence (itemset support over antecedent support) meets the threshold.
        for antecedent in itertools.combinations(itemset, i):
            antecedent = tuple(sorted(antecedent))
            consequent = tuple(sorted(set(itemset) - set(antecedent)))
            if antecedent in patterns:
                lower_support = patterns[antecedent]
                confidence = float(upper_support) / lower_support
                if confidence >= confidence_threshold:
                    rules[antecedent] = (consequent, confidence)
    return rules


if __name__ == "__main__":
    big_dataset = load_transactions("data/Transactions.csv")
    support_threshold = int(len(big_dataset) * 0.05)
    tree = FPTree(big_dataset, support_threshold, None, None)
    patterns = tree.mine_patterns(support_threshold)
    print("Frequent patterns:", patterns)
    print("Patterns found:", len(patterns))

    # Generate association rules from the frequent itemsets.
    min_confidence = 0.5
    rules = generate_association_rules(patterns, min_confidence)
    for rule in rules.keys():
        print(rule, "=>", rules[rule])
    print("Number of rules found:", len(rules))
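# A worked instance of the confidence computation above, with made-up support
# counts: support({beer}) = 3 and support({beer, diapers}) = 2, so the rule
# ("beer",) => ("diapers",) has confidence 2/3 and passes a 0.5 threshold.
patterns_example = {("beer",): 3, ("beer", "diapers"): 2}
confidence = patterns_example[("beer", "diapers")] / patterns_example[("beer",)]
assert confidence >= 0.5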
from collections import defaultdict


def find_frequent_itemsets(transactions, minimum_support, include_support=False):
    """
    Find frequent itemsets in the given transactions using FP-growth. This
    function returns a generator instead of an eagerly-populated list of items.

    The `transactions` parameter can be any iterable of iterables of items.
    `minimum_support` should be an integer specifying the minimum number of
    occurrences of an itemset for it to be accepted.

    Each item must be hashable (i.e., it must be valid as a member of a
    dictionary or a set).

    If `include_support` is true, yield (itemset, support) pairs instead of
    just the itemsets.
    """
    items = defaultdict(lambda: 0)  # mapping from items to their supports
    processed_transactions = []

    # Load the passed-in transactions and count the support that individual
    # items have.
    for transaction in transactions:
        processed = []
        for item in transaction:
            items[item] += 1
            processed.append(item)
        processed_transactions.append(processed)

    # Remove infrequent items from the item support dictionary.
    items = dict((item, support) for item, support in items.items()
                 if support >= minimum_support)

    # Build our FP-tree. Before any transactions can be added to the tree,
    # they must be stripped of infrequent items and their surviving items must
    # be sorted in decreasing order of frequency.
    def clean_transaction(transaction):
        # Materialize a list before sorting in place (filter() is lazy in
        # Python 3).
        transaction = [v for v in transaction if v in items]
        transaction.sort(key=lambda v: items[v], reverse=True)
        return transaction

    master = FPTree()
    for transaction in map(clean_transaction, processed_transactions):
        master.add(transaction)

    def find_with_suffix(tree, suffix):
        for item, nodes in tree.items():
            support = sum(n.count for n in nodes)
            if support >= minimum_support and item not in suffix:
                # New winner!
                found_set = [item] + suffix
                yield (found_set, support) if include_support else found_set

                # Build a conditional tree and recursively search for frequent
                # itemsets within it.
                cond_tree = conditional_tree_from_paths(
                    tree.prefix_paths(item), minimum_support)
                for s in find_with_suffix(cond_tree, found_set):
                    yield s  # pass along the good news to our caller

    # Search for frequent itemsets, and yield the results we find.
    for itemset in find_with_suffix(master, []):
        yield itemset
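# Hedged usage sketch for find_frequent_itemsets (the dataset is made up; per
# its docstring, the function only needs an iterable of iterables of hashable
# items):
transactions = [
    ["beer", "nuts", "diapers"],
    ["beer", "diapers"],
    ["nuts", "cheese"],
    ["beer", "nuts", "diapers", "cheese"],
]
for itemset, support in find_frequent_itemsets(transactions, 2,
                                               include_support=True):
    print(itemset, support)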
class Featrues:
    def __init__(self, mid=1):
        self.mid = mid
        self.contexts = open('../data/%s.%s.all.nouns.contexts.utf-8' %
                             (config.PREFIX_CONTEXT, mid))
        self.features = open('../data/%s.%s.all.features.utf-8' %
                             (config.PREFIX_FEATURE, mid), 'w')
        self.results = open('../data/%s.%s.frequent.features.utf-8' %
                            (config.PREFIX_FEATURE, mid), 'w')
        self.scores = {}
        self.preTree = FPTree()
        self.postTree = FPTree()
        self.wordCount = self._genWordCount()

    def _genWordCount(self):
        # Map each bare noun (text before '#') to its count.
        nouns = open('../data/%s.%s.all.nouns.utf-8' %
                     (config.PREFIX_WORD, self.mid))
        wc = {}
        for n in nouns:
            w = n.strip().split(' ')[0]
            c = n.strip().split(' ')[1]
            wc[w.split('#')[0]] = c
        nouns.close()
        return wc

    def save(self):
        self.contexts.close()
        self.features.close()
        self.results.close()

    def outputFeatures(self):
        sortedFeatures = sorted(self.scores.items(), key=lambda k: k[1],
                                reverse=True)
        for f in sortedFeatures:
            if f[1] > 10:
                self.results.write(f[0] + '\n')

    def outputAllFeatures(self):
        preFP = self.preTree.getFP()
        postFP = self.postTree.getFP()
        preFrequence = self._toOneWord(preFP)
        postFrequence = self._toOneWord(postFP)
        sortedPre = sorted(preFrequence.items(), key=lambda k: k[1][0],
                           reverse=True)
        for s in sortedPre:
            key = s[0]
            if key in postFrequence and \
                    preFrequence[key][1].strip() != 'empty' and \
                    postFrequence[key][1].strip() != 'empty':
                score = (config.WORD_COUNT_FACTOR * float(self.wordCount[key]) +
                         config.PRE_CONTEXT_FACTOR * float(s[1][0]) +
                         config.POST_CONTEXT_FACTOR * float(postFrequence[key][0]))
                if key in self.scores:
                    self.scores[key] += score
                else:
                    self.scores[key] = score
                self.features.write('%s %s %s %s %s %s\n' %
                                    (s[1][0],
                                     postFrequence[key][0],
                                     self.wordCount[key],
                                     preFrequence[key][1],
                                     postFrequence[key][1],
                                     key))

    def _toOneWord(self, fp):
        f = {}
        for key in fp:
            for k in key.split(' '):
                f[k] = fp[key]
        return f

    def genFeatrues(self):
        for line in self.contexts:
            if line.find('>>>') < 0:
                continue
            word = line.strip().split('>>>')[0].split('#')[0]
            cs = line.strip().split('>>>')[1].split('&&&')
            for c in cs:
                pre = c.split('---')[0].split(' ')
                post = c.split('---')[1].split(' ')
                pre.reverse()
                transaction = self._genTransaction(word, pre)
                self.preTree.insertTransaction(transaction)
                transaction = self._genTransaction(word, post)
                self.postTree.insertTransaction(transaction)

    def _genTransaction(self, word, contexts):
        # Context words are appended farthest-first; the head word goes last.
        transaction = []
        length = len(contexts)
        for i in range(length):
            c = contexts[length - i - 1]
            if c.find('#') < 0:
                transaction.append(c)
            else:
                transaction.append(c.split('#')[1])
        transaction.append(word)
        return transaction
def test_tree_sorting():
    expected_tree = construct_initial_tree(test_transactions)
    assert expected_tree.is_sorted()

    tree = FPTree()
    for transaction in test_transactions:
        # Insert reversed, since lexicographical order is already decreasing
        # frequency order in this example.
        tree.insert(map(Item, reversed(transaction)))
    assert str(expected_tree) != str(tree)
    tree.sort()
    assert tree.is_sorted()
    assert str(expected_tree) == str(tree)

    # Benchmark sorting on the bundled datasets.
    datasets = [
        "datasets/UCI-zoo.csv",
        "datasets/mushroom.csv",
        # "datasets/BMS-POS.csv",
        # "datasets/kosarak.csv",
    ]
    for csvFilePath in datasets:
        print("Loading FPTree for {}".format(csvFilePath))
        start = time.time()
        tree = FPTree()
        with open(csvFilePath, newline='') as csvfile:
            for line in csv.reader(csvfile):
                # Insert sorted lexicographically.
                transaction = sorted(map(Item, line))
                tree.insert(transaction)
        duration = time.time() - start
        print("Loaded in {:.2f} seconds".format(duration))
        print("Sorting...")
        start = time.time()
        tree.sort()
        duration = time.time() - start
        print("Sorting took {:.2f} seconds".format(duration))
        assert tree.is_sorted()
def mine_cp_tree_stream(transactions, min_support, sort_interval, window_size):
    # Yields (window_start_index, window_length, patterns).
    tree = FPTree()
    sliding_window = deque()
    frequency = None
    num_transactions = 0
    for transaction in transactions:
        num_transactions += 1
        transaction = sort_transaction(map(Item, transaction), frequency)
        tree.insert(transaction)
        sliding_window.append(transaction)
        if len(sliding_window) > window_size:
            transaction = sliding_window.popleft()
            transaction = sort_transaction(transaction, frequency)
            tree.remove(transaction, 1)
            assert len(sliding_window) == window_size
            assert tree.num_transactions == window_size
        if (num_transactions % sort_interval) == 0:
            tree.sort()
            frequency = tree.item_count.copy()
        if (num_transactions % window_size) == 0:
            if (num_transactions % sort_interval) != 0:
                # We won't have sorted due to the previous check, so we
                # need to sort before mining.
                tree.sort()
                frequency = tree.item_count.copy()
            assert tree.num_transactions == len(sliding_window)
            assert len(sliding_window) == window_size
            min_count = min_support * tree.num_transactions
            patterns = fp_growth(tree, min_count, [])
            yield (num_transactions - len(sliding_window),
                   len(sliding_window), patterns)

    # We didn't just mine on the last transaction; we need to mine now,
    # else we'll miss data.
    if (num_transactions % window_size) != 0:
        if (num_transactions % sort_interval) != 0:
            tree.sort()
            frequency = tree.item_count.copy()
        min_count = min_support * tree.num_transactions
        patterns = fp_growth(tree, min_count, [])
        yield (num_transactions - len(sliding_window),
               len(sliding_window), patterns)
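# Hedged usage sketch for mine_cp_tree_stream (Item, FPTree, sort_transaction,
# and fp_growth come from this module; the stream below is made up):
stream = [["a", "b"], ["b", "c"], ["a", "b", "c"], ["a", "c"]]
for (start, length, patterns) in mine_cp_tree_stream(
        stream, min_support=0.5, sort_interval=2, window_size=2):
    print(start, length, patterns)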
# Just some placeholder data
# miner = DDPMine(debug=False)
# miner.mine()

if __name__ == "__main__":
    database = TransactionDatabase.loadFromFile("./data/train_adt.csv", ['97'], 100)
    data = TransactionDatabase.loadFromFile("./data/train_adt.csv", ['97'], 1)
    data1 = TransactionDatabase.loadFromFile("./data/test_adt.csv", ['97'], 1)

    tree = FPTree()
    for t in database:
        tree.add(t)

    miner = DDPMine(debug=True)
    # time.clock() was removed in Python 3.8; perf_counter() is the modern
    # equivalent for timing a code section.
    start = time.perf_counter()
    Pt = miner.mine(database, 100)
    elapsed = time.perf_counter() - start
    print("Time Total:%f" % elapsed)
    print(Pt)
    for row in Pt:
        print("Pattern:%s label:%s" % (row[0], row[1]))
    for row in Pt:
        lb1 = 0  # truncated in the original snippet