def speeds():
    """
    Benchmark the naive and simple rule finders against the fast apriori one.
    """
    import random
    import time

    random.seed(123456)
    transactions = generate_transactions(
        num_transactions=random.randint(250, 500),
        unique_items=random.randint(8, 9),
        items_row=(10, 50),
    )
    itemsets, num_transactions = itemsets_from_transactions(transactions, 0.1)
    min_conf = 0.5
    print(itemsets)

    st = time.perf_counter()
    rules_apri = generate_rules_apriori(itemsets, min_conf, num_transactions)
    rules_apri = list(rules_apri)
    time_formatted = round(time.perf_counter() - st, 6)
    print('Fast apriori ran in {} s'.format(time_formatted))

    st = time.perf_counter()
    rules_simple = generate_rules_simple(itemsets, min_conf, num_transactions)
    rules_simple = list(rules_simple)
    time_formatted = round(time.perf_counter() - st, 6)
    print('Simple apriori ran in {} s'.format(time_formatted))

    st = time.perf_counter()
    rules_naive = generate_rules_naively(itemsets, min_conf, num_transactions)
    rules_naive = list(rules_naive)
    time_formatted = round(time.perf_counter() - st, 6)
    print('Naive apriori ran in {} s'.format(time_formatted))
def test_generate_rules_apriori_large():
    """
    Test with lots of data.

    This test will fail if the second argument to `_ap_genrules` is not
    validated as non-empty before the recursive function call. We must have

        if H_m_copy:
            yield from _ap_genrules(...)

    for this test to pass.
    """
    transactions = generate_transactions(
        num_transactions=100, unique_items=30, items_row=(1, 20), seed=123
    )
    itemsets, num_transactions = itemsets_from_transactions(transactions, 0.1)
    min_conf = 0.3

    rules_apri = generate_rules_apriori(itemsets, min_conf, num_transactions)
    rules_naive = generate_rules_naively(itemsets, min_conf, num_transactions)
    rules_apri = list(rules_apri)
    rules_naive = list(rules_naive)

    # Test equal length, since no duplicates should be returned by apriori
    assert len(rules_apri) == len(rules_naive)

    # Test equal results
    assert set(rules_apri) == set(rules_naive)
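# Hedged sketch of the guard the docstring above describes. This is a
# simplified stand-in for `_ap_genrules`, not the library's implementation:
# `conf` (a callable returning the confidence of antecedent -> consequent)
# and the candidate-growing step are assumptions made for illustration.
# The point is the shape of the fix: filter consequents by confidence, then
# recurse only if any survived.
def _genrules_guard_sketch(itemset, H_m, conf, min_conf):
    """Yield rules from `itemset`, recursing only while candidates survive."""
    # Keep only consequents whose rule meets the confidence threshold.
    H_m_copy = [h for h in H_m if conf(itemset - h, h) >= min_conf]
    for h in H_m_copy:
        yield (frozenset(itemset - h), frozenset(h))
    if H_m_copy and len(itemset) > len(H_m_copy[0]) + 1:
        # Grow each surviving consequent by one item and recurse. Without
        # the `if H_m_copy` guard, this step would run on empty input.
        grown = {a | b for a in H_m_copy for b in H_m_copy
                 if len(a | b) == len(a) + 1}
        yield from _genrules_guard_sketch(itemset, grown, conf, min_conf)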
def test_generate_rules_simple_vs_apriori(transactions):
    """
    Test the simple rule finder from the paper against the apriori one.
    """
    itemsets, num_transactions = itemsets_from_transactions(transactions, 0.1)
    min_conf = 0.1

    rules_apri = generate_rules_apriori(itemsets, min_conf, num_transactions)
    rules_simple = generate_rules_simple(itemsets, min_conf, num_transactions)

    assert set(rules_apri) == set(rules_simple)
def apriori(
    transactions: typing.List[tuple],
    min_support: float = 0.5,
    min_confidence: float = 0.5,
    max_length: int = 8,
    verbosity: int = 0,
):
    """
    The classic apriori algorithm as described in 1994 by Agrawal et al.

    The Apriori algorithm works in two phases. Phase 1 iterates over the
    transactions several times to build up itemsets of the desired support
    level. Phase 2 builds association rules of the desired confidence given
    the itemsets found in Phase 1. Both of these phases may be correctly
    implemented by exhausting the search space, i.e. generating every
    possible itemset and checking its support. The Apriori algorithm prunes
    the search space efficiently by deciding a priori whether an itemset
    possibly has the desired support, before iterating over the entire
    dataset and checking.

    Parameters
    ----------
    transactions : list of tuples, or a callable returning a generator
        The transactions may be either a list of tuples, where the tuples
        must contain hashable items. Alternatively, a callable returning a
        generator may be passed. A generator is not sufficient, since the
        algorithm will exhaust it, and it needs to iterate over it several
        times. Therefore, a callable returning a generator must be passed.
    min_support : float
        The minimum support of the rules returned. The support is the
        frequency with which the items in the rule appear together in the
        data set.
    min_confidence : float
        The minimum confidence of the rules returned. Given a rule X -> Y,
        the confidence is the probability of Y, given X, i.e.
        P(Y|X) = conf(X -> Y)
    max_length : int
        The maximum length of the itemsets and the rules.
    verbosity : int
        The level of detail in the printed output when the algorithm runs.
        Either 0, 1 or 2.

    Examples
    --------
    >>> transactions = [('a', 'b', 'c'), ('a', 'b', 'd'), ('f', 'b', 'g')]
    >>> itemsets, rules = apriori(transactions, min_confidence=1)
    >>> rules
    [{a} -> {b}]
    """
    itemsets, num_trans = itemsets_from_transactions(
        transactions, min_support, max_length, verbosity
    )
    rules = generate_rules_apriori(
        itemsets, min_confidence, num_trans, verbosity
    )
    return itemsets, list(rules)
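# A hedged usage sketch for the `transactions` parameter above: a callable
# returning a generator lets the algorithm re-read a large dataset from disk
# on every pass instead of holding it in memory. The filename and the
# one-comma-separated-transaction-per-line format are assumptions made for
# illustration, not part of the library.
def _transactions_from_file_sketch():
    # A generator function is itself a callable returning a fresh generator
    # on every call, which is exactly what the docstring above requires.
    with open('transactions.txt') as file:
        for line in file:
            yield tuple(line.strip().split(','))

# Pass the function itself (not a generator) so it can be called repeatedly:
# itemsets, rules = apriori(_transactions_from_file_sketch, min_support=0.2)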
def test_generate_rules_naive_vs_apriori(transactions):
    """
    Test the naive rule finder against the apriori one.
    """
    itemsets, num_transactions = itemsets_from_transactions(transactions, 0.15)
    min_conf = 0.3

    rules_apri = generate_rules_apriori(itemsets, min_conf, num_transactions)
    rules_naive = generate_rules_naively(itemsets, min_conf, num_transactions)
    rules_apri = list(rules_apri)
    rules_naive = list(rules_naive)

    # Test equal length, since no duplicates should be returned by apriori
    assert len(rules_apri) == len(rules_naive)

    # Test equal results
    assert set(rules_apri) == set(rules_naive)
def apriori(
    transactions: typing.Iterable[typing.Union[set, tuple, list]],
    min_support: float = 0.5,
    min_confidence: float = 0.5,
    max_length: int = 8,
    verbosity: int = 0,
    output_transaction_ids: bool = False,
):
    """
    The classic apriori algorithm as described in 1994 by Agrawal et al.

    The Apriori algorithm works in two phases. Phase 1 iterates over the
    transactions several times to build up itemsets of the desired support
    level. Phase 2 builds association rules of the desired confidence given
    the itemsets found in Phase 1. Both of these phases may be correctly
    implemented by exhausting the search space, i.e. generating every
    possible itemset and checking its support. The Apriori algorithm prunes
    the search space efficiently by deciding a priori whether an itemset
    possibly has the desired support, before iterating over the entire
    dataset and checking.

    Parameters
    ----------
    transactions : list of transactions (sets, tuples or lists)
        Each element in the transactions must be hashable.
    min_support : float
        The minimum support of the rules returned. The support is the
        frequency with which the items in the rule appear together in the
        data set.
    min_confidence : float
        The minimum confidence of the rules returned. Given a rule X -> Y,
        the confidence is the probability of Y, given X, i.e.
        P(Y|X) = conf(X -> Y)
    max_length : int
        The maximum length of the itemsets and the rules.
    verbosity : int
        The level of detail in the printed output when the algorithm runs.
        Either 0, 1 or 2.
    output_transaction_ids : bool
        If set to True, the output contains the ids of transactions that
        contain a frequent itemset. The ids are the enumeration of the
        transactions in the sequence they appear.

    Examples
    --------
    >>> transactions = [('a', 'b', 'c'), ('a', 'b', 'd'), ('f', 'b', 'g')]
    >>> itemsets, rules = apriori(transactions, min_confidence=1)
    >>> rules
    [{a} -> {b}]
    """
    itemsets, num_trans = itemsets_from_transactions(
        transactions,
        min_support,
        max_length,
        verbosity,
        output_transaction_ids=True,
    )

    # Unwrap the counter objects into plain counts for rule generation.
    itemsets_raw = {
        length: {
            item: counter.itemset_count
            for (item, counter) in itemsets_of_length.items()
        }
        for (length, itemsets_of_length) in itemsets.items()
    }
    rules = generate_rules_apriori(
        itemsets_raw, min_confidence, num_trans, verbosity
    )

    if output_transaction_ids:
        return itemsets, list(rules)
    else:
        return itemsets_raw, list(rules)
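# Hedged sketch of consuming the output when `output_transaction_ids=True`.
# With ids enabled, `itemsets` maps length -> itemset -> a counter object;
# the comprehension above shows it carries an `itemset_count` attribute, and
# it is assumed here to also expose the ids of the containing transactions
# (the attribute name `members` is an assumption made for illustration).
#
#     itemsets, rules = apriori(
#         [('a', 'b'), ('a', 'c'), ('a', 'b')],
#         min_support=0.5,
#         output_transaction_ids=True,
#     )
#     counter = itemsets[1][('a',)]
#     print(counter.itemset_count)  # how many transactions contain {'a'}
#     print(counter.members)        # which transactions (by enumeration id)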
def apriori(
    transactions,
    min_support=0.5,
    min_confidence=0.5,
    max_length=8,
    verbosity=0,
    output_transaction_ids=False,
):
    """
    The classic apriori algorithm as described in 1994 by Agrawal et al.

    The Apriori algorithm works in two phases. Phase 1 iterates over the
    transactions several times to build up itemsets of the desired support
    level. Phase 2 builds association rules of the desired confidence given
    the itemsets found in Phase 1. Both of these phases may be correctly
    implemented by exhausting the search space, i.e. generating every
    possible itemset and checking its support. The Apriori algorithm prunes
    the search space efficiently by deciding a priori whether an itemset
    possibly has the desired support, before iterating over the entire
    dataset and checking.

    Parameters
    ----------
    transactions : list of tuples, list of itemsets.TransactionWithId, or a
        callable returning a generator. Use TransactionWithId's when the
        transactions have ids which should appear in the outputs.
        The transactions may be either a list of tuples, where the tuples
        must contain hashable items. Alternatively, a callable returning a
        generator may be passed. A generator is not sufficient, since the
        algorithm will exhaust it, and it needs to iterate over it several
        times. Therefore, a callable returning a generator must be passed.
    min_support : float
        The minimum support of the rules returned. The support is the
        frequency with which the items in the rule appear together in the
        data set.
    min_confidence : float
        The minimum confidence of the rules returned. Given a rule X -> Y,
        the confidence is the probability of Y, given X, i.e.
        P(Y|X) = conf(X -> Y)
    max_length : int
        The maximum length of the itemsets and the rules.
    verbosity : int
        The level of detail in the printed output when the algorithm runs.
        Either 0, 1 or 2.
    output_transaction_ids : bool
        If set to True, the output contains the ids of transactions that
        contain a frequent itemset. The ids are the enumeration of the
        transactions in the sequence they appear.

    Examples
    --------
    >>> transactions = [('a', 'b', 'c'), ('a', 'b', 'd'), ('f', 'b', 'g')]
    >>> itemsets, rules = apriori(transactions, min_confidence=1)
    >>> rules
    [{a} -> {b}]
    """
    if verbosity > 0:
        print('itemsets_from_transactions')
    itemsets, num_trans = itemsets_from_transactions(
        transactions,
        min_support,
        max_length,
        verbosity,
        output_transaction_ids,
    )

    # Rule generation expects plain counts, so unwrap the ItemsetCount
    # objects whenever transaction ids were tracked.
    if itemsets and isinstance(next(iter(itemsets[1].values())), ItemsetCount):
        itemsets_for_rules = _convert_to_counts(itemsets)
    else:
        itemsets_for_rules = itemsets

    if verbosity > 0:
        print('generate_rules_apriori')
    rules = generate_rules_apriori(
        itemsets_for_rules, min_confidence, num_trans, verbosity
    )
    return itemsets, list(rules)
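# Hedged sketch of `_convert_to_counts` as used above, mirroring the inline
# dict comprehension in the earlier variant of this function. The helper is
# assumed to unwrap ItemsetCount objects into plain integer counts keyed the
# same way; this body is an illustration, not necessarily the library's code.
def _convert_to_counts_sketch(itemsets):
    # {length: {itemset: ItemsetCount}} -> {length: {itemset: int}}
    return {
        length: {
            item: counter.itemset_count
            for (item, counter) in itemsets_of_length.items()
        }
        for (length, itemsets_of_length) in itemsets.items()
    }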