def test_against_R_implementation_1():
    """
    Compare support and confidence of selected rules with reference values
    obtained from the R code below.

    The following R-code was used:

    > install.packages("arules")
    > col1 = c("a", "a", "a", "b", "b", "b", "b")
    > col2 = c("c", "c", "d", "d", "d", "c", "c")
    > col3 = c("e", "e", "e", "e", "f", "f", "f")
    > df = data.frame(col1, col2, col3)
    > df <- data.frame(sapply(df, as.factor))
    > rules <- apriori(df, parameter = list(supp = 0.2, conf = 0.2))
    > inspect(head(rules, by = "confidence"))
    """
    transactions = [
        ('a', 'c', 'e'),
        ('a', 'c', 'e'),
        ('a', 'd', 'e'),
        ('b', 'd', 'e'),
        ('b', 'd', 'f'),
        ('b', 'c', 'f'),
        ('b', 'c', 'f'),
    ]

    itemsets, rules = apriori(transactions, 0.2, 0.2)

    # The reference supports are rounded to 7 decimals, hence the tolerance.
    tolerance = 1e-6

    # (rule, reference support, exact expected confidence)
    expected = [
        (Rule(('a',), ('e',)), 0.4285714, 1),
        (Rule(('c', 'e'), ('a',)), 0.2857143, 1),
        (Rule(('e',), ('a',)), 0.4285714, 3 / 4),
    ]

    for expected_rule, support, confidence in expected:
        matches = [rule for rule in rules if rule == expected_rule]
        # Require the rule to be present; otherwise the value checks below
        # would be skipped silently and the test would pass vacuously.
        assert matches
        for rule in matches:
            assert abs(rule.support - support) < tolerance
            assert rule.confidence == confidence
def test_iterator_input():
    """
    Minimal test using transactions supplied as iterators.

    An empty iterator must give empty results, and an iterator over a list
    must give the same rules as passing the list itself.
    """
    itemsets, rules = apriori(iter(()), 0.2, 0.2)
    assert itemsets == {}
    assert rules == []

    transactions = [(1, 2), (1, 2), (1, 3), (1, 4), (1, 3)]
    itemsets1, rules1 = apriori(iter(transactions), 0.2, 1)
    itemsets2, rules2 = apriori(transactions, 0.2, 1)

    # Same number of rules, and pairwise equal in the same order.
    assert len(rules1) == len(rules2)
    for rule_from_iterator, rule_from_list in zip(rules1, rules2):
        assert rule_from_iterator == rule_from_list
def test_api():
    """
    Check the types and counts of the values returned by `apriori`.

    Every value of the returned `itemsets` mapping must be a dict, and every
    count stored in it must equal the number of transactions containing the
    itemset. With `output_transaction_ids=True` the stored values must be
    `ItemsetCount` instances whose `itemset_count` holds that same number.
    """
    transactions = [
        ("a", "c", "e"),
        ("a", "c", "e"),
        ("a", "d", "e"),
        ("b", "d", "e"),
        ("b", "d", "f"),
        ("b", "c", "f"),
        ("b", "c", "f"),
    ]
    # Build the transaction sets once instead of once per itemset.
    transaction_sets = [set(trans) for trans in transactions]

    def occurrences(itemset):
        """Count the transactions that contain every item of `itemset`."""
        items = set(itemset)
        return sum(1 for trans in transaction_sets if items.issubset(trans))

    itemsets, rules = apriori(transactions, 0.2, 0.2)

    assert itemsets[1] == {
        ("a",): 3,
        ("c",): 4,
        ("e",): 4,
        ("d",): 3,
        ("b",): 4,
        ("f",): 3,
    }
    assert all(isinstance(rule, Rule) for rule in rules)

    # Only the values are needed; the key was previously bound to a name
    # that the inner loop immediately shadowed.
    for itemsets_dict in itemsets.values():
        assert isinstance(itemsets_dict, dict)
        for itemset, count in itemsets_dict.items():
            assert count == occurrences(itemset)

    itemsets, rules = apriori(transactions, 0.2, 0.2, output_transaction_ids=True)
    for itemsets_dict in itemsets.values():
        assert isinstance(itemsets_dict, dict)
        for itemset, counter in itemsets_dict.items():
            assert isinstance(counter, ItemsetCount)
            assert counter.itemset_count == occurrences(itemset)
def test_against_R_implementation_3():
    """
    Compare support and confidence of selected rules with reference values
    obtained from the R code below.

    The following R-code was used:

    > install.packages("arules")
    > col1 = c("b", "b", "c", "a", "b", "b", "a", "a", "b", "b", "a", "a", "c", "b", "a", "c")
    > col2 = c("e", "d", "e", "e", "e", "e", "d", "e", "e", "e", "d", "e", "e", "e", "d", "e")
    > col3 = c("i", "g", "h", "j", "i", "g", "h", "j", "i", "g", "j", "i", "j", "j", "i", "i")
    > df = data.frame(col1, col2, col3)
    > df <- data.frame(sapply(df, as.factor))
    > rules <- apriori(df, parameter = list(supp = 0.2, conf = 0.2))
    > inspect(head(rules, by = "confidence"))
    """
    transactions = [
        ("b", "e", "i"),
        ("b", "d", "g"),
        ("c", "e", "h"),
        ("a", "e", "j"),
        ("b", "e", "i"),
        ("b", "e", "g"),
        ("a", "d", "h"),
        ("a", "e", "j"),
        ("b", "e", "i"),
        ("b", "e", "g"),
        ("a", "d", "j"),
        ("a", "e", "i"),
        ("c", "e", "j"),
        ("b", "e", "j"),
        ("a", "d", "i"),
        ("c", "e", "i"),
    ]

    itemsets, rules = apriori(transactions, 0.2, 0.2)

    # The reference values are rounded to 7 decimals, hence the tolerance.
    tolerance = 1e-6

    # (rule, reference support, reference confidence)
    expected = [
        (Rule(("b",), ("e",)), 0.3750, 0.8571429),
        (Rule(("i",), ("e",)), 0.3125, 0.8333333),
        (Rule(("j",), ("e",)), 0.2500, 0.8000000),
        (Rule(("e",), ("b",)), 0.3750, 0.5000000),
    ]

    for expected_rule, support, confidence in expected:
        matches = [rule for rule in rules if rule == expected_rule]
        # Require the rule to be present; otherwise the value checks below
        # would be skipped silently and the test would pass vacuously.
        assert matches
        for rule in matches:
            assert abs(rule.support - support) < tolerance
            assert abs(rule.confidence - confidence) < tolerance
def test_against_R_implementation_2():
    """
    Compare support and confidence of selected rules with reference values
    obtained from the R code below.

    The following R-code was used:

    > install.packages("arules")
    > col1 = c("b", "b", "c", "b", "a", "a", "b", "c", "b", "b", "a", "b", "a", "a", "a", "c", "b", "a", "b", "b", "b", "c", "a", "c", "a", "a", "c", "a", "b", "b", "a", "c")
    > col2 = c("e", "f", "e", "e", "f", "e", "d", "f", "e", "e", "e", "d", "e", "e", "f", "d", "d", "d", "e", "f", "f", "d", "d", "f", "e", "e", "f", "f", "f", "d", "e", "e")
    > col3 = c("g", "i", "j", "i", "i", "j", "i", "h", "g", "j", "g", "h", "i", "h", "g", "h", "g", "j", "h", "i", "g", "g", "i", "h", "h", "h", "h", "g", "j", "i", "g", "g")
    > df = data.frame(col1, col2, col3)
    > df <- data.frame(sapply(df, as.factor))
    > rules <- apriori(df, parameter = list(supp = 0.2, conf = 0.2))
    > inspect(head(rules, by = "confidence"))
    """
    transactions = [
        ('b', 'e', 'g'),
        ('b', 'f', 'i'),
        ('c', 'e', 'j'),
        ('b', 'e', 'i'),
        ('a', 'f', 'i'),
        ('a', 'e', 'j'),
        ('b', 'd', 'i'),
        ('c', 'f', 'h'),
        ('b', 'e', 'g'),
        ('b', 'e', 'j'),
        ('a', 'e', 'g'),
        ('b', 'd', 'h'),
        ('a', 'e', 'i'),
        ('a', 'e', 'h'),
        ('a', 'f', 'g'),
        ('c', 'd', 'h'),
        ('b', 'd', 'g'),
        ('a', 'd', 'j'),
        ('b', 'e', 'h'),
        ('b', 'f', 'i'),
        ('b', 'f', 'g'),
        ('c', 'd', 'g'),
        ('a', 'd', 'i'),
        ('c', 'f', 'h'),
        ('a', 'e', 'h'),
        ('a', 'e', 'h'),
        ('c', 'f', 'h'),
        ('a', 'f', 'g'),
        ('b', 'f', 'j'),
        ('b', 'd', 'i'),
        ('a', 'e', 'g'),
        ('c', 'e', 'g'),
    ]

    itemsets, rules = apriori(transactions, 0.2, 0.2)

    # The reference values are rounded to 7 decimals, hence the tolerance.
    tolerance = 1e-6

    # (rule, reference support, reference confidence)
    expected = [
        (Rule(('a',), ('e',)), 0.21875, 0.5833333),
        (Rule(('e',), ('a',)), 0.21875, 0.5000000),
    ]

    for expected_rule, support, confidence in expected:
        matches = [rule for rule in rules if rule == expected_rule]
        # Require the rule to be present; otherwise the value checks below
        # would be skipped silently and the test would pass vacuously.
        assert matches
        for rule in matches:
            assert abs(rule.support - support) < tolerance
            assert abs(rule.confidence - confidence) < tolerance
def test_minimal_input():
    """
    Test with some minimal inputs, and make sure the correct errors are raised.
    """
    transactions = []

    # No transactions: nothing to find.
    itemsets, rules = apriori(transactions, 0.2, 0.2)
    assert itemsets == {}
    assert rules == []

    # Each (min_support, min_confidence) pair below must be rejected.
    invalid_thresholds = [
        (-0.2, 0.2),
        (0.2, -0.2),
        ("asdf", 1),
    ]
    for min_support, min_confidence in invalid_thresholds:
        with pytest.raises(ValueError):
            itemsets, rules = apriori(transactions, min_support, min_confidence)

    # Thresholds equal to one must be accepted, both as int and as float.
    for threshold in (1, 1.0):
        itemsets, rules = apriori([(1, 2), (1, 2), (1, 3)], threshold, threshold)
def test_adult_dataset():
    """
    Test on the Adult dataset, which may be found here:
    https://archive.ics.uci.edu/ml/datasets/adult

    Some numeric columns were removed. The age was discretized.
    The purpose of this test is to assure that the algorithm can deal with a
    small 2.2 MB (30k rows) data set reasonably efficiently.

    Test against R, from the following code

    > library(arules)
    > df <- read.csv("adult_data_cleaned.txt", header = FALSE)
    > df <- data.frame(sapply(df, as.factor))
    > rules <- apriori(df, parameter = list(supp = 0.4, conf = 0.4))
    > inspect(head(rules, by = "confidence", n = 10))

    NOTE(review): the R snippet above uses supp = 0.4 and conf = 0.4, while
    this test runs with 0.2 for both -- confirm which thresholds produced the
    reference values.
    """

    def data_generator(filename):
        """
        Return a callable that starts a fresh pass over the file each time it
        is called, so the data can be read several times.
        """

        def read_rows():
            with open(filename) as handle:
                for line in handle:
                    yield tuple(field.strip() for field in line.split(','))

        return read_rows

    try:
        filename = os.path.join(os.path.dirname(__file__),
                                'adult_data_cleaned.txt')
    except NameError:
        # `__file__` is not defined: fall back to a relative path.
        filename = 'adult_data_cleaned.txt'

    transactions = data_generator(filename)
    itemsets, rules = apriori(transactions,
                              min_support=0.2,
                              min_confidence=0.2)

    # Reference values from the R package arules:
    # (rule, support, confidence, lift)
    expected = [
        (Rule(('Married-civ-spouse', 'Husband', 'middle-aged'), ('Male',)),
         0.2356193, 0.9998697, 1.494115),
        (Rule(('Married-civ-spouse', 'White', 'middle-aged', 'Male'),
              ('Husband',)),
         0.2123399, 0.9938192, 2.452797),
        (Rule(('<=50K', 'young'), ('Never-married',)),
         0.2170081, 0.7680435, 2.340940),
        (Rule(('Husband', 'White', 'Male', 'middle-aged'),
              ('Married-civ-spouse',)),
         0.2123399, 0.9995663, 2.173269),
        (Rule(('young',), ('Never-married',)),
         0.2200792, 0.7379261, 2.249144),
    ]
    tolerance = 10e-7

    # Test that the rules found in R were also found using this implementation
    rules_set = set(rules)
    for expected_rule, _, _, _ in expected:
        assert expected_rule in rules_set

    # Test results against R package arules
    for rule in rules:
        for expected_rule, support, confidence, lift in expected:
            if rule == expected_rule:
                assert abs(rule.support - support) < tolerance
                assert abs(rule.confidence - confidence) < tolerance
                assert abs(rule.lift - lift) < tolerance