Ejemplo n.º 1
0
def test_against_R_implementation_1():
    """
    Check rule support and confidence on a 7-row data set against values
    obtained with the R package `arules`.

    The following R-code was used:

    > install.packages("arules")
    > col1 = c("a", "a", "a", "b", "b", "b", "b")
    > col2 = c("c", "c", "d", "d", "d", "c", "c")
    > col3 = c("e", "e", "e", "e", "f", "f", "f")
    > df = data.frame(col1, col2, col3)
    > df <- data.frame(sapply(df, as.factor))
    > rules <- apriori(df, parameter = list(supp = 0.2, conf = 0.2))
    > inspect(head(rules, by = "confidence"))
    """

    transactions = [('a', 'c', 'e'), ('a', 'c', 'e'), ('a', 'd', 'e'),
                    ('b', 'd', 'e'), ('b', 'd', 'f'), ('b', 'c', 'f'),
                    ('b', 'c', 'f')]

    itemsets, rules = apriori(transactions, 0.2, 0.2)

    # Expected (rule, support, confidence) triples.
    expected = [
        (Rule(('a', ), ('e', )), 0.4285714, 1),
        (Rule(('c', 'e'), ('a', )), 0.2857143, 1),
        (Rule(('e', ), ('a', )), 0.4285714, 3 / 4),
    ]

    # The expected values are rounded to 7 decimals, so floats are compared
    # with a tolerance instead of `==`.
    tolerance = 1e-6

    for target, support, confidence in expected:
        matches = [rule for rule in rules if rule == target]
        # A rule that was never generated must fail the test; without this
        # assertion the metric checks below would be skipped silently.
        assert matches
        for rule in matches:
            assert abs(rule.support - support) < tolerance
            assert abs(rule.confidence - confidence) < tolerance
Ejemplo n.º 2
0
def test_iterator_input():
    """
    Check that `apriori` accepts iterators as the transaction source.

    An empty iterator must give an empty itemset dict and an empty rule list.
    An iterator over a list must give the same rules, in the same order, as
    passing the list itself.
    """
    itemsets, rules = apriori(iter(()), 0.2, 0.2)
    assert itemsets == {}
    assert rules == []

    rows = [(1, 2), (1, 2), (1, 3), (1, 4), (1, 3)]
    itemsets1, rules1 = apriori(iter(rows), 0.2, 1)
    itemsets2, rules2 = apriori(rows, 0.2, 1)

    # The lengths are compared first so that the pairwise zip below cannot
    # silently ignore surplus rules on either side.
    assert len(rules1) == len(rules2)
    for from_iterator, from_list in zip(rules1, rules2):
        assert from_iterator == from_list
Ejemplo n.º 3
0
def test_api():
    """
    Check the structure of the values returned by `apriori`.

    `itemsets` must be a dict whose values are dicts mapping an itemset
    (a tuple of items) to its count, and every element of `rules` must be a
    `Rule`. With `output_transaction_ids=True` the per-itemset values must be
    `ItemsetCount` objects whose `itemset_count` holds the count. All counts
    are verified by brute force against the transactions.
    """

    transactions = [
        ("a", "c", "e"),
        ("a", "c", "e"),
        ("a", "d", "e"),
        ("b", "d", "e"),
        ("b", "d", "f"),
        ("b", "c", "f"),
        ("b", "c", "f"),
    ]

    def occurrences(itemset):
        """Count the transactions that contain every item of `itemset`."""
        items = set(itemset)
        return sum(1 for trans in transactions if items.issubset(trans))

    itemsets, rules = apriori(transactions, 0.2, 0.2)

    assert itemsets[1] == {
        ("a", ): 3,
        ("c", ): 4,
        ("e", ): 4,
        ("d", ): 3,
        ("b", ): 4,
        ("f", ): 3
    }
    assert all(isinstance(rule, Rule) for rule in rules)

    # Only the inner dicts are inspected, so iterate over the values; this
    # also avoids reusing one name for both the outer key and the inner count.
    for itemsets_dict in itemsets.values():
        assert isinstance(itemsets_dict, dict)
        for itemset, count in itemsets_dict.items():
            assert count == occurrences(itemset)

    itemsets, rules = apriori(transactions,
                              0.2,
                              0.2,
                              output_transaction_ids=True)
    for itemsets_dict in itemsets.values():
        assert isinstance(itemsets_dict, dict)
        for itemset, counter in itemsets_dict.items():
            assert isinstance(counter, ItemsetCount)
            assert counter.itemset_count == occurrences(itemset)
Ejemplo n.º 4
0
def test_against_R_implementation_3():
    """
    Check rule support and confidence on a 16-row data set against values
    obtained with the R package `arules`.

    The following R-code was used:

    > install.packages("arules")
    > col1 = c("b", "b", "c", "a", "b", "b", "a", "a", "b", "b", "a", "a", "c",
    "b", "a", "c")
    > col2 = c("e", "d", "e", "e", "e", "e", "d", "e", "e", "e", "d", "e", "e",
    "e", "d", "e")
    > col3 = c("i", "g", "h", "j", "i", "g", "h", "j", "i", "g", "j", "i", "j",
    "j", "i", "i")
    > df = data.frame(col1, col2, col3)
    > df <- data.frame(sapply(df, as.factor))
    > rules <- apriori(df, parameter = list(supp = 0.2, conf = 0.2))
    > inspect(head(rules, by = "confidence"))
    """

    transactions = [
        ("b", "e", "i"),
        ("b", "d", "g"),
        ("c", "e", "h"),
        ("a", "e", "j"),
        ("b", "e", "i"),
        ("b", "e", "g"),
        ("a", "d", "h"),
        ("a", "e", "j"),
        ("b", "e", "i"),
        ("b", "e", "g"),
        ("a", "d", "j"),
        ("a", "e", "i"),
        ("c", "e", "j"),
        ("b", "e", "j"),
        ("a", "d", "i"),
        ("c", "e", "i"),
    ]

    itemsets, rules = apriori(transactions, 0.2, 0.2)

    # Expected (rule, support, confidence) triples.
    expected = [
        (Rule(("b", ), ("e", )), 0.3750, 0.8571429),
        (Rule(("i", ), ("e", )), 0.3125, 0.8333333),
        (Rule(("j", ), ("e", )), 0.2500, 0.8000000),
        (Rule(("e", ), ("b", )), 0.3750, 0.5000000),
    ]

    # The expected values are rounded to 7 decimals, hence the tolerance.
    tolerance = 1e-6

    for target, support, confidence in expected:
        matches = [rule for rule in rules if rule == target]
        # A rule that was never generated must fail the test; without this
        # assertion the metric checks below would be skipped silently.
        assert matches
        for rule in matches:
            assert abs(rule.support - support) < tolerance
            assert abs(rule.confidence - confidence) < tolerance
Ejemplo n.º 5
0
def test_against_R_implementation_2():
    """
    Check rule support and confidence on a 32-row data set against values
    obtained with the R package `arules`.

    The following R-code was used:

    > install.packages("arules")
    > col1 = c("b", "b", "c", "b", "a", "a", "b", "c", "b", "b", "a", "b", "a",
    "a", "a", "c", "b", "a", "b", "b", "b", "c", "a", "c", "a", "a", "c", "a",
    "b", "b", "a", "c")
    > col2 = c("e", "f", "e", "e", "f", "e", "d", "f", "e", "e", "e", "d", "e",
    "e", "f", "d", "d", "d", "e", "f", "f", "d", "d", "f", "e", "e", "f", "f",
    "f", "d", "e", "e")
    > col3 = c("g", "i", "j", "i", "i", "j", "i", "h", "g", "j", "g", "h", "i",
    "h", "g", "h", "g", "j", "h", "i", "g", "g", "i", "h", "h", "h", "h", "g",
    "j", "i", "g", "g")
    > df = data.frame(col1, col2, col3)
    > df <- data.frame(sapply(df, as.factor))
    > rules <- apriori(df, parameter = list(supp = 0.2, conf = 0.2))
    > inspect(head(rules, by = "confidence"))
    """

    transactions = [('b', 'e', 'g'), ('b', 'f', 'i'), ('c', 'e', 'j'),
                    ('b', 'e', 'i'), ('a', 'f', 'i'), ('a', 'e', 'j'),
                    ('b', 'd', 'i'), ('c', 'f', 'h'), ('b', 'e', 'g'),
                    ('b', 'e', 'j'), ('a', 'e', 'g'), ('b', 'd', 'h'),
                    ('a', 'e', 'i'), ('a', 'e', 'h'), ('a', 'f', 'g'),
                    ('c', 'd', 'h'), ('b', 'd', 'g'), ('a', 'd', 'j'),
                    ('b', 'e', 'h'), ('b', 'f', 'i'), ('b', 'f', 'g'),
                    ('c', 'd', 'g'), ('a', 'd', 'i'), ('c', 'f', 'h'),
                    ('a', 'e', 'h'), ('a', 'e', 'h'), ('c', 'f', 'h'),
                    ('a', 'f', 'g'), ('b', 'f', 'j'), ('b', 'd', 'i'),
                    ('a', 'e', 'g'), ('c', 'e', 'g')]

    itemsets, rules = apriori(transactions, 0.2, 0.2)

    # Expected (rule, support, confidence) triples.
    expected = [
        (Rule(('a', ), ('e', )), 0.21875, 0.5833333),
        (Rule(('e', ), ('a', )), 0.21875, 0.5000000),
    ]

    # The expected values are rounded to 7 decimals, hence the tolerance.
    tolerance = 1e-6

    for target, support, confidence in expected:
        matches = [rule for rule in rules if rule == target]
        # A rule that was never generated must fail the test; without this
        # assertion the metric checks below would be skipped silently.
        assert matches
        for rule in matches:
            assert abs(rule.support - support) < tolerance
            assert abs(rule.confidence - confidence) < tolerance
Ejemplo n.º 6
0
def test_minimal_input():
    """
    Test with some minimal inputs, and make sure the correct errors are raised.
    """
    transactions = []
    itemsets, rules = apriori(transactions, 0.2, 0.2)
    assert itemsets == {} and rules == []

    # The return value is deliberately NOT unpacked inside the `raises`
    # blocks: a failed tuple unpacking raises ValueError as well, which would
    # let these checks pass even if apriori did not validate its arguments.
    with pytest.raises(ValueError):
        apriori(transactions, -0.2, 0.2)

    with pytest.raises(ValueError):
        apriori(transactions, 0.2, -0.2)

    with pytest.raises(ValueError):
        apriori(transactions, "asdf", 1)

    # Smoke checks: thresholds of exactly 1, given as int and as float, must
    # be accepted and must return an (itemsets, rules) pair.
    itemsets, rules = apriori([(1, 2), (1, 2), (1, 3)], 1, 1)
    itemsets, rules = apriori([(1, 2), (1, 2), (1, 3)], 1.0, 1.0)
Ejemplo n.º 7
0
def test_adult_dataset():
    """
    Test on the Adult dataset, which may be found here:
        https://archive.ics.uci.edu/ml/datasets/adult

    Some numeric columns were removed. The age was discretized.
    The purpose of this test is to assure that the algorithm can deal with a
    small 2.2 MB (30k rows) data set reasonably efficiently.

    Test against R, from the following code
    > library(arules)
    > df <- read.csv("adult_data_cleaned.txt", header = FALSE)
    > df <- data.frame(sapply(df, as.factor))
    > rules <- apriori(df, parameter = list(supp = 0.4, conf = 0.4))
    > inspect(head(rules, by = "confidence", n = 10))

    NOTE(review): the snippet originally read `by = "confidence"m = 10`,
    presumably a typo for `n = 10`. It also lists supp = 0.4 / conf = 0.4,
    while this test runs with 0.2 / 0.2 and expects supports between 0.21
    and 0.24 -- confirm which thresholds were actually used in R.
    """
    data_file = 'adult_data_cleaned.txt'
    try:
        # Look for the data file next to this test module.
        data_path = os.path.join(os.path.dirname(__file__), data_file)
    except NameError:
        # `__file__` is not defined (e.g. interactive session): fall back to
        # the current working directory.
        data_path = data_file

    def read_transactions():
        """Yield one tuple of stripped, comma-separated fields per line."""
        with open(data_path) as handle:
            for row in handle:
                yield tuple(field.strip() for field in row.split(','))

    # The function itself, not a generator object, is handed over, so every
    # call starts a fresh pass over the file (presumably apriori reads the
    # data more than once -- verify against the implementation).
    itemsets, rules = apriori(read_transactions,
                              min_support=0.2,
                              min_confidence=0.2)

    # Expected (rule, support, confidence, lift) from the R package arules.
    expected = [
        (Rule(('Married-civ-spouse', 'Husband', 'middle-aged'), ('Male',)),
         0.2356193, 0.9998697, 1.494115),
        (Rule(('Married-civ-spouse', 'White', 'middle-aged', 'Male'),
              ('Husband',)),
         0.2123399, 0.9938192, 2.452797),
        (Rule(('<=50K', 'young'), ('Never-married',)),
         0.2170081, 0.7680435, 2.340940),
        (Rule(('Husband', 'White', 'Male', 'middle-aged'),
              ('Married-civ-spouse',)),
         0.2123399, 0.9995663, 2.173269),
        (Rule(('young',), ('Never-married',)),
         0.2200792, 0.7379261, 2.249144),
    ]

    # Test that the rules found in R were also found using this implementation
    rules_set = set(rules)
    for target, _, _, _ in expected:
        assert target in rules_set

    # Test results against R package arules
    for rule in rules:
        for target, support, confidence, lift in expected:
            if rule == target:
                assert abs(rule.support - support) < 10e-7
                assert abs(rule.confidence - confidence) < 10e-7
                assert abs(rule.lift - lift) < 10e-7