コード例 #1
0
def test_generate_rules_apriori_large():
    """
    Test with lots of data.
    This test will fail unless the second argument to `_ap_genrules` is
    checked for emptiness before recursing, i.e. the implementation must
    contain
    if H_m_copy:
        yield from _ap_genrules
    for this test to pass.
    """
    transactions = generate_transactions(
        num_transactions=100, unique_items=30, items_row=(1, 20), seed=123
    )

    itemsets, num_transactions = itemsets_from_transactions(transactions, 0.1)

    min_conf = 0.3
    apriori_rules = list(
        generate_rules_apriori(itemsets, min_conf, num_transactions)
    )
    naive_rules = list(
        generate_rules_naively(itemsets, min_conf, num_transactions)
    )

    # Apriori must not emit duplicates, so the counts must match exactly
    assert len(apriori_rules) == len(naive_rules)

    # Both algorithms must discover the same set of rules
    assert set(apriori_rules) == set(naive_rules)
コード例 #2
0
def apriori2(
    transactions: typing.Union[typing.List[tuple], typing.Callable],
    min_support: float = 0.5,
    min_confidence: float = 0.5,
    max_length: int = 8,
    verbosity: int = 0,
    output_transaction_ids: bool = False,
):
    """
    Variant of the apriori algorithm that replaces each itemset count with a
    label naming the support interval it falls into.

    Parameters mirror `apriori` / `itemsets_from_transactions`.

    Returns
    -------
    (itemsets, []) : itemsets maps length -> {itemset: interval label}, and
        the rules list is always empty (rules are not generated here).
    """
    itemsets, num_trans = itemsets_from_transactions(
        transactions,
        min_support,
        max_length,
        verbosity,
        output_transaction_ids,
    )

    # NOTE: the original computed an `itemsets_for_rules` conversion here
    # (via `_convert_to_counts`) that was never used — removed as dead code.

    # Bin each itemset's support into a labeled interval.
    for _, counts in itemsets.items():
        for itemset in counts:
            support = counts[itemset] / num_trans
            # BUG FIX: the original first branch required `tmp > 0.7`, so any
            # support <= 0.7 fell through and was mislabeled "[0.75 - 0.8]".
            # With the min_support default of 0.5 such values are common.
            if support < 0.75:
                counts[itemset] = "[0.7 - 0.75]"
            elif support < 0.8:
                counts[itemset] = "[0.75 - 0.8]"
            elif support < 0.85:
                counts[itemset] = "[0.8 - 0.85]"
            elif support < 0.90:
                counts[itemset] = "[0.85 - 0.9]"
            else:
                counts[itemset] = "[0.9 - 1]"
    return itemsets, []
コード例 #3
0
def speeds():
    """
    Benchmark the three rule finders against each other: the fast apriori
    generator, the simple one from the paper, and the naive one.

    Prints the itemsets and the elapsed wall-clock time of each finder.
    """
    import random
    random.seed(123456)
    transactions = generate_transactions(
        num_transactions=random.randint(250, 500),
        unique_items=random.randint(8, 9),
        items_row=(10, 50),
    )

    itemsets, num_transactions = itemsets_from_transactions(transactions, 0.1)
    import time
    min_conf = 0.5

    print(itemsets)

    # BUG FIX: the original used round(..., 40), which is a no-op for floats
    # (more digits than a double can represent); round to 4 decimals instead.
    st = time.perf_counter()
    rules_apri = generate_rules_apriori(itemsets, min_conf, num_transactions)
    rules_apri = list(rules_apri)
    time_formatted = round(time.perf_counter() - st, 4)
    print('Fast apriori ran in {} s'.format(time_formatted))

    st = time.perf_counter()
    rules_simple = generate_rules_simple(itemsets, min_conf, num_transactions)
    rules_simple = list(rules_simple)
    time_formatted = round(time.perf_counter() - st, 4)
    print('Simple apriori ran in {} s'.format(time_formatted))

    st = time.perf_counter()
    rules_naive = generate_rules_naively(itemsets, min_conf, num_transactions)
    rules_naive = list(rules_naive)
    time_formatted = round(time.perf_counter() - st, 4)
    print('Naive apriori ran in {} s'.format(time_formatted))
コード例 #4
0
def test_itemsets_from_a_file():
    """
    Test the generator feature with transactions read from a file.
    """

    def file_generator(path):
        """
        Return a callable producing a fresh generator over the file at `path`.
        """

        def generate_from_file():
            with open(path) as handle:
                for row in handle:
                    yield tuple(row.strip("\n").split(","))

        return generate_from_file

    directory, _ = os.path.split(__file__)
    gen_obj = file_generator(os.path.join(directory, "transactions.txt"))
    result, _ = itemsets_from_transactions(
        gen_obj, min_support=4 / 4, output_transaction_ids=True
    )
    # ('A', 'C') appears in all four transactions, ids 0..3
    assert result[2] == {
        ("A", "C"): ItemsetCount(itemset_count=4, members={0, 1, 2, 3})
    }
コード例 #5
0
def test_itemsets_max_length(transactions, min_support):
    """
    Test that nothing larger than max_length is returned.
    """
    max_len = random.randint(1, 5)
    result, _ = itemsets_from_transactions(list(transactions), min_support,
                                           max_length=max_len)

    # Result keys are itemset lengths; none may exceed max_len.
    # (Fixed docstring typo and dropped a redundant list() inside all().)
    assert all(k <= max_len for k in result.keys())
コード例 #6
0
def test_itemsets_from_transactions_stochastic(transactions, min_support):
    """
    Test 50 random inputs.
    """
    result, _ = itemsets_from_transactions(list(transactions), min_support)
    naive_result, _ = itemsets_from_transactions_naive(list(transactions),
                                                       min_support)

    # Compare over the union of keys so an itemset length present in only
    # one of the results is also detected.
    all_keys = set(result.keys()) | set(naive_result.keys())
    for key in all_keys:
        assert result[key] == naive_result[key]
コード例 #7
0
def test_generate_rules_simple_vs_naive(transactions):
    """
    Test the naive rule finder vs. the simple one from the paper.
    """
    itemsets, num_transactions = itemsets_from_transactions(transactions, 0.25)

    min_confidence = 0.1
    naive = set(
        generate_rules_naively(itemsets, min_confidence, num_transactions)
    )
    simple = set(
        generate_rules_simple(itemsets, min_confidence, num_transactions)
    )
    assert naive == simple
コード例 #8
0
def apriori(
    transactions: typing.List[tuple],
    min_support: float = 0.5,
    min_confidence: float = 0.5,
    max_length: int = 8,
    verbosity: int = 0,
):
    """
    The classic apriori algorithm as described in 1994 by Agrawal et al.
    
    The Apriori algorithm works in two phases. Phase 1 iterates over the 
    transactions several times to build up itemsets of the desired support
    level. Phase 2 builds association rules of the desired confidence given the
    itemsets found in Phase 1. Both of these phases may be correctly
    implemented by exhausting the search space, i.e. generating every possible
    itemset and checking its support. The Apriori prunes the search space
    efficiently by deciding apriori if an itemset possibly has the desired
    support, before iterating over the entire dataset and checking.
    
    Parameters
    ----------
    transactions : list of tuples, or a callable returning a generator
        The transactions may be either a list of tuples, where the tuples must
        contain hashable items. Alternatively, a callable returning a generator
        may be passed. A generator is not sufficient, since the algorithm will
        exhaust it, and it needs to iterate over it several times. Therefore,
        a callable returning a generator must be passed.
    min_support : float
        The minimum support of the rules returned. The support is frequency of
        which the items in the rule appear together in the data set.
    min_confidence : float
        The minimum confidence of the rules returned. Given a rule X -> Y, the
        confidence is the probability of Y, given X, i.e. P(Y|X) = conf(X -> Y)
    max_length : int
        The maximum length of the itemsets and the rules.
    verbosity : int
        The level of detail printing when the algorithm runs. Either 0, 1 or 2.

    Returns
    -------
    (itemsets, rules) : the frequent itemsets keyed by length, and the list
        of association rules meeting `min_confidence`.

    Examples
    --------
    >>> transactions = [('a', 'b', 'c'), ('a', 'b', 'd'), ('f', 'b', 'g')]
    >>> itemsets, rules = apriori(transactions, min_confidence=1)
    >>> rules
    [{a} -> {b}]
    """
    # Phase 1: frequent itemsets; Phase 2: rules from those itemsets.
    itemsets, num_trans = itemsets_from_transactions(
        transactions, min_support, max_length, verbosity
    )
    rules = generate_rules_apriori(
        itemsets, min_confidence, num_trans, verbosity
    )
    return itemsets, list(rules)
コード例 #9
0
def test_itemsets_from_a_generator_callable():
    """
    Test the generator-callable feature.
    """

    def generator():
        """
        A generator of overlapping transactions for testing.
        """
        for offset in range(4):
            yield [offset + j for j in range(5)]

    itemsets, _ = itemsets_from_transactions(generator, min_support=3 / 4)
    # Both 3-itemsets occur in 3 of the 4 overlapping transactions
    assert itemsets[3] == {(2, 3, 4): 3, (3, 4, 5): 3}
コード例 #10
0
def test_itemsets_max_length(transactions, min_support):
    """
    Test that nothing larger than max_length is returned.
    """
    max_len = random.randint(1, 5)
    result, _ = itemsets_from_transactions(
        list(transactions),
        min_support,
        max_length=max_len,
        output_transaction_ids=True,
    )

    # Result keys are itemset lengths; none may exceed max_len.
    # (Fixed docstring typo and dropped a redundant list() inside all().)
    assert all(k <= max_len for k in result.keys())
    # Every transaction id recorded for a frequent itemset must be an int.
    # (Iterate .values() directly — the length key was unused.)
    for itemsets in result.values():
        for itemset_count in itemsets.values():
            assert all(isinstance(i, int) for i in itemset_count.members)
コード例 #11
0
def test_generate_rules_naive_vs_apriori(transactions):
    """
    Test the naive rule finder vs. the fast apriori rule finder.
    """
    itemsets, num_transactions = itemsets_from_transactions(transactions, 0.15)

    min_conf = 0.3
    apriori_rules = list(
        generate_rules_apriori(itemsets, min_conf, num_transactions)
    )
    naive_rules = list(
        generate_rules_naively(itemsets, min_conf, num_transactions)
    )

    # Apriori must not emit duplicates, so the counts must match exactly
    assert len(apriori_rules) == len(naive_rules)

    # Both algorithms must discover the same set of rules
    assert set(apriori_rules) == set(naive_rules)
コード例 #12
0
def test_itemsets_from_a_file():
    """
    Test the generator feature with transactions read from a file.
    """

    def file_generator(path):
        """
        Return a callable producing a fresh generator over the file at `path`.
        """

        def generate_from_file():
            with open(path) as handle:
                for row in handle:
                    yield tuple(row.strip("\n").split(","))

        return generate_from_file

    directory, filename = os.path.split(__file__)
    gen_obj = file_generator(os.path.join(directory, "transactions.txt"))
    result, _ = itemsets_from_transactions(gen_obj, min_support=4 / 4)
    # ('A', 'C') appears in all four transactions
    assert result[2] == {("A", "C"): 4}
コード例 #13
0
def test_itemsets_from_a_generator_callable():
    """
    Test the generator-callable feature, with transaction ids enabled.
    """

    def generator():
        """
        A generator of overlapping transactions for testing.
        """
        for offset in range(4):
            yield tuple(offset + j for j in range(5))

    itemsets, _ = itemsets_from_transactions(
        generator, min_support=3 / 4, output_transaction_ids=True
    )
    # Each 3-itemset occurs in 3 of the 4 transactions; members are the
    # enumeration ids of the transactions containing it.
    assert itemsets[3] == {
        (2, 3, 4): ItemsetCount(itemset_count=3, members={0, 1, 2}),
        (3, 4, 5): ItemsetCount(itemset_count=3, members={1, 2, 3}),
    }
コード例 #14
0
ファイル: apriori.py プロジェクト: tommyod/Efficient-Apriori
def apriori(
    transactions: typing.Iterable[typing.Union[set, tuple, list]],
    min_support: float = 0.5,
    min_confidence: float = 0.5,
    max_length: int = 8,
    verbosity: int = 0,
    output_transaction_ids: bool = False,
):
    """
    The classic apriori algorithm as described in 1994 by Agrawal et al.

    The Apriori algorithm works in two phases. Phase 1 iterates over the
    transactions several times to build up itemsets of the desired support
    level. Phase 2 builds association rules of the desired confidence given the
    itemsets found in Phase 1. Both of these phases may be correctly
    implemented by exhausting the search space, i.e. generating every possible
    itemset and checking its support. The Apriori prunes the search space
    efficiently by deciding apriori if an itemset possibly has the desired
    support, before iterating over the entire dataset and checking.

    Parameters
    ----------
    transactions : list of transactions (sets/tuples/lists). Each element in
        the transactions must be hashable.
    min_support : float
        The minimum support of the rules returned. The support is frequency of
        which the items in the rule appear together in the data set.
    min_confidence : float
        The minimum confidence of the rules returned. Given a rule X -> Y, the
        confidence is the probability of Y, given X, i.e. P(Y|X) = conf(X -> Y)
    max_length : int
        The maximum length of the itemsets and the rules.
    verbosity : int
        The level of detail printing when the algorithm runs. Either 0, 1 or 2.
    output_transaction_ids : bool
        If set to true, the output contains the ids of transactions that
        contain a frequent itemset. The ids are the enumeration of the
        transactions in the sequence they appear.
    Examples
    --------
    >>> transactions = [('a', 'b', 'c'), ('a', 'b', 'd'), ('f', 'b', 'g')]
    >>> itemsets, rules = apriori(transactions, min_confidence=1)
    >>> rules
    [{a} -> {b}]
    """

    # Always collect transaction ids internally; strip them below if the
    # caller did not ask for them.
    itemsets, num_trans = itemsets_from_transactions(
        transactions,
        min_support,
        max_length,
        verbosity,
        output_transaction_ids=True,
    )

    # Strip ItemsetCount wrappers down to plain counts for rule generation.
    # (Renamed the inner loop variable — the original comprehension shadowed
    # `itemsets` with itself, which was confusing to read.)
    itemsets_raw = {
        length: {
            item: counter.itemset_count
            for (item, counter) in counters.items()
        }
        for (length, counters) in itemsets.items()
    }
    rules = generate_rules_apriori(itemsets_raw, min_confidence, num_trans,
                                   verbosity)

    if output_transaction_ids:
        return itemsets, list(rules)
    else:
        return itemsets_raw, list(rules)
コード例 #15
0
def apriori(
    transactions,
    min_support=0.5,
    min_confidence=0.5,
    max_length=8,
    verbosity=0,
    output_transaction_ids=False,
):
    """
    The classic apriori algorithm as described in 1994 by Agrawal et al.

    The Apriori algorithm works in two phases. Phase 1 iterates over the
    transactions several times to build up itemsets of the desired support
    level. Phase 2 builds association rules of the desired confidence given the
    itemsets found in Phase 1. Both of these phases may be correctly
    implemented by exhausting the search space, i.e. generating every possible
    itemset and checking its support. The Apriori prunes the search space
    efficiently by deciding apriori if an itemset possibly has the desired
    support, before iterating over the entire dataset and checking.

    Parameters
    ----------
    transactions : list of tuples, list of itemsets.TransactionWithId,
        or a callable returning a generator. Use TransactionWithId's when
        the transactions have ids which should appear in the outputs.
        The transactions may be either a list of tuples, where the tuples must
        contain hashable items. Alternatively, a callable returning a generator
        may be passed. A generator is not sufficient, since the algorithm will
        exhaust it, and it needs to iterate over it several times. Therefore,
        a callable returning a generator must be passed.
    min_support : float
        The minimum support of the rules returned. The support is frequency of
        which the items in the rule appear together in the data set.
    min_confidence : float
        The minimum confidence of the rules returned. Given a rule X -> Y, the
        confidence is the probability of Y, given X, i.e. P(Y|X) = conf(X -> Y)
    max_length : int
        The maximum length of the itemsets and the rules.
    verbosity : int
        The level of detail printing when the algorithm runs. Either 0, 1 or 2.
    output_transaction_ids : bool
        If set to true, the output contains the ids of transactions that
        contain a frequent itemset. The ids are the enumeration of the
        transactions in the sequence they appear.
    Examples
    --------
    >>> transactions = [('a', 'b', 'c'), ('a', 'b', 'd'), ('f', 'b', 'g')]
    >>> itemsets, rules = apriori(transactions, min_confidence=1)
    >>> rules
    [{a} -> {b}]
    """
    # NOTE: removed two leftover debug print(...) calls; `verbosity` is the
    # supported way to get progress output.
    itemsets, num_trans = itemsets_from_transactions(
        transactions,
        min_support,
        max_length,
        verbosity,
        output_transaction_ids,
    )

    # If ids were tracked, the values are ItemsetCount objects; convert them
    # to plain counts before generating rules.
    if itemsets and isinstance(next(iter(itemsets[1].values())), ItemsetCount):
        itemsets_for_rules = _convert_to_counts(itemsets)
    else:
        itemsets_for_rules = itemsets

    rules = generate_rules_apriori(itemsets_for_rules, min_confidence,
                                   num_trans, verbosity)
    return itemsets, list(rules)