Beispiel #1
0
    def find_association_rules(self, transactions):
        """Mine association rules from ``transactions`` via ``fim.arules``.

        The mining settings are fixed in this method: ``supp=-3``,
        ``conf=60``, rule sizes from ``zmin=1`` to ``zmax=6`` and
        ``report="SC"``.  The result of ``fim.arules`` is returned unchanged.
        """
        mining_options = {
            "supp": -3,
            "zmin": 1,
            "zmax": 6,
            "conf": 60,
            "report": "SC",
        }
        return fim.arules(transactions, **mining_options)
Beispiel #2
0
 def fit_fim_apriori(self, conf: float = 70, support: float = 10):
     """
     Mine classification rules with pyFim and store them in ``self.rules``.

     ``conf`` and ``support`` are forwarded to ``fim.arules`` as ``conf`` and
     ``supp``.  The transactions, the item appearance map and the maximum
     rule length are read from ``self.transactions``, ``self.appearance``
     and ``self.max_length``.
     """
     transactions = self.transactions
     mining_options = dict(
         supp=support,
         conf=conf,
         report="sc",
         mode="o",
         appear=dict(self.appearance),
         zmin=2,  # a rule needs at least one antecedent plus the consequent
         zmax=self.max_length,
     )
     self.rules = fim.arules(transactions, **mining_options)
Beispiel #3
0
    def fit_fim_apriori(self, conf: float=70, support: float=10):
        """Mine classification rules with PyFIM and keep them on the model.

        The mined rules are stored in ``self.rules``; nothing is returned.

        Parameters
        ----------
        conf : float = 70
            Minimum confidence, passed to ``fim.arules`` as ``conf``.
            DEFAULT: 70%
        support : float = 10
            Minimum support, passed to ``fim.arules`` as ``supp``.
            DEFAULT: 10%
        """
        transactions = self.transactions
        appearance = dict(self.appearance)
        longest_rule = self.max_length

        self.rules = fim.arules(
            transactions,
            supp=support,
            conf=conf,
            report="sc",
            mode="o",
            appear=appearance,
            # A rule must hold at least one antecedent and the consequent.
            zmin=2,
            zmax=longest_rule,
        )
Beispiel #4
0
        print(u"{} items: {}".format(supp/float(trcount), "|".join(items)).encode('utf-8'))
# Dispatch on the selected mining algorithm (`algo`) and print the results.
# `fim`, `transactions`, `ele_to_str`, `sets` and `calculate_interestingness`
# are defined outside this excerpt.
if algo == "eclat":
    # Frequent item sets; each result is unpacked below as (items, supp).
    s = fim.eclat(transactions, supp=2)
    # Order by the second tuple element (the support value), ascending.
    s = sorted(s, key=lambda x:x[1])
    for items,supp in s:
        items = map(ele_to_str, items)
        # NOTE(review): the formatted text is encoded to UTF-8 bytes before
        # being printed; on Python 3 this prints the bytes repr (b'...') --
        # confirm which interpreter this script targets.
        print(u"{} items: {}".format(supp, "|".join(items)).encode('utf-8'))
elif algo == "eclat-rules":
    # Rule mining with eclat; each rule is unpacked below as
    # (consequence, condition, support_count, confidence_percent).
    rules = fim.eclat(transactions, target='r', report='aC')
    # Ascending by the fourth element (confidence_percent).
    rules = sorted(rules, key = lambda x: x[3])
    for consequence, condition, support_count, confidence_percent in rules:
        condition = map(ele_to_str, condition)
        consequence = ele_to_str(consequence)
        print(u"{:6.2f}% of {} eles: If {} then {}".format(confidence_percent, support_count, " & ".join(condition), consequence))
elif algo == "arules":
    # Each rule is unpacked below as
    # (consequence, condition, support_count, confidence_percent, lift).
    rules = fim.arules(transactions, supp=2, conf=75, report='aCl', eval='l', thresh=30)
    #random.shuffle(rules) # lambda x: x[3])
    #rules = sorted(rules, key = lambda x: x[3]) # sort by confidence %
    # Two stable sorts in a row: the final order is by descending condition
    # length, with ties kept in ascending lift order.
    rules = sorted(rules, key = lambda x: x[4]) # sort by lift
    rules = sorted(rules, key = lambda x: -len(x[1])) # sort by condition length
    for consequence, condition, support_count, confidence_percent, lift in rules:
        conditionSet = set(condition)
        #den = [c for c in sets if not conditionSet.issubset(c)]
        #num = [c for c in den if consequence in c]
        interestingness = 0
        if calculate_interestingness:
            # den: number of entries in `sets` that do NOT contain the whole
            # condition (same count as the commented-out comprehension above).
            den = 0
            for c in sets: 
                if not conditionSet.issubset(c): 
                    den = den+1
            # The remainder of this computation is not part of this excerpt.
            num = 0 
Beispiel #5
0
def top_rules(transactions,
              appearance=None,
              target_rule_count=1000,
              init_support=0.,
              init_conf=0.5,
              conf_step=0.05,
              supp_step=0.05,
              minlen=2,
              init_maxlen=3,
              total_timeout=100.,
              max_iterations=30):
    """Function for finding the best n (target_rule_count)
    rules from transaction list

    ``fim.arules`` is run repeatedly.  After each pass that yields fewer
    than ``target_rule_count`` rules the search is relaxed: first the maximum
    rule length is increased, then the confidence threshold is lowered, until
    the target is met, the time budget is spent, ``max_iterations`` passes
    were made or no option is left.

    Parameters
    ----------
    transactions : 2D array of strings
        e.g. [["a:=:1", "b:=:3"], ["a:=:4", "b:=:2"]]
        The length of the first transaction is used as the upper bound
        for the rule length.

    appearance : dictionary, optional
        dictionary specifying rule appearance, passed to ``fim.arules``
        as ``appear``.  ``None`` (the default) means an empty dictionary.

    target_rule_count : int
        target number of rules to mine

    init_support : float
        support from which to start mining

    init_conf : float
        confidence from which to start mining

    conf_step : float
        amount by which the confidence is lowered per relaxation step

    supp_step : float
        amount by which the support is raised when the support branch fires

    minlen : int
        minimum len of rules to mine

    init_maxlen : int
        maxlen from which to start mining

    total_timeout : float
        maximum execution time of the function; checked after each mining
        pass, so a running pass is never interrupted

    max_iterations : int
        maximum number of mining passes to run before stopping
        execution


    Returns
    -------
    list of mined rules (result of the last mining pass). The rules are not
    ordered.  An empty list is returned for an empty transaction list and
    ``None`` if no mining pass was run at all (``max_iterations`` < 1).

    """
    # A fresh dict per call instead of a shared mutable default argument.
    if appearance is None:
        appearance = {}

    # Nothing to mine; this also avoids an IndexError on transactions[0].
    if len(transactions) == 0:
        return []

    # NOTE(review): the PyFIM documentation describes positive `supp` and
    # `conf` as percentages, while the defaults and the `1 - supp_step` bound
    # below look like fractions -- confirm the intended scale with callers.

    starttime = time.time()

    # A rule cannot be longer than a transaction.
    MAX_RULE_LEN = len(transactions[0])

    support = init_support
    conf = init_conf
    maxlen = init_maxlen

    flag = True
    lastrulecount = -1
    # Never set to True anywhere in this function, so the branch that raises
    # the support below cannot currently be taken.
    maxlendecreased_due_timeout = False
    iterations = 0

    rules = None

    while flag:
        iterations += 1

        # `>` rather than `==`, so that exactly `max_iterations` mining
        # passes are allowed before giving up.
        if iterations > max_iterations:
            print("Max iterations reached")
            break

        print(
            "Running apriori with setting: confidence={}, support={}, minlen={}, maxlen={}, MAX_RULE_LEN={}"
            .format(conf, support, minlen, maxlen, MAX_RULE_LEN))

        rules = fim.arules(transactions,
                           supp=support,
                           conf=conf,
                           mode="o",
                           report="sc",
                           appear=appearance,
                           zmax=maxlen,
                           zmin=minlen)

        rule_count = len(rules)

        print("Rule count: {}, Iteration: {}".format(rule_count, iterations))

        if rule_count >= target_rule_count:
            flag = False
            print("Target rule count satisfied:", target_rule_count)
            continue

        exectime = time.time() - starttime

        if exectime > total_timeout:
            print("Execution time exceeded:", total_timeout)
            flag = False

        # Longer rules are still possible and the previous length increase
        # changed the number of rules: allow one more item per rule.
        elif maxlen < MAX_RULE_LEN and lastrulecount != rule_count and not maxlendecreased_due_timeout:
            maxlen += 1
            lastrulecount = rule_count
            print("Increasing maxlen", maxlen)

        # Raise the support together with the rule length (see the note on
        # `maxlendecreased_due_timeout` above: currently unreachable).
        elif maxlen < MAX_RULE_LEN and maxlendecreased_due_timeout and support <= 1 - supp_step:
            support += supp_step
            maxlen += 1
            lastrulecount = rule_count

            print("Increasing maxlen to", maxlen)
            print("Increasing minsup to", support)

            maxlendecreased_due_timeout = False

        # Otherwise relax the confidence while it stays positive.
        elif conf > conf_step:
            conf -= conf_step
            print("Decreasing confidence to", conf)

        else:
            print("All options exhausted")
            flag = False

    return rules
Beispiel #6
0
dec = 0
# Count the frequent item sets that contain item 100 (rec) and item 200 (dec).
# The two checks are independent: a set that holds both items has to be
# counted for both, otherwise the "has 200" figure printed below is too low.
# `rec` is initialised before this section.
for r in frequentset:
    if 100 in r[0]:
        rec += 1
    if 200 in r[0]:
        dec += 1

print("The number of frequent item that has 100:%d" % rec, file=outf)

print("\npart d:", file=outf)
print("The number of frequent item that has 200:%d" % dec, file=outf)

print("\npart e:", file=outf)
print("Top 10 rules with head 100:", file=outf)
print("output formation: rule_head|rule_tail|sup|confidence", file=outf)
rules = arules(dataset, supp=20, conf=0)
# Keep the rules whose head is item 100, or whose head is a collection
# (anything that is not a plain int) that contains item 100.
rule_100head = [
    r for r in rules
    if r[0] == 100 or (not isinstance(r[0], int) and 100 in r[0])
]
# Best rules first, ranked by the key that `takeFourth` extracts.
rule_100head.sort(key=takeFourth, reverse=True)
for r in rule_100head[:10]:
    print(r, file=outf)
Beispiel #7
0
if __name__ == '__main__':
    # Number of timed repetitions: first command-line argument, default 1.
    if len(argv) > 1:
        runs = int(argv[1])
    else:
        runs = 1

    # 1000 random transactions; each of the items 1..100 is included
    # whenever random() yields a value below 0.1.
    tracts = []
    for _ in range(1000):
        tracts.append([item for item in range(1, 101) if random() < 0.1])

    # Dump the transactions to data.txt, one per line, every item followed
    # by a single space.
    with open('data.txt', 'w') as out:
        for tract in tracts:
            out.write(''.join('%d ' % item for item in tract))
            out.write('\n')

    # Mining options shared by both arules calls below.
    mining_options = dict(supp=-2, conf=80, zmin=2, report='ab')

    # A second command-line argument switches to a single run with an item
    # appearance map; the rules are printed, data.txt is removed and the
    # script exits.
    if len(argv) > 2:
        rules = arules(tracts, appear={None: 'in', 1: 'out'}, **mining_options)
        for rule in rules:
            print(rule)
        remove('data.txt')
        exit()

    # Timed runs; progress and timing are reported on stderr.
    stderr.write('association rules:\n')
    stderr.write('arules ... ')
    t = time()
    for _ in range(runs):
        rules = arules(tracts, **mining_options)
    elapsed = time() - t
    stderr.write('done [%.3fs].\n' % elapsed)
Beispiel #8
0
            res.append((head, body, int(line[-2]), int(line[-1])))
    return res

#-----------------------------------------------------------------------

# Benchmark driver: generates random transactions, writes them to data.txt
# and times the `arules` call.
if __name__ == '__main__':
    # Number of timed repetitions: first command-line argument, default 1.
    runs   = int(argv[1]) if len(argv) > 1 else 1
    # 1000 random transactions; each of the items 1..100 is included
    # whenever random() yields a value below 0.1.
    tracts = [[i+1 for i in range(100) if random() < 0.1]
                   for k in range(1000)]
    # One transaction per line, every item followed by a single space.
    with open('data.txt', 'w') as out:
        for t in tracts:
            for i in t: out.write('%d ' % i)
            out.write('\n')

    # With a second command-line argument: one run using an item appearance
    # map, print the rules, remove data.txt and exit.
    if len(argv) > 2:
        rules  = arules(tracts, supp=-2, conf=80, zmin=2,
                        report='ab', appear={None:'in', 1:'out'})
        for r in rules: print(r)
        remove('data.txt'); exit()

    # Timed runs; progress and timing are reported on stderr.  Note that `t`
    # is reused here as the start timestamp.
    stderr.write('association rules:\n')
    stderr.write('arules ... '); t = time()
    for r in range(runs):
        rules = arules(tracts, supp=-2, conf=80, zmin=2, report='ab')
    stderr.write('done [%.3fs].\n' % (time()-t))
    # Normalise the rules into a set of (head, sorted body tuple, x, y)
    # tuples, which makes them hashable and independent of body item order.
    rules = set([(h, tuple(sorted(list(b))), x, y)
                 for h,b,x,y in rules])
    stderr.write('\n')

    # Iterate over (label, function) pairs for the three miners; the loop
    # body is not part of this excerpt.
    for p,f in [('apriori',  apriori),
                ('eclat',    eclat),
                ('fpgrowth', fpgrowth)]:
Beispiel #9
0
def get_association_rules(x,
                          y,
                          asr_params=None,
                          z_score_threshold=1.96):
    '''
    Get the matrix of rules generated using the association rules algorithm.

    Parameters:
    - x, pandas.DataFrame, shape = [n_samples, n_features]
      Training vector, where n_samples is the number of samples and
      n_features is the number of features. Values are cast to int32.
    - y, pandas.Series, shape = [n_samples]
      Target vector relative to X. Values are cast to int32.
    - asr_params, dict. Parameters to used for the association rules
      algorithm. See package pyfim for the parameters that can be used.
      Defaults to {'supp': -10, 'conf': 0., 'thresh': 100} when None.
      The keys 'report', 'eval' and 'appear' are set by this function and
      must not be supplied.
    - z_score_threshold, float. The threshold to use for the z-score parameter.

    Output:
    - return the relevant rules under the form of a np.array.
      shape (n_rules, n_rule_caracteristics) where n_rules is the number of
      relevant rules and n_rule_caracteristics the number of
      caracteristics of the rules(equal to 8).
      An empty array of shape (0, 8) is returned when no rule is mined.
    '''
    # Confidence is an indication of how often the rule has been found to be
    # true.
    # The confidence value of a rule is the proportion of the transactions that
    # contains X which also contains Y.
    # {conf (X-> Y)=supp(X U Y)/supp(X).

    # A fresh dict per call instead of a shared mutable default argument.
    if asr_params is None:
        asr_params = {'supp': -10, 'conf': 0., 'thresh': 100}

    # Turn every cell into a string item: 'l-<label>' for the target column
    # and 'v<column>-<value>' for the feature columns.
    data = pd.DataFrame(x, dtype=np.int32)
    data['target'] = y
    data['target'] = data['target'].astype(np.int32)
    # astype(str) replaces the deprecated DataFrame.applymap(str); all cells
    # are int32 at this point, so the resulting strings are the same.
    data = data.astype(str)
    data['target'] = data['target'].apply(lambda label: 'l' + '-' + label)

    for col in [name for name in data.columns if name != 'target']:
        prefix = 'v' + str(col) + '-'
        # The prefix is bound as a default so the lambda does not depend on
        # the loop variable.
        data[col] = data[col].apply(lambda value, prefix=prefix: prefix + value)

    # The transaction list is the same for every class: build it once.
    transactions = data.values.tolist()

    rules = []

    # One mining pass per class label; `appear` restricts that label to the
    # rule head ('out') and every other item to the rule body ('in').
    for c in np.unique(data['target']):
        rules_ = fim.arules(
            transactions,
            report='(abhlQ',
            # ( => combine values in a tuple (must be first character)
            # a => absolute item set  support (number of transactions)
            #   => rule modality size
            # b => absolute body set  support (number of transactions)
            #   => rule size
            # h => absolute head item support (number of transactions)
            #   => number of samples with the modality of interest in the whole
            #      dataset
            # l => lift value of a rule (confidence/prior)
            # Q => support of the empty set (total number of transactions)
            #   => total number of samples in the whole dataset
            # Results are under the form:
            # ('modality', (item1, item2), (a, b, h, l, Q)
            # E.g.: ('l-0', ('v1-2',), (35, 35, 50, 3.0, 150))
            eval='l',
            appear={
                None: 'in',
                c: 'out'
            },
            **asr_params)
        rules.extend(get_info_rules(r) for r in rules_)

    # Without any rule np.array([]) would be 1-D and the column filter below
    # would raise an IndexError; return an empty matrix with the documented
    # number of rule characteristics instead.
    if not rules:
        return np.empty((0, 8))

    rules = np.array(rules)

    # Column 6 is compared with the z-score threshold (presumably the z-score
    # computed by get_info_rules -- confirm against that helper).
    return rules[rules[:, 6] >= z_score_threshold]
Beispiel #10
0
    for items, supp in s:
        items = map(ele_to_str, items)
        print(u"{} items: {}".format(supp, "|".join(items)).encode('utf-8'))
elif algo == "eclat-rules":
    rules = fim.eclat(transactions, target='r', report='aC')
    rules = sorted(rules, key=lambda x: x[3])
    for consequence, condition, support_count, confidence_percent in rules:
        condition = map(ele_to_str, condition)
        consequence = ele_to_str(consequence)
        print(u"{:6.2f}% of {} eles: If {} then {}".format(
            confidence_percent, support_count, " & ".join(condition),
            consequence))
elif algo == "arules":
    rules = fim.arules(transactions,
                       supp=2,
                       conf=75,
                       report='aCl',
                       eval='l',
                       thresh=30)
    #random.shuffle(rules) # lambda x: x[3])
    #rules = sorted(rules, key = lambda x: x[3]) # sort by confidence %
    rules = sorted(rules, key=lambda x: x[4])  # sort by lift
    rules = sorted(rules, key=lambda x: -len(x[1]))  # sort by condition length
    for consequence, condition, support_count, confidence_percent, lift in rules:
        conditionSet = set(condition)
        #den = [c for c in sets if not conditionSet.issubset(c)]
        #num = [c for c in den if consequence in c]
        interestingness = 0
        if calculate_interestingness:
            den = 0
            for c in sets:
                if not conditionSet.issubset(c):
def top_rules(transactions,
              appearance: Optional[Dict] = None,
              target_rule_count: int = 1000,
              init_support: float = 0.05,
              init_confidence: float = 0.5,
              confidence_step: float = 0.05,
              support_step: float = 0.05,
              min_length: int = 2,
              init_max_length: int = 3,
              total_timeout: float = 100.0,
              max_iterations: int = 30,
              verbose: bool = True):
    """
    Function for finding the best n (target_rule_count) rules from transaction list
    Returns list of mined rules. The rules are not ordered.

    fim.arules is run repeatedly. After each pass that yields fewer than
    target_rule_count rules the search is relaxed (longer rules first, lower
    confidence afterwards) until the target is met, the time budget is spent,
    max_iterations passes were made or no option is left. The result of the
    last pass is returned; an empty list is returned for an empty transaction
    list and None if no pass was run at all (max_iterations < 1).

    :param transactions : 2D array of strings,  e.g. [["a:=:1", "b:=:3"], ["a:=:4", "b:=:2"]]
    :param appearance : dict - dictionary specifying rule appearance
    :param target_rule_count : int - target number of rules to mine
    :param init_support : float - support from which to start mining
    :param init_confidence : float - confidence from which to start mining
    :param confidence_step : float - amount by which the confidence is lowered per step
    :param support_step : float - amount by which the support is raised per step
    :param min_length : int - minimum len of rules to mine
    :param init_max_length : int - maximum len from which to start mining
    :param total_timeout : float - maximum execution time of the function
        (checked after each mining pass)
    :param max_iterations : int - maximum iterations to try before stopping execution
    :param verbose: bool - print progress information
    """

    if appearance is None:
        appearance = {}

    # Nothing to mine; this also avoids an IndexError on transactions[0].
    if len(transactions) == 0:
        return []

    start_time: float = time.time()

    # the length of a rule is at most the length of a transaction. (All transactions have the same length.)
    MAX_RULE_LEN: int = len(transactions[0])

    current_support: float = init_support
    current_confidence: float = init_confidence

    current_max_length: int = init_max_length

    keep_mining: bool = True

    # Never set to True anywhere in this function, so the branch that raises
    # the support below cannot currently be taken.
    is_max_length_decreased_due_timeout: bool = False
    current_iteration: int = 0

    last_rule_count = -1
    rules: Optional[List] = None

    if verbose:
        print("STARTING top_rules")
    while keep_mining:
        current_iteration += 1

        if current_iteration > max_iterations:
            if verbose:
                print("Max iterations reached")
            break

        if verbose:
            print(f"--- iteration {current_iteration} ---")
            print((f"Running apriori with setting: "
                   f"confidence={current_confidence}, "
                   f"support={current_support}, "
                   f"min_length={min_length}, "
                   f"max_length={current_max_length}, "
                   f"MAX_RULE_LEN={MAX_RULE_LEN}"))

        # The result of the latest pass always replaces the previous one.
        rules = fim.arules(transactions,
                           supp=current_support,
                           conf=current_confidence,
                           mode="o",
                           report="sc",
                           appear=appearance,
                           zmax=current_max_length,
                           zmin=min_length)
        current_nb_of_rules = len(rules)

        if verbose:
            print(
                f"Rule count: {current_nb_of_rules}, Iteration: {current_iteration}"
            )

        if current_nb_of_rules >= target_rule_count:
            keep_mining = False
            if verbose:
                print(f"\tTarget rule count satisfied: {target_rule_count}")
        else:
            current_execution_time = time.time() - start_time

            can_grow_length = current_max_length < MAX_RULE_LEN
            rule_count_changed = last_rule_count != current_nb_of_rules
            can_raise_support = current_support <= 1 - support_step

            # if timeout limit exceeded
            if current_execution_time > total_timeout:
                if verbose:
                    print(f"\tExecution time exceeded: {total_timeout}")
                keep_mining = False

            # IF we can still increase our rule length AND
            # the number of rules changed since the last length increase AND
            # the max length has not been decreased due to a timeout
            # THEN allow one more item per rule
            elif can_grow_length and rule_count_changed and not is_max_length_decreased_due_timeout:
                current_max_length += 1
                last_rule_count = current_nb_of_rules
                if verbose:
                    print(f"\tIncreasing max_length {current_max_length}")

            # IF we can still increase our rule length AND
            # the max length was decreased due to a timeout AND
            # we can still increase our support
            # THEN increase our support and increment our max length
            elif can_grow_length and is_max_length_decreased_due_timeout and can_raise_support:
                current_support += support_step
                current_max_length += 1
                last_rule_count = current_nb_of_rules
                is_max_length_decreased_due_timeout = False

                if verbose:
                    print(f"\tIncreasing maxlen to {current_max_length}")
                    print(f"\tIncreasing minsup to {current_support}")

            # IF we can still decrease our confidence
            # THEN decrease our confidence
            elif current_confidence > confidence_step:
                current_confidence -= confidence_step
                if verbose:
                    print(f"\tDecreasing confidence to {current_confidence}")

            else:
                if verbose:
                    print("\tAll options exhausted")
                keep_mining = False
        if verbose:
            end_of_current_iteration_message = f"--- end iteration {current_iteration} ---"
            print(end_of_current_iteration_message)
            print("-" * len(end_of_current_iteration_message))
    if verbose:
        print(f"FINISHED top_rules after {current_iteration} iterations")
    return rules
Beispiel #12
0
#-----------------------------------------------------------------------

if __name__ == '__main__':
    # Number of timed repetitions: first command-line argument, default 1.
    if len(argv) > 1:
        runs = int(argv[1])
    else:
        runs = 1

    # 1000 random transactions; each of the items 1..100 is included
    # whenever random() yields a value below 0.1.
    tracts = [[item for item in range(1, 101) if random() < 0.1]
              for _ in range(1000)]

    # Dump the transactions to data.txt, one per line, every item followed
    # by a single space.
    with open('data.txt', 'w') as out:
        for tract in tracts:
            out.write(''.join('%d ' % item for item in tract))
            out.write('\n')

    # Time `runs` calls of arules; progress and timing go to stderr.
    stderr.write('association rules:\n')
    stderr.write('arules ... ')
    t = time()
    for _ in range(runs):
        rules = arules(tracts, supp=-2, conf=80, zmin=2, report='ab')
    elapsed = time() - t
    stderr.write('done [%.3fs].\n' % elapsed)

    # Normalise the rules into a set of
    # (head, sorted body tuple, first value, second value) tuples.
    rules = {(head, tuple(sorted(body)), values[0], values[1])
             for head, body, values in rules}
    stderr.write('\n')

    # Time the same rule-mining task with each of the three miners.
    miners = (('apriori', apriori),
              ('eclat', eclat),
              ('fpgrowth', fpgrowth))
    for p, f in miners:
        stderr.write('%s (rules)\n' % p)
        stderr.write('python ... ')
        t = time()
        for _ in range(runs):
            pyrules = f(tracts, target='r', supp=-2, conf=80,
                        zmin=2, report='ab')
        elapsed = time() - t
        stderr.write('done [%.3fs].\n' % elapsed)
Beispiel #13
0
'''
Written by Stan Smoltis for Sydney Business Intelligence User Group
'''
from fim import arules
import pandas
import pyodbc

# Minimum support and confidence handed to fim.arules below.
# NOTE(review): the PyFIM documentation describes positive `supp` and `conf`
# as percentages, so these values would mean 0.1% and 0.7% -- confirm that
# 10 / 70 was not intended.
suppParam = 0.1
confParam = 0.7

# The backslash is written as "\\" because "\s" is not a valid escape
# sequence in a normal string literal; the connection string is unchanged.
_conn = pyodbc.connect(
    "DRIVER={SQL Server};SERVER=(local)\\sql2017;Database=PythonDemo;Trusted_Connection=yes;"
)
_sql = "SELECT [Departments] as [Values] FROM [dbo].[CombinedSets] WHERE StoreCode=20"
try:
    InputDataSet = pandas.read_sql_query(sql=_sql, con=_conn)
finally:
    # The connection is only needed for this one query; release it even if
    # the query fails.
    _conn.close()

# Each row holds a comma-separated list; split it into one transaction per
# row (raw string so that \s reaches the regex engine as written).
rules = arules(InputDataSet["Values"].str.split(r",\s*"),
               supp=suppParam,
               conf=confParam,
               zmin=2,
               report="SCl")

# One row per rule: consequent, antecedent items, then the three values
# requested through report="SCl".
OutputDataSet = pandas.DataFrame(
    rules, columns=["cons", "ante", "supp", "conf", "lift"])
# Render the antecedent items as a single comma-separated string.
OutputDataSet["ante"] = OutputDataSet["ante"].apply(lambda col: ", ".join(col))
print(len(OutputDataSet))
print(OutputDataSet)