def find_association_rules(self, transactions):
    rules = fim.arules(transactions,
                       supp=-3,      # negative support = absolute count (at least 3 transactions)
                       zmin=1,
                       zmax=6,
                       conf=60,
                       report="SC")  # report support and confidence as percentages
    return rules
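# A minimal standalone sketch of the call above, assuming PyFIM is installed
# as `fim` and using made-up transactions: a negative `supp` is an absolute
# transaction count (supp=-2 means "at least 2 transactions"), and
# report="SC" appends support and confidence as percentages to each rule.
import fim

transactions = [["a", "b"], ["a", "b", "c"], ["a", "c"], ["b", "c"]]
rules = fim.arules(transactions, supp=-2, conf=60, report="SC")
for head, body, supp_pct, conf_pct in rules:
    print("%s -> %s  supp=%.1f%%  conf=%.1f%%" % (body, head, supp_pct, conf_pct))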
def fit_fim_apriori(self, conf: float = 70, support: float = 10):
    """ Get classification rules by PyFIM. """
    self.rules = fim.arules(self.transactions,
                            supp=support,
                            conf=conf,
                            report="sc",
                            mode="o",
                            appear=dict(self.appearance),
                            zmin=2,  # at least one antecedent and one consequent
                            zmax=self.max_length)
def fit_fim_apriori(self, conf: float = 70, support: float = 10):
    """Train the model to be able to get classification rules (PyFIM).

    Parameters
    ----------
    conf : float
        Confidence. DEFAULT: 70%
    support : float
        Support. DEFAULT: 10%
    """
    self.rules = fim.arules(self.transactions,
                            supp=support,
                            conf=conf,
                            report="sc",
                            mode="o",
                            appear=dict(self.appearance),
                            zmin=2,  # at least one antecedent and one consequent
                            zmax=self.max_length)
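# A minimal standalone sketch of the call made by the wrappers above, with
# made-up transactions. supp and conf are passed as percentages; note that
# the lowercase report="sc" returns each rule's support and confidence as
# fractions, whereas the uppercase report="SC" would return percentages.
import fim

transactions = [["food:=:meat", "class:=:carnivore"],
                ["food:=:meat", "class:=:carnivore"],
                ["food:=:plants", "class:=:herbivore"]]
rules = fim.arules(transactions, supp=10, conf=70, report="sc",
                   mode="o", zmin=2, zmax=3)
for head, body, supp_frac, conf_frac in rules:
    print(body, "=>", head, supp_frac, conf_frac)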
print(u"{} items: {}".format(supp/float(trcount), "|".join(items)).encode('utf-8')) if algo == "eclat": s = fim.eclat(transactions, supp=2) s = sorted(s, key=lambda x:x[1]) for items,supp in s: items = map(ele_to_str, items) print(u"{} items: {}".format(supp, "|".join(items)).encode('utf-8')) elif algo == "eclat-rules": rules = fim.eclat(transactions, target='r', report='aC') rules = sorted(rules, key = lambda x: x[3]) for consequence, condition, support_count, confidence_percent in rules: condition = map(ele_to_str, condition) consequence = ele_to_str(consequence) print(u"{:6.2f}% of {} eles: If {} then {}".format(confidence_percent, support_count, " & ".join(condition), consequence)) elif algo == "arules": rules = fim.arules(transactions, supp=2, conf=75, report='aCl', eval='l', thresh=30) #random.shuffle(rules) # lambda x: x[3]) #rules = sorted(rules, key = lambda x: x[3]) # sort by confidence % rules = sorted(rules, key = lambda x: x[4]) # sort by lift rules = sorted(rules, key = lambda x: -len(x[1])) # sort by condition length for consequence, condition, support_count, confidence_percent, lift in rules: conditionSet = set(condition) #den = [c for c in sets if not conditionSet.issubset(c)] #num = [c for c in den if consequence in c] interestingness = 0 if calculate_interestingness: den = 0 for c in sets: if not conditionSet.issubset(c): den = den+1 num = 0
import time

import fim


def top_rules(transactions,
              appearance={},
              target_rule_count=1000,
              init_support=0.,
              init_conf=0.5,
              conf_step=0.05,
              supp_step=0.05,
              minlen=2,
              init_maxlen=3,
              total_timeout=100.,
              max_iterations=30):
    """Function for finding the best n (target_rule_count) rules from a transaction list.

    Parameters
    ----------
    transactions : 2D array of strings
        e.g. [["a:=:1", "b:=:3"], ["a:=:4", "b:=:2"]]
    appearance : dictionary
        dictionary specifying rule appearance
    target_rule_count : int
        target number of rules to mine
    init_support : float
        support from which to start mining
    init_conf : float
        confidence from which to start mining
    conf_step : float
        step by which confidence is decreased
    supp_step : float
        step by which support is increased
    minlen : int
        minimum length of rules to mine
    init_maxlen : int
        maxlen from which to start mining
    total_timeout : float
        maximum execution time of the function
    max_iterations : int
        maximum iterations to try before stopping execution

    Returns
    -------
    list of mined rules. The rules are not ordered.
    """
    starttime = time.time()

    # the length of a rule is bounded by the length of a transaction
    MAX_RULE_LEN = len(transactions[0])

    support = init_support
    conf = init_conf
    maxlen = init_maxlen

    flag = True
    lastrulecount = -1
    maxlendecreased_due_timeout = False
    iterations = 0
    rules = None

    while flag:
        iterations += 1
        if iterations == max_iterations:
            print("Max iterations reached")
            break

        print("Running apriori with setting: confidence={}, support={}, minlen={}, maxlen={}, MAX_RULE_LEN={}"
              .format(conf, support, minlen, maxlen, MAX_RULE_LEN))

        rules_current = fim.arules(transactions,
                                   supp=support,
                                   conf=conf,
                                   mode="o",
                                   report="sc",
                                   appear=appearance,
                                   zmax=maxlen,
                                   zmin=minlen)
        rules = rules_current
        rule_count = len(rules)
        print("Rule count: {}, Iteration: {}".format(rule_count, iterations))

        if rule_count >= target_rule_count:
            flag = False
            print("Target rule count satisfied:", target_rule_count)
        else:
            exectime = time.time() - starttime
            if exectime > total_timeout:
                print("Execution time exceeded:", total_timeout)
                flag = False
            elif maxlen < MAX_RULE_LEN and lastrulecount != rule_count and not maxlendecreased_due_timeout:
                maxlen += 1
                lastrulecount = rule_count
                print("Increasing maxlen", maxlen)
            elif maxlen < MAX_RULE_LEN and maxlendecreased_due_timeout and support <= 1 - supp_step:
                support += supp_step
                maxlen += 1
                lastrulecount = rule_count
                print("Increasing maxlen to", maxlen)
                print("Increasing minsup to", support)
                maxlendecreased_due_timeout = False
            elif conf > conf_step:
                conf -= conf_step
                print("Decreasing confidence to", conf)
            else:
                print("All options exhausted")
                flag = False

    return rules
dec = 0
for r in frequentset:
    if 100 in r[0]:
        rec += 1
    elif 200 in r[0]:
        dec += 1
print("The number of frequent itemsets that contain 100: %d" % rec, file=outf)
print("\npart d:", file=outf)
print("The number of frequent itemsets that contain 200: %d" % dec, file=outf)
print("\npart e:", file=outf)
print("Top 10 rules with head 100:", file=outf)
print("output format: rule_head|rule_tail|sup|confidence", file=outf)

rules = arules(dataset, supp=20, conf=0)
rule_100head = list()
for r in rules:
    if r[0] == 100:
        rule_100head.append(r)
    elif not isinstance(r[0], int):  # head is a tuple of items
        if 100 in r[0]:
            rule_100head.append(r)

rule_100head.sort(key=takeFourth, reverse=True)
i = 0
for r in rule_100head:
    if i < 10:
        print(r, file=outf)
        i += 1
    else:
        break
if __name__ == '__main__':
    runs = int(argv[1]) if len(argv) > 1 else 1
    tracts = [[i + 1 for i in range(100) if random() < 0.1]
              for k in range(1000)]
    with open('data.txt', 'w') as out:
        for t in tracts:
            for i in t:
                out.write('%d ' % i)
            out.write('\n')

    if len(argv) > 2:
        rules = arules(tracts, supp=-2, conf=80, zmin=2, report='ab',
                       appear={None: 'in', 1: 'out'})
        for r in rules:
            print(r)
        remove('data.txt')
        exit()

    stderr.write('association rules:\n')
    stderr.write('arules ... ')
    t = time()
    for r in range(runs):
        rules = arules(tracts, supp=-2, conf=80, zmin=2, report='ab')
    stderr.write('done [%.3fs].\n' % (time() - t))
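# A minimal sketch of the appear={None: 'in', 1: 'out'} restriction used
# above, relying on PyFIM's documented appearance indicators: the None key
# sets the default role, 'in' restricts an item to rule bodies (antecedents),
# and 'out' restricts it to rule heads (consequents). Every mined rule below
# therefore has item 1 as its head. The data here is made up.
from random import random
from fim import arules

tracts = [[i + 1 for i in range(20) if random() < 0.3] for k in range(200)]
rules = arules(tracts, supp=-2, conf=60, zmin=2, report='ab',
               appear={None: 'in', 1: 'out'})
for head, body, itemset_supp, body_supp in rules:
    print(body, '->', head, itemset_supp, body_supp)  # head is always 1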
        res.append((head, body, int(line[-2]), int(line[-1])))
    return res

#-----------------------------------------------------------------------

if __name__ == '__main__':
    runs = int(argv[1]) if len(argv) > 1 else 1
    tracts = [[i + 1 for i in range(100) if random() < 0.1]
              for k in range(1000)]
    with open('data.txt', 'w') as out:
        for t in tracts:
            for i in t:
                out.write('%d ' % i)
            out.write('\n')

    if len(argv) > 2:
        rules = arules(tracts, supp=-2, conf=80, zmin=2, report='ab',
                       appear={None: 'in', 1: 'out'})
        for r in rules:
            print(r)
        remove('data.txt')
        exit()

    stderr.write('association rules:\n')
    stderr.write('arules ... ')
    t = time()
    for r in range(runs):
        rules = arules(tracts, supp=-2, conf=80, zmin=2, report='ab')
    stderr.write('done [%.3fs].\n' % (time() - t))
    rules = set([(h, tuple(sorted(list(b))), x, y) for h, b, x, y in rules])

    stderr.write('\n')
    for p, f in [('apriori', apriori), ('eclat', eclat), ('fpgrowth', fpgrowth)]:
import numpy as np
import pandas as pd

import fim


def get_association_rules(x, y,
                          asr_params={'supp': -10, 'conf': 0., 'thresh': 100},
                          z_score_threshold=1.96):
    '''
    Get the matrix of rules generated using the association rules algorithm.

    Parameters:
    - x, pandas.DataFrame, shape = [n_samples, n_features]
        Training vector, where n_samples is the number of samples and
        n_features is the number of features.
    - y, pandas.Series, shape = [n_samples]
        Target vector relative to x.
    - asr_params, dict.
        Parameters to use for the association rules algorithm.
        See the pyfim package for the parameters that can be used.
    - z_score_threshold, float.
        The threshold to use for the z-score parameter.

    Output:
    - return the relevant rules under the form of a np.array of shape
      (n_rules, n_rule_characteristics), where n_rules is the number of
      relevant rules and n_rule_characteristics the number of
      characteristics of the rules (equal to 8).
    '''
    # Confidence is an indication of how often the rule has been found to be
    # true. The confidence value of a rule is the proportion of the
    # transactions that contain X which also contain Y:
    #   conf(X -> Y) = supp(X U Y) / supp(X)
    data = pd.DataFrame(x, dtype=np.int32)
    data['target'] = y
    data['target'] = data['target'].astype(np.int32)
    data = data.applymap(str)
    data['target'] = data['target'].apply(lambda value: 'l-' + value)
    for col in list(set(data.columns.tolist()) - set(['target'])):
        data[col] = data[col].apply(lambda value: 'v' + str(col) + '-' + value)

    rules = []
    for c in np.unique(data['target']):
        rules_ = fim.arules(
            data.values.tolist(),
            report='(abhlQ',
            # ( => combine values in a tuple (must be first character)
            # a => absolute item set support (number of transactions)
            #      => rule modality size
            # b => absolute body set support (number of transactions)
            #      => rule size
            # h => absolute head item support (number of transactions)
            #      => number of samples with the modality of interest in the
            #         whole dataset
            # l => lift value of a rule (confidence/prior)
            # Q => support of the empty set (total number of transactions)
            #      => total number of samples in the whole dataset
            # Results are of the form:
            #   ('modality', (item1, item2), (a, b, h, l, Q))
            # E.g.: ('l-0', ('v1-2',), (35, 35, 50, 3.0, 150))
            eval='l',
            appear={None: 'in', c: 'out'},
            **asr_params)
        # get_info_rules is an external helper defined elsewhere in this module
        rules.extend(list(map(lambda r: get_info_rules(r), rules_)))

    rules = np.array(rules)
    rules = rules[rules[:, 6] >= z_score_threshold]
    return rules
import time
from typing import Dict, List, Optional

import fim


def top_rules(transactions,
              appearance: Optional[Dict] = None,
              target_rule_count: int = 1000,
              init_support: float = 0.05,
              init_confidence: float = 0.5,
              confidence_step: float = 0.05,
              support_step: float = 0.05,
              min_length: int = 2,
              init_max_length: int = 3,
              total_timeout: float = 100.0,
              max_iterations: int = 30,
              verbose: bool = True):
    """
    Function for finding the best n (target_rule_count) rules from a transaction list.
    Returns a list of mined rules. The rules are not ordered.

    :param transactions: 2D array of strings, e.g. [["a:=:1", "b:=:3"], ["a:=:4", "b:=:2"]]
    :param appearance: dict - dictionary specifying rule appearance
    :param target_rule_count: int - target number of rules to mine
    :param init_support: float - support from which to start mining
    :param init_confidence: float - confidence from which to start mining
    :param confidence_step: float - step by which confidence is decreased
    :param support_step: float - step by which support is increased
    :param min_length: int - minimum length of rules to mine
    :param init_max_length: int - maximum length from which to start mining
    :param total_timeout: float - maximum execution time of the function
    :param max_iterations: int - maximum iterations to try before stopping execution
    :param verbose: bool
    """
    if appearance is None:
        appearance = {}

    start_time: float = time.time()

    # the length of a rule is at most the length of a transaction
    # (all transactions have the same length)
    MAX_RULE_LEN: int = len(transactions[0])

    current_support: float = init_support
    current_confidence: float = init_confidence
    current_max_length: int = init_max_length

    keep_mining: bool = True
    is_max_length_decreased_due_timeout: bool = False
    current_iteration: int = 0

    last_rule_count = -1
    rules: Optional[List] = None

    if verbose:
        print("STARTING top_rules")
    while keep_mining:
        current_iteration += 1

        if current_iteration > max_iterations:
            if verbose:
                print("Max iterations reached")
            break

        if verbose:
            print(f"--- iteration {current_iteration} ---")
            print((f"Running apriori with setting: "
                   f"confidence={current_confidence}, "
                   f"support={current_support}, "
                   f"min_length={min_length}, "
                   f"max_length={current_max_length}, "
                   f"MAX_RULE_LEN={MAX_RULE_LEN}"))

        current_rules = fim.arules(transactions,
                                   supp=current_support,
                                   conf=current_confidence,
                                   mode="o",
                                   report="sc",
                                   appear=appearance,
                                   zmax=current_max_length,
                                   zmin=min_length)
        current_nb_of_rules = len(current_rules)

        # assign
        rules = current_rules

        if verbose:
            print(f"Rule count: {current_nb_of_rules}, Iteration: {current_iteration}")

        if current_nb_of_rules >= target_rule_count:
            keep_mining = False
            if verbose:
                print(f"\tTarget rule count satisfied: {target_rule_count}")
        else:
            current_execution_time = time.time() - start_time

            # if timeout limit exceeded
            if current_execution_time > total_timeout:
                if verbose:
                    print(f"\tExecution time exceeded: {total_timeout}")
                keep_mining = False

            # if we can still increase our rule length AND
            # the number of rules found has changed since last time AND
            # max_length has not been decreased due to a timeout,
            # THEN: increase the rule length
            elif current_max_length < MAX_RULE_LEN and last_rule_count != current_nb_of_rules and not is_max_length_decreased_due_timeout:
                current_max_length += 1
                last_rule_count = current_nb_of_rules
                if verbose:
                    print(f"\tIncreasing max_length {current_max_length}")

            # if we can still increase our rule length AND
            # max_length was decreased due to a timeout AND
            # we can still increase our support,
            # THEN: increase our support and increment our max length
            elif current_max_length < MAX_RULE_LEN and is_max_length_decreased_due_timeout and current_support <= 1 - support_step:
                current_support += support_step
                current_max_length += 1
                last_rule_count = current_nb_of_rules
                is_max_length_decreased_due_timeout = False
                if verbose:
                    print(f"\tIncreasing maxlen to {current_max_length}")
                    print(f"\tIncreasing minsup to {current_support}")

            # IF we can still decrease our confidence
            # THEN decrease our confidence
            elif current_confidence > confidence_step:
                current_confidence -= confidence_step
                if verbose:
                    print(f"\tDecreasing confidence to {current_confidence}")

            else:
                if verbose:
                    print("\tAll options exhausted")
                keep_mining = False

        if verbose:
            end_of_current_iteration_message = f"--- end iteration {current_iteration} ---"
            print(end_of_current_iteration_message)
            print("-" * len(end_of_current_iteration_message))

    if verbose:
        print(f"FINISHED top_rules after {current_iteration} iterations")
    return rules
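# A minimal usage sketch for top_rules above; the transactions are made-up
# placeholders in the "attribute:=:value" format the docstring describes.
transactions = [
    ["a:=:1", "b:=:3", "class:=:0"],
    ["a:=:4", "b:=:2", "class:=:1"],
    ["a:=:1", "b:=:2", "class:=:0"],
]
# restrict the class items to rule heads, everything else to rule bodies
appearance = {None: "in", "class:=:0": "out", "class:=:1": "out"}
rules = top_rules(transactions,
                  appearance=appearance,
                  target_rule_count=50,
                  total_timeout=10.0,
                  verbose=False)
print(len(rules) if rules is not None else 0, "rules mined")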
#-----------------------------------------------------------------------

if __name__ == '__main__':
    runs = int(argv[1]) if len(argv) > 1 else 1
    tracts = [[i + 1 for i in range(100) if random() < 0.1]
              for k in range(1000)]
    with open('data.txt', 'w') as out:
        for t in tracts:
            for i in t:
                out.write('%d ' % i)
            out.write('\n')

    stderr.write('association rules:\n')
    stderr.write('arules ... ')
    t = time()
    for r in range(runs):
        rules = arules(tracts, supp=-2, conf=80, zmin=2, report='ab')
    stderr.write('done [%.3fs].\n' % (time() - t))
    # normalize the rules (sort each body) so that result sets can be compared;
    # report='ab' yields 4-tuples (head, body, itemset support, body support)
    rules = set([(h, tuple(sorted(list(b))), x, y) for h, b, x, y in rules])

    stderr.write('\n')
    for p, f in [('apriori', apriori), ('eclat', eclat), ('fpgrowth', fpgrowth)]:
        stderr.write(p + ' (rules)\n')
        stderr.write('python ... ')
        t = time()
        for r in range(runs):
            pyrules = f(tracts, target='r', supp=-2, conf=80, zmin=2, report='ab')
        stderr.write('done [%.3fs].\n' % (time() - t))
'''
Written by Stan Smoltis for Sydney Business Intelligence User Group
'''
from fim import arules
import pandas
import pyodbc

suppParam = 0.1
confParam = 0.7

# raw string so that the backslash in "(local)\sql2017" is not treated as an escape
_conn = pyodbc.connect(
    r"DRIVER={SQL Server};SERVER=(local)\sql2017;Database=PythonDemo;Trusted_Connection=yes;"
)
_sql = "SELECT [Departments] as [Values] FROM [dbo].[CombinedSets] WHERE StoreCode=20"
InputDataSet = pandas.read_sql_query(sql=_sql, con=_conn)

rules = arules(InputDataSet["Values"].str.split(r",\s*"),
               supp=suppParam,
               conf=confParam,
               zmin=2,
               report="SCl")

OutputDataSet = pandas.DataFrame(rules,
                                 columns=["cons", "ante", "supp", "conf", "lift"])
OutputDataSet["ante"] = OutputDataSet["ante"].apply(lambda col: ", ".join(col))

print(len(OutputDataSet))
print(OutputDataSet)
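# The SQL Server step above only supplies comma-separated basket strings; a
# minimal sketch of the same mining step on an in-memory list of made-up
# baskets. Note that PyFIM interprets supp and conf as percentages, so the
# script's supp=0.1 means 0.1% minimum support, not 10%.
import re
from fim import arules

baskets = ["Bakery, Dairy", "Bakery, Dairy, Produce", "Dairy, Produce"]
transactions = [re.split(r",\s*", b) for b in baskets]
rules = arules(transactions, supp=0.1, conf=0.7, zmin=2, report="SCl")
# each rule: (consequent, antecedent tuple, support %, confidence %, lift)
for cons, ante, supp, conf, lift in rules:
    print(", ".join(ante), "->", cons, supp, conf, lift)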