import pandas as pd
from fim import eclat

def run_eclat(dataset):
    # convert the data into a numpy array
    X = dataset.iloc[:, :].values
    # construct the list of per-student transactions
    subjects = []
    current_subject = []
    current_student = X[0, 0]
    for i in range(len(X)):
        if current_student == X[i, 0]:
            current_subject.append(X[i, 1])
        else:
            if len(current_subject) > 1:
                subjects.append(current_subject)
            current_subject = [X[i, 1]]  # start the new student with this row so its first subject is not dropped
            current_student = X[i, 0]
    if len(current_subject) > 1:  # flush the last student's transaction
        subjects.append(current_subject)
    results = []
    results = eclat(subjects, out=results, supp=0.09, zmin=2)
    found = []
    for i in range(len(results)):
        found.append(list(results[i][0]))
    return pd.DataFrame(found)
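# A shorter route to the same transaction list, as a minimal sketch: group by the
# student column with pandas instead of the manual scan above. The column
# positions (0 = student id, 1 = subject) are taken from run_eclat; everything
# else here is an assumption, not part of the original code.
import pandas as pd
from fim import eclat

def run_eclat_groupby(dataset):
    grouped = dataset.groupby(dataset.columns[0])[dataset.columns[1]].apply(list)
    subjects = [s for s in grouped if len(s) > 1]  # keep multi-subject students only
    results = eclat(subjects, supp=0.09, zmin=2)   # (itemset, support) pairs
    return pd.DataFrame([list(itemset) for itemset, _ in results])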
def runForAll(OUTPUT_DIR, XML_OUTPUT_DIR, FV_CSV, FV_PICKLE):
    createDirIfNotExists(OUTPUT_DIR)
    createDirIfNotExists(XML_OUTPUT_DIR)
    transactions, idMap = loadData(FV_CSV)
    print "Mining itemsets"
    # target='c': closed itemsets; negative supp: absolute support threshold (number of transactions)
    itemsets = eclat(tracts=transactions, target='c', supp=-MIN_SUPPORT, zmin=MIN_ITEMSET_SIZE)
    print "Done mining itemsets"
    if itemsets is None:
        print "No itemsets found"
        sys.exit(0)
    print "Found", len(itemsets), "itemsets"
    labels, fvValues, fvidmap, labidmap, featureVector = pickle.load(open(FV_PICKLE, 'rb'))
    fvinv_map = {v: k for k, v in fvidmap.iteritems()}
    labinv_map = {v: k for k, v in labidmap.iteritems()}
    decoded_transactions = [set(expandCpreds(t, idMap, fvinv_map)) for t in transactions]
    args = [(t, itemsets, MIN_CONF, transactions, decoded_transactions,
             decodeLabel(idt, labels, labinv_map), idt, fvinv_map, idMap,
             OUTPUT_DIR, XML_OUTPUT_DIR, labels, labinv_map)
            for idt, t in enumerate(transactions) if len(t) > 0]
    if PARALLELIZE:
        # Let's make this parallel now for speed...
        p = mp.Pool(mp.cpu_count())
        p.map(checkConsistency, args)
    else:
        for arg in args:
            checkConsistency(arg)
def eclatRules(das, country):
    if country != 'all':
        das = das[das.ship_to_country.apply(lambda x: x in country)]
    das = das.groupby(['order_line'])['ordered_item'].apply(list).values.tolist()
    eclat_result = eclat(das, supp=10, zmax=4, report='aCL', target='r', eval='l')
    eclat_result = pd.DataFrame(eclat_result)
    # fim reports each rule as (head, body, <report values>): the single consequent
    # item comes first, then the antecedent tuple, then support/confidence/lift
    eclat_result.columns = ['consequents', 'antecedents', 'support', 'confidence', 'lift']
    eclat_result.antecedents = eclat_result.antecedents.astype('str')
    eclat_result = eclat_result[eclat_result.antecedents != '()']  # drop rules with an empty antecedent
    return eclat_result
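# Hedged usage sketch for eclatRules: the column names (ship_to_country,
# order_line, ordered_item) follow the function above, but the sample orders
# below are invented for illustration.
import pandas as pd

orders = pd.DataFrame({
    'order_line':      [1, 1, 2, 2, 2, 3, 3],
    'ordered_item':    ['a', 'b', 'a', 'b', 'c', 'b', 'c'],
    'ship_to_country': ['DE', 'DE', 'FR', 'FR', 'FR', 'DE', 'DE'],
})
rules = eclatRules(orders, 'all')  # 'all' skips the country filter
print(rules.head())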
from datetime import datetime
from fim import eclat

def do_eclat(min_support):
    transactions = []
    result = return_data_rows()
    for row in result:
        one_trip_locations = []
        for location in row.polyline:
            # truncate coordinates to 8 characters so nearby points map to the same item
            x = str(location.lat)[:8] + "," + str(location.long)[:8]
            one_trip_locations.append(x)
        numbers_tuple = tuple(one_trip_locations)
        transactions.append(numbers_tuple)
    start_time = datetime.now()
    rules = eclat(transactions, supp=min_support * 10)
    rules.sort(key=lambda x: x[1], reverse=True)  # sort itemsets by support, descending
    end_time = datetime.now()
    print(rules)
    diff = end_time - start_time
    print("eclat runtime:", diff.total_seconds(), "seconds")
import logging
from typing import List

from fim import eclat

def transactions2freqitems(transactions_by_labels: List[List], supp=0.05, zmin=1, zmax=3) -> List[tuple]:
    supp = int(supp * 100)  # fim expects support as a percentage, not a fraction
    itemsets = set()
    for trans in transactions_by_labels:
        itemset = [tuple(sorted(r[0])) for r in eclat(trans, supp=supp, zmin=zmin, zmax=zmax)]
        itemsets |= set(itemset)
    itemsets = list(itemsets)
    logging.info("Total {:d} itemsets mined".format(len(itemsets)))
    return itemsets
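# Minimal usage sketch for transactions2freqitems; the two transaction groups
# below are invented sample data, not from the original code.
groups = [
    [['milk', 'bread'], ['milk', 'bread', 'eggs'], ['bread']],
    [['milk', 'eggs'], ['milk', 'bread', 'eggs']],
]
freq = transactions2freqitems(groups, supp=0.5, zmin=1, zmax=2)
print(freq)  # e.g. [('bread',), ('bread', 'milk'), ...] - set union, so order is not guaranteed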
def runForEPointGroups():
    createDirIfNotExists(EPG_OUTPUT_DIR)
    createDirIfNotExists(EPG_XML_OUTPUT_DIR)
    # Read in the input directory
    csvFiles = [getFilename(f) for f in os.listdir(EPG_FV_DIR)
                if isFile(EPG_FV_DIR, f) and f.endswith('.csv')]
    for fname in csvFiles:
        EPG_FV_CSV = os.path.join(EPG_FV_DIR, fname + '.csv')
        EPG_FV_PICKLE = os.path.join(EPG_FV_DIR, fname + '.pickle')
        EPG_i_OD = os.path.join(EPG_OUTPUT_DIR, fname)
        EPG_i_OD_XML = os.path.join(EPG_XML_OUTPUT_DIR, fname)
        createDirIfNotExists(EPG_i_OD)
        createDirIfNotExists(EPG_i_OD_XML)
        transactions, idMap = loadData(EPG_FV_CSV)
        # Make sure the transactions are not all empty
        if len(idMap) <= 0:
            continue
        print "Mining itemsets for", fname
        itemsets = eclat(tracts=transactions, target='c', supp=-MIN_SUPPORT, zmin=MIN_ITEMSET_SIZE)
        print "Done mining itemsets for", fname
        if itemsets is None:
            print "No itemsets found"
            sys.exit(0)
        print "Found", len(itemsets), "itemsets"
        labels, fvValues, fvidmap, labidmap, featureVector = pickle.load(open(EPG_FV_PICKLE, 'rb'))
        fvinv_map = {v: k for k, v in fvidmap.iteritems()}
        labinv_map = {v: k for k, v in labidmap.iteritems()}
        decoded_transactions = [set(expandCpreds(t, idMap, fvinv_map)) for t in transactions]
        # sort itemsets by support, descending
        itemsets = sorted([(isup, iset) for iset, isup in itemsets], reverse=True)
        itemsets = [(iset, isup) for isup, iset in itemsets]
        for idt, t in enumerate(transactions):
            checkGroupConsistency((t, itemsets, MIN_CONF, transactions, decoded_transactions,
                                   decodeLabel(idt, labels, labinv_map), idt, fvinv_map, idMap,
                                   EPG_i_OD, EPG_i_OD_XML, labels, labinv_map))
tid = int(argv[1])
if tid < -2:
    print(fpgrowth.__doc__)
elif tid < -1:
    print(eclat.__doc__)
elif tid < 0:
    print(apriori.__doc__)
else:
    tracts = [[1, 2, 3], [1, 4, 5], [2, 3, 4], [1, 2, 3, 4], [2, 3],
              [1, 2, 4], [4, 5], [1, 2, 3, 4], [3, 4, 5], [1, 2, 3]]
    print('transactions:')
    for t in tracts:
        print(t)
    if tid < 1:
        print('apriori(tracts, supp=-3, zmin=2):')
        for r in apriori(tracts, supp=-3, zmin=2):
            print(r)
    elif tid < 2:
        print('eclat(tracts, supp=-3, zmin=2):')
        for r in eclat(tracts, supp=-3, zmin=2):
            print(r)
    elif tid < 3:
        print('fpgrowth(tracts, supp=-3, zmin=2):')
        for r in fpgrowth(tracts, supp=-3, zmin=2):
            print(r)
    else:
        print('fim(tracts, supp=-3, zmin=2, report=\'#\'):')
        for r in fim(tracts, supp=-3, zmin=2, report='#'):
            print(r)
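# Side note as a hedged sketch: in pyfim, a positive supp is a percentage of the
# transactions, while a negative supp is an absolute count, so supp=-3 above asks
# for itemsets occurring in at least 3 of the 10 transactions. The check below
# only illustrates that reading.
from fim import eclat

tracts = [[1, 2, 3], [1, 4, 5], [2, 3, 4], [1, 2, 3, 4], [2, 3],
          [1, 2, 4], [4, 5], [1, 2, 3, 4], [3, 4, 5], [1, 2, 3]]
for itemset, support in eclat(tracts, supp=-3, zmin=2):
    assert support >= 3  # every reported itemset meets the absolute threshold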
def ele_to_str(ele):
    global db
    return util.ele_to_str(db, ele)

sets = map(set, transactions)
print('running algorithm', file=sys.stderr)
before = time.time()
if algo == "apriori":
    s = fim.apriori(transactions, supp=2)
    s = sorted(s, key=lambda x: x[1])
    for items, supp in s:
        items = map(ele_to_str, items)
        print(u"{} items: {}".format(supp / float(trcount), "|".join(items)).encode('utf-8'))
elif algo == "eclat":  # elif keeps the algorithm dispatch a single chain
    s = fim.eclat(transactions, supp=2)
    s = sorted(s, key=lambda x: x[1])
    for items, supp in s:
        items = map(ele_to_str, items)
        print(u"{} items: {}".format(supp, "|".join(items)).encode('utf-8'))
elif algo == "eclat-rules":
    rules = fim.eclat(transactions, target='r', report='aC')
    rules = sorted(rules, key=lambda x: x[3])  # sort by confidence
    for consequence, condition, support_count, confidence_percent in rules:
        condition = map(ele_to_str, condition)
        consequence = ele_to_str(consequence)
        print(u"{:6.2f}% of {} eles: If {} then {}".format(confidence_percent, support_count,
                                                           " & ".join(condition), consequence))
elif algo == "arules":
    rules = fim.arules(transactions, supp=2, conf=75, report='aCl', eval='l', thresh=30)
    #random.shuffle(rules)
    #rules = sorted(rules, key=lambda x: x[3])  # sort by confidence %
import pandas as pd
import numpy as np
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import fpmax
from mlxtend.frequent_patterns import association_rules
from fim import eclat
from tabulate import tabulate

df = pd.read_csv("plants_preprocessed.csv")

print("================ FP-Max ==================")
dataset = df.values.tolist()
te = TransactionEncoder()
te_ary = te.fit(dataset).transform(dataset)
df = pd.DataFrame(te_ary, columns=te.columns_)
itemsets = fpmax(df, min_support=0.001, use_colnames=True, max_len=10)
print(itemsets)

print("\n\n RULES based on FP-Max: \n\n")
rules = association_rules(itemsets, min_threshold=0.0001, support_only=True)
print(rules[['antecedents', 'consequents', 'support']])

print("================ ECLAT-Max ================")
# target='m': maximal frequent itemsets; supp=2 is a 2% support threshold
itemsets = eclat(dataset, target='m', supp=2, report='s')
print(tabulate(itemsets, headers=['Itemset', 'Support'], tablefmt='pretty'))
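# Hedged sketch: to put the two outputs side by side, fim's (itemset, support)
# pairs can be loaded into a DataFrame shaped like mlxtend's result. report='s'
# should make eclat report relative support as a fraction; that reading of the
# report string is my assumption.
eclat_df = pd.DataFrame(
    [(support, frozenset(itemset)) for itemset, support in itemsets],
    columns=['support', 'itemsets'],
)
print(eclat_df.sort_values('support', ascending=False).head())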
# helper assumed from the surrounding script: sort key returning an itemset's support
def takeSecond(pair):
    return pair[1]

dataset = list()
for line in lines:
    strpline = line.rstrip()
    arr = strpline.split(',')
    newline = []
    for i in range(len(arr)):
        if arr[i] == 'y':
            newline.append(i)
    # encode the party label as an extra item: 100 = republican, 200 = otherwise
    if arr[0] == 'republican':
        newline.append(100)
    else:
        newline.append(200)
    dataset.append(newline)

print("\npart a:", file=outf)
frequentset = eclat(dataset, supp=20, zmin=1)
print("with 20%% support, there are %d frequent itemsets." % len(frequentset), file=outf)

print("\npart b:", file=outf)
print("Top ten itemsets:", file=outf)
frequentset.sort(key=takeSecond, reverse=True)
i = 0
for r in frequentset:
    if i < 10:
        print(r, file=outf)
        i += 1
    else:
        break

print("\npart c:", file=outf)
import pandas as pd
from fim import eclat

dataset = pd.read_csv('Market_Basket_Optimisation.csv', header=None)
n = len(dataset)
transactions = []
for i in range(0, n):
    transaction = []
    m = len(dataset.values[i])
    for j in range(0, m):
        data = str(dataset.values[i, j])
        if data != "nan":  # skip the padding cells of shorter baskets
            transaction.append(data)
    transactions.append(transaction)

rules = eclat(tracts=transactions, zmin=1)
rules.sort(key=lambda x: x[1], reverse=True)  # sort itemsets by support, descending
# transactions supplied as a dict: each tuple maps to its weight
tracts = {
    (1, 2, 3):    0.5,
    (1, 4, 5):    1.2,
    (2, 3, 4):    0.8,
    (1, 2, 3, 4): 0.3,   # overridden below: duplicate dict keys keep only the last value
    (2, 3):       1.5,
    (1, 2, 4):    0.9,
    (4, 5):       0.6,
    (1, 2, 3, 4): 1.0,
    (3, 4, 5):    0.7
}
print('transactions:')
for t in tracts:
    print(t, tracts[t])
if tid < 1:
    print('apriori(tracts, supp=-1.6, zmin=2):')
    for r in apriori(tracts, supp=-1.6, zmin=2):
        print(r)
elif tid < 2:
    print('eclat(tracts, supp=-1.6, zmin=2):')
    for r in eclat(tracts, supp=-1.6, zmin=2):
        print(r)
elif tid < 3:
    print('fpgrowth(tracts, supp=-1.6, zmin=2):')
    for r in fpgrowth(tracts, supp=-1.6, zmin=2):
        print(r)
else:
    print('fim(tracts, supp=-1.6, zmin=2, report=\'#\'):')
    for r in fim(tracts, supp=-1.6, zmin=2, report='#'):
        print(r)
data = pd.read_excel(r'D:\Accidents.xlsx')
df = pd.DataFrame(
    data,
    columns=[
        "Date d'accident DAT_ACC",
        "Heure d'accident HEU_ACC",
        "Decoupage Geographique COD_PRV",
        "Etat chaussée COD_ETA_CHA",
        "Etat surface COD_ETA_SUR",
        "Lumière COD_LUM",
        "Code Agglomeration COD_AGG",
        "Localisation LOC",
        "Type carrefours COD_TYP_CAR_AGG",
        "Profils en long COD_PRO_LON",
        "Type collision COD_TYP_COL",
        "Obstacle heurtés COD_OBS_HRT",
        "Point de choc initial COD_POI_CHO",
        "manœuvre COD_MAN"
    ])
# convert the dataframe into a list of transactions
freq_itemsets = df.values.tolist()
# run the eclat algorithm
res = eclat(freq_itemsets, supp=10, zmin=10)
# turn the result list back into a dataframe
to_df = pd.DataFrame(res, columns=["Frequent Items", "Support"])
# print the support as a percentage
for index, row in to_df.iterrows():
    print(row['Frequent Items'], "\t\t", '%.2f' % (row['Support'] / len(df) * 100), "%")
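# Hedged alternative: asking eclat for relative support directly would avoid the
# manual percentage computation above. report='s' should yield support as a
# fraction of the transactions; that reading of the report string is my assumption.
res = eclat(freq_itemsets, supp=10, zmin=10, report='s')
for itemset, rel_supp in res:
    print(itemset, "\t\t", '%.2f' % (rel_supp * 100), "%")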
def mbasket(self, data_p, support_par, confidence_par, method='apriori', lift_par=1.2):
    """Run a market-basket analysis.

    :param data_p: pair of (binary user/product pivot table, transaction list)
    :param support_par: minimum support as a fraction
    :param confidence_par: minimum confidence
    :param method: itemset miner to use ('apriori', 'fpgrowth', 'eclat', 'relim', 'sam' or 'ista')
    :param lift_par: minimum lift for the rules
    :return: [rule_cons, recom_new, mba_time, sum_recom_already_satisfied, run_time]
    """
    start0 = time()

    ## Frequent-itemset mining + association rule creation
    # find association rules with default settings
    rules = pd.DataFrame()
    if method == 'fpgrowth':
        start = time()
        frequent_itemsets = pd.DataFrame(
            fpgrowth(data_p[1], supp=support_par * 100, zmin=1, target='s', report='s', mode='o'),
            columns=['itemsets', 'support'])
        run_time = round(time() - start, 3)
        print("fpgrowth() -", run_time, "s")
    elif method == 'eclat':
        start = time()
        frequent_itemsets = pd.DataFrame(
            eclat(data_p[1], supp=support_par * 100, zmin=1, target='s', report='s', mode='o'),
            columns=['itemsets', 'support'])
        run_time = round(time() - start, 3)
        print("eclat() -", run_time, "s")
    elif method == 'relim':
        start = time()
        frequent_itemsets = pd.DataFrame(
            relim(data_p[1], supp=support_par * 100, zmin=1, target='s', report='s'),
            columns=['itemsets', 'support'])
        run_time = round(time() - start, 3)
        print("relim() -", run_time, "s")
    elif method == 'sam':
        start = time()
        frequent_itemsets = pd.DataFrame(
            sam(data_p[1], supp=support_par * 100, zmin=1, target='s', report='s'),
            columns=['itemsets', 'support'])
        run_time = round(time() - start, 3)
        print("sam() -", run_time, "s")
    elif method == 'ista':
        start = time()
        frequent_itemsets = pd.DataFrame(
            ista(data_p[1], supp=support_par * 100, zmin=1, report='s'),
            columns=['itemsets', 'support'])
        run_time = round(time() - start, 3)
        print("ista() -", run_time, "s")
    else:
        start = time()
        frequent_itemsets = pd.DataFrame(
            apriori(data_p[1], supp=support_par * 100, zmin=1, target='s', report='s', mode='o'),
            columns=['itemsets', 'support'])
        run_time = round(time() - start, 3)
        # note: rules are only derived in this (apriori) branch; the other miners leave `rules` empty
        rules = self.find_rules(frequent_itemsets, lift_par, confidence_par)
        print("apriori() -", run_time, "s")

    # users with antecedents from the rules calculated above
    if rules.shape[0] > 0:
        pivot_binary_tr = data_p[0].transpose()
        recom = {}
        pb = {}
        rules['antecedents'] = rules['antecedents'].apply(lambda x: frozenset(x))
        for user in pivot_binary_tr.columns:
            products_bought = pivot_binary_tr.index[pivot_binary_tr[user] == 1]
            pb[user] = products_bought
            suitable_rules = []
            for ante in rules['antecedents'].items():  # (index, antecedent) pairs
                if ante[1].issubset(products_bought):  # to improve
                    suitable_rules.append(ante[0])
            recom[user] = suitable_rules
        recom = pd.DataFrame.from_dict(recom, orient='index').stack().reset_index(
            level=1, drop=True).reset_index()
        recom.columns = ['review_profilename', 'Rule']
        # products bought - needed to exclude these products from the recommendations
        pb2 = pd.DataFrame.from_dict(pb, orient='index').stack().reset_index(
            level=1, drop=True).reset_index()
        pb2.columns = ['review_profilename', 'antecedents1']
        rule_cons = rules[['antecedents', 'consequents']].reset_index()
        rule_cons['consequents'] = [i for i, *_ in rule_cons['consequents']]  # change format from frozensets to plain items
        rule_cons['antecedents'] = [list(i) for i in rule_cons['antecedents']]
        rule_cons.columns = ['Rule', 'antecedents', 'consequents']
        recom = recom.merge(rule_cons, on='Rule')
        recom.drop_duplicates(['review_profilename', 'consequents'], keep='first', inplace=True)
        # exclude products already bought from the recommendations
        recom_already_satisfied = pb2.merge(
            recom,
            left_on=['review_profilename', 'antecedents1'],
            right_on=['review_profilename', 'consequents'])
        recom_already_satisfied['beer_already_known'] = 1
        sum_recom_already_satisfied = recom_already_satisfied['beer_already_known'].sum()
        recom_new = recom.merge(
            recom_already_satisfied[['review_profilename', 'Rule', 'consequents', 'beer_already_known']],
            on=['review_profilename', 'Rule', 'consequents'],
            how='left')
        recom_new = recom_new[recom_new['beer_already_known'] != 1][
            ['review_profilename', 'Rule', 'antecedents', 'consequents']]
    else:
        rule_cons = 0
        recom_new = 0
        sum_recom_already_satisfied = 0
    mba_time = round(time() - start0, 2)
    print("mbasket() -", mba_time, "s")
    return [rule_cons, recom_new, mba_time, sum_recom_already_satisfied, run_time]
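# Hedged side note: pyfim can also mine the rules directly with arules() (used
# elsewhere in this collection), instead of mining frequent itemsets first and
# post-processing them with find_rules(). A minimal sketch under the same
# thresholds as mbasket(); the parameter mapping is my assumption.
from fim import arules

def mine_rules_directly(transactions, support_par, confidence_par):
    # supp/conf are percentages in pyfim; report 'aCl' = absolute support,
    # confidence in percent, lift
    return arules(transactions, supp=support_par * 100, conf=confidence_par * 100, report='aCl')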
import fim

lyf_tag_list = ICCv1['lyf_tag_list']
items = [str(elem['_id']) for elem in lyf_tag_list.find()]
sample = [elem['tags'] for elem in lyf_tag_log.find()]
tag_dict = dict()
for elem in lyf_tag_list.find():
    tag_dict[str(elem['_id'])] = elem['tag']
all_tag = []
for elem in sample:
    all_tag.extend(elem)
diff = set(all_tag) - set(items)
for elem in diff:
    tag_dict[elem] = ''
res = fim.eclat(sample, supp=50)

# display the Chinese tag names according to tag_dict
def element(lis):
    for elem in lis:
        if isinstance(elem, unicode):
            print tag_dict[elem]
        elif isinstance(elem, int):
            print elem
        else:
            element(elem)
# and the dataset from https://www.kaggle.com/puneetbhaya/online-retail

# Importing the libraries
import numpy as np
import pandas as pd
from fim import eclat

help(eclat)

######## Data Preprocessing
dataset = pd.read_excel('Online Retail.xlsx')

# Replace null descriptions with the stock code
for i, d in dataset[dataset['Description'].isnull()].iterrows():
    dataset.loc[i, 'Description'] = "Code-" + str(d['StockCode'])  # .loc avoids chained-assignment pitfalls

# group the rows into baskets, one per invoice
grouped = dataset.groupby('InvoiceNo')

# rearrange into a list of transactions
transactions = []
for name, group in grouped:
    transactions.append(list(group['Description'].map(str)))

##### Training
report = eclat(transactions, target='s', supp=1, zmin=2)

## Compare with apriori from the same module. It differs from apyori; this may need further investigation.
from fim import apriori
help(apriori)
areport = apriori(transactions, report='l,c', target='r', supp=1, zmin=2)