def test_lift():
    res_df = association_rules(df_freq_items, min_threshold=1.1, metric='lift')
    assert res_df.values.shape[0] == 6

    res_df = association_rules(df_freq_items_with_colnames,
                               min_threshold=1.1, metric='lift')
    assert res_df.values.shape[0] == 6
def test_confidence():
    res_df = association_rules(df_freq_items, min_threshold=0.8,
                               metric='confidence')
    assert res_df.values.shape[0] == 9

    res_df = association_rules(df_freq_items_with_colnames,
                               min_threshold=0.8, metric='confidence')
    assert res_df.values.shape[0] == 9
def test_conviction():
    res_df = association_rules(df_freq_items, min_threshold=1.5,
                               metric='conviction')
    assert res_df.values.shape[0] == 11

    res_df = association_rules(df_freq_items_with_colnames,
                               min_threshold=1.5, metric='conviction')
    assert res_df.values.shape[0] == 11
def test_default():
    res_df = association_rules(df_freq_items)
    res_df['antecedents'] = res_df['antecedents'].apply(
        lambda x: str(frozenset(x)))
    res_df['consequents'] = res_df['consequents'].apply(
        lambda x: str(frozenset(x)))
    res_df.sort_values(columns_ordered, inplace=True)
    res_df.reset_index(inplace=True, drop=True)

    expect = pd.DataFrame([
        [(8,), (5,), 0.6, 1.0, 0.6, 1.0, 1.0, 0.0, np.inf],
        [(6,), (5,), 0.6, 1.0, 0.6, 1.0, 1.0, 0.0, np.inf],
        [(8, 3), (5,), 0.6, 1.0, 0.6, 1.0, 1.0, 0.0, np.inf],
        [(8, 5), (3,), 0.6, 0.8, 0.6, 1.0, 1.25, 0.12, np.inf],
        [(8,), (3, 5), 0.6, 0.8, 0.6, 1.0, 1.25, 0.12, np.inf],
        [(3,), (5,), 0.8, 1.0, 0.8, 1.0, 1.0, 0.0, np.inf],
        [(5,), (3,), 1.0, 0.8, 0.8, 0.8, 1.0, 0.0, 1.0],
        [(10,), (5,), 0.6, 1.0, 0.6, 1.0, 1.0, 0.0, np.inf],
        [(8,), (3,), 0.6, 0.8, 0.6, 1.0, 1.25, 0.12, np.inf]],
        columns=columns_ordered
    )

    expect['antecedents'] = expect['antecedents'].apply(
        lambda x: str(frozenset(x)))
    expect['consequents'] = expect['consequents'].apply(
        lambda x: str(frozenset(x)))
    expect.sort_values(columns_ordered, inplace=True)
    expect.reset_index(inplace=True, drop=True)

    assert res_df.equals(expect), res_df
def test_override_metric_with_support():
    res_df = association_rules(df_freq_items_with_colnames, min_threshold=0.8)
    # default metric is confidence
    assert res_df.values.shape[0] == 9

    res_df = association_rules(df_freq_items_with_colnames,
                               min_threshold=0.8, metric='support')
    assert res_df.values.shape[0] == 2

    # support_only=True overrides the metric and filters on rule support
    res_df = association_rules(df_freq_items_with_colnames,
                               min_threshold=0.8, support_only=True)
    assert res_df.values.shape[0] == 2
def test_datatypes():
    res_df = association_rules(df_freq_items)
    for i in res_df['antecedents']:
        assert isinstance(i, frozenset) is True

    for i in res_df['consequents']:
        assert isinstance(i, frozenset) is True

    # cast itemset-containing dataframe to set and
    # check if association_rule converts it internally
    # back to frozensets
    df_freq_items_copy = df_freq_items.copy()
    df_freq_items_copy['itemsets'] = df_freq_items_copy['itemsets']\
        .apply(lambda x: set(x))

    # note: the copy must actually be passed in, otherwise the
    # set-to-frozenset conversion is never exercised
    res_df = association_rules(df_freq_items_copy)
    for i in res_df['antecedents']:
        assert isinstance(i, frozenset) is True

    for i in res_df['consequents']:
        assert isinstance(i, frozenset) is True
def test_empty_result():
    expect = pd.DataFrame(
        columns=['antecedents', 'consequents', 'antecedent support',
                 'consequent support', 'support',
                 'confidence', 'lift', 'leverage', 'conviction']
    )
    res_df = association_rules(df_freq_items, min_threshold=2)

    assert res_df.equals(expect)
def test_frozenset_selection():
    res_df = association_rules(df_freq_items)

    sel = res_df[res_df['consequents'] == frozenset((3, 5))]
    assert sel.values.shape[0] == 1

    sel = res_df[res_df['consequents'] == frozenset((5, 3))]
    assert sel.values.shape[0] == 1

    sel = res_df[res_df['consequents'] == {3, 5}]
    assert sel.values.shape[0] == 1

    sel = res_df[res_df['antecedents'] == frozenset((8, 3))]
    assert sel.values.shape[0] == 1
def defectsContainsDataSet(self, param, head):
    basket_sets = (self.df[self.df[head].str.contains(param)]
                   .groupby(["DEFEC.", "INSPECCION"])["DEFEC."]
                   .count().unstack(level=0).fillna(0))
    if not basket_sets.empty:
        basket_sets = basket_sets.applymap(self.encode_units)
        # Apriori algorithm
        frequent_itemsets = apriori(basket_sets, min_support=0.05,
                                    use_colnames=True)
        rules = association_rules(frequent_itemsets, metric="lift",
                                  min_threshold=1)
        # Add the elements affected by the rule
        return self.addNumberRules(rules, self.lenConData)
    else:
        return pd.DataFrame()
def test_on_df_with_missing_entries_support_only():
    # this is a data frame where information about
    # antecedents and consequents has been cropped
    # see https://github.com/rasbt/mlxtend/issues/390
    # for more details
    dict = {'itemsets': [['177', '176'], ['177', '179'],
                         ['176', '178'], ['176', '179'],
                         ['93', '100'], ['177', '178'],
                         ['177', '176', '178']],
            'support': [0.253623, 0.253623, 0.217391,
                        0.217391, 0.181159, 0.108696,
                        0.108696]}

    df = pd.DataFrame(dict)
    df_result = association_rules(df, support_only=True, min_threshold=0.1)
    assert df_result['support'].shape == (18,)
    assert int(np.isnan(df_result['support'].values).any()) != 1
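# Aside (a hedged sketch, based on mlxtend's documented support_only
# behavior): support_only=True computes only the rule support and fills the
# remaining metric columns with NaN, which is what makes rule mining possible
# on a cropped frame like the one in the test above. The toy frame here is
# hypothetical.
import numpy as np
import pandas as pd
from mlxtend.frequent_patterns import association_rules

cropped = pd.DataFrame({'itemsets': [['a', 'b'], ['a'], ['b']],
                        'support': [0.2, 0.5, 0.4]})
res = association_rules(cropped, support_only=True, min_threshold=0.1)
assert res['support'].notna().all()    # rule support is always computed
assert res['confidence'].isna().all()  # the other metrics stay NaN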
def rule_generation(frequent_itemsets):
    rules = association_rules(frequent_itemsets, metric="confidence",
                              min_threshold=0.8)
    rules.to_csv("tmp.csv")
    return rules
# ['Dill', 'Onion', 'Nutmeg', 'Kidney Beans', 'Eggs', 'Yogurt'],
# ['Milk', 'Apple', 'Kidney Beans', 'Eggs'],
# ['Milk', 'Unicorn', 'Corn', 'Kidney Beans', 'Yogurt'],
# ['Corn', 'Onion', 'Onion', 'Kidney Beans', 'Ice cream', 'Eggs']]

from datetime import timedelta, date

from es02 import es02

dataset = []

if __name__ == "__main__":
    es = es02()
    es.set_service("display")
    es.load_datas2(date(2017, 12, 1), date(2018, 1, 8))
    dataset2 = es.dset
    dataset = [item for item in dataset2 if len(item) > 1]
    for item in dataset:
        print(item)

    import pandas as pd
    from mlxtend.preprocessing import OnehotTransactions

    oht = OnehotTransactions()
    oht_ary = oht.fit(dataset).transform(dataset)
    df = pd.DataFrame(oht_ary, columns=oht.columns_)

    from mlxtend.frequent_patterns import apriori

    frequent_itemsets = apriori(df, min_support=0.3, use_colnames=True)
    print(frequent_itemsets)

    from mlxtend.frequent_patterns import association_rules

    arule = association_rules(frequent_itemsets, metric="confidence",
                              min_threshold=0.7)
    print(arule)
# Working on table Products with Top Support
TopSupport = freq_items.copy()
TopSupport.columns = ['Percentage of Orders', 'Product']
TopSupport = TopSupport[['Product', 'Percentage of Orders']]
TopSupport['Percentage of Orders'] = TopSupport['Percentage of Orders'].apply(lambda x: round(x * 100, 2))
TopSupport.insert(0, 'Serial No.', range(1, 1 + len(TopSupport)))
TopSupport['Product'] = TopSupport['Product'].apply(lambda x: list(x))
# TopSupport['Dish Name'] = TopSupport['Dish Name'].apply(lambda x: ', '.join(x))
TopSupport['Product'] = TopSupport['Product'].apply(lambda x: 0 if len(x) >= 2 else x[0])
TopSupport = TopSupport[TopSupport['Product'] != 0]
selectfew_TopSupport = TopSupport.head(10)
top5perc = round(selectfew_TopSupport['Percentage of Orders'].head().sum(), 1)

# Working on Product Associations based on Top Support
rules_TopSupport = association_rules(freq_items, metric="support", min_threshold=0.015)
rules_TopSupport = rules_TopSupport.sort_values(by=['support'], ascending=False)  # MADE CHANGE HERE
rules_TopSupport = rules_TopSupport[rules_TopSupport.lift > 1]
rules_TopSupport = rules_TopSupport[['antecedents', 'consequents', 'support']]
rules_TopSupport.columns = ['Product', 'Associated Product', 'Percentage of Orders']
rules_TopSupport['Percentage of Orders'] = rules_TopSupport['Percentage of Orders'].apply(lambda x: round(x * 100, 2))
rules_TopSupport.insert(0, 'Serial No', range(1, 1 + len(rules_TopSupport)))
rules_TopSupport['Product'] = rules_TopSupport['Product'].apply(lambda x: list(x))
rules_TopSupport['Product'] = rules_TopSupport['Product'].apply(lambda x: x[0])
rules_TopSupport['Associated Product'] = rules_TopSupport['Associated Product'].apply(lambda x: list(x))
rules_TopSupport['Associated Product'] = rules_TopSupport['Associated Product'].apply(lambda x: ', '.join(x))


def remove_common_rows(data):
    ind = []
# In[10]:

# Creating the dataframe of frequent itemsets
te = TransactionEncoder()
te_ary = te.fit(player_combo).transform(player_combo)
match_df_freq = pd.DataFrame(te_ary, columns=te.columns_)

# In[11]:

match_sup = apriori(match_df_freq, min_support=0.1, use_colnames=True)
print(match_sup)

# In[12]:

rules = association_rules(match_sup, metric="lift", min_threshold=1)

# In[13]:

rules

# In[14]:

won_rules = rules[(rules['consequents'] == {"won"})]

# In[15]:

won_rules

# In[16]:
# In[14]:

frequent_itemsets = apriori(movies_new, min_support=0.002, max_len=3, use_colnames=True)
frequent_itemsets

# In[15]:

frequent_itemsets.sort_values('support', ascending=False, inplace=True)
frequent_itemsets.sort_values

# In[16]:

rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1)
rules.head(20)
rules.sort_values('lift', ascending=False).head(10)

# In[17]:

frequent_itemsets_1 = apriori(movies_new, min_support=0.004, max_len=4, use_colnames=True)
frequent_itemsets_1

# In[18]:

frequent_itemsets_1.sort_values('support', ascending=False, inplace=True)
frequent_itemsets_1.sort_values
df = pd.DataFrame(te_ary, columns=te.columns_)
df  # this matrix of transactions: T/F indicates each item's presence in each Trans ID
df.shape

# get back original transactions
orgtrans1 = te_ary[:]
te.inverse_transform(orgtrans1)

#%%%
# frequent itemsets - Most Imp Step
support_threshold = 0.01
frequent_itemsets = apriori(df, min_support=support_threshold, use_colnames=True)
frequent_itemsets
print(frequent_itemsets)  # dataframe with the itemsets

#%%%% - Support
support3 = association_rules(frequent_itemsets, metric="support", min_threshold=.3)
print(support3)
print(support3[['antecedents', 'consequents', 'support', 'confidence']])
# ---
support2 = association_rules(frequent_itemsets, metric="support", min_threshold=.2)
print(support2[['antecedents', 'consequents', 'support', 'confidence']])

#%%%% Lift
lift1 = association_rules(frequent_itemsets, metric="lift", min_threshold=1)
print(lift1)
print(lift1[['antecedents', 'consequents', 'support', 'lift', 'confidence']])
# --
lift2 = association_rules(frequent_itemsets, metric="lift", min_threshold=2)
print(lift2)
print(lift2[['antecedents', 'consequents', 'support', 'lift', 'confidence']])
lst

# In[5]:

bucket = (lst.groupby(['ID', 'Item'])['Quantity']
          .sum().unstack().reset_index().fillna(0)
          .set_index('ID').applymap(lambda x: 1 if x > 0 else 0))
bucket

# Before the data is fed to the apriori algorithm, it must already be shaped like the table above

# ### Frequent Itemsets
# Suppose we want to use a minimum support of 30%

# In[7]:

frequent_itemsets = apriori(bucket, min_support=0.30, use_colnames=True)
frequent_itemsets

# ### Make Rules
# Suppose we want to build rules with a minimum confidence of 70%

# In[8]:

rules = association_rules(frequent_itemsets, metric='confidence', min_threshold=0.7)
rules

# In[ ]:
lst2 = []
for content in contents_list:
    lst = listmaker(content)
    if lst != []:
        lst2.append(lst)

# Association Analysis
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder

te = TransactionEncoder()
te_ary = te.fit(lst2).transform(lst2)
df = pd.DataFrame(te_ary, columns=te.columns_)

from mlxtend.frequent_patterns import apriori

frequent_itemsets = apriori(df, min_support=0.05, use_colnames=True)

from mlxtend.frequent_patterns import association_rules

rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1, support_only=False)
rules['length'] = rules['antecedents'].apply(lambda x: len(x))
rules['length2'] = rules['consequents'].apply(lambda x: len(x))
rules = rules[(rules['length'] == 1) & (rules['length2'] == 1)]

import openpyxl

rules.to_excel("비타민제 소비자별 연관분석 결과/Vitamin Workers.xlsx")
# most popular wine: cantina pinot bianco

# In[42]:

df = df.groupby(['order', 'orderNumber']).size().reset_index(name='count')
basket = (df.groupby(['orderNumber', 'order'])['count']
          .sum().unstack().reset_index().fillna(0).set_index('orderNumber'))


def encode_units(x):
    if x <= 0:
        return 0
    if x >= 1:
        return 1


basket_sets = basket.applymap(encode_units)

frequent_itemsets = apriori(basket_sets, min_support=0.1, use_colnames=True)
rules = association_rules(frequent_itemsets, metric="lift")
rules.sort_values('lift', ascending=False, inplace=True)
rules.head(10)  # sorted by lift
# top item - if a customer orders roast chicken, they are 2.75 times more
# likely to also purchase duckhorn chardonnay

# In[36]:

# In[ ]:
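# Aside: applymap(encode_units) above can be replaced by a vectorized
# comparison (a sketch; it assumes the basket values are non-negative counts,
# which is what the groupby/sum produces):
basket_sets = (basket > 0).astype(int)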
tr_enc = TransactionEncoder()
basket = pd.DataFrame(tr_enc.fit_transform(transactions), columns=tr_enc.columns_)

# In[14]:

basket

# In[15]:

# functions needed for basket analysis
from mlxtend.frequent_patterns import apriori, association_rules

# In[16]:

# how often the products appear in baskets, individually and together
frequent = apriori(basket, min_support=0.00002, low_memory=True, use_colnames=True)
frequent

# In[17]:

# the combinations where buying the products on the left makes buying the
# products on the right most likely (ranked by confidence)
association_rules(frequent, metric="confidence", min_threshold=0.05).sort_values(by="confidence", ascending=False)
dataset_series.columns = ['transactions']

# Creating a dummy column for each item in each transaction, using column names as index
X = dataset_series['transactions'].str.join(sep='*').str.get_dummies(sep='*')

'''Support = 0.001 and max_len = 3'''
frequent_itemsets = apriori(X, min_support=0.001, max_len=3, use_colnames=True)  # 9968 itemsets

# Most frequent item sets based on support
frequent_itemsets.sort_values('support', ascending=False, inplace=True)

# Barplot of frequent item sets
plt.bar(x=list(range(1, 11)), height=frequent_itemsets.support[1:11], color='rgmyk')
plt.xticks(list(range(1, 11)), frequent_itemsets.itemsets[1:11], rotation=45)
plt.xlabel('item_sets')
plt.ylabel('support')

# Rules
rules = association_rules(frequent_itemsets, metric='lift', min_threshold=1)  # 45712 rules are generated
rules.head(10)
rules.sort_values('confidence', ascending=False, inplace=True)
rules.head(10)

'''Support = 0.001 and max_len = 2'''
frequent_itemsets = apriori(X, min_support=0.001, max_len=2, use_colnames=True)  # 3138 itemsets

# Most frequent item sets based on support
frequent_itemsets.sort_values('support', ascending=False, inplace=True)

# Barplot of frequent item sets
plt.bar(x=list(range(1, 11)), height=frequent_itemsets.support[1:11], color='rgmyk')
plt.xticks(list(range(1, 11)), frequent_itemsets.itemsets[1:11], rotation=45)
plt.xlabel('item_sets')
plt.ylabel('support')

# Rules
rules = association_rules(frequent_itemsets, metric='lift', min_threshold=1)  # 5426 rules are generated
rules.head(10)
# df_inv_pro.sum()
df_inv_pro.loc["tot_sepet"] = df_inv_pro.apply(lambda x: x.sum(), axis=0)
df_inv_pro.tail()

############################################
# Extracting the Association Rules
############################################
df_inv_pro.drop("tot_product", axis=1, inplace=True)
df_inv_pro.drop("tot_sepet", axis=0, inplace=True)

frequent_itemsets = apriori(df_inv_pro, min_support=0.01, use_colnames=True)
frequent_itemsets.sort_values("support", ascending=False)

rules = association_rules(frequent_itemsets, metric="support", min_threshold=0.01)
rules.head()
rules.sort_values("lift", ascending=False).head()

# turning the whole workflow into a function
import pandas as pd
pd.set_option('display.max_columns', None)
from mlxtend.frequent_patterns import apriori, association_rules
from helpers.helpers import crm_data_prep, create_invoice_product_df

df_ = pd.read_excel(
    r"C:\Users\Erkan\Desktop\DSMLBC-4\4.Hafta_26-29_Ocak Haftası\Ödevler ve Çalışmalar\online_retail_II.xlsx",
    sheet_name="Year 2010-2011")
df = df_.copy()
df = crm_data_prep(df)
def get_apriori(self):
    frequent_itemsets = apriori(self.encode_categorical(), min_support=0.07,
                                use_colnames=True)
    rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1)
    return rules
frequent_itemsets1 = apriori(sparse_df, min_support=0.1, use_colnames=True)
frequent_itemsets1

# In[6]:

frequent_itemsets05 = apriori(sparse_df, min_support=0.05, use_colnames=True)
frequent_itemsets05

# In[8]:

rules05 = association_rules(frequent_itemsets05, metric="confidence", min_threshold=0.5)
rules05

# In[9]:

frequent_itemsets01 = apriori(sparse_df, min_support=0.01, use_colnames=True)
frequent_itemsets01

# In[10]:

rules01 = association_rules(frequent_itemsets01, metric="confidence", min_threshold=0.7)
rules01
import numpy as np

dataframe = pd.read_csv('ARMatrixControl.csv')
del dataframe['Classes']
del dataframe['Unnamed: 0']

min_support = 0.095
print("support: ", min_support)
frequent_itemsets = apriori(dataframe, min_support=min_support, use_colnames=True)
fileName = "control_frequent_itemsets_" + str(min_support) + '.csv'
frequent_itemsets.to_csv(fileName)

for confidence_increment in range(50, 100, 10):
    min_confidence = confidence_increment / float(100)
    print("confidence: ", min_confidence)
    rules = association_rules(frequent_itemsets, metric="confidence",
                              min_threshold=min_confidence)
    fileName = "control_association_rules_C" + str(min_confidence) + "_S" + str(min_support)
    rules.to_csv(fileName + '.csv')

print("DONE")
np_data_a = all_data.to_numpy()
np_data_a = [[elem for elem in row[1:] if isinstance(elem, str) and elem in items]
             for row in np_data_a]
np_data_a = [row for row in np_data_a if len(row) > 1]

te_a = TransactionEncoder()
te_ary_a = te_a.fit_transform(np_data_a)
data_a = pd.DataFrame(te_ary_a, columns=te_a.columns_)
data_a

# %%
result = fpgrowth(data_a, min_support=0.05, use_colnames=True)
result

# %%
rules_conf = association_rules(result, min_threshold=0.3)
rules_conf

# %%
rules_sup = association_rules(result, min_threshold=0.01, metric='support')
rules_sup

# %%
rules_lift = association_rules(result, min_threshold=0.01, metric='lift')
rules_lift

# %%
rules_leverage = association_rules(result, min_threshold=0.01, metric='leverage')
rules_leverage
# 1.2 Importing required modules.
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
from mlxtend.preprocessing import TransactionEncoder
import pandas as pd
import numpy as np

# 1.3 Input data
dataset = [['Milk', 'Onion', 'Nutmeg', 'Kidney Beans', 'Eggs', 'Yogurt'],
           ['Dill', 'Onion', 'Nutmeg', 'Kidney Beans', 'Eggs', 'Yogurt'],
           ['Milk', 'Apple', 'Kidney Beans', 'Eggs'],
           ['Milk', 'Unicorn', 'Corn', 'Kidney Beans', 'Yogurt'],
           ['Corn', 'Onion', 'Onion', 'Kidney Beans', 'Ice cream', 'Eggs']]

# 1.4 Creating the dataframe of frequent itemsets.
te = TransactionEncoder()
te_ary = te.fit(dataset).transform(dataset)
df = pd.DataFrame(te_ary, columns=te.columns_)

# 1.5 Applying the Apriori algorithm and finding association rules.
sup = apriori(df, min_support=0.002, use_colnames=True)
rules = association_rules(sup, metric="support", min_threshold=0.5)
print(rules)
def my_func(df, algo_type):
    # Import the dataset
    response = {'error': False}
    try:
        items = df['0'].unique()
        encoded_vals = []
        for index, row in df.iterrows():
            labels = {}
            uncommons = list(set(items) - set(row))
            commons = list(set(items).intersection(row))
            for uc in uncommons:
                labels[uc] = 0
            for com in commons:
                labels[com] = 1
            encoded_vals.append(labels)

        # The FP-Growth module requires a dataframe that has either 0 and 1
        # or True and False as data, so we need to one-hot encode the data.
        ohe_df = pd.DataFrame(encoded_vals)

        if algo_type == 'fp-growth':
            # Applying FP-Growth
            freq_items = fpgrowth(ohe_df, min_support=0.2, use_colnames=True, verbose=1)
        elif algo_type == 'apriori':
            freq_items = apriori(ohe_df, min_support=0.2, use_colnames=True, verbose=1)

        # Mining association rules
        rules = association_rules(freq_items, metric="confidence", min_threshold=0.6)

        # Visualizing results

        # Support vs Confidence
        plt.scatter(rules['support'], rules['confidence'], alpha=0.5)
        plt.xlabel('support')
        plt.ylabel('confidence')
        plt.title('Support vs Confidence')
        support_confidence = generate_graph_img(plt)

        # Support vs Lift
        plt.scatter(rules['support'], rules['lift'], alpha=0.5)
        plt.xlabel('support')
        plt.ylabel('lift')
        plt.title('Support vs Lift')
        support_lift = generate_graph_img(plt)

        # Lift vs Confidence
        fit = np.polyfit(rules['lift'], rules['confidence'], 1)
        fit_fn = np.poly1d(fit)
        plt.plot(rules['lift'], rules['confidence'], 'yo',
                 rules['lift'], fit_fn(rules['lift']))
        lift_confidence = generate_graph_img(plt)

        response = {
            'support_confidence': f'data:image/png;base64,{support_confidence}',
            'support_lift': f'data:image/png;base64,{support_lift}',
            'lift_confidence': f'data:image/png;base64,{lift_confidence}',
            'error': False
        }
    except Exception as e:
        response = {
            'error': str(e)
        }

    return response
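# Aside: the manual one-hot encoding loop in my_func above can typically be
# replaced with mlxtend's TransactionEncoder, which builds the same kind of
# 0/1 frame from transaction lists. A minimal, self-contained sketch (the
# `transactions` list here is hypothetical):
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder

transactions = [['milk', 'bread'], ['bread', 'eggs'], ['milk', 'eggs', 'bread']]
te = TransactionEncoder()
ohe_df = pd.DataFrame(te.fit_transform(transactions), columns=te.columns_)
print(ohe_df)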
basket_Roti = basket_encoded

basket_encoded = basket_Selai.applymap(hot_encode)
basket_Selai = basket_encoded

basket_encoded = basket_Mentega.applymap(hot_encode)
basket_Mentega = basket_encoded

basket_encoded = basket_Susu.applymap(hot_encode)
basket_Susu = basket_encoded

basket_encoded = basket_Cokelat.applymap(hot_encode)
basket_Cokelat = basket_encoded

frq_items = apriori(basket_Roti, min_support=0.3, use_colnames=True)
rules = association_rules(frq_items, metric="lift", min_threshold=1)
rules = rules.sort_values(['confidence', 'lift'], ascending=[False, False])
print(rules.head())

frq_items = apriori(basket_Selai, min_support=0.3, use_colnames=True)
rules = association_rules(frq_items, metric="lift", min_threshold=1)
rules = rules.sort_values(['confidence', 'lift'], ascending=[False, False])
print(rules.head())

frq_items = apriori(basket_Mentega, min_support=0.3, use_colnames=True)
rules = association_rules(frq_items, metric="lift", min_threshold=1)
rules = rules.sort_values(['confidence', 'lift'], ascending=[False, False])
print(rules.head())

frq_items = apriori(basket_Selai, min_support=0.3, use_colnames=True)
rules = association_rules(frq_items, metric="lift", min_threshold=1)
edges.to_json('./output/edges.json', orient='records')

print('frequent itemsets')
frequent_itemsets = apriori(basket_sets, min_support=min_support,
                            use_colnames=True, max_len=max_len)
frequent_itemsets.sort_values('support', inplace=True, ascending=False)

print('rules')
rules = association_rules(frequent_itemsets, metric=min_threshold_metric,
                          min_threshold=min_threshold)
rules.sort_values('lift', inplace=True, ascending=False)
print('found ' + str(len(rules)) + ' rules')
rules.to_json('./output/rules.json', orient='records')

with open(out_js, "w") as fs:
    fs.truncate(0)
    fs.write("var loadeddata = {\n")
    fs.write('products : ' + nodes.to_json(orient='records') + ',\n')
    # fs.write('var links = ' + edges.to_json(orient='records') + '\n')
    fs.write('rules : ' + rules.to_json(orient='records') + ',\n')
    fs.write('}')

# basket.sort_values('korfu', inplace=True, ascending=False)
df.info
df.describe

frequent_itemsets = apriori(df.iloc[:, 5:15], min_support=0.015, max_len=5, use_colnames=True)

# Most frequent item sets based on support
frequent_itemsets.sort_values('support', ascending=False, inplace=True)

# barplot of top 10 for visualization
import matplotlib.pyplot as plt

plt.bar(x=list(range(0, 11)), height=frequent_itemsets.support[0:11], color='rgmyk')
plt.xticks(list(range(0, 11)), frequent_itemsets.itemsets[0:11], rotation=10)
plt.xlabel('item-sets')
plt.ylabel('support')
plt.show()

rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1)  # writing rules by arules
rules.head(10)
rules.sort_values('lift', ascending=False).head(10)  # sorting values using highest lift ratio


# considering only the required columns and removing duplicates
def to_list(i):
    return sorted(list(i))


# combining antecedent and consequent columns and converting them to lists
ma_X = rules.antecedents.apply(to_list) + rules.consequents.apply(to_list)
ma_X = ma_X.apply(sorted)  # sorting in sequence
rules_sets = list(ma_X)  # converting to a list
unique_rules_sets = [list(m) for m in set(tuple(i) for i in rules_sets)]  # using set to remove duplicates
def mba(survey):
    # selecting customer_id and merchant cols
    results = survey[[
        "customer_id", "restaurant", "fffc", "grocery", "electronic",
        "sports", "clothing", "household", "hba", "beverage"
    ]]

    # creating a new df with a col combining all merchants into one string
    # (done because of chained assignment)
    merchant = pd.DataFrame()
    merchant["merchants"] = results[results.columns[1:]].apply(
        lambda y: ','.join(y.dropna().astype(str)), axis=1)

    # create another df with just the combined merchants str and customer_id
    new_merchant = pd.concat([results["customer_id"], merchant["merchants"]], axis=1)

    # text pre-processing
    # converting all inputs to lowercase
    new_merchant = new_merchant.applymap(lambda s: s.lower() if type(s) == str else s)
    # replace str containing 'and ' with ','
    new_merchant = new_merchant.applymap(lambda x: str(x).replace('and ', ','))
    # strip data of all whitespaces, not done here to match merchants_db input
    # new_merchant = new_merchant.applymap(lambda r: str(r).replace(" ", ""))
    # new_merchant = new_merchant.applymap(lambda w: str(w).strip())

    # change merchants col from str to list
    new_merchant["merchants"] = new_merchant["merchants"].str.split('[:;.,/]')

    # explode the merchants col and reset the index, since customer_id is the
    # identifier and already a column
    exploded_merchant = new_merchant.explode("merchants")
    exploded_merchant.reset_index(drop=True, inplace=True)

    # check for nil, '-', or empty string values (survey input error)
    for j in exploded_merchant.index:
        if exploded_merchant["merchants"][j] in ("nil", "Nil", "", "nan", "-"):
            exploded_merchant.drop([j], inplace=True)

    # remove leading and trailing whitespace
    exploded_merchant = exploded_merchant.applymap(lambda w: str(w).strip())

    # for mba add a quantity col
    exploded_merchant.insert(2, "quantity", 1)
    # exploded_merchant.to_csv('exploded_merchant.csv', index=False)
    # print(exploded_merchant)
    # note: exploded_merchant cols are str and the quantity col is numpy.float

    market_basket = exploded_merchant.groupby(['customer_id', 'merchants'])['quantity']
    market_basket = market_basket.sum().unstack().reset_index().fillna(0).set_index('customer_id')
    market_basket = market_basket.applymap(encode_data)

    # itemsets are the possible combinations generated by apriori at min_support
    itemsets = apriori(market_basket, min_support=0.07, use_colnames=True)
    # rules is a df of possible associations
    rules = association_rules(itemsets, metric="lift", min_threshold=0)

    # note: sorting is done after appending categories, otherwise an error occurs
    # check frozenset antecedents for single items (sort by category)
    for k in rules.index:
        if len(rules["antecedents"][k]) > 1:
            rules.drop([k], inplace=True)

    # convert frozenset from itemset into list or string (for single items)
    rules['antecedents'] = rules['antecedents'].apply(lambda a: ','.join(list(a)))
    rules['consequents'] = rules['consequents'].apply(lambda a: ','.join(list(a)))

    # round values to 3dp for visualization
    rules['antecedent support'] = round(rules['antecedent support'], 3)
    rules['consequent support'] = round(rules['consequent support'], 3)
    rules['support'] = round(rules['support'], 3)
    rules['confidence'] = round(rules['confidence'], 3)
    rules['lift'] = round(rules['lift'], 3)
    rules['leverage'] = round(rules['leverage'], 3)
    rules['conviction'] = round(rules['conviction'], 3)

    # creating new cols for graphs
    # new col = itemset
    rules['itemsets'] = rules['antecedents'] + ' -> ' + rules['consequents']

    # new col = count, assign int instead of float
    rules['count'] = round(rules["support"] * len(new_merchant.index))
    rules["count"] = rules["count"].astype(int)

    # rules.to_csv("mba_007.csv")
    return rules
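# Aside: the index loop in mba that keeps only single-item antecedents can be
# written as a vectorized filter, since the 'antecedents' entries are
# frozensets and support len(). A self-contained sketch of the idea:
import pandas as pd

demo = pd.DataFrame({'antecedents': [frozenset({'a'}), frozenset({'a', 'b'})]})
demo = demo[demo['antecedents'].apply(len) == 1]  # keeps only the single-item row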
import pandas as pd
from mlxtend.frequent_patterns import fpgrowth
from mlxtend.frequent_patterns import association_rules

df = pd.read_csv(r'C:\Users\arash\Desktop\association-rule-mining\dataframe-python.csv')
df
# pd.set_option("display.max_rows", None, "display.max_columns", None)
print(df)

frequent_itemsets_fp = fpgrowth(df, min_support=0.2, use_colnames=True)
print(frequent_itemsets_fp)

rules_fp = association_rules(frequent_itemsets_fp, metric="confidence", min_threshold=0.7)
print(rules_fp)
rules_fp
if x != '':
    print(x + ' → ' + y)

from mlxtend.preprocessing import TransactionEncoder

TE = TransactionEncoder()
data = TE.fit_transform(symptoms)
print(data)

import pandas as pd

df = pd.DataFrame(data, columns=TE.columns_)
df.head()

from mlxtend.frequent_patterns import apriori

items = apriori(df, min_support=0.1, use_colnames=True)
print(items)
print(items[items['itemsets'].apply(lambda x: len(x)) >= 2])

from mlxtend.frequent_patterns import association_rules

rules = association_rules(items, min_threshold=0.7)
print(rules)

for i, j in rules.iterrows():
    X = j['antecedents']
    Y = j['consequents']
    x = ', '.join([item for item in X])
    y = ', '.join([item for item in Y])
    print(x + ' → ' + y)
df.shape

# get back original transactions
orgtrans1 = te_ary[:]
te.inverse_transform(orgtrans1)

#%%%
# frequent itemsets - Most Imp Step
support_threshold = 0.01
# https://github.com/rasbt/mlxtend/blob/master/mlxtend/frequent_patterns/apriori.py
frequent_itemsets = apriori(df, min_support=support_threshold, use_colnames=True)
frequent_itemsets
print(frequent_itemsets)  # dataframe with the itemsets

#%%%% - Support Rules
association_rules?  # output - DF with antecedents -> consequents
supportRules3 = association_rules(frequent_itemsets, metric="support", min_threshold=.3)
print(supportRules3)
supportRules3.head()
print(supportRules3[['antecedents', 'consequents', 'support', 'confidence', 'lift']])
# ---
supportRules2 = association_rules(frequent_itemsets, metric="support", min_threshold=.2)
print(supportRules2[['antecedents', 'consequents', 'support', 'confidence', 'lift']])

#%%%% Lift: generally > 1 for strong associations
lift1 = association_rules(frequent_itemsets, metric="lift", min_threshold=1)
print(lift1)
lift1
print(lift1[['antecedents', 'consequents', 'support', 'lift', 'confidence']])
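# For reference, mlxtend defines lift(A->C) = confidence(A->C) / support(C),
# so lift > 1 means C is more likely in baskets containing A than overall.
# A worked example with hypothetical numbers (not taken from the data above):
confidence_ac = 0.8  # hypothetical confidence of the rule A -> C
support_c = 0.4      # hypothetical support of the consequent C
lift_ac = confidence_ac / support_c  # 2.0: C is twice as likely given A
print(lift_ac)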
def bil_aprori():
    if request.method == 'POST':
        f = request.files['file']
        # f.save(secure_filename(f.filename))
        print(f)
        app.logger.info("File Received")
    else:
        return 'Error in Upload'

    warnings.filterwarnings('ignore')
    df = pd.read_csv(f)
    print("Dataset Import Success")

    df['Item'] = df['Item'].str.lower()
    x = df['Item'] == 'none'
    print(x.value_counts())
    df = df.drop(df[df.Item == 'none'].index)
    len(df['Item'].unique())

    df_for_top10_Items = df['Item'].value_counts().head(10)
    Item_array = np.arange(len(df_for_top10_Items))

    import matplotlib.pyplot as plt

    # plt.figure(figsize=(15,5))
    # Items_name=['coffee','bread','tea','cake','pastry','sandwich','medialuna','hot chocolate','cookies','brownie']
    # plt.bar(Item_array,df_for_top10_Items.iloc[:])
    # plt.xticks(Item_array,Items_name)
    # plt.title('Top 5 most selling items')
    # # plt.show()
    # plt.savefig('static/new_plot1.png')

    fig, ax = plt.subplots(figsize=(16, 7))
    df['Item'].value_counts().sort_values(ascending=False).head(20).plot.bar(
        width=0.5, edgecolor='k', align='center', linewidth=1)
    plt.xlabel('Food Item', fontsize=20)
    plt.ylabel('Number of transactions', fontsize=17)
    ax.tick_params(labelsize=20)
    plt.title('20 Most Sold Items', fontsize=20)
    # plt.grid()
    plt.savefig('static/new_plot1.png')
    plt.clf()
    plt.cla()
    plt.close()

    ######################################################################
    df['Date'] = pd.to_datetime(df['Date'])
    df['Time'] = pd.to_datetime(df['Time'], format='%H:%M:%S').dt.hour
    df['day_of_week'] = df['Date'].dt.weekday

    d = df.loc[:, 'Date']
    weekday_names = ['Monday', 'Tuesday', 'Wednesday', 'Thursday',
                     'Friday', 'Saturday', 'Sunday']
    Weekday_number = [0, 1, 2, 3, 4, 5, 6]
    week_df = d.groupby(d.dt.weekday).count().reindex(Weekday_number)
    Item_array_week = np.arange(len(week_df))

    plt.figure(figsize=(15, 5))
    my_colors = 'rk'
    plt.bar(Item_array_week, week_df, color=my_colors)
    plt.xticks(Item_array_week, weekday_names)
    plt.title('Number of Transactions made based on Weekdays')
    # plt.show()
    plt.savefig('static/new_plot2.png')
    plt.clf()
    plt.cla()
    plt.close()

    #####################################################################
    dt = df.loc[:, 'Time']
    Hour_names = [7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23]
    time_df = dt.groupby(dt).count().reindex(Hour_names)
    Item_array_hour = np.arange(len(time_df))

    plt.figure(figsize=(15, 5))
    my_colors = 'rb'
    plt.bar(Item_array_hour, time_df, color=my_colors)
    plt.xticks(Item_array_hour, Hour_names)
    plt.title('Number of Transactions made based on Hours')
    # plt.show()
    plt.savefig('static/new_plot3.png')
    plt.clf()
    plt.cla()
    plt.close()

    ##############################################################################
    from mlxtend.frequent_patterns import apriori
    from mlxtend.frequent_patterns import association_rules

    hot_encoded_df = df.groupby(['Transaction', 'Item'])['Item'].count().unstack().reset_index().fillna(0).set_index('Transaction')

    def encode_units(x):
        if x <= 0:
            return 0
        if x >= 1:
            return 1

    hot_encoded_df = hot_encoded_df.applymap(encode_units)

    frequent_itemsets = apriori(hot_encoded_df, min_support=0.01, use_colnames=True)
    rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1)
    rules.head()
    rules[(rules['lift'] >= 1) & (rules['confidence'] >= 0.5)]

    ###############################################################################
    support = rules['support'].values
    confidence = rules['confidence'].values
    import seaborn as sns

    for i in range(len(support)):
        support[i] = support[i]
        confidence[i] = confidence[i]

    # plt.figure(figsize=(15,5))
    # my_colors = 'rb'
    # plt.bar(Item_array_hour,time_df, color=my_colors)
    # plt.xticks(Item_array_hour,Hour_names)
    # plt.title('Number of Transactions made based on Hours')
    # #plt.show()
    # plt.savefig('static/new_plot3.png')

    plt.plot()
    plt.figure(figsize=(15, 5))
    plt.scatter(support, confidence, alpha=0.5, marker="*")
    plt.title('Association Rules')
    plt.xlabel('support')
    plt.ylabel('confidence')
    # fig=()
    # sns.regplot(x=support, y=confidence, fit_reg=False)
    # plt.show(p)
    # fig = p.get_figure()
    # fig.savefig('out.png')
    # fig = fig1.get_figure()
    # fig.savefig("output.png")
    # fig = sns.regplot(x=support, y=confidence, fit_reg=False)
    # fig.figure.savefig('static/new_plot4.png')
    # fig = sns.regplot(x=support, y=confidence, fit_reg=False)
    # fig.figure.savefig('../test.png')
    # plt.show()
    plt.savefig('static/new_plot4.png')
    plt.clf()
    plt.cla()
    plt.close()

    ######################
    rules_to_show = 20
    import networkx as nx

    plt.plot()
    G1 = nx.DiGraph()
    color_map = []
    N = 50
    colors = np.random.rand(N)
    strs = ['R0', 'R1', 'R2', 'R3', 'R4', 'R5', 'R6',
            'R7', 'R8', 'R9', 'R10', 'R11']

    for i in range(rules_to_show):
        G1.add_nodes_from(["R" + str(i)])
        for a in rules.iloc[i]['antecedents']:
            G1.add_nodes_from([a])
            G1.add_edge(a, "R" + str(i), color=colors[i], weight=2)
        for c in rules.iloc[i]['consequents']:
            G1.add_nodes_from([c])
            G1.add_edge("R" + str(i), c, color=colors[i], weight=2)

    for node in G1:
        found_a_string = False
        for item in strs:
            if node == item:
                found_a_string = True
        if found_a_string:
            color_map.append('yellow')
        else:
            color_map.append('green')

    edges = G1.edges()
    colors = [G1[u][v]['color'] for u, v in edges]
    weights = [G1[u][v]['weight'] for u, v in edges]

    pos = nx.spring_layout(G1, k=16, scale=1)
    # note: newer networkx versions expect edgelist= rather than edges=
    nx.draw(G1, pos, edgelist=edges, node_color=color_map, edge_color=colors,
            width=weights, font_size=16, with_labels=False)

    for p in pos:  # raise text positions
        pos[p][1] += 0.07
    nx.draw_networkx_labels(G1, pos)
    # plt.show()
    plt.savefig('static/new_plot5.png')
    plt.clf()
    plt.cla()
    plt.close()

    import time
    time.sleep(5)  # Delays for 5 seconds. You can also use a float value.

    return render_template('out.html', name='Top 5 most selling items',
                           url='new_plot1.png', name1='abc',
                           url2='new_plot2.png', url3='new_plot3.png',
                           url4='new_plot4.png', url5='new_plot5.png')
def create_model(metric="confidence", threshold=0.5, min_support=0.05, round=4):
    """
    This function creates an association rules model using data and
    identifiers passed at setup stage. This function internally transforms
    the data for association rule mining.

    Example
    -------
    >>> from pycaret.datasets import get_data
    >>> data = get_data('france')
    >>> from pycaret.arules import *
    >>> exp_name = setup(data = data, transaction_id = 'InvoiceNo', item_id = 'Description')
    >>> model1 = create_model(metric = 'confidence')

    metric: str, default = 'confidence'
        Metric to evaluate if a rule is of interest. Default is set to confidence.
        Other available metrics include 'support', 'lift', 'leverage', 'conviction'.
        These metrics are computed as follows:

        * support(A->C) = support(A+C) [aka 'support'], range: [0, 1]
        * confidence(A->C) = support(A+C) / support(A), range: [0, 1]
        * lift(A->C) = confidence(A->C) / support(C), range: [0, inf]
        * leverage(A->C) = support(A->C) - support(A)*support(C), range: [-1, 1]
        * conviction = [1 - support(C)] / [1 - confidence(A->C)], range: [0, inf]

    threshold: float, default = 0.5
        Minimal threshold for the evaluation metric, via the `metric` parameter,
        to decide whether a candidate rule is of interest.

    min_support: float, default = 0.05
        A float between 0 and 1 for minimum support of the itemsets returned.
        The support is computed as the fraction
        `transactions_where_item(s)_occur / total_transactions`.

    round: int, default = 4
        Number of decimal places metrics in score grid will be rounded to.

    Returns:
        pandas.DataFrame

    Warnings
    --------
    - Setting low values for min_support may increase training time.

    """

    # loading dependencies
    import pandas as pd
    from IPython.display import display, HTML, clear_output, update_display
    from mlxtend.frequent_patterns import apriori
    from mlxtend.frequent_patterns import association_rules

    # reshaping the dataframe
    basket = (
        X.groupby([txid, iid])[iid]
        .count()
        .unstack()
        .reset_index()
        .fillna(0)
        .set_index(txid)
    )

    if ignore_list is not None:
        basket = basket.drop(ignore_list, axis=1)

    def encode_units(x):
        if x <= 0:
            return 0
        if x >= 1:
            return 1

    basket = basket.applymap(encode_units)

    frequent_itemsets = apriori(basket, min_support=min_support, use_colnames=True)
    rules = association_rules(frequent_itemsets, metric=metric, min_threshold=threshold)
    rules = rules.sort_values(by=[metric], ascending=False).reset_index(drop=True)
    rules = rules.round(round)

    # storing into experiment
    tup = ("Basket", basket)
    experiment__.append(tup)

    tup = ("Frequent Itemsets", frequent_itemsets)
    experiment__.append(tup)

    tup = ("Rules", rules)
    experiment__.append(tup)

    return rules
# Mining frequent itemsets and association rules from MovieLens movie genres
import pandas as pd
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

# load the data
movies = pd.read_csv('./movie_actors.csv')
# print(movies.head())

# one-hot encode genres (a discrete feature with N possible values becomes N columns)
movies_hot_encoded = movies.drop('actors', 1).join(movies.actors.str.get_dummies('/'))
pd.options.display.max_columns = 100
print(movies_hot_encoded.head())

# set movieId, title as the index
movies_hot_encoded.set_index(['title'], inplace=True)
# print(movies_hot_encoded.head())

# mine frequent itemsets with a minimum support of 0.05
itemsets = apriori(movies_hot_encoded, use_colnames=True, min_support=0.05)
# sort by support in descending order
itemsets = itemsets.sort_values(by="support", ascending=False)
print('-' * 20, 'frequent itemsets', '-' * 20)
print(itemsets)

# derive association rules from the frequent itemsets, with a minimum lift of 2
rules = association_rules(itemsets, metric='lift', min_threshold=2)
# sort by lift in descending order
rules = rules.sort_values(by="lift", ascending=False)
rules.to_csv('./rules.csv')
print('-' * 20, 'association rules', '-' * 20)
print(rules)
for index, row in data_arm.iterrows():
    labels = {}
    uncommons = list(set(items) - set(row))
    commons = list(set(items).intersection(row))
    for uc in uncommons:
        labels[uc] = 0
    for com in commons:
        labels[com] = 1
    encoded_vals.append(labels)

encoded_vals[0]
ohe_df = pd.DataFrame(encoded_vals)

freq_items = apriori(ohe_df, min_support=0.05, use_colnames=True, verbose=1)
freq_items.head(7)

rules = association_rules(freq_items, metric="confidence", min_threshold=0.1)
print(rules.head(50))

subset = rules[rules['conviction'] > 1.2]
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(subset)
print((subset.sort_values(['conviction', 'lift'], ascending=(False, False))).to_string())
subset.to_csv("Data/pred_set_arm_out.csv")

z = subset['support']
y = subset['confidence']
plt.scatter(z, y, alpha=0.5, color='black')
n = np.arange(0, 12, 1)
for i, txt in enumerate(n):
    plt.annotate('R' + str(txt), (z.iloc[i] + 0.001, y.iloc[i] + 0.001))
data = prepfunctions.dummification(perm_data.copy(deep=True), boolean_attributes, bins, strategie)

MIN_SUP: float = 0.001
var_min_sup = [0.2, 0.1] + [round(i * MIN_SUP, 2) for i in range(100, 0, -10)]

plt.figure()
patterns: pd.DataFrame = pm.apriori(data, min_support=MIN_SUP, use_colnames=True, verbose=True)
print(len(patterns), 'patterns')

nr_patterns = []
for sup in var_min_sup:
    pat = patterns[patterns['support'] >= sup]
    nr_patterns.append(len(pat))

plt.figure(figsize=(6, 4))
ds.plot_line(var_min_sup, nr_patterns, title='Nr Patterns x Support', xlabel='support', ylabel='Nr Patterns')
plt.savefig(subDir + 'HFCR Pattern Mining - Nr Patterns x Support')

MIN_CONF: float = 0.1
rules = pm.association_rules(patterns, metric='confidence', min_threshold=MIN_CONF * 5, support_only=False)
print(f'\tfound {len(rules)} rules')

nr_rules_sp = analyse_per_metric(rules, 'support', var_min_sup, subDir)
plt.figure(figsize=(6, 4))
ds.plot_line(var_min_sup, nr_rules_sp, title='Nr Rules x Support', xlabel='support', ylabel='Nr. rules', percentage=False)
plt.savefig(subDir + 'HFCR Pattern Mining - Nr Rules x Support')

var_min_conf = [round(i * MIN_CONF, 2) for i in range(10, 5, -1)]
nr_rules_cf = analyse_per_metric(rules, 'confidence', var_min_conf, subDir)
plt.figure(figsize=(6, 4))
ds.plot_line(var_min_conf, nr_rules_cf, title='Nr Rules x Confidence', xlabel='confidence', ylabel='Nr Rules', percentage=False)
plt.savefig(subDir + 'HFCR Pattern Mining - Nr Rules x Confidence')
import pandas as pd
import time
from mlxtend.frequent_patterns import apriori, association_rules

# load the data
data = pd.read_csv("movie_actors.csv")
start = time.time()

# one-hot encode (a discrete feature with N possible values becomes N columns)
data_hot_encode = data.drop('actors', 1).join(data.actors.str.get_dummies('/'))
pd.options.display.max_columns = 100
print(data_hot_encode.head())

# set movieId, title as the index
data_hot_encode.set_index(['title'], inplace=True)
print(data_hot_encode.head())

frequent_items = apriori(data_hot_encode, min_support=0.02, use_colnames=True)
rules = association_rules(frequent_items, metric='lift', min_threshold=0.5)
print('frequent itemsets:', frequent_items)
print('association rules:', rules)

end = time.time()
print('elapsed time: %s' % (end - start))
import pandas as pd
from mlxtend.preprocessing import OnehotTransactions
from mlxtend.frequent_patterns import apriori

oht = OnehotTransactions()
oht_ary = oht.fit(dataset).transform(dataset)
df = pd.DataFrame(oht_ary, columns=oht.columns_)
print(df)

frequent_itemsets = apriori(df, min_support=0.6, use_colnames=True)
print(frequent_itemsets)

from mlxtend.frequent_patterns import association_rules

association_rules(frequent_itemsets, metric="confidence", min_threshold=0.7)
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1.2)
print(rules[['antecedents', 'consequents', 'support']])

support = rules[['support']]
# support = rules.as_matrix(columns=['support'])
# confidence = rules.as_matrix(columns=['confidence'])
# print(support)
# print(confidence)

import numpy as np
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt

# create state space and initial state probabilities
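# A common follow-up (a sketch reusing the `rules` frame above): combine
# several metric filters with boolean indexing to keep only the strongest rules.
strong = rules[(rules['lift'] >= 1.2) & (rules['confidence'] >= 0.7)]
print(strong[['antecedents', 'consequents', 'support', 'confidence', 'lift']])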
def main():
    data = {}
    with open('online_retail.csv') as f:
        header_row = False
        header = []
        for cols in csv.reader(f):
            if not header_row:
                header_row = True
                header = [i for i in cols]  # list of headers
                for h in header:
                    data[h] = []  # init of dictionary
                continue
            # check for cancelled invoices and empty InvoiceIds or Descriptions
            # (the emptiness check must come before indexing cols[0][0])
            if len(cols) == len(header) and cols[0] != '' and cols[0][0] != 'C' and cols[2] != '':
                for col, name in zip(cols, header):
                    data[name].append(col)

    byinvoice = {}  # dictionary with data grouped by invoice
    for invoiceNo, desc in zip(data['InvoiceNo'], data['Description']):
        if invoiceNo not in byinvoice.keys():
            byinvoice[invoiceNo] = []  # if the dict doesn't have an invoice entry, initialize it
        byinvoice[invoiceNo].append(desc)

    unique_products = []  # set of unique products, used for the bitmap matrix
    for prod in data['Description']:
        if prod not in unique_products:
            unique_products.append(prod)

    match_matrix = []  # bitmap matrix built as a list of lists
    for inv in byinvoice.keys():
        match_array = [0] * len(unique_products)  # init of each row of the bitmap matrix
        # for every product of a given invoice, get its index inside the list of unique products
        for prod in byinvoice[inv]:
            i = unique_products.index(prod)
            match_array[i] = 1  # that index is used to put 1 in the correct position inside the current matrix row
        match_matrix.append(match_array)

    df = pd.DataFrame(data=match_matrix, columns=unique_products)

    '''
    fi = fpgrowth(df, 0.05)
    print(len(fi))
    print(fi.to_string())

    # checking if the result is right (first product is present in 10.9% of invoices)
    i = 0
    for inv in byinvoice.values():
        if unique_products[0] in inv:
            i += 1
    print(str(i/len(byinvoice.keys())))
    '''

    # time check
    # done with 0.05 because using minsup=0.01 with apriori resulted in a MemoryError
    print(timeit.timeit(lambda: apriori(df, 0.05), number=1))
    print(timeit.timeit(lambda: fpgrowth(df, 0.05), number=1))

    fi = fpgrowth(df, 0.01)
    print(len(fi))

    fi_list = fi.values.tolist()
    # just to see the top 10 most relevant pieces of information
    # the larger the set, the more relevant the information (I think)
    fi_list.sort(key=lambda x: -len(x[1]))
    print(fi_list[:10])  # top 10

    ar = association_rules(fi, metric="confidence", min_threshold=0.85)
    ar.to_csv(r'association_rules.csv', header=True)  # dump to file for visualization reasons
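# Aside: fpgrowth/apriori above are called with a positional min_support and
# the default use_colnames=False, so the reported itemsets contain column
# indices. Passing use_colnames=True returns the product names instead, which
# reads better in the CSV dump. A self-contained sketch on a toy bitmap frame:
import pandas as pd
from mlxtend.frequent_patterns import fpgrowth

toy = pd.DataFrame({'tea': [1, 1, 0], 'jam': [1, 0, 1]})
print(fpgrowth(toy, min_support=0.3))                     # itemsets of column indices
print(fpgrowth(toy, min_support=0.3, use_colnames=True))  # itemsets of column names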
def get_assoc_dataframe(self):
    assoc = association_rules(self.get_frequent_itemset(), metric="confidence",
                              min_threshold=0.7)  # earlier 0.6
    return assoc