def test_itemsets_type():
    res_colindice = apriori(df, use_colnames=False)  # This is default behavior
    for i in res_colindice['itemsets']:
        assert isinstance(i, frozenset) is True

    res_colnames = apriori(df, use_colnames=True)
    for i in res_colnames['itemsets']:
        assert isinstance(i, frozenset) is True
def test_sparsedataframe_notzero_column():
    dfs = pd.SparseDataFrame(df)
    dfs.columns = [i for i in range(len(dfs.columns))]
    apriori(dfs)

    dfs = pd.SparseDataFrame(df)
    dfs.columns = [i + 1 for i in range(len(dfs.columns))]
    assert_raises(ValueError,
                  'Due to current limitations in Pandas, '
                  'if the SparseDataFrame has integer column '
                  'names, please make sure they either start '
                  'with `0` or cast them as string column names: '
                  '`df.columns = [str(i) for i in df.columns]`.',
                  apriori, dfs)
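# A minimal sketch of the workaround that the ValueError above recommends:
# on a sparse frame with integer column labels that do not start at 0, cast
# the labels to strings before calling apriori. The helper name is
# illustrative, not part of the source.
def make_sparse_columns_safe(dfs):
    # Cast every column label to str so apriori can handle the columns.
    dfs.columns = [str(i) for i in dfs.columns]
    return dfs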
def test_frozenset_selection():
    res_df = apriori(df, use_colnames=True)
    assert res_df.values.shape == (11, 2)
    assert res_df[res_df['itemsets'] == 'nothing'].values.shape == (0, 2)
    assert res_df[res_df['itemsets'] == {'Eggs', 'Kidney Beans'}].values.shape == (1, 2)
    assert res_df[res_df['itemsets'] == frozenset(('Eggs', 'Kidney Beans'))].values.shape \
        == (1, 2)
    assert res_df[res_df['itemsets'] == frozenset(('Kidney Beans', 'Eggs'))].values.shape \
        == (1, 2)
def test_with_fill_values(fill_value):
    sdf = df.to_sparse(fill_value=fill_value)
    res_df = apriori(sdf, use_colnames=True)
    assert res_df.values.shape == (11, 2)
    assert res_df[res_df['itemsets'] == 'nothing'].values.shape == (0, 2)
    assert res_df[res_df['itemsets'] == {'Eggs', 'Kidney Beans'}].values.shape == (1, 2)
    assert res_df[res_df['itemsets'] == frozenset(('Eggs', 'Kidney Beans'))].values.shape \
        == (1, 2)
    assert res_df[res_df['itemsets'] == frozenset(('Kidney Beans', 'Eggs'))].values.shape \
        == (1, 2)
def defectsContainsDataSet(self, param, head):
    basket_sets = (self.df[self.df[head].str.contains(param)]
                   .groupby(["DEFEC.", "INSPECCION"])["DEFEC."]
                   .count().unstack(level=0).fillna(0))
    if not basket_sets.empty:
        basket_sets = basket_sets.applymap(self.encode_units)
        # Apriori algorithm
        frequent_itemsets = apriori(basket_sets, min_support=0.05, use_colnames=True)
        rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1)
        # Add the elements affected by the rule
        return self.addNumberRules(rules, self.lenConData)
    else:
        return pd.DataFrame()
def test_default():
    res_df = apriori(df)
    expect = pd.DataFrame([[0.8, np.array([3]), 1],
                           [1.0, np.array([5]), 1],
                           [0.6, np.array([6]), 1],
                           [0.6, np.array([8]), 1],
                           [0.6, np.array([10]), 1],
                           [0.8, np.array([3, 5]), 2],
                           [0.6, np.array([3, 8]), 2],
                           [0.6, np.array([5, 6]), 2],
                           [0.6, np.array([5, 8]), 2],
                           [0.6, np.array([5, 10]), 2],
                           [0.6, np.array([3, 5, 8]), 3]],
                          columns=['support', 'itemsets', 'length'])

    for a, b in zip(res_df, expect):
        assert_array_equal(a, b)
def Apriori(Dataset_Encoded, min_support=0.05):
    # Build the apriori model
    FrequentItems = apriori(Dataset_Encoded, min_support=min_support, use_colnames=True)
    return FrequentItems
# One-hot encoding
from mlxtend.preprocessing import TransactionEncoder

te = TransactionEncoder()
# fit() learns the unique labels in the transaction dataset; transform()
# converts it into a boolean (True/False) array suitable for ML APIs.
te_ary = te.fit(transactions).transform(transactions)
df_x = pd.DataFrame(
    te_ary,               # each row holds the boolean indicators for one transaction
    columns=te.columns_)  # columns are all products, in alphabetical order

# Step 4: Train model using the Apriori algorithm
# ref = https://rasbt.github.io/mlxtend/api_subpackages/mlxtend.frequent_patterns/
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

df_sets = apriori(
    df_x,                 # values must be 0/1 or True/False
    min_support=0.005,    # support = transactions containing the itemset / all transactions
    use_colnames=True)    # report item names instead of column indices
df_rules = association_rules(
    df_sets,
    metric='support',     # default is 'confidence'; here the support formula is used
    min_threshold=0.005,  # 0.5%
    support_only=True)    # compute supports only
# Using only "support" this way is essentially the ECLAT approach.
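# A minimal, self-contained sketch of the pipeline above, using a tiny
# hypothetical `transactions` list (illustrative data, not from the source):
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules

transactions = [['bread', 'milk'], ['bread', 'eggs'], ['milk', 'eggs', 'bread']]
te = TransactionEncoder()
onehot = pd.DataFrame(te.fit(transactions).transform(transactions),
                      columns=te.columns_)
itemsets = apriori(onehot, min_support=0.5, use_colnames=True)
# support_only=True skips confidence/lift/etc., which is all that is needed
# for the ECLAT-style, support-only output described above.
rules = association_rules(itemsets, metric='support',
                          min_threshold=0.5, support_only=True)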
filtered = data.drop(['NONE'])
data = data.reset_index()
filtered = filtered.reset_index()

transaction_list = []
# Build a list of the unique transactions throughout the dataset:
for i in filtered['Transaction'].unique():
    tlist = list(set(filtered[filtered['Transaction'] == i]['Item']))
    if len(tlist) > 0:
        transaction_list.append(tlist)

te = TransactionEncoder()
te_ary = te.fit(transaction_list).transform(transaction_list)
df2 = pd.DataFrame(te_ary, columns=te.columns_)

frequent_itemsets = apriori(df2, min_support=0.01, use_colnames=True)  # minimum support threshold
rules = association_rules(frequent_itemsets, metric='confidence', min_threshold=0.0001)
rules = rules.sort_values('confidence', ascending=False)

# Now categorise every rule into a confidence range:
rules['support'] = rules['support'] * 100
rules['confidence'] = rules['confidence'] * 100
rules2 = rules[['antecedents', 'consequents', 'support', 'confidence']]
rules2 = rules2.sort_values('confidence', ascending=False)
bins = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
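# A plausible next step for the `bins` above (an assumption, since the source
# snippet ends here): bucket each rule by its confidence percentage with pd.cut.
rules2 = rules2.copy()  # avoid mutating a view of `rules`
rules2['confidence_range'] = pd.cut(rules2['confidence'], bins=bins)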
def recommend():
    df = pd.read_csv('database.csv')
    df1 = df['Product'].apply(lambda x: x.split(','))

    te = TransactionEncoder()
    te_ary = te.fit(df1).transform(df1)
    df1 = pd.DataFrame(te_ary, columns=te.columns_).drop('', axis=1)
    ## print(te.columns_)
    ## print(df1)

    frequent_itemsets = apriori(df1, min_support=0.03, use_colnames=True)
    #sup = sum(frequent_itemsets['support']) * 2 / len(frequent_itemsets['support'])
    #frequent_itemsets = apriori(df1, min_support=sup, use_colnames=True)
    #print(frequent_itemsets)
    frequent_itemsets['length'] = frequent_itemsets['itemsets'].apply(lambda x: len(x))
    items = frequent_itemsets[(frequent_itemsets['length'] >= 2)
                              & (frequent_itemsets['support'] >= 0.04)]
    recP = items['itemsets']

    ## win=tk.Tk()
    ## win.title('Recommendations for you...')
    ## win.geometry('500x500')
    ## label=tk.Label(win,text='Recommended products for you...')
    ## label.place(x=10,y=10)
    ## listbox=tk.Listbox(win,relief='flat',width=50)
    ## listbox.place(x=15,y=30)
    ## for i in recP:
    ##     listbox.insert(tk.END,tuple(i))
    ##
    ## win.mainloop()

    GP = pd.read_csv('price_list.csv')

    class SampleApp(tk.Tk):
        def __init__(self, *args, **kwargs):
            tk.Tk.__init__(self, *args, **kwargs)
            lb = tk.Listbox(self)
            for i in recP:
                lb.insert(tk.END, tuple(i))
            lb.bind("<Double-Button-1>", self.OnDouble)
            lb.pack(side="top", fill="both", expand=True)

        def OnDouble(self, event):
            widget = event.widget
            selection = widget.curselection()
            value = widget.get(selection[0])
            try:
                # Recommended bundle price: sum of the two item prices minus 10%.
                price_a = GP.iloc[[list(GP['Product']).index(value[0])], :].values[0][1]
                price_b = GP.iloc[[list(GP['Product']).index(value[1])], :].values[0][1]
                Rec_pric = (price_a + price_b) - 0.1 * (price_a + price_b)

                img = IMG.new('RGB', (60, 30), color=(0, 0, 0))
                d = ImageDraw.Draw(img)
                d.text((10, 10), "Rs." + str(Rec_pric), fill=(255, 255, 255))
                img.save('images/recommend_price.png')

                list_file = os.scandir('images')
                item_list = [i.name for i in iter(list_file)]
                first, second = '', ''
                for i in item_list:
                    if str(value[0]) == i[:len(str(value[0]))]:
                        first = i
                    if str(value[1]) == i[:len(str(value[1]))]:
                        second = i

                #plt.title('Rs.'+str(Rec_pric))
                for j in [first, second, 'recommend_price.png']:
                    plt.subplot(1, 3, [first, second, 'recommend_price.png'].index(j) + 1)
                    img = plt.imread('images/' + j)
                    plt.imshow(img)
                    plt.xlabel(j[:-4])
                    plt.xticks([])
                    plt.yticks([])
                plt.autoscale()
                plt.show()
                #label=tk.Label(roo,text=str(value[0])+'+'+str(value[1])+' = Rs.'+str(int(Rec_pric)),font=('Tahoma',30),fg='white',bg='black')
            except Exception:
                roo = tk.Tk()
                roo.title('Offer for you...')
                label = tk.Label(roo, text='Something went wrong!!!',
                                 font=('Tahoma', 30), fg='white', bg='black')
                label.pack()
                roo.mainloop()

    if __name__ == "__main__":
        app = SampleApp()
        app.title('Recommended products')
        app.mainloop()
import pandas as pd
import time
from mlxtend.frequent_patterns import apriori, association_rules
from wordcloud import WordCloud
from nltk.tokenize import word_tokenize
import matplotlib.pyplot as plt

# Load the data
data = pd.read_csv("Market_Basket_Optimisation.csv", header=None, sep='/')
start = time.time()
# print(data.head())

# One-hot encode (one indicator column per distinct item value)
data_hot_encode = data.drop(0, axis=1).join(data[0].str.get_dummies(','))
pd.options.display.max_columns = 100
# print(data_hot_encode.head())

frequent_items = apriori(data_hot_encode, min_support=0.02, use_colnames=True)
rules = association_rules(frequent_items, metric='lift', min_threshold=0.5)
# Sort by lift in descending order
rules = rules.sort_values(by="lift", ascending=False)
print('Frequent itemsets:', frequent_items)
print('-' * 20, 'Association rules', '-' * 20)
print('Association rules:', rules)
end = time.time()
print('Elapsed time: %s' % (end - start))


# Remove stop words
def remove_stop_words(f):
    stop_words = ['Movie']
    for stop_word in stop_words:
dataset = [[int(n) for n in line.split()] for line in fin]

# In[3]:

te = TransactionEncoder()
te_ary = te.fit(dataset).transform(dataset, sparse=True)
sparse_df = pd.SparseDataFrame(te_ary, columns=te.columns_, default_fill_value=False)
sparse_df

# In[4]:

frequent_itemsets5 = apriori(sparse_df, min_support=0.5, use_colnames=True)
frequent_itemsets5

# In[5]:

frequent_itemsets1 = apriori(sparse_df, min_support=0.1, use_colnames=True)
frequent_itemsets1

# In[6]:

frequent_itemsets05 = apriori(sparse_df, min_support=0.05, use_colnames=True)
frequent_itemsets05
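# Note: pd.SparseDataFrame was removed in pandas 1.0. On current pandas, an
# equivalent sparse frame can be built directly from the scipy matrix that
# TransactionEncoder returns when sparse=True (a sketch, assuming pandas >= 1.0;
# mlxtend's apriori accepts frames with sparse dtypes):
sparse_df = pd.DataFrame.sparse.from_spmatrix(te_ary, columns=te.columns_)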
cell_value = cell_value.replace('"', "")
cell_value = cell_value.replace(r'[', '')
cell_value = cell_value.replace(r']', '')
# Split each value apart on commas and append the parts to the cell list
cell_value_parsed = cell_value.split(',')
all_cells_list.append(cell_value_parsed)

print(all_cells_list)

# Creation of the data frame based on the cell_list
oht = OnehotTransactions()
oht_ary = oht.fit(all_cells_list).transform(all_cells_list)
df = pd.DataFrame(oht_ary, columns=oht.columns_)
print(df)

# ----------------------------------------------------------------------------
# Configuration for the apriori algorithm
# ----------------------------------------------------------------------------
# minimum support threshold
min_co = 0.3
# whether to include the column names in the output
use_colnames_bool = True
# maximum number of items per itemset (None = no limit)
max_len_value = None

frequent_itemsets = apriori(df, min_support=min_co,
                            use_colnames=use_colnames_bool,
                            max_len=max_len_value)
frequent_itemsets.to_csv('dataAssociation.csv')
print(frequent_itemsets)
print("done")
#  ['Dill', 'Onion', 'Nutmeg', 'Kidney Beans', 'Eggs', 'Yogurt'],
#  ['Milk', 'Apple', 'Kidney Beans', 'Eggs'],
#  ['Milk', 'Unicorn', 'Corn', 'Kidney Beans', 'Yogurt'],
#  ['Corn', 'Onion', 'Onion', 'Kidney Beans', 'Ice cream', 'Eggs']]

from datetime import timedelta, date
from es02 import es02

dataset = []

if __name__ == "__main__":
    global dataset
    es = es02()
    es.set_service("display")
    es.load_datas2(date(2017, 12, 1), date(2018, 1, 8))
    dataset2 = es.dset
    dataset = [item for item in dataset2 if len(item) > 1]
    for item in dataset:
        print(item)

    import pandas as pd
    from mlxtend.preprocessing import OnehotTransactions

    oht = OnehotTransactions()
    oht_ary = oht.fit(dataset).transform(dataset)
    df = pd.DataFrame(oht_ary, columns=oht.columns_)

    from mlxtend.frequent_patterns import apriori
    frequent_itemsets = apriori(df, min_support=0.3, use_colnames=True)
    print(frequent_itemsets)

    from mlxtend.frequent_patterns import association_rules
    arule = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.7)
    print(arule)
import numpy as np
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules
from numpy.testing import assert_raises

one_ary = np.array([[0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1],
                    [0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1],
                    [1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0],
                    [0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1],
                    [0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0]])

cols = ['Apple', 'Corn', 'Dill', 'Eggs', 'Ice cream', 'Kidney Beans',
        'Milk', 'Nutmeg', 'Onion', 'Unicorn', 'Yogurt']

df = pd.DataFrame(one_ary, columns=cols)
df_freq_items = apriori(df, min_support=0.6)

columns_ordered = ['antecedants', 'consequents',
                   'antecedent support', 'consequent support',
                   'support', 'confidence', 'lift', 'leverage', 'conviction']


def test_default():
    res_df = association_rules(df_freq_items)
    res_df['antecedants'] = res_df['antecedants'].apply(
        lambda x: str(frozenset(x)))
    res_df['consequents'] = res_df['consequents'].apply(
        lambda x: str(frozenset(x)))
    res_df.sort_values(columns_ordered, inplace=True)
    res_df.reset_index(inplace=True, drop=True)
# -*- coding: utf-8 -*-
"""
Created on Sat Aug  1 16:21:11 2020

@author: bibiboom
"""
import pandas as pd
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

data = pd.read_csv('dingdanbiao2.csv', encoding='gbk')
data = data[["a", "b", "c", "d", "e", "f", "g", "h", "i",
             "j", "k", "l", "m", "n", "o", "p", "q"]]
data = data.drop(columns=['a', 'b', 'c', 'd', 'e', 'g', 'h',
                          'i', 'j', 'k', 'm', 'n', 'o', 'p', 'q'])

# One-hot encode column 'f', keeping 'l' as the transaction key
data_HE = data.drop('f', axis=1).join(data.f.str.get_dummies(sep='|'))
#print(data_HE)
#data_HE.to_csv("data_HE.csv", index=False)
data_HE.set_index(['l'], inplace=True)
data_HE = data_HE.sort_values(by="l", ascending=True)
data_HE = data_HE.groupby(['l']).agg(['max'])
#print(data_HE)

itemsets = apriori(data_HE, use_colnames=True, min_support=0.05)
itemsets = itemsets.sort_values(by="support", ascending=False)
print(itemsets)

rules = association_rules(itemsets, metric='lift', min_threshold=1)
rules = rules.sort_values(by="lift", ascending=False)
rules.to_csv('Project B.csv')
print(rules)
def get_data():
    # Read the data
    data = pd.read_csv(os.path.join(os.getcwd(), 'data', 'agaricus-lepiota.data'),
                       header=None)
    # Keep only the poisonous mushrooms
    data = data.loc[data.iloc[:, 0] == 'p', 1:]
    # Reset the row index
    data.reset_index(drop=True, inplace=True)
    # One-hot encode the data
    data = pd.get_dummies(data)
    return data


if __name__ == '__main__':
    # Load the data
    data = get_data()
    # Print the shape of the data
    print(data.shape)
    # Peek at the data
    print(data.head())
    # Find frequent itemsets
    frequent_sets = apriori(data, min_support=0.7, use_colnames=True, max_len=2)
    # Generate association rules from the frequent itemsets
    rules = association_rules(frequent_sets, min_threshold=1)
    # Write the rules to Excel
    rules.to_excel('./data/rules.xlsx', index=False)
import numpy as np
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules
from numpy.testing import assert_raises as numpy_assert_raises

one_ary = np.array([[0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1],
                    [0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1],
                    [1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0],
                    [0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1],
                    [0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0]])

cols = ['Apple', 'Corn', 'Dill', 'Eggs', 'Ice cream', 'Kidney Beans',
        'Milk', 'Nutmeg', 'Onion', 'Unicorn', 'Yogurt']

df = pd.DataFrame(one_ary, columns=cols)
df_freq_items = apriori(df, min_support=0.6)
df_freq_items_with_colnames = apriori(df, min_support=0.6, use_colnames=True)

columns_ordered = ['antecedents', 'consequents',
                   'antecedent support', 'consequent support',
                   'support', 'confidence', 'lift', 'leverage', 'conviction']


def test_default():
    res_df = association_rules(df_freq_items)
    res_df['antecedents'] = res_df['antecedents'].apply(
        lambda x: str(frozenset(x)))
    res_df['consequents'] = res_df['consequents'].apply(
        lambda x: str(frozenset(x)))
# In[7]:

print(player_combo)

# In[8]:

# Creating the one-hot dataframe of transactions
te = TransactionEncoder()
te_ary = te.fit(player_combo).transform(player_combo)
match_df_freq = pd.DataFrame(te_ary, columns=te.columns_)

# In[9]:

# Define the minimum support and obtain the itemsets above it
# support = no. of matches containing the itemset / total no. of matches
match_sup = apriori(match_df_freq, min_support=0.1, use_colnames=True)
print(match_sup)

# In[10]:

# Generate association rules
rules = association_rules(match_sup, metric="lift", min_threshold=1)

# In[11]:

# Print the association rules
rules

# In[12]:

# Extract only the combinations that occurred in winning matches
def apriori_wrapper_low_memory(*args, **kwargs):
    return apriori(*args, **kwargs, low_memory=True)
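# A hypothetical usage sketch for the wrapper above: it forwards all arguments
# to apriori but pins low_memory=True, which trades some speed for a smaller
# memory footprint on wide one-hot inputs. `df` here stands in for any one-hot
# encoded frame like those in the surrounding examples.
freq = apriori_wrapper_low_memory(df, min_support=0.5, use_colnames=True)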
def get_apriori(self):
    frequent_itemsets = apriori(self.encode_categorical(),
                                min_support=0.07, use_colnames=True)
    rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1)
    return rules
def show_rules():
    def encode_text_dummy(df, name):
        dummies = pd.get_dummies(df[name])
        for x in dummies.columns:
            dummy_name = "{}-{}".format(name, x)
            df[dummy_name] = dummies[x]
        df.drop(name, axis=1, inplace=True)

    # Read the csv data in
    df = pd.read_csv('prescription_data.csv', sep=',', low_memory=False)

    # Create a subset with only the few columns used for association analysis
    # (.copy() so encode_text_dummy mutates an independent frame, not a view)
    data = df[['gender', 'specialty', 'settlement_type']].copy()
    encode_text_dummy(data, 'gender')
    encode_text_dummy(data, 'specialty')
    encode_text_dummy(data, 'settlement_type')
    #data.head()

    # Get frequent itemsets
    freq_items1 = apriori(data, min_support=0.009, use_colnames=True, verbose=1)
    #freq_items1

    # Get the rules
    rules1 = association_rules(freq_items1, metric="confidence", min_threshold=0.2)
    #rules1

    # Test 1 visualization
    plt.scatter(rules1['support'], rules1['confidence'], alpha=0.5)
    plt.xlabel('Support')
    plt.ylabel('Confidence')
    plt.title('Support vs Confidence')
    #plt.show()

    # Only grab the needed columns from the rule results
    rules1_results = rules1[['antecedents', 'consequents', 'confidence']]
    #rules1_results.head()
    #rules1_results['confidence'].values

    # Filter rules based on a relatively high confidence level - 90%
    results = rules1_results[rules1_results['confidence'].values >= .9]
    results1 = results['antecedents']
    antecedents = [list(x) for x in results1]
    length = len(antecedents)
    results2 = results['consequents']
    consequents = [list(x) for x in results2]
    confidence = results['confidence'].tolist()

    return render_template('analyze.html', antecedents=antecedents,
                           consequents=consequents, confidence=confidence,
                           length=length)


#if __name__ == '__main__':
#    app.run(debug=True, use_reloader=True)
#set FLASK_APP=app.py
#python -m flask run
# -*- coding: utf-8 -*-
"""
Created on Fri Jun  5 18:22:51 2020

@author: shashi
"""
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules

movie = pd.read_csv(
    'C:\\Users\\shashi\\Downloads\\DATA SCIENCE\\data science assignment\\assignment of association rules\\my_movies.csv'
)
movie

Freq_item = apriori(movie, min_support=0.005, max_len=3, use_colnames=True)
Freq_item.shape

# Most frequent itemsets on the basis of support
Freq_item.sort_values('support', ascending=False, inplace=True)

import matplotlib.pyplot as plt
plt.bar(x=list(range(1, 11)), height=Freq_item.support[1:11], color='rgmyk')
plt.xticks(list(range(1, 11)), Freq_item.itemsets[1:11])
plt.xlabel('item-sets')
plt.ylabel('support')

rules = association_rules(Freq_item, metric='lift', min_threshold=1)
rules.shape  # number of rules at 0.005 support = 124
rules.head(10)

# Checking with support value 0.010
Freq_item2 = apriori(movie, min_support=0.010, max_len=3, use_colnames=True)
Freq_item2.shape
te_ary = te.fit(dataset).transform(dataset)
df = pd.DataFrame(te_ary, columns=te.columns_)

# In[5]:

df

# In[6]:

from mlxtend.frequent_patterns import apriori

apriori(df, min_support=0.6)

# In[7]:

apriori(df, min_support=0.6, use_colnames=True)

# In[8]:

frequent_itemsets = apriori(df, min_support=0.6, use_colnames=True)
frequent_itemsets['length'] = frequent_itemsets['itemsets'].apply(lambda x: len(x))
frequent_itemsets
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules

df = pd.read_csv('apriori_data.csv')

apri_df = apriori(df, min_support=0.5, use_colnames=False,
                  max_len=None, verbose=0, low_memory=False)
rules = association_rules(apri_df, metric='confidence', min_threshold=0.3)
print(rules)
                      sep=',', names=str_cols, usecols=range(0, 4))

skst_cols = ['sku', 'store', 'cost', 'retail']
skstinfo = pd.read_csv('data/skstinfo.nosync.csv', names=skst_cols,
                       usecols=range(0, 4))

random.seed(308)
store_samp = random.sample(strinfo.store.tolist(), 5)
skstinfo = skstinfo[skstinfo.store.isin(store_samp)]

sku_pricing = skstinfo.drop('store', axis=1).groupby(['sku']).mean()
sku_pricing['margin'] = sku_pricing.retail - sku_pricing.cost

top10_df = top10_df.join(sku_pricing, how='left')
print("These are the 10 most commonly purchased items")
print(top10_df.filter(['pct', 'brand', 'cost', 'retail', 'margin']))
print("Dillards sells a lot of makeup!")

# Association rules begin here
freqItems = apriori(assoc_df, min_support=0.001, use_colnames=True)
assoc_rules = association_rules(freqItems, metric="lift", min_threshold=1)

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
print('The following ~100 SKUs should be grouped together on the salesfloor')
print(assoc_rules.sort_values(by='lift', ascending=False).iloc[0:100, ])
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules

df = pd.read_excel("data/Online Retail.xlsx")
df.columns

# transform to wide format
basket = (df[df['Country'] == "France"]
          .groupby(['InvoiceNo', 'Description'])['Quantity']
          .sum().unstack().reset_index().fillna(0).set_index('InvoiceNo'))


# encode to 0 or 1 (higher quantities not relevant)
def encode_units(x):
    if x <= 0:
        return 0
    if x >= 1:
        return 1


basket_sets = basket.applymap(encode_units)
basket_sets.drop('POSTAGE', inplace=True, axis=1)

# identify frequent itemsets
frequent_itemsets = apriori(basket_sets, min_support=0.02, use_colnames=True)

# get association rules
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1.2)
rules.sort_values(by='lift', ascending=False)
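# An equivalent, vectorized way to express the binarization that encode_units
# performs above (a sketch; identical behavior for the integer quantities in
# this basket, and it avoids the element-wise applymap):
basket_sets = (basket > 0).astype(int)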
import pandas as pd
import matplotlib.pyplot as plt
from mlxtend.frequent_patterns import apriori, association_rules

pd.set_option('display.max_columns', 12)

# ===========================================================================
# Business Problem: Association Rules on Book Data.
# ===========================================================================
book = pd.read_csv("book.csv")
book.shape
book.head()
book.isnull().sum()
book.info()
book.columns

# Applying the Apriori algorithm
Book_apr = apriori(book, min_support=0.015, max_len=4, use_colnames=True)

# Most frequent itemsets based on support (sorting)
Book_apr.sort_values('support', ascending=False, inplace=True)

# Graphical representation
plt.bar(x=list(range(0, 11)), height=Book_apr.support[0:11], color='rgmyk')
plt.xticks(list(range(0, 11)), Book_apr.itemsets[0:11], rotation=90)
plt.xlabel('item-sets')
plt.ylabel('support')
plt.subplots_adjust(bottom=0.3, top=0.99)  # customize the subplot layout

# Obtaining association rules
rules = association_rules(Book_apr, metric="lift", min_threshold=1)
rules.head(20)
rules.sort_values('lift', ascending=False, inplace=True)
dataset = [
    ['Milk', 'Onion', 'Nutmeg', 'Kidney Beans', 'Eggs', 'Yogurt'],
    ['Dill', 'Onion', 'Nutmeg', 'Kidney Beans', 'Eggs', 'Yogurt'],
    ['Milk', 'Apple', 'Kidney Beans', 'Eggs'],
    ['Milk', 'Unicorn', 'Corn', 'Kidney Beans', 'Yogurt'],
    ['Corn', 'Onion', 'Onion', 'Kidney Beans', 'Ice cream', 'Eggs']
]

import pandas as pd
from mlxtend.preprocessing import TransactionEncoder

te = TransactionEncoder()
te_ary = te.fit(dataset).transform(dataset)
df = pd.DataFrame(te_ary, columns=te.columns_)
print(df)

# apriori
from mlxtend.frequent_patterns import apriori

print(apriori(df, min_support=0.6))
print(apriori(df, min_support=0.6, use_colnames=True))
def get_frq_items(ip, min_support):
    return apriori(ip, min_support=min_support, use_colnames=True)
def get_apriori(self, bin_df):
    return apriori(bin_df, min_support=self.min_sup, use_colnames=True, max_len=None)
plt.bar(height=frequencies[:11], x=list(range(0, 11)), color='rgbkymc')
plt.xticks(list(range(0, 11)), items[:11])
plt.xlabel("items")
plt.ylabel("Count")

# Creating the data frame
groceries_series = pd.DataFrame(pd.Series(groceries_list))
groceries_series = groceries_series.iloc[:9835, :]  # removing the last empty transaction

# Name the column holding the data
groceries_series.columns = ["transactions"]

# Creating dummy variables in a binary matrix format
X = groceries_series['transactions'].str.join(sep='*').str.get_dummies(sep='*')

# Applying apriori
frequent_itemsets = apriori(X, min_support=0.005, max_len=3, use_colnames=True)
frequent_itemsets.shape  # (989, 2)

# Most frequent itemsets based on support
frequent_itemsets.sort_values('support', ascending=False, inplace=True)
plt.bar(x=list(range(1, 11)), height=frequent_itemsets.support[1:11], color='rgmyk')
plt.xticks(list(range(1, 11)), frequent_itemsets.itemsets[1:11])
plt.xlabel('item-sets')
plt.ylabel('support')

rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1)
rules.shape  # (2700, 9)
rules.head(20)
rules.sort_values('lift', ascending=False, inplace=True)
label="Q3", linestyles="dotted") plt.legend() plt.xlabel("Unique item counts") plt.ylabel("Frequency") plt.show() # The threshhold will be 75/9385 = 0.008 ListItem = Grocery_data.groupby(['Customer' ])['Item'].apply(list).values.tolist() te = TransactionEncoder() te_ary = te.fit(ListItem).transform(ListItem) ItemIndicator = pandas.DataFrame(te_ary, columns=te.columns_) # Median is taken as maximum itemset frequent_itemsets = apriori(ItemIndicator, min_support=75 / (len(n_customer)), max_len=3, use_colnames=True) #import ipdb;ipdb.set_trace() #print(frequent_itemsets) print( "The k-item sets which appeared in the market basket of at least seventy-five(75) customers are: \n", frequent_itemsets['itemsets']) print("\nThe number of itemsets found are:", len(frequent_itemsets['itemsets'])) # Itemsets found are 522 and maximum K = 3 as observed by itemsets column of frequent_itemsets dataframe # association rule for the frequent itemsets assoc_rules = association_rules( frequent_itemsets, metric='confidence', min_threshold=0.01) #default metric is confidence print(
order_raw_data.to_csv('類別組合貢獻_原始購買資料.csv', encoding='utf_8_sig')

# Purchase amount of the first two product categories
# (only counts the contribution of the first two products in each order)
cate_bind_rev = test3.groupby([0, 1])['real_item_amount'].sum()
cate_bind_rev.to_csv('類別組合貢獻_前兩項產品.csv', encoding='utf_8_sig')

### 5. Convert the transaction data into a form suitable for association rules ###
te = TransactionEncoder()
# `transactions` becomes a 2-D matrix with customers as rows and categories as columns
te_ary = te.fit(transactions).transform(transactions)
transactions_df = pd.DataFrame(te_ary, columns=te.columns_)

### 6. Association rules ###
frequent_itemsets_cate = apriori(transactions_df, min_support=0.001, use_colnames=True)
frequent_itemsets_cate['length'] = frequent_itemsets_cate['itemsets'].apply(lambda x: len(x))
len(frequent_itemsets_cate)

res_cate = association_rules(frequent_itemsets_cate, metric="confidence", min_threshold=0)
len(res_cate.index)
#final_freqsets['test_ante'] = tuple(final_freqsets['antecedents'])
te_ary = te.fit(ListItem).transform(ListItem)
trainData = pd.DataFrame(te_ary, columns=te.columns_)  # item list -> item indicator
print("Items list in sales receipt format: \n", ListItem)

# In[232]:

# print("2(d)")
totalTransactions = np.count_nonzero(itemperCustomer)
minSupport = 75 / totalTransactions
frequent_itemsets = apriori(trainData, min_support=minSupport, use_colnames=True)
print("Frequent itemset \n", frequent_itemsets)

# In[233]:

print("2(d)")
noOfItemset = frequent_itemsets.support.count()
print("Total number of itemset: ", noOfItemset, "\n")
print("The highest value of k in the itemset: 4")
artists = artists.drop(["url"], axis=1)
ua_artists = pd.merge(ua, artists, on="artistsID")
ua_artists = ua_artists.groupby('userID')['name'].apply(','.join).reset_index()

lista_artistas = artists["name"].tolist()

with open("../data/user_artists_boolean.csv", 'w', encoding="utf8") as dest:
    for index, row in ua_artists.iterrows():
        resp = "{}".format(row['userID'])
        for item in lista_artistas:
            if row['name'].find(item) != -1:
                resp += ",1"
            else:
                resp += ",0"
        resp += "\n"
        dest.write(resp)

a = pd.read_csv("../data/user_artists_boolean.csv", sep=",", header=0)
a = a.drop(["user"], axis=1)

frequent_itemsets = apriori(a, min_support=0.05, use_colnames=True)
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1)
rules.sort_values(['confidence', 'lift'],
                  ascending=[0, 0]).to_csv("../output/regras_supp_005.csv")
basket_encoded = basket_France.applymap(hot_encode)
basket_France = basket_encoded

basket_encoded = basket_Por.applymap(hot_encode)
basket_Por = basket_encoded

basket_encoded = basket_Sweden.applymap(hot_encode)
basket_Sweden = basket_encoded

# ### Building the models and analyzing the results

# #### a) France:

# In[30]:

# Building the model
frq_items = apriori(basket_France, min_support=0.05, use_colnames=True)

# Collecting the inferred rules in a dataframe
rules = association_rules(frq_items, metric="lift", min_threshold=1)
rules = rules.sort_values(['confidence', 'lift'], ascending=[False, False])
rules.head()

# From the above output, it can be seen that paper cups and paper plates are
# bought together in France. This is because the French have a culture of
# having a get-together with their friends and family at least once a week.
# Also, since the French government has banned the use of plastic in the
# country, people have to purchase paper-based alternatives.

# #### b) Portugal:

# In[31]:

# Building the model
frq_items = apriori(basket_Por, min_support=0.05, use_colnames=True)
axs.legend()
# Viewing the plots, a bin width of 4 is decided and the same is reported.
plt.show()

item_list = list(grouped_groceries_by_cust['Item'].apply(list))  # converts the data into item-list format

te = TransactionEncoder()
te_ary = te.fit(item_list).transform(item_list)  # converts to item indicator format
trainData = pd.DataFrame(te_ary, columns=te.columns_)

# Generates frequent itemsets for the given minimum support and maximum
# itemset length (the median unique item count, cast to int).
frequent_itemsets: DataFrame = apriori(
    trainData,
    min_support=75 / len(unique_customers),
    use_colnames=True,
    max_len=int(np.median(groceries_cust_item_count['Unique item count'].values))
)
print(f"\nTotal frequent itemsets: {frequent_itemsets['itemsets'].count()}")

max_len = 0
for itemset in frequent_itemsets['itemsets']:
    max_len = max(max_len, len(itemset))
print(f"Maximum length frequent itemset: {max_len}")

conf_itemset: DataFrame = association_rules(frequent_itemsets,
                                            metric='confidence',
                                            min_threshold=0.01)
print(
elif x == 'ศุกร์':
    fri = y / countfri  # AVG_FRIDAY
    #print('fri', int(fri))

# In[24]:

# Workdays dataframe set-up and running the apriori algorithm
col_name = ['รถติด', 'อุบัติเหตุ', 'ซ่อม', 'ฝนตก', 'วิภาวดี', 'แคราย',
            'สะพานพระนั่งเกล้า', 'รัตนาธิเบศ', 'พงษ์เพชร', 'บางเขน',
            'เกษตร', 'งามวงศ์วาน']

# Monday
df_mon = pd.read_csv('COLAB_MODEL_mon.csv')
df_mon.drop('nameDAY', inplace=True, axis=1)
df_mon.columns = col_name
df_mon.drop(['รถติด', 'อุบัติเหตุ', 'ซ่อม', 'ฝนตก'], inplace=True, axis=1)
frequent_itemsets_mon = apriori(df_mon, min_support=0.07, max_len=2, use_colnames=True)

# Tuesday
df_tue = pd.read_csv('COLAB_MODEL_tue.csv')
df_tue.columns = col_name
df_tue.drop(['รถติด', 'อุบัติเหตุ', 'ซ่อม', 'ฝนตก'], inplace=True, axis=1)
frequent_itemsets_tue = apriori(df_tue, min_support=0.07, max_len=2, use_colnames=True)

# Wednesday
df_wed = pd.read_csv('COLAB_MODEL_wed.csv')
df_wed.columns = col_name
df_wed.drop(['รถติด', 'อุบัติเหตุ', 'ซ่อม', 'ฝนตก'], inplace=True, axis=1)
frequent_itemsets_wed = apriori(df_wed, min_support=0.07, max_len=2, use_colnames=True)

# Thursday
df_thu = pd.read_csv('COLAB_MODEL_thu.csv')
def test_max_len():
    res_df1 = apriori(df)
    assert len(res_df1.iloc[-1, -1]) == 3

    res_df2 = apriori(df, max_len=2)
    assert len(res_df2.iloc[-1, -1]) == 2
plt.bar(height=frequencies[0:11], x=list(range(0, 11)), color='rgbkymc')
plt.xticks(list(range(0, 11)), items[0:11])
plt.xlabel("items")
plt.ylabel("Count")

# Creating a data frame for the transactions data.
# All lists are converted into Series objects so each list is treated as a
# single element rather than being split apart.
groceries_series = pd.DataFrame(pd.Series(groceries_list))
groceries_series = groceries_series.iloc[:9835, :]  # removing the last empty transaction
groceries_series.columns = ["transactions"]

# Creating a dummy column for each item in each transaction, using item names
# as the column names
X = groceries_series['transactions'].str.join(sep='*').str.get_dummies(sep='*')

frequent_itemsets = apriori(X, min_support=0.005, max_len=3, use_colnames=True)

# Most frequent itemsets based on support
frequent_itemsets.sort_values('support', ascending=False, inplace=True)
plt.bar(x=list(range(1, 11)), height=frequent_itemsets.support[1:11], color='rgmyk')
plt.xticks(list(range(1, 11)), frequent_itemsets.itemsets[1:11])
plt.xlabel('item-sets')
plt.ylabel('support')

rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1)
rules.head(20)
rules.sort_values('lift', ascending=False, inplace=True)


## To eliminate redundancy in rules ##
def to_list(i):
    return sorted(list(i))
import os
import pandas as pd
import numpy as np
from mlxtend.frequent_patterns import fpgrowth
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

# Question 1
data_gpa = pd.read_csv(
    os.path.join(os.path.dirname(__file__), 'specs/gpa_question1.csv'))
dataset_gpa = data_gpa.drop(columns='count')

# One-hot encode the categorical columns
data_ohe = pd.get_dummies(dataset_gpa)

frequent_itemsets = apriori(data_ohe, use_colnames=True, min_support=0.15)
frequent_itemsets = frequent_itemsets.sort_values(by='support', ascending=False)

if not os.path.exists(os.path.join(os.path.dirname(__file__), 'output')):
    os.makedirs(os.path.join(os.path.dirname(__file__), 'output'))
frequent_itemsets.to_csv(os.path.join(os.path.dirname(__file__),
                                      'output/question1_out_apriori.csv'),
                         index=False)

rules9_gpa = association_rules(frequent_itemsets, metric='confidence',
                               min_threshold=0.9)

if not os.path.exists(os.path.join(os.path.dirname(__file__), 'output')):
    os.makedirs(os.path.join(os.path.dirname(__file__), 'output'))
def get_frequent_itemset(self):
    df = self.get_oht_dataframe()
    frequent_itemsets = apriori(df, min_support=0.01, use_colnames=True)  # earlier 0.001
    return frequent_itemsets