Example #1
def test_itemsets_type():
    res_colindice = apriori(df, use_colnames=False)  # This is default behavior
    for i in res_colindice['itemsets']:
        assert isinstance(i, frozenset)

    res_colnames = apriori(df, use_colnames=True)
    for i in res_colnames['itemsets']:
        assert isinstance(i, frozenset)
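The `df` these tests run against is not shown in this snippet; a minimal fixture, taken from the setup code in Examples #15 and #18 below, would be:

import numpy as np
import pandas as pd

one_ary = np.array([[0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1],
                    [0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1],
                    [1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0],
                    [0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1],
                    [0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0]])
cols = ['Apple', 'Corn', 'Dill', 'Eggs', 'Ice cream', 'Kidney Beans', 'Milk',
        'Nutmeg', 'Onion', 'Unicorn', 'Yogurt']
df = pd.DataFrame(one_ary, columns=cols)  # one row per transaction, one 0/1 column per item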
Example #2
def test_sparsedataframe_notzero_column():
    dfs = pd.SparseDataFrame(df)
    dfs.columns = [i for i in range(len(dfs.columns))]
    apriori(dfs)

    dfs = pd.SparseDataFrame(df)
    dfs.columns = [i+1 for i in range(len(dfs.columns))]
    assert_raises(ValueError,
                  'Due to current limitations in Pandas, '
                  'if the SparseDataFrame has integer column names, '
                  'please make sure they either start '
                  'with `0` or cast them as string column names: '
                  '`df.columns = [str(i) for i in df.columns]`.',
                  apriori, dfs)
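The workaround named in the error message is to cast the integer column names to strings before calling apriori; a minimal sketch, assuming the same era-appropriate pandas that still ships SparseDataFrame:

dfs = pd.SparseDataFrame(df)
dfs.columns = [i + 1 for i in range(len(dfs.columns))]  # integer names not starting at 0
dfs.columns = [str(i) for i in dfs.columns]             # cast to strings, as the message suggests
apriori(dfs)                                            # no longer raises ValueError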
Example #3
def test_frozenset_selection():
    res_df = apriori(df, use_colnames=True)
    assert res_df.values.shape == (11, 2)
    assert res_df[res_df['itemsets']
                  == 'nothing'].values.shape == (0, 2)
    assert res_df[res_df['itemsets']
                  == {'Eggs', 'Kidney Beans'}].values.shape == (1, 2)
    assert res_df[res_df['itemsets']
                  == frozenset(('Eggs', 'Kidney Beans'))].values.shape\
        == (1, 2)
    assert res_df[res_df['itemsets']
                  == frozenset(('Kidney Beans', 'Eggs'))].values.shape\
        == (1, 2)
Example #4
def test_with_fill_values(fill_value):
    sdf = df.to_sparse(fill_value=fill_value)
    res_df = apriori(sdf, use_colnames=True)
    assert res_df.values.shape == (11, 2)
    assert res_df[res_df['itemsets']
                  == 'nothing'].values.shape == (0, 2)
    assert res_df[res_df['itemsets']
                  == {'Eggs', 'Kidney Beans'}].values.shape == (1, 2)
    assert res_df[res_df['itemsets']
                  == frozenset(('Eggs', 'Kidney Beans'))].values.shape \
        == (1, 2)
    assert res_df[res_df['itemsets']
                  == frozenset(('Kidney Beans', 'Eggs'))].values.shape \
        == (1, 2)
Example #5
def defectsContainsDataSet(self, param, head):
    basket_sets = (self.df[self.df[head].str.contains(param)]
                   .groupby(["DEFEC.", "INSPECCION"])["DEFEC."]
                   .count().unstack(level=0).fillna(0))
    if not basket_sets.empty:
        basket_sets = basket_sets.applymap(self.encode_units)
        # Apriori algorithm
        frequent_itemsets = apriori(basket_sets, min_support=0.05, use_colnames=True)
        rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1)
        # Add the elements affected by the rule
        return self.addNumberRules(rules, self.lenConData)
    else:
        return pd.DataFrame()
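Example #5 calls an `encode_units` helper that is not shown; a common definition, matching the one in the Online Retail snippet under Example #27 below, collapses counts to a 0/1 indicator:

def encode_units(x):
    # apriori only needs presence/absence, so any positive count maps to 1
    return 1 if x >= 1 else 0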
Example #6
def test_default():
    res_df = apriori(df)
    expect = pd.DataFrame([[0.8, np.array([3]), 1],
                           [1.0, np.array([5]), 1],
                           [0.6, np.array([6]), 1],
                           [0.6, np.array([8]), 1],
                           [0.6, np.array([10]), 1],
                           [0.8, np.array([3, 5]), 2],
                           [0.6, np.array([3, 8]), 2],
                           [0.6, np.array([5, 6]), 2],
                           [0.6, np.array([5, 8]), 2],
                           [0.6, np.array([5, 10]), 2],
                           [0.6, np.array([3, 5, 8]), 3]],
                          columns=['support', 'itemsets', 'length'])

    # iterating DataFrames yields column labels, so this compares the
    # column names of the two frames pairwise
    for a, b in zip(res_df, expect):
        assert_array_equal(a, b)
Example #7
def Apriori(Dataset_Encoded, min_support=0.05):
    # Build apriori model
    FrequentItems = apriori(Dataset_Encoded,
                            min_support=min_support,
                            use_colnames=True)
    return FrequentItems
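A quick usage check, assuming the small one-hot fixture sketched under Example #1:

FrequentItems = Apriori(df, min_support=0.6)
print(FrequentItems.sort_values('support', ascending=False))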
Example #8
# One hot
from mlxtend.preprocessing import TransactionEncoder

te = TransactionEncoder()  # transaction input: the X dataset
te_arry = te.fit(transactions).transform(
    transactions)  # fit() learns the unique labels in the dataset,
#   and transform() converts it into a boolean (True/False) array
#   in a format suitable for ML APIs

df_x = pd.DataFrame(
    te_arry,  # each row holds the set of booleans for one transaction
    columns=te.columns_)  # columns: all products, in alphabetical order

# Step 4: Train model using the Apriori algorithm
# ref = https://rasbt.github.io/mlxtend/api_subpackages/mlxtend.frequent_patterns/
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

df_sets = apriori(
    df_x,  # values are either 0/1 or True/False
    min_support=0.005,  # support = transactions containing the itemset / total transactions
    use_colnames=True)  # report itemsets by column name

df_rules = association_rules(
    df_sets,  # the frequent itemsets found by apriori
    metric='support',  # default metric is confidence; here the support formula is used
    min_threshold=0.005,  # 0.5%
    support_only=True)  # compute support only (ECLAT-style rules)

# if you use only "support", this is called "ECLAT"
Example #9
filtered = data.drop(['NONE'])
data = data.reset_index()
filtered = filtered.reset_index()
transaction_list = []

# For loop to create a list of the unique transactions throughout the dataset:
for i in filtered['Transaction'].unique():
    tlist = list(set(filtered[filtered['Transaction'] == i]['Item']))
    if len(tlist) > 0:
        transaction_list.append(tlist)

te = TransactionEncoder()
te_ary = te.fit(transaction_list).transform(transaction_list)
df2 = pd.DataFrame(te_ary, columns=te.columns_)

frequent_itemsets = apriori(df2, min_support=0.01, use_colnames=True)
# use a very low minimum confidence threshold
rules = association_rules(frequent_itemsets,
                          metric='confidence',
                          min_threshold=0.0001)

rules = rules.sort_values('confidence', ascending=False)

# now categorise every rule by confidence range
rules['support'] = rules['support'] * 100
rules['confidence'] = rules['confidence'] * 100
rules2 = rules[['antecedents', 'consequents', 'support', 'confidence']]

rules2 = rules2.sort_values('confidence', ascending=False)
bins = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
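The snippet is cut off after defining `bins`; presumably the intent was to bucket each rule's confidence percentage with pd.cut. A hypothetical continuation:

labels = ['0-10', '10-20', '20-30', '30-40', '40-50',
          '50-60', '60-70', '70-80', '80-90', '90-100']
rules2 = rules2.assign(confidence_range=pd.cut(rules2['confidence'], bins=bins, labels=labels))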
Example #10
# imports inferred from usage in this snippet
import os
import tkinter as tk
import pandas as pd
import matplotlib.pyplot as plt
from PIL import Image as IMG, ImageDraw
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori


def recommend():
    df = pd.read_csv('database.csv')
    df1 = df['Product'].apply(lambda x: x.split(','))
    te = TransactionEncoder()
    te_ary = te.fit(df1).transform(df1)
    df1 = pd.DataFrame(te_ary, columns=te.columns_).drop('', axis=1)
    ##    print(te.columns_)
    ##    print(df1)
    frequent_itemsets = apriori(df1, min_support=0.03, use_colnames=True)
    #sup=sum(frequent_itemsets['support'])*2/len(frequent_itemsets['support'])
    #frequent_itemsets = apriori(df1, min_support=sup, use_colnames=True)
    #print(frequent_itemsets)
    frequent_itemsets['length'] = frequent_itemsets['itemsets'].apply(
        lambda x: len(x))
    items = frequent_itemsets[(frequent_itemsets['length'] >= 2)
                              & (frequent_itemsets['support'] >= 0.04)]
    recP = items['itemsets']
    ##    win=tk.Tk()
    ##    win.title('Recommendations for you...')
    ##    win.geometry('500x500')
    ##    label=tk.Label(win,text='Recommended products for you...')
    ##    label.place(x=10,y=10)
    ##    listbox=tk.Listbox(win,relief='flat',width=50)
    ##    listbox.place(x=15,y=30)
    ##    for i in recP:
    ##        listbox.insert(tk.END,tuple(i))
    ##
    ##    win.mainloop()
    GP = pd.read_csv('price_list.csv')

    class SampleApp(tk.Tk):
        def __init__(self, *args, **kwargs):
            tk.Tk.__init__(self, *args, **kwargs)
            lb = tk.Listbox(self)
            for i in recP:
                lb.insert(tk.END, tuple(i))
            lb.bind("<Double-Button-1>", self.OnDouble)
            lb.pack(side="top", fill="both", expand=True)

        def OnDouble(self, event):
            widget = event.widget
            selection = widget.curselection()
            value = widget.get(selection[0])

            try:

                Rec_pric = (
                    GP.iloc[[list(GP['Product']).index(value[0])], :].values[0]
                    [1] +
                    GP.iloc[[list(GP['Product']).index(value[1])], :].values[0]
                    [1]) - (0.1 *
                            (GP.iloc[[list(GP['Product']).
                                      index(value[0])], :].values[0][1] +
                             GP.iloc[[list(GP['Product']).
                                      index(value[1])], :].values[0][1]))
                img = IMG.new('RGB', (60, 30), color=(0, 0, 0))
                d = ImageDraw.Draw(img)
                d.text((10, 10), "Rs." + str(Rec_pric), fill=(255, 255, 255))
                img.save('images/recommend_price.png')
                list_file = os.scandir('images')
                item_list = [i.name for i in iter(list_file)]
                first, second = '', ''

                for i in item_list:
                    if str(value[0]) == i[:len(str(value[0]))]:
                        first = i
                    if str(value[1]) == i[:len(str(value[1]))]:
                        second = i
                #plt.title('Rs.'+str(Rec_pric))
                for j in [first, second, 'recommend_price.png']:
                    plt.subplot(
                        1, 3,
                        [first, second, 'recommend_price.png'].index(j) + 1)
                    img = plt.imread('images/' + j)
                    plt.imshow(img)
                    plt.xlabel(j[:-4])
                    plt.xticks([])
                    plt.yticks([])
                    plt.autoscale()
                plt.show()
                #label=tk.Label(roo,text=str(value[0])+'+'+str(value[1])+' = Rs.'+str(int(Rec_pric)),font=('Tahoma',30),fg='white',bg='black')

            except Exception:
                roo = tk.Tk()
                roo.title('Offer for you...')
                label = tk.Label(roo,
                                 text='Something went wrong!!!',
                                 font=('Tahoma', 30),
                                 fg='white',
                                 bg='black')
                label.pack()
                roo.mainloop()

    if __name__ == "__main__":
        app = SampleApp()
        app.title('Recommended products')
        app.mainloop()
Example #11
import pandas as pd
import time
from mlxtend.frequent_patterns import apriori, association_rules
from wordcloud import WordCloud
from nltk.tokenize import word_tokenize
import matplotlib.pyplot as plt

# Load the data
data = pd.read_csv("Market_Basket_Optimisation.csv", header=None, sep='/')
start = time.time()
# print(data.head())
# One-hot encode (each distinct value of the feature gets its own column)
data_hot_encode = data.drop(columns=0).join(data[0].str.get_dummies(','))
pd.options.display.max_columns = 100
# print(data_hot_encode.head())
frequent_items = apriori(data_hot_encode, min_support=0.02, use_colnames=True)
rules = association_rules(frequent_items, metric='lift', min_threshold=0.5)
# Sort by lift in descending order
rules = rules.sort_values(by="lift", ascending=False)
print('Frequent itemsets:', frequent_items)
print('-' * 20, 'Association rules', '-' * 20)
print(rules)
end = time.time()
print('Elapsed time: %s' % (end - start))


# Remove stop words
def remove_stop_words(f):
    stop_words = ['Movie']
    for stop_word in stop_words:
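The helper is truncated above; a minimal completion, assuming `f` is the string to be cleaned, might be:

def remove_stop_words(f):
    stop_words = ['Movie']
    for stop_word in stop_words:
        f = f.replace(stop_word, '')  # strip each stop word from the text
    return f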
Example #12
dataset = [[int(n) for n in line.split()] for line in fin]


# In[3]:


te = TransactionEncoder()
te_ary = te.fit(dataset).transform(dataset, sparse=True)
sparse_df = pd.SparseDataFrame(te_ary, columns=te.columns_, default_fill_value=False)
sparse_df


# In[4]:


frequent_itemsets5 = apriori(sparse_df, min_support=0.5, use_colnames=True)
frequent_itemsets5


# In[5]:


frequent_itemsets1 = apriori(sparse_df, min_support=0.1, use_colnames=True)
frequent_itemsets1


# In[6]:


frequent_itemsets05 = apriori(sparse_df, min_support=0.05, use_colnames=True)
frequent_itemsets05
Example #13
    cell_value = cell_value.replace('"', "")
    cell_value = cell_value.replace(r'[', '')
    cell_value = cell_value.replace(r']', '')
    # Split each value apart on commas and append the result to the cell list
    cell_value_parsed = cell_value.split(',')
    all_cells_list.append(cell_value_parsed)
print(all_cells_list)

# Creation of the data frame based on the cell_list
oht = OnehotTransactions()
oht_ary = oht.fit(all_cells_list).transform(all_cells_list)
df = pd.DataFrame(oht_ary, columns=oht.columns_)
print(df)

# ----------------------------------------------------------------------------------------------------------------------
# Configuration for apriori algorithm
# ----------------------------------------------------------------------------------------------------------------------
# minimum support threshold
min_co = 0.3
# whether to report itemsets using the column names in the output
use_colnames_bool = True
# maximum itemset length (None = no limit)
max_len_value = None
frequent_itemsets = apriori(df,
                            min_support=min_co,
                            use_colnames=use_colnames_bool,
                            max_len=max_len_value)

frequent_itemsets.to_csv('dataAssociation.csv')
print(frequent_itemsets)
print("done")
Example #14
File: ar01.py  Project: mrok88/es
#            ['Dill', 'Onion', 'Nutmeg', 'Kidney Beans', 'Eggs', 'Yogurt'],
#            ['Milk', 'Apple', 'Kidney Beans', 'Eggs'],
#            ['Milk', 'Unicorn', 'Corn', 'Kidney Beans', 'Yogurt'],
#            ['Corn', 'Onion', 'Onion', 'Kidney Beans', 'Ice cream', 'Eggs']]
from datetime import timedelta, date
from es02 import es02 
dataset = []
if __name__ == "__main__":
    es = es02()
    es.set_service("display")
    es.load_datas2(date(2017,12,1),date(2018,1,8))
    dataset2 = es.dset

dataset = [item for item in dataset2 if len(item) > 1]
for item in dataset:
    print(item)

import pandas as pd
from mlxtend.preprocessing import OnehotTransactions

oht = OnehotTransactions()
oht_ary = oht.fit(dataset).transform(dataset)
df = pd.DataFrame(oht_ary, columns=oht.columns_)
from mlxtend.frequent_patterns import apriori
frequent_itemsets = apriori(df, min_support=0.3, use_colnames=True)
print(frequent_itemsets)

from mlxtend.frequent_patterns import association_rules
arule = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.7)
print(arule)
Example #15
import numpy as np
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules
from numpy.testing import assert_raises

one_ary = np.array([[0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1],
                    [0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1],
                    [1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0],
                    [0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1],
                    [0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0]])

cols = ['Apple', 'Corn', 'Dill', 'Eggs', 'Ice cream', 'Kidney Beans', 'Milk',
        'Nutmeg', 'Onion', 'Unicorn', 'Yogurt']

df = pd.DataFrame(one_ary, columns=cols)

df_freq_items = apriori(df, min_support=0.6)

# note: 'antecedants' (sic) is the column spelling used by mlxtend releases before v0.12.0
columns_ordered = ['antecedants', 'consequents',
                   'antecedent support', 'consequent support',
                   'support',
                   'confidence', 'lift', 'leverage', 'conviction']


def test_default():
    res_df = association_rules(df_freq_items)
    res_df['antecedants'] = res_df['antecedants'].apply(
        lambda x: str(frozenset(x)))
    res_df['consequents'] = res_df['consequents'].apply(
        lambda x: str(frozenset(x)))
    res_df.sort_values(columns_ordered, inplace=True)
    res_df.reset_index(inplace=True, drop=True)
Example #16
# -*- coding: utf-8 -*-
"""
Created on Sat Aug  1 16:21:11 2020

@author: bibiboom
"""



import pandas as pd
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

data = pd.read_csv('dingdanbiao2.csv', encoding='gbk')
data = data[["a","b","c","d","e","f","g","h","i","j","k","l","m","n","o","p","q"]]
data = data.drop(columns=['a','b','c','d','e','g','h','i','j','k','m','n','o','p','q'])
data_HE = data.drop(columns='f').join(data.f.str.get_dummies(sep='|'))
#print(data_HE)
#data_HE.to_csv("data_HE.csv", index=False)
data_HE.set_index(['l'], inplace=True)
data_HE = data_HE.sort_values(by="l", ascending=True)
data_HE = data_HE.groupby(['l']).agg(['max'])
#print(data_HE)
itemsets = apriori(data_HE, use_colnames=True, min_support=0.05)
itemsets = itemsets.sort_values(by="support", ascending=False)
print(itemsets)
rules = association_rules(itemsets, metric='lift', min_threshold=1)
rules = rules.sort_values(by="lift", ascending=False)
rules.to_csv('Project B.csv')
print(rules)
Example #17
def get_data():
    # Read the data
    data = pd.read_csv(os.path.join(os.getcwd(), 'data',
                                    'agaricus-lepiota.data'),
                       header=None)
    # Keep only the poisonous mushrooms
    data = data.loc[data.iloc[:, 0] == 'p', 1:]
    # Reset the row index
    data.reset_index(drop=True, inplace=True)
    # One-hot encode the data
    data = pd.get_dummies(data)
    return data


if __name__ == '__main__':
    # Load the data
    data = get_data()
    # Print the data's shape
    print(data.shape)
    # Peek at the data
    print(data.head())
    # Find the frequent itemsets
    frequent_sets = apriori(data,
                            min_support=0.7,
                            use_colnames=True,
                            max_len=2)
    # Generate association rules from the frequent itemsets
    rules = association_rules(frequent_sets, min_threshold=1)
    # Write to Excel
    rules.to_excel('./data/rules.xlsx', index=False)
Example #18
import numpy as np
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules
from numpy.testing import assert_raises as numpy_assert_raises

one_ary = np.array([[0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1],
                    [0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1],
                    [1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0],
                    [0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1],
                    [0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0]])

cols = ['Apple', 'Corn', 'Dill', 'Eggs', 'Ice cream', 'Kidney Beans', 'Milk',
        'Nutmeg', 'Onion', 'Unicorn', 'Yogurt']

df = pd.DataFrame(one_ary, columns=cols)

df_freq_items = apriori(df, min_support=0.6)

df_freq_items_with_colnames = apriori(df, min_support=0.6, use_colnames=True)

columns_ordered = ['antecedents', 'consequents',
                   'antecedent support', 'consequent support',
                   'support',
                   'confidence', 'lift', 'leverage', 'conviction']


def test_default():
    res_df = association_rules(df_freq_items)
    res_df['antecedents'] = res_df['antecedents'].apply(
        lambda x: str(frozenset(x)))
    res_df['consequents'] = res_df['consequents'].apply(
        lambda x: str(frozenset(x)))
Example #19
# In[7]:

print(player_combo)

# In[8]:

#Creating the dataframe of frequent itemsets
te = TransactionEncoder()
te_ary = te.fit(player_combo).transform(player_combo)
match_df_freq = pd.DataFrame(te_ary, columns=te.columns_)

# In[9]:

#Define the minimum support and obtain the itemsets with at least that support
#support = no. of times the itemset occurred / total no. of matches
match_sup = apriori(match_df_freq, min_support=0.1, use_colnames=True)
print(match_sup)
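As a worked instance of the support formula in the comment above: an itemset appearing in 12 of 100 matches has support 12/100 = 0.12, which clears the min_support of 0.1 used here.

occurrences, total_matches = 12, 100   # hypothetical counts
support = occurrences / total_matches  # 0.12 >= 0.1, so the itemset is kept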

# In[10]:

#generate association rules
rules = association_rules(match_sup, metric="lift", min_threshold=1)

# In[11]:

#print the association rules
rules

# In[12]:

#extract only the combinations that occurred in a winning match
Example #20
def apriori_wrapper_low_memory(*args, **kwargs):
    return apriori(*args, **kwargs, low_memory=True)
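The wrapper just pins mlxtend's low_memory flag, which processes candidate itemsets with a smaller memory footprint at some speed cost. A usage sketch, assuming the one-hot fixture from Example #1:

frequent = apriori_wrapper_low_memory(df, min_support=0.6, use_colnames=True)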
Example #21
def get_apriori(self):
    frequent_itemsets = apriori(self.encode_categorical(), min_support=0.07, use_colnames=True)
    rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1)
    return rules
Example #22
def show_rules():
    def encode_text_dummy(df, name):
        dummies = pd.get_dummies(df[name])
        for x in dummies.columns:
            dummy_name = "{}-{}".format(name, x)
            df[dummy_name] = dummies[x]
        df.drop(name, axis=1, inplace=True)

    # Read data csv in
    df = pd.read_csv('prescription_data.csv', sep=',', low_memory=False)
    # Create subset of data with only a few columns used for association analysis
    data = df[['gender', 'specialty', 'settlement_type']].copy()  # .copy() avoids SettingWithCopyWarning
    encode_text_dummy(data, 'gender')
    encode_text_dummy(data, 'specialty')
    encode_text_dummy(data, 'settlement_type')
    #data.head()
    # Get frequent itemsets
    freq_items1 = apriori(data,
                          min_support=0.009,
                          use_colnames=True,
                          verbose=1)
    freq_items1
    # Get the rules
    rules1 = association_rules(freq_items1,
                               metric="confidence",
                               min_threshold=0.2)
    #rules1
    #Test 1 Visualization
    plt.scatter(rules1['support'], rules1['confidence'], alpha=0.5)
    plt.xlabel('Support')
    plt.ylabel('Confidence')
    plt.title('Support vs Confidence')
    #plt.show()
    # Only grab needed columns from rule results
    rules1_results = rules1[['antecedents', 'consequents', 'confidence']]
    #rules1_results.head()
    #rules1_results['confidence'].values
    # Filter rules based on a relatively high confidence level - 90%
    results = rules1_results[rules1_results['confidence'].values >= .9]

    results1 = results['antecedents']

    antecedents = ([list(x) for x in results1])
    length = len(antecedents)

    results2 = results['consequents']

    consequents = ([list(x) for x in results2])

    confidence = results['confidence'].tolist()

    return render_template('analyze.html',
                           antecedents=antecedents,
                           consequents=consequents,
                           confidence=confidence,
                           length=length)


#if __name__ == '__main__':
#   app.run(debug=True, use_reloader=True)

#set FLASK_APP=app.py     python -m flask run
Example #23
# -*- coding: utf-8 -*-
"""
Created on Fri Jun  5 18:22:51 2020

@author: shashi
"""
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules
movie = pd.read_csv(
    'C:\\Users\\shashi\\Downloads\\DATA SCIENCE\\data science assignment\\assignment of association rules\\my_movies.csv'
)
movie

Freq_item = apriori(movie, min_support=0.005, max_len=3, use_colnames=True)
Freq_item.shape
# most freq item on basis of support

Freq_item.sort_values('support', ascending=False, inplace=True)
import matplotlib.pyplot as plt
plt.bar(x=list(range(1, 11)), height=Freq_item.support[1:11], color='rgmyk')
plt.xticks(list(range(1, 11)), Freq_item.itemsets[1:11])
plt.xlabel('item-sets')
plt.ylabel('support')

rules = association_rules(Freq_item, metric='lift', min_threshold=1)
rules.shape
#number of rules at 0.005 support = 124
rules.head(10)
#checking with support value 0.010
Freq_item2 = apriori(movie, min_support=0.010, max_len=3, use_colnames=True)
Freq_item2.shape
Example #24
te_ary = te.fit(dataset).transform(dataset)
df = pd.DataFrame(te_ary, columns=te.columns_)


# In[5]:


df


# In[6]:


from mlxtend.frequent_patterns import apriori

apriori(df, min_support=0.6)


# In[7]:


apriori(df, min_support=0.6, use_colnames=True)


# In[8]:


frequent_itemsets = apriori(df, min_support=0.6, use_colnames=True)
frequent_itemsets['length'] = frequent_itemsets['itemsets'].apply(lambda x: len(x))
frequent_itemsets
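The derived length column makes itemset-size filtering a one-liner; for instance, keeping only the 2-itemsets with support of at least 0.8 (the same pattern Example #10 uses):

frequent_itemsets[(frequent_itemsets['length'] == 2) &
                  (frequent_itemsets['support'] >= 0.8)]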
Example #25
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules

df = pd.read_csv('apriori_data.csv')
apri_df = apriori(df,
                  min_support=0.5,
                  use_colnames=False,
                  max_len=None,
                  verbose=0,
                  low_memory=False)
rules = association_rules(apri_df, metric='confidence', min_threshold=0.3)
print(rules)
Example #26
                      sep=',',
                      names=str_cols,
                      usecols=range(0, 4))

skst_cols = ['sku', 'store', 'cost', 'retail']
skstinfo = pd.read_csv('data/skstinfo.nosync.csv',
                       names=skst_cols,
                       usecols=range(0, 4))

random.seed(308)
store_samp = random.sample(strinfo.store.tolist(), 5)

skstinfo = skstinfo[skstinfo.store.isin(store_samp)]

sku_pricing = skstinfo.drop('store', axis=1).groupby(['sku']).mean()
sku_pricing['margin'] = sku_pricing.retail - sku_pricing.cost
top10_df = top10_df.join(sku_pricing, how='left')

print("These are the 10 most commonly purchased items")
print(top10_df.filter(['pct', 'brand', 'cost', 'retail', 'margin']))
print("Dillards sells a lot of makeup!")

# association rules begin here

freqItems = apriori(assoc_df, min_support=0.001, use_colnames=True)
assoc_rules = association_rules(freqItems, metric="lift", min_threshold=1)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
print('The following ~100 SKUs should be grouped together on the salesfloor')
print(assoc_rules.sort_values(by='lift', ascending=False).iloc[0:100, ])
Example #27
from mlxtend.frequent_patterns import association_rules

df = pd.read_excel("data/Online Retail.xlsx")

df.columns

# transform to wide format
basket = (df[df['Country'] == "France"].groupby([
    'InvoiceNo', 'Description'
])['Quantity'].sum().unstack().reset_index().fillna(0).set_index('InvoiceNo'))


# encode to 0 or 1 (higher quantities not relevant)
def encode_units(x):
    if x <= 0:
        return 0
    if x >= 1:
        return 1


basket_sets = basket.applymap(encode_units)
basket_sets.drop('POSTAGE', inplace=True, axis=1)

# identify frequent itemsets
frequent_itemsets = apriori(basket_sets, min_support=0.02, use_colnames=True)

# get association rules
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1.2)

rules.sort_values(by='lift', ascending=False)
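The sorted rules can then be narrowed to the strongest ones; a typical follow-up filter on lift and confidence (thresholds here are illustrative, not from the original):

rules[(rules['lift'] >= 6) &
      (rules['confidence'] >= 0.8)]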
Example #28
from mlxtend.frequent_patterns import apriori, association_rules
pd.set_option('display.max_columns', 12)

# ===================================================================================
# Business Problem :- Association Rules on Book Data.
# ===================================================================================

book = pd.read_csv("book.csv")
book.shape
book.head()
book.isnull().sum()
book.info()
book.columns

# Applying the Apriori algorithm
Book_apr = apriori(book, min_support=0.015, max_len=4, use_colnames=True)

# Most Frequent item sets based on support (Sorting)
Book_apr.sort_values('support', ascending=False, inplace=True)

# Graphical Representation
plt.bar(x=list(range(0, 11)), height=Book_apr.support[0:11], color='rgmyk')
plt.xticks(list(range(0, 11)), Book_apr.itemsets[0:11], rotation=90)
plt.xlabel('item-sets')
plt.ylabel('support')
plt.subplots_adjust(bottom=0.3, top=0.99)  # Custom the subplot layout

# Obtaining Association rules
rules = association_rules(Book_apr, metric="lift", min_threshold=1)
rules.head(20)
rules.sort_values('lift', ascending=False, inplace=True)
Example #29
dataset = [
    ['Milk', 'Onion', 'Nutmeg', 'Kidney Beans', 'Eggs', 'Yogurt'],
    ['Dill', 'Onion', 'Nutmeg', 'Kidney Beans', 'Eggs', 'Yogurt'],
    ['Milk', 'Apple', 'Kidney Beans', 'Eggs'],
    ['Milk', 'Unicorn', 'Corn', 'Kidney Beans', 'Yogurt'],
    ['Corn', 'Onion', 'Onion', 'Kidney Beans', 'Ice cream', 'Eggs']
]

import pandas as pd
from mlxtend.preprocessing import TransactionEncoder

te = TransactionEncoder()
te_ary = te.fit(dataset).transform(dataset)
df = pd.DataFrame(te_ary, columns=te.columns_)
print(df)

# apriori
from mlxtend.frequent_patterns import apriori
print(apriori(df, min_support=0.6))
print(apriori(df, min_support=0.6, use_colnames=True))
Example #30
def get_frq_items(ip, min_support):
    return apriori(ip, min_support, use_colnames=True)
Example #31
def get_apriori(self, bin_df):
    return apriori(bin_df, min_support=self.min_sup, use_colnames=True, max_len=None)
Example #32
plt.bar(height=frequencies[:11], x=list(range(0, 11)), color='rgbkymc')
plt.xticks(list(range(0, 11)), items[:11])
plt.xlabel("items")
plt.ylabel("Count")


# Creating Data Frame 

groceries_series  = pd.DataFrame(pd.Series(groceries_list))
groceries_series = groceries_series.iloc[:9835, :]  # removing the last empty transaction
# Next, name the column holding the transaction data
groceries_series.columns = ["transactions"]

# creating a dummy variable or in a binary matrix format
X = groceries_series['transactions'].str.join(sep='*').str.get_dummies(sep='*')
# applying apriori
frequent_itemsets = apriori(X, min_support=0.005, max_len=3, use_colnames=True)
frequent_itemsets.shape
#(989, 2)

# Most Frequent item sets based on support 
frequent_itemsets.sort_values('support', ascending=False, inplace=True)
plt.bar(x=list(range(1, 11)), height=frequent_itemsets.support[1:11], color='rgmyk')
plt.xticks(list(range(1, 11)), frequent_itemsets.itemsets[1:11])
plt.xlabel('item-sets')
plt.ylabel('support')

rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1)
rules.shape
#(2700, 9)

rules.head(20)
rules.sort_values('lift', ascending=False, inplace=True)
Example #33
           label="Q3",
           linestyles="dotted")
plt.legend()
plt.xlabel("Unique item counts")
plt.ylabel("Frequency")
plt.show()

# The threshold will be 75/9385 ≈ 0.008
ListItem = Grocery_data.groupby(['Customer'
                                 ])['Item'].apply(list).values.tolist()
te = TransactionEncoder()
te_ary = te.fit(ListItem).transform(ListItem)
ItemIndicator = pandas.DataFrame(te_ary, columns=te.columns_)
# The median unique-item count is taken as the maximum itemset length
frequent_itemsets = apriori(ItemIndicator,
                            min_support=75 / (len(n_customer)),
                            max_len=3,
                            use_colnames=True)
#import ipdb;ipdb.set_trace()
#print(frequent_itemsets)
print(
    "The k-item sets which appeared in the market basket of at least seventy-five(75) customers are: \n",
    frequent_itemsets['itemsets'])
print("\nThe number of itemsets found are:",
      len(frequent_itemsets['itemsets']))
# Itemsets found are 522 and maximum K = 3 as observed by itemsets column of frequent_itemsets dataframe

# association rule for the frequent itemsets
assoc_rules = association_rules(
    frequent_itemsets, metric='confidence',
    min_threshold=0.01)  #default metric is confidence
print(
Example #34
order_raw_data.to_csv('類別組合貢獻_原始購買資料.csv', encoding='utf_8_sig')

# Purchase amount of the first two product categories (only the contribution of
# an order's first two items is counted)
cate_bind_rev = test3.groupby([0, 1])['real_item_amount'].sum()
cate_bind_rev.to_csv('類別組合貢獻_前兩項產品.csv', encoding='utf_8_sig')

### 5. Reshape the transaction data for association-rule mining ###
te = TransactionEncoder()
te_ary = te.fit(transactions).transform(transactions)

# transactions becomes a 2-D matrix: customers as rows, categories as columns
transactions_df = pd.DataFrame(te_ary, columns=te.columns_)

### 6. Association rules ###
frequent_itemsets_cate = apriori(transactions_df,
                                 min_support=0.001,
                                 use_colnames=True)

frequent_itemsets_cate['length'] = frequent_itemsets_cate['itemsets'].apply(lambda x: len(x))
len(frequent_itemsets_cate)

res_cate = association_rules(frequent_itemsets_cate,
                             metric="confidence",
                             min_threshold=0)

len(res_cate.index)

#final_freqsets['test_ante'] = tuple(final_freqsets['antecedents'])

Example #35
te_ary = te.fit(ListItem).transform(ListItem)
trainData = pd.DataFrame(te_ary, columns=te.columns_) # Item List -> Item Indicator

print("Items list in sales receipt format: \n",ListItem)


# In[232]:


#
print("2(d)")
totalTransactions = np.count_nonzero(itemperCustomer)

minSupport = 75 / totalTransactions

frequent_itemsets = apriori(trainData, min_support=minSupport, use_colnames=True)

print("Frequent itemset \n",frequent_itemsets)


# In[233]:


print("2(d)")
noOfItemset = frequent_itemsets.support.count()

print("Total number of itemset: ",noOfItemset,"\n")

print("The highest value of k in the itemset: 4")

Example #36
    artists = artists.drop(["url"], axis=1)
    ua_artists = pd.merge(ua, artists, on="artistsID")

    ua_artists = ua_artists.groupby('userID')['name'].apply(
        ','.join).reset_index()
    lista_artistas = artists["name"].tolist()
    with open("../data/user_artists_boolean.csv", 'w',
              encoding="utf8") as dest:
        for index, row in ua_artists.iterrows():
            resp = "{}".format(row['userID'])
            for item in lista_artistas:
                if row['name'].find(item) != -1:
                    resp += ",1"
                else:
                    resp += ",0"
            resp += "\n"
            dest.write(resp)


a = pd.read_csv(
    "../data/user_artists_boolean.csv",
    sep=",",
    header=0,
)
a = a.drop(["user"], axis=1)
frequent_itemsets = apriori(a, min_support=0.05, use_colnames=True)
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1)
rules.sort_values(['confidence', 'lift'],
                  ascending=[0, 0]).to_csv("../output/regras_supp_005.csv")
Example #37
basket_encoded = basket_France.applymap(hot_encode)
basket_France = basket_encoded

basket_encoded = basket_Por.applymap(hot_encode)
basket_Por = basket_encoded

basket_encoded = basket_Sweden.applymap(hot_encode)
basket_Sweden = basket_encoded

# ### Building the models and analyzing the results
# #### a) France:

# In[30]:

# Building the model
frq_items = apriori(basket_France, min_support=0.05, use_colnames=True)

# Collecting the inferred rules in a dataframe
rules = association_rules(frq_items, metric="lift", min_threshold=1)
rules = rules.sort_values(['confidence', 'lift'], ascending=[False, False])
rules.head()

# From the above output, it can be seen that paper cups and paper plates are bought together in France. This is because the French have a culture of having a get-together with their friends and family at least once a week. Also, since the French government has banned the use of plastic in the country, people have to purchase paper-based alternatives.

# #### b) Portugal:

# In[31]:

# Building the model
frq_items = apriori(basket_Por, min_support=0.05, use_colnames=True)
Example #38
    axs.legend()

# Viewing the plots, a bin width of 4 is chosen and reported.
plt.show()

item_list = list(grouped_groceries_by_cust['Item'].apply(
    list))  # converts the data into Item list format.

te = TransactionEncoder()
te_ary = te.fit(item_list).transform(
    item_list)  # converts to item indicator format.
trainData = pd.DataFrame(te_ary, columns=te.columns_)

frequent_itemsets: DataFrame = apriori(
    trainData,
    min_support=75 / len(unique_customers),
    use_colnames=True,
    max_len=np.median(groceries_cust_item_count['Unique item count'].values)
)  # generates frequent itemset against min support and max length itemset provided.

print(f"\nTotal frequent itemsets: {frequent_itemsets['itemsets'].count()}")

max_len = 0
for itemset in frequent_itemsets['itemsets']:
    max_len = max(max_len, len(itemset))

print(f"Maximum length frequent itemset: {max_len}")

conf_itemset: DataFrame = association_rules(frequent_itemsets,
                                            metric='confidence',
                                            min_threshold=0.01)
print(
Example #39
    elif x == 'ศุกร์':  # Thai for 'Friday'
        fri = y/countfri  #AVG_FRIDAY
        #print('fri' ,int(fri))


# In[24]:


#Workdays dataframe set up & working on apriori algo
col_name = ['รถติด','อุบัติเหตุ','ซ่อม','ฝนตก','วิภาวดี','แคราย','สะพานพระนั่งเกล้า','รัตนาธิเบศ','พงษ์เพชร','บางเขน','เกษตร','งามวงศ์วาน']
# (Thai: traffic jam, accident, roadwork, rain, followed by eight road/area names)
#Monday
df_mon = pd.read_csv('COLAB_MODEL_mon.csv')
df_mon.drop('nameDAY',inplace=True,axis=1)
df_mon.columns = col_name
df_mon.drop(['รถติด','อุบัติเหตุ','ซ่อม','ฝนตก'],inplace=True,axis=1)
frequent_itemsets_mon = apriori(df_mon, min_support=0.07, max_len=2, use_colnames=True)

#Tuesday
df_tue = pd.read_csv('COLAB_MODEL_tue.csv')
df_tue.columns = col_name
df_tue.drop(['รถติด','อุบัติเหตุ','ซ่อม','ฝนตก'],inplace=True,axis=1)
frequent_itemsets_tue = apriori(df_tue, min_support=0.07,max_len=2, use_colnames=True)

#Wednesday
df_wed = pd.read_csv('COLAB_MODEL_wed.csv')
df_wed.columns = col_name
df_wed.drop(['รถติด','อุบัติเหตุ','ซ่อม','ฝนตก'],inplace=True,axis=1)
frequent_itemsets_wed = apriori(df_wed, min_support=0.07,max_len=2 , use_colnames=True)

#Thursday
df_thu = pd.read_csv('COLAB_MODEL_thu.csv')
Example #40
def test_max_len():
    res_df1 = apriori(df)
    assert len(res_df1.iloc[-1, -1]) == 3

    res_df2 = apriori(df, max_len=2)
    assert len(res_df2.iloc[-1, -1]) == 2
Example #41
plt.bar(height=frequencies[0:11], x=list(range(0, 11)), color='rgbkymc')
plt.xticks(list(range(0, 11)), items[0:11])
plt.xlabel("items")
plt.ylabel("Count")


# Creating Data Frame for the transactions data 

# Convert the lists into a Series so each transaction list is treated as a single element
groceries_series  = pd.DataFrame(pd.Series(groceries_list))
groceries_series = groceries_series.iloc[:9835,:] # removing the last empty transaction

groceries_series.columns = ["transactions"]


# create a dummy column for each item in each transaction, using item names as column names
X = groceries_series['transactions'].str.join(sep='*').str.get_dummies(sep='*')
frequent_itemsets = apriori(X, min_support=0.005, max_len=3, use_colnames=True)

# Most Frequent item sets based on support 
frequent_itemsets.sort_values('support',ascending = False,inplace=True)
plt.bar(x=list(range(1, 11)), height=frequent_itemsets.support[1:11], color='rgmyk')
plt.xticks(list(range(1, 11)), frequent_itemsets.itemsets[1:11])
plt.xlabel('item-sets')
plt.ylabel('support')


rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1)
rules.head(20)
rules.sort_values('lift',ascending = False,inplace=True)

## To eliminate redundancy in the rules ##
def to_list(i):
    return (sorted(list(i)))

Example #42
import os
import pandas as pd
import numpy as np
from mlxtend.frequent_patterns import fpgrowth
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

# Question 1
data_gpa = pd.read_csv(
    os.path.join(os.path.dirname(__file__), 'specs/gpa_question1.csv'))
dataset_gpa = data_gpa.drop(columns='count')

data_ohe = pd.get_dummies(dataset_gpa)
frequent_itemsets = apriori(data_ohe, use_colnames=True, min_support=0.15)
frequent_itemsets = frequent_itemsets.sort_values(by='support',
                                                  ascending=False)

if not os.path.exists(os.path.join(os.path.dirname(__file__), 'output')):
    os.makedirs(os.path.join(os.path.dirname(__file__), 'output'))
frequent_itemsets.to_csv(os.path.join(os.path.dirname(__file__),
                                      'output/question1_out_apriori.csv'),
                         index=False)

rules9_gpa = association_rules(frequent_itemsets,
                               metric='confidence',
                               min_threshold=0.9)
if not os.path.exists(os.path.join(os.path.dirname(__file__), 'output')):
    os.makedirs(os.path.join(os.path.dirname(__file__), 'output'))
Example #43
def get_frequent_itemset(self):
    df = self.get_oht_dataframe()
    frequent_itemsets = apriori(df, min_support=0.01, use_colnames=True)  # earlier 0.001
    return frequent_itemsets