Example 1
def test_lift():
    res_df = association_rules(df_freq_items,
                               min_threshold=1.1,
                               metric='lift')
    assert res_df.values.shape[0] == 6

    res_df = association_rules(df_freq_items_with_colnames,
                               min_threshold=1.1,
                               metric='lift')
    assert res_df.values.shape[0] == 6
Example 2
def test_confidence():
    res_df = association_rules(df_freq_items,
                               min_threshold=0.8,
                               metric='confidence')
    assert res_df.values.shape[0] == 9

    res_df = association_rules(df_freq_items_with_colnames,
                               min_threshold=0.8,
                               metric='confidence')
    assert res_df.values.shape[0] == 9
Example 3
def test_conviction():
    res_df = association_rules(df_freq_items,
                               min_threshold=1.5,
                               metric='conviction')
    assert res_df.values.shape[0] == 11

    res_df = association_rules(df_freq_items_with_colnames,
                               min_threshold=1.5,
                               metric='conviction')
    assert res_df.values.shape[0] == 11
Example 4
def test_default():
    res_df = association_rules(df_freq_items)
    res_df['antecedents'] = res_df['antecedents'].apply(
        lambda x: str(frozenset(x)))
    res_df['consequents'] = res_df['consequents'].apply(
        lambda x: str(frozenset(x)))
    res_df.sort_values(columns_ordered, inplace=True)
    res_df.reset_index(inplace=True, drop=True)

    expect = pd.DataFrame([
        [(8,), (5,), 0.6, 1.0, 0.6, 1.0, 1.0, 0.0, np.inf],
        [(6,), (5,), 0.6, 1.0, 0.6, 1.0, 1.0, 0.0, np.inf],
        [(8, 3), (5,), 0.6, 1.0, 0.6, 1.0, 1.0, 0.0, np.inf],
        [(8, 5), (3,), 0.6, 0.8, 0.6, 1.0, 1.25, 0.12, np.inf],
        [(8,), (3, 5), 0.6, 0.8, 0.6, 1.0, 1.25, 0.12, np.inf],
        [(3,), (5,), 0.8, 1.0, 0.8, 1.0, 1.0, 0.0, np.inf],
        [(5,), (3,), 1.0, 0.8, 0.8, 0.8, 1.0, 0.0, 1.0],
        [(10,), (5,), 0.6, 1.0, 0.6, 1.0, 1.0, 0.0, np.inf],
        [(8,), (3,), 0.6, 0.8, 0.6, 1.0, 1.25, 0.12, np.inf]],
        columns=columns_ordered
    )

    expect['antecedents'] = expect['antecedents'].apply(
        lambda x: str(frozenset(x)))
    expect['consequents'] = expect['consequents'].apply(
        lambda x: str(frozenset(x)))
    expect.sort_values(columns_ordered, inplace=True)
    expect.reset_index(inplace=True, drop=True)

    assert res_df.equals(expect), res_df
Example 5
def test_override_metric_with_support():

    res_df = association_rules(df_freq_items_with_colnames,
                               min_threshold=0.8)
    # default metric is confidence
    assert res_df.values.shape[0] == 9

    res_df = association_rules(df_freq_items_with_colnames,
                               min_threshold=0.8,
                               metric='support')
    assert res_df.values.shape[0] == 2

    res_df = association_rules(df_freq_items_with_colnames,
                               min_threshold=0.8,
                               support_only=True)
    assert res_df.values.shape[0] == 2
Example 6
def test_datatypes():
    res_df = association_rules(df_freq_items)
    for i in res_df['antecedents']:
        assert isinstance(i, frozenset) is True

    for i in res_df['consequents']:
        assert isinstance(i, frozenset) is True

    # cast itemset-containing dataframe to set and
    # check if association_rule converts it internally
    # back to frozensets
    df_freq_items_copy = df_freq_items.copy()
    df_freq_items_copy['itemsets'] = df_freq_items_copy['itemsets']\
        .apply(lambda x: set(x))

    res_df = association_rules(df_freq_items_copy)
    for i in res_df['antecedents']:
        assert isinstance(i, frozenset) is True

    for i in res_df['consequents']:
        assert isinstance(i, frozenset) is True
Example 7
def test_empty_result():
    expect = pd.DataFrame(
        columns=['antecedents', 'consequents',
                 'antecedent support',
                 'consequent support',
                 'support',
                 'confidence', 'lift', 'leverage',
                 'conviction']
    )
    res_df = association_rules(df_freq_items, min_threshold=2)

    assert res_df.equals(expect)
Example 8
def test_frozenset_selection():
    res_df = association_rules(df_freq_items)

    sel = res_df[res_df['consequents'] == frozenset((3, 5))]
    assert sel.values.shape[0] == 1

    sel = res_df[res_df['consequents'] == frozenset((5, 3))]
    assert sel.values.shape[0] == 1

    sel = res_df[res_df['consequents'] == {3, 5}]
    assert sel.values.shape[0] == 1

    sel = res_df[res_df['antecedents'] == frozenset((8, 3))]
    assert sel.values.shape[0] == 1
Example 9
def defectsContainsDataSet(self, param, head):
    basket_sets = (self.df[self.df[head].str.contains(param)]
                   .groupby(["DEFEC.", "INSPECCION"])["DEFEC."]
                   .count().unstack(level=0).fillna(0))
    if not basket_sets.empty:
        basket_sets = basket_sets.applymap(self.encode_units)
        # Apriori algorithm
        frequent_itemsets = apriori(basket_sets, min_support=0.05, use_colnames=True)
        rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1)
        # Add the elements affected by the rule
        return self.addNumberRules(rules, self.lenConData)
    else:
        return pd.DataFrame()
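The helper self.encode_units is not defined in this snippet; later examples on this page define equivalent standalone encode_units functions. A minimal sketch of the method under that assumption:

def encode_units(self, x):
    # Assumed to mirror the standalone encode_units helpers elsewhere on
    # this page: collapse item counts to 0/1 presence flags for apriori.
    if x <= 0:
        return 0
    return 1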
Example 10
def test_on_df_with_missing_entries_support_only():
    # this is a data frame where information about
    # antecedents and consequents have been cropped
    # see https://github.com/rasbt/mlxtend/issues/390
    # for more details
    dict = {'itemsets': [['177', '176'], ['177', '179'],
                         ['176', '178'], ['176', '179'],
                         ['93', '100'], ['177', '178'],
                         ['177', '176', '178']],
            'support': [0.253623, 0.253623, 0.217391,
                        0.217391, 0.181159, 0.108696, 0.108696]}

    df = pd.DataFrame(dict)
    df_result = association_rules(df, support_only=True, min_threshold=0.1)

    assert df_result['support'].shape == (18,)
    assert int(np.isnan(df_result['support'].values).any()) != 1
Example 11
def rule_generation(frequent_itemsets):
    rules = association_rules(frequent_itemsets,
                              metric="confidence",
                              min_threshold=0.8)
    rules.to_csv("tmp.csv")
    return rules
Example 12
File: ar01.py Project: mrok88/es
#            ['Dill', 'Onion', 'Nutmeg', 'Kidney Beans', 'Eggs', 'Yogurt'],
#            ['Milk', 'Apple', 'Kidney Beans', 'Eggs'],
#            ['Milk', 'Unicorn', 'Corn', 'Kidney Beans', 'Yogurt'],
#            ['Corn', 'Onion', 'Onion', 'Kidney Beans', 'Ice cream', 'Eggs']]
from datetime import timedelta, date
from es02 import es02 
dataset = []
if __name__ == "__main__":
    global dataset
    es = es02()
    es.set_service("display")
    es.load_datas2(date(2017,12,1),date(2018,1,8))
    dataset2 = es.dset

dataset = [ item for item in dataset2  if len(item) > 1 ]
for item in dataset:
    print(item)

import pandas as pd
from mlxtend.preprocessing import OnehotTransactions

oht = OnehotTransactions()
oht_ary = oht.fit(dataset).transform(dataset)
df = pd.DataFrame(oht_ary, columns=oht.columns_)
from mlxtend.frequent_patterns import apriori
frequent_itemsets = apriori(df, min_support=0.3,use_colnames=True)
print(frequent_itemsets)

from mlxtend.frequent_patterns import association_rules
arule = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.7)
print(arule)
Example 13
# Working on table Products with Top Support
TopSupport = freq_items.copy()
TopSupport.columns = ['Percentage of Orders', 'Product']
TopSupport = TopSupport[['Product', 'Percentage of Orders']]
TopSupport['Percentage of Orders'] = TopSupport['Percentage of Orders'].apply(lambda x: round(x * 100, 2))
TopSupport.insert(0, 'Serial No.', range(1, 1 + len(TopSupport)))
TopSupport['Product'] = TopSupport['Product'].apply(lambda x: list(x))
# TopSupport['Dish Name'] = TopSupport['Dish Name'].apply(lambda x: ', '.join(x))
TopSupport['Product'] = TopSupport['Product'].apply(lambda x: 0 if len(x) >= 2 else x[0])
TopSupport = TopSupport[TopSupport['Product'] != 0]
selectfew_TopSupport = TopSupport.head(10)
top5perc = round(selectfew_TopSupport['Percentage of Orders'].head().sum(), 1)

# Working on Product Associations based on Top Support
rules_TopSupport = association_rules(freq_items, metric="support", min_threshold=0.015)
rules_TopSupport = rules_TopSupport.sort_values(by=['support'], ascending=False)  # MADE CHANGE HERE
rules_TopSupport = rules_TopSupport[rules_TopSupport.lift > 1]
rules_TopSupport = rules_TopSupport[['antecedents', 'consequents', 'support']]
rules_TopSupport.columns = ['Product', 'Associated Product', 'Percentage of Orders']
rules_TopSupport['Percentage of Orders'] = rules_TopSupport['Percentage of Orders'].apply(lambda x: round(x * 100, 2))
rules_TopSupport.insert(0, 'Serial No', range(1, 1 + len(rules_TopSupport)))

rules_TopSupport['Product'] = rules_TopSupport['Product'].apply(lambda x: list(x))
rules_TopSupport['Product'] = rules_TopSupport['Product'].apply(lambda x: x[0])
rules_TopSupport['Associated Product'] = rules_TopSupport['Associated Product'].apply(lambda x: list(x))
rules_TopSupport['Associated Product'] = rules_TopSupport['Associated Product'].apply(lambda x: ', '.join(x))


def remove_common_rows(data):
    ind = []
# In[10]:

#Creating the dataframe of frequent itemsets
te = TransactionEncoder()
te_ary = te.fit(player_combo).transform(player_combo)
match_df_freq = pd.DataFrame(te_ary, columns=te.columns_)

# In[11]:

match_sup = apriori(match_df_freq, min_support=0.1, use_colnames=True)
print(match_sup)

# In[12]:

rules = association_rules(match_sup, metric="lift", min_threshold=1)

# In[13]:

rules

# In[14]:

won_rules = rules[(rules['consequents'] == {"won"})]

# In[15]:

won_rules

# In[16]:
# In[14]:

frequent_itemsets = apriori(movies_new,
                            min_support=0.002,
                            max_len=3,
                            use_colnames=True)
frequent_itemsets

# In[15]:

frequent_itemsets.sort_values('support', ascending=False, inplace=True)
frequent_itemsets.sort_values

# In[16]:

rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1)
rules.head(20)
rules.sort_values('lift', ascending=False).head(10)

# In[17]:

frequent_itemsets_1 = apriori(movies_new,
                              min_support=0.004,
                              max_len=4,
                              use_colnames=True)
frequent_itemsets_1

# In[18]:

frequent_itemsets_1.sort_values('support', ascending=False, inplace=True)
frequent_itemsets_1.sort_values
Example 16
df = pd.DataFrame(te_ary, columns=te.columns_)
df
#this matrix of transactions : T/ F indicate their presence in each Trans ID
df.shape
#get back original transactions
orgtrans1 = te_ary[:]
te.inverse_transform(orgtrans1)

#%%% #frequent itemsets - Most Imp Step
support_threshold = 0.01
frequent_itemsets = apriori(df, min_support= support_threshold, use_colnames = True)
frequent_itemsets
print(frequent_itemsets) #dataframe with the itemsets

#%%%%  - Support
support3 = association_rules(frequent_itemsets, metric="support", min_threshold = .3)
print(support3)
print(support3[['antecedents', 'consequents', 'support','confidence']])
#---
support2 = association_rules(frequent_itemsets, metric="support", min_threshold = .2)
print(support2[['antecedents', 'consequents', 'support','confidence']])

#%%%% Lift

lift1 = association_rules(frequent_itemsets, metric="lift", min_threshold=1)
print(lift1)
print(lift1[['antecedents', 'consequents', 'support', 'lift','confidence']])
#--
lift2 = association_rules(frequent_itemsets, metric="lift", min_threshold=2)
print(lift2)
print(lift2[['antecedents', 'consequents', 'support', 'lift','confidence']])
Example 17
lst

# In[5]:

bucket = (lst.groupby(
    ['ID', 'Item'])['Quantity'].sum().unstack().reset_index().fillna(
        0).set_index('ID').applymap(lambda x: 1 if x > 0 else 0))
bucket

# Before the data is fed into the apriori algorithm, it must already be in the form shown in the table above

# ### Frequent Itemsets
# Suppose we want to use a minimum support of 30%

# In[7]:

frequent_itemsets = apriori(bucket, min_support=0.30, use_colnames=True)
frequent_itemsets

# ### Make Rules
# Suppose we want to build rules based on a minimum confidence of 70%

# In[8]:

rules = association_rules(frequent_itemsets,
                          metric='confidence',
                          min_threshold=0.7)
rules

# In[ ]:
Example 18
lst2 = []
for content in contents_list:
    lst = listmaker(content)
    if lst != []:
        lst2.append(lst)

# Association Analysis
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder

te = TransactionEncoder()
te_ary = te.fit(lst2).transform(lst2)
df = pd.DataFrame(te_ary, columns=te.columns_)

from mlxtend.frequent_patterns import apriori
frequent_itemsets = apriori(df, min_support=0.05, use_colnames=True)

from mlxtend.frequent_patterns import association_rules

rules = association_rules(frequent_itemsets,
                          metric="lift",
                          min_threshold=1,
                          support_only=False)
rules['length'] = rules['antecedents'].apply(lambda x: len(x))
rules['length2'] = rules['consequents'].apply(lambda x: len(x))
rules = rules[(rules['length'] == 1) & (rules['length2'] == 1)]

import openpyxl
rules.to_excel("비타민제 소비자별 연관분석 결과/Vitamin Workers.xlsx")
Example 19
#most popular wine: cantina pinot bianco

# In[42]:

df = df.groupby(['order', 'orderNumber']).size().reset_index(name='count')
basket = (df.groupby([
    'orderNumber', 'order'
])['count'].sum().unstack().reset_index().fillna(0).set_index('orderNumber'))


def encode_units(x):
    if x <= 0:
        return 0
    if x >= 1:
        return 1


basket_sets = basket.applymap(encode_units)

frequent_itemsets = apriori(basket_sets, min_support=0.1, use_colnames=True)
rules = association_rules(frequent_itemsets, metric="lift")
rules.sort_values('lift', ascending=False, inplace=True)
rules.head(10)

#sorted by lift
#top item - if a customer orders roast chicken, they are 2.75 times more likely to also purchase duckhorn chardonnay

# In[36]:

# In[ ]:
Example 20
tr_enc = TransactionEncoder()
basket = pd.DataFrame(tr_enc.fit_transform(transactions),
                      columns=tr_enc.columns_)

# In[14]:

basket

# In[15]:

# functions needed for basket analysis

from mlxtend.frequent_patterns import apriori, association_rules

# In[16]:

# how often products appear in baskets, individually and together

frequent = apriori(basket,
                   min_support=0.00002,
                   low_memory=True,
                   use_colnames=True)
frequent

# In[17]:

# combinations where, when the products on the left are bought, the products on the right are most likely to be bought as well (ranked by confidence)
association_rules(frequent, metric="confidence",
                  min_threshold=0.05).sort_values(by="confidence",
                                                  ascending=False)
Example 21
dataset_series.columns = ['transactions']

# Creating a dummy columns for the each item in each transactions...using column names as index
X = dataset_series['transactions'].str.join(sep='*').str.get_dummies(sep='*')

'''   Support = 0.001 and max_len = 3   '''

frequent_itemsets = apriori(X, min_support=0.001, max_len=3, use_colnames=True) # 9968 itemsets
# Most frequent item sets based on support 
frequent_itemsets.sort_values('support', ascending =False, inplace=True)
# Barplot of frequent item sets
plt.bar(x=list(range(1,11)), height=frequent_itemsets.support[1:11], color='rgmyk')
plt.xticks(list(range(1,11)), frequent_itemsets.itemsets[1:11], rotation=45);plt.xlabel('item_sets');plt.ylabel('support')
# Rules
rules = association_rules(frequent_itemsets, metric='lift', min_threshold=1) # 45712 rules are generated
rules.head(10)
rules.sort_values('confidence',ascending =False, inplace=True)
rules.head(10)

'''   Support = 0.001 and max_len = 2   '''

frequent_itemsets = apriori(X, min_support=0.001, max_len=2, use_colnames=True) # 3138 itemsets
# Most frequent item sets based on support 
frequent_itemsets.sort_values('support', ascending =False, inplace=True)
# Barplot of frequent item sets
plt.bar(x=list(range(1,11)), height=frequent_itemsets.support[1:11], color='rgmyk')
plt.xticks(list(range(1,11)), frequent_itemsets.itemsets[1:11], rotation=45);plt.xlabel('item_sets');plt.ylabel('support')
# Rules
rules = association_rules(frequent_itemsets, metric='lift', min_threshold=1) # 5426 rules are generated
rules.head(10)
Example 22
#df_inv_pro.sum()
df_inv_pro.loc["tot_sepet"] = df_inv_pro.apply(lambda x: x.sum(), axis=0)
df_inv_pro.tail()
############################################
# Extracting the Association Rules
############################################
df_inv_pro.drop("tot_product", axis=1, inplace=True)

df_inv_pro.drop("tot_sepet", axis=0, inplace=True)

frequent_itemsets = apriori(df_inv_pro, min_support=0.01, use_colnames=True)
frequent_itemsets.sort_values("support", ascending=False)

rules = association_rules(frequent_itemsets,
                          metric="support",
                          min_threshold=0.01)
rules.head()
rules.sort_values("lift", ascending=False).head()

# turning the whole workflow into a function
import pandas as pd
pd.set_option('display.max_columns', None)
from mlxtend.frequent_patterns import apriori, association_rules
from helpers.helpers import crm_data_prep, create_invoice_product_df

df_ = pd.read_excel(
    r"C:\Users\Erkan\Desktop\DSMLBC-4\4.Hafta_26-29_Ocak Haftası\Ödevler ve Çalışmalar\online_retail_II.xlsx",
    sheet_name="Year 2010-2011")
df = df_.copy()
df = crm_data_prep(df)
Example 23
def get_apriori(self):
    frequent_itemsets = apriori(self.encode_categorical(), min_support=0.07, use_colnames=True)
    rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1)
    return rules
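The encode_categorical method called above is not part of the snippet. A minimal sketch, assuming self.df holds categorical columns that must become the boolean item matrix apriori expects:

import pandas as pd

def encode_categorical(self):
    # Hypothetical helper (assumption): one-hot encode the categorical
    # columns of self.df so apriori() receives a True/False item matrix.
    return pd.get_dummies(self.df).astype(bool)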
Example 24
frequent_itemsets1 = apriori(sparse_df, min_support=0.1, use_colnames=True)
frequent_itemsets1


# In[6]:


frequent_itemsets05 = apriori(sparse_df, min_support=0.05, use_colnames=True)
frequent_itemsets05


# In[8]:


rules05 = association_rules(frequent_itemsets05, metric="confidence", min_threshold=0.5)
rules05


# In[9]:


frequent_itemsets01 = apriori(sparse_df, min_support=0.01, use_colnames=True)
frequent_itemsets01


# In[10]:


rules01 = association_rules(frequent_itemsets01, metric="confidence", min_threshold=0.7)
rules01
Example 25
import numpy as np

dataframe = pd.read_csv('ARMatrixControl.csv')
del dataframe['Classes']
del dataframe['Unnamed: 0']

min_support = 0.095
print("support: ", min_support)
frequent_itemsets = apriori(dataframe, min_support=min_support, use_colnames=True)
fileName = "control_frequent_itemsets_" + str(min_support) + '.csv'
frequent_itemsets.to_csv(fileName)

for confidence_increment in range(50, 100, 10):
    min_confidence = confidence_increment / float(100)
    print "confidence: ", min_confidence
    rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=min_confidence)
    fileName = "control_association_rules_C" + str(min_confidence) + "_S" + str(min_support)
    rules.to_csv(fileName+'.csv')


print "DONE BITCH"









Example 26
np_data_a = all_data.to_numpy()
np_data_a = [[
    elem for elem in row[1:] if isinstance(elem, str) and elem in items
] for row in np_data_a]
np_data_a = [row for row in np_data_a if len(row) > 1]
te_a = TransactionEncoder()
te_ary_a = te_a.fit_transform(np_data_a)
data_a = pd.DataFrame(te_ary_a, columns=te_a.columns_)
data_a

# %%
result = fpgrowth(data_a, min_support=0.05, use_colnames=True)
result

# %%
rules_conf = association_rules(result, min_threshold=0.3)
rules_conf

# %%
rules_sup = association_rules(result, min_threshold=0.01, metric='support')
rules_sup

# %%
rules_lift = association_rules(result, min_threshold=0.01, metric='lift')
rules_lift

# %%
rules_leverage = association_rules(result,
                                   min_threshold=0.01,
                                   metric='leverage')
rules_leverage
Example 27
# 1.2 Importing Required modules.

from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
from mlxtend.preprocessing import TransactionEncoder

import pandas as pd
import numpy as np

# 1.3 Input data

dataset = [['Milk', 'Onion', 'Nutmeg', 'Kidney Beans', 'Eggs', 'Yogurt'],
           ['Dill', 'Onion', 'Nutmeg', 'Kidney Beans', 'Eggs', 'Yogurt'],
           ['Milk', 'Apple', 'Kidney Beans', 'Eggs'],
           ['Milk', 'Unicorn', 'Corn', 'Kidney Beans', 'Yogurt'],
           ['Corn', 'Onion', 'Onion', 'Kidney Beans', 'Ice cream', 'Eggs']]

# 1.4 Creating the dataframe of frequent itemsets.

te = TransactionEncoder()
te_ary = te.fit(dataset).transform(dataset)
df = pd.DataFrame(te_ary, columns=te.columns_)

# 1.5 Applying Apriori algorithm and finding Association rules.

sup = apriori(df, min_support=0.002, use_colnames=True)
rules = association_rules(sup, metric="support", min_threshold=0.5)
print(rules)
Example 28
def my_func(df, algo_type):
	# Import the dataset
	
	response = {'error': False}

	try:
		items = (df['0'].unique())

		encoded_vals = []
		for index, row in df.iterrows():
		    labels = {}
		    uncommons = list(set(items) - set(row))
		    commons = list(set(items).intersection(row))
		    for uc in uncommons:
		        labels[uc] = 0
		    for com in commons:
		        labels[com] = 1
		    encoded_vals.append(labels)

		# FP-Growth module requires a dataframe that has either 0 and 1 or True and False as data
		# we need to One Hot Encode the data.
		ohe_df = pd.DataFrame(encoded_vals)

		if algo_type == 'fp-growth':
			# Applying fp-growth
			freq_items = fpgrowth(ohe_df, min_support=0.2, use_colnames=True, verbose=1)
		elif algo_type == 'apriori':
			freq_items = apriori(ohe_df, min_support=0.2, use_colnames=True, verbose=1)

		# Mining Association Rules
		rules = association_rules(freq_items, metric="confidence", min_threshold=0.6)

		# Visualizing results
		# Support vs Confidence
		plt.scatter(rules['support'], rules['confidence'], alpha=0.5)
		plt.xlabel('support')
		plt.ylabel('confidence')
		plt.title('Support vs Confidence')
		support_confidence = generate_graph_img(plt)

		# Support vs Lift
		plt.scatter(rules['support'], rules['lift'], alpha=0.5)
		plt.xlabel('support')
		plt.ylabel('lift')
		plt.title('Support vs Lift')
		support_lift = generate_graph_img(plt)

		# Lift vs Confidence
		fit = np.polyfit(rules['lift'], rules['confidence'], 1)
		fit_fn = np.poly1d(fit)
		plt.plot(rules['lift'], rules['confidence'], 'yo', rules['lift'], 
		fit_fn(rules['lift']))
		lift_confidence = generate_graph_img(plt)
		response = {
			'support_confidence': f'data:image/png;base64,{support_confidence}',
			'support_lift': f'data:image/png;base64,{support_lift}',
			'lift_confidence': f'data:image/png;base64,{lift_confidence}',
			'error': False
		}
	except Exception as e:
		response = {
			'error': str(e)
		}
	return response
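my_func relies on a generate_graph_img helper that is not shown; given that its return value is embedded into data:image/png;base64 URLs, it presumably renders the current matplotlib figure to a base64 string. A possible sketch under that assumption:

import base64
import io

def generate_graph_img(plt):
    # Hypothetical helper (assumption): serialize the current matplotlib
    # figure into a base64-encoded PNG string and clear the figure.
    buf = io.BytesIO()
    plt.savefig(buf, format='png')
    plt.clf()
    buf.seek(0)
    return base64.b64encode(buf.read()).decode('utf-8')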
Example 29
basket_Roti = basket_encoded

basket_encoded = basket_Selai.applymap(hot_encode)
basket_Selai = basket_encoded

basket_encoded = basket_Mentega.applymap(hot_encode)
basket_Mentega = basket_encoded

basket_encoded = basket_Susu.applymap(hot_encode)
basket_Susu = basket_encoded

basket_encoded = basket_Cokelat.applymap(hot_encode)
basket_Cokelat = basket_encoded

frq_items = apriori(basket_Roti, min_support=0.3, use_colnames=True)
rules = association_rules(frq_items, metric="lift", min_threshold=1)
rules = rules.sort_values(['confidence', 'lift'], ascending=[False, False])
print(rules.head())

frq_items = apriori(basket_Selai, min_support=0.3, use_colnames=True)
rules = association_rules(frq_items, metric="lift", min_threshold=1)
rules = rules.sort_values(['confidence', 'lift'], ascending=[False, False])
print(rules.head())

frq_items = apriori(basket_Mentega, min_support=0.3, use_colnames=True)
rules = association_rules(frq_items, metric="lift", min_threshold=1)
rules = rules.sort_values(['confidence', 'lift'], ascending=[False, False])
print(rules.head())

frq_items = apriori(basket_Selai, min_support=0.3, use_colnames=True)
rules = association_rules(frq_items, metric="lift", min_threshold=1)
Example 30
edges.to_json('./output/edges.json', orient='records')

print('frequent itemsets')
frequent_itemsets = apriori(basket_sets,
                            min_support=min_support,
                            use_colnames=True,
                            max_len=max_len)
frequent_itemsets.sort_values(
    'support',
    inplace=True,
    ascending=False,
)

print('rules')
rules = association_rules(frequent_itemsets,
                          metric=min_threshold_metric,
                          min_threshold=min_threshold)
rules.sort_values('lift', inplace=True, ascending=False)

print('found ' + str(len(rules)) + ' rules')
rules.to_json('./output/rules.json', orient='records')

with open(out_js, "w") as fs:
    fs.truncate(0)
    fs.write("var loadeddata = {\n")
    fs.write('products : ' + nodes.to_json(orient='records') + ',\n')
    # fs.write('var links = ' + edges.to_json(orient='records') + '\n')
    fs.write('rules : ' + rules.to_json(orient='records') + ',\n')
    fs.write('}')

# basket.sort_values('korfu', inplace=True, ascending=False)
Example 31
df.info()
df.describe()
frequent_itemsets = apriori(df.iloc[: , 5:15], min_support = 0.015, max_len = 5, use_colnames = True)

# Most Frequent item sets based on support 
frequent_itemsets.sort_values('support', ascending = False, inplace = True)

# barplot of top 10  for visualization
import matplotlib.pyplot as plt
plt.bar(x = list(range(0, 11)), height = frequent_itemsets.support[0:11], color ='rgmyk')
plt.xticks(list(range(0, 11)), frequent_itemsets.itemsets[0:11], rotation=10)
plt.xlabel('item-sets')
plt.ylabel('support')
plt.show()

rules = association_rules(frequent_itemsets, metric = "lift", min_threshold = 1) #writing rules by arules
rules.head(10)
rules.sort_values('lift', ascending = False).head(10)  #sorting values using highest lift ratio

#considering only required columns and removing duplication
def to_list(i):
    return (sorted(list(i)))

ma_X = rules.antecedents.apply(to_list) + rules.consequents.apply(to_list) #considering antecedent and consequent columns and converting them to list

ma_X = ma_X.apply(sorted) #sorting in sequence

rules_sets = list(ma_X) #converting to a list

unique_rules_sets = [list(m) for m in set(tuple(i) for i in rules_sets)] #using set to remove duplicates
Example 32
def mba(survey):
    #selecting customer_id and merchant cols
    results = survey[[
        "customer_id", "restaurant", "fffc", "grocery", "electronic", "sports",
        "clothing", "household", "hba", "beverage"
    ]]

    #creating a new df with a col of combination of all merchants in a str (done because of chained assignment)
    merchant = pd.DataFrame()
    merchant["merchants"] = results[results.columns[1:]].apply(
        lambda y: ','.join(y.dropna().astype(str)), axis=1)

    #create another df with just the combined merchants in str and customer_id
    new_merchant = pd.concat([results["customer_id"], merchant["merchants"]],
                             axis=1)

    #text pre-processing
    #converting all inputs to lowercase
    new_merchant = new_merchant.applymap(lambda s: s.lower()
                                         if type(s) == str else s)

    #replace str containing 'and' with ','
    new_merchant = new_merchant.applymap(lambda x: str(x).replace('and ', ','))

    #strip data of all whitespaces, not done here to match merchants_db input
    #new_merchant = new_merchant.applymap(lambda r: str(r).replace(" ",""))
    #new_merchant = new_merchant.applymap(lambda w: str(w).strip())

    #change merchants col from str to list
    new_merchant["merchants"] = new_merchant["merchants"].str.split('[:;.,/]')

    #explode merchants cols and ensure index reset since customer_id is the identifier and already in col
    exploded_merchant = new_merchant.explode("merchants")
    exploded_merchant.reset_index(drop=True, inplace=True)

    #check for nil or - values or empty string values (survey input error)
    for j in exploded_merchant.index:
        if exploded_merchant["merchants"][j] == "nil" or exploded_merchant[
                "merchants"][j] == "Nil" or exploded_merchant["merchants"][
                    j] == "" or exploded_merchant["merchants"][
                        j] == "nan" or exploded_merchant["merchants"][j] == "-":
            exploded_merchant.drop([j], inplace=True)

    #remove front and trailing whitespaces
    exploded_merchant = exploded_merchant.applymap(lambda w: str(w).strip())

    #for mba add a quantity col
    exploded_merchant.insert(2, "quantity", 1)

    #exploded_merchant.to_csv('exploded_merchant.csv', index=False)
    #print(exploded_merchant)
    #note exploded_merchant cols are in str and quantity col in numpy.float

    market_basket = exploded_merchant.groupby(['customer_id',
                                               'merchants'])['quantity']
    market_basket = market_basket.sum().unstack().reset_index().fillna(
        0).set_index('customer_id')
    market_basket = market_basket.applymap(encode_data)

    #itemsets are possible generation after applying apriori on min_support
    itemsets = apriori(market_basket, min_support=0.07, use_colnames=True)

    #rules is a df of possible associations
    rules = association_rules(itemsets, metric="lift", min_threshold=0)

    #note sorting done after appending categories or else error

    #check frozenset antecedents for single item (sort by category)
    for k in rules.index:
        if len(rules["antecedents"][k]) > 1:
            rules.drop([k], inplace=True)

    #convert frozenset from itemset into list or string (for single item)
    rules['antecedents'] = rules['antecedents'].apply(
        lambda a: ','.join(list(a)))
    rules['consequents'] = rules['consequents'].apply(
        lambda a: ','.join(list(a)))

    #round values to 3dp for visualization
    rules['antecedent support'] = round(rules['antecedent support'], 3)
    rules['consequent support'] = round(rules['consequent support'], 3)
    rules['support'] = round(rules['support'], 3)
    rules['confidence'] = round(rules['confidence'], 3)
    rules['lift'] = round(rules['lift'], 3)
    rules['leverage'] = round(rules['leverage'], 3)
    rules['conviction'] = round(rules['conviction'], 3)

    #creating new cols for graphs
    #new col = itemset
    rules['itemsets'] = rules['antecedents'] + ' -> ' + rules['consequents']

    #new col = count, assign int instead of float
    rules['count'] = round(rules["support"] * len(new_merchant.index))
    rules["count"] = rules["count"].astype(int)

    #rules.to_csv("mba_007.csv")
    return rules
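The encode_data function used to binarize market_basket is not defined in this snippet; a minimal sketch, assuming it behaves like the encode_units helpers in other examples on this page:

def encode_data(x):
    # Assumption: convert summed quantities into presence flags (0/1).
    return 1 if x > 0 else 0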
Example 33
import pandas as pd
from mlxtend.frequent_patterns import fpgrowth
from mlxtend.frequent_patterns import association_rules

df = pd.read_csv(
    r'C:\Users\arash\Desktop\association-rule-mining\dataframe-python.csv')
df
# pd.set_option("display.max_rows", None, "display.max_columns", None)
print(df)
frequent_itemsets_fp = fpgrowth(df, min_support=0.2, use_colnames=True)
print(frequent_itemsets_fp)
rules_fp = association_rules(frequent_itemsets_fp,
                             metric="confidence",
                             min_threshold=0.7)
print(rules_fp)
rules_fp
Example 34
        if x != '':
            print(x + ' → ' + y)

from mlxtend.preprocessing import TransactionEncoder
TE = TransactionEncoder()
data = TE.fit_transform(symptoms)
print(data)

import pandas as pd
df = pd.DataFrame(data, columns=TE.columns_)
df.head()

from mlxtend.frequent_patterns import apriori
items = apriori(df, min_support=0.1, use_colnames=True)
print(items)

print(items[items['itemsets'].apply(lambda x: len(x)) >= 2])


from mlxtend.frequent_patterns import association_rules
rules = association_rules(items, min_threshold=0.7)
print(rules)

for i, j in rules.iterrows():
    X = j['antecedents']
    Y = j['consequents']
    x = ', '.join([item for item in X])
    y = ', '.join([item for item in Y])
    print(x + ' → ' + y)

Example 35
df.shape
#get back original transactions
orgtrans1 = te_ary[:]
te.inverse_transform(orgtrans1)

#%%% #frequent itemsets - Most Imp Step
support_threshold = 0.01
#https://github.com/rasbt/mlxtend/blob/master/mlxtend/frequent_patterns/apriori.py
frequent_itemsets = apriori(df, min_support= support_threshold, use_colnames = True)
frequent_itemsets
print(frequent_itemsets) #dataframe with the itemsets

#%%%%  - Support Rules
help(association_rules)  # inspect the function's signature and docstring
#output - DF with antecedents -> consequent
supportRules3 = association_rules(frequent_itemsets, metric="support", min_threshold = .3)
print(supportRules3)
supportRules3.head()

print(supportRules3[['antecedents', 'consequents', 'support','confidence','lift']])
#---
supportRules2 = association_rules(frequent_itemsets, metric="support", min_threshold = .2)
print(supportRules2[['antecedents', 'consequents', 'support','confidence','lift']])


#%%%% Lift  : generally > 1 for strong associations

lift1 = association_rules(frequent_itemsets, metric="lift", min_threshold=1)
print(lift1)
lift1
print(lift1[['antecedents', 'consequents', 'support', 'lift','confidence']])
Example 36
def bil_aprori():
    if request.method == 'POST':
        f = request.files['file']
        #  f.save(secure_filename(f.filename))
        print(f)
        app.logger.info("File Received")
    else:
        return 'Error in Upload'

    warnings.filterwarnings('ignore')
    df = pd.read_csv(f)
    print("Dataset Import Success")
    df['Item'] = df['Item'].str.lower()

    x = df['Item'] == 'none'
    print(x.value_counts())
    df = df.drop(df[df.Item == 'none'].index)

    len(df['Item'].unique())

    df_for_top10_Items = df['Item'].value_counts().head(10)
    Item_array = np.arange(len(df_for_top10_Items))

    import matplotlib.pyplot as plt
    # plt.figure(figsize=(15,5))
    # Items_name=['coffee','bread','tea','cake','pastry','sandwich','medialuna','hot chocolate','cookies','brownie']
    # plt.bar(Item_array,df_for_top10_Items.iloc[:])
    # plt.xticks(Item_array,Items_name)
    # plt.title('Top 5 most selling items')
    # # plt.show()
    # plt.savefig('static/new_plot1.png')

    fig, ax = plt.subplots(figsize=(16, 7))
    df['Item'].value_counts().sort_values(ascending=False).head(20).plot.bar(
        width=0.5, edgecolor='k', align='center', linewidth=1)
    plt.xlabel('Food Item', fontsize=20)
    plt.ylabel('Number of transactions', fontsize=17)
    ax.tick_params(labelsize=20)
    plt.title('20 Most Sold Items', fontsize=20)
    # plt.grid()

    plt.savefig('static/new_plot1.png')
    plt.clf()
    plt.cla()
    plt.close()
    ######################################################################

    df['Date'] = pd.to_datetime(df['Date'])
    df['Time'] = pd.to_datetime(df['Time'], format='%H:%M:%S').dt.hour
    df['day_of_week'] = df['Date'].dt.weekday
    d = df.loc[:, 'Date']

    weekday_names = [
        'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday',
        'Sunday'
    ]
    Weekday_number = [0, 1, 2, 3, 4, 5, 6]
    week_df = d.groupby(d.dt.weekday).count().reindex(Weekday_number)
    Item_array_week = np.arange(len(week_df))

    plt.figure(figsize=(15, 5))
    my_colors = 'rk'
    plt.bar(Item_array_week, week_df, color=my_colors)
    plt.xticks(Item_array_week, weekday_names)
    plt.title('Number of Transactions made based on Weekdays')
    #plt.show()
    plt.savefig('static/new_plot2.png')

    plt.clf()
    plt.cla()
    plt.close()
    #####################################################################

    dt = df.loc[:, 'Time']
    Hour_names = [
        7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23
    ]
    time_df = dt.groupby(dt).count().reindex(Hour_names)
    Item_array_hour = np.arange(len(time_df))

    plt.figure(figsize=(15, 5))
    my_colors = 'rb'
    plt.bar(Item_array_hour, time_df, color=my_colors)
    plt.xticks(Item_array_hour, Hour_names)
    plt.title('Number of Transactions made based on Hours')
    #plt.show()
    plt.savefig('static/new_plot3.png')
    plt.clf()
    plt.cla()
    plt.close()

    ##############################################################################

    from mlxtend.frequent_patterns import apriori
    from mlxtend.frequent_patterns import association_rules

    hot_encoded_df = df.groupby(
        ['Transaction',
         'Item'])['Item'].count().unstack().reset_index().fillna(0).set_index(
             'Transaction')

    def encode_units(x):
        if x <= 0:
            return 0
        if x >= 1:
            return 1

    hot_encoded_df = hot_encoded_df.applymap(encode_units)

    frequent_itemsets = apriori(hot_encoded_df,
                                min_support=0.01,
                                use_colnames=True)

    rules = association_rules(frequent_itemsets,
                              metric="lift",
                              min_threshold=1)
    rules.head()
    rules[(rules['lift'] >= 1) & (rules['confidence'] >= 0.5)]

    ###############################################################################
    support = rules['support'].values
    confidence = rules['confidence'].values
    import seaborn as sns

    for i in range(len(support)):
        support[i] = support[i]
        confidence[i] = confidence[i]

    # plt.figure(figsize=(15,5))
    # my_colors = 'rb'
    # plt.bar(Item_array_hour,time_df, color=my_colors)
    # plt.xticks(Item_array_hour,Hour_names)
    # plt.title('Number of Transactions made based on Hours')
    # #plt.show()
    # plt.savefig('static/new_plot3.png')

    plt.plot()
    plt.figure(figsize=(15, 5))
    plt.scatter(support, confidence, alpha=0.5, marker="*")
    plt.title('Association Rules')
    plt.xlabel('support')
    plt.ylabel('confidence')
    #fig=()
    #sns.regplot(x=support, y=confidence, fit_reg=False)
    # plt.show(p)
    # fig = p.get_figure()
    # fig.savefig('out.png')
    # fig = fig1.get_figure()
    # fig.savefig("output.png")
    # fig = sns.regplot(x=support, y=confidence, fit_reg=False)
    # fig.figure.savefig('static/new_plot4.png')

    #fig = sns.regplot(x=support, y=confidence, fit_reg=False)
    #fig.figure.savefig('../test.png')

    #plt.show()
    plt.savefig('static/new_plot4.png')

    plt.clf()
    plt.cla()
    plt.close()

    ######################
    rules_to_show = 20
    import networkx as nx
    plt.plot()
    G1 = nx.DiGraph()

    color_map = []
    N = 50
    colors = np.random.rand(N)
    strs = [
        'R0', 'R1', 'R2', 'R3', 'R4', 'R5', 'R6', 'R7', 'R8', 'R9', 'R10',
        'R11'
    ]

    for i in range(rules_to_show):
        G1.add_nodes_from(["R" + str(i)])

        for a in rules.iloc[i]['antecedents']:

            G1.add_nodes_from([a])

            G1.add_edge(a, "R" + str(i), color=colors[i], weight=2)

        for c in rules.iloc[i]['consequents']:

            G1.add_nodes_from([c])

            G1.add_edge("R" + str(i), c, color=colors[i], weight=2)

    for node in G1:
        found_a_string = False
        for item in strs:
            if node == item:
                found_a_string = True
        if found_a_string:
            color_map.append('yellow')
        else:
            color_map.append('green')

    edges = G1.edges()
    colors = [G1[u][v]['color'] for u, v in edges]
    weights = [G1[u][v]['weight'] for u, v in edges]

    pos = nx.spring_layout(G1, k=16, scale=1)
    nx.draw(G1,
            pos,
            edges=edges,
            node_color=color_map,
            edge_color=colors,
            width=weights,
            font_size=16,
            with_labels=False)

    for p in pos:  # raise text positions
        pos[p][1] += 0.07

    nx.draw_networkx_labels(G1, pos)
    # plt.show()
    plt.savefig('static/new_plot5.png')
    plt.clf()
    plt.cla()
    plt.close()

    import time
    time.sleep(5)  # Delays for 5 seconds. You can also use a float value.
    return render_template('out.html',
                           name='Top 5 most selling items',
                           url='new_plot1.png',
                           name1='abc',
                           url2='new_plot2.png',
                           url3='new_plot3.png',
                           url4='new_plot4.png',
                           url5='new_plot5.png')
Example 37
def create_model(metric="confidence", threshold=0.5, min_support=0.05, round=4):

    """
    This function creates an association rules model using data and identifiers 
    passed at setup stage. This function internally transforms the data for 
    association rule mining.


    Example
    -------
    >>> from pycaret.datasets import get_data
    >>> data = get_data('france')
    >>> from pycaret.arules import *
    >>> exp_name = setup(data = data, transaction_id = 'InvoiceNo', item_id = 'Description')
    >>> model1 = create_model(metric = 'confidence')


    metric: str, default = 'confidence'
        Metric to evaluate if a rule is of interest. Default is set to confidence. 
        Other available metrics include 'support', 'lift', 'leverage', 'conviction'. 
        These metrics are computed as follows:

        * support(A->C) = support(A+C) [aka 'support'], range: [0, 1]
        * confidence(A->C) = support(A+C) / support(A), range: [0, 1]
        * lift(A->C) = confidence(A->C) / support(C), range: [0, inf]
        * leverage(A->C) = support(A->C) - support(A)*support(C), range: [-1, 1]
        * conviction = [1 - support(C)] / [1 - confidence(A->C)], range: [0, inf]
    

    threshold: float, default = 0.5
        Minimal threshold for the evaluation metric, via the `metric` parameter,
        to decide whether a candidate rule is of interest.
    

    min_support: float, default = 0.05
        A float between 0 and 1 for minumum support of the itemsets returned.
        The support is computed as the fraction `transactions_where_item(s)_occur /
        total_transactions`.
    

    round: int, default = 4
        Number of decimal places metrics in score grid will be rounded to. 


    Returns:
        pandas.DataFrame
        

    Warnings
    --------
    - Setting low values for min_support may increase training time.
  
    """

    # loading dependencies
    import pandas as pd
    from IPython.display import display, HTML, clear_output, update_display
    from mlxtend.frequent_patterns import apriori
    from mlxtend.frequent_patterns import association_rules

    # reshaping the dataframe
    basket = (
        X.groupby([txid, iid])[iid]
        .count()
        .unstack()
        .reset_index()
        .fillna(0)
        .set_index(txid)
    )
    if ignore_list is not None:
        basket = basket.drop(ignore_list, axis=1)

    def encode_units(x):

        if x <= 0:
            return 0
        if x >= 1:
            return 1

    basket = basket.applymap(encode_units)

    frequent_itemsets = apriori(basket, min_support=min_support, use_colnames=True)
    rules = association_rules(frequent_itemsets, metric=metric, min_threshold=threshold)
    rules = rules.sort_values(by=[metric], ascending=False).reset_index(drop=True)
    rules = rules.round(round)

    # storing into experiment
    tup = ("Basket", basket)
    experiment__.append(tup)

    tup = ("Frequent Itemsets", frequent_itemsets)
    experiment__.append(tup)

    tup = ("Rules", rules)
    experiment__.append(tup)

    return rules
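The metric formulas listed in the docstring above can be checked by hand. A small illustrative sketch with made-up support values (the numbers are not taken from any dataset on this page):

# Illustrative check of the docstring's metric formulas.
support_A = 0.6    # support(A)
support_C = 0.8    # support(C)
support_AC = 0.5   # support(A+C), i.e. the rule's 'support'

confidence = support_AC / support_A               # ~0.833
lift = confidence / support_C                     # ~1.042
leverage = support_AC - support_A * support_C     # 0.02
conviction = (1 - support_C) / (1 - confidence)   # 1.2
print(confidence, lift, leverage, conviction)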
Example 38
# Analyzing frequent itemsets and association rules in MovieLens movie data
import pandas as pd
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

# Load the data
movies = pd.read_csv('./movie_actors.csv')
#print(movies.head())
# One-hot encode the genres (a discrete feature gets as many columns as it has distinct values)
movies_hot_encoded = movies.drop('actors',
                                 1).join(movies.actors.str.get_dummies('/'))
pd.options.display.max_columns = 100
print(movies_hot_encoded.head())

# Set movieId, title as the index
movies_hot_encoded.set_index(['title'], inplace=True)
#print(movies_hot_encoded.head())
# Mine frequent itemsets with a minimum support of 0.05
itemsets = apriori(movies_hot_encoded, use_colnames=True, min_support=0.05)
# Sort by support in descending order
itemsets = itemsets.sort_values(by="support", ascending=False)
print('-' * 20, 'frequent itemsets', '-' * 20)
print(itemsets)
# Compute association rules from the frequent itemsets, with a minimum lift of 2
rules = association_rules(itemsets, metric='lift', min_threshold=2)
# Sort by lift in descending order
rules = rules.sort_values(by="lift", ascending=False)
rules.to_csv('./rules.csv')
print('-' * 20, 'association rules', '-' * 20)
print(rules)
Example 39
for index, row in data_arm.iterrows():
    labels = {}
    uncommons = list(set(items) - set(row))
    commons = list(set(items).intersection(row))
    for uc in uncommons:
        labels[uc] = 0
    for com in commons:
        labels[com] = 1
    encoded_vals.append(labels)
encoded_vals[0]
ohe_df = pd.DataFrame(encoded_vals)


freq_items = apriori(ohe_df, min_support=0.05, use_colnames=True, verbose=1)
freq_items.head(7)
rules = association_rules(freq_items, metric="confidence", min_threshold=0.1)
print(rules.head(50))

subset = rules[rules['conviction'] > 1.2]
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(subset)

print((subset.sort_values(['conviction', 'lift'], ascending=(False, False))).to_string())
subset.to_csv("Data/pred_set_arm_out.csv")

z = subset['support']
y = subset['confidence']
plt.scatter(z, y, alpha=0.5, color='black')
n = np.arange(0, 12, 1)
for i, txt in enumerate(n):
    plt.annotate('R' + str(txt), (z.iloc[i] + 0.001, y.iloc[i] + 0.001))
Example 40
        data = prepfunctions.dummification(perm_data.copy(deep=True), boolean_attributes, bins, strategie)

        MIN_SUP: float = 0.001
        var_min_sup =[0.2, 0.1] + [round(i*MIN_SUP, 2) for i  in range(100, 0, -10)]

        plt.figure()
        patterns: pd.DataFrame = pm.apriori(data, min_support=MIN_SUP, use_colnames=True, verbose=True)
        print(len(patterns),'patterns')
        nr_patterns = []
        for sup in var_min_sup:
            pat = patterns[patterns['support']>=sup]
            nr_patterns.append(len(pat))

        plt.figure(figsize=(6, 4))
        ds.plot_line(var_min_sup, nr_patterns, title='Nr Patterns x Support', xlabel='support', ylabel='Nr Patterns')
        plt.savefig(subDir + 'HFCR Pattern Mining - Nr Patterns x Support')

        MIN_CONF: float = 0.1
        rules = pm.association_rules(patterns, metric='confidence', min_threshold=MIN_CONF*5, support_only=False)
        print(f'\tfound {len(rules)} rules')

        nr_rules_sp = analyse_per_metric(rules, 'support', var_min_sup, subDir)
        plt.figure(figsize=(6, 4))
        ds.plot_line(var_min_sup, nr_rules_sp, title='Nr Rules x Support', xlabel='support', ylabel='Nr. rules', percentage=False)
        plt.savefig(subDir + 'HFCR Pattern Mining - Nr Rules x Support')

        var_min_conf = [round(i * MIN_CONF, 2) for i in range(10, 5, -1)]
        nr_rules_cf = analyse_per_metric(rules, 'confidence', var_min_conf, subDir)
        plt.figure(figsize=(6, 4))
        ds.plot_line(var_min_conf, nr_rules_cf, title='Nr Rules x Confidence', xlabel='confidence', ylabel='Nr Rules', percentage=False)
        plt.savefig(subDir + 'HFCR Pattern Mining - Nr Rules x Confidence')
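analyse_per_metric is not defined in this fragment; from the way its result is plotted as the number of rules per threshold, it presumably counts the rules that survive each candidate threshold. A minimal sketch under that assumption (the output-directory argument is accepted but unused here):

def analyse_per_metric(rules, metric, thresholds, out_dir):
    # Hypothetical helper (assumption): count, for each candidate
    # threshold, how many mined rules meet or exceed it.
    counts = []
    for t in thresholds:
        counts.append(len(rules[rules[metric] >= t]))
    return counts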
Example 41
import pandas as pd
import time
from mlxtend.frequent_patterns import apriori, association_rules
#Load the data
data = pd.read_csv("movie_actors.csv")
start = time.time()
# One-hot encode (a discrete feature gets as many columns as it has distinct values)
data_hot_encode = data.drop('actors', 1).join(data.actors.str.get_dummies('/'))
pd.options.display.max_columns = 100
print(data_hot_encode.head())

# Set movieId, title as the index
data_hot_encode.set_index(['title'], inplace=True)
print(data_hot_encode.head())
frequent_items = apriori(data_hot_encode, min_support=0.02, use_colnames=True)
rules = association_rules(frequent_items, metric='lift', min_threshold=0.5)
print('Frequent itemsets:', frequent_items)
print('Association rules:', rules)
end = time.time()
print('Computation time: %s' % (end - start))
Example 42
import pandas as pd
from mlxtend.preprocessing import OnehotTransactions
from mlxtend.frequent_patterns import apriori

oht = OnehotTransactions()
oht_ary = oht.fit(dataset).transform(dataset)
df = pd.DataFrame(oht_ary, columns=oht.columns_)
print(df)

frequent_itemsets = apriori(df, min_support=0.6, use_colnames=True)
print(frequent_itemsets)

from mlxtend.frequent_patterns import association_rules

association_rules(frequent_itemsets, metric="confidence", min_threshold=0.7)
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1.2)
print(rules[['antecedents', 'consequents', 'support']])
support = rules[['support']]
#support = rules.as_matrix(columns=['support'])
#confidence = rules.as_matrix(columns=['confidence'])
#print(support)
#print(confidence)

import numpy as np
import pandas as pd
import networkx as nx

import matplotlib.pyplot as plt

# create state space and initial state probabilities
Example 43
def main():
    data = {}
    with open('online_retail.csv') as f:
        header_row = False
        header = []
        for cols in csv.reader(f):
            if not header_row:
                header_row = True
                header = [i for i in cols]  # list of headers
                for h in header:
                    data[h] = []  # init of dictionary
                continue
            if len(cols) == len(header) and cols[0][0] != 'C' and cols[0] != '' and cols[2] != '':
                # check for empty InvoiceIds or Descriptions
                for col, name in zip(cols, header):
                    data[name].append(col)

    byinvoice = {}  # dictionary with data grouped by invoice
    for invoiceNo, desc in zip(data['InvoiceNo'], data['Description']):
        if invoiceNo not in byinvoice.keys():
            byinvoice[invoiceNo] = []  # if the dict doesn't have an invoice entry, initialize it

        byinvoice[invoiceNo].append(desc)

    unique_products = []  # set of unique products, is used for the bitmap matrix
    for prod in data['Description']:
        if prod not in unique_products:
            unique_products.append(prod)

    match_matrix = []  # bitmap matrix is done by a list of lists

    for inv in byinvoice.keys():
        match_array = [0] * len(unique_products)  # init of each row of bitmap matrix
        # for every product of a given invoice, I get the index inside the list of unique products
        for prod in byinvoice[inv]:
            i = unique_products.index(prod)
            match_array[i] = 1  # that index is used to put 1 in the correct position inside current matrix row

        match_matrix.append(match_array)

    df = pd.DataFrame(data=match_matrix, columns=unique_products)

    '''
    fi = fpgrowth(df, 0.05)
    print(len(fi))
    print(fi.to_string())
    
    # checking if result is right (first product is present in 10.9 % of invoices)
    i = 0
    for inv in byinvoice.values():
        if unique_products[0] in inv:
            i += 1
    print(str(i/len(byinvoice.keys())))
    '''

    # time check
    # I did it with 0.05 because using minsup=0.01 with apriori resulted in a Memory Error :(((((
    print(timeit.timeit(lambda: apriori(df, 0.05), number=1))
    print(timeit.timeit(lambda: fpgrowth(df, 0.05), number=1))

    fi = fpgrowth(df, 0.01)
    print(len(fi))
    fi_list = fi.values.tolist()

    # just to see the top 10 most relevant pieces of information
    # the larger the set, the more relevant the information     (I think)
    fi_list.sort(key=lambda x: -len(x[1]))
    print(fi_list[:10])  # top 10

    ar = association_rules(fi, metric="confidence", min_threshold=0.85)
    ar.to_csv(r'association_rules.csv', header=True)  # dump to file for visualization reasons
Example 44
def get_assoc_dataframe(self):
    assoc = association_rules(self.get_frequent_itemset(), metric="confidence", min_threshold=0.7)  # earlier 0.6
    return assoc