def test_cloning():

    oht = TransactionEncoder()
    oht.fit(dataset)
    oht2 = clone(oht)

    msg = ("'TransactionEncoder' object has no attribute 'columns_'")
    assert_raises(AttributeError,
                  msg,
                  oht2.transform,
                  dataset)

    trans = oht2.fit_transform(dataset)
    np.testing.assert_array_equal(expect, trans)
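# Note: `dataset` and `expect` are fixtures defined elsewhere in the test module and
# not shown in this snippet. A minimal sketch of plausible fixtures, assuming the six
# items that test_fit() checks (Apple, Bananas, Beer, Chicken, Milk, Rice):
dataset = [['Apple', 'Beer', 'Rice', 'Chicken'],
           ['Apple', 'Beer', 'Rice'],
           ['Apple', 'Beer'],
           ['Apple', 'Bananas'],
           ['Milk', 'Beer', 'Rice', 'Chicken'],
           ['Milk', 'Beer', 'Rice'],
           ['Milk', 'Beer'],
           ['Apple', 'Bananas']]
# `expect` would then be the corresponding boolean one-hot array, with one column per
# alphabetically sorted item name.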
Example #2
dataset=[["Bread","Milk","Beer"],
        ["Bread","Diapers","Egg"],
        ["Milk","Diapers","Beer","Cola"],
        ["Bread","Milk","Diapers","Beer"],
        ["Bread","Milk","Cola"]]

import pandas as pd  
from mlxtend.preprocessing import TransactionEncoder   
from mlxtend.frequent_patterns import apriori    

te=TransactionEncoder() 
te=te.fit(dataset)  
te_ary=te.transform(dataset)

df=pd.DataFrame(te_ary,columns=te.columns_)

frequent_itemsets=apriori(df,min_support=0.6,use_colnames=True)

#to apply association rules
from mlxtend.frequent_patterns import association_rules

rules=association_rules(frequent_itemsets,metric='support',min_threshold=0.6)

cartItem=[]
init=input("1.Beer\t2.Bread\t3.Cola\t4.Diapers 5.Egg 6.Milk.\n What do you wish to buy? ")
init=init.capitalize()
cartItem=cartItem+[init]
permission='y'

checklist=['Beer','Bread','Cola','Diapers','Egg','Milk']
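# The snippet above stops before using `rules`; a minimal sketch of how the mined
# rules could drive a recommendation loop is given below. The loop structure and the
# follow-up prompt are assumptions, not part of the original code.
while permission == 'y':
    for _, row in rules.iterrows():
        # if everything already in the cart is covered by the rule's antecedents,
        # suggest the consequent items that are not in the cart yet
        if set(cartItem).issubset(row['antecedents']):
            suggestions = [item for item in row['consequents'] if item not in cartItem]
            if suggestions:
                print("Customers who bought", cartItem, "also bought", suggestions)
    nxt = input("Add another item from " + ", ".join(checklist) + " (or 'n' to stop): ")
    if nxt.lower() == 'n':
        permission = 'n'
    else:
        cartItem.append(nxt.capitalize())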
Example #3
import pandas as pd
import numpy as np
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
dff = pd.read_excel(r'C', na_values='?', sheet_name='data')  # load the data; the data contains empty cells
print(dff.head())
ddd = dff.pivot_table(values='Сумма', columns='Значение', index='ID')
ddd[np.isnan(ddd)] = 0
ddd[ddd > 0] = 1
print(ddd.head())
def transaction_list(df):  # function that builds the list of transactions
    list_external = []
    for i in range(df.shape[0]):
        list_internal = []
        data = df.iloc[i]
        index = data[data > 0]
        for element in index.index:
            list_internal.append(element)
        list_external.append(list_internal)
    return list_external
transactions = transaction_list(ddd)  # build a transaction list for every client
print(transactions[0])  # the list of stores visited by client number [0]
te = TransactionEncoder()  # build frequent itemsets with a support level (how many transactions of the dataset contain them); (min_support = 0.2) is the minimum
te_ary = te.fit(transactions).transform(transactions)
df = pd.DataFrame(te_ary, columns=te.columns_)
frequent_itemsets = apriori(df, min_support=0.1, use_colnames=True)
print(frequent_itemsets)
association_rules(frequent_itemsets, metric="confidence", min_threshold=0.1)  # generate association rules with a confidence level (how often the rule holds across the whole dataset) of 0.1 (very low)
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=0.1)  # generate association rules with a lift level (how strongly the items depend on each other) greater than 1.2
writer = pd.ExcelWriter(r'')  # write the result to Excel
rules.to_excel(writer, sheet_name='test')
writer.save()
print("all good, dude")
Example #4
if __name__ == '__main__':
    ML_DS_PATH = '../dataset/ml-out'
    MATRIX_PATH = '../temp'

    ratings_old = pd.read_csv(ML_DS_PATH + '/ratings_old.csv')

    s_rate_old = pd.read_csv(MATRIX_PATH + '/s_rate_old.csv')
    s_rate_old = s_rate_old.set_index('MovieID')
    s_rate_old.rename(columns=int, inplace=True)

    with open(MATRIX_PATH + '/cluster_old.pickle', 'rb') as f:
        cluster = pickle.load(f)

    model_start = time.time()  # timing checkpoint
    te = TransactionEncoder()
    te_ary = te.fit(cluster).transform(cluster)
    df = pd.DataFrame(te_ary, columns=te.columns_)
    frequent_itemsets = apriori(df, min_support=0.08, use_colnames=True)
    ar = association_rules(frequent_itemsets,
                           metric="confidence",
                           min_threshold=0.2)
    model_end = time.time()  # timing checkpoint
    print('Modeling Time: %s' % (model_end - model_start))  # timing checkpoint
    recommend_list_example = recommend_apriori(ratings_old, ar, 5)
    print(recommend_list_example)

    # Precision & Recall & Coverage & Diversity
    s_rate_new = pd.read_csv(MATRIX_PATH + '/s_rate_new.csv')
    s_rate_new = s_rate_new.set_index('MovieID')
    s_rate_new.rename(columns=int, inplace=True)
data = pd.read_csv(input_clause, sep='\t', na_filter=False)

#data=data.head(500)

dataset = []
for ind, row in data.iterrows():
    cl = row['Clause'].split(';')[:-1]
    cl = [c.strip() for c in cl]
    dataset.append(cl)
'''##one hot encoding
te = TransactionEncoder()
te_ary = te.fit(dataset).transform(dataset) ##one_hot encoding
df = pd.DataFrame(te_ary, columns=te.columns_)'''

##sparse encoding
te = TransactionEncoder()
oht_ary = te.fit(dataset).transform(dataset, sparse=True)
sparse_df = pd.SparseDataFrame(oht_ary,
                               columns=te.columns_,
                               default_fill_value=False)
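# Note: pd.SparseDataFrame was removed in pandas 1.0; on newer pandas the same sparse
# frame can be built with the sparse accessor instead (a sketch, assuming pandas >= 1.0):
# sparse_df = pd.DataFrame.sparse.from_spmatrix(oht_ary, columns=te.columns_)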
#print (sparse_df)

frequent_itemsets = apriori(sparse_df, min_support=0.00002, use_colnames=True)

frequent_itemsets = frequent_itemsets.sort_values(by='support',
                                                  ascending=False)
print(frequent_itemsets)
frequent_itemsets['Word_clause'] = ''

input_features = fp + 'feature_details' + file_date + '.txt'
df_features = pd.read_csv(input_features, sep='\t', na_filter=False)
Example #6
cur.close()
conn.close()

#Sets column and row size in output
desired_width = 320
pd.set_option('display.width', desired_width)
np.set_printoptions(linewidth=desired_width)
pd.set_option('display.max_columns', 10)
dataset = pd.DataFrame(transaction_list)

#Changes need to be made to dataset to apply fp algorithm
#Converting the data frame into a list of lists
records = []
for i in range(0, 9835):
    records.append([str(dataset.values[i, j]) for j in range(0, 20)])

#"Cleaning" dataset by using TransactionEncoder and dropping the 'None' column
TE = TransactionEncoder()
array = TE.fit(records).transform(records)
transf_df = pd.DataFrame(array, columns=TE.columns_)
cleanDataset = transf_df.drop(['None'], axis=1)

#Using association rules to mine dataset
assocRules = fpgrowth(cleanDataset, min_support=0.05, use_colnames=True)
rules = association_rules(assocRules, metric='lift', min_threshold=1)
print(rules)

#Prints time taken for execution
print(datetime.now() - startTime)
sqlContext = HiveContext(sc)
df = sc.textFile('hdfs:/user/capstone/dataset.csv').map(
    lambda line: line.split(","))
header = df.first()
fields = [
    StructField(field_name, StringType(), True) for field_name in header
]  #get the types of header variable fields
schema = StructType(fields)
filter_data = df.filter(lambda row: row != header)
SelectDf = sqlContext.createDataFrame(filter_data, schema=schema)
SelectDf.registerTempTable("transactions")
DescriptionGrp = sqlContext.sql(
    "SELECT distinct InvoiceNo,Description FROM transactions group by InvoiceNo,Description"
)
transactions = DescriptionGrp.groupBy("InvoiceNo").agg(
    collect_list("Description").alias("desc")).rdd.map(lambda x: x.desc)
print(transactions.take(5))
transactionsDF = sqlContext.createDataFrame(transactions)
print(transactionsDF.take(5))
te = TransactionEncoder()
te_ary = te.fit(transactionsDF).transform(transactionsDF)
transactionrecord = pd.DataFrame(te_ary, columns=te.columns_)
print(transactionrecord)
#association_rules_mba=apriori(transactionrecord,min_support=0.002,min_confidence=0.002,min_lift=1.2,min_length=2)
#transactionsparaRDD = sc.parallelize(association_rules_mba)
rulesDF = apriori(transactionrecord,
                  min_support=0.6,
                  verbose=1,
                  low_memory=True)
print(rulesDF)
def test_fit():
    oht = TransactionEncoder()
    oht.fit(dataset)
    assert(oht.columns_ == ['Apple', 'Bananas', 'Beer',
                            'Chicken', 'Milk', 'Rice'])
Example #9
"""
import csv
 
dict = dkey
w = csv.writer(open("output.csv", "w"))
for key, val in dict.items():
  w.writerow([key, val])     
        
    """
    
# To create a list of lists from the dictionary values 
for i in range(10002, 42580):
    dkey[i] = list(map(str, dkey[i]))
we = list(dkey.values())

#Fitting the association rule learning model
from mlxtend.preprocessing import TransactionEncoder

te = TransactionEncoder()
dat=we[1:50]
te_ary = te.fit_transform(dat,sparse=False)
df = pd.DataFrame(te_ary, columns=te.columns_)
from mlxtend.frequent_patterns import apriori
frequent_itemsets = apriori(df, min_support=0.1, use_colnames=True)

print (frequent_itemsets)

from mlxtend.frequent_patterns import association_rules
t=association_rules(frequent_itemsets, metric="confidence", min_threshold=0.7)
Example #10
from apyori import apriori
rules = apriori(symptoms, min_support=0.1, min_confidence=0.7)
results = list(rules)

for i in results:
    for j in i.ordered_statistics:
        X = j.items_base
        Y = j.items_add
        x = ', '.join([item for item in X])
        y = ', '.join([item for item in Y])
        if x != '':
            print(x + ' → ' + y)

from mlxtend.preprocessing import TransactionEncoder
TE = TransactionEncoder()
data = TE.fit_transform(symptoms)
print(data)

import pandas as pd
df = pd.DataFrame(data, columns=TE.columns_)
df.head()

from mlxtend.frequent_patterns import apriori
items = apriori(df, min_support=0.1, use_colnames=True)
print(items)

print(items[items['itemsets'].apply(lambda x: len(x)) >= 2])


from mlxtend.frequent_patterns import association_rules
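# The import above is left unused in the snippet; a minimal sketch of how the rules
# could be generated from the mined itemsets (the threshold is an illustrative assumption):
rules = association_rules(items, metric="confidence", min_threshold=0.7)
print(rules[['antecedents', 'consequents', 'support', 'confidence']])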
def main():

    dblp_data = pd.read_csv (r'DBLP_Dataset.csv',encoding="ISO-8859-1")
    author_title = dblp_data
    dataset = author_title.to_numpy()
    list1 = dataset[:,2].tolist()

    #convert authors to lower case
    list2 = []
    for i in list1:
        sublist = i.lower().split()
        list2.append(sublist)
    
    te = TransactionEncoder()
    te_ary = te.fit(list2).transform(list2)
    df = pd.DataFrame(te_ary, columns=te.columns_)
    frequent = fpgrowth(df, min_support=0.001, use_colnames=True)
    frequent = frequent[frequent['itemsets'].str.len()>1]

    freqauth_list = []
    for i in frequent['itemsets']:
        freqauth_list.append([x for x in i])

    freqauth_dict = {}
    for i in freqauth_list:
        title_idx_sublist = []
        for idx, j in enumerate(list2):
            if set(i).issubset(j):
                title_idx_sublist.append(idx)
        freqauth_dict.update({tuple(i):title_idx_sublist})

    freqauth_title_dict = {}
    kstem = ks.PyKrovetzStemmer()
    for key, value in freqauth_dict.items():
        title_df = author_title.iloc[value]['title']
        title_sublist = list(title_df)
        title_sublists = []
        temp_list = []
        for x in title_sublist:
            tempx     = re.sub(r'[.]','', x)
            temp_list = re.sub(r'[^\x00-\x7F]+','', tempx).lower().split()
            temp_list2 = []
            if isinstance(temp_list, list):
                temp_list2.append([kstem.stem(z) for z in temp_list if not z in stopwordlist])
                title_sublists.extend(temp_list2)
            else:
                if not temp_list in stopwordlist:
                    title_sublists.extend([kstem.stem(temp_list)])
        freqauth_title_dict.update({key:title_sublists})

    # Closed / Top k titles of frequent authors
    freqauth_title_dict_closed = {}
    for k, v in freqauth_title_dict.items():
        ps = PrefixSpan(v)
        closed_Seq_pattern = ps.topk(5, closed=True)
        freqauth_title_dict_closed.update({k:closed_Seq_pattern})

    # To get frequent author's context indicators
    frequentlist = freqauth_list
    cleanedList  = list2

    new_author_list = []
    for i in range(0,len(frequentlist)):
        temp_author_list = []
        authorlist = list(frequentlist[i])
        found = 0
        for k in range(0,len(cleanedList)):
            for j in range(0, len(authorlist)):
                if (authorlist[j] in(cleanedList[k])):
                    found = 1
                else:
                    found = 0
                    break
                    
            if found == 1:
                for jj in range(0,len(authorlist)):
                    if (authorlist[jj] in(cleanedList[k])):
                        cleanedList[k].remove(authorlist[jj])
                temp_author_list.append(cleanedList[k])

        new_author_list.append(temp_author_list)

    context_indicator_list = []
    for i in range(0,len(new_author_list)):
        te = TransactionEncoder()
        te_ary = te.fit(new_author_list[i]).transform(new_author_list[i])
        df = pd.DataFrame(te_ary, columns=te.columns_)
        frequent_author_list = fpgrowth(df, min_support=0.5, use_colnames=True)

        supp = frequent_author_list.support.unique()  # all unique support count
        # Dictionary storing itemset with same support count key
        freq_dic = {}
        for i in range(len(supp)):
            inset = list(frequent_author_list.loc[frequent_author_list.support == supp[i]]['itemsets'])
            freq_dic[supp[i]] = inset
        # Dictionary storing itemset with  support count <= key
        freq_dic2 = {}
        for i in range(len(supp)):
            inset2 = list(frequent_author_list.loc[frequent_author_list.support <= supp[i]]['itemsets'])
            freq_dic2[supp[i]] = inset2

        # Find Closed frequent itemset
        close_freq = []
        for index, row in frequent_author_list.iterrows():
            isclose = True
            cli = row['itemsets']
            cls = row['support']
            checkset = freq_dic[cls]
            for i in checkset:
                if (cli != i):
                    if (frozenset.issubset(cli, i)):
                        isclose = False
                        break

            if (isclose):
                close_freq.append([x for x in  (row['itemsets'])])
        context_indicator_list.append(close_freq)
    
    freqauth_context_ind_dict = {}
    for authpair, titlelist in freqauth_title_dict_closed.items():
        cleantitlelist = []
        for i in titlelist:
            if isinstance(i, tuple):
                if isinstance(i[1], list):
                    listtostring = ' '.join(i[1])
                    cleantitlelist.append(listtostring)
        freqauth_context_ind_dict.update({authpair:cleantitlelist})

    # Merging both titles and Context indicator author for frequent pattern authors 
    for idx, key in enumerate(freqauth_context_ind_dict):
        newval = []
        if len(context_indicator_list[idx])> 0:
            for i in context_indicator_list[idx]:
                if len(i) > 0:                
                    tempstr = '&'.join(i)
                    newval = freqauth_context_ind_dict[key]
                    newval.append(tempstr)
                    freqauth_context_ind_dict.update({key:newval})

# Context Indicator Weighting
    CI_list = list(freqauth_context_ind_dict.values())
    freqauth_context_in_weights = {}
    for key, value in freqauth_context_ind_dict.items():
        freq_auth_CI_list = value
        length_of_CI = len(value)
        temp_dict = {}
        for i in freq_auth_CI_list:
            count_tmp = 0
            for j in CI_list:
                if (i in (j)):
                    count_tmp += 1
            weight = round(1 - ((count_tmp - 1) /  count_tmp), 2)
            if (weight > 0.1):
                temp_dict.update({i:weight})
        sorted_weights_dict = sorted(temp_dict.items(), key=lambda x: x[1], reverse=True)
        freqauth_context_in_weights.update({key:sorted_weights_dict})

    freq_auth_transactions = {}
    list_of_freq_auth = list(freqauth_context_in_weights.keys())
    for i in range(0, len(freqauth_title_dict)):
        temp_dict = {}
        title_list = freqauth_title_dict.get(list_of_freq_auth[i])
        CI_list = freqauth_context_in_weights[list_of_freq_auth[i]]
        CI_list_auth = []
        for n, c in enumerate(CI_list):
            CI_list_auth.append(c[0])
        for j in range(0, len(title_list)):
            cos_sim = cos_similarity(CI_list_auth,title_list[j])
            cos_sim = round(cos_sim, 3)
            t_title = ' '.join(freqauth_title_dict[list_of_freq_auth[i]][j])
            temp_dict.update({t_title:cos_sim})

        sorted_title_dict = sorted(temp_dict.items(), key=lambda x: x[1], reverse=True)
        t_len = len(list(temp_dict.values()))
        max_len = t_len
        if (t_len > 4):
            max_len = 4
        sorted_title_dict1 = dict(list(sorted_title_dict)[0:max_len])
        freq_auth_transactions.update({list_of_freq_auth[i]:sorted_title_dict1})

    # To find the strongest SSP - Match against similarity of the context units

    freq_auth_SSPs = {}
    list_of_freq_auth = list(freqauth_context_ind_dict.keys())
    list_of_freq_auth_CI =  list(freqauth_context_ind_dict.values())
    len_list_of_freq_auth_CI = len(list_of_freq_auth_CI)

    context_indicator_similarity = np.zeros([len_list_of_freq_auth_CI, len_list_of_freq_auth_CI],dtype = float)
    for i in range (0,len_list_of_freq_auth_CI):
        for j in range (0,len_list_of_freq_auth_CI):
            cos_sim = cos_similarity(list_of_freq_auth_CI[i],list_of_freq_auth_CI[j])
            cos_sim = round(cos_sim, 3)
            if (i != j):
                context_indicator_similarity[i][j] = cos_sim
                context_indicator_similarity[j][i] = cos_sim

    context_indicator_similarity_idx = np.zeros([len_list_of_freq_auth_CI, 3], dtype=int)
    for i in range(0,len(context_indicator_similarity)):
        context_indicator_similarity_idx[i] = np.argsort(context_indicator_similarity[i])[-3:]

    SSP_Author_List = []
    for i in range(0,len(list_of_freq_auth)):
        temp_author_list_ssp = []
        for j in range(0,len(context_indicator_similarity_idx[i])):
           temp_author_list_ssp.append(list_of_freq_auth[context_indicator_similarity_idx[i][j]])
        SSP_Author_List.append(temp_author_list_ssp)

    SSP_Title_List = []

    CI_list_title = list(freqauth_title_dict_closed.values())
    CI_list1 = []
    for i in (CI_list_title):
        temp_list3 = []
        for j in i:
            CI_str = ' '.join(j[1])
            temp_list3.append(CI_str)
        CI_list1.append(list(set(temp_list3)))

    for i in range(0,len(CI_list1)):
        temp_title_list_ssp = []
        for j in range(0,len(context_indicator_similarity_idx[i])):
            ssp_str = CI_list1[context_indicator_similarity_idx[i][j]]
            temp_title_list_ssp.extend(ssp_str)
        SSP_Title_List.append(list(set(temp_title_list_ssp)))

    # Write the output to a CSV file
    # a) list_of_freq_auth
    # b) list_of_freq_auth_CI / freqauth_context_in_weights
    # c) freq_auth_transactions
    # d) SSP_Author_List
    # e) SSP_Title_List
    #for i in range(0, frequent_author_list):
    #print(len(SSP_Title_List))
    #print(SSP_Title_List)
    titles_list_with_weight = list(freq_auth_transactions.values())
    # Joining SSP authors
    SSP_authors_formatted = []
    for i in range(0,len(SSP_Author_List)):
        temp_list = []
        for j in range(0, len(SSP_Author_List[i])):
            authors = '&'.join(list(SSP_Author_List[i][j]))
            temp_list.append(authors)
        SSP_authors_formatted.append(temp_list)

    with open("./output.txt", 'w', encoding="utf-8") as f:
        f.write('Pattern' + '||' + 'Context Indicator' + '||' + 'Transaction 1' + '||' +
                'Transaction 2' + '||'  + 'Transaction 3' + '||'  + 'Transaction 4' + '||' + 'SSP - Co-Author' +
                '||' + 'SSP - Title' + '\n')
        for i in range(0, len(list_of_freq_auth)):
            authors = ' '.join(list(list_of_freq_auth[i]))
            f.write(authors + '||')
            Context_indicators = '; '.join(list_of_freq_auth_CI[i])
            f.write(Context_indicators + '||')
            for j in (titles_list_with_weight[i].keys()):
                f.write(j + '||')
            ssp_authors = '; '.join(SSP_authors_formatted[i])
            f.write(ssp_authors + '||')
            ssp_titles = '; '.join(SSP_Title_List[i])
            f.write(ssp_titles )
            f.write('\n')
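# `cos_similarity` and `stopwordlist` are used above but defined elsewhere in the
# module. A minimal sketch of a token-overlap cosine similarity with the same call
# signature (two lists of strings), given as an assumption rather than the original:
import math
from collections import Counter

def cos_similarity(tokens_a, tokens_b):
    # cosine similarity between two bags of tokens
    ca, cb = Counter(tokens_a), Counter(tokens_b)
    common = set(ca) & set(cb)
    num = sum(ca[t] * cb[t] for t in common)
    den = (math.sqrt(sum(v * v for v in ca.values())) *
           math.sqrt(sum(v * v for v in cb.values())))
    return num / den if den else 0.0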
def test_inverse_transform():
    oht = TransactionEncoder()
    oht.fit(dataset)
    np.testing.assert_array_equal(np.array(data_sorted),
                                  np.array(oht.inverse_transform(expect)))
def test_fit_transform():
    oht = TransactionEncoder()
    trans = oht.fit_transform(dataset)
    np.testing.assert_array_equal(expect, trans)
def test_transform_sparse():
    oht = TransactionEncoder()
    oht.fit(dataset)
    trans = oht.transform(dataset, sparse=True)
    assert(isinstance(trans, csr_matrix))
    np.testing.assert_array_equal(expect, trans.todense())
Example #15
plt.title("Support level of 9%")
data = pd.read_csv("BreadBasket_DMS.csv")

data = data.set_index(['Item'])
filtered = data.drop(['NONE'])
data = data.reset_index()
filtered = filtered.reset_index()
transaction_list = []

# For loop to create a list of the unique transactions throughout the dataset:
for i in filtered['Transaction'].unique():
    tlist = list(set(filtered[filtered['Transaction'] == i]['Item']))
    if len(tlist) > 0:
        transaction_list.append(tlist)

te = TransactionEncoder()
te_ary = te.fit(transaction_list).transform(transaction_list)
df2 = pd.DataFrame(te_ary, columns=te.columns_)

frequent_itemsets = apriori(df2, min_support=0.09, use_colnames=True)
#take minimum threshold
rules = association_rules(frequent_itemsets,
                          metric='confidence',
                          min_threshold=0.0001)

rules.sort_values('confidence', ascending=False)

#now categorise every rule with different range of confidence.
rules['support'] = rules['support'] * 100
rules['confidence'] = rules['confidence'] * 100
rules2 = rules[['antecedents', 'consequents', 'support', 'confidence']]
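# The comment above says every rule is categorised by confidence range, but the snippet
# stops before doing so. A hedged sketch using pd.cut with arbitrary bin edges
# (confidence was scaled to 0-100 above):
rules2 = rules2.copy()
rules2['confidence_band'] = pd.cut(rules2['confidence'],
                                   bins=[0, 25, 50, 75, 100],
                                   labels=['low', 'medium', 'high', 'very high'])
print(rules2['confidence_band'].value_counts())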
    for num in l: 
        if num not in final_list: 
            final_list.append(num) 
    
    records_new.append(final_list)
    




from mlxtend.frequent_patterns import apriori,association_rules
from mlxtend.preprocessing import TransactionEncoder


  
te=TransactionEncoder()
te_data=te.fit(records_new).transform(records_new)
data_x=pd.DataFrame(te_data,columns=te.columns_)
print(data_x.head())

frequent_items= apriori(data_x, use_colnames=True, min_support=0.0045)

rules = association_rules(frequent_items, metric="lift", min_threshold=1)



rules["antecedent_len"] = rules["antecedents"].apply(lambda x: len(x))
rules["consequents_len"] = rules["consequents"].apply(lambda x: len(x))


Dataframe = pd.read_csv(
    "D:\\Machine Learning_Algoritms\\Apriori\\GroceryStoreDataSet.csv",
    encoding='latin1',
    names=['products'],
    header=None)
num_records = len(Dataframe)
print(num_records)

transactions = []
for i in range(0, num_records):
    transactions.append([str(Dataframe.values[i, j]) for j in range(0, 3)])

Dataframe = list(Dataframe["products"].apply(lambda x: x.split(',')))

from mlxtend.preprocessing import TransactionEncoder
te = TransactionEncoder()
te_data = te.fit_transform(Dataframe)

Dataframe = pd.DataFrame(te_data, columns=te.columns_)

count = Dataframe.loc[:, :].sum()
reverse_count = count.sort_values(0, ascending=False).head(11)
reverse_count = reverse_count.to_frame()
reverse_count = reverse_count.reset_index()
#reverse_count = reverse_count.rename(columns = {"index": "items", 0: "count"})

plt.style.available

plt.rcParams['figure.figsize'] = (10, 6)
plt.style.use('dark_background')
ax = reverse_count.plot.barh()
Example #18
def transaction_encoding(dataset):
    oht = TransactionEncoder()
    oht_ary = oht.fit(dataset).transform(dataset)
    new_df = pd.DataFrame(oht_ary, columns=oht.columns_)
    return new_df
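# A hedged usage sketch for the helper above; the sample transactions are assumptions:
sample_transactions = [['milk', 'bread'], ['bread', 'butter'], ['milk', 'butter', 'bread']]
print(transaction_encoding(sample_transactions))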
Example #19
    def setUp(self):
        database = [['a'], ['b'], ['c', 'd'], ['e']]
        te = TransactionEncoder()
        te_ary = te.fit(database).transform(database)

        self.df = pd.DataFrame(te_ary, columns=te.columns_)
Example #20
# convert the products into list format, one list per transaction

transactions = list(items["mehsul_ad"].transform(lambda x: x.split(";")))

# In[12]:

transactions[0]

# In[13]:

# pivot the products into basket format: each row (basket) shows which products are present or absent

from mlxtend.preprocessing import TransactionEncoder

tr_enc = TransactionEncoder()
basket = pd.DataFrame(tr_enc.fit_transform(transactions),
                      columns=tr_enc.columns_)

# In[14]:

basket

# In[15]:

# functions needed for basket analysis

from mlxtend.frequent_patterns import apriori, association_rules

# In[16]:
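# In[17]:

# The imports above are not exercised in the snippet; a minimal sketch of the basket
# analysis itself, with an assumed support threshold:
frequent_itemsets = apriori(basket, min_support=0.05, use_colnames=True)
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1)
rules.head()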
Example #21
# In[6]:

player_combo = []  #list of lists match players and result
for i in range(0, 105):
    rowItem = []
    for j in range(0, 13):
        rowItem.append(str(match_data.values[i, j]))
    player_combo.append(rowItem)

# In[7]:

# player_combo

# In[8]:

te = TransactionEncoder()
te_ary = te.fit(player_combo).transform(player_combo)
match_df_freq = pd.DataFrame(te_ary, columns=te.columns_)

# In[9]:

match_df_freq

# In[10]:

match_sup = apriori(match_df_freq, min_support=0.1, use_colnames=True)
match_sup

# In[11]:

rules = association_rules(match_sup, metric="lift", min_threshold=1)
Example #22
0
all_data = pd.read_csv('dataset_group.csv', header=None)
# The file has no header row, so the header parameter is set to None.
# The customer id of interest is stored in the column named 1.
# The name of the purchased item is stored in the column named 2.

unique_id = list(set(all_data[1]))
# print(len(unique_id))  # print the number of ids

items = list(set(all_data[2]))
# print(len(items))  # print the number of items

dataset = [[elem for elem in all_data[all_data[1] == id][2] if elem in
            items] for id in unique_id]

te = TransactionEncoder()
te_ary = te.fit(dataset).transform(dataset)
df = pd.DataFrame(te_ary, columns=te.columns_)

# print(df)

results = apriori(df, min_support=0.3, use_colnames=True)
results['length'] = results['itemsets'].apply(lambda x: len(x))  # add the itemset size
# print(results)

results = apriori(df, min_support=0.3, use_colnames=True, max_len=1)
# print(results)

results = apriori(df, min_support=0.3, use_colnames=True)
results['length'] = results['itemsets'].apply(lambda x: len(x))
results = results[results['length'] == 2]
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

dataset = pd.read_csv('Groceries.csv')

listset = dataset.groupby(['Customer'])['Item'].apply(list).values.tolist()
print(f"2a.the number of customer is {len(listset)}")



from mlxtend.preprocessing import TransactionEncoder
te = TransactionEncoder()
te_ary = te.fit(listset).transform(listset)
ItemIndicator = pd.DataFrame(te_ary, columns=te.columns_)
print(f"2b.the number of unique items in the market basket across all customers is {ItemIndicator.shape[1]}")

Items_count_list=[]
for i in listset:
    Item_count = len(i)
    Items_count_list.append(Item_count)
    
Customers_index_list=[]
for i in range(len(listset)):
    Customers_index_list.append(i+1)
    

P3,median,P1 = np.percentile(Items_count_list,[75,50,25])
iqr = P3-P1
h = 2*iqr*len(Items_count_list)**(-1/3.0)
max_value = max(Items_count_list)
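# The Freedman-Diaconis width h computed above suggests a bin count; a sketch of the
# corresponding histogram (the plot itself is not part of the original snippet):
min_value = min(Items_count_list)
n_bins = int(np.ceil((max_value - min_value) / h)) if h > 0 else 10
plt.hist(Items_count_list, bins=n_bins, edgecolor='k')
plt.xlabel('Number of items per customer')
plt.ylabel('Number of customers')
plt.show()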
            removing_typo(content)
            analyzing_list = content
    return analyzing_list


lst2 = []
for content in contents_list:
    lst = listmaker(content)
    if lst != []:
        lst2.append(lst)

# Association Analysis
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder

te = TransactionEncoder()
te_ary = te.fit(lst2).transform(lst2)
df = pd.DataFrame(te_ary, columns=te.columns_)

from mlxtend.frequent_patterns import apriori
frequent_itemsets = apriori(df, min_support=0.05, use_colnames=True)

from mlxtend.frequent_patterns import association_rules

rules = association_rules(frequent_itemsets,
                          metric="lift",
                          min_threshold=1,
                          support_only=False)
rules['length'] = rules['antecedents'].apply(lambda x: len(x))
rules['length2'] = rules['consequents'].apply(lambda x: len(x))
rules = rules[(rules['length'] == 1) & (rules['length2'] == 1)]
Example #25
import numpy as numpy
import pandas as panda
import matplotlib.pyplot as pyplot
from rpy2.robjects import r, pandas2ri
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

file_data_set = "/Users/mendesbarreto/Git/machine-learing-course/introdution-to-python/data/titanic.raw.rdata"
reference_to_data_set_from_r_data = r['load'](file_data_set)
dataset = r['titanic.raw']
panda_data_frame = pandas2ri.ri2py_dataframe(r['titanic.raw'])

records = []
for colum in range(0, 2201):
    records.append([str(panda_data_frame.values[colum, row]) for row in range(0, 4)])

transaction_encoder = TransactionEncoder()
transaction_encoder_array = transaction_encoder.fit(records).transform(records)
data_frame = panda.DataFrame(transaction_encoder_array, columns=transaction_encoder.columns_)
frequent_itemsets = apriori(data_frame, min_support=0.3, use_colnames=True)

rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.7)

rules.plot()
pyplot.show()
#  Test libs
# file_data_set = "/Users/mendesbarreto/Git/machine-learing-course/introdution-to-python/data/titanic.raw.rdata"
# reference_to_data_set_from_r_data = r['load'](file_data_set)
# dataset = r['titanic.raw']
# panda_data_frame = pandas2ri.ri2py_dataframe(r['titanic.raw'])
Example #26
#-----------------------------
#libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
import time
import logging
pd.set_option('display.max_columns',None)
#-----
transactions = [['I1','I2','I5'],['I2','I4'],['I2','I3'] ,['I1','I2','I4'],['I1','I3'], ['I2','I3'],['I1','I3'], ['I1','I2','I3','I5'],['I1','I2','I3']]
transactions
#----
te = TransactionEncoder()
te_ary = te.fit(transactions).transform(transactions)
te_ary
te.columns_
df = pd.DataFrame(te_ary, columns=te.columns_)
df
# boolean matrix of transactions: True/False indicates each item's presence in each transaction
df.shape
# get back the original transactions
orgtrans1 = te_ary[:]
te.inverse_transform(orgtrans1)

#%%% #frequent itemsets - Most Imp Step
support_threshold = 0.01
frequent_itemsets = apriori(df, min_support= support_threshold, use_colnames = True)
frequent_itemsets
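#%%% hedged next step (not in the original snippet): derive rules from the itemsets;
#    the confidence threshold below is an illustrative assumption
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.5)
rules.sort_values('lift', ascending=False).head()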
df_asi_ent = df_asi_ent.drop(columns=['opportunity_id', 'created_at', 'name_region',
                                      'opp_background_req', 'opp_language_req', 'opp_skill_req',
                                      'opp_background_pref', 'opp_language_pref', 'opp_skill_pref',
                                      'programme_id'])



from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori 
from mlxtend.frequent_patterns import association_rules
data = df_asi_ent

records = []  
for i in range(data.shape[0]):  
    records.append([str(data.values[i,j]) for j in range(data.shape[1])])

te = TransactionEncoder()
te_ary = te.fit(records).transform(records)
df = pd.DataFrame(te_ary, columns=te.columns_)
frequent_itemsets = apriori(df, min_support = 0.009, use_colnames=True)

rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1.2)
result = rules[(rules['consequents'] == {'rejected'}) & (rules['confidence'] > 0.5)]








import json
import pandas as pd
import youtube_process
from mlxtend.preprocessing import TransactionEncoder
file_US = "USvideos.csv"
US_data = pd.read_csv(file_US, keep_default_na=False, low_memory=False)
US_data
df = US_data[['category_id','views']]
df
with open("US_category_id.json", 'r') as f:
    content = json.load(f)
category_map = {}
for i in content['items']:
    category_map[int(i['id'])] = i['snippet']['title']
category_map
t = df['category_id'].map(category_map)
df = pd.concat([df,t],axis=1)
df.columns=['category_id','views','category']
grade = []
for i in df['views'].values:
    views_map = lambda x:{x>=4194399:'A',1823157<=x<4194399:'B',681861<=x<1823157:'C',
                          242329<=x<681861:'D',549<=x<242329:'E'}
    grade.append(views_map(i)[True])
df['views_grade'] = grade
df = df.drop(['category_id', 'views'], axis = 1)
df
def deal(data):
    return data.dropna().tolist()
df_arr = df.apply(deal,axis=1).tolist()  # convert to a list of transactions
TE = TransactionEncoder()  # instantiate the encoder
df_tf = TE.fit_transform(df_arr)
df = pd.DataFrame(df_tf,columns=TE.columns_)
df
'''
Created on 10.10.2019

@author: Tim
'''

from mlxtend.frequent_patterns import apriori as mlxtend_apriori
from mlxtend.frequent_patterns import fpgrowth as mlxtend_fpgrowth
from mlxtend.frequent_patterns import association_rules as mlxtend_association_rules
from mlxtend.preprocessing import TransactionEncoder

import spn_apriori.itemsets_utils

transaction_encoder = TransactionEncoder()

import pandas as pd
import numpy as np

from data import real_data, synthetic_data

import spn_apriori.apriori_evaluation as evaluation


def test_spn_apriori_synthetic():
    df, _, _ = synthetic_data.generate_simple_transactions(seed=123)


def test_spn_apriori_real():
    df, value_dict, parametric_types = real_data.get_adult_41_items()
    eval = spn_apriori.itemsets_utils.cross_eval(df,
                                                 'UCI', [0.01, 0.4],
Example #30
def calculate(request):
    if request.method == 'POST':
        """
        Get data inject from html
        """
        # upload and get file
        uploaded_file = request.FILES['file_name']
        fs = FileSystemStorage()
        name = fs.save(uploaded_file.name, uploaded_file)

        url = fs.url(name)
        # get minsupp and minconf
        minsupp = request.POST['minsupp']
        minconf = request.POST['minconf']
        # get algorithm
        selectedAlgorithm = request.POST['selectedAlgorithm']

        BASE_DIR = dirname(os.path.dirname(os.path.abspath(__file__)))
        url_split = url.split("/")
        for value in url_split:
            if value == '':
                url_split.remove(value)
        for value in url_split:
            BASE_DIR = BASE_DIR + "\\" + str(value)
        # read dataset
        if name.find(".csv") != -1:
            store_data = pd.read_csv(BASE_DIR, header=None)
        if name.find(".xlsx") != -1:
            store_data = pd.read_excel(BASE_DIR, header=None)

        # change data conform algorithm
        records = []
        for i in range(0, len(store_data)):

            records.append([
                str(store_data.values[i, j])
                for j in range(0, len(store_data.columns))
            ])

        records_withoutNan = []

        for i in range(0, len(records)):
            new = []
            for j in range(0, len(records[i])):
                if str(records[i][j]) != "nan":
                    new.append(str(records[i][j]))
            records_withoutNan.append(new)

        te = TransactionEncoder()
        te_ary = te.fit(records_withoutNan).transform(records_withoutNan)
        df = pd.DataFrame(te_ary, columns=te.columns_)
        """
        function APRIORI algorithm
        return alist rules
        in rules have (super rules, sup rules and confidence every a rules)
        """
        def apriori_find_association_rules(dataset, minsup, minconf):
            patterns_ap = apriori(df,
                                  min_support=float(minsupp) / 100,
                                  use_colnames=True)
            rules_ap = association_rules(patterns_ap,
                                         metric="confidence",
                                         min_threshold=float(minconf) / 100)
            rules_ap_sort_descending = rules_ap.sort_values(by="lift",
                                                            ascending=False)
            return rules_ap_sort_descending

        """
        function FP-GROWTH algorithm
        return pattens, rules
        in rules have (super rules, sup rules and confidence every a rules)
        """

        def fpgrowth_find_association_rules(dataset, minsup, minconf):
            patterns_fp = fpgrowth(df,
                                   min_support=float(minsupp) / 100,
                                   use_colnames=True,
                                   verbose=0)
            rules_fp = association_rules(patterns_fp,
                                         metric="confidence",
                                         min_threshold=float(minconf) / 100)
            rules_fp_sort_descending = rules_fp.sort_values(by="lift",
                                                            ascending=False)
            return rules_fp_sort_descending

        """
        set event use Apriori or FP_Growth
        """
        if selectedAlgorithm == 'Apriori':
            """
            association_results_APRIORI: is a List Object Apriori return after calculate
            """
            association_results_APRIORI = apriori_find_association_rules(
                df, minsupp, minconf)
            rules_ap_antecedents_list = list(
                association_results_APRIORI['antecedents'])
            rules_ap_consequents_list = list(
                association_results_APRIORI['consequents'])
            rules_ap_support_list = list(
                association_results_APRIORI['support'])
            rules_ap_confidence_list = list(
                association_results_APRIORI['confidence'])
            rules_ap_lift_list = list(association_results_APRIORI['lift'])

            rules_ap_final = []
            for i in range(0, len(rules_ap_antecedents_list)):
                onerules = {}
                onerules['antecedents'] = list(rules_ap_antecedents_list[i])
                onerules['consequents'] = list(rules_ap_consequents_list[i])
                onerules['support'] = round(rules_ap_support_list[i], 3)
                onerules['confidence'] = round(rules_ap_confidence_list[i], 3)
                onerules['lift'] = round(rules_ap_lift_list[i], 3)
                rules_ap_final.append(onerules)

            return render(
                request, 'website/show_rules.html', {
                    'selectedAlgorithm': selectedAlgorithm,
                    'lenrules': len(rules_ap_final),
                    'lendata': len(df),
                    'association_rules': rules_ap_final
                })
        elif selectedAlgorithm == 'FP-Growth':

            association_results_FPGROWTH = fpgrowth_find_association_rules(
                df, minsupp, minconf)

            rules_fp_antecedents_list = list(
                association_results_FPGROWTH['antecedents'])
            rules_fp_consequents_list = list(
                association_results_FPGROWTH['consequents'])
            rules_fp_support_list = list(
                association_results_FPGROWTH['support'])
            rules_fp_confidence_list = list(
                association_results_FPGROWTH['confidence'])
            rules_fp_lift_list = list(association_results_FPGROWTH['lift'])

            rules_fp_final = []
            for i in range(0, len(rules_fp_antecedents_list)):
                onerules = {}
                onerules['antecedents'] = list(rules_fp_antecedents_list[i])
                onerules['consequents'] = list(rules_fp_consequents_list[i])
                onerules['support'] = round(rules_fp_support_list[i], 3)
                onerules['confidence'] = round(rules_fp_confidence_list[i], 3)
                onerules['lift'] = round(rules_fp_lift_list[i], 3)
                rules_fp_final.append(onerules)
            return render(
                request, 'website/show_rules.html', {
                    'selectedAlgorithm': selectedAlgorithm,
                    'lenrules': len(rules_fp_final),
                    'lendata': len(df),
                    'association_rules': rules_fp_final
                })
Example #31
        try:
            config.read('../config.ini')
        except:
            print('Error, could not find config.ini')
    
    verbose = config['DEFAULT']['Verbose'].lower() == 'true'

    # load the playlists as transactions
    if verbose:
        print('Loading transactions from disk.')
    playlist_transactions, unique_songs = load_playlists(config['model']['PlaylistsDir'])

    # load the transactions into an encoder and get a pandas dataframe
    if verbose:
        print('Encoding transactions')
    te = TransactionEncoder()
    te_ary = te.fit(playlist_transactions).transform(playlist_transactions)
    playlist_df = pd.DataFrame(te_ary, columns=te.columns_)

    # get the support, confidence, and max k values for the itemsets
    min_sup     = float(config['model']['MinSup'])
    max_k       = int(config['model']['ItemsetSize'])

    # run FP Growth algo on the transactions
    if verbose:
        print('Running FP Growth')

    frequent_itemsets = fpgrowth(playlist_df, min_support=min_sup, use_colnames=True, max_len=max_k)
    frequent_itemsets['length'] = frequent_itemsets['itemsets'].apply(lambda x: len(x))
    k_frequent_itemsets = frequent_itemsets[frequent_itemsets['length'] == max_k]
Example #32
def calculate_database(request):
    if request.method == "POST":
        server = request.POST['ServerName']
        database = request.POST['DatabaseName']
        username = request.POST['Username']
        password = request.POST['Password']
        minsupp = request.POST['minsupp']
        minconf = request.POST['minconf']
        selectedAlgorithm = request.POST['selectedAlgorithm']
        conn = pyodbc.connect('Driver={ODBC Driver 13 for SQL Server};'
                              f'Server=' + server + ';'
                              f'Database=' + database + ';'
                              f'UID=' + username + ';'
                              f'PWD=' + password + ';'
                              'Mars_Connection=Yes;')
        cursor = conn.cursor()

        getSaleProduct = cursor.execute(
            'SELECT SalesOrderID,SalesOrderDetailID, [Production].[Product].[ProductID], Name FROM [Sales].[SalesOrderDetail] inner join [Production].[Product] on [Production].[Product].[ProductID] = [Sales].[SalesOrderDetail].[ProductID] order by [Sales].[SalesOrderDetail].SalesOrderID asc'
        )

        listSaleProduct = []
        for saleProduct in getSaleProduct:
            listSaleProduct.append(list(saleProduct))

        list_saleProduct_parent = []
        list_saleProduct_child = []
        temp = listSaleProduct[0][0]
        i = 0
        while i < len(listSaleProduct):
            if listSaleProduct[i][0] == temp:
                list_saleProduct_child.append(str(listSaleProduct[i][3]))
                i += 1
            else:
                list_saleProduct_parent.append(list_saleProduct_child)
                list_saleProduct_child = []
                temp = listSaleProduct[i][0]
                i = i

        te = TransactionEncoder()
        te_ary = te.fit(list_saleProduct_parent).transform(
            list_saleProduct_parent)
        df = pd.DataFrame(te_ary, columns=te.columns_)

        def apriori_find_association_rules(dataset, minsup, minconf):
            patterns_ap = apriori(df,
                                  min_support=float(minsupp) / 100,
                                  use_colnames=True)
            rules_ap = association_rules(patterns_ap,
                                         metric="confidence",
                                         min_threshold=float(minconf) / 100)
            rules_ap_sort_descending = rules_ap.sort_values(by="lift",
                                                            ascending=False)
            return rules_ap_sort_descending

        def fpgrowth_find_association_rules(dataset, minsup, minconf):
            patterns_fp = fpgrowth(df,
                                   min_support=float(minsupp) / 100,
                                   use_colnames=True,
                                   verbose=0)
            rules_fp = association_rules(patterns_fp,
                                         metric="confidence",
                                         min_threshold=float(minconf) / 100)
            rules_fp_sort_descending = rules_fp.sort_values(by="lift",
                                                            ascending=False)
            return rules_fp_sort_descending

        """
        set event use Apriori or FP_Growth
        """
        if selectedAlgorithm == 'Apriori':
            """
            association_results_APRIORI: is a List Object Apriori return after calculate
            """
            association_results_APRIORI = apriori_find_association_rules(
                df, minsupp, minconf)
            # print(association_results_APRIORI)
            rules_ap_antecedents_list = list(
                association_results_APRIORI['antecedents'])
            rules_ap_consequents_list = list(
                association_results_APRIORI['consequents'])
            rules_ap_support_list = list(
                association_results_APRIORI['support'])
            rules_ap_confidence_list = list(
                association_results_APRIORI['confidence'])
            rules_ap_lift_list = list(association_results_APRIORI['lift'])

            rules_ap_final = []
            for i in range(0, len(rules_ap_antecedents_list)):
                onerules = {}
                onerules['antecedents'] = list(rules_ap_antecedents_list[i])
                onerules['consequents'] = list(rules_ap_consequents_list[i])
                onerules['support'] = round(rules_ap_support_list[i], 3)
                onerules['confidence'] = round(rules_ap_confidence_list[i], 3)
                onerules['lift'] = round(rules_ap_lift_list[i], 3)
                rules_ap_final.append(onerules)

            return render(
                request, 'website/show_rules.html', {
                    'selectedAlgorithm': selectedAlgorithm,
                    'lenrules': len(rules_ap_final),
                    'lendata': len(df),
                    'association_rules': rules_ap_final
                })
        elif selectedAlgorithm == 'FP-Growth':

            association_results_FPGROWTH = fpgrowth_find_association_rules(
                df, minsupp, minconf)

            rules_fp_antecedents_list = list(
                association_results_FPGROWTH['antecedents'])
            rules_fp_consequents_list = list(
                association_results_FPGROWTH['consequents'])
            rules_fp_support_list = list(
                association_results_FPGROWTH['support'])
            rules_fp_confidence_list = list(
                association_results_FPGROWTH['confidence'])
            rules_fp_lift_list = list(association_results_FPGROWTH['lift'])

            rules_fp_final = []
            for i in range(0, len(rules_fp_antecedents_list)):
                onerules = {}
                onerules['antecedents'] = list(rules_fp_antecedents_list[i])
                onerules['consequents'] = list(rules_fp_consequents_list[i])
                onerules['support'] = round(rules_fp_support_list[i], 3)
                onerules['confidence'] = round(rules_fp_confidence_list[i], 3)
                onerules['lift'] = round(rules_fp_lift_list[i], 3)
                rules_fp_final.append(onerules)
            return render(
                request, 'website/show_rules.html', {
                    'selectedAlgorithm': selectedAlgorithm,
                    'lenrules': len(rules_fp_final),
                    'lendata': len(df),
                    'association_rules': rules_fp_final
                })
Example #33
# Import Data
import pandas
import numpy as np

grocer_cust_item = pandas.read_csv(
    'C:\\Users\\soura\\OneDrive\\Desktop\\ML\\ML Spring20_Assignemnt\\HW2\\Groceries.csv',
    delimiter=',')

# Convert the Sale Receipt data to the Item List format
groc_ListItem = grocer_cust_item.groupby(
    ['Customer'])['Item'].apply(list).values.tolist()

# Convert the Item List format to the Item Indicator format
from mlxtend.preprocessing import TransactionEncoder

te = TransactionEncoder()
te_ary = te.fit(groc_ListItem).transform(groc_ListItem)
groc_ItemIndicator = pandas.DataFrame(te_ary, columns=te.columns_)

cost_items = []
cost_item_count = []
for item in groc_ListItem:
    uniq_items = set(item)
    cost_item_count.append(len(uniq_items))
    cost_items.append(list(uniq_items))

uniq_item_df = pandas.DataFrame(cost_items)

import matplotlib.pyplot as plt

plt.hist(cost_item_count, edgecolor='k')
    
    return dfRules


if __name__ == '__main__':    
    dataSet = [[1, 3, 4], [2, 3, 5], [1, 2, 3, 5], [2, 5]]
    dataSet = [['菜品2', '菜品4', '菜品3'], ['菜品1', '菜品5'], ['菜品1', '菜品4'],
               ['菜品2', '菜品1', '菜品4', '菜品5'], ['菜品2', '菜品1'],
               ['菜品1', '菜品4'], ['菜品2', '菜品1'],
               ['菜品2', '菜品1', '菜品4', '菜品3'], ['菜品2', '菜品1', '菜品4'],
               ['菜品2', '菜品4', '菜品3']]
    
    minSup = 0.3
    minConf = 0.5
    
    Rules = Apriori(dataSet, minSup, minConf)    
    Rulesdf = Rules2df(Rules)   
    
    
    # Use mlxtend's apriori algorithm
    from mlxtend.preprocessing import TransactionEncoder
    from mlxtend.frequent_patterns import apriori
    from mlxtend.frequent_patterns import association_rules
    
    tranEncoder = TransactionEncoder()
    te_ary = tranEncoder.fit(dataSet).transform(dataSet)
    df = pd.DataFrame(te_ary, columns=tranEncoder.columns_)
    FreLk_all = apriori(df, min_support=minSup, use_colnames=True)
    Rules_mlx = association_rules(FreLk_all, min_threshold=minConf)
    
    if key in dict.keys():
        return True
    else:
        return False


start = time.time()
dataset = []  # 2-D array for storing the sequences
with open('out.txt',
          'r') as fobj:  # Importing values from txt file containing dataset
    for line in fobj:
        numbers = [int(num)
                   for num in line.split()]  # Single row of the 2-D array
        dataset.append(numbers)

t = TransactionEncoder()
t_ary = t.fit(dataset).transform(dataset)  # converting to a table of True/False
df = pd.DataFrame(
    t_ary, columns=t.columns_
)  # Converting t_ary table to suitable form for giving input to apriori

frequent_set = fpgrowth(df, min_support=0.015, use_colnames=True)

frequent_set['length'] = frequent_set['itemsets'].apply(lambda x: len(x))
print(frequent_set)

end = time.time()
print(end - start)

# For generating length v/s Count plot
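# A hedged sketch of that length v/s count plot for the fpgrowth output above
# (the matplotlib import and styling are assumptions, not part of the original snippet):
import matplotlib.pyplot as plt
frequent_set['length'].value_counts().sort_index().plot(kind='bar')
plt.xlabel('Itemset length')
plt.ylabel('Count')
plt.show()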
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules


# In[2]:


fin = open("T10I4D100K.txt", "r")
dataset = [[int(n) for n in line.split()] for line in fin]


# In[3]:


te = TransactionEncoder()
te_ary = te.fit(dataset).transform(dataset, sparse=True)
sparse_df = pd.SparseDataFrame(te_ary, columns=te.columns_, default_fill_value=False)
sparse_df


# In[4]:


frequent_itemsets5 = apriori(sparse_df, min_support=0.5, use_colnames=True)
frequent_itemsets5


# In[5]: