def test_cloning():
    oht = TransactionEncoder()
    oht.fit(dataset)
    oht2 = clone(oht)
    msg = "'TransactionEncoder' object has no attribute 'columns_'"
    assert_raises(AttributeError, msg, oht2.transform, dataset)
    trans = oht2.fit_transform(dataset)
    np.testing.assert_array_equal(expect, trans)
dataset=[["Bread","Milk","Beer"], ["Bread","Diapers","Egg"], ["Milk","Diapers","Beer","Cola"], ["Bread","Milk","Diapers","Beer"], ["Bread","Milk","Cola"]] import pandas as pd from mlxtend.preprocessing import TransactionEncoder from mlxtend.frequent_patterns import apriori te=TransactionEncoder() te=te.fit(dataset) te_ary=te.transform(dataset) df=pd.DataFrame(te_ary,columns=te.columns_) frequent_itemsets=apriori(df,min_support=0.6,use_colnames=True) #to apply association rules from mlxtend.frequent_patterns import association_rules rules=association_rules(frequent_itemsets,metric='support',min_threshold=0.6) cartItem=[] init=input("1.Beer\t2.Bread\t3.Cola\t4.Diapers 5.Egg 6.Milk.\n What do you wish to buy? ") init=init.capitalize() cartItem=cartItem+[init] permission='y' checklist=['Beer','Bread','Cola','Diapers','Egg','Milk']
import numpy as np
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

dff = pd.read_excel(r'C', na_values='?', sheet_name='data')  # load the data; it contains empty cells
print(dff.head())

ddd = dff.pivot_table(values='Сумма', columns='Значение', index='ID')
ddd[np.isnan(ddd)] = 0
ddd[ddd > 0] = 1
print(ddd.head())

def transaction_list(df):  # helper that builds the list of transactions
    list_external = []
    for i in range(df.shape[0]):
        list_internal = []
        data = df.iloc[i]
        index = data[data > 0]
        for element in index.index:
            list_internal.append(element)
        list_external.append(list_internal)
    return list_external

transactions = transaction_list(ddd)  # build a transaction list for each client
print(transactions[0])  # list of stores visited by client number [0]

te = TransactionEncoder()
te_ary = te.fit(transactions).transform(transactions)
df = pd.DataFrame(te_ary, columns=te.columns_)
# Build frequent itemsets with a minimum support level
# (the share of transactions in the dataset that contain the itemset).
frequent_itemsets = apriori(df, min_support=0.1, use_colnames=True)
print(frequent_itemsets)

# Generate association rules with a confidence threshold
# (how often the rule holds across the dataset) of 0.1 (very low).
association_rules(frequent_itemsets, metric="confidence", min_threshold=0.1)
# Generate association rules by lift (how strongly the items depend on each other).
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=0.1)

writer = pd.ExcelWriter(r'')  # write the rules to Excel
rules.to_excel(writer, sheet_name='test')
writer.save()
print("all good, dude")
if __name__ == '__main__':
    ML_DS_PATH = '../dataset/ml-out'
    MATRIX_PATH = '../temp'

    ratings_old = pd.read_csv(ML_DS_PATH + '/ratings_old.csv')
    s_rate_old = pd.read_csv(MATRIX_PATH + '/s_rate_old.csv')
    s_rate_old = s_rate_old.set_index('MovieID')
    s_rate_old.rename(columns=int, inplace=True)

    with open(MATRIX_PATH + '/cluster_old.pickle', 'rb') as f:
        cluster = pickle.load(f)

    model_start = time.time()  # timing checkpoint
    te = TransactionEncoder()
    te_ary = te.fit(cluster).transform(cluster)
    df = pd.DataFrame(te_ary, columns=te.columns_)
    frequent_itemsets = apriori(df, min_support=0.08, use_colnames=True)
    ar = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.2)
    model_end = time.time()  # timing checkpoint
    print('Modeling Time: %s' % (model_end - model_start))  # timing checkpoint

    recommend_list_example = recommend_apriori(ratings_old, ar, 5)
    print(recommend_list_example)

    # Evaluation: Precision, Recall, Coverage, and Diversity
    s_rate_new = pd.read_csv(MATRIX_PATH + '/s_rate_new.csv')
    s_rate_new = s_rate_new.set_index('MovieID')
    s_rate_new.rename(columns=int, inplace=True)
data = pd.read_csv(input_clause, sep='\t', na_filter=False)
# data = data.head(500)
dataset = []
for ind, row in data.iterrows():
    cl = row['Clause'].split(';')[:-1]
    cl = [c.strip() for c in cl]
    dataset.append(cl)

'''## one-hot encoding
te = TransactionEncoder()
te_ary = te.fit(dataset).transform(dataset)  ## one-hot encoding
df = pd.DataFrame(te_ary, columns=te.columns_)'''

## sparse encoding
te = TransactionEncoder()
oht_ary = te.fit(dataset).transform(dataset, sparse=True)
# Note: pd.SparseDataFrame was removed in pandas 1.0;
# pd.DataFrame.sparse.from_spmatrix is the modern equivalent.
sparse_df = pd.SparseDataFrame(oht_ary, columns=te.columns_, default_fill_value=False)
# print(sparse_df)

frequent_itemsets = apriori(sparse_df, min_support=0.00002, use_colnames=True)
frequent_itemsets = frequent_itemsets.sort_values(by='support', ascending=False)
print(frequent_itemsets)
frequent_itemsets['Word_clause'] = ''

input_features = fp + 'feature_details' + file_date + '.txt'
df_features = pd.read_csv(input_features, sep='\t', na_filter=False)
cur.close()
conn.close()

# Set column and row size in output
desired_width = 320
pd.set_option('display.width', desired_width)
np.set_printoptions(linewidth=desired_width)
pd.set_option('display.max_columns', 10)

dataset = pd.DataFrame(transaction_list)

# Changes need to be made to the dataset to apply the FP-growth algorithm:
# convert the data frame into a list of lists.
records = []
for i in range(0, 9835):
    records.append([str(dataset.values[i, j]) for j in range(0, 20)])

# "Clean" the dataset by using TransactionEncoder and dropping the 'None' column
TE = TransactionEncoder()
array = TE.fit(records).transform(records)
transf_df = pd.DataFrame(array, columns=TE.columns_)
cleanDataset = transf_df.drop(['None'], axis=1)

# Use association rules to mine the dataset
assocRules = fpgrowth(cleanDataset, min_support=0.05, use_colnames=True)
rules = association_rules(assocRules, metric='lift', min_threshold=1)
print(rules)

# Print time taken for execution
print(datetime.now() - startTime)
sqlContext = HiveContext(sc)
df = sc.textFile('hdfs:/user/capstone/dataset.csv').map(
    lambda line: line.split(","))
header = df.first()
# Get the types of the header fields
fields = [StructField(field_name, StringType(), True) for field_name in header]
schema = StructType(fields)
filter_data = df.filter(lambda row: row != header)
SelectDf = sqlContext.createDataFrame(filter_data, schema=schema)
SelectDf.registerTempTable("transactions")
DescriptionGrp = sqlContext.sql(
    "SELECT distinct InvoiceNo,Description FROM transactions group by InvoiceNo,Description"
)
transactions = DescriptionGrp.groupBy("InvoiceNo").agg(
    collect_list("Description").alias("desc")).rdd.map(lambda x: x.desc)
print(transactions.take(5))
transactionsDF = sqlContext.createDataFrame(transactions)
print(transactionsDF.take(5))

te = TransactionEncoder()
# Note: mlxtend's TransactionEncoder expects a plain Python list of transactions,
# so the Spark data would normally be collected (e.g. transactions.collect())
# before being passed to fit/transform.
te_ary = te.fit(transactionsDF).transform(transactionsDF)
transactionrecord = pd.DataFrame(te_ary, columns=te.columns_)
print(transactionrecord)

# association_rules_mba = apriori(transactionrecord, min_support=0.002, min_confidence=0.002, min_lift=1.2, min_length=2)
# transactionsparaRDD = sc.parallelize(association_rules_mba)
rulesDF = apriori(transactionrecord, min_support=0.6, verbose=1, low_memory=True)
print(rulesDF)
def test_fit():
    oht = TransactionEncoder()
    oht.fit(dataset)
    assert oht.columns_ == ['Apple', 'Bananas', 'Beer', 'Chicken', 'Milk', 'Rice']
""" import csv dict = dkey w = csv.writer(open("output.csv", "w")) for key, val in dict.items(): w.writerow([key, val]) """ # To create a list of lists from the dictionary values i=10002 while i in range(10002,42580): dkey[i]=list(map(str,dkey[i])) we=list(dkey.values()) #Fitting the association rule learning model from mlxtend.preprocessing import TransactionEncoder te = TransactionEncoder() dat=we[1:50] te_ary = te.fit_transform(dat,sparse=False) df = pd.DataFrame(te_ary, columns=te.columns_) from mlxtend.frequent_patterns import apriori frequent_itemsets = apriori(df, min_support=0.1, use_colnames=True) print (frequent_itemsets) from mlxtend.frequent_patterns import association_rules t=association_rules(frequent_itemsets, metric="confidence", min_threshold=0.7)
from apyori import apriori
rules = apriori(symptoms, min_support=0.1, min_confidence=0.7)
results = list(rules)
for i in results:
    for j in i.ordered_statistics:
        X = j.items_base
        Y = j.items_add
        x = ', '.join([item for item in X])
        y = ', '.join([item for item in Y])
        if x != '':
            print(x + ' → ' + y)

from mlxtend.preprocessing import TransactionEncoder
TE = TransactionEncoder()
data = TE.fit_transform(symptoms)
print(data)

import pandas as pd
df = pd.DataFrame(data, columns=TE.columns_)
df.head()

from mlxtend.frequent_patterns import apriori
items = apriori(df, min_support=0.1, use_colnames=True)
print(items)
print(items[items['itemsets'].apply(lambda x: len(x)) >= 2])

from mlxtend.frequent_patterns import association_rules
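# NOTE: hypothetical continuation, not in the original snippet. The import above
# suggests the next step was rule generation from `items`; a minimal sketch,
# assuming the same 0.7 confidence threshold as the earlier apyori call:
rules_mlx = association_rules(items, metric="confidence", min_threshold=0.7)
print(rules_mlx[['antecedents', 'consequents', 'support', 'confidence']])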
def main():
    dblp_data = pd.read_csv(r'DBLP_Dataset.csv', encoding="ISO-8859-1")
    author_title = dblp_data
    dataset = author_title.to_numpy()
    list1 = dataset[:, 2].tolist()

    # Convert authors to lower case
    list2 = []
    for i in list1:
        sublist = i.lower().split()
        list2.append(sublist)

    te = TransactionEncoder()
    te_ary = te.fit(list2).transform(list2)
    df = pd.DataFrame(te_ary, columns=te.columns_)
    frequent = fpgrowth(df, min_support=0.001, use_colnames=True)
    frequent = frequent[frequent['itemsets'].str.len() > 1]

    freqauth_list = []
    for i in frequent['itemsets']:
        freqauth_list.append([x for x in i])

    freqauth_dict = {}
    for i in freqauth_list:
        title_idx_sublist = []
        for idx, j in enumerate(list2):
            if set(i).issubset(j):
                title_idx_sublist.append(idx)
        freqauth_dict.update({tuple(i): title_idx_sublist})

    freqauth_title_dict = {}
    kstem = ks.PyKrovetzStemmer()
    for key, value in freqauth_dict.items():
        title_df = author_title.iloc[value]['title']
        title_sublist = list(title_df)
        title_sublists = []
        temp_list = []
        for x in title_sublist:
            tempx = re.sub(r'[.]', '', x)
            temp_list = re.sub(r'[^\x00-\x7F]+', '', tempx).lower().split()
            temp_list2 = []
            if isinstance(temp_list, list):
                temp_list2.append([kstem.stem(z) for z in temp_list if not z in stopwordlist])
                title_sublists.extend(temp_list2)
            else:
                if not temp_list in stopwordlist:
                    title_sublists.extend([kstem.stem(temp_list)])
        freqauth_title_dict.update({key: title_sublists})

    # Closed / Top-k titles of frequent authors
    freqauth_title_dict_closed = {}
    for k, v in freqauth_title_dict.items():
        ps = PrefixSpan(v)
        closed_Seq_pattern = ps.topk(5, closed=True)
        freqauth_title_dict_closed.update({k: closed_Seq_pattern})

    # To get frequent authors' context indicators
    frequentlist = freqauth_list
    cleanedList = list2
    new_author_list = []
    for i in range(0, len(frequentlist)):
        temp_author_list = []
        authorlist = list(frequentlist[i])
        found = 0
        for k in range(0, len(cleanedList)):
            for j in range(0, len(authorlist)):
                if (authorlist[j] in (cleanedList[k])):
                    found = 1
                else:
                    found = 0
                    break
            if found == 1:
                for jj in range(0, len(authorlist)):
                    if (authorlist[jj] in (cleanedList[k])):
                        cleanedList[k].remove(authorlist[jj])
                temp_author_list.append(cleanedList[k])
        new_author_list.append(temp_author_list)

    context_indicator_list = []
    for i in range(0, len(new_author_list)):
        te = TransactionEncoder()
        te_ary = te.fit(new_author_list[i]).transform(new_author_list[i])
        df = pd.DataFrame(te_ary, columns=te.columns_)
        frequent_author_list = fpgrowth(df, min_support=0.5, use_colnames=True)
        supp = frequent_author_list.support.unique()  # all unique support counts
        # Dictionary storing itemsets keyed by the same support count
        freq_dic = {}
        for i in range(len(supp)):
            inset = list(frequent_author_list.loc[frequent_author_list.support == supp[i]]['itemsets'])
            freq_dic[supp[i]] = inset
        # Dictionary storing itemsets with support count <= key
        freq_dic2 = {}
        for i in range(len(supp)):
            inset2 = list(frequent_author_list.loc[frequent_author_list.support <= supp[i]]['itemsets'])
            freq_dic2[supp[i]] = inset2
        # Find closed frequent itemsets
        close_freq = []
        for index, row in frequent_author_list.iterrows():
            isclose = True
            cli = row['itemsets']
            cls = row['support']
            checkset = freq_dic[cls]
            for i in checkset:
                if (cli != i):
                    if (frozenset.issubset(cli, i)):
                        isclose = False
                        break
            if (isclose):
                close_freq.append([x for x in (row['itemsets'])])
        context_indicator_list.append(close_freq)

    freqauth_context_ind_dict = {}
    for authpair, titlelist in freqauth_title_dict_closed.items():
        cleantitlelist = []
        for i in titlelist:
            if isinstance(i, tuple):
                if isinstance(i[1], list):
                    listtostring = ' '.join(i[1])
                    cleantitlelist.append(listtostring)
        freqauth_context_ind_dict.update({authpair: cleantitlelist})

    # Merge both titles and context-indicator authors for frequent-pattern authors
    for idx, key in enumerate(freqauth_context_ind_dict):
        newval = []
        if len(context_indicator_list[idx]) > 0:
            for i in context_indicator_list[idx]:
                if len(i) > 0:
                    tempstr = '&'.join(i)
                    newval = freqauth_context_ind_dict[key]
                    newval.append(tempstr)
                    freqauth_context_ind_dict.update({key: newval})

    # Context Indicator Weighting
    CI_list = list(freqauth_context_ind_dict.values())
    freqauth_context_in_weights = {}
    for key, value in freqauth_context_ind_dict.items():
        freq_auth_CI_list = value
        length_of_CI = len(value)
        temp_dict = {}
        for i in freq_auth_CI_list:
            count_tmp = 0
            for j in CI_list:
                if (i in (j)):
                    count_tmp += 1
            weight = round(1 - ((count_tmp - 1) / count_tmp), 2)
            if (weight > 0.1):
                temp_dict.update({i: weight})
        sorted_weights_dict = sorted(temp_dict.items(), key=lambda x: x[1], reverse=True)
        freqauth_context_in_weights.update({key: sorted_weights_dict})

    freq_auth_transactions = {}
    list_of_freq_auth = list(freqauth_context_in_weights.keys())
    for i in range(0, len(freqauth_title_dict)):
        temp_dict = {}
        title_list = freqauth_title_dict.get(list_of_freq_auth[i])
        CI_list = freqauth_context_in_weights[list_of_freq_auth[i]]
        CI_list_auth = []
        for n, c in enumerate(CI_list):
            CI_list_auth.append(c[0])
        for j in range(0, len(title_list)):
            cos_sim = cos_similarity(CI_list_auth, title_list[j])
            cos_sim = round(cos_sim, 3)
            t_title = ' '.join(freqauth_title_dict[list_of_freq_auth[i]][j])
            temp_dict.update({t_title: cos_sim})
        sorted_title_dict = sorted(temp_dict.items(), key=lambda x: x[1], reverse=True)
        t_len = len(list(temp_dict.values()))
        max_len = t_len
        if (t_len > 4):
            max_len = 4
        sorted_title_dict1 = dict(list(sorted_title_dict)[0:max_len])
        freq_auth_transactions.update({list_of_freq_auth[i]: sorted_title_dict1})

    # To find the strongest SSP - match against similarity of the context units
    freq_auth_SSPs = {}
    list_of_freq_auth = list(freqauth_context_ind_dict.keys())
    list_of_freq_auth_CI = list(freqauth_context_ind_dict.values())
    len_list_of_freq_auth_CI = len(list_of_freq_auth_CI)
    context_indicator_similarity = np.zeros([len_list_of_freq_auth_CI, len_list_of_freq_auth_CI], dtype=float)
    for i in range(0, len_list_of_freq_auth_CI):
        for j in range(0, len_list_of_freq_auth_CI):
            cos_sim = cos_similarity(list_of_freq_auth_CI[i], list_of_freq_auth_CI[j])
            cos_sim = round(cos_sim, 3)
            if (i != j):
                context_indicator_similarity[i][j] = cos_sim
                context_indicator_similarity[j][i] = cos_sim

    context_indicator_similarity_idx = np.zeros([len_list_of_freq_auth_CI, 3], dtype=int)
    for i in range(0, len(context_indicator_similarity)):
        context_indicator_similarity_idx[i] = np.argsort(context_indicator_similarity[i])[-3:]

    SSP_Author_List = []
    for i in range(0, len(list_of_freq_auth)):
        temp_author_list_ssp = []
        for j in range(0, len(context_indicator_similarity_idx[i])):
            temp_author_list_ssp.append(list_of_freq_auth[context_indicator_similarity_idx[i][j]])
        SSP_Author_List.append(temp_author_list_ssp)

    SSP_Title_List = []
    CI_list_title = list(freqauth_title_dict_closed.values())
    CI_list1 = []
    for i in (CI_list_title):
        temp_list3 = []
        for j in i:
            CI_str = ' '.join(j[1])
            temp_list3.append(CI_str)
        CI_list1.append(list(set(temp_list3)))
    for i in range(0, len(CI_list1)):
        temp_title_list_ssp = []
        for j in range(0, len(context_indicator_similarity_idx[i])):
            ssp_str = CI_list1[context_indicator_similarity_idx[i][j]]
            temp_title_list_ssp.extend(ssp_str)
        SSP_Title_List.append(list(set(temp_title_list_ssp)))

    # Write the output to a delimited text file:
    # a) list_of_freq_auth
    # b) list_of_freq_auth_CI / freqauth_context_in_weights
    # c) freq_auth_transactions
    # d) SSP_Author_List
    # e) SSP_Title_List
    # for i in range(0, frequent_author_list):
    #     print(len(SSP_Title_List))
    #     print(SSP_Title_List)
    titles_list_with_weight = list(freq_auth_transactions.values())

    # Joining SSP authors
    SSP_authors_formatted = []
    for i in range(0, len(SSP_Author_List)):
        temp_list = []
        for j in range(0, len(SSP_Author_List[i])):
            authors = '&'.join(list(SSP_Author_List[i][j]))
            temp_list.append(authors)
        SSP_authors_formatted.append(temp_list)

    with open("./output.txt", 'w', encoding="utf-8") as f:
        f.write('Pattern' + '||' + 'Context Indicator' + '||' + 'Transaction 1' + '||' +
                'Transaction 2' + '||' + 'Transaction 3' + '||' + 'Transaction 4' + '||' +
                'SSP - Co-Author' + '||' + 'SSP - Title' + '\n')
        for i in range(0, len(list_of_freq_auth)):
            authors = ' '.join(list(list_of_freq_auth[i]))
            f.write(authors + '||')
            Context_indicators = '; '.join(list_of_freq_auth_CI[i])
            f.write(Context_indicators + '||')
            for j in (titles_list_with_weight[i].keys()):
                f.write(j + '||')
            ssp_authors = '; '.join(SSP_authors_formatted[i])
            f.write(ssp_authors + '||')
            ssp_titles = '; '.join(SSP_Title_List[i])
            f.write(ssp_titles)
            f.write('\n')
def test_inverse_transform():
    oht = TransactionEncoder()
    oht.fit(dataset)
    np.testing.assert_array_equal(np.array(data_sorted),
                                  np.array(oht.inverse_transform(expect)))
def test_fit_transform():
    oht = TransactionEncoder()
    trans = oht.fit_transform(dataset)
    np.testing.assert_array_equal(expect, trans)
def test_transform_sparse():
    oht = TransactionEncoder()
    oht.fit(dataset)
    trans = oht.transform(dataset, sparse=True)
    assert isinstance(trans, csr_matrix)
    np.testing.assert_array_equal(expect, trans.todense())
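# NOTE: the test functions above reference module-level fixtures (dataset, expect,
# data_sorted) that are not shown in this excerpt. A hedged sketch of what they
# could look like, modeled on the example in the mlxtend documentation; the exact
# values used by the original test module are an assumption:
dataset = [['Apple', 'Beer', 'Rice', 'Chicken'],
           ['Apple', 'Beer', 'Rice'],
           ['Apple', 'Beer'],
           ['Apple', 'Bananas'],
           ['Milk', 'Beer', 'Rice', 'Chicken'],
           ['Milk', 'Beer', 'Rice'],
           ['Milk', 'Beer'],
           ['Apple', 'Bananas']]
# `expect` would be the corresponding boolean one-hot array whose columns follow the
# sorted order asserted in test_fit (['Apple', 'Bananas', 'Beer', 'Chicken', 'Milk', 'Rice']),
# and `data_sorted` the same transactions with items listed in that column order.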
plt.title("Support level of 9%") data = pd.read_csv("BreadBasket_DMS.csv") data = data.set_index(['Item']) filtered = data.drop(['NONE']) data = data.reset_index() filtered = filtered.reset_index() transaction_list = [] # For loop to create a list of the unique transactions throughout the dataset: for i in filtered['Transaction'].unique(): tlist = list(set(filtered[filtered['Transaction'] == i]['Item'])) if len(tlist) > 0: transaction_list.append(tlist) te = TransactionEncoder() te_ary = te.fit(transaction_list).transform(transaction_list) df2 = pd.DataFrame(te_ary, columns=te.columns_) frequent_itemsets = apriori(df2, min_support=0.09, use_colnames=True) #take minimum threshold rules = association_rules(frequent_itemsets, metric='confidence', min_threshold=0.0001) rules.sort_values('confidence', ascending=False) #now categorise every rule with different range of confidence. rules['support'] = rules['support'] * 100 rules['confidence'] = rules['confidence'] * 100 rules2 = rules[['antecedents', 'consequents', 'support', 'confidence']]
for num in l:
    if num not in final_list:
        final_list.append(num)
records_new.append(final_list)

from mlxtend.frequent_patterns import apriori, association_rules
from mlxtend.preprocessing import TransactionEncoder

te = TransactionEncoder()
te_data = te.fit(records_new).transform(records_new)
data_x = pd.DataFrame(te_data, columns=te.columns_)
print(data_x.head())

frequent_items = apriori(data_x, use_colnames=True, min_support=0.0045)
rules = association_rules(frequent_items, metric="lift", min_threshold=1)
rules["antecedent_len"] = rules["antecedents"].apply(lambda x: len(x))
rules["consequents_len"] = rules["consequents"].apply(lambda x: len(x))
Dataframe = pd.read_csv(
    "D:\\Machine Learning_Algoritms\\Apriori\\GroceryStoreDataSet.csv",
    encoding='latin1', names=['products'], header=None)
num_records = len(Dataframe)
print(num_records)

transactions = []
for i in range(0, num_records):
    transactions.append([str(Dataframe.values[i, j]) for j in range(0, 3)])

Dataframe = list(Dataframe["products"].apply(lambda x: x.split(',')))

from mlxtend.preprocessing import TransactionEncoder
te = TransactionEncoder()
te_data = te.fit_transform(Dataframe)
Dataframe = pd.DataFrame(te_data, columns=te.columns_)

count = Dataframe.loc[:, :].sum()
reverse_count = count.sort_values(0, ascending=False).head(11)
reverse_count = reverse_count.to_frame()
reverse_count = reverse_count.reset_index()
# reverse_count = reverse_count.rename(columns={"index": "items", 0: "count"})

plt.style.available
plt.rcParams['figure.figsize'] = (10, 6)
plt.style.use('dark_background')
ax = reverse_count.plot.barh()
def transaction_encoding(dataset):
    oht = TransactionEncoder()
    oht_ary = oht.fit(dataset).transform(dataset)
    new_df = pd.DataFrame(oht_ary, columns=oht.columns_)
    return new_df
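# NOTE: hypothetical usage example for the helper above; the sample transactions
# are made up for illustration and are not part of the original code:
sample = [['bread', 'milk'], ['bread', 'butter'], ['milk', 'butter', 'bread']]
encoded = transaction_encoding(sample)
print(encoded)  # boolean one-hot DataFrame with one column per distinct item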
def setUp(self):
    database = [['a'], ['b'], ['c', 'd'], ['e']]
    te = TransactionEncoder()
    te_ary = te.fit(database).transform(database)
    self.df = pd.DataFrame(te_ary, columns=te.columns_)
# Turn the products into a separate list per basket
transactions = list(items["mehsul_ad"].transform(lambda x: x.split(";")))

# In[12]:
transactions[0]

# In[13]:
# Pivot the products into basket form: for each row (basket), flag which products are present
from mlxtend.preprocessing import TransactionEncoder
tr_enc = TransactionEncoder()
basket = pd.DataFrame(tr_enc.fit_transform(transactions), columns=tr_enc.columns_)

# In[14]:
basket

# In[15]:
# Functions needed for the basket analysis
from mlxtend.frequent_patterns import apriori, association_rules

# In[16]:
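# NOTE: hypothetical continuation of the notebook above; the imports in In[15]
# suggest the next cells run apriori/association_rules on `basket`. A minimal
# sketch; the 0.05 support and 1.0 lift thresholds are assumptions:
freq_items = apriori(basket, min_support=0.05, use_colnames=True)
basket_rules = association_rules(freq_items, metric="lift", min_threshold=1.0)
basket_rules.sort_values("lift", ascending=False).head()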
# In[6]:
player_combo = []  # list of lists: match players and result
for i in range(0, 105):
    rowItem = []
    for j in range(0, 13):
        rowItem.append(str(match_data.values[i, j]))
    player_combo.append(rowItem)

# In[7]:
# player_combo

# In[8]:
te = TransactionEncoder()
te_ary = te.fit(player_combo).transform(player_combo)
match_df_freq = pd.DataFrame(te_ary, columns=te.columns_)

# In[9]:
match_df_freq

# In[10]:
match_sup = apriori(match_df_freq, min_support=0.1, use_colnames=True)
match_sup

# In[11]:
rules = association_rules(match_sup, metric="lift", min_threshold=1)
all_data = pd.read_csv('dataset_group.csv', header=None)
# The file has no header row, so the header parameter is None.
# The customer id is in the column named 1.
# The purchased product name is stored in the column named 2.
unique_id = list(set(all_data[1]))
# print(len(unique_id))  # print the number of ids
items = list(set(all_data[2]))
# print(len(items))  # print the number of products

dataset = [[elem for elem in all_data[all_data[1] == id][2] if elem in items]
           for id in unique_id]

te = TransactionEncoder()
te_ary = te.fit(dataset).transform(dataset)
df = pd.DataFrame(te_ary, columns=te.columns_)
# print(df)

results = apriori(df, min_support=0.3, use_colnames=True)
results['length'] = results['itemsets'].apply(lambda x: len(x))  # add the itemset size
# print(results)

results = apriori(df, min_support=0.3, use_colnames=True, max_len=1)
# print(results)

results = apriori(df, min_support=0.3, use_colnames=True)
results['length'] = results['itemsets'].apply(lambda x: len(x))
results = results[results['length'] == 2]
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

dataset = pd.read_csv('Groceries.csv')
listset = dataset.groupby(['Customer'])['Item'].apply(list).values.tolist()
print(f"2a. the number of customers is {len(listset)}")

from mlxtend.preprocessing import TransactionEncoder
te = TransactionEncoder()
te_ary = te.fit(listset).transform(listset)
ItemIndicator = pd.DataFrame(te_ary, columns=te.columns_)
print(f"2b. the number of unique items in the market basket across all customers is {ItemIndicator.shape[1]}")

Items_count_list = []
for i in listset:
    Item_count = len(i)
    Items_count_list.append(Item_count)

Customers_index_list = []
for i in range(len(listset)):
    Customers_index_list.append(i + 1)

# Freedman-Diaconis-style bin width: h = 2 * IQR * n^(-1/3)
P3, median, P1 = np.percentile(Items_count_list, [75, 50, 25])
iqr = P3 - P1
h = 2 * iqr * len(Items_count_list) ** (-1 / 3.0)
max_value = max(Items_count_list)
    # (tail of the list-building helper defined above this excerpt)
    removing_typo(content)
    analyzing_list = content
    return analyzing_list

lst2 = []
for content in contents_list:
    lst = listmaker(content)
    if lst != []:
        lst2.append(lst)

# Association Analysis
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
te = TransactionEncoder()
te_ary = te.fit(lst2).transform(lst2)
df = pd.DataFrame(te_ary, columns=te.columns_)

from mlxtend.frequent_patterns import apriori
frequent_itemsets = apriori(df, min_support=0.05, use_colnames=True)

from mlxtend.frequent_patterns import association_rules
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1, support_only=False)
rules['length'] = rules['antecedents'].apply(lambda x: len(x))
rules['length2'] = rules['consequents'].apply(lambda x: len(x))
rules = rules[(rules['length'] == 1) & (rules['length2'] == 1)]
import numpy as numpy
import pandas
import matplotlib.pyplot as pyplot
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
# Assumed imports for the R interop used below (rpy2):
from rpy2.robjects import r, pandas2ri

file_data_set = "/Users/mendesbarreto/Git/machine-learing-course/introdution-to-python/data/titanic.raw.rdata"
reference_to_data_set_from_r_data = r['load'](file_data_set)
dataset = r['titanic.raw']
panda_data_frame = pandas2ri.ri2py_dataframe(r['titanic.raw'])

records = []
for colum in range(0, 2201):
    records.append([str(panda_data_frame.values[colum, row]) for row in range(0, 4)])

transaction_encoder = TransactionEncoder()
transaction_encoder_array = transaction_encoder.fit(records).transform(records)
data_frame = pandas.DataFrame(transaction_encoder_array, columns=transaction_encoder.columns_)

frequent_itemsets = apriori(data_frame, min_support=0.3, use_colnames=True)
# Store the rules under a new name instead of rebinding (and shadowing) the
# imported association_rules function, as the original code did.
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.7)
rules.plot()
pyplot.show()

# Test libs
# file_data_set = "/Users/mendesbarreto/Git/machine-learing-course/introdution-to-python/data/titanic.raw.rdata"
# reference_to_data_set_from_r_data = r['load'](file_data_set)
# dataset = r['titanic.raw']
# panda_data_frame = pandas2ri.ri2py_dataframe(r['titanic.raw'])
#-----------------------------
# libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
import time
import logging

pd.set_option('display.max_columns', None)

#-----
transactions = [['I1', 'I2', 'I5'], ['I2', 'I4'], ['I2', 'I3'],
                ['I1', 'I2', 'I4'], ['I1', 'I3'],
                ['I2', 'I3'], ['I1', 'I3'],
                ['I1', 'I2', 'I3', 'I5'], ['I1', 'I2', 'I3']]
transactions

#----
te = TransactionEncoder()
te_ary = te.fit(transactions).transform(transactions)
te_ary
te.columns_
df = pd.DataFrame(te_ary, columns=te.columns_)
df  # matrix of transactions: True/False indicates each item's presence in each transaction ID
df.shape

# get back the original transactions
orgtrans1 = te_ary[:]
te.inverse_transform(orgtrans1)

#%%%
# frequent itemsets - the most important step
support_threshold = 0.01
frequent_itemsets = apriori(df, min_support=support_threshold, use_colnames=True)
frequent_itemsets
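# NOTE: hypothetical next step, not in the original snippet. association_rules is
# imported above but never called; a minimal sketch of rule generation from the
# itemsets just computed (the 0.5 confidence threshold is an assumption):
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.5)
rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']]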
df_asi_ent = df_asi_ent.drop(columns=['opportunity_id', 'created_at', 'name_region',
                                      'opp_background_req', 'opp_language_req', 'opp_skill_req',
                                      'opp_background_pref', 'opp_language_pref', 'opp_skill_pref',
                                      'programme_id'])
# Note: programme_id was already dropped above; this second drop is redundant
# and would raise a KeyError unless errors='ignore' is passed.
df_asi_ent = df_asi_ent.drop(columns=['programme_id'])

from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

data = df_asi_ent
records = []
for i in range(data.shape[0]):
    records.append([str(data.values[i, j]) for j in range(data.shape[1])])

te = TransactionEncoder()
te_ary = te.fit(records).transform(records)
df = pd.DataFrame(te_ary, columns=te.columns_)

frequent_itemsets = apriori(df, min_support=0.009, use_colnames=True)
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1.2)
result = rules[(rules['consequents'] == {'rejected'}) & (rules['confidence'] > 0.5)]
import youtube_process
from mlxtend.preprocessing import TransactionEncoder

file_US = "USvideos.csv"
US_data = pd.read_csv(file_US, keep_default_na=False, low_memory=False)
US_data

df = US_data[['category_id', 'views']]
df

with open("US_category_id.json", 'r') as f:
    content = json.load(f)
category_map = {}
for i in content['items']:
    category_map[int(i['id'])] = i['snippet']['title']
category_map

t = df['category_id'].map(category_map)
df = pd.concat([df, t], axis=1)
df.columns = ['category_id', 'views', 'category']

grade = []
for i in df['views'].values:
    # Map the view count to a letter grade; the dict is keyed by booleans, so
    # looking up [True] selects the bucket whose condition holds.
    views_map = lambda x: {x >= 4194399: 'A', 1823157 <= x < 4194399: 'B', 681861 <= x < 1823157: 'C',
                           242329 <= x < 681861: 'D', 549 <= x < 242329: 'E'}
    grade.append(views_map(i)[True])
df['views_grade'] = grade

df = df.drop(['category_id', 'views'], axis=1)
df

def deal(data):
    return data.dropna().tolist()

df_arr = df.apply(deal, axis=1).tolist()  # convert to a list of lists

TE = TransactionEncoder()  # define the encoder
df_tf = TE.fit_transform(df_arr)
df = pd.DataFrame(df_tf, columns=TE.columns_)
df
'''
Created on 10.10.2019

@author: Tim
'''
from mlxtend.frequent_patterns import apriori as mlxtend_apriori
from mlxtend.frequent_patterns import fpgrowth as mlxtend_fpgrowth
from mlxtend.frequent_patterns import association_rules as mlxtend_association_rules
from mlxtend.preprocessing import TransactionEncoder
import spn_apriori.itemsets_utils

transaction_encoder = TransactionEncoder()

import pandas as pd
import numpy as np
from data import real_data, synthetic_data
import spn_apriori.apriori_evaluation as evaluation


def test_spn_apriori_synthetic():
    df, _, _ = synthetic_data.generate_simple_transactions(seed=123)


def test_spn_apriori_real():
    df, value_dict, parametric_types = real_data.get_adult_41_items()
    eval = spn_apriori.itemsets_utils.cross_eval(df, 'UCI', [0.01, 0.4],
def calculate(request):
    if request.method == 'POST':
        """ Get the data posted from the HTML form """
        # upload and get the file
        uploaded_file = request.FILES['file_name']
        fs = FileSystemStorage()
        name = fs.save(uploaded_file.name, uploaded_file)
        url = fs.url(name)
        # get minsupp and minconf
        minsupp = request.POST['minsupp']
        minconf = request.POST['minconf']
        # get the algorithm
        selectedAlgorithm = request.POST['selectedAlgorithm']

        BASE_DIR = dirname(os.path.dirname(os.path.abspath(__file__)))
        url_split = url.split("/")
        for value in url_split:
            if value == '':
                url_split.remove(value)
        for value in url_split:
            BASE_DIR = BASE_DIR + "\\" + str(value)

        # read the dataset
        if name.find(".csv") != -1:
            store_data = pd.read_csv(BASE_DIR, header=None)
        if name.find(".xlsx") != -1:
            store_data = pd.read_excel(BASE_DIR, header=None)

        # reshape the data to fit the algorithm
        records = []
        for i in range(0, len(store_data)):
            records.append([
                str(store_data.values[i, j])
                for j in range(0, len(store_data.columns))
            ])
        records_withoutNan = []
        for i in range(0, len(records)):
            new = []
            for j in range(0, len(records[i])):
                if str(records[i][j]) != "nan":
                    new.append(str(records[i][j]))
            records_withoutNan.append(new)

        te = TransactionEncoder()
        te_ary = te.fit(records_withoutNan).transform(records_withoutNan)
        df = pd.DataFrame(te_ary, columns=te.columns_)

        """
        APRIORI: returns the list of rules (antecedents, consequents, support and
        confidence for every rule), sorted by lift.
        """
        def apriori_find_association_rules(dataset, minsup, minconf):
            patterns_ap = apriori(df, min_support=float(minsupp) / 100, use_colnames=True)
            rules_ap = association_rules(patterns_ap, metric="confidence",
                                         min_threshold=float(minconf) / 100)
            rules_ap_sort_descending = rules_ap.sort_values(by="lift", ascending=False)
            return rules_ap_sort_descending

        """
        FP-GROWTH: returns the patterns and rules (antecedents, consequents, support
        and confidence for every rule), sorted by lift.
        """
        def fpgrowth_find_association_rules(dataset, minsup, minconf):
            patterns_fp = fpgrowth(df, min_support=float(minsupp) / 100, use_colnames=True, verbose=0)
            rules_fp = association_rules(patterns_fp, metric="confidence",
                                         min_threshold=float(minconf) / 100)
            rules_fp_sort_descending = rules_fp.sort_values(by="lift", ascending=False)
            return rules_fp_sort_descending

        """ Choose between Apriori and FP-Growth """
        if selectedAlgorithm == 'Apriori':
            # association_results_APRIORI is the rules DataFrame returned by Apriori
            association_results_APRIORI = apriori_find_association_rules(df, minsupp, minconf)
            rules_ap_antecedents_list = list(association_results_APRIORI['antecedents'])
            rules_ap_consequents_list = list(association_results_APRIORI['consequents'])
            rules_ap_support_list = list(association_results_APRIORI['support'])
            rules_ap_confidence_list = list(association_results_APRIORI['confidence'])
            rules_ap_lift_list = list(association_results_APRIORI['lift'])
            rules_ap_final = []
            for i in range(0, len(rules_ap_antecedents_list)):
                onerules = {}
                onerules['antecedents'] = list(rules_ap_antecedents_list[i])
                onerules['consequents'] = list(rules_ap_consequents_list[i])
                onerules['support'] = round(rules_ap_support_list[i], 3)
                onerules['confidence'] = round(rules_ap_confidence_list[i], 3)
                onerules['lift'] = round(rules_ap_lift_list[i], 3)
                rules_ap_final.append(onerules)
            return render(
                request, 'website/show_rules.html', {
                    'selectedAlgorithm': selectedAlgorithm,
                    'lenrules': len(rules_ap_final),
                    'lendata': len(df),
                    'association_rules': rules_ap_final
                })
        elif selectedAlgorithm == 'FP-Growth':
            association_results_FPGROWTH = fpgrowth_find_association_rules(df, minsupp, minconf)
            rules_fp_antecedents_list = list(association_results_FPGROWTH['antecedents'])
            rules_fp_consequents_list = list(association_results_FPGROWTH['consequents'])
            rules_fp_support_list = list(association_results_FPGROWTH['support'])
            rules_fp_confidence_list = list(association_results_FPGROWTH['confidence'])
            rules_fp_lift_list = list(association_results_FPGROWTH['lift'])
            rules_fp_final = []
            for i in range(0, len(rules_fp_antecedents_list)):
                onerules = {}
                onerules['antecedents'] = list(rules_fp_antecedents_list[i])
                onerules['consequents'] = list(rules_fp_consequents_list[i])
                onerules['support'] = round(rules_fp_support_list[i], 3)
                onerules['confidence'] = round(rules_fp_confidence_list[i], 3)
                onerules['lift'] = round(rules_fp_lift_list[i], 3)
                rules_fp_final.append(onerules)
            return render(
                request, 'website/show_rules.html', {
                    'selectedAlgorithm': selectedAlgorithm,
                    'lenrules': len(rules_fp_final),
                    'lendata': len(df),
                    'association_rules': rules_fp_final
                })
try:
    config.read('../config.ini')
except:
    print('Error, could not find config.ini')

verbose = config['DEFAULT']['Verbose'].lower() == 'true'

# load the playlists as transactions
if verbose:
    print('Loading transactions from disk.')
playlist_transactions, unique_songs = load_playlists(config['model']['PlaylistsDir'])

# load the transactions into an encoder and get a pandas dataframe
if verbose:
    print('Encoding transactions')
te = TransactionEncoder()
te_ary = te.fit(playlist_transactions).transform(playlist_transactions)
playlist_df = pd.DataFrame(te_ary, columns=te.columns_)

# get the support, confidence, and max k values for the itemsets
min_sup = float(config['model']['MinSup'])
max_k = int(config['model']['ItemsetSize'])

# run the FP-Growth algorithm on the transactions
if verbose:
    print('Running FP Growth')
frequent_itemsets = fpgrowth(playlist_df, min_support=min_sup, use_colnames=True, max_len=max_k)
frequent_itemsets['length'] = frequent_itemsets['itemsets'].apply(lambda x: len(x))
k_frequent_itemsets = frequent_itemsets[frequent_itemsets['length'] == max_k]
def calculate_database(request):
    if request.method == "POST":
        server = request.POST['ServerName']
        database = request.POST['DatabaseName']
        username = request.POST['Username']
        password = request.POST['Password']
        minsupp = request.POST['minsupp']
        minconf = request.POST['minconf']
        selectedAlgorithm = request.POST['selectedAlgorithm']

        conn = pyodbc.connect('Driver={ODBC Driver 13 for SQL Server};'
                              f'Server=' + server + ';'
                              f'Database=' + database + ';'
                              f'UID=' + username + ';'
                              f'PWD=' + password + ';'
                              'Mars_Connection=Yes;')
        cursor = conn.cursor()
        getSaleProduct = cursor.execute(
            'SELECT SalesOrderID,SalesOrderDetailID, [Production].[Product].[ProductID], Name FROM [Sales].[SalesOrderDetail] inner join [Production].[Product] on [Production].[Product].[ProductID] = [Sales].[SalesOrderDetail].[ProductID] order by [Sales].[SalesOrderDetail].SalesOrderID asc'
        )
        listSaleProduct = []
        for saleProduct in getSaleProduct:
            listSaleProduct.append(list(saleProduct))

        # Group the product names by SalesOrderID into one basket per order
        list_saleProduct_parent = []
        list_saleProduct_child = []
        temp = listSaleProduct[0][0]
        i = 0
        while i < len(listSaleProduct):
            if listSaleProduct[i][0] == temp:
                list_saleProduct_child.append(str(listSaleProduct[i][3]))
                i += 1
            else:
                list_saleProduct_parent.append(list_saleProduct_child)
                list_saleProduct_child = []
                temp = listSaleProduct[i][0]
                i = i  # i is intentionally not advanced, so this row is reprocessed with the new temp

        te = TransactionEncoder()
        te_ary = te.fit(list_saleProduct_parent).transform(list_saleProduct_parent)
        df = pd.DataFrame(te_ary, columns=te.columns_)

        def apriori_find_association_rules(dataset, minsup, minconf):
            patterns_ap = apriori(df, min_support=float(minsupp) / 100, use_colnames=True)
            rules_ap = association_rules(patterns_ap, metric="confidence",
                                         min_threshold=float(minconf) / 100)
            rules_ap_sort_descending = rules_ap.sort_values(by="lift", ascending=False)
            return rules_ap_sort_descending

        def fpgrowth_find_association_rules(dataset, minsup, minconf):
            patterns_fp = fpgrowth(df, min_support=float(minsupp) / 100, use_colnames=True, verbose=0)
            rules_fp = association_rules(patterns_fp, metric="confidence",
                                         min_threshold=float(minconf) / 100)
            rules_fp_sort_descending = rules_fp.sort_values(by="lift", ascending=False)
            return rules_fp_sort_descending

        """ Choose between Apriori and FP-Growth """
        if selectedAlgorithm == 'Apriori':
            # association_results_APRIORI is the rules DataFrame returned by Apriori
            association_results_APRIORI = apriori_find_association_rules(df, minsupp, minconf)
            # print(association_results_APRIORI)
            rules_ap_antecedents_list = list(association_results_APRIORI['antecedents'])
            rules_ap_consequents_list = list(association_results_APRIORI['consequents'])
            rules_ap_support_list = list(association_results_APRIORI['support'])
            rules_ap_confidence_list = list(association_results_APRIORI['confidence'])
            rules_ap_lift_list = list(association_results_APRIORI['lift'])
            rules_ap_final = []
            for i in range(0, len(rules_ap_antecedents_list)):
                onerules = {}
                onerules['antecedents'] = list(rules_ap_antecedents_list[i])
                onerules['consequents'] = list(rules_ap_consequents_list[i])
                onerules['support'] = round(rules_ap_support_list[i], 3)
                onerules['confidence'] = round(rules_ap_confidence_list[i], 3)
                onerules['lift'] = round(rules_ap_lift_list[i], 3)
                rules_ap_final.append(onerules)
            return render(
                request, 'website/show_rules.html', {
                    'selectedAlgorithm': selectedAlgorithm,
                    'lenrules': len(rules_ap_final),
                    'lendata': len(df),
                    'association_rules': rules_ap_final
                })
        elif selectedAlgorithm == 'FP-Growth':
            association_results_FPGROWTH = fpgrowth_find_association_rules(df, minsupp, minconf)
            rules_fp_antecedents_list = list(association_results_FPGROWTH['antecedents'])
            rules_fp_consequents_list = list(association_results_FPGROWTH['consequents'])
            rules_fp_support_list = list(association_results_FPGROWTH['support'])
            rules_fp_confidence_list = list(association_results_FPGROWTH['confidence'])
            rules_fp_lift_list = list(association_results_FPGROWTH['lift'])
            rules_fp_final = []
            for i in range(0, len(rules_fp_antecedents_list)):
                onerules = {}
                onerules['antecedents'] = list(rules_fp_antecedents_list[i])
                onerules['consequents'] = list(rules_fp_consequents_list[i])
                onerules['support'] = round(rules_fp_support_list[i], 3)
                onerules['confidence'] = round(rules_fp_confidence_list[i], 3)
                onerules['lift'] = round(rules_fp_lift_list[i], 3)
                rules_fp_final.append(onerules)
            return render(
                request, 'website/show_rules.html', {
                    'selectedAlgorithm': selectedAlgorithm,
                    'lenrules': len(rules_fp_final),
                    'lendata': len(df),
                    'association_rules': rules_fp_final
                })
# Import Data
import pandas
import numpy as np

grocer_cust_item = pandas.read_csv(
    'C:\\Users\\soura\\OneDrive\\Desktop\\ML\\ML Spring20_Assignemnt\\HW2\\Groceries.csv',
    delimiter=',')

# Convert the Sale Receipt data to the Item List format
groc_ListItem = grocer_cust_item.groupby(['Customer'])['Item'].apply(list).values.tolist()

# Convert the Item List format to the Item Indicator format
from mlxtend.preprocessing import TransactionEncoder
te = TransactionEncoder()
te_ary = te.fit(groc_ListItem).transform(groc_ListItem)
groc_ItemIndicator = pandas.DataFrame(te_ary, columns=te.columns_)

cost_items = []
cost_item_count = []
for item in groc_ListItem:
    uniq_items = set(item)
    cost_item_count.append(len(uniq_items))
    cost_items.append(list(uniq_items))
uniq_item_df = pandas.DataFrame(cost_items)

import matplotlib.pyplot as plt
plt.hist(cost_item_count, edgecolor='k')
    return dfRules


if __name__ == '__main__':
    dataSet = [[1, 3, 4], [2, 3, 5], [1, 2, 3, 5], [2, 5]]
    dataSet = [['菜品2', '菜品4', '菜品3'], ['菜品1', '菜品5'], ['菜品1', '菜品4'],
               ['菜品2', '菜品1', '菜品4', '菜品5'], ['菜品2', '菜品1'], ['菜品1', '菜品4'],
               ['菜品2', '菜品1'], ['菜品2', '菜品1', '菜品4', '菜品3'],
               ['菜品2', '菜品1', '菜品4'], ['菜品2', '菜品4', '菜品3']]
    minSup = 0.3
    minConf = 0.5
    Rules = Apriori(dataSet, minSup, minConf)
    Rulesdf = Rules2df(Rules)

    # Use mlxtend's apriori implementation for comparison
    from mlxtend.preprocessing import TransactionEncoder
    from mlxtend.frequent_patterns import apriori
    from mlxtend.frequent_patterns import association_rules

    tranEncoder = TransactionEncoder()
    te_ary = tranEncoder.fit(dataSet).transform(dataSet)
    df = pd.DataFrame(te_ary, columns=tranEncoder.columns_)
    FreLk_all = apriori(df, min_support=minSup, use_colnames=True)
    Rules_mlx = association_rules(FreLk_all, min_threshold=minConf)
    if key in dict.keys():
        return True
    else:
        return False


start = time.time()
dataset = []  # 2-D array for storing the sequences
with open('out.txt', 'r') as fobj:  # import values from the txt file containing the dataset
    for line in fobj:
        numbers = [int(num) for num in line.split()]  # single row of the 2-D array
        dataset.append(numbers)

t = TransactionEncoder()
t_ary = t.fit(dataset).transform(dataset)  # convert to a table of True/False values
# Convert t_ary into a form suitable for the frequent-pattern mining call below
df = pd.DataFrame(t_ary, columns=t.columns_)

frequent_set = fpgrowth(df, min_support=0.015, use_colnames=True)
frequent_set['length'] = frequent_set['itemsets'].apply(lambda x: len(x))
print(frequent_set)

end = time.time()
print(end - start)

# For generating the length vs. count plot
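# NOTE: hypothetical sketch of the "length vs. count" plot mentioned above; it is
# not part of the original snippet (assumes matplotlib is available):
import matplotlib.pyplot as plt
length_counts = frequent_set['length'].value_counts().sort_index()
plt.bar(length_counts.index, length_counts.values)
plt.xlabel('Itemset length')
plt.ylabel('Count of frequent itemsets')
plt.show()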
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

# In[2]:
fin = open("T10I4D100K.txt", "r")
dataset = [[int(n) for n in line.split()] for line in fin]

# In[3]:
te = TransactionEncoder()
te_ary = te.fit(dataset).transform(dataset, sparse=True)
# Note: pd.SparseDataFrame was removed in pandas 1.0;
# pd.DataFrame.sparse.from_spmatrix is the modern equivalent.
sparse_df = pd.SparseDataFrame(te_ary, columns=te.columns_, default_fill_value=False)
sparse_df

# In[4]:
frequent_itemsets5 = apriori(sparse_df, min_support=0.5, use_colnames=True)
frequent_itemsets5

# In[5]:
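# NOTE: hypothetical next cell; association_rules is imported in the first cell but
# never used in this excerpt. A minimal sketch of rule generation from the itemsets
# above (the 0.6 confidence threshold is an assumption):
rules5 = association_rules(frequent_itemsets5, metric="confidence", min_threshold=0.6)
rules5.head()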