import pandas as pd
from mlxtend.preprocessing import TransactionEncoder


def onehot_encoding_transaction_data(transactions):
    # One-hot encode a list of transactions into a boolean DataFrame.
    encoder = TransactionEncoder().fit(transactions)
    onehot = encoder.transform(transactions)
    onehot = pd.DataFrame(onehot, columns=encoder.columns_)
    print(onehot)
    return onehot
def onehot_online_retail_data(transactions):
    # One-hot encode, then downsample to 10% of rows to keep memory manageable.
    encoder = TransactionEncoder().fit(transactions)
    onehot = encoder.transform(transactions)
    onehot = pd.DataFrame(onehot, columns=encoder.columns_)
    onehot = onehot.sample(frac=0.1)
    print("onehot.shape", onehot.shape)
    return onehot
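A minimal usage sketch for the helpers above; the three-transaction list is made up for illustration and is not data from the original source:

sample_transactions = [["Bread", "Milk"], ["Bread", "Diapers"], ["Milk", "Beer"]]  # hypothetical data
onehot = onehot_encoding_transaction_data(sample_transactions)
# Column means give each item's support (the fraction of transactions containing it).
print(onehot.mean())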
dataset=[["Bread","Milk","Beer"], ["Bread","Diapers","Egg"], ["Milk","Diapers","Beer","Cola"], ["Bread","Milk","Diapers","Beer"], ["Bread","Milk","Cola"]] import pandas as pd from mlxtend.preprocessing import TransactionEncoder from mlxtend.frequent_patterns import apriori te=TransactionEncoder() te=te.fit(dataset) te_ary=te.transform(dataset) df=pd.DataFrame(te_ary,columns=te.columns_) frequent_itemsets=apriori(df,min_support=0.6,use_colnames=True) #to apply association rules from mlxtend.frequent_patterns import association_rules rules=association_rules(frequent_itemsets,metric='support',min_threshold=0.6) cartItem=[] init=input("1.Beer\t2.Bread\t3.Cola\t4.Diapers 5.Egg 6.Milk.\n What do you wish to buy? ") init=init.capitalize() cartItem=cartItem+[init] permission='y' checklist=['Beer','Bread','Cola','Diapers','Egg','Milk']
import pandas as pd
import matplotlib.pyplot as plt
from mlxtend.preprocessing import TransactionEncoder

# Explode the "speaks" column into one row per language.
lng = pd.DataFrame(df.explode("speaks").speaks)
lng.reset_index(inplace=True)
# Drop languages spoken poorly, then keep only the language name.
lng.drop(lng[lng.speaks.str.contains("poorly")].index, inplace=True)
lng.speaks = lng.speaks.str.split().str[0]
# Users left with no languages default to English.
missing = sorted(set(range(df.shape[0])) - set(lng["index"]))
for i in missing:
    lng = pd.concat([lng, pd.DataFrame({"index": [i], "speaks": ["english"]})])
lng = lng.groupby("index").agg({"speaks": lambda x: x.tolist()})
df.speaks = lng.speaks

# One-hot DataFrame of languages.
encoder = TransactionEncoder().fit(df.speaks)
onehot = encoder.transform(df.speaks)
onehot = pd.DataFrame(onehot, columns=encoder.columns_)
onehot.mean().sample(10)  # peek at the support of 10 random languages

# 10 most popular languages.
top10 = onehot.mean().sort_values(ascending=False).iloc[:10]
plt.plot(top10, marker='o', color="#5e3a98")
plt.xticks(rotation=45)
plt.ylabel("share of users")
plt.title("top 10 most popular languages")
plt.show()

# 10 most popular languages besides English and "other".
top12 = onehot.mean().sort_values(ascending=False).iloc[1:12]
top12.drop("other", inplace=True)
plt.plot(top12, marker='o', color="#5e3a98")
def test_transform():
    oht = TransactionEncoder()
    oht.fit(dataset)
    trans = oht.transform(dataset)
    np.testing.assert_array_equal(expect, trans)
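Both this test and the sparse test below reference module-level dataset and expect fixtures that this excerpt does not show. A plausible sketch of their shape; the items are invented, and since columns_ is sorted, the expected matrix follows alphabetical item order:

import numpy as np
from scipy.sparse import csr_matrix
from mlxtend.preprocessing import TransactionEncoder

# Hypothetical fixtures; column order would be ['Apple', 'Bananas', 'Beer'].
dataset = [['Apple', 'Beer'], ['Bananas'], ['Apple', 'Bananas', 'Beer']]
expect = np.array([[True, False, True],
                   [False, True, False],
                   [True, True, True]])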
def test_transform_sparse():
    oht = TransactionEncoder()
    oht.fit(dataset)
    trans = oht.transform(dataset, sparse=True)
    assert isinstance(trans, csr_matrix)
    np.testing.assert_array_equal(expect, trans.todense())
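Beyond the test, the sparse output can be wrapped in a pandas DataFrame without densifying it; a minimal sketch, assuming pandas >= 0.25 for the sparse accessor:

import pandas as pd
from mlxtend.preprocessing import TransactionEncoder

transactions = [['Apple', 'Beer'], ['Bananas'], ['Apple', 'Bananas', 'Beer']]  # toy data
encoder = TransactionEncoder().fit(transactions)
sparse_onehot = encoder.transform(transactions, sparse=True)
# Build a sparse-backed DataFrame directly from the scipy CSR matrix.
df = pd.DataFrame.sparse.from_spmatrix(sparse_onehot, columns=encoder.columns_)
print(df.sparse.density)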
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori

transacoes = []
for i in range(0, qtdlinhas):
    linhaTransacao = []
    for j in range(0, qtdcols):
        linhaTransacao.append(str(dataset.values[i, j]))
    transacoes.append(linhaTransacao)
print(transacoes)

te = TransactionEncoder()
# Load the transactions into memory and determine how many columns will be generated during processing.
te.fit(transacoes)
# TransactionEncoder converts the transactions into a binary matrix in which each row represents one transaction.
matriz_transacoes = te.transform(transacoes)
# Build an auxiliary DataFrame from the binary matrix (the te.transform step) and the columns obtained (the te.fit step).
dfAuxiliar = pd.DataFrame(matriz_transacoes, columns=te.columns_)
# Extract the most frequent itemsets with a minimum support of 0.005. use_colnames means the column names
# of the dfAuxiliar DataFrame are used to build the association rules.
itemsets_freq = apriori(dfAuxiliar, min_support=0.005, use_colnames=True)
# Some metrics:
# - support(A->C) = support(A+C) [aka 'support'], range: [0, 1]
# - confidence(A->C) = support(A+C) / support(A), range: [0, 1]
# - lift(A->C) = confidence(A->C) / support(C), range: [0, inf]
# - leverage(A->C) = support(A->C) - support(A)*support(C), range: [-1, 1]
# - conviction(A->C) = [1 - support(C)] / [1 - confidence(A->C)], range: [0, inf]
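A self-contained sketch of where these metrics surface in mlxtend's output; the toy transactions are invented purely to exercise the metric columns:

import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules

toy = [['bread', 'milk'], ['bread', 'diapers'], ['milk', 'diapers', 'bread'], ['milk']]
te = TransactionEncoder().fit(toy)
df = pd.DataFrame(te.transform(toy), columns=te.columns_)
itemsets = apriori(df, min_support=0.25, use_colnames=True)
rules = association_rules(itemsets, metric='confidence', min_threshold=0.5)
# Every metric listed above appears as a column in the rules DataFrame.
print(rules[['antecedents', 'consequents', 'support', 'confidence',
             'lift', 'leverage', 'conviction']])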
def recommend():
    # Import modules.
    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt
    import seaborn as sns
    from mlxtend.preprocessing import TransactionEncoder

    # Load orders, products, and translations datasets.
    orders = pd.read_csv(r'./input_original_datasets/olist_order_items_dataset.csv')
    products = pd.read_csv(r'./input_original_datasets/olist_products_dataset.csv')
    translations = pd.read_csv(r'./input_original_datasets/product_category_name_translation.csv')

    # Inspect header and info for each dataset.
    orders.head()
    orders.info()
    products.head()
    products.info()
    translations.head()
    translations.info()

    # Translate product category names to English.
    products = products.merge(translations, on='product_category_name', how="left")
    products['product_category_name_english']

    # Convert product IDs to product category names: attach the English
    # category name to each order line.
    orders = orders.merge(products[['product_id', 'product_category_name_english']],
                          on='product_id', how='left')
    orders.head()

    # Drop products without a defined category.
    orders.dropna(inplace=True, subset=['product_category_name_english'])

    # Count unique items and unique categories.
    len(orders['product_id'].unique())
    len(orders['product_category_name_english'].unique())

    # Construct transactions from order and product data:
    # identify the categories associated with two example orders.
    example1 = orders[orders['order_id'] == 'fe64170e936bc5f6a6a41def260984b9']['product_category_name_english']
    example1
    example2 = orders[orders['order_id'] == 'fffb9224b6fc7c43ebb0904318b10b5f']['product_category_name_english']
    example2

    # Map orders to transactions: recover each order's itemset.
    transactions = orders.groupby("order_id").product_category_name_english.unique()
    transactions.head()

    # Plot the 50 largest categories of transactions.
    transactions.value_counts()[:50].plot(kind='bar', figsize=(15, 5))

    # Convert the pandas Series to a list of lists.
    transactions = transactions.tolist()
    len(transactions)

    # Count the number of unique item categories in each transaction.
    counts = [len(transaction) for transaction in transactions]
    np.median(counts)  # median items per transaction
    np.max(counts)     # maximum items per transaction

    # Association rules and metrics: one-hot encode the transactions.
    encoder = TransactionEncoder()
    encoder.fit(transactions)
    onehot = encoder.transform(transactions)
    onehot = pd.DataFrame(onehot, columns=encoder.columns_)
    onehot.head()

    # Compute the support metric over all rows for each column.
    onehot.mean(axis=0)

    # Compute the item-count distribution over transactions.
    onehot.sum(axis=1).value_counts()

    # Create a column for an itemset with multiple items:
    # add sports_leisure AND health_beauty to the DataFrame.
    onehot['sports_leisure_health_beauty'] = onehot['sports_leisure'] & onehot['health_beauty']
    # Print its support value.
    onehot['sports_leisure_health_beauty'].mean(axis=0)

    # Aggregate the dataset further by combining product sub-categories.
    # We can use the inclusive OR operation to combine multiple categories:
    #   True | True = True
    #   True | False = True
    #   False | True = True
    #   False | False = False
    # Merge books_imported and books_technical.
    onehot['books'] = onehot['books_imported'] | onehot['books_technical']
    # Print support values for books, books_imported, and books_technical.
    onehot[['books', 'books_imported', 'books_technical']].mean(axis=0)

    # Compute the confidence metric.
    # Joint support for sports_leisure and health_beauty.
    joint_support = (onehot['sports_leisure'] & onehot['health_beauty']).mean()
    # Confidence for sports_leisure -> health_beauty.
    joint_support / onehot['sports_leisure'].mean()
    # Confidence for health_beauty -> sports_leisure (divide by the antecedent's support).
    joint_support / onehot['health_beauty'].mean()

    # The apriori algorithm and pruning.
    from mlxtend.frequent_patterns import apriori

    # Apply apriori with a min support threshold of 0.01.
    frequent_itemsets = apriori(onehot, min_support=0.01)
    frequent_itemsets
    # Lower the threshold to 0.001.
    frequent_itemsets = apriori(onehot, min_support=0.001, use_colnames=True)
    frequent_itemsets
    # Lower the threshold to 0.00005.
    frequent_itemsets = apriori(onehot, min_support=0.00005, use_colnames=True)
    frequent_itemsets
    # Apply apriori with a two-item limit.
    frequent_itemsets = apriori(onehot, min_support=0.00005, max_len=2, use_colnames=True)

    # Computing association rules from apriori output.
    from mlxtend.frequent_patterns import association_rules

    # Recover association rules using support with a minimum threshold of 0.0001.
    rules = association_rules(frequent_itemsets, metric='support', min_threshold=0.0001)
    rules.head()
    rules.to_csv('result_datasets/result_apriori.csv')

    # Pruning association rules: use a confidence threshold of 0.01.
    rules = association_rules(frequent_itemsets, metric='confidence', min_threshold=0.01)
    rules
    rules.to_csv('result_datasets/result_Pruning.csv')
    # Select rules with a consequent support above 0.095.
    rules = rules[rules['consequent support'] > 0.095]
    rules

    # The leverage metric: select rules with leverage higher than 0.0.
    rules = rules[rules['leverage'] > 0.0]
    rules

    # Visualizing patterns in metrics: recover association rules with a
    # minimum support greater than 0.000001.
    rules = association_rules(frequent_itemsets, metric='support', min_threshold=0.000001)
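The block above recovers rules for visualization but never draws them; a minimal plotting sketch, continuing from the rules DataFrame computed on the last line and assuming matplotlib:

import matplotlib.pyplot as plt

# Scatter each rule's support against its confidence to eyeball useful thresholds.
plt.scatter(rules['support'], rules['confidence'], alpha=0.5)
plt.xlabel('support')
plt.ylabel('confidence')
plt.title('association rules: support vs. confidence')
plt.show()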
import pandas as pd
from itertools import permutations
from mlxtend.preprocessing import TransactionEncoder

df = pd.DataFrame(data)
# Group purchases into one transaction per panelist per date.
grouped = df.groupby(["Panel ID", "Date"])
list_of_unique = list(grouped["Category"].unique())
flattened = [i for t in list_of_unique for i in t]
groceries = list(set(flattened))

# Enumerate all candidate one-to-one rules.
rules = list(permutations(groceries, 2))
rules_df = pd.DataFrame(rules, columns=['antecedents', 'consequents'])
print(rules)

encoder = TransactionEncoder().fit(list_of_unique)
onehot = encoder.transform(list_of_unique)
onehot = pd.DataFrame(onehot, columns=encoder.columns_)
item_support = onehot.mean()
print(onehot.head())
print(item_support)


def support(x):
    # Compute support for antecedent AND consequent.
    return x.mean()
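A sketch of how the support helper might be applied across the enumerated rule pairs, continuing from rules_df and onehot above; the joint_support column name is invented:

# Joint support of antecedent AND consequent for each candidate rule.
rules_df['joint_support'] = [
    support(onehot[ant] & onehot[con])
    for ant, con in zip(rules_df['antecedents'], rules_df['consequents'])
]
print(rules_df.sort_values('joint_support', ascending=False).head())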
# Worked example from the main text.

import numpy as np
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori

# Create the data.
tr_data = [['milk', 'bread'],
           ['butter'],
           ['beer', 'diapers'],
           ['milk', 'bread', 'butter'],
           ['bread']]

# Preprocess the data.
tr_encoder = TransactionEncoder()
tr_encoder.fit(tr_data)
tr_encoder.columns_  # ['beer', 'bread', 'butter', 'diapers', 'milk']
tr_encoder_ary = tr_encoder.transform(tr_data)
tr_encoder_ary = np.where(tr_encoder_ary == True, 1, 0)
df = pd.DataFrame(tr_encoder_ary, columns=tr_encoder.columns_)
print(df)
#    beer  bread  butter  diapers  milk
# 0     0      1       0        0     1
# 1     0      0       1        0     0
# 2     1      0       0        1     0
# 3     0      1       1        0     1
# 4     0      1       0        0     0

# Fit the association model.
freq_items = apriori(df, min_support=0.4, use_colnames=True)
freq_items['length'] = freq_items['itemsets'].apply(lambda x: len(x))
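A short continuation of the block above showing what the length column is for: filtering frequent itemsets by size, a common next step:

# Keep only the 2-item itemsets among the frequent itemsets.
pairs = freq_items[freq_items['length'] == 2]
print(pairs)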
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules

# Alternative input path, commented out in the original: read transactions from a file.
'''
data = data.split(',')
for values in data:
    dataset.append(values.strip('\n'))
'''
dataset = [["Bread", "Milk", "Beer"],
           ["Bread", "Diapers", "Eggs"],
           ["Milk", "Diapers", "Beer", "Cola"],
           ["Bread", "Milk", "Diapers", "Beer"],
           ["Bread", "Milk", "Cola"]]

#########################################################
te = TransactionEncoder()
te = te.fit(dataset)            # Reads unique values
te_ary = te.transform(dataset)  # Converts the transactions into a binary matrix
df = pd.DataFrame(te_ary, columns=te.columns_)

#########################################################
frequent_items = apriori(df, min_support=0.6, use_colnames=True)
frequent_items
rules = association_rules(frequent_items, metric='confidence', min_threshold=0.5)

#########################################################
# Shopping part. `available` and `cart` are defined elsewhere in the script.
def item():
    print("Items available are-", available)
    print("Your cart contains", cart)
    buy = input("What do you want to buy? \nItem:")
    buy = buy.title()