# Mine association rules from the market-basket transactions in mart.csv.
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import association_rules
from mlxtend.frequent_patterns import apriori

# Each CSV row is one transaction: a comma-separated list of purchased items.
with open('./mart.csv', 'r') as reader:
    dataset = [line.strip().split(',') for line in reader]

# One-hot encode the transactions (bag-of-words style boolean matrix).
encoder = TransactionEncoder()
onehot = encoder.fit(dataset).transform(dataset)
df = pd.DataFrame(onehot, columns=encoder.columns_)

# Mine frequent itemsets, then derive confidence-based rules from them.
frequent_itemsets = apriori(df, min_support=0.01, use_colnames=True)
rules = association_rules(frequent_itemsets,
                          metric="confidence",
                          min_threshold=0.01)

# Show the discovered rules.
print(rules)
# Load the raw basket file: shape (7501, 20), no header row.
dataset = pd.read_csv('./Market_Basket_Optimisation.csv', header=None)
print(dataset)

# Build one item list per row, skipping the NaN padding cells.
transactions = []
for row in range(dataset.shape[0]):
    items = [str(dataset.values[row, col])
             for col in range(20)
             if str(dataset.values[row, col]) != 'nan']
    transactions.append(items)

# One-hot encode the transactions.
temp = TransactionEncoder()
temp_hot_encoded = temp.fit_transform(transactions)
df = pd.DataFrame(temp_hot_encoded, columns=temp.columns_)
print(df.head())
df.to_csv('df.csv')

# Mine frequent itemsets and sort them by support, highest first.
frequent_itemsets = apriori(df, min_support=0.05, use_colnames=True)
frequent_itemsets = frequent_itemsets.sort_values(by='support', ascending=False)
print('频繁项集:', frequent_itemsets)

# Derive association rules and rank them by lift.
rules = association_rules(frequent_itemsets, metric='lift', min_threshold=1)
rules = rules.sort_values(by='lift', ascending=False)
print('关联规则:', rules)
rules.to_csv('rules.csv')
def arms(orders, recipes, dept, products):
    """Association-rule mining over orders restricted to departments of interest.

    Filters products down to nine departments, restricts orders to those
    products, one-hot encodes the per-order product-name baskets, then
    prints/displays frequent itemsets plus confidence- and lift-filtered
    association rules.

    Args:
        orders: DataFrame with order_id, product_id, add_to_cart_order
            and reordered columns.
        recipes: unused in this function -- TODO confirm with callers.
        dept: DataFrame with 'department' and 'department_id' columns.
        products: DataFrame with product_id, product_name, department_id.

    NOTE(review): uses the IPython ``display`` helper, so this is
    notebook-converted code; the bare expression statements below are
    notebook echo leftovers and are no-ops inside a function.
    """
    print(
        'Association rule mining is used to find the most bought products and their relationship'
    )
    # Keep only the departments listed here.
    dept_of_interest = dept.query(
        'department == "produce" or department == "bakery" or department == "international" or department == "beverages" or department == "dry goods pasta" or department == "bulk" or department == "meat seafood" or department == "pantry" or department == "dairy eggs"'
    )  # fetching the departments of interest
    #products = pd.read_csv(xls, 'products')  # reading products file
    # val stores the department ids of the selected departments.
    val = dept_of_interest['department_id']
    dept_of_interest  # notebook leftover; no-op here
    # Products belonging to any of the selected departments.
    filtered_products = products.loc[products['department_id'].isin(val)]
    filtered_products = pd.Series(filtered_products['product_id'])
    filtered_products_list = filtered_products.tolist()  # dataframe -> list
    # Convert every product id to a string so it matches orders['product_id'].
    filtered_products_strlist = []
    for elem in filtered_products_list:
        filtered_products_strlist.append(str(elem))
    # Normalize the orders ids to strings as well.
    orders['product_id'] = orders['product_id'].astype(str)
    orders  # notebook leftover; no-op
    # Orders containing products from the filtered product list.
    filtered_orders = orders.loc[orders['product_id'].isin(
        filtered_products_strlist)]
    # NOTE(review): filtered_orders is a .loc slice, so these deletions may
    # trigger pandas' chained-assignment warning -- confirm intent.
    del filtered_orders['add_to_cart_order']  # drop add_to_cart_order column
    del filtered_orders['reordered']  # drop reordered column
    # Product rows used to map product_id -> product_name later.
    product_id_name_map = products.loc[products['product_id'].isin(
        filtered_products_strlist)]
    product_id_name_map  # notebook leftover; no-op
    # Convert the map's ids to strings for the merge below.
    product_id_name_map['product_id'] = product_id_name_map[
        'product_id'].apply(str)
    product_id_name_map['product_id']  # notebook leftover; no-op
    # NOTE(review): result is discarded -- this line has no effect.
    filtered_orders['product_id'].apply(str)
    # Merge the filtered orders table with the product_id -> name map,
    # silencing pandas' chained-assignment warning for the merge.
    with pd.option_context('mode.chained_assignment', None):
        final_table = pd.merge(filtered_orders,
                               product_id_name_map,
                               on='product_id',
                               how='left')
    # final_table: order id mapped to product id, product name, aisle id
    # and department id.
    final_table  # notebook leftover; no-op
    # One basket (list of product names) per order id.
    matrix = final_table.groupby(
        ['order_id'])['product_name'].apply(list).values.tolist()
    matrix  # notebook leftover; no-op
    # One-hot encode the baskets into a True/False matrix per transaction.
    te = TransactionEncoder()
    te_ary = te.fit(matrix).transform(matrix)
    df = pd.DataFrame(te_ary, columns=te.columns_)
    df  # notebook leftover; no-op
    # Products bought in more than five percent of the orders (support).
    support = apriori(df, min_support=0.05, use_colnames=True)
    print(
        'Products that are bought more than Five percent of the time (Support)'
    )
    print('')
    display(support)
    print('')
    # These items should always be kept in stock.
    # Products bought in more than one percent of the orders.
    print(
        'Products that are bought more than One percent of the time (Support) '
    )
    print('')
    frequent_itemsets = apriori(df, min_support=0.01, use_colnames=True)
    frequent_itemsets['length'] = frequent_itemsets['itemsets'].apply(
        lambda x: len(x))
    display(frequent_itemsets)
    # Rules with >= 20% confidence: products bought together at least 20%
    # of the time one of them is bought.
    print(
        'Products that are bought more than Twenty percent of the times together (Confidence)'
    )
    print('')
    products_association = association_rules(frequent_itemsets,
                                             metric="confidence",
                                             min_threshold=0.20)
    display(products_association)
    print('')
    # Lift = support(A and B) / (support(A) * support(B)); values above 2.5
    # indicate the pair co-occurs far more than if A and B were independent.
    print(
        'Product which are more than Two and a half times likely to be bought together (Lift) '
    )
    print('')
    products_association = products_association[
        products_association['lift'] >= 2.5]
    display(products_association)
def transform(self):
    """One-hot encode self.df (a list of transactions) in place and return it."""
    encoder = TransactionEncoder()
    encoded = encoder.fit(self.df).transform(self.df)
    self.df = pd.DataFrame(encoded, columns=encoder.columns_)
    return self.df
def apriori():
    """Flask endpoint: mine frequent itemsets and association rules from a DB table.

    Expected request JSON:
        {
          "table_name": "apriori_test",  # str, database table name
          "X": ["x0", "x1", ...],        # list, feature/transaction columns
          "alg": "fpgrowth",             # "apriori" or "fpgrowth" (default fpgrowth)
          "dataconvert": True,           # bool, one-hot convert first (default True)
          "minSupport": "0.05",          # str, minimum support (default "0.05")
          "max_len": "2",                # max frequent-itemset length (default None)
          "metrics": "confidence",       # rule metric: support/confidence/lift/
                                         #   leverage/conviction (default confidence)
          "min_threshold": "0.8",        # minimum metric value (default "0.8")
        }
    :return: JSON response with the frequent itemsets and rules as HTML tables.
    """
    log.info('Apriori_init...')
    request_data = init_route()
    try:
        # Local imports: inside the function, these names shadow this
        # module-level `apriori` view function, so the calls below hit mlxtend.
        from mlxtend.preprocessing import TransactionEncoder
        from mlxtend.frequent_patterns import apriori
        from mlxtend.frequent_patterns import fpgrowth
        from mlxtend.frequent_patterns import association_rules
    except:
        # NOTE(review): bare except; re-raised as a clearer ImportError.
        raise ImportError("cannot import mlxtend")
    try:
        # Parse and validate the request parameters.
        table_name = request_data['table_name']
        X = request_data['X']
        alg = request_data['alg']
        dataconvert = request_data['dataconvert']
        min_support = float(request_data['minSupport'])
        # NOTE(review): int(...) raises TypeError if max_len arrives as the
        # documented default None -- confirm init_route() supplies a string.
        max_len = int(request_data['max_len'])
        metrics = request_data['metrics']
        min_threshold = float(request_data['min_threshold'])
    except Exception as e:
        log.info(e)
        raise e
    try:
        # Pull the raw transactions from the database; empty cells become "".
        table_data = exec_sql(table_name, X)
        table_data.fillna("", inplace=True)
        data = table_data.values.tolist()
        if dataconvert:
            # One-hot encode the row lists into a boolean DataFrame.
            trans = TransactionEncoder()
            data = trans.fit(data).transform(data)
            data = pd.DataFrame(data, columns=trans.columns_)
        log.info("data columns:{}".format(data.columns.values))
        # Drop the column produced by the "" fill -- it is not a real item.
        if "" in data.columns:
            data.drop(columns="", axis=1, inplace=True)
        # Mine frequent itemsets with the selected algorithm.
        if alg == "apriori":
            frequent_itemsets = apriori(data,
                                        min_support=min_support,
                                        max_len=max_len,
                                        use_colnames=True)
        elif alg == "fpgrowth":
            frequent_itemsets = fpgrowth(data,
                                         min_support=min_support,
                                         max_len=max_len,
                                         use_colnames=True)
        else:
            raise ValueError(
                "input Association rules:{} is not support".format(alg))
        # Derive rules and blank out infinite metric values for display.
        rules = association_rules(frequent_itemsets,
                                  metric=metrics,
                                  min_threshold=min_threshold)
        rules = rules.replace([np.inf, -np.inf], "")
        rules = format_dataframe(rules, {"lift": ".4f", "leverage": ".4f"})
        # Package both tables for the HTML response.
        res = [
            transform_table_data_to_html({
                "title": "频繁项集结果",
                "row": frequent_itemsets.index.tolist(),
                "col": frequent_itemsets.columns.tolist(),
                "data": frequent_itemsets.values.tolist(),
            }),
            transform_table_data_to_html({
                "title": "关联规则结果",
                "row": rules.index.tolist(),
                "col": rules.columns.tolist(),
                "data": rules.values.tolist(),
            })
        ]
        response_data = {"res": res, "code": "200", "msg": "ok!"}
        return jsonify(response_data)
    except Exception as e:
        log.exception(e)
        return jsonify({"code": "500", "res": "", "msg": "{}".format(e.args)})
def main(inputs,
         infile,
         outfile,
         min_support=0.5,
         min_confidence=0.5,
         min_lift=1.0,
         min_conviction=1.0,
         max_length=None):
    """Mine association rules from a tab-separated transaction file.

    Parameters
    ----------
    inputs : str
        File path to galaxy tool parameter (JSON; 'header0' flags a header row)
    infile : str
        File path of the input transactions (one tab-separated line each)
    outfile : str
        File path for the output rules matrix (tab-separated)
    min_support : float
        Minimum support for FP-growth itemset mining
    min_confidence : float
        Minimum confidence for the generated rules
    min_lift : float
        Minimum lift used to filter the rules
    min_conviction : float
        Minimum conviction used to filter the rules
    max_length : int
        Maximum itemset length (None = unbounded)
    """
    warnings.simplefilter('ignore')

    with open(inputs, 'r') as param_handler:
        params = json.load(param_handler)

    input_header = params['header0']
    header = 'infer' if input_header else None

    with open(infile) as fp:
        lines = fp.read().splitlines()

    # Drop the header line if the tool parameters declare one.
    if header is not None:
        lines = lines[1:]

    # One transaction (list of items) per tab-separated line.
    dataset = []
    for line in lines:
        line_items = line.split("\t")
        dataset.append(line_items)

    # TransactionEncoder learns the unique labels in the dataset and transforms the
    # input dataset (a Python list of lists) into a one-hot encoded NumPy boolean array
    te = TransactionEncoder()
    te_ary = te.fit_transform(dataset)

    # Turn the encoded NumPy array into a DataFrame
    df = pd.DataFrame(te_ary, columns=te.columns_)

    # Extract frequent itemsets for association rule mining
    # use_colnames: Use DataFrames' column names in the returned DataFrame instead of column indices
    frequent_itemsets = fpgrowth(df,
                                 min_support=min_support,
                                 use_colnames=True,
                                 max_len=max_length)

    # Get association rules, with confidence larger than min_confidence
    rules = association_rules(frequent_itemsets,
                              metric="confidence",
                              min_threshold=min_confidence)

    # Filter association rules, keeping rules with lift and conviction
    # no smaller than min_lift and min_conviction respectively.
    rules = rules[(rules['lift'] >= min_lift)
                  & (rules['conviction'] >= min_conviction)]

    # Convert columns from frozenset to list (more readable)
    rules['antecedents'] = rules['antecedents'].apply(list)
    rules['consequents'] = rules['consequents'].apply(list)

    # The next 3 steps are intended to fix the order of the association
    # rules generated, so tests that rely on diff'ing a desired output
    # with an expected output can pass
    # 1) Sort entry in every row/column for columns 'antecedents' and 'consequents'
    rules['antecedents'] = rules['antecedents'].apply(lambda row: sorted(row))
    rules['consequents'] = rules['consequents'].apply(lambda row: sorted(row))

    # 2) Create two temporary string columns to sort on
    rules['ant_str'] = rules['antecedents'].apply(lambda row: " ".join(row))
    rules['con_str'] = rules['consequents'].apply(lambda row: " ".join(row))

    # 3) Sort results so they are re-producable
    rules.sort_values(by=['ant_str', 'con_str'], inplace=True)
    del rules['ant_str']
    del rules['con_str']
    rules.reset_index(drop=True, inplace=True)

    # Write association rules and metrics to file
    rules.to_csv(outfile, sep="\t", index=False)
def apriori_gen(dataset, min_support, out_path="test.csv"):
    """One-hot encode transactions, run apriori, and write the itemsets to CSV.

    Args:
        dataset: list of transactions, each an iterable of item labels.
        min_support: minimum support threshold passed to apriori.
        out_path: destination CSV file. Defaults to "test.csv", matching the
            previously hard-coded path, so existing callers are unaffected.
    """
    encoder = TransactionEncoder()
    onehot = encoder.fit(dataset).transform(dataset)
    df = pd.DataFrame(onehot, columns=encoder.columns_)
    apriori(df, min_support=min_support, use_colnames=True).to_csv(out_path)
def solve_3_2():
    """Mine frequent itemsets / association rules over users' game libraries.

    Reads the module-level ``users_gamedicts`` and rebuilds the module-level
    ``gamesofallusers`` (one list of game titles per user), then sweeps
    support/confidence/lift thresholds, printing the resulting rule counts.
    """
    global users_gamedicts, gamesofallusers
    # One transaction per user: the titles that user owns.
    gamesofallusers = [
        list(gamedict.keys()) for fid, gamedict in users_gamedicts.items()
    ]
    # Common Steam entries that are not actual games; remove them so they
    # do not pollute the itemsets. (Replaces a repetitive 8-branch if-chain;
    # each title is removed at most once per user, exactly as before.)
    non_games = (
        'Dota 2 Test',
        'True Sight',
        'True Sight: Episode 1',
        'True Sight: Episode 2',
        'True Sight: Episode 3',
        'True Sight: The Kiev Major Grand Finals',
        'True Sight: The International 2017',
        'True Sight: The International 2018 Finals',
    )
    for games in gamesofallusers:
        for title in non_games:
            if title in games:
                games.remove(title)

    from mlxtend.preprocessing import TransactionEncoder
    from mlxtend.frequent_patterns import apriori

    # One-hot encode the per-user libraries and mine frequent itemsets.
    te = TransactionEncoder()
    te_ary = te.fit(gamesofallusers).transform(gamesofallusers)
    df = pd.DataFrame(te_ary, columns=te.columns_)
    frequent_itemsets = apriori(df, min_support=0.2, use_colnames=True)
    frequent_itemsets.sort_values(by='support',
                                  ascending=False,
                                  ignore_index=True,
                                  inplace=True)

    import numpy as np
    from mlxtend.frequent_patterns import association_rules

    # Sweep the support filter f and the rule threshold t over [0.3, 1.0]
    # in 0.01 steps, printing itemset/rule counts for each combination.
    thresholds = np.arange(0.3, 1.0001, 0.01).tolist()
    for f in thresholds:
        f = round(f, 2)
        filtered_frequent_itemsets = frequent_itemsets[
            frequent_itemsets.support >= f]
        if len(filtered_frequent_itemsets.index) > 0:
            for t in thresholds:
                t = round(t, 2)
                conf_rules = association_rules(filtered_frequent_itemsets,
                                               metric="confidence",
                                               min_threshold=t)
                lift_rules = association_rules(filtered_frequent_itemsets,
                                               metric="lift",
                                               min_threshold=t)
                print(
                    f'f={f}, t={t}: {len(filtered_frequent_itemsets.index)} | {len(conf_rules.index)} | {len(lift_rules.index)}'
                )
        # Stop once the support filter leaves almost nothing to mine.
        if len(filtered_frequent_itemsets.index) <= 5:
            break
    # conf_rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.75)
    # lift_rules = association_rules(frequent_itemsets, metric="lift", min_threshold=0.75)
    pass
def recommend():
    """Market-basket analysis of the Olist e-commerce orders.

    Loads the Olist order/product/category-translation CSVs, builds one
    transaction per order (unique English category names), then walks through
    support, confidence, leverage and lift with apriori-mined rules.

    Side effects: prints DataFrame summaries, draws a bar plot, and writes
    result CSVs under result_datasets/.

    Fix: the confidence for health_beauty -> sports_leisure previously
    divided the joint support by support(sports_leisure) instead of
    support(health_beauty).
    """
    # Import modules.
    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt
    import seaborn as sns
    from mlxtend.preprocessing import TransactionEncoder

    # Load orders, products, and category-name translation datasets.
    orders = pd.read_csv(r'./input_original_datasets/olist_order_items_dataset.csv')
    products = pd.read_csv(r'./input_original_datasets/olist_products_dataset.csv')
    translations = pd.read_csv(r'./input_original_datasets/product_category_name_translation.csv')

    # Notebook-style inspection calls; .info() prints, .head() is a no-op here.
    orders.head()
    orders.info()
    products.head()
    products.info()
    translations.head()
    translations.info()

    # Translate product category names to English.
    products = products.merge(translations, on='product_category_name', how="left")
    products['product_category_name_english']

    # Attach the English category name to each order line.
    orders = orders.merge(products[['product_id', 'product_category_name_english']],
                          on='product_id', how='left')
    orders.head()

    # Drop products without a defined category.
    orders.dropna(inplace=True, subset=['product_category_name_english'])

    # Number of unique items / unique categories (notebook echoes).
    len(orders['product_id'].unique())
    len(orders['product_category_name_english'].unique())

    # Example transactions for two specific orders.
    example1 = orders[orders['order_id'] == 'fe64170e936bc5f6a6a41def260984b9']['product_category_name_english']
    example1
    example2 = orders[orders['order_id'] == 'fffb9224b6fc7c43ebb0904318b10b5f']['product_category_name_english']
    example2

    # Map orders to transactions: unique category set per order.
    transactions = orders.groupby("order_id").product_category_name_english.unique()
    transactions.head()

    # Plot the 50 largest categories of transactions.
    transactions.value_counts()[:50].plot(kind='bar', figsize=(15, 5))

    # Convert the pandas series to a list of lists.
    transactions = transactions.tolist()
    len(transactions)

    # Distribution of items per transaction.
    counts = [len(transaction) for transaction in transactions]
    np.median(counts)
    np.max(counts)

    # One-hot encode the transactions.
    from mlxtend.preprocessing import TransactionEncoder
    encoder = TransactionEncoder()
    encoder.fit(transactions)
    onehot = encoder.transform(transactions)
    onehot = pd.DataFrame(onehot, columns=encoder.columns_)
    onehot.head()

    # Support per item (column means) and item-count distribution per row.
    onehot.mean(axis=0)
    onehot.sum(axis=1).value_counts()

    # Itemset column: both sports_leisure AND health_beauty in the basket.
    onehot['sports_leisure_health_beauty'] = onehot['sports_leisure'] & onehot['health_beauty']
    onehot['sports_leisure_health_beauty'].mean(axis=0)

    # Aggregate sub-categories with inclusive OR.
    onehot['books'] = onehot['books_imported'] | onehot['books_technical']
    onehot[['books', 'books_imported', 'books_technical']].mean(axis=0)

    # Confidence metric: joint support divided by the antecedent's support.
    joint_support = (onehot['sports_leisure'] & onehot['health_beauty']).mean()
    # Confidence for sports_leisure -> health_beauty.
    joint_support / onehot['sports_leisure'].mean()
    # Confidence for health_beauty -> sports_leisure.
    # BUGFIX: divide by support(health_beauty), not support(sports_leisure).
    joint_support / onehot['health_beauty'].mean()

    # The Apriori algorithm at progressively smaller support thresholds.
    from mlxtend.frequent_patterns import apriori
    frequent_itemsets = apriori(onehot, min_support=0.01)
    frequent_itemsets
    frequent_itemsets = apriori(onehot, min_support=0.001, use_colnames=True)
    frequent_itemsets
    frequent_itemsets = apriori(onehot, min_support=0.00005, use_colnames=True)
    frequent_itemsets
    # Limit itemsets to pairs.
    frequent_itemsets = apriori(onehot, min_support=0.00005, max_len=2, use_colnames=True)

    # Association rules from the apriori output.
    from mlxtend.frequent_patterns import association_rules
    rules = association_rules(frequent_itemsets, metric='support', min_threshold=0.0001)
    rules.head()
    rules.to_csv('result_datasets/result_apriori.csv')

    # Prune by confidence.
    rules = association_rules(frequent_itemsets, metric='confidence', min_threshold=0.01)
    rules
    rules.to_csv('result_datasets/result_Pruning.csv')

    # Keep rules whose consequent support exceeds 0.095.
    rules = rules[rules['consequent support'] > 0.095]
    rules

    # Keep rules with positive leverage.
    rules = rules[rules['leverage'] > 0.0]
    rules

    # Rules with a (near-zero) minimum support threshold, for visualization.
    rules = association_rules(frequent_itemsets, metric='support', min_threshold=0.000001)
def test_inverse_transform():
    """inverse_transform on the expected one-hot array recovers the sorted data."""
    encoder = TransactionEncoder()
    encoder.fit(dataset)
    recovered = encoder.inverse_transform(expect)
    np.testing.assert_array_equal(np.array(data_sorted), np.array(recovered))
# Load the purchase records and group them into per-(panel, date) baskets.
data = pd.read_csv("correct_data.csv")
df = pd.DataFrame(data)
df = (df.groupby(["Panel ID", "Date"]))

# Unique categories per basket, then the overall distinct-category universe.
list_of_unique = list(df["Category"].unique())
flattened = [i for t in list_of_unique for i in t]
groceries = list(set(flattened))

# All ordered (antecedent, consequent) pairs of distinct categories.
rules = list(permutations(groceries, 2))
rules_df = pd.DataFrame(rules, columns=['antecedents', 'consequents'])
print(rules)

# One-hot encode the baskets and compute per-category support.
encoder = TransactionEncoder().fit(list_of_unique)
onehot = encoder.transform(list_of_unique)
onehot = pd.DataFrame(onehot, columns=encoder.columns_)
support = onehot.mean()
print(onehot.head())
print(support)


def support(x):
    """Return the support (mean of a boolean column/conjunction) of x.

    BUGFIX: the original computed x.mean() but never returned it, so every
    call yielded None. (Note: this def also shadows the module-level
    `support` Series above, as before.)
    """
    # Compute support for antecedent AND consequent
    return x.mean()
def test_fit_transform():
    """fit_transform produces the expected one-hot array in a single step."""
    encoder = TransactionEncoder()
    encoded = encoder.fit_transform(dataset)
    np.testing.assert_array_equal(expect, encoded)
def test_transform_sparse():
    """transform(sparse=True) returns a CSR matrix equal to the dense expectation."""
    encoder = TransactionEncoder()
    encoder.fit(dataset)
    result = encoder.transform(dataset, sparse=True)
    assert isinstance(result, csr_matrix)
    np.testing.assert_array_equal(expect, result.todense())
def test_fit():
    """fit learns the sorted vocabulary of unique items as columns_."""
    encoder = TransactionEncoder()
    encoder.fit(dataset)
    expected_columns = ['Apple', 'Bananas', 'Beer', 'Chicken', 'Milk', 'Rice']
    assert encoder.columns_ == expected_columns
def encode(data):
    """One-hot encode a list of transactions into a boolean DataFrame.

    BUGFIX: the original encoded a module-level variable ``l`` instead of
    the ``data`` argument it was given, so the parameter was ignored
    (and the function raised NameError when no global ``l`` existed).

    Args:
        data: list of transactions, each an iterable of item labels.

    Returns:
        pandas.DataFrame of booleans, one column per unique item.
    """
    te = TransactionEncoder()
    te_ary = te.fit(data).transform(data)
    df = pd.DataFrame(te_ary, columns=te.columns_)
    return df
# %%
# Collect the distinct items appearing across all transactions.
unique_items = set(np.unique(np.concatenate(np_data)))
# %%
print(unique_items)
# %%
print(len(unique_items))
# %% [markdown]
# ## FPGrowth and FPMax
# %%
# 1 -- one-hot encode the transactions into a boolean DataFrame.
te = TransactionEncoder()
te_ary = te.fit(np_data).transform(np_data)
data = pd.DataFrame(te_ary, columns=te.columns_)
# %%
data  # notebook echo; no effect as a plain statement
# %%
# 2 -- mine frequent itemsets at 3% support; record each itemset's size.
result_fpgrowth = fpgrowth(data, min_support=0.03, use_colnames=True)
result_fpgrowth['length'] = np.fromiter(
    (len(itemset) for itemset in result_fpgrowth['itemsets']), dtype=int)
# %%
print(num_customer.describe())

# Q2-B: number of distinct items across the grocery data.
print("\n Answer to question 2 part B :\n")
print("Unique Items = ", len(set(Groceries_df['Item'])))

# Q2-C: histogram of items per customer.
print("\n Answer to question 2 part C :\n")
plt.hist(x=num_customer)
plt.grid(True)
plt.show()

# One basket (item list) per customer, one-hot encoded for apriori.
ListItem = Groceries_df.groupby(['Customer'
                                 ])['Item'].apply(list).values.tolist()
tran_encoder = TransactionEncoder()
tran_encoder_list = tran_encoder.fit(ListItem).transform(ListItem)
ItemIndicator = pd.DataFrame(tran_encoder_list,
                             columns=tran_encoder.columns_)

# Q2-D: itemsets supported by at least 75 customers.
print("\n Answer to question 2 part D :\n")
frq_itemsets = apriori(ItemIndicator,
                       min_support=75 / len(num_customer),
                       use_colnames=True)
frq_itemsets['length'] = frq_itemsets['itemsets'].apply(len)

# Report how many itemsets were found and the largest itemset size.
print("The number of itemsets: ")
print(len(frq_itemsets))
print("The higest K value : ")
print(max(frq_itemsets['length']))
# Bin creation and assignment: one 3-element "transaction" per data row.
CGmax, CG0, BOLmax = [], [], []
apriDF = []
for i in range(len(cgm)):
    cgm_row = cgm.loc[i]
    bol_row = bol.loc[i]
    CGmax.append(maximum(cgm_row))
    BOLmax.append(maximum(bol_row))
    CG0.append(cgm_row[5])
    apriDF.append([
        CalculateBins(max(cgm_row)),
        CalculateBins(cgm_row[5]),
        max(bol_row),
    ])

# Apriori: one-hot encode the bin triples, then mine with a near-zero
# support floor so effectively every itemset is kept.
transEnc = TransactionEncoder()
transactions = pd.DataFrame(transEnc.fit(apriDF).transform(apriDF),
                            columns=transEnc.columns_)
rules = ar(ap(transactions, min_support=0.00000000001, use_colnames=True),
           min_threshold=0.0)
rules["antecedent_len"] = rules["antecedents"].apply(len)

# Flatten the frozenset reprs into plain comma-separated strings.
for column in ['antecedents', 'consequents']:
    rules[column] = rules[column].astype(str)
    rules[column] = rules[column].str.replace(re.escape('frozenset({'), '')
    rules[column] = rules[column].str.replace(re.escape('})'), '')
rules["SET"] = rules["antecedents"] + ',' + rules['consequents']
rules['SET'] = rules['SET'].str.replace("'", "")
rules['SET'] = rules.SET.apply(lambda x: x.split(','))
#rules.to_csv("Rules.csv")

# Numeric matrix: each rule's combined antecedent+consequent values.
li = rules['SET'].tolist()
y = [[float(j) for j in i] for i in li]
def makedataframe(dataset):
    """Return a one-hot encoded boolean DataFrame for the given transactions."""
    encoder = TransactionEncoder()
    onehot = encoder.fit(dataset).transform(dataset)
    return pd.DataFrame(onehot, columns=encoder.columns_)
# print(temp) transactions.append(temp) # print(transactions) '''使用efficient_apriori工具包''' from efficient_apriori import apriori itemsets, rules = apriori(transactions, min_support=0.05, min_confidence=0.3) print('频繁项集:', itemsets) print('关联规则:', rules) print('-' * 100) '''采用mlxtend.frequent_patterns工具包''' from mlxtend.frequent_patterns import apriori from mlxtend.frequent_patterns import association_rules from mlxtend.preprocessing import TransactionEncoder # 进行one-hot编码 te = TransactionEncoder() # 套路,当作工具使用 data = te.fit_transform(transactions) # 套路,当作工具使用 transactions = pd.DataFrame( data, columns=te.columns_) # 使用原list里的数值作为columns,否则将会由数字进行代替 itemsets = apriori(transactions, use_colnames=1, min_support=0.05) # 按照支持度从大到小进行排序 itemsets = itemsets.sort_values(by="support", ascending=False) print('-' * 20, '频繁项集', '-' * 20) print(itemsets) # 根据频繁项集计算关联规则,设置最小提升度 rules = association_rules(itemsets, metric='lift', min_threshold=1.1) # 按照提升度从大到小进行排序 rules = rules.sort_values(by="lift", ascending=False) print('-' * 20, '关联规则', '-' * 20) print(rules)
def get_frequent_set(total_i_list, appointed_output, my_refrigerator):
    """Recommend up to five frequent ingredient itemsets for a user.

    Mines frequent itemsets from `total_i_list` (list of ingredient
    transactions) and picks, in four relaxation rounds, itemsets that
    contain the requested ingredients (`appointed_output`, a set) and/or
    fit the user's available ingredients (`my_refrigerator`, a set).

    Returns a list of up to five [itemset-index, support, itemset,
    extra-items-in-fridge, missing-items, unmet-requested-items, round]
    entries.
    """

    def set_giver(sort_set_list: list):
        # Walk the support-sorted itemsets in four rounds of progressively
        # looser criteria, collecting up to five recommendations.
        # NOTE(review): this mutates sort_set_list between rounds while
        # `num` is also used to index the *unmodified* sort_set DataFrame;
        # after removals the two can drift out of alignment -- confirm.
        recommended_fequent_set = []
        for r_num in range(1, 5):
            recommended_name = []
            for num, i in enumerate(sort_set_list):
                if r_num == 1:
                    # Conditions are written negated: anything that fails
                    # the round's criterion is skipped via continue.
                    # Round 1: contains the requested items AND fits fridge.
                    if not ((appointed_output.issubset(i)) and
                            (i.issubset(my_refrigerator))):
                        continue
                elif r_num == 2:
                    # Round 2: contains the requested items, support > 0.3.
                    if not ((appointed_output.issubset(i)) and
                            (round(sort_set.iloc[num, 0], 3) > 0.3)):
                        continue
                elif r_num == 3:
                    # Round 3: merely fits within the fridge contents.
                    if not i.issubset(my_refrigerator):
                        continue
                else:
                    # Round 4: accept anything remaining.
                    pass
                # Only itemsets that pass the round's criterion get here.
                '''
                i: 關聯配對
                i & my_refrigerator - appointed_output: 找出的配對有除了input以外的其他食材(冰箱內的)
                i - my_refrigerator: 配對中有使用者沒有的
                appointed_output - i:提供的配對有非input的食材
                '''
                # set_info fields: index, support (3 d.p.), the itemset,
                # fridge items beyond the request, items the user lacks,
                # requested items not covered, and the round number.
                set_info = [
                    sort_set.index[num],
                    round(sort_set.iloc[num, 0], 3), i,
                    i & my_refrigerator - appointed_output,
                    i - my_refrigerator, appointed_output - i, r_num
                ]
                recommended_fequent_set.append(set_info)
                recommended_name.append(i)
                # Return as soon as five recommendations are collected.
                if len(recommended_fequent_set) == 5:
                    return recommended_fequent_set
            # Remove this round's picks so later rounds don't repeat them.
            for selected_set in recommended_name:
                sort_set_list.remove(selected_set)
        # Return however many recommendations were found.
        return recommended_fequent_set

    # One-hot encode the transactions and mine frequent itemsets.
    te = TransactionEncoder()
    te_ary = te.fit(total_i_list).transform(total_i_list)
    df = pd.DataFrame(te_ary, columns=te.columns_)
    frequent_itemsets = apriori(df, min_support=0.1, use_colnames=True)
    # Too few itemsets at 10% support: retry with a data-derived floor.
    if frequent_itemsets["itemsets"].count() < 2:
        frequent_itemsets = apriori(df,
                                    min_support=1 / (df.count()[0] - 1),
                                    use_colnames=True)
    # print(f"{frequent_itemsets.count()}")
    frequent_itemsets['length'] = frequent_itemsets['itemsets'].apply(
        lambda x: len(x))
    # Keep only itemsets strictly larger than the requested set.
    output_set = frequent_itemsets[(frequent_itemsets['length'] >=
                                    len(appointed_output) + 1)]
    # print(output_set.count())
    # Highest-support itemsets first; set_giver reads this via closure.
    sort_set = output_set.sort_values(['support'], ascending=False)
    recommended_fequent_set = set_giver(sort_set['itemsets'].tolist())
    return recommended_fequent_set
def setUp(self):
    """Build a one-hot encoded DataFrame from a tiny fixed transaction set."""
    transactions = [['a'], ['b'], ['c', 'd'], ['e']]
    encoder = TransactionEncoder()
    encoded = encoder.fit(transactions).transform(transactions)
    self.df = pd.DataFrame(encoded, columns=encoder.columns_)
def run(self):
    """One-hot encode self.df (transaction lists) and mine itemsets at 5% support."""
    encoder = TransactionEncoder()
    encoded = encoder.fit(self.df).transform(self.df)
    onehot = pd.DataFrame(encoded, columns=encoder.columns_)
    return apriori(onehot, min_support=0.05)
def obterner_lista(self):
    """Fetch per-order product-family transactions from MySQL and mine them
    with FP-growth.

    Fixes over the original:
    - `finally` closed the connection *before* the cursor and raised
      NameError when the connect itself failed; resources are now
      initialized to None, and the cursor is closed before the connection,
      each only if it was created.
    - The GROUP_CONCAT parser dropped the last family id of every row
      (digits accumulated after the final comma were discarded); the
      trailing id is now appended.
    """
    db = None
    db_cursor = None
    try:
        db = pymysql.connect(host=self.host,
                             user=self.user,
                             password=self.password,
                             database=self.database)
        #TODO: Borrar esta linea
        self.id_cliente = 273
        self.id_local = 42
        db_cursor = db.cursor()
        # One row per order: a comma-separated list of family ids.
        query = """
            SELECT GROUP_CONCAT(DISTINCT f.id_familia) productos
            FROM pedido_articulo pa
            JOIN pedido p USING (id_pedido)
            JOIN articulo a USING (cod_interno)
            JOIN familia f USING (id_familia)
            WHERE p.id_cliente = %s AND p.id_local = %s
            GROUP BY p.id_pedido
            LIMIT 2
            ;"""
        execute = db_cursor.execute(query, (self.id_cliente, self.id_local))
        print(" LA exceute ", execute, '\n')
        # Parse each "1,2,3"-style row into a list of ints.
        dataset = []
        row = db_cursor.fetchone()
        print(row)
        while row is not None:
            data_aux = []
            e = ''
            for elem in row[0]:
                if elem != ',':
                    e = e + elem
                else:
                    data_aux.append(int(e))
                    e = ''
            # BUGFIX: keep the id accumulated after the final comma.
            if e:
                data_aux.append(int(e))
            dataset.append(data_aux)
            row = db_cursor.fetchone()
        print(dataset)
        # One-hot encode the transactions and mine them with FP-growth.
        te = TransactionEncoder()
        te_ary = te.fit(dataset).transform(dataset)
        df = pd.DataFrame(te_ary, columns=te.columns_)
        pd.set_option("display.max_rows", None, "display.max_columns", None)
        print(df)
        fpg = fpgrowth(df, min_support=0.6)
        print(fpg)
    except Exception as e:
        print(e)
        exit(-1)
    finally:
        # Close the cursor before the connection; guard against a failed
        # connect leaving either unset.
        if db_cursor is not None:
            db_cursor.close()
        if db is not None:
            db.close()
#################################
# Import necessary python libraries
import pandas as pd
import csv
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules

# Read the CSV file into a list of transactions (one row per transaction);
# the data must be structured like the example given in the tutorial.
with open('apriori_data.csv') as f:
    dataset = list(csv.reader(f))
for row in dataset:
    print(row)

# One-hot encode the transactions for the apriori algorithm.
oht = TransactionEncoder()
df = pd.DataFrame(oht.fit(dataset).transform(dataset), columns=oht.columns_)
print(df)

# Frequent itemsets at 60% support, then rules at 70% confidence.
frequent_itemsets = apriori(df, min_support=0.6, use_colnames=True)
print(frequent_itemsets)
rules = association_rules(frequent_itemsets,
                          metric="confidence",
                          min_threshold=0.7)
print(rules[['antecedents', 'consequents', 'support', 'confidence']])
def return_support(list_sku=list_sku):
    """Mine itemsets with support >= 5% from the given transaction lists.

    NOTE: the default binds the module-level ``list_sku`` at definition
    time, so later rebinding of that global does not change the default.
    """
    encoder = TransactionEncoder()
    onehot = encoder.fit(list_sku).transform(list_sku)
    basket = pd.DataFrame(onehot, columns=encoder.columns_)
    return apriori(basket, min_support=0.05, use_colnames=True)
def reduce_rules(self, rules, instance_quorum, number_of_medoids):
    """Reduce a set of rules (paths) to roughly `instance_quorum` rules.

    Three reduction stages, each applied only while too many rules remain:
      1. Apriori/association-rule mining over the features the rules use;
         rules containing "redundant" (infrequent) features are dropped.
      2. k-medoid clustering on pairwise path similarity; whole clusters
         (largest first) are kept until the quorum is reached.
      3. A random sample capped at exactly `instance_quorum` rules.

    Args:
        rules: Iterable of rules; each rule iterates over its features.
        instance_quorum: Target number of rules to keep.
        number_of_medoids: Number of medoids for the clustering stage.

    Returns:
        [reduced_rules, new_feature_list]: the surviving rules and the list
        of distinct features they reference.
    """
    # Materialize each rule's features as a plain list: these are the
    # "transactions" fed to the TransactionEncoder.
    get_itemsets = []
    for pr in rules:
        itemset = []
        for p in pr:
            itemset.append(p)
        get_itemsets.append(itemset)
    te = TransactionEncoder()
    te_ary = te.fit(get_itemsets).transform(get_itemsets)
    df = pd.DataFrame(te_ary, columns=te.columns_)
    # Association rules over feature co-occurrence, ordered by ascending
    # confidence; only the 'antecedents' column is used below.
    frequent_itemsets = association_rules(apriori(df, min_support=0.1,
                                                  use_colnames=True),
                                          metric="support",
                                          min_threshold=0.1).sort_values(
                                              by="confidence", ascending=True)
    size = 0
    k = 1
    size_of_ar = len(list(list(frequent_itemsets['antecedents'])))
    items = set()
    reduced_rules = rules
    new_feature_list = []
    # Fallback feature list: every feature appearing in any input rule.
    for pr in reduced_rules:
        for p in pr:
            items.add(p)
    new_feature_list = list(items)
    # Stage 1: grow the allowed feature set one antecedent at a time until
    # at least `instance_quorum` rules survive the redundancy filter.
    while size < instance_quorum and k < size_of_ar:
        feature_set = set()
        for i in range(0, k):
            for j in list(list(frequent_itemsets['antecedents'])[i]):
                feature_set.add(j)
        new_feature_list = list(feature_set)
        redundant_features = [
            i for i in self.feature_names if i not in new_feature_list
        ]
        # Keep only rules that mention no redundant feature.
        reduced_rules = []
        for i in rules:
            if sum([1 for j in redundant_features if j in i]) == 0:
                reduced_rules.append(i)
        size = len(reduced_rules)
        k += 1
    # Free the mining intermediates before the (possibly large) O(n^2)
    # similarity matrix is built.
    del get_itemsets, te, te_ary, df, frequent_itemsets
    # If the filter was too aggressive, fall back to all original rules.
    if len(reduced_rules) < instance_quorum:
        reduced_rules = rules
        for pr in reduced_rules:
            for p in pr:
                items.add(p)
        new_feature_list = list(items)
    if len(reduced_rules
           ) > instance_quorum:  # If we need more reduction on path level
        # Stage 2: pairwise dissimilarity matrix (1 - similarity).
        A = []
        for k in range(len(reduced_rules)):
            B = []
            for j in range(len(reduced_rules)):
                if k == j:
                    B.append(0)  # or 1? — self-dissimilarity taken as 0
                else:
                    sim = path_similarity(reduced_rules[k], reduced_rules[j],
                                          new_feature_list,
                                          self.min_max_feature_values)
                    # sim = path_distance(reduced_rules[k], reduced_rules[j], new_feature_list,
                    #                     self.min_max_feature_values)  # Tested with distance metric of iForest
                    B.append(1 - sim)
            A.append(B)
        A = np.array(A)
        MS, S = kmedoids.kMedoids(A, number_of_medoids)
        # Visit clusters from largest to smallest, keeping whole clusters
        # until the quorum is reached.
        medoids_sorted = sorted(S, key=lambda k: len(S[k]), reverse=True)
        k = 0
        size = 0
        reduced_rules_medoids = []
        while size < instance_quorum and k < len(medoids_sorted):
            for j in S[medoids_sorted[k]]:
                reduced_rules_medoids.append(reduced_rules[j])
            k += 1
            size = len(reduced_rules_medoids)
        items = set()
        # Adopt the clustered selection only if it still meets the quorum.
        if len(reduced_rules_medoids) >= instance_quorum:
            reduced_rules = reduced_rules_medoids
            for pr in reduced_rules_medoids:
                for p in pr:
                    items.add(p)
            new_feature_list = list(items)
    # Stage 3: final cap — random sample of exactly instance_quorum rules.
    # NOTE(review): random.shuffle mutates `reduced_rules` in place; when the
    # earlier fallback assigned `reduced_rules = rules`, this shuffles the
    # caller's list — confirm that is intended.
    if len(reduced_rules) > instance_quorum:
        random.shuffle(reduced_rules)
        reduced_rules = reduced_rules[:instance_quorum]
        items = set()
        for pr in reduced_rules:
            for p in pr:
                items.add(p)
        new_feature_list = list(items)
    return [reduced_rules, new_feature_list]
def main():
    """End-to-end experiment: mine keyword association rules from tweets
    ('data/train.csv'), vectorize messages with the mined keyword sets, and
    train/evaluate LogisticRegression and SVM classifiers.

    NOTE(review): as written, this function cannot run to completion —
    `zero_rules`, `ones_keywords`, `ones_hashtags` and `zeros_hashtags` are
    only defined inside commented-out code below, so the live lines that use
    them raise NameError. The relevant blocks need to be re-enabled (or the
    live lines removed) before this is executable.
    """
    train = pd.read_csv('data/train.csv')
    # test = pd.read_csv('data/test.csv')
    entity_weight = pd.read_csv('data/entity_weight.csv')
    train = train.drop(['keyword', 'location'], axis=1)
    # train_hashtags = train.copy()
    # Extract hashtags per tweet; keep only tweets that have at least one.
    train['hashtags'] = train['text'].apply(lambda x: extract_hashtags(x))
    train_hashtags = train[train['hashtags'].map(lambda d: len(d)) > 0].copy()
    train_hashtags['target'] = train_hashtags['target'].astype(str)
    # Transaction = hashtags + the target label appended as one more item,
    # so rules with consequent {'0'}/{'1'} can be mined.
    train_hashtags['t'] = train_hashtags['hashtags'] + train_hashtags['target'].apply(lambda x: [x])
    # hashtags = []
    # for x in train['hashtags']:
    #     hashtags.extend(x)
    #
    # hashtags = list(set(hashtags))
    cd = CleanData()
    data_clean = cd.normalize_text(train.copy())
    data_clean['keywords'] = data_clean['clean_text'].str.split()
    data_clean['target'] = data_clean['target'].astype('str')
    # keywords of all rows (unique tokens across the whole corpus)
    keys = []
    data_clean['keywords'].apply(lambda x: keys.extend(x))
    keys = list(set(keys))
    # keywords = extract_ents(keys)
    # (pd.DataFrame.from_dict(keywords, orient='index')).to_csv('data/keywords.csv')
    # Pre-computed entity annotations joined with per-entity weights.
    keywords = pd.read_csv('data/keywords.csv')
    keywords = keywords.merge(entity_weight, how='left', left_on='entity', right_on='entity')
    keywords_dic = dict(zip(keywords['keyword'], keywords['weight']))
    # messages_vector = text_to_vector_weighted_entity(data_clean, keywords_dic)
    # messages_vector.set_index('id').to_csv('data/messages_vector.csv', header=True)
    # Transaction = message tokens + target label, as above for hashtags.
    data_clean['t'] = data_clean['keywords'] + data_clean['target'].apply(lambda x: [x])
    te = TransactionEncoder()
    te_ary = te.fit(data_clean['t']).transform(data_clean['t'])
    df = pd.DataFrame(te_ary, columns=te.columns_)
    frequent_itemsets = apriori(df, min_support=0.005, use_colnames=True)
    # NOTE(review): association_rules is called with its default metric and
    # threshold here — confirm that is intended vs. the commented variant.
    rules = association_rules(frequent_itemsets)
    # rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1.2)
    rules["antecedent_len"] = rules["antecedents"].apply(lambda x: len(x))
    # Keep only rules that predict the target label with confidence >= 0.55.
    rules = rules[((rules['consequents'] == {'1'}) | (rules['consequents'] == {'0'})) & (rules['confidence'] >= 0.55)]
    # NOTE(review): `tmp` is computed but never used afterwards.
    tmp = rules[(rules['consequents'] == {'1'}) | (rules['consequents'] == {'0'})]
    # Union of all antecedent keywords across the kept rules.
    rule_keywords = list(rules['antecedents'])
    rule_keywords = frozenset().union(*rule_keywords)
    with open('data/rule_keywords.txt', 'w') as f:
        for item in rule_keywords:
            f.write("%s\n" % item)
    # NOTE(review): `rule_entity_keywords` is also unused below.
    rule_entity_keywords = {key: value for key, value in keywords_dic.items() if key in rule_keywords}
    # one_rules = rules[((rules['consequents'] == {'1'}) | (rules['consequents'] == {'0'})) & (rules['confidence'] >= 0.55)]
    # one_rules = one_rules[(one_rules['confidence'] >= 0.55)]
    # one_rules = one_rules[(one_rules['antecedent_len'] >= 2) &
    #                       (one_rules['confidence'] > 0.75) &
    #                       (one_rules['lift'] > 1.2)]
    # zero_rules = rules[(rules['consequents'] == {'0'})]
    # zero_rules = zero_rules[(zero_rules['confidence'] >= 0.55)]
    #
    # # hashtag rules
    # te_hashtags = TransactionEncoder()
    # te_ary_hashtags = te_hashtags.fit(train_hashtags['t']).transform(train_hashtags['t'])
    # df_hashtags = pd.DataFrame(te_ary_hashtags, columns=te_hashtags.columns_)
    # frequent_itemsets_hashtags = apriori(df_hashtags, min_support=0.005, use_colnames=True)
    #
    # rules_hashtags = association_rules(frequent_itemsets_hashtags, metric="confidence", min_threshold=0.6)
    # # rules_hashtags = association_rules(frequent_itemsets_hashtags, metric="lift", min_threshold=1.2)
    #
    # rules_hashtags["antecedent_len"] = rules_hashtags["antecedents"].apply(lambda x: len(x))
    #
    # one_rules_hashtags = rules_hashtags[(rules_hashtags['consequents'] == {'1'})]
    # one_rules_hashtags = one_rules_hashtags[(one_rules_hashtags['confidence'] >= 0.55)]
    #
    # zero_rules_hashtags = rules_hashtags[(rules_hashtags['consequents'] == {'0'})]
    # zero_rules_hashtags = zero_rules_hashtags[(zero_rules_hashtags['confidence'] >= 0.55)]
    #
    # # frozensets of keywords and hashtags
    # ones_keywords = list(one_rules['antecedents'])
    # ones_keywords = frozenset().union(*ones_keywords)
    # ones_hashtags = list(one_rules_hashtags['antecedents'])
    # ones_hashtags = frozenset().union(*ones_hashtags)
    #
    # NOTE(review): `zero_rules` is only defined in the commented block above
    # — this line raises NameError as written.
    zeros_keywords = list(zero_rules['antecedents'])
    # zeros_keywords = frozenset().union(*zeros_keywords)
    # zeros_hashtags = list(zero_rules_hashtags['antecedents'])
    # zeros_hashtags = frozenset().union(*zeros_hashtags)
    # vector of messages
    # NOTE(review): ones_keywords / ones_hashtags / zeros_hashtags are also
    # only defined in commented-out code — NameError as written.
    messages_vector = text_to_vector_ar(data_clean, ones_keywords, ones_hashtags, zeros_keywords, zeros_hashtags)
    X_train_df, X_test_df, y_train, y_test = train_test_split(data_clean['text'], data_clean['target'], random_state=0)
    # Align the feature vectors with the split by original row index.
    X_train = messages_vector.iloc[X_train_df.index].values
    X_test = messages_vector.iloc[X_test_df.index].values
    # train model
    clf = LogisticRegression(random_state=0).fit(X_train, y_train)
    y_predict = clf.predict(X_test)
    print('LogisticRegression scores:\n')
    print(classification_report(y_test, y_predict))
    clf_svm = svm.SVC()
    clf_svm.fit(X_train, y_train)
    y_predict = clf_svm.predict(X_test)
    print('SVM Results:\n')
    print(classification_report(y_test, y_predict))
    print('done')
# Load the bakery dataset and drop the 'NONE' placeholder items
# (transactions scanned with no real product).
data = pd.read_csv(r"C:\Users\acer\Downloads\BreadBasket_DMS.csv")
data = data.set_index(['Item'])
filtered = data.drop(['NONE'])
data = data.reset_index()
filtered = filtered.reset_index()

# Build one list of unique items per transaction. A single groupby pass is
# O(n), whereas the previous loop re-scanned the whole frame once per
# transaction id (O(n * #transactions)); the resulting baskets are the same.
grouped = filtered.groupby('Transaction')['Item'].apply(lambda s: list(set(s)))
transaction_list = [tlist for tlist in grouped if len(tlist) > 0]

# One-hot encode the baskets for apriori.
te = TransactionEncoder()
te_ary = te.fit(transaction_list).transform(transaction_list)
df2 = pd.DataFrame(te_ary, columns=te.columns_)

frequent_itemsets = apriori(df2, min_support=0.01, use_colnames=True)  # take minimum threshold
rules = association_rules(frequent_itemsets, metric='lift')
# BUG FIX: sort_values returns a new DataFrame; the previous code discarded
# the result, so the top-5 rules were printed unsorted.
rules = rules.sort_values('confidence', ascending=False)
print("Rules:\n")
print(rules.head(5))
print("\n")

# now categorise every rule with different range of confidence
# Express support/confidence as percentages; lift stays a plain ratio
# (the former `rules['lift'] = rules['lift']` self-assignment was a no-op
# and has been removed).
rules['support'] = rules['support'] * 100
rules['confidence'] = rules['confidence'] * 100
def _freqItemsetMining(self):
    """Mine frequent itemsets and association rules from self._dataSet,
    then one-hot encode the rules' antecedents against a fixed feature list.

    Thresholds, metric names, file paths and all feature/class labels come
    from the environment wrapper `self._env`.

    Returns:
        (x, y, rules): x — one-hot DataFrame of rule antecedents over
        `featureList`; y — the consequent label per rule (None when no
        rules survive the filter); rules — the filtered rules DataFrame.
    """
    y = None
    print("Starting FIM")
    print("Transaction Encoder is started")
    te = TransactionEncoder()
    # Each dataset row is treated as one transaction of items.
    te_ary = te.fit(self._dataSet.to_numpy()).transform(
        self._dataSet.to_numpy())
    transformed_df = pd.DataFrame(te_ary, columns=te.columns_)
    print("Transaction Encoder is finished")
    print("Apriori is started")
    freqItemsets = apriori(transformed_df, min_support=float(
        self._env.get(key="minSupport")), use_colnames=True)
    print("Saving Frequent Itemsets...")
    freqItemsets.to_csv(self._env.get(key="freqItemFilePath"))
    print("Frequent Itemsets Saved")
    print("Apriori is finished")
    print("Association rules mining is started")
    # NOTE(review): min_threshold reuses "minSupport" even though the metric
    # is configurable ("ruleMetric") — confirm a dedicated threshold key
    # wasn't intended.
    rules = association_rules(freqItemsets,
                              metric=self._env.get(key="ruleMetric"),
                              min_threshold=float(
                                  self._env.get(key="minSupport")))
    # Keep rules whose consequent is one of the class labels c1..c5.
    # NOTE(review): `&` binds tighter than `|` in pandas, so the antecedent
    # condition below is ANDed ONLY with the c5 term, not with all five —
    # almost certainly a precedence bug (missing parentheses around the ORs).
    # NOTE(review): `self._env.get(key="c1") not in rules.antecedents.to_list()`
    # compares a string against a list of frozensets, so it is effectively
    # always True — verify the intended per-row membership test.
    rules = rules[
        (rules['consequents'] == {self._env.get(key="c1")}) |
        (rules['consequents'] == {self._env.get(key="c2")}) |
        (rules['consequents'] == {self._env.get(key="c3")}) |
        (rules['consequents'] == {self._env.get(key="c4")}) |
        (rules['consequents'] == {self._env.get(key="c5")}) &
        ((self._env.get(key="c1") not in (rules.antecedents.to_list())) |
         (self._env.get(key="c2") not in (rules.antecedents.to_list())) |
         (self._env.get(key="c3") not in (rules.antecedents.to_list())) |
         (self._env.get(key="c4") not in (rules.antecedents.to_list())) |
         (self._env.get(key="c5") not in (rules.antecedents.to_list())))]
    # Convert frozenset columns to string lists for downstream consumption.
    rules['antecedents'] = rules.apply(
        lambda row: FIMFunctions.convertToStringList(
            str(list(row['antecedents']))), axis=1)
    rules['consequents'] = rules.apply(
        lambda row: FIMFunctions.convertToStringList(
            str(list(row['consequents']))), axis=1)
    rules.reset_index(inplace=True)
    print("Association rules mining is finished")
    print("One Hot encoding is started")
    # NOTE(review): `cols` is printed but otherwise unused.
    cols = self._dataSet.columns[:-1]
    print(cols)
    # Fixed vocabulary for one-hot encoding: protocols, levels, classes,
    # and d1..d10 labels, all resolved from the environment.
    featureList = [
        self._env.get(key="p-tcp"),
        self._env.get(key="p-http"),
        self._env.get(key="p-ssh"),
        self._env.get(key="p-dns"),
        self._env.get(key="p-ftp"),
        self._env.get(key="p-sshv2"),
        self._env.get(key="l0"),
        self._env.get(key="l1"),
        self._env.get(key="l2"),
        self._env.get(key="l3"),
        # self._env.get(key="r-public"), self._env.get(key="r-private"), self._env.get(key="r-non"),
        self._env.get(key="c1"),
        self._env.get(key="c2"),
        self._env.get(key="c3"),
        self._env.get(key="c4"),
        self._env.get(key="c5"),
        self._env.get(key="d1"),
        self._env.get(key="d2"),
        self._env.get(key="d3"),
        self._env.get(key="d4"),
        self._env.get(key="d5"),
        self._env.get(key="d6"),
        self._env.get(key="d7"),
        self._env.get(key="d8"),
        self._env.get(key="d9"),
        self._env.get(key="d10")
    ]
    x = pd.DataFrame(FIMFunctions.oneHot(rules, featureList),
                     columns=featureList)
    # Label per rule: first consequent string with brackets stripped.
    if len(x) > 0:
        y = rules.apply(lambda row: str(row["consequents"][0].replace(
            '[', '').replace(']', '')), axis=1)
    return x, y, rules