def test_cloning():
    oht = TransactionEncoder()
    oht.fit(dataset)
    oht2 = clone(oht)

    msg = "'TransactionEncoder' object has no attribute 'columns_'"
    assert_raises(AttributeError, msg, oht2.transform, dataset)

    trans = oht2.fit_transform(dataset)
    np.testing.assert_array_equal(expect, trans)
def print_frequent_itemsets(jsonfilename):
    # Use the filename passed in rather than a hard-coded path.
    with open(jsonfilename) as f:
        data = json.load(f)
    ls = []
    for col in data["columns"]:
        l = []
        for tp in col["dataTypes"]:
            dt = tp["type"]
            l.append(dt)
        ls.append(l)

    te = TransactionEncoder()
    te_ary = te.fit(ls).transform(ls)
    df = pd.DataFrame(te_ary, columns=te.columns_)
    fi = apriori(df, min_support=0.1, use_colnames=True)
    fi['length'] = fi['itemsets'].apply(lambda x: len(x))
    for i in range(2, 4):
        print(i, "frequent itemset")
        print("------------------------------------")
        tm = fi[(fi['length'] == i) & (fi['support'] >= 0.1)]
        if tm.size == 0:
            print("No itemsets present")
        else:
            print(tm["itemsets"].to_string())
        print("------------------------------------\n")
def frequent_set_miner(self, T, K):
    te = TransactionEncoder()
    te_ary = te.fit(T).transform(T)
    df = pd.DataFrame(te_ary, columns=te.columns_)
    frequent_itemsets = apriori(df, min_support=K, use_colnames=True)

    # Convert the frozensets to a list of lists
    listofList = []
    for item in list(frequent_itemsets['itemsets']):
        listofList.append(list(item))

    # We only want the biggest sets: keep an itemset only if it is not
    # contained in one that was already kept (maximal itemsets).
    patterns = sorted(listofList, key=len)
    if len(patterns) > 0:
        frequent = [patterns.pop()]
    else:
        frequent = []
    while len(patterns) > 0:
        candidate = patterns.pop()
        is_maximal = True
        for f in frequent:
            if all(elem in f for elem in candidate):
                is_maximal = False
                break
        if is_maximal:
            frequent.append(candidate)
    return frequent
def mine_rules(itemsets):
    test = itemsets.copy()
    te = TransactionEncoder()
    te_ary = te.fit(test).transform(test, sparse=True)
    df = pd.DataFrame.sparse.from_spmatrix(te_ary, columns=te.columns_)
    # Note: apriori returns frequent itemsets, not association rules.
    rules = apriori(df, min_support=0.001, use_colnames=True)
    return rules
def extract(min_sup=0.4, i_url="./data/chat_server.csv", min_num=1):
    # minimum support, input file path, minimum combination size
    data, raw = load_data(i_url)
    te = TransactionEncoder()
    te_ary = te.fit(data).transform(data)
    df = pd.DataFrame(te_ary, columns=te.columns_)
    # Extract frequently occurring keywords
    frequent_items = apriori(df, min_support=min_sup, use_colnames=True)
    frequent_items['length'] = frequent_items['itemsets'].apply(lambda x: len(x))
    wunch_input = []
    # print("APRIORI", frequent_items[frequent_items["length"] > min_num]['itemsets'])
    for line in frequent_items[frequent_items["length"] == 1]['itemsets']:
        if len(line) < 1:
            continue  # skip empty itemsets (the original 'pass' had no effect)
        temp = ""
        for word in line:
            temp = temp + " " + word
        wunch_input.append(temp[1:])
    return wunch_input
def SelectSpec(path1, year, ex=float(1.0)):
    data = pd.read_excel(path1)
    data_ex = data[data['顺序'] == ex]
    data_site = set(data_ex['样地号'])
    D = {}
    Selsect_set = []
    Site_set = []
    for i in data_site:
        Site_spec = list(set(data_ex[data_ex['样地号'] == i]['物种']))
        Site_set.append(Site_spec)

    te = TransactionEncoder()
    # One-hot (0/1) encode the transactions
    te_array = te.fit(Site_set).transform(Site_set)
    df = pd.DataFrame(te_array, columns=te.columns_)
    # Use apriori to find the frequent itemsets
    freq = apriori(df, min_support=0.5, use_colnames=True)
    Max_len = 3
    n = 0
    for item in reversed(freq['itemsets']):
        if len(item) >= Max_len:
            # Max_len = len(item)
            Spec_lis = [i for i in item]
            D[n] = []
            Selsect_set.append(Spec_lis)  # save every species combination
            D[n].append(Spec_lis)
            n = n + 1
        else:
            break
    # Return the species sets and the plots they correspond to
    return Selsect_set, Site_set, data_site
def SelectSpec(engine, year, ex=str(1.0)):
    data = pd.read_sql(str(year), con=engine)
    data_ex = data[data['顺序'] == ex]
    data_site = set(data_ex['样地号'])
    D = {}
    Selsect_set = []
    Spec_set = []
    for i in data_site:
        Site_spec = list(set(data_ex[data_ex['样地号'] == i]['物种']))
        Spec_set.append(Site_spec)

    te = TransactionEncoder()
    # One-hot (0/1) encode the transactions
    te_array = te.fit(Spec_set).transform(Spec_set)
    df = pd.DataFrame(te_array, columns=te.columns_)
    # Use apriori to find the frequent itemsets
    freq = apriori(df, min_support=0.5, use_colnames=True)
    Max_len = 0
    n = 0
    for item in reversed(freq['itemsets']):
        if len(item) >= Max_len:
            Max_len = len(item)
            Spec_lis = [i for i in item]
            D[n] = []
            Selsect_set.append(Spec_lis)
            D[n].append(Spec_lis)
            site_temp = []
            for i in range(len(Spec_set)):
                if (set(Spec_set[i]) | item) == set(Spec_set[i]):
                    site_temp.append(list(data_site)[i])
            D[n].append(site_temp)
            n = n + 1
        else:
            break
    # print(D)
    return Selsect_set, D
def fpg(sent):
    x = dict()
    words = []
    for i in range(len(sent)):
        # print(sent[i])
        words.append(preprocess(sent[i]))
    # print(words)
    # print(classe)

    # Use a separate name for the result so it does not shadow this function;
    # fall back to an empty frame if encoding fails.
    result = pd.DataFrame()
    try:
        te = TransactionEncoder()
        # patterns = pyfpgrowth.find_frequent_patterns(words, 10)
        # rules = pyfpgrowth.generate_association_rules(patterns, 0.8)
        te_ary = te.fit(words).transform(words)
        words = []
        df_r = pd.DataFrame(te_ary, columns=te.columns_)
        # print(df_r)
        # print(rules)
        result = fpgrowth(df_r, min_support=0.1, use_colnames=True, max_len=3)
    except ValueError:
        print('Value Error')
    print(result)
    return result
def recommend2(request):
    dataset = []
    users = User.objects.all()
    for user in users:
        liked_movies = Movie.objects.filter(user_like=user)
        liked_movie_list = []
        for movie in liked_movies:
            liked_movie_list.append(movie.title)
        dataset.append(liked_movie_list)

    te = TransactionEncoder()
    te_ary = te.fit(dataset).transform(dataset)
    df = pd.DataFrame(te_ary, columns=te.columns_)
    frequent_itemsets = apriori(df, min_support=0.3, use_colnames=True)
    association = association_rules(frequent_itemsets,
                                    metric="confidence",
                                    min_threshold=0.3)

    # Collect every rule row (the original looped until an IndexError was raised)
    associations = [association.iloc[i] for i in range(len(association))]

    context = {
        'liked_movies': liked_movies,
        'associations': associations,
    }
    return render(request, 'movies/recommend2.html', context)
def main():
    baskets = []
    infile = open("StudentsPerformance.csv", "r")
    infile.readline()  # skip the header row
    for line in infile:
        line = line.strip('\n')
        basket = line.split(",")
        # The last three columns are the numeric scores; average them
        n1 = int(basket.pop())
        n2 = int(basket.pop())
        n3 = int(basket.pop())
        avg = (n1 + n2 + n3) / 3
        if avg >= 70:
            basket.append("Pass")
        else:
            basket.append("Fail")
        baskets.append(basket)
    infile.close()

    te = TransactionEncoder()
    te_ary = te.fit(baskets).transform(baskets)
    df = pd.DataFrame(te_ary, columns=te.columns_)
    frequentItemsets = apriori(df, min_support=0.15, use_colnames=True)
    frequentItemsets['length'] = frequentItemsets['itemsets'].apply(lambda x: len(x))
    frequentItemsets = frequentItemsets.sort_values(by='support', ascending=False)
    frequentItemsets.to_csv("results.csv", encoding='utf-8', index=False)
def create_weather_df():
    attributes = []
    data = []
    data_start = False
    with open("./data/weather.nominal.arff", "r") as f:
        for line in f.readlines():
            line = line.strip()
            line = line.replace("TRUE", "windy")
            line = line.replace("FALSE", "not_windy")
            line = re.sub(r"yes$", "play", line)
            line = re.sub(r"no$", "no_play", line)
            if data_start:
                data.append(line.split(","))
                continue
            if line.startswith("@attribute"):
                attributes.append(line.split(" ")[1])
            if line.startswith("@data"):
                data_start = True

    te = TransactionEncoder()
    te_ary = te.fit(data).transform(data)
    df = pd.DataFrame(te_ary, columns=te.columns_)
    df.to_csv("./data/weather.nominal.csv", index=False)
def convert_dataset(dataset):
    status('Converting dataset to transaction...')
    data = dataset.values.tolist()
    te = TransactionEncoder()
    te_ary = te.fit(data).transform(data)
    df = pd.DataFrame(te_ary, columns=te.columns_)
    return df
def extract(self):
    transactions_for_te = []
    for doc in self.transaction_docs:
        transaction = doc["transaction"]
        transactions_for_te.append(transaction.split())

    te = TransactionEncoder()
    oht_ary = te.fit(transactions_for_te).transform(transactions_for_te, sparse=True)
    sparse_df = pd.DataFrame.sparse.from_spmatrix(oht_ary, columns=te.columns_)

    if self.__settings_for_app_exist():
        minsup = self.__get_minsup_fraction()
    else:
        minsup = 0.1
        self.mongo.save_new_settings(self.application_label,
                                     self.collection_name_prefix, 10, 2)

    df_aspects = apriori(sparse_df, min_support=minsup, use_colnames=True, max_len=1)

    self.aspect_docs = []
    for i in df_aspects.index:
        aspect = ' '.join(list(df_aspects.loc[i, 'itemsets']))
        self.aspect_docs.append({
            'aspect': aspect,
            'support': df_aspects.loc[i, 'support']
        })
def apriori():
    """
    Use the Apriori algorithm to find the frequent itemsets in the data,
    then analyse how strongly the items are associated.
    :return:
    """
    # Imports
    import pandas as pd
    from mlxtend.preprocessing import TransactionEncoder
    from mlxtend.frequent_patterns import apriori
    # Import the association-rule functions
    from mlxtend.frequent_patterns import association_rules

    # Set up the dataset
    data_set = [['牛奶', '洋葱', '肉豆蔻', '芸豆', '鸡蛋', '酸奶'],
                ['莳萝', '洋葱', '肉豆蔻', '芸豆', '鸡蛋', '酸奶'],
                ['牛奶', '苹果', '芸豆', '鸡蛋'],
                ['牛奶', '独角兽', '玉米', '芸豆', '酸奶'],
                ['玉米', '洋葱', '洋葱', '芸豆', '冰淇淋', '鸡蛋']]

    te = TransactionEncoder()
    # One-hot encode the transactions
    te_ary = te.fit(data_set).transform(data_set)
    # print(type(te_ary))
    df = pd.DataFrame(te_ary, columns=te.columns_)
    # Use apriori to find the frequent itemsets
    freq = apriori(df, min_support=0.4, use_colnames=True)
    # Compute the association rules
    result = association_rules(freq, metric="confidence", min_threshold=0.6)
    # Sort; assign the result back, otherwise the sort is discarded
    result = result.sort_values(by='confidence', ascending=False, axis=0)
    print(result)
    result.to_excel("./result.xlsx")
    return None
def eb_set1_association(eb_subset):
    # convert the dataframe to a list
    eb_list = eb_subset.astype(str).values.tolist()
    # encode the list to true or false transaction types;
    # give a name to the encoder for easy access
    TransEncode = TransactionEncoder()
    # encode the list
    TE_arr = TransEncode.fit(eb_list).transform(eb_list)
    # change the list to a dataframe
    eb_df = pd.DataFrame(TE_arr, columns=TransEncode.columns_)
    # define a list of min support values
    sup_list = [0.03, 0.05, 0.08]
    # apply apriori
    for sup in sup_list:
        freq_sets = apriori(eb_df, min_support=sup, use_colnames=True)
        freq_sets['Length'] = freq_sets['itemsets'].apply(lambda x: len(x))
        # get the most frequent itemsets
        if sup == max(sup_list):
            most_freq_sets = freq_sets[(freq_sets['Length'] >= 2) &
                                       (freq_sets['support'] >= 0.09)]
            # print out the result
            print('********************')
            print('Most Frequent Set for Eventbrite Category')
            print(most_freq_sets['itemsets'])
            print('Support of most frequent set is ')
            print(most_freq_sets['support'])
def calc_fpgrowth(df, element, min_support):
    # One-hot encoding
    te = TransactionEncoder()
    te_ary = te.fit(df).transform(df)
    df = pd.DataFrame(te_ary, columns=te.columns_)
    print(df.head())

    # fpgrowth
    print("get frequent set by min support =", min_support / 100)
    frequent_itemsets = fpgrowth(df, min_support=min_support / 100,
                                 use_colnames=True, verbose=1)
    frequent_itemsets['length'] = frequent_itemsets['itemsets'].apply(lambda x: len(x))
    frequent_itemsets['count'] = len(df) * frequent_itemsets['support']
    frequent_itemsets['count'] = frequent_itemsets['count'].apply(np.ceil)
    frequent_itemsets['count'] = frequent_itemsets['count'].astype('int')
    frequent_itemsets.sort_values(by=['support', 'length'], ascending=False, inplace=True)
    print(frequent_itemsets.head())

    # Association rules
    rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.01)
    rules['total_set'] = [
        frozenset.union(*X) for X in rules[['antecedents', 'consequents']].values
    ]
    # rules = rules[rules["consequents"] == frozenset(element)]
    rules = rules[~rules['consequents'].apply(lambda x: x.isdisjoint(frozenset(element)))]
    # rules = rules[~rules["antecedents"].apply(lambda x: x.isdisjoint(frozenset(keyword)))]
    rules.sort_values(by=['confidence', 'antecedent support'], ascending=False, inplace=True)

    # Support = (number of co-occurrences) / (total number of transactions)
    rules['count'] = len(df) * rules['support']
    rules['support'] = 100 * rules['support']
    rules['confidence'] = 100 * rules['confidence']
    rules['count'] = rules['count'].apply(np.ceil)
    rules['count'] = rules['count'].astype('int')
    rules['support'] = rules['support'].round(2)
    rules = rules.loc[:, ['antecedents', 'consequents', 'support', 'count',
                          'confidence', 'total_set']]
    # Korean labels: drug code (antecedent), drug code (consequent),
    # support (%), frequency, confidence (%)
    rules.columns = ['연관약품코드(전)', '연관약품코드(후)', '지지도(%)', '출현빈도',
                     '연관도(%)', 'total_set']

    frequent_itemsets["total_set"] = frequent_itemsets["itemsets"]
    frequent_itemsets['support'] = frequent_itemsets['support'] * 100
    frequent_itemsets['support'] = frequent_itemsets['support'].round(2)
    frequent_itemsets = frequent_itemsets.loc[:, ['itemsets', 'support', 'count',
                                                  'length', 'total_set']]
    # Korean labels: itemset, support (%), frequency, item count
    frequent_itemsets.columns = ['출현집합', '지지도(%)', '출현빈도', '품목개수', 'total_set']

    # reset_index returns a new frame, so assign the result back
    frequent_itemsets = frequent_itemsets.reset_index(drop=True)
    rules = rules.reset_index(drop=True)
    return frequent_itemsets, rules
def createFreqItems(data):
    te = TransactionEncoder()
    te_ary = te.fit(data).transform(data)
    df = pd.DataFrame(te_ary, columns=te.columns_)
    frequent_itemsets = fpgrowth(df, min_support=0.001, use_colnames=True, max_len=5)
    frequent_itemsets['length'] = frequent_itemsets['itemsets'].apply(lambda x: len(x))
    return frequent_itemsets
def convert_to_matrix(db):
    transactions = []
    for i, row in db.iterrows():
        transactions.append(row["itemsets"])
    te = TransactionEncoder()
    te_ary = te.fit(transactions).transform(transactions)
    basket_sets = pd.DataFrame(te_ary, columns=te.columns_)
    return basket_sets
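# A minimal usage sketch for convert_to_matrix above. The sample data below is
# illustrative only (not from the source); it assumes the input DataFrame keeps
# one transaction per row in its "itemsets" column.
import pandas as pd

demo_db = pd.DataFrame({"itemsets": [["bread", "milk"],
                                     ["bread", "eggs"],
                                     ["milk", "eggs"]]})
demo_matrix = convert_to_matrix(demo_db)
print(demo_matrix)  # boolean one-hot matrix with one column per distinct item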
def get_dataset(self, ratings):
    transactions = [[movie_id for _, movie_id, _ in movies_ids]
                    for user_id, movies_ids in groupby(ratings, key=itemgetter(0))]
    transaction_encoder = TransactionEncoder()
    one_hot = transaction_encoder.fit(transactions).transform(transactions)
    return pd.DataFrame(one_hot, columns=transaction_encoder.columns_)
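# Illustrative sketch of the input shape get_dataset above appears to expect:
# (user_id, movie_id, rating) tuples already sorted by user_id, since
# itertools.groupby only groups adjacent records. The sample tuples are
# hypothetical, not taken from the source.
from itertools import groupby
from operator import itemgetter

sample_ratings = [
    (1, "Alien", 5), (1, "Blade Runner", 4),   # user 1
    (2, "Alien", 3), (2, "Heat", 5),           # user 2
]
# Each user becomes one transaction: [['Alien', 'Blade Runner'], ['Alien', 'Heat']]
transactions = [[movie for _, movie, _ in group]
                for _, group in groupby(sample_ratings, key=itemgetter(0))]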
def return_support(self):
    print('Calculating support values...')
    te = TransactionEncoder()
    te_array = te.fit(self.list_sku).transform(self.list_sku)
    df_list = pd.DataFrame(te_array, columns=te.columns_)
    support = apriori(df_list, min_support=0.05, use_colnames=True)
    support = support.sort_values(['support'], ascending=False)
    support['length'] = support.apply(lambda x: len(x['itemsets']), axis=1)
    return support
def applyApriori(dataset, support):
    rec = dataset.values.tolist()
    te = TransactionEncoder()
    te_ary = te.fit(rec).transform(rec)
    df = pd.DataFrame(te_ary, columns=te.columns_)

    from mlxtend.frequent_patterns import apriori
    freq_Itemsets = apriori(df, min_support=support, use_colnames=True)
    freq_Itemsets['length'] = freq_Itemsets['itemsets'].apply(lambda x: len(x))
    # Return only itemsets with more than one item
    return freq_Itemsets[freq_Itemsets['length'] > 1]
def get_frequent_set(total_i_list, appointed_output, my_refrigerator):
    def set_giver(sort_set_list: list):
        recommended_fequent_set = []
        for r_num in range(1, 5):
            recommended_name = []
            for num, i in enumerate(sort_set_list):
                if r_num == 1:
                    # The checks are written inverted: anything that fails hits continue
                    if not ((appointed_output.issubset(i)) and (i.issubset(my_refrigerator))):
                        continue
                elif r_num == 2:
                    if not ((appointed_output.issubset(i)) and
                            (round(sort_set.iloc[num, 0], 3) > 0.3)):
                        continue
                elif r_num == 3:
                    if not i.issubset(my_refrigerator):
                        continue
                else:
                    pass
                # Only sets that pass the checks above get appended
                '''
                i: the associated itemset
                i & my_refrigerator - appointed_output: ingredients in the set, beyond the input, that are in the fridge
                i - my_refrigerator: items in the set the user does not have
                appointed_output - i: input ingredients the set does not cover
                '''
                set_info = [sort_set.index[num], round(sort_set.iloc[num, 0], 3), i,
                            i & my_refrigerator - appointed_output,
                            i - my_refrigerator, appointed_output - i, r_num]
                recommended_fequent_set.append(set_info)
                recommended_name.append(i)
                # Return as soon as 5 have been collected
                if len(recommended_fequent_set) == 5:
                    return recommended_fequent_set
            for selected_set in recommended_name:
                sort_set_list.remove(selected_set)
        # Return however many results were found
        return recommended_fequent_set

    te = TransactionEncoder()
    te_ary = te.fit(total_i_list).transform(total_i_list)
    df = pd.DataFrame(te_ary, columns=te.columns_)
    frequent_itemsets = apriori(df, min_support=0.1, use_colnames=True)
    if frequent_itemsets["itemsets"].count() < 2:
        frequent_itemsets = apriori(df, min_support=1 / (df.count()[0] - 1),
                                    use_colnames=True)
    # print(f"{frequent_itemsets.count()}")
    frequent_itemsets['length'] = frequent_itemsets['itemsets'].apply(lambda x: len(x))
    output_set = frequent_itemsets[frequent_itemsets['length'] >= len(appointed_output) + 1]
    # print(output_set.count())
    sort_set = output_set.sort_values(['support'], ascending=False)
    recommended_fequent_set = set_giver(sort_set['itemsets'].tolist())
    return recommended_fequent_set
def get_df_items(itemsets):
    '''Creates a one-hot encoded dataframe of the given itemsets'''
    transaction_encoder = TransactionEncoder()
    transaction_encoded_ary = transaction_encoder.fit(itemsets).transform(itemsets)
    # Dataframe
    df = pd.DataFrame(transaction_encoded_ary, columns=transaction_encoder.columns_)
    return df
def apply_apriori(file, mins):
    df = read(file)
    te = TransactionEncoder()
    te_ary = te.fit(df).transform(df)
    df = pd.DataFrame(te_ary, columns=te.columns_)
    # Use the caller-supplied minimum support instead of a hard-coded 0.2
    x = apriori(df, min_support=mins, use_colnames=True)
    print(x['itemsets'])
def solution():
    data = pd.read_csv('res/Retail.csv')
    # print(data.head(10))
    print('Total data shape:', data.shape)
    print('Unscanned Items shape:', data[data['Dept'] == '0999:UNSCANNED ITEMS'].shape)
    data.drop(data.loc[data['Dept'] == '0999:UNSCANNED ITEMS'].index, inplace=True)
    print('Data shape after dropping unscanned items:', data.shape)

    res1_candy = data[data['Dept'] == '0973:CANDY'].shape[0]
    print("number of times '0973:CANDY' sold:", res1_candy)

    # df = data.groupby('POS Txn')
    # print(dataset.head())
    transaction_list = []
    for i in data['POS Txn'].unique():
        tlist = list(set(data[data['POS Txn'] == i]['Dept']))
        if len(tlist) > 0:
            transaction_list.append(tlist)

    te = TransactionEncoder()
    te_ary = te.fit(transaction_list).transform(transaction_list)
    df2 = pd.DataFrame(te_ary, columns=te.columns_)
    frequent_itemsets = apriori(df2, min_support=0.02, use_colnames=True)
    rules = association_rules(frequent_itemsets, metric='lift', min_threshold=2)

    sup_df = rules.sort_values('support', ascending=False).reset_index()
    res2_maxsupport = round(sup_df['support'][0], 5)
    # print(sup_df.iloc[:5, :6])
    # print(sup_df.iloc[:-5, :6])
    print(res2_maxsupport)

    print('Rules shape:', rules.shape)
    res3_totrules = rules.shape[0]
    print(res3_totrules)

    fildf = rules[(rules['lift'] >= 3) & (rules['confidence'] >= 0.1)]
    print('Filtered Rules shape:', fildf.shape)
    res4_filrules = fildf.shape[0]
    print(res4_filrules)
    # print(rules)

    # Creating a list of the answers
    result = [res1_candy, res2_maxsupport, res3_totrules, res4_filrules]
    print('Final Result:', result)
    # NOTE: Here 100, 0.54321, 40, 20 are the answers to the 1st, 2nd, 3rd and 4th
    # questions respectively. Change them accordingly.
    # Finally create a dataframe of the final output and write it to output.csv
    result = pd.DataFrame(result)
    # writing output to output.csv
    result.to_csv('output/output.csv', header=False, index=False)
def question_1():
    # (a)
    print("----Part a----")
    df = pd.read_csv('Groceries.csv')
    # count unique items for each customer
    data = df.groupby(['Customer'])['Item'].count()
    print(len(data))
    # histogram
    plt.hist(data)
    plt.xlabel('Unique Items')
    plt.ylabel('Count')
    plt.title("Histogram of unique items")
    plt.show()
    # get the 25th, 50th and 75th percentiles
    quarts = np.percentile(data, [25, 50, 75])
    print(f"25%: {quarts[0]}, 50%: {quarts[1]}, 75%: {quarts[2]}")

    # (b)
    print("----Part b----")
    # group by customer and turn the items into a list of lists
    data_items = df.groupby(['Customer'])['Item'].apply(list).values.tolist()
    # Apriori algorithm
    te = TransactionEncoder()
    te_ary = te.fit(data_items).transform(data_items)
    item_indicators = pd.DataFrame(te_ary, columns=te.columns_)
    frequent_item_sets = apriori(item_indicators,
                                 min_support=75 / len(data_items),
                                 use_colnames=True,
                                 max_len=None)  # how to determine max_len?
    total_item_sets = len(frequent_item_sets)
    print(f"{total_item_sets} itemsets")
    largest_k = len(frequent_item_sets['itemsets'][total_item_sets - 1])
    print(f"Largest k: {largest_k}")

    # (c)
    print("----Part c----")
    ass_rules = association_rules(frequent_item_sets, metric="confidence",
                                  min_threshold=0.01)
    print(f"{len(ass_rules)} Association rules")

    # (d)
    print("----Part d----")
    plt.scatter(ass_rules['confidence'], ass_rules['support'],
                c=ass_rules['lift'], s=ass_rules['lift'])
    plt.xlabel("Confidence")
    plt.ylabel("Support")
    plt.title("Support vs Confidence")
    color_bar = plt.colorbar()
    color_bar.set_label("Lift")
    plt.show()
    print("Just a graph for this part")

    # (e)
    print("----Part e----")
    ass_rules_e = association_rules(frequent_item_sets, metric="confidence",
                                    min_threshold=0.6)
    print(ass_rules_e.to_string())
def associationRule(df):
    """
    :param df: Input the Gross dataframe - part2cleanedGrosses.csv
    :return: Print out and save the itemsets for three different support values
             and calculate the confidence.
    """
    # Generating the itemsets
    df['week_ending'] = pd.to_datetime(df['week_ending']).dt.strftime('%Y-%m-%d')
    gross_date = [x for x in df['week_ending'].unique() if int(x[0:4]) > 2000]
    all_date = df['week_ending'].unique()

    # Note: the capacity comparison needs its own parentheses, otherwise '&'
    # binds before '>=' and the filter is wrong.
    itemset = []
    for i in all_date:
        temp = list(df.loc[(df['week_ending'] == i) &
                           (df['percent_of_cap'] >= 0.8), 'show'])
        itemset.append(temp)
    itemset2 = []
    for i in gross_date:
        temp = list(df.loc[(df['week_ending'] == i) &
                           (df['percent_of_cap'] >= 0.8), 'show'])
        itemset2.append(temp)

    # Perform the Apriori algorithm
    for j in range(0, 2):
        temp = [itemset, itemset2][j]
        te = TransactionEncoder()
        te_ary = te.fit(temp).transform(temp)
        temp_df = pd.DataFrame(te_ary, columns=te.columns_)
        value = [0.4, 0.6, 0.8]  # support values
        confidf = pd.DataFrame()
        supportdf = pd.DataFrame()
        for i in value:
            frequent_itemsets = apriori(temp_df, min_support=i, use_colnames=True)
            confi = association_rules(frequent_itemsets, metric="confidence",
                                      min_threshold=0.7)
            frequent_itemsets['support_val'] = i
            confi['support_val'] = i
            supportdf = supportdf.append(frequent_itemsets)
            confidf = confidf.append(confi.iloc[:, [0, 1, 5, -1]])
            print('#################### Support =', i, '####################')
            pprint.pprint(frequent_itemsets)
            print('####### Calculating the Confidence #######')
            print(confi.iloc[:, [0, 1, 5, -1]])
        supportdf.to_csv('Itemset_support' + str(j) + '.csv', index=False)
        confidf.to_csv('Itemset_confidence' + str(j) + '.csv', index=False)
def supportCount(item_sets):
    te = TransactionEncoder()
    te_ary = te.fit(item_sets).transform(item_sets)
    te_ary = te_ary.astype("int")
    df = pd.DataFrame(te_ary, columns=te.columns_)
    frequent_itemsets = apriori(df, min_support=0.01, use_colnames=True)
    print('{Itemsets} ----> support')
    for index, row in frequent_itemsets.iterrows():
        print(set(row['itemsets']), '----->', round(row['support'], 3))
def getBooleanDF(property_list):
    """
    Transform the nested list into a boolean dataframe with transactions on rows
    and items on columns
    :param property_list: The nested list with the wikidata properties
    :return: A boolean dataframe
    """
    te = TransactionEncoder()
    te_ary = te.fit(property_list).transform(property_list)
    boolean_dataframe = pd.DataFrame(te_ary, columns=te.columns_)
    return boolean_dataframe
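# A minimal usage sketch for getBooleanDF above. The property IDs are
# illustrative Wikidata-style placeholders, not taken from the source.
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder

sample_properties = [
    ["P31", "P569", "P21"],   # entity 1
    ["P31", "P21"],           # entity 2
    ["P31", "P106"],          # entity 3
]
boolean_df = getBooleanDF(sample_properties)
print(boolean_df)  # one boolean column per property, one row per entity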
def applyAprioriTopic(self, support: float) -> pd.DataFrame:
    processor = TransactionEncoder()
    binary = processor.fit(self.rawTopic).transform(self.rawTopic)
    frequent = apriori(pd.DataFrame(binary, columns=processor.columns_),
                       min_support=support,
                       use_colnames=True,
                       low_memory=True)
    return association_rules(frequent, metric='confidence', min_threshold=0.8)
def unitfiy_sample_dataset(self):
    start = time.perf_counter()
    print("Starting further reduction of the sample dataset")
    te = TransactionEncoder()  # define the encoder
    te_ary = te.fit(self.sampleList).transform(self.sampleList)
    self.sample_df = pd.DataFrame(te_ary, columns=te.columns_)
    elapsed = time.perf_counter() - start
    print("Time used:", elapsed)
    print("Sample dataset reduction finished")
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

# In[2]:

fin = open("T10I4D100K.txt", "r")
dataset = [[int(n) for n in line.split()] for line in fin]

# In[3]:

te = TransactionEncoder()
te_ary = te.fit(dataset).transform(dataset, sparse=True)
# pd.SparseDataFrame has been removed from recent pandas releases; build a
# sparse-backed DataFrame from the scipy sparse matrix instead.
sparse_df = pd.DataFrame.sparse.from_spmatrix(te_ary, columns=te.columns_)
sparse_df

# In[4]:

frequent_itemsets5 = apriori(sparse_df, min_support=0.5, use_colnames=True)
frequent_itemsets5

# In[5]:

frequent_itemsets1 = apriori(sparse_df, min_support=0.1, use_colnames=True)
def test_fit():
    oht = TransactionEncoder()
    oht.fit(dataset)
    assert oht.columns_ == ['Apple', 'Bananas', 'Beer', 'Chicken', 'Milk', 'Rice']
def test_transform():
    oht = TransactionEncoder()
    oht.fit(dataset)
    trans = oht.transform(dataset)
    np.testing.assert_array_equal(expect, trans)
def test_transform_sparse():
    oht = TransactionEncoder()
    oht.fit(dataset)
    trans = oht.transform(dataset, sparse=True)
    assert isinstance(trans, csr_matrix)
    np.testing.assert_array_equal(expect, trans.todense())
def test_inverse_transform():
    oht = TransactionEncoder()
    oht.fit(dataset)
    np.testing.assert_array_equal(np.array(data_sorted),
                                  np.array(oht.inverse_transform(expect)))
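# The tests above rely on module-level fixtures (dataset, expect, data_sorted)
# that are not shown in this excerpt. A plausible sketch of fixtures consistent
# with the column order asserted in test_fit might look like this:
import numpy as np

dataset = [['Apple', 'Beer', 'Rice', 'Chicken'],
           ['Apple', 'Beer', 'Rice'],
           ['Apple', 'Beer'],
           ['Apple', 'Bananas'],
           ['Milk', 'Beer', 'Rice', 'Chicken'],
           ['Milk', 'Beer', 'Rice'],
           ['Milk', 'Beer'],
           ['Apple', 'Bananas']]

# One boolean row per transaction, columns in sorted item order:
# ['Apple', 'Bananas', 'Beer', 'Chicken', 'Milk', 'Rice']
expect = np.array([[True, False, True, True, False, True],
                   [True, False, True, False, False, True],
                   [True, False, True, False, False, False],
                   [True, True, False, False, False, False],
                   [False, False, True, True, True, True],
                   [False, False, True, False, True, True],
                   [False, False, True, False, True, False],
                   [True, True, False, False, False, False]])

# Each transaction with its items sorted, matching what inverse_transform returns.
data_sorted = [sorted(transaction) for transaction in dataset]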