def gen_rules_handler(event, context):
    # Get the list of all possible request types
    types = requests.get(REQUEST_TYPES_ENDPOINT)
    # Convert the response into a list
    types_text = types.text.replace("[", "").replace("]", "").replace("\"", "")
    types = list(types_text.split(","))

    client = boto3.client('s3',
                          aws_access_key_id=AWS_ACCESS_KEY_ID,
                          aws_secret_access_key=AWS_SECRET_ACCESS_KEY)

    # Build association rules for each type and save them in S3
    for type in types:
        print("Generating association rules for type = " + type)
        # Get all records for the given type
        url = BACKED_ENDPOINT + '/' + type
        content = requests.get(url)
        content_json = json.loads(content.text)
        transactions = [tuple(i.split(',')) for i in content_json["fields"]]
        # Build rules with the given min_support and min_confidence
        itemsets, rules = efficient_apriori.apriori(
            transactions, min_support=MIN_SUPPORT, min_confidence=MIN_CONFIDENCE)
        # Save rules in a local file
        filename = type + '.pkl'
        with open('/tmp/' + filename, 'wb') as file:
            pickle.dump(rules, file)
        # Upload the rule file to S3
        client.upload_file('/tmp/' + filename, S3_BUCKET_NAME, filename)

    # Generate rules for all types combined
    print("Generating association rules for all types")
    url = BACKED_ENDPOINT
    content = requests.get(url)
    content_json = json.loads(content.text)
    transactions = [tuple(i.split(',')) for i in content_json["fields"]]
    # Build rules with the given min_support and min_confidence
    itemsets, rules = efficient_apriori.apriori(
        transactions, min_support=MIN_SUPPORT, min_confidence=MIN_CONFIDENCE)
    # Save rules in a local file
    filename = 'rules.pkl'
    with open('/tmp/' + filename, 'wb') as file:
        pickle.dump(rules, file)
    # Upload the rule file to S3
    client.upload_file('/tmp/' + filename, S3_BUCKET_NAME, filename)

    # Return success
    return {'statusCode': 200, 'body': json.dumps('Rules created/updated')}
def apriori_one(transaction, **kwargs):
    if len(kwargs) == 0:
        itemsets, rules = apriori(transaction, min_support=0.1, min_confidence=0.5)
    else:
        itemsets, rules = apriori(transaction,
                                  min_support=float(kwargs['support']),
                                  min_confidence=float(kwargs['confidence']))
    print("Frequent itemsets:", itemsets)
    print("-" * 30)
    print("Association rules:", rules)
def rule2():
    from mlxtend.frequent_patterns import apriori
    from mlxtend.frequent_patterns import association_rules

    data = dataset.stack()  # Reshape the table into a Series
    # print(data)
    a = data.reset_index()  # Reset the index
    print(a)
    # Pivot on the original order number and item name
    b = a.groupby(['level_0', 0])[0].count().unstack().fillna(0)
    # print(b)
    itemsets = b.applymap(encode_units)  # Convert counts into 0/1 values
    print(itemsets)
    frequent_itemsets = apriori(itemsets, min_support=0.02, use_colnames=True)
    rules = association_rules(frequent_itemsets, metric="lift", min_threshold=0.5)
    print(frequent_itemsets.sort_values(by="support", ascending=False))
    print(rules.sort_values(by="lift", ascending=False))
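# Several rule2 variants in this collection call an encode_units helper that
# is not shown in the snippets. A minimal sketch of what it is assumed to do,
# namely thresholding raw purchase counts into the 0/1 values mlxtend's
# apriori expects (the exact original helper may differ):

def encode_units(x):
    # Hypothetical helper: any positive count becomes 1, everything else 0.
    return 1 if x >= 1 else 0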
def rule1():
    from efficient_apriori import apriori
    start = time.time()
    # Build a one-dimensional Series orders_series, indexed by customer ID
    # ('客户ID') with the product name ('产品名称') as the value
    orders_series = data.set_index('客户ID')['产品名称']
    # Convert the dataset into a list of transactions
    transactions = []
    temp_index = 0
    for i, v in orders_series.items():
        if i != temp_index:
            temp_set = set()
            temp_index = i
            temp_set.add(v)
            transactions.append(temp_set)
        else:
            temp_set.add(v)
    # print(temp_index)
    # print(transactions)
    # Mine frequent itemsets and association rules
    itemsets, rules = apriori(transactions, min_support=0.03, min_confidence=0.5)
    print('Frequent itemsets:', itemsets)
    print('Association rules:', rules)
    end = time.time()
    print("Elapsed time:", end - start)
def rule1():
    from efficient_apriori import apriori
    start = time.time()
    # Build a one-dimensional Series orders_series, indexed by Transaction
    # with Item as the value
    orders_series = data.set_index('Transaction')['Item']
    print("orders_series:", orders_series)
    # Convert the dataset into a list of transactions
    transactions = []
    temp_index = 0
    for i, v in orders_series.items():
        if i != temp_index:
            temp_set = set()
            temp_index = i
            temp_set.add(v)
            transactions.append(temp_set)
        else:
            temp_set.add(v)
    # Mine frequent itemsets and association rules
    itemsets, rules = apriori(transactions, min_support=0.01, min_confidence=0.5)
    print('Frequent itemsets:', itemsets)
    print('Association rules:', rules)
    end = time.time()
    print("Elapsed time:", end - start)
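# The manual index-walk used by the rule1 variants above can be written more
# directly with a pandas groupby. A minimal sketch, assuming a DataFrame
# `data` with 'Transaction' and 'Item' columns as in the previous snippet:

def build_transactions(data):
    # Group items by transaction id and collect each group into a set,
    # producing the list-of-sets input that efficient_apriori accepts.
    return data.groupby('Transaction')['Item'].apply(set).tolist()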
def rule2():
    from mlxtend.frequent_patterns import apriori
    from mlxtend.frequent_patterns import association_rules
    pd.options.display.max_columns = 100
    start = time.time()
    # Convert the data: join each row's items into one '|'-separated string
    data_test = pd.DataFrame({'items': []})
    for i in range(len(data)):
        test = str()
        test_dict = dict()
        for t in range(len(data.iloc[i])):
            if t == 0:
                test = data.iloc[i][t]
            else:
                if data.iloc[i][t] == '':
                    continue
                else:
                    test = test + '|' + data.iloc[i][t]
        test_dict['items'] = test
        data_test = data_test.append(test_dict, ignore_index=True)
    hot_encoded_data = data_test['items'].str.get_dummies('|')
    hot_encoded_data = hot_encoded_data.applymap(encode_units)
    # Mine frequent itemsets and association rules
    frequent_itemsets = apriori(hot_encoded_data, min_support=0.01, use_colnames=True)
    rules = association_rules(frequent_itemsets, metric="lift", min_threshold=0.5)
    print("Frequent itemsets:", frequent_itemsets)
    print("Association rules:", rules[(rules['lift'] >= 1) & (rules['confidence'] >= 0.5)])
    # print(rules['confidence'])
    end = time.time()
    print("Elapsed time:", end - start)
def rule2(s2):
    from mlxtend.frequent_patterns import apriori
    from mlxtend.frequent_patterns import association_rules
    # Time the run
    start = time.time()
    # Convert the data format:
    # 1) join every row into a single column, '-'-separated
    data['total'] = data[data.columns[:]].apply(lambda x: '-'.join(x.dropna()), axis=1)
    # 2) build a one-hot encoding with get_dummies
    data_ = data.drop(data.columns[:21], axis=1).join(data.total.str.get_dummies(sep='-'))
    # Mine frequent itemsets
    frequent_itemsets = apriori(data_, min_support=s2, use_colnames=True)
    # Sort by support, descending
    frequent_itemsets = frequent_itemsets.sort_values(by='support', ascending=False)
    print("Frequent itemsets:", frequent_itemsets)
    # Derive association rules, using lift as the metric
    rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1)
    # Sort by lift, descending
    rules = rules.sort_values(by='lift', ascending=False)
    print("Association rules:", rules)
    end = time.time()
    print("Elapsed time 2:", end - start)
def simpleTest():
    transactions = [(1, 3, 4), (2, 3, 5), (1, 2, 3, 5), (2, 5)]
    itemsets, rules = apriori(transactions, min_support=0.5, min_confidence=0.5)
    print(itemsets)
    print(rules)
def rule2():
    from mlxtend.frequent_patterns import apriori
    from mlxtend.frequent_patterns import association_rules
    pd.options.display.max_columns = 100
    start = time.time()
    # One-hot encode the data
    temp_df = train.groupby(
        ['客户ID', '产品名称'])['产品名称'].count().unstack().reset_index().fillna(0)
    temp_df['ID'] = range(len(temp_df))
    hot_encoded_df = temp_df.drop(['客户ID'], axis=1).set_index('ID')
    # print(hot_encoded_df.head())
    # Threshold the counts into 0/1 values
    hot_encoded_df = hot_encoded_df.applymap(encode_units)
    # Mine frequent itemsets and association rules
    frequent_itemsets = apriori(hot_encoded_df, min_support=0.02, use_colnames=True)
    rules = association_rules(frequent_itemsets, metric="lift", min_threshold=0.5)
    print("Frequent itemsets:", frequent_itemsets)
    print("Association rules:", rules[(rules['lift'] >= 1) & (rules['confidence'] >= 0.5)])
    # print(rules['confidence'])
    end = time.time()
    print("Elapsed time:", end - start)
def findFeature(self):
    reviewHeadline = self.stringDataProcess(self.data['review_body'])
    # print(reviewHeadline)
    # print(reviewHeadline.values.tolist())
    itemsets, rules = apriori(reviewHeadline, min_support=0.02,
                              min_confidence=0.5, max_length=10)
    # print(itemsets)
    # print(rules)
    # f = open(self.filename, 'w')
    # pickle.dump(rules, f, 0)
    # with open(filename, 'w') as file_obj:
    #     for i in itemsets:
    #         tmp = itemsets[i]
    #         print(tmp)
    #         tmp = json.dumps({str(k): tmp[k] for k in tmp})
    #         json.dump(str(itemsets), file_obj)
    #         break
    # def test(self):
    #     rules = pickle.load(self.filename)
    # Write out any rule that mentions " microwave" on either side
    with open(self.filename, 'w') as f:
        for rule in rules:
            l = list(rule.lhs)
            r = list(rule.rhs)
            if (" microwave" in l) or (" microwave" in r):
                f.write(','.join(l) + " -> " + ','.join(r) + '\n')
                print(rule)
def rule1():
    from efficient_apriori import apriori
    start = time.time()
    # Build a one-dimensional Series orders_series, indexed by FaultNo.
    # with Alarm Msg. as the value
    orders_series = data.set_index('FaultNo.')['Alarm Msg.']
    # Convert the dataset into a list of transactions
    transactions = []
    temp_index = 0
    for i, v in orders_series.items():
        if i != temp_index:
            temp_set = set()
            temp_index = i
            temp_set.add(v)
            transactions.append(temp_set)
        else:
            temp_set.add(v)
    # Mine frequent itemsets and association rules
    itemsets, rules = apriori(transactions, min_support=0.01, min_confidence=0.2)
    itemsets = pd.DataFrame(itemsets)
    rules = pd.DataFrame(rules)
    print('Frequent itemsets:', itemsets)
    print('Association rules:', rules)
    itemsets.to_excel('E:\\DATA_ENGIN\\apiority\\' + F + 'frequent_itemsets.xlsx')
    rules.to_excel('E:\\DATA_ENGIN\\apiority\\' + F + 'association_rules.xlsx')
    end = time.time()
    print("Elapsed time this run:", end - start)
def alg_apriori(self):
    print('starting analysis apriori')
    itemsets, rules = apriori(self.__prep_list_of_tuples(),
                              min_support=self.min_supp,
                              min_confidence=self.min_conf)
    if self.mode == 0:
        return rules
    elif self.mode == 1:
        return itemsets
    elif self.mode == 2:
        # Keep only rules with the requested antecedent/consequent lengths
        rules_rhs = filter(
            lambda rule: len(rule.lhs) == self.rule_lhs and len(rule.rhs) == self.rule_rhs,
            rules)
        cols = ['id1', 'id2', 'supp', 'conf', 'lift']
        lst_data = list()
        for rule in sorted(rules_rhs, key=lambda rule: getattr(rule, self.sort_by)):
            lst_data.append([rule.lhs, rule.rhs, rule.support,
                             rule.confidence, rule.lift])
        return pd.DataFrame(lst_data, columns=cols)
    else:
        print('Invalid mode value')
def perform_apriori(final_df):
    records = []
    for i in range(final_df.shape[0]):
        records.append([str(final_df.values[i, j]) for j in range(final_df.shape[1])])
    association_rules = apriori(records, min_support=0.005,
                                min_confidence=0.2, min_length=2)
    association_results = list(association_rules)
    return association_results
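# This snippet appears to use the apyori package (min_length is an apyori
# parameter, not an efficient_apriori one). A minimal sketch of how its
# results might be printed, assuming apyori's RelationRecord structure; the
# function name here is illustrative:

def print_apriori_results(association_results):
    for record in association_results:
        # Each RelationRecord carries the itemset, its support, and one or
        # more ordered statistics with confidence and lift.
        for stat in record.ordered_statistics:
            print(set(stat.items_base), "->", set(stat.items_add),
                  "support:", record.support,
                  "confidence:", stat.confidence,
                  "lift:", stat.lift)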
def rule2():
    from mlxtend.frequent_patterns import apriori
    from mlxtend.frequent_patterns import association_rules
    pd.options.display.max_columns = 100
    start = time.time()
    transactions = []
    for i in range(0, dataset.shape[0]):
        temp = ""
        for j in range(0, 20):
            if str(dataset.values[i, j]) != 'nan':
                temp += str(dataset.values[i, j]) + "|"
        transactions.append(temp)
    dataset_new = pd.DataFrame(data=transactions)
    dataset_new.columns = ["Market_Basket"]
    # One-hot encode the data
    dataset_new_hot_encoded = dataset_new.drop("Market_Basket", axis=1).join(
        dataset_new.Market_Basket.str.get_dummies("|"))
    dataset_new_hot_encoded = dataset_new_hot_encoded.dropna(axis=1)
    frequent_itemsets = apriori(dataset_new_hot_encoded, min_support=0.05, use_colnames=True)
    frequent_itemsets = frequent_itemsets.sort_values(by="support", ascending=False)
    rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1)
    rules = rules.sort_values(by="lift", ascending=False)
    print("Frequent itemsets:", frequent_itemsets)
    print("Association rules:", rules)
    end = time.time()
    print("Elapsed time:", end - start)
def efficient_apr(transactions):
    # Mine frequent itemsets and association rules with efficient_apriori
    itemsets, rules = apriori(transactions, min_support=0.02, min_confidence=0.4)
    print("Frequent itemsets:", itemsets)
    print("Association rules:", rules)
def run(item_set, min_sup, min_conf):
    start = time.time()
    freqItemSet, rules = apriori(item_set, min_support=min_sup, min_confidence=min_conf)
    end = time.time()
    execution_time = end - start
    return freqItemSet, rules, execution_time
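# A minimal usage sketch for run(), reusing the toy transactions from
# simpleTest above (the variable names here are illustrative):

transactions = [(1, 3, 4), (2, 3, 5), (1, 2, 3, 5), (2, 5)]
itemsets, rules, elapsed = run(transactions, min_sup=0.5, min_conf=0.5)
print(itemsets, rules, f"took {elapsed:.4f}s")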
def rule1():
    start = time.time()
    from efficient_apriori import apriori
    itemsets, rules = apriori(data1, min_support=0.05, min_confidence=0.4)
    print("Frequent itemsets:", itemsets)
    print("Association rules:", rules)
    end = time.time()
    print("Elapsed time:", end - start)
def extract_association_with_apriori(transactions, min_sup, min_conf):
    transaction_list = []
    for items in transactions.values():
        transaction_list.append(tuple(items))
    itemsets, rules = apriori(transaction_list, min_support=min_sup, min_confidence=min_conf)
    return itemsets, rules
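# extract_association_with_apriori expects a dict of transactions (id ->
# items). A minimal usage sketch with made-up basket data:

baskets = {
    't1': ['milk', 'bread'],
    't2': ['milk', 'eggs'],
    't3': ['milk', 'bread', 'eggs'],
}
itemsets, rules = extract_association_with_apriori(baskets, min_sup=0.5, min_conf=0.6)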
def eff_apriori():
    """Mine frequent itemsets and association rules with efficient_apriori."""
    from efficient_apriori import apriori
    itemsets, rules = apriori(transactions, min_support=0.02, min_confidence=0.4)
    print("Frequent itemsets:", itemsets)
    print("Association rules:", rules)
def apriorize(transactions):
    # Finds rules based on a transaction list
    itemsets, rules = apriori(transactions, min_support=0.5, min_confidence=1)
    print('Rules:')
    for r in rules:
        print(r)
    print('Itemsets:')
    for i in itemsets:
        print(i)
def main():
    dataset = pd.read_csv('./Market_Basket_Optimisation.csv', header=None)
    transactions = []
    for myrow in dataset.values:
        # Drop missing values from each row before treating it as a transaction
        transactions.append(list(filter(pd.notna, myrow)))
    itemsets, rules = apriori(transactions, min_support=0.05, min_confidence=0.25)
    print('Frequent itemsets:', itemsets)
    print('Association rules:', rules)
def efficient_apriori(transaction, min_support, min_confidence):
    itemsets, rules = apriori(transaction, min_support=min_support, min_confidence=min_confidence)
    print("-" * 60)
    print('efficient_apriori:')
    print("Frequent itemsets:", itemsets)
    print("Association rules:", rules)
    print("-" * 60)
def calculate(self):
    transactions = self.get_events()[:8]
    itemsets, rules = apriori(transactions, min_support=0.35, min_confidence=1)
    bestFits = {}
    for rule in rules:
        bestFits[self.name_to_id(rule.lhs[0])] = [self.name_to_id(r) for r in rule.rhs]
    return bestFits
def show_apriori_table(min_support, sentence):
    table = PrettyTable()
    # Column order must match the add_row order below
    table.field_names = ['word', 'support', 'confidence', 'lift']
    item, rules = apriori(sentence, min_support=min_support)
    rules_rhs = filter(lambda rule: len(rule.lhs) == 2 and len(rule.rhs) == 1, rules)
    for rule in sorted(rules_rhs, key=lambda rule: rule.lift):
        table.add_row([rule.lhs, rule.support, rule.confidence, rule.lift])
    print(table)
def use_mlextend2(hot_encode):
    from mlxtend.frequent_patterns import apriori
    from mlxtend.frequent_patterns import association_rules
    frequent_itemsets = apriori(hot_encode, min_support=0.05, use_colnames=True)
    frequent_itemsets = frequent_itemsets.sort_values(by="support", ascending=False)
    rules = association_rules(frequent_itemsets, metric="lift", min_threshold=0.5)
    rules = rules.sort_values(by="lift", ascending=False)
    print('-' * 20, 'Association rules', '-' * 20)
    print(rules)
    rules.to_csv('association_rules2.csv')
def get_itemsets_for_chunk(chunk, min_support=MIN_SUPPORT):
    """Reads a chunk and creates a list of frozensets."""
    chunk_data = [line.strip(" \n") for line in chunk]
    transactions = [frozenset(ls.split(' ')) for ls in chunk_data]
    itemsets, rules = apriori(transactions, min_support=min_support,
                              min_confidence=1, max_length=15, verbosity=0)
    return itemsets
def efficient_apriori(transaction, min_support, min_confidence):
    itemsets, rules = apriori(transaction, min_support=min_support, min_confidence=min_confidence)
    # print("-" * 60)
    # print('efficient_apriori:')
    # print("Frequent itemsets:", itemsets)
    # print("Association rules:", rules)
    # print("-" * 60)
    result = {'itemsets': itemsets, 'rules': rules}
    return result
def countItemSets(support, delay, loop):
    for l in range(loop):
        time.sleep(delay)
        print("Counting Itemsets...")
        tempDataList = dataList
        transactions = [x['items'] for x in tempDataList]
        itemsets, rules = apriori(transactions, min_support=support, min_confidence=1)
        print("Count Itemsets (Min Support", support, "):")
        print(itemsets)
        print()
def save_part_2(dataset_map):
    itemsets, rules = apriori(dataset_map, min_support=0.01)
    output = ""
    for number in itemsets:
        if number != 3:
            for item, count in itemsets[number].items():
                line = f"{count}:{';'.join(item)}\n"
                output += line
    file = "patterns-2.txt"
    with open(file, "w") as filetowrite:
        filetowrite.write(output)
def rule1():
    from efficient_apriori import apriori
    start = time.time()
    # Mine frequent itemsets and association rules
    itemsets, rules = apriori(transactions, min_support=0.03, min_confidence=0.2)
    print('Frequent itemsets:', itemsets)
    df_results1 = pd.DataFrame(rules)
    print('Association rules:', df_results1)
    end = time.time()
    print("Elapsed time:", end - start)
def rule1():
    from efficient_apriori import apriori
    start = time.time()
    itemsets, rules = apriori(transactions, min_support=0.02, min_confidence=0.5)
    print('Frequent itemsets:', itemsets)
    print('-' * 50)
    print('Association rules:', rules)
    end = time.time()
    print("Elapsed time:", end - start)