def task5(pid_to_keyword):
    word_to_type = {}
    pid_to_word = {}
    paper_list = []
    with open("entity_types.txt", "r") as f:
        for line in f:
            word, e_type = line.strip('\n').split(',')
            word_to_type[word] = e_type
    for pid in pid_to_keyword.keys():
        for kw in pid_to_keyword[pid]:
            if kw in word_to_type.keys():
                if pid not in pid_to_word.keys():
                    pid_to_word[pid] = set()
                pid_to_word[pid].add(kw)
    for pid in pid_to_word.keys():
        paper_list.append(pid_to_word[pid])
    result = fp_growth.find_frequent_itemsets(paper_list, 10)
    for item in result:
        if (len(item) != 2
                or word_to_type[item[0]] not in ("METHOD", "PROBLEM")
                or word_to_type[item[1]] not in ("METHOD", "PROBLEM")):
            continue
        if word_to_type[item[0]] != word_to_type[item[1]]:
            continue
        print(item)
def ship_log_analyzer(log_array, customer):
    array = []
    with open("shipping_analysis.csv") as file:
        for line in file:
            array.append(line.strip('\n').split(','))
    items = find_frequent_itemsets(array, 50)
    items = [item for item in items if customer in item]
    for item in items:
        print(item)
    out_array = log_array[:]
    for log_item in log_array:
        refs = log_item[1].pop(2) + ' ' + log_item[1].pop(1)
        refs = refs.split(' ')
        log_item[1] += refs
        print("Item: ", log_item[1])
        distance = 0
        for item in items:
            #print(item)
            #print(len(set(log_item[1]).intersection(item)))
            if len(set(log_item[1]).intersection(item)) > distance:
                distance = len(set(log_item[1]).intersection(item))
                print("Rule: ", item,
                      " | Intersect: ", len(set(log_item[1]).intersection(item)),
                      " | New Distance: ", distance)
        if distance < 3:
            out_array.remove(log_item)
    print(out_array)
    return out_array
def fp_growth(windows, min_support, iterations=0):
    from fp_growth import find_frequent_itemsets

    itemsets = []
    if 0 < min_support < 1:
        new_support = math.ceil(min_support * len(windows))
        logger.info("Min support %s%% of %s: %s",
                    min_support * 100, len(windows), new_support)
        min_support = new_support
    itemset_gen = find_frequent_itemsets(windows, min_support)
    if iterations > 1:
        for x in xrange(0, iterations):
            template_ids = frozenset(next(itemset_gen))
            itemsets.append(template_ids)
    else:
        for itemset in itemset_gen:
            template_ids = frozenset(itemset)
            itemsets.append(template_ids)
    logger.info("Removing subsets from fp_growth output...")
    if len(itemsets):
        itemsets = get_nonsubsets(itemsets)
    ret = [Event(id=str(uuid.uuid4()), template_ids=template_ids)
           for template_ids in itemsets]
    return ret
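`get_nonsubsets` is defined elsewhere in that project; as a rough illustration of what the subset-removal step above does, a minimal sketch (name and behavior assumed here, not the project's actual implementation) could look like:

    def get_nonsubsets(itemsets):
        # Keep only maximal itemsets: drop any frozenset that is a proper
        # subset of another itemset in the list (quadratic, but the
        # FP-growth output at this stage is typically small).
        return [s for s in itemsets
                if not any(s < other for other in itemsets)]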
def testDuplicate(self):
    raw = '25,52,274;71;71,274;52;25,52;274,71'
    transactions = [line.split(',') for line in raw.split(';')]
    itemsets = list(fp_growth.find_frequent_itemsets(transactions, 2))
    self.assertEqual([['25'], ['52', '25'], ['274'], ['71'], ['52']], itemsets)
def printFre(vips, vipNos, type, support, per):
    # store the info for every VIP
    vipPlus = []
    for i in range(len(vips)):
        vipPlus.append(tolist(vips.get_group(vipNos[i]), type, per))
    frequent_items = find_frequent_itemsets(vipPlus, support)
    return list(frequent_items)
def testFrequency():
    from fp_growth import find_frequent_itemsets
    k = []
    for itemset, support in find_frequent_itemsets(lll, 0.7, True):
        print itemset, support
        k.append([itemset, support])
    print k
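The test above passes 0.7 straight through as the minimum support. In python-fp-growth that argument is treated as an absolute occurrence count (which is why the fp_growth wrapper earlier in this collection converts a ratio to a count first), so a small sketch of doing that conversion explicitly may be useful; the function name and `transactions` are illustrative:

    import math
    from fp_growth import find_frequent_itemsets

    def frequent_itemsets_by_ratio(transactions, min_support):
        # Interpret min_support as a fraction of the transaction count when it
        # is between 0 and 1, and as an absolute count otherwise.
        if 0 < min_support < 1:
            min_support = int(math.ceil(min_support * len(transactions)))
        return list(find_frequent_itemsets(transactions, min_support,
                                           include_support=True))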
def testDuplicate(self):
    raw = '25,52,274;71;71,274;52;25,52;274,71'
    transactions = [line.split(',') for line in raw.split(';')]
    itemsets = list(fp_growth.find_frequent_itemsets(transactions, 2))
    # Python 2 - dictionary is sorted by key value (?)
    # self.assertEqual([['25'], ['52', '25'], ['274'], ['71'], ['52']], itemsets)
    # Python 3 - dictionary is sorted by insertion order (?)
    self.assertEqual([['52'], ['274'], ['25'], ['52', '25'], ['71']], itemsets)
def log_frequent_pattern(df_with_log, support_value, min_pattern_len, tokenizer='WITHOUT_LAYER'):
    """
    Find and show frequent patterns of logs from the input DataFrame.

    :param df_with_log: input DataFrame with a column named log
    :param support_value: support value of the FP-growth algorithm
    :param min_pattern_len: minimum pattern length, in tokens
    :param tokenizer: which tokenizer to use
    :return: DataFrame with new columns summarizing frequent pattern info
    """
    if tokenizer not in ['WITH_LAYER', 'WITHOUT_LAYER']:
        print("tokenizer must be chosen from 'WITH_LAYER' or 'WITHOUT_LAYER'")
        return
    df = df_with_log.copy()
    try:
        logs = df.factor.copy()  # pd.Series
    except AttributeError as e:
        print(e)
        print("The input df must contain a column named 'factor', " +
              "which gives string representations of randomly generated factors.")
        return
    # tokenize each log in logs
    for index, log in logs.iteritems():
        if tokenizer == 'WITHOUT_LAYER':
            logs.set_value(index, log_tokenize_without_layer(log))
        elif tokenizer == 'WITH_LAYER':
            logs.set_value(index, log_tokenize_with_layer(log))
    logs_as_list = list(logs)
    # find the frequent patterns
    frequent_pattern = list(find_frequent_itemsets(logs_as_list, support_value))
    filtered_frequent_pattern = filter(lambda fp: len(fp) >= min_pattern_len, frequent_pattern)
    # construct the output DataFrame: one indicator column per frequent pattern
    for fp in filtered_frequent_pattern:
        fp_exist_list = list()
        for log in logs:
            if log_contain_pattern(log, fp):
                fp_exist_list.append(1)
            else:
                fp_exist_list.append(0)
        df[','.join(fp)] = pd.Series(fp_exist_list, index=df.index)
    return df
def fp_growth(self):
    self.assocItems = []
    minsup = 5
    # transactions = the items of the top neighbors, excluding this user
    transactions = [Person.users[Person.users.index(user[0])].items
                    for user in self.neighbors[:10] if not user[0] == self.id]
    for itemset in find_frequent_itemsets(transactions, minsup):
        # keep itemsets of length >= 2 that are not a subset of the user's items
        # and have at least one intersection with them
        interc = intersection(self.items, itemset)
        if len(itemset) >= 2 and not issubset(self.items, itemset) and interc:
            # format -> (what you have, what you should read)
            recItems = set(itemset) - interc
            pairs = {"own": interc, "rec": recItems}
            self.assocItems.append(pairs)
def handle_noargs(self, **options):
    support = 0
    all_items = []
    for transaction in Transaction.objects.all():
        items = TransactionItem.objects.filter(
            transaction=transaction).values_list('item__id', flat=True)
        all_items.append(map(str, items))
    while True:
        items = {}
        itemsets = []
        for itemset, support in find_frequent_itemsets(all_items, support, True):
            itemsets.append(itemset)
            for item in itemset:
                if int(item) not in items.keys():
                    items[item] = []
        for index in items.keys():
            for itemset in itemsets:
                if index in itemset:
                    for item in itemset:
                        if item != index and item not in items[index]:
                            items[index].append(item)
        for main_item in items.keys():
            if len(items[main_item]) > 0:
                for frequent_item in items[main_item]:
                    try:
                        item_set = FrequentItem.objects.get(
                            main_item__id=int(main_item),
                            frequent_item__id=int(frequent_item))
                    except:
                        item_set = FrequentItem()
                        item_set.main_item = Item.objects.get(id=int(main_item))
                        item_set.frequent_item = Item.objects.get(id=int(frequent_item))
                    item_set.support = support
                    item_set.save()
        print 'SUPPORT:', support
        print 'ITEMS:', items
        support += 1
        if len(items) < 1:
            break
def genAssociations(self):
    for item in find_frequent_itemsets(self.transList, self.minSup):
        if len(item) in self.F:
            self.F[len(item)].append(tuple(item))
        else:
            self.F[len(item)] = [tuple(item)]
        set_item = set(item)
        for t in self.transList:
            if set_item.issubset(set(t)):
                if tuple(item) in self.freqList:
                    self.freqList[tuple(item)] += 1
                else:
                    self.freqList[tuple(item)] = 1
    return self.F
def getFpgrowth(sourcePath, seporator):
    f1 = open(sourcePath)
    retDict = {}
    transactions = []
    for line in f1:
        line = line.strip('\n')
        # transactions.append([r.encode('utf-8') for r in line.split(' ')])
        transactions.append(line.split(seporator))
    frequentSet = fp_growth.find_frequent_itemsets(transactions, 10, include_support=True)
    for item in frequentSet:
        if not item:
            break
        if len(item[0]) == 2 and '' not in item[0] and ' ' not in item[0]:
            for i in (0, 1):
                retDict[item[0][i]] = 1
    f1.close()
    return retDict
def solve(self):
    os.system('pip install fp-growth')
    import fp_growth as fg
    data_set = []
    with open('A.csv', 'r') as f:
        lines = f.readlines()
        for line in lines:
            data_set.append(line.strip().split(","))
    freq_items = fg.find_frequent_itemsets(data_set, len(data_set) * 0.45,
                                           include_support=True)
    L = []
    for (items, count) in freq_items:
        s_items = set(items)
        if ('republican0' in s_items or 'democrat0' in s_items) and len(s_items) > 1:
            L.append((items, count))
    rules = []
    for (items, count) in L:
        items.sort()
        rule_lens = len(items) - 1
        for i in range(1, 2 ** rule_lens):
            left = {items[0]}
            right = set()
            bin_form = bin_digits(i, rule_lens)
            for j in range(rule_lens):
                if bin_form[j] == '1':
                    right.add(items[j + 1])
                else:
                    left.add(items[j + 1])
            left_count = 0
            for entry in data_set:
                if left.issubset(entry):
                    left_count += 1
            if count * 1.0 / left_count >= 0.9:
                rules.append([list(left), list(right)])
    return rules
def calc_fp_growth(self, attrivute_list, minimum_support=2, num_combo=2):
    result = dict()
    tmp_dict = dict()
    if len(attrivute_list) > 0:
        frequent_itemsets = fpg.find_frequent_itemsets(attrivute_list, minimum_support,
                                                       include_support=True)
        for itemset, support in frequent_itemsets:
            if len(itemset) == num_combo:
                if itemset[0] > itemset[1]:
                    tname = itemset[1] + '_' + itemset[0]
                else:
                    tname = itemset[0] + '_' + itemset[1]
                if itemset[0] not in tmp_dict:
                    tmp_dict[itemset[0]] = []
                tmp_dict[itemset[0]].append(tname)
                if itemset[1] not in tmp_dict:
                    tmp_dict[itemset[1]] = []
                tmp_dict[itemset[1]].append(tname)
                result[tname] = support
    return result, tmp_dict
def csv_read():
    # Core function: read the CSV file and mine frequent patterns from it.
    with open('E:/yk/test/yk--cz.csv', 'rb') as csvfile:
        csvreader = _csv.reader(csvfile, delimiter=' ', quotechar='|')
        negative_chinese = '\xe5\x90\xa6'
        postive_chinese = '\xe6\x98\xaf'
        billvalue = ''
        targetvalue = ''  # initialize as empty strings
        index_number = 0
        yangka_data = [['0' for i in range(0, 2)] for j in range(0, 90000)]
        # Output file for the discovered patterns ('w' means write mode).
        fpgrowth_yangka = codecs.open("E:/yk/test/yangka.txt", "w")
        for row in csvreader:
            element = str(', '.join(row))
            # Each _element looks like "否,否,否,否,否,否,1,0,0,1,2,2".
            _element = str(', '.join(row).decode("gb2312"))
            # bill_one..bill_six: whether the bill status for months 1-6 is '否' (no) or '是' (yes).
            bill_one = _element.split(',')[0]
            bill_two = _element.split(',')[1]
            bill_three = _element.split(',')[2]
            bill_four = _element.split(',')[3]
            bill_five = _element.split(',')[4]
            bill_six = _element.split(',')[5]
            # target_one..target_six: whether the card-maintenance target for months 1-6 is '0', '1' or '2'.
            target_one = _element.split(',')[6]
            target_two = _element.split(',')[7]
            target_three = _element.split(',')[8]
            target_four = _element.split(',')[9]
            target_five = _element.split(',')[10]
            target_six = _element.split(',')[11]
            print _element
            # Encode the monthly bill statuses and targets into two change-pattern strings.
            billvalue = billvalue_make(bill_one, bill_two, bill_three, bill_four, bill_five, bill_six)
            targetvalue = targetvalue_make(target_one, target_two, target_three, target_four, target_five, target_six)
            # Column 0 holds the bill-status change pattern, column 1 the target change pattern;
            # index_number is advanced as a cursor into yangka_data.
            yangka_data[index_number][0] = billvalue
            yangka_data[index_number][1] = targetvalue
            index_number = index_number + 1
        # Core step: requires the fp_growth frequent-itemset mining package
        # (https://github.com/enaeseth/python-fp-growth). Each itemset
        # (e.g. ['00000', '30288']) and its support (e.g. 2782) form one output record
        # "['00000', '30288'] 2782". The minimum support of 500 means patterns occurring
        # fewer than 500 times are discarded.
        for (itemset, support) in find_frequent_itemsets(yangka_data, 500, True):
            print >> fpgrowth_yangka, itemset, support  # write the mined records to yangka.txt
def fp_growth(self, filepath, minsup):
    import csv
    import unicodecsv
    from fp_growth import find_frequent_itemsets

    formattedpath = filepath + '.format.csv'
    with open(formattedpath, 'wb') as outputfile:
        writer = unicodecsv.writer(outputfile, delimiter='\t', encoding='utf-8')
        segmentor = segment()
        with open(filepath) as inputfile:
            for transaction in csv.reader(inputfile, delimiter='\t'):
                assert len(transaction) == 1, "Invalid"
                writer.writerow(segmentor.char_segment(transaction[0]))
    finalresult = {}
    with open(formattedpath) as inputfile:
        for itemset, support in find_frequent_itemsets(
                csv.reader(inputfile, delimiter='\t'), minsup, True):
            finalresult[', '.join(itemset)] = support
    return finalresult
def extractPatternFromCohorte(cohorte, minsup, tag):
    """
    -> Store all frequent patterns (i.e. sets of items present more than minsup times in cohorte)
    -> cohorte is an array of arrays of discrete values (obtained via the assemble_CohorteFromAllFiles function)
    -> minsup is an int between 0 and 100 (% of support)
    -> tag is a string, inserted in the output file name
    """
    saveFileName = "DATA/PATTERN/" + str(tag) + "_pattern_" + str(minsup) + ".csv"
    numberOfPatient = len(cohorte)
    minsup = int(minsup)
    minimumSupport = (minsup * numberOfPatient) / 100
    patternFile = open(saveFileName, "w")
    for itemset in find_frequent_itemsets(cohorte, minimumSupport):
        line = ""
        for element in itemset:
            line = line + str(element) + ";"
        line = line[:-1]
        patternFile.write(line + "\n")
    patternFile.close()
def recommend_recipes(user, n):
    if user not in dataset.users:
        ret = [("Bourbon Chicken", "http://pictures.food.com/api/file/a2ZX1DphTjK0YuAi916b-149-bourbon-chicken.jpg"),
               ("To Die for Crock Pot Roast", "http://img.food.com/img/recipes/27/20/8/large/picVfzLZo.jpg"),
               ("Crock-Pot Chicken With Black Beans & Cream Cheese", "http://img.food.com/img/recipes/89/20/4/large/picec1bG3.jpg"),
               ("Creamy Cajun Chicken Pasta", "http://img.food.com/img/recipes/39/08/7/large/piccZDaro.jpg"),
               ("\"Whatever Floats Your Boat\" Brownies!", "http://img.food.com/img/recipes/32/20/4/large/picblOl7e.jpg"),
               ("Best Ever Banana Cake With Cream Cheese Frosting", "http://img.food.com/img/recipes/67/25/6/large/pichIPBA2.jpg"),
               ("Pancakes", "http://img.food.com/img/recipes/25/69/0/large/piciUoO07.jpg")]
        return ret[:n]
    item = cPickle.load(open("similItem.pkl"))
    item.setdataset(dataset)
    items = item.recommend(user, n + 1)
    cont = cPickle.load(open("similCont.pkl"))
    cont.setdataset(dataset)
    conts = cont.recommend(user, n + 1)
    recipes = dataset.getItemsForUser(user)
    alpha = 5 * (n / 2)
    transactions = []
    for recipe in recipes:
        transactions.append(dataset.recipes_ingredients[recipe] * dataset.matrix[user][recipe])
    f = find_frequent_itemsets(transactions, alpha)
    freq = []
    for i in f:
        if len(freq) >= n - 1:
            break
        freq.append(i)
    out = conts[:len(freq) + 1]
    out.extend(items)
    ret = []
    i = 0
    while len(ret) < n:
        try:
            ret.append((out[i], dataset.imgs[out[i]]))
        except Exception:
            continue
        i += 1
    return ret
#path = raw_input("Please, write the path to the CSV file \n")
path = r"C:\Users\migue\Documents\UC3M\TU Graz\Bachelor thesis\Data\despacho_liencres_out.csv"

# read CSV file and store it
#instances = pd.read_csv(path, sep=';')
#print("csv read")
#instances.to_dict('records')
#print("Dictionary created")

instances = []
with open(path, 'rb') as f:
    reader = csv.reader(f)
    instances = list(reader)
print("List created")

frequent_items = pd.DataFrame()
#minsup = 0.15
for idx, itemset in enumerate(find_frequent_itemsets(instances, 4)):
    frequent_items = frequent_items.append([itemset])
    if idx % 100 == 0:
        print(idx)
print(frequent_items)

# ----------------------------------------------------------------------------------------------
#path = input("Please, write the path to the CSV file \n")
#csvout = "C:\\Users\\migue\\Documents\\UC3M\\TU Graz\\Bachelor thesis\\Data\\" + input("Please, write the name of the CSV output file \n")
## open files & create outfile
#counter = 0
#sound = []
#barometer = []
#temperature = []
from fp_growth import find_frequent_itemsets

minsup = 2
transactions = [[1, 2, 5], [2, 4], [2, 3], [1, 2, 4], [1, 3],
                [2, 3], [1, 3], [1, 2, 3, 5], [1, 2, 3]]
for itemset in find_frequent_itemsets(transactions, minsup):
    print itemset
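For reference, the same toy example can also report support counts by passing include_support=True, as most of the other snippets in this collection do; this variant is not part of the original example:

    from fp_growth import find_frequent_itemsets

    transactions = [[1, 2, 5], [2, 4], [2, 3], [1, 2, 4], [1, 3],
                    [2, 3], [1, 3], [1, 2, 3, 5], [1, 2, 3]]
    # include_support=True makes the generator yield (itemset, support) pairs
    for itemset, support in find_frequent_itemsets(transactions, 2, include_support=True):
        print itemset, support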
def find_global_freq_itemsets(transactions, minsup):
    global_freq_itemset = []
    for itemset in find_frequent_itemsets(transactions, minsup):
        global_freq_itemset.append(itemset)
    return global_freq_itemset
            print(freqSet - conseq, '-->', conseq, 'conf:', conf)
            brl.append((freqSet - conseq, conseq, conf))
            prunedH.append(conseq)
    return prunedH


def rulesFromConseq(freqSet, H, supportData, brl, minConf=0.7):
    m = len(H[0])
    if len(freqSet) > (m + 1):  # try further merging
        Hmp1 = aprioriGen(H, m + 1)  # create Hm+1 new candidates
        Hmp1 = calcConf(freqSet, Hmp1, supportData, brl, minConf)
        if len(Hmp1) > 1:  # need at least two sets to merge
            rulesFromConseq(freqSet, Hmp1, supportData, brl, minConf)


def pntRules(ruleList, itemMeaning):
    for ruleTup in ruleList:
        for item in ruleTup[0]:
            print(itemMeaning[item])
        print(" -------->")
        for item in ruleTup[1]:
            print(itemMeaning[item])
        print("confidence: %f" % ruleTup[2])
        print()  # print a blank line


A, B = apriori(loadDataSet(), 0.4)
print(A, B)
print(list(find_frequent_itemsets(loadDataSet(), 0.2, True)))
        # Assign to event format
        # EventFormat['WHERE'] = Where
        # EventFormat['WHO'] = Who
        # EventFormat['WHEN'] = When
        # EventFormat['TOPIC'] = TopicResults
        #print >> outputFile, EventFormat

        # convert the set to a list
        outputStringList = list(itemSet)
        if len(outputStringList) >= 3 and len(Where) > 0 and len(Who) > 0:
            outputString = ';'.join(outputStringList)
            print outputString
            csvWriter.writerow([outputString])

    f = open(OutputFileName)
    try:
        for itemset, support in find_frequent_itemsets(csv.reader(f), 2, True):
            print '{' + ', '.join(itemset) + '} ' + str(support)
    finally:
        f.close()

except mdb.Error, e:
    print "Error %d: %s" % (e.args[0], e.args[1])
    sys.exit(1)

finally:
    if con:
        con.close()
def main():
    # get the current directory
    path = os.path.split(os.path.realpath(__file__))[0]
    path.decode('gbk')
    # instantiate the logger and grab the logging function
    logger1 = logger(path)
    log = logger1.logger
    # start the run
    log.info('run begin')
    # create the result directory
    resultPath = path + u'\\result'
    mkdir(resultPath, log)
    # read the configuration file
    log.info('read configuration file begin')
    cf = ConfigParser()
    localPath = path + u'\\test.ini'
    cf.read(localPath)
    database = cf.get(u'db', 'database')
    password = cf.get(u'db', 'password')
    user = cf.get(u'db', 'user')
    port = cf.getint(u'db', 'port')
    host = cf.get(u'db', 'host')
    log.info('read configuration file end')
    # read the data
    log.info('read data begin')
    x = getdata(host, port, user, password, database)
    path1 = path + u'\\drugloc.xlsx'
    path2 = path + u'\\druglocgroup.xlsx'
    drugloc = pd.read_excel(path1)
    druglocgroup = pd.read_excel(path2)
    log.info('Read data end')
    log.info('Begin analysis')
    hoslist = [
        '攀钢集团总医院密地院区', '攀钢集团总医院长寿路院区', '攀枝花市中心医院',
        '中国十九冶集团有限公司职工医院', '攀枝花煤业(集团)有限责任公司总医院',
        '米易县人民医院', '攀枝花市中西医结合医院', '攀枝花市第二人民医院'
    ]
    fredegree = 0.2
    for hos in hoslist:
        log.info('this is hospital %s' % hos)
        xx = x[x['kb01_ckb519'] == hos]
        # build the drug-group codes
        data = pd.merge(xx, drugloc, left_on='ka20_ake001', right_on='drugId', how='left')
        data_1 = pd.merge(data, druglocgroup, left_on='locId', right_on='locId', how='left')
        # drop rows where drugGroupId or the visit id is null
        data_2 = data_1[(data_1['drugGroupId'].isnull() == False) &
                        (data_1['kc22_aaz217'].isnull() == False)]
        data_3 = data_2[['kc22_aaz217', 'drugGroupId']]
        data_4 = data_3.drop_duplicates()
        # find the frequent drug groups
        druglist = getitems(data_4, 'kc22_aaz217', 'drugGroupId')
        transactions = [line.split(',') for line in druglist]
        itemsets = list(
            fp_growth.find_frequent_itemsets(transactions, fredegree * len(transactions)))
        # remove symmetric duplicates, i.e. (a, b) vs (b, a)
        for i in range(len(itemsets)):
            for j in range(i + 1, len(itemsets)):
                if j < len(itemsets):
                    if (len(itemsets[i]) > 1) & (set(itemsets[i]) == set(itemsets[j])):
                        itemsets.remove(itemsets[i])
        # drop frequent 1-itemsets
        table = []
        num = []
        for items in itemsets:
            if len(items) > 1:
                table.append(items)
                num.append(len(items))
        freitem = pd.DataFrame({'fre': table, 'num': num})
        freitem.sort_values(by=['num'], ascending=[False], inplace=True)
        maxfre = max(freitem['num'])
        # take the union (or intersection) of the largest frequent n-itemsets
        freitems = freitem[freitem['num'] == maxfre]
        unionfre = set(freitems.iloc[0, 0]).union(*freitems.iloc[1:, 0])
        # compute weights: average daily usage frequency, average cost, cost ratio, etc.
        # average daily usage frequency
        drugcount = data_2['kc22_aaz217'].groupby(
            [data_2['kc22_aaz217'], data_2['drugGroupId']]).count().reset_index()
        drugcount.rename(columns={0: 'count'}, inplace=True)
        days = data_2[['kc22_aaz217', 'days']].drop_duplicates()
        days.replace(0, 1, inplace=True)
        drug = pd.merge(drugcount, days, left_on='kc22_aaz217', right_on='kc22_aaz217', how='left')
        drug['meanCount'] = drug['count'] / drug['days']
        drugmeandays = drug['meanCount'].groupby(drug['drugGroupId']).mean().reset_index()
        # average cost
        drugcost = data_2['kc22_ckc526'].groupby(
            [data_2['kc22_aaz217'], data_2['drugGroupId']]).sum().reset_index()
        drugcost.rename(columns={'kc22_ckc526': 'cost'}, inplace=True)
        drugmeancost = drugcost['cost'].groupby(drug['drugGroupId']).mean().reset_index()
        # average drug price
        drugprice = data_2['kc22_cke521'].groupby(data_2['drugGroupId']).mean().reset_index()
        drugprice.rename(columns={'kc22_cke521': 'price'}, inplace=True)
        # cost ratio
        sumcost = data_2['kc22_ckc526'].groupby(data_2['kc22_aaz217']).sum().reset_index()
        sumcost.rename(columns={'kc22_ckc526': 'sumcost'}, inplace=True)
        drugratio = pd.merge(drugcost, sumcost, left_on='kc22_aaz217',
                             right_on='kc22_aaz217', how='left')
        drugratio['ratio'] = drugratio['cost'] / drugratio['sumcost']
        ratio = drugratio['ratio'].groupby(drugratio['drugGroupId']).mean().reset_index()
        # how many drugs each of these drug groups corresponds to
        druggroup = data_2['kc22_cke521'].groupby([
            data_2['drugGroupId'], data_2['drugGroupName'], data_2['ka20_ake002']
        ]).mean().reset_index()
        drugnum = druggroup['ka20_ake002'].groupby(druggroup['drugGroupId']).count().reset_index()
        drugnum.rename(columns={0: 'drugnum'}, inplace=True)
        # the drug with the highest price
        druggroup.sort_values(by=['kc22_cke521'], ascending=[False], inplace=True)
        drugmaxprice = druggroup.drop_duplicates(subset=['drugGroupId'], keep='first')
        # the drug that appears most frequently
        drugmaxfre = data_2['kc22_aaz217'].groupby([
            data_2['drugGroupId'], data_2['drugGroupName'], data_2['ka20_ake002']
        ]).count().reset_index()
        drugmaxfre.rename(columns={0: 'drugmaxfre'}, inplace=True)
        drugmaxfre.sort_values(by=['drugmaxfre'], ascending=[False], inplace=True)
        drugmaxfre = drugmaxfre.drop_duplicates(subset=['drugmaxfre'], keep='first')
        drugname = []
        meandayfre = []
        meancost = []
        meanprice = []
        costratio = []
        drugnums = []
        drugpricemax = []
        drugfremax = []
        for i in unionfre:
            drugname.append(list(data_2[data_2['drugGroupId'] == i]['drugGroupName'])[0])
            meandayfre.append(list(drugmeandays[drugmeandays['drugGroupId'] == i]['meanCount'])[0])
            meancost.append(list(drugmeancost[drugmeancost['drugGroupId'] == i]['cost'])[0])
            meanprice.append(list(drugprice[drugprice['drugGroupId'] == i]['price'])[0])
            costratio.append(list(ratio[ratio['drugGroupId'] == i]['ratio'])[0])
            drugnums.append(list(drugnum[drugnum['drugGroupId'] == i]['drugnum'])[0])
            drugpricemax.append(list(drugmaxprice[drugmaxprice['drugGroupId'] == i]['ka20_ake002'])[0])
            drugfremax.append(list(drugmaxfre[drugmaxfre['drugGroupId'] == i]['ka20_ake002'])[0])
        data = {
            'drugname': drugname,
            'meandayfre': meandayfre,
            'meancost': meancost,
            'meanprice': meanprice,
            'costratio': costratio,
            'drugnums': drugnums,
            'drugpricemax': drugpricemax,
            'drugfremax': drugfremax
        }
        df = pd.DataFrame(data, columns=[
            'drugname', 'meandayfre', 'meancost', 'meanprice',
            'costratio', 'drugnums', 'drugpricemax', 'drugfremax'
        ])
        pathfile = resultPath + '\\result_' + hos + '.csv'
        log.info('The result is saved in %s' % pathfile)
        df.to_csv(pathfile, encoding='gbk')
    log.info('End analysis')
    log.info('run end')
#print(len(high_ach_ordered_list))

plot_centroids(np.transpose(km_4.cluster_centers_), correct_order)

# For k = 5
# plot_centroids(np.transpose(km_5.cluster_centers_), correct_order)

# run silhouette plots on both to determine which is best
# silhouette_plots(x_normalized[0::5], 5)  # running only on 30000 items as the dataset is too big!
# silhouette_plots(x_normalized[30000:30000+30000], 3)

########################################## ASSOCIATION CODE ###########################################################

# build the FP-tree; the min_sup that works is 50%, which is way too low
cluster_list = [item['clusters'] for item in high_ach_ordered_list]
# root = FPtree_construction(cluster_list, 0.50)
#print(cluster_list)
patterns = find_frequent_itemsets(cluster_list, 2000, include_support=True)
for items in patterns:
    print items

############################################ CLASSIFICATION CODE ######################################################

clf = RandomForestClassifier(random_state=255)
student_features = []
student_label = []
incomplete_students = []
for student in student_cluster:
    clusters = student_cluster[student]
    if clusters["result"] is None:
        incomplete_features = []
        incomplete_features.append(clusters[0])
with open('AssocationMatrixCtrl.csv', 'rb') as f:
    reader = csv.reader(f, delimiter=',')
    for row in reader:
        # print row
        for item in row:
            # print item
            my_list = item.split(",")
            my_tuple = tuple(my_list)
            transactions.append(my_list)

tupleTransac = tuple(tuple(x) for x in transactions)
# print tupleTransac
# dataframe = pd.DataFrame(transactions)
# print dataframe
# print transactions
# for transaction in transactions:
#     print transaction

freqItemsets = []
report = find_frequent_itemsets(transactions, 4)
for itemset in report:
    freqItemsets.append(itemset)
print len(freqItemsets)

dataFrame = pd.DataFrame(freqItemsets)
dataFrame.to_csv('fpgrowth.csv')

# relim_input = itemmining.get_relim_input(tupleTransac)
# report = itemmining.relim(relim_input, min_support=2)
# print report
def generateFrequentItemsets(minsup):
    # print BV.items
    positiveTransactions = []
    negativeTransactions = []
    positiveFrequentItemsets = []
    negativeFrequentItemsets = []
    positiveSupports = []
    negativeSupports = []
    # Encode each record as a transaction: attribute k becomes item 2k if its
    # value is 0.0 and item 2k+1 otherwise; the last field is the class label.
    for item in BV.items:
        transaction = []
        count = 0
        if item[-1] == 1:
            for attribute in item[:-1]:
                if attribute == 0.0:
                    transaction.append(count * 2)
                else:
                    transaction.append(count * 2 + 1)
                count = count + 1
            positiveTransactions.append(transaction)
        else:
            for attribute in item[:-1]:
                if attribute == 0.0:
                    transaction.append(count * 2)
                else:
                    transaction.append(count * 2 + 1)
                count = count + 1
            negativeTransactions.append(transaction)
    # print len(positiveTransactions)
    # print len(negativeTransactions)
    for positiveFrequentItemset, positiveSupport in find_frequent_itemsets(
            positiveTransactions, int(minsup * len(positiveTransactions)), True):
        positiveFrequentItemsets.append(positiveFrequentItemset)
        positiveSupports.append(positiveSupport)
    for negativeFrequentItemset, negativeSupport in find_frequent_itemsets(
            negativeTransactions, int(minsup * len(negativeTransactions)), True):
        negativeFrequentItemsets.append(negativeFrequentItemset)
        negativeSupports.append(negativeSupport)
    print len(positiveFrequentItemsets)
    print len(negativeFrequentItemsets)
    positiveDict = {}
    negativeDict = {}
    removePositive = {}
    removeNegative = {}
    idx = 0
    # Encode each itemset as a bitmask so subset checks become bitwise tests,
    # and keep only the maximal itemsets.
    for itemset in positiveFrequentItemsets:
        num = 0
        for item in itemset:
            num = num + 2 ** item
        useless = False
        for key in positiveDict:
            if key & num == num:
                useless = True
            elif key & num == key:
                removePositive[key] = 1
        if useless == False:
            positiveDict[num] = positiveSupports[idx]
        idx = idx + 1
    idx = 0
    for itemset in negativeFrequentItemsets:
        num = 0
        for item in itemset:
            num = num + 2 ** item
        useless = False
        for key in negativeDict:
            if key & num == num:
                useless = True
            elif key & num == key:
                removeNegative[key] = 1
        # if num in positiveDict and positiveDict[num] != 1 and useless == False:
        if useless == False:
            negativeDict[num] = negativeSupports[idx]
        idx = idx + 1
    for key in removePositive:
        if key in positiveDict:
            del positiveDict[key]
    for key in removeNegative:
        if key in negativeDict:
            del negativeDict[key]
    json.dump(positiveDict, open("positiveFrequentItemsets.txt", "w"))
    json.dump(negativeDict, open("negativeFrequentItemsets.txt", "w"))
    json.dump(len(positiveTransactions), open("numOfPosResults.txt", "w"))
    json.dump(len(negativeTransactions), open("numOfNegResults.txt", "w"))
    # for itemset in find_frequent_itemsets(transactions, minsup):
    #     print itemset
def main():
    # get the current directory
    path = os.path.split(os.path.realpath(__file__))[0]
    path.decode('gbk')
    # instantiate the logger and grab the logging function
    logger1 = logger(path)
    log = logger1.logger
    # start the run
    log.info('run begin')
    # create the result directory
    resultPath = path + u'\\result'
    mkdir(resultPath, log)
    # read the configuration file
    log.info('read configuration file begin')
    cf = ConfigParser()
    localPath = path + u'\\config.ini'
    cf.read(localPath)
    freqs = cf.getint('parameter', 'freqs')
    username = cf.get('database', 'username')
    password = cf.get('database', 'password')
    tns = cf.get('database', 'tns')
    databasename = cf.get('database', 'databasename')
    table = cf.get('database', 'table')
    tabledetail = cf.get('database', 'tabledetail')
    log.info("Read configuration file end")
    # read the data
    log.info('Read data begin')
    datapath = path + u'\\data'
    listDir = os.listdir(datapath)
    filePath = os.path.join(datapath, listDir[0])
    df, rawdata = readdata(filePath)
    log.info('Read Data end')
    # start the analysis
    items = getitems(rawdata)
    print(len(items))
    transactions = [line.split(',') for line in items]
    result = []
    itemsets = []
    for itemset, support in fp_growth.find_frequent_itemsets(transactions, freqs, True):
        result.append((itemset, support))
    results = sorted(result, key=lambda i: len(i[0]), reverse=True)
    # drop frequent 1-itemsets
    resl = []
    for itemset, support in results:
        if len(itemset) > 1:
            resl.append((itemset, support))
    # print resl
    print(len(resl))
    # remove all subsets
    res1 = sorted(resl, key=lambda i: len(i[0]), reverse=True)
    res2 = [resl[0][0]]
    frItems = [resl[0]]
    for i, support in resl:
        TF = []
        for j in res2:
            TF.append(str(np.in1d(i, j).all()))
        if 'True' in TF:
            continue
        else:
            res2.append(i)
            frItems.append((i, support))
    print(len(frItems))
    print frItems
    lists = []
    for index, (i, support) in enumerate(frItems):
        df_person = df[df['indv_id'].isin(i)]
        df_person['patientCount'] = len(i)
        df_person['id'] = shortuuid.uuid()
        df_person['preNum'] = len(i)
        # find the admission times shared by this group of patients
        time = df_person['indv_id'].groupby(df_person['intime']).count().reset_index()
        time.rename(columns={'indv_id': 'times'}, inplace=True)
        presonNum = len(i) - 1
        time = time[time['times'] > presonNum]
        timelist = time['intime']
        df_person = df_person[df_person['intime'].isin(timelist)]
        df_person['sumCost'] = sum(df_person[u'医疗总费用'])
        df_person['sumClaimCost'] = sum(df_person[u'报销金额'])
        df_person['fre'] = len(df_person) / len(i)
        lists.append(df_person)
        i.append(support)
    result = pd.concat(lists)
    result['department'] = ''
    result['doctor'] = ''
    print result.head()
    # output
    filename1 = u'\\su_illegal_seek_exception.csv'
    filename2 = u'\\su_illegal_seek_exception_detail.csv'
    outputfile1 = resultPath + filename1
    outputfile2 = resultPath + filename2
    # main-table output
    res1 = result[['id', u'患者姓名']].drop_duplicates()
    res1 = res1[u'患者姓名'].groupby(res1['id']).apply(lambda x: ','.join(x)).reset_index()
    res2 = result[['id', 'intime']].drop_duplicates()
    res2 = res2['intime'].groupby(res2['id']).apply(lambda x: ','.join(x)).reset_index()
    result2 = pd.merge(res1, res2, on='id', how='left')
    res3 = result[['id', u'医院名称']].drop_duplicates()
    res3 = res3[u'医院名称'].groupby(res3['id']).apply(lambda x: ','.join(x)).reset_index()
    result2 = pd.merge(result2, res3, on='id', how='left')
    res4 = result[['id', u'医院名称', u'医院所在分中心']].drop_duplicates()
    res4 = res4[u'医院所在分中心'].groupby(res4['id']).apply(lambda x: ','.join(x)).reset_index()
    result2 = pd.merge(result2, res4, on='id', how='left')
    res5 = result[['id', u'医院等级', u'医院名称']].drop_duplicates()
    res5 = res5[u'医院等级'].groupby(res5['id']).apply(lambda x: ','.join(x)).reset_index()
    result2 = pd.merge(result2, res5, on='id', how='left')
    result2.rename(columns={u'患者姓名': 'involvedPatientNames', 'intime': 'inHosTime',
                            u'医院名称': 'hospitalName', u'医院等级': 'hospitalLevel',
                            u'医院所在分中心': 'hospitalArea'}, inplace=True)
    result3 = result[['id', 'sumCost', 'sumClaimCost', 'doctor', 'fre', 'patientCount']]
    su_illegal_seek_exception = pd.merge(result2, result3, left_on='id', right_on='id', how='left')
    su_illegal_seek_exception.drop_duplicates(inplace=True)
    su_illegal_seek_exception.rename(columns={'sumCost': 'allMoney', 'sumClaimCost': 'bcMoney',
                                              'doctor': 'doctorName', 'fre': 'frequency'},
                                     inplace=True)
    su_illegal_seek_exception.to_csv(outputfile1, encoding='gb18030', index=False)
def get_freq_itemsets(self, min_sup, max_len):
    fi = []
    if self.path == '../datasets-space/kosarak-full-space.data':
        f = open('kosarak-minsupp-0.6-percent.txt')
        for line in f:
            tokens = line.split(',')
            itemsetstr = tokens[0]
            itemsetstr = itemsetstr.strip('[]')
            itemset = []
            for item in itemsetstr.split():
                itemset.append(int(item))
            sup = int(tokens[1])
            if sup < min_sup:
                break
            if len(itemset) <= max_len:
                fi.append(self.Itemset(itemset, sup))
        f.close()
        return fi
    if self.path == '../datasets-space/aol-full-space.data':
        f = open('aol-minsupp-0.2-percent.txt')
        for line in f:
            tokens = line.split()
            itemset = []
            for item in tokens[:-2]:
                itemset.append(int(item))
            sup = int(tokens[-2])
            if sup < min_sup:
                break
            if len(itemset) <= max_len:
                fi.append(self.Itemset(itemset, sup))
        # print "test loading aol, fi = ", fi
        f.close()
        return fi
    if max_len == 1:
        x = 0
        for item, coverage in self.items.iteritems():
            sup = len(coverage)
            if sup >= min_sup:
                x += 1
                fi.append(self.Itemset([item], sup))
    else:
        x = 0
        for itemset, sup in find_frequent_itemsets(self.transactions, min_sup, include_support=True):
            if len(itemset) > max_len:
                continue
            fi.append(self.Itemset(itemset, sup))
            x += 1
            # if x % 100 == 0:
            #     logging.debug('%d frequent itemsets obtained' % x)
    # logging.debug('Total for min sup %d = %d frequent itemsets' % (min_sup, x))
    fi.sort(key=lambda i: i.sup, reverse=True)
    return fi
def getFrequentItem(filepath):
    labelDic = {}
    dicfile = open('alllabelDic.pkl', 'rb')
    labelDic = pickle.load(dicfile)
    invertlabelDic = dict(izip(labelDic.itervalues(), labelDic.iterkeys()))
    dic_labelTag = []
    # Each transaction is the set of label codes seen in one 30-minute slice of the day.
    labelTag = np.loadtxt(filepath, dtype=str, delimiter=',', usecols=(3, 4))
    time = 1
    while time <= 48:
        tempLabel = []
        for item in labelTag:
            # labtltime is a minute count; each hour is cut into two 30-minute pieces
            labtltime = str2timeNum(item[0])
            if labtltime >= (time - 1) * 30 and labtltime <= time * 30:
                if item[1] in labelDic.keys():
                    tempLabel.append(labelDic[item[1]])
                else:
                    print item[1], '-------------------------------------'
                    tempLabel.append('999999999')
        if tempLabel:
            dic_labelTag.append(tempLabel)
        time += 1
    # frequentSet holds the frequent items, e.g. [[['1'], 4], [['2', '1'], 4]]:
    # first the frequent tags, second the support degree.
    frequentSet = []
    for itemset, support in find_frequent_itemsets(dic_labelTag, 0.2, True):
        frequentSet.append([itemset, support])
    savefile = open(filepath.replace('RCed_stoppoint.txt', 'itemfrequence.txt'), 'w')
    for item, support in sorted(frequentSet, key=lambda (item, support): support):
        if len(item) == 1:
            savefile.write(invertlabelDic[item[0]])
            savefile.write('\n')
        else:
            for index in range(len(item) - 1):
                savefile.write(invertlabelDic[item[index]])
                savefile.write(',')
            savefile.write(invertlabelDic[item[len(item) - 1]])
            savefile.write('\n')
    savefile.close()
    return dic_labelTag, frequentSet
#-------------------------------------------------------
def parse_file(path):
    '''Parse the json file and create a dict'''
    data = dict()
    with open(path, 'r') as infile:
        data = json.load(infile)
    print('Total No of transactions: ', len(data))
    trans_list = list()
    for i in range(len(data)):
        trans_list.append(list(map(str, data[i]['Items'])))
    return trans_list


#----------------------------------------------------------
if __name__ == "__main__":
    transactions = parse_file('trans_data.json')
    result = []
    transac = transactions[0:20]
    minsup = int(input('Enter minimum support: '))
    # 2nd parameter is the minimum support value.
    for itemset, support in find_frequent_itemsets(transac, minsup, True):
        result.append((itemset, support))
    result = sorted(result, key=lambda i: i[0])
    for itemset, support in result:
        print(str(itemset) + ' ' + str(support))
    for conseq in H:
        conf = supportData[freqSet] / supportData[freqSet - conseq]  # calc confidence
        if conf >= minConf:
            print(freqSet - conseq, '-->', conseq, 'conf:', conf)
            brl.append((freqSet - conseq, conseq, conf))
            prunedH.append(conseq)
    return prunedH


def rulesFromConseq(freqSet, H, supportData, brl, minConf=0.7):
    m = len(H[0])
    if len(freqSet) > (m + 1):  # try further merging
        Hmp1 = aprioriGen(H, m + 1)  # create Hm+1 new candidates
        Hmp1 = calcConf(freqSet, Hmp1, supportData, brl, minConf)
        if len(Hmp1) > 1:  # need at least two sets to merge
            rulesFromConseq(freqSet, Hmp1, supportData, brl, minConf)


def pntRules(ruleList, itemMeaning):
    for ruleTup in ruleList:
        for item in ruleTup[0]:
            print(itemMeaning[item])
        print(" -------->")
        for item in ruleTup[1]:
            print(itemMeaning[item])
        print("confidence: %f" % ruleTup[2])
        print()  # print a blank line


A, B = apriori(loadDataSet(), 0.4)
print(A, B)
print(list(find_frequent_itemsets(loadDataSet(), 0.2, True)))
def find_frequent_pattern(read_filename1, read_filename2, write_filename1,
                          write_filename2, write_filename3):
    # Minimum support for frequent-itemset mining.
    ##### This value can be adjusted to produce different results for comparison experiments.
    minimun_support = 25
    #####
    # Frequent patterns together with their lengths and supports.
    frequent_patterns = []  # 2-D list of ints
    length_all = []
    support_all = []
    word_list = []
    f0 = open(read_filename2, 'rb')
    line = f0.readline()
    while line:
        word_list.append(line.split()[0])
        line = f0.readline()
    f0.close()
    '''
    Mine the frequent itemsets and obtain their supports.
    Mining uses the FP-Growth algorithm,
    see https://github.com/enaeseth/python-fp-growth
    '''
    trans = generate_transactions(read_filename1, word_list)
    # The type of the return of "find_frequent_itemsets" is "generator".
    for each, support in find_frequent_itemsets(trans, minimun_support, include_support=True):
        each.sort()
        frequent_patterns.append(each)
        length_all.append(len(each))
        support_all.append(support)
    print 'Total frequent patterns: %d' % len(frequent_patterns)
    # Sort the frequent patterns by length, longest first.
    fl = zip(frequent_patterns, length_all, support_all)
    fl1 = sorted(fl, key=itemgetter(1), reverse=True)
    frequent_patterns = []
    result_length_support = []
    # Filter the frequent itemsets.
    for each in fl1:
        tag = 0
        for each1 in frequent_patterns:
            if len(set(each[0]) & set(each1)) == len(each[0]):
                tag = 1
                break
            elif np.true_divide(len(set(each[0]) & set(each1)),
                                len(set(each[0]) | set(each1))) > 0.4999:
                ##### This similarity threshold can be adjusted for comparison experiments.
                tag = 1
                break
            else:
                pass
        if tag == 0:
            frequent_patterns.append(each[0])
            result_length_support.append(str(each[1]) + " " + str(each[2]))
    real_word_trans = []
    trans_to_string = []
    for each in frequent_patterns:
        trans_to_string.append(" ".join([str(x) for x in each]))
        real_word_list = map_trans_to_word(each, word_list)
        real_word_trans.append(" ".join(real_word_list))
    quick_write_list_to_text(trans_to_string, write_filename1)
    quick_write_list_to_text(result_length_support, write_filename2)
    quick_write_list_to_text(real_word_trans, write_filename3)
        filtered = [w for w in result
                    if w not in english_stopwords and re.match(r'^\d+$', w) is None]
        yield filtered


def content_from_file(filename):
    for x in transform_data(read_json(filename), get_item=itemgetter('object')):
        yield x


if __name__ == '__main__':
    p = ArgumentParser()
    p.add_argument('input', nargs=1)
    p.add_argument('-s', '--minsup', type=int, default=1000,
                   help='Minimum itemset support')
    p.add_argument('-c', '--category', type=str, default="spam",
                   help='Class to perform search on')
    args = p.parse_args()
    input = args.input[0]
    if isdir(input):
        content_gen = content_from_dir(input, category=args.category)
    else:
        content_gen = content_from_file(input)
    for itemset, support in \
            find_frequent_itemsets(content_gen, args.minsup, True):
        print str(support) + ' ' + ' '.join(itemset)
def spur(read_directory, write_directory1, write_directory2):
    '''
    SPUR compression: Summarization via Pattern Utility and Ranking.
    Summarize a batch of transactions with a low compression ratio and high quality.
    Xintian Yang, Amol Ghoting, Yiye Ruan, "A Framework for Summarizing and Analyzing
    Twitter Feeds", KDD'12, August 12-16, 2012, Beijing, China.

    :param read_directory: directory of VSM files
    :param write_directory1: directory for the compression results
    :param write_directory2: directory for the compression-ratio files
    '''
    # Minimum support for frequent-itemset mining.
    minimun_support = 60
    # False-positive rate.
    f = 0.1
    # Compression ratios.
    ratio = []
    # Compression times.
    compress_time = []
    # Total number of files.
    file_number = sum([len(files) for root, dirs, files in os.walk(read_directory)])
    # Iterate over all VSM files.
    for i in range(file_number):
        print 'Batch: %d' % (i + 1)
        start = time.clock()
        '''
        Mine the frequent itemsets and obtain their supports.
        Mining uses the FP-Growth algorithm,
        see https://github.com/enaeseth/python-fp-growth
        '''
        o_trans, trans_size = generate_transactions(read_directory + '/' + str(i + 1) + '.txt')
        # Compression budget; the upper bound is a fraction of the total number of
        # items in the original transactions.
        # M = 0.7 * trans_size
        M = 85000
        # Frequent patterns together with their lengths and supports.
        frequent_patterns = []  # 2-D list of ints
        length_all = []
        support_all = []
        # The type of the return of "find_frequent_itemsets" is "generator".
        for each, support in find_frequent_itemsets(o_trans, minimun_support, include_support=True):
            each.sort()
            frequent_patterns.append(each)
            length_all.append(len(each))
            support_all.append(support)
        print len(frequent_patterns)
        # Sort the frequent patterns by length, longest first.
        fl = zip(frequent_patterns, length_all, support_all)
        fl1 = sorted(fl, key=itemgetter(1), reverse=True)
        '''
        To make it easier to rewrite the original transactions, each frequent pattern
        gets a string id of the form "p*", where * is a number starting from 0.
        '''
        # pattern id -> items of the pattern ('id': int[item])
        id_pattern_dict = {}
        # pattern id -> pattern length ('id': int)
        pattern_length_dict = {}
        # pattern id -> pattern support ('id': int)
        pattern_support_dict = {}
        id1 = 0
        for each in fl1:
            id_pattern_dict['p' + str(id1)] = each[0]
            pattern_length_dict['p' + str(id1)] = each[1]
            pattern_support_dict['p' + str(id1)] = each[2]
            id1 += 1
        # pattern id -> transactions containing the pattern ('id': int[trans]);
        # transactions are indexed by their position in the original order.
        pattern_trans_dict = {}
        for each in id_pattern_dict.keys():
            value_list = []
            for j in range(len(o_trans)):
                if set(id_pattern_dict[each]).issubset(o_trans[j]):
                    value_list.append(j)
            pattern_trans_dict[each] = value_list
        # Sub-patterns of each frequent pattern, excluding itself ('id': str[id]).
        sub_pattern_dict = {}
        # Super-patterns of each frequent pattern, excluding itself ('id': str[id]).
        super_pattern_dict = {}
        # Patterns overlapping each frequent pattern that fall in neither case above ('id': str[id]).
        overlap_pattern_dict = {}
        for each in id_pattern_dict.keys():
            value_list1 = []
            value_list2 = []
            value_list3 = []
            for each1 in id_pattern_dict.keys():
                if each != each1:
                    intersection = set(id_pattern_dict[each1]) & set(id_pattern_dict[each])
                    if intersection == set():
                        pass
                    elif set(id_pattern_dict[each1]) == intersection:
                        value_list1.append(each1)
                    elif set(id_pattern_dict[each]) == intersection:
                        value_list2.append(each1)
                    else:
                        value_list3.append(each1)
                else:
                    pass
            sub_pattern_dict[each] = value_list1
            super_pattern_dict[each] = value_list2
            overlap_pattern_dict[each] = value_list3
        '''
        Initialize the utility values.
        Returns a dict mapping pattern id to utility value and a dict mapping
        pattern id to the list of transactions the pattern covers.
        '''
        pattern_utility, pattern_coverage_set = utility_f(
            id_pattern_dict, pattern_trans_dict, pattern_support_dict, sub_pattern_dict, f)
        # Pattern with the largest utility value.
        max_index = np.argmax(pattern_utility.values())
        Q_top = pattern_utility.keys()[max_index]
        # Copy of pattern_utility.
        Q_utility = pattern_utility.copy()
        '''
        Rewrite the original transactions with the current pattern, ordered by utility,
        while continuously updating the utility values.
        '''
        # current_size = trans_size
        current_size = 0
        iter_count = 0
        while current_size < M:
            # Currently selected pattern.
            this_pattern = Q_top
            if Q_utility[this_pattern] >= 0.0:
                '''
                Rewrite the original transactions with the current frequent pattern;
                this_pattern is the pattern's key, a string.
                '''
                replace_trans_with_pattern(o_trans, this_pattern,
                                           id_pattern_dict[this_pattern],
                                           pattern_coverage_set[this_pattern])
                # o_trans has now changed; note that from here on it contains both ints and strings.
                '''
                After rewriting with the current pattern, update the utilities of the other patterns.
                '''
                for each1 in super_pattern_dict[this_pattern]:
                    covered_set = set(pattern_coverage_set[each1]) & set(pattern_coverage_set[this_pattern])
                    pattern_utility[each1] = pattern_utility[each1] - len(id_pattern_dict[this_pattern]) * len(covered_set)
                    if each1 in Q_utility.keys():
                        Q_utility[each1] = pattern_utility[each1]
                for each2 in sub_pattern_dict[this_pattern]:
                    covered_set = set(pattern_coverage_set[each2]) & set(pattern_coverage_set[this_pattern])
                    pattern_utility[each2] = pattern_utility[each2] - (len(id_pattern_dict[each2]) - 1) * len(covered_set)
                    if each2 in Q_utility.keys():
                        Q_utility[each2] = pattern_utility[each2]
                    pattern_coverage_set[each2] = [x for x in pattern_coverage_set[each2] if x not in covered_set]
                    if (len(pattern_coverage_set[each2]) == 0) and (each2 in Q_utility.keys()):
                        del Q_utility[each2]
                for each3 in overlap_pattern_dict[this_pattern]:
                    covered_set = set(pattern_coverage_set[each3]) & set(pattern_coverage_set[this_pattern])
                    pattern_utility[each3] = pattern_utility[each3] - len(covered_set) * len(set(id_pattern_dict[each3]) & set(id_pattern_dict[this_pattern]))
                    if each3 in Q_utility.keys():
                        Q_utility[each3] = pattern_utility[each3]
                # if len(pattern_coverage_set[this_pattern]) == 0:
                #     flag += 1
                # else:
                #     flag = 0
                current_size = current_size + len(pattern_coverage_set[this_pattern])
                iter_count += 1
                if iter_count >= 50000:
                    break
                # if flag == 3:
                #     break
                # current_size = np.sum([len(x) for x in o_trans])
                # print current_size
                # The current pattern has been used; remove it.
                del Q_utility[this_pattern]
                # Re-rank by utility and take the pattern with the largest value.
                if Q_utility != {}:
                    max_index = np.argmax(Q_utility.values())
                    Q_top = Q_utility.keys()[max_index]
                else:
                    break
            else:
                break
        # final_size = np.sum([len(x) for x in o_trans])
        final_size = current_size
        print 'Final size: ', final_size
        this_ratio = np.true_divide(final_size, trans_size)
        print 'Ratio: ', this_ratio
        ratio.append(str(this_ratio))
        interval = time.clock() - start
        print 'Time: %f' % interval
        compress_time.append(str(interval))
        write_list_to_text_by_row(o_trans, write_directory1 + '/' + str(i + 1) + '.txt')
    quick_write_list_to_text(ratio, write_directory2 + '/ratio.txt')
    quick_write_list_to_text(compress_time, write_directory2 + '/compress_time.txt')
def searchForPattern(cohorte, maxTry, maxNumberOfFrequentPattern, patternSaveFileName):
    """
    -> Generate patterns (i.e. frequent itemsets) from cohorte, with at most maxTry attempts.
       Results are saved in a .csv file (patternSaveFileName).
    -> cohorte is a list of lists
    -> maxTry is an int
    -> patternSaveFileName is a string; the save file should be located in the DATA/PATTERN folder.
    -> maxNumberOfFrequentPattern is an int, the maximum number of frequent patterns to
       generate (a cap set up to avoid memory problems)
    -> TODO:
        - re-check the algorithm
        - limit the retrieval of the same patterns
        - clean duplicates in patternSaveFileName
    """
    # Initialize the parameters.
    minsup = len(cohorte)
    minLenOfPattern = len(cohorte[0])
    numberOfTry = 0
    tunningPatternLen = 1
    tunnigMinSup = 0
    pattern_save = open(patternSaveFileName, "a")
    pattern_save.close()
    while (1 > 0):
        # Control the number of attempts made.
        if numberOfTry >= maxTry:
            break

        # Generate the patterns.
        listOffrequentItemset = []
        for itemset in find_frequent_itemsets(cohorte, minsup):
            listOffrequentItemset.append(itemset)

        # Control the number of patterns: if no pattern was generated,
        # lower the minsup value used to generate them.
        if len(listOffrequentItemset) > 0:
            listOfItemSize = []
            print "Found " + str(len(listOffrequentItemset)) + \
                  " frequent itemsets with minsup = " + str(minsup)

            # Write the patterns to a save file: each line is one pattern, with its
            # elements separated by ';'. The last field of the line is the minsup used
            # to generate the pattern (i.e. the pattern's support).
            pattern_save = open(patternSaveFileName, "a")
            for element in listOffrequentItemset:
                lineToWrite = ""
                for item in element:
                    lineToWrite = lineToWrite + item + ";"
                listOfItemSize.append(len(element))
                lineToWrite = lineToWrite + str(minsup)
                pattern_save.write(lineToWrite + "\n")
            pattern_save.close()

            if len(listOffrequentItemset) > maxNumberOfFrequentPattern:
                print "max number of patterns reached, cancel mining"
                break

            # Control the pattern sizes: if the largest pattern does not pass the check,
            # alternately adapt the minsup value and the expected pattern length.
            maxSize = max(listOfItemSize)
            if maxSize < minLenOfPattern:
                if tunnigMinSup:
                    tunnigMinSup = 0
                    tunningPatternLen = 1
                    minsup = minsup - 1
                    triedToIcreaseMinLenPattern = 0
                elif tunningPatternLen:
                    tunningPatternLen = 0
                    tunnigMinSup = 1
                    minLenOfPattern = minLenOfPattern - 1
            else:
                if not triedToIcreaseMinLenPattern:
                    # The pattern size is good, but we only just changed the minsup, so
                    # increase the expected pattern length to see whether an even larger
                    # pattern can be caught.
                    minLenOfPattern = minLenOfPattern + 1
                    triedToIcreaseMinLenPattern = 1
                else:
                    # The pattern size is good and we already tried increasing the
                    # expected length, so stop the search here.
                    print "found a good pattern"
                    break
        else:
            minsup = minsup - 1
        numberOfTry += 1
for i in range(numOfFreSet):
    if len(sortedFreSet[i][0]) <= 1:
        sortedFreSet = sortedFreSet[:i]
        break

# for i in range(numOfFreSet):
items = sortedFreSet[0][0]
sup = sortedFreSet[0][1]
itemList = []
for i in range(1, len(items)):
    itemList += list(itertools.combinations(items, i))
print(itemList)

dict = {}
for i in range(len(itemList)):
    count = 0
    for j in range(numOfData):
        if set(itemList[i]) in set(data[j]):
            count += 1
    dict[itemList[i]] = count

data_1 = data[0]
print(set(data_1))
print(set(itemList[0]))
'''

import fp_growth as fpg

fre = fpg.find_frequent_itemsets(data, 0.01, True)
print(list(fre))
def enaeseth_fpgrowth(minsup, item_no):
    start = datetime.now()
    transactions, y_res = merge_data(item_no)
    for itemset in find_frequent_itemsets(transactions, minsup):
        print(itemset)
    print(datetime.now() - start)
    items = line.split()  # split a single line/transaction into items, using whitespace as the delimiter
    for item in items:
        transaction.append(item)  # append each item to build the item list for a single transaction
    transactions.append(transaction)  # append each transaction to form a list of lists
    transaction = []  # reset the working transaction

from fp_growth import find_frequent_itemsets  # import the FP-tree library

frequent_sets = []
# collect the itemsets that satisfy a min-sup of 100 from our transaction list
for itemset in find_frequent_itemsets(transactions, 100, include_support=True):
    frequent_sets.append(itemset[1])  # append the support of the itemset
    frequent_sets.append(itemset[0])  # append the itemset itself

# Items at odd indexes are the frequent itemsets; the index of an itemset minus 1
# holds its support. The csv file is formatted with k and k+1 indexes being
# support/itemset pairs, to make finding the closed and maximal sets easier.
print frequent_sets
print len(frequent_sets) / 2  # number of frequent sets found
print datetime.now() - startTime
def csv_read():
    current_dir = os.getcwd()
    counter = 0
    csv_file = current_dir + '/my/yk--my.csv'
    counter = 0
    yangka_txt = current_dir + '/yangka.txt'
    file_yangka = codecs.open(yangka_txt, "w")
    with open(csv_file, 'rb') as csvfile:
        csvreader = _csv.reader(csvfile, delimiter=' ', quotechar='|')
        negative_chinese = '\xe5\x90\xa6'
        postive_chinese = '\xe6\x98\xaf'
        billvalue = ''
        targetvalue = ''
        index_number = 0
        yangka_data = [['0' for i in range(0, 2)] for j in range(0, 90000)]
        for row in csvreader:
            element = str(', '.join(row))
            _element = str(', '.join(row).decode("gb2312"))
            bill_one = _element.split(',')[0]
            bill_two = _element.split(',')[1]
            bill_three = _element.split(',')[2]
            bill_four = _element.split(',')[3]
            bill_five = _element.split(',')[4]
            bill_six = _element.split(',')[5]
            target_one = _element.split(',')[6]
            target_two = _element.split(',')[7]
            target_three = _element.split(',')[8]
            target_four = _element.split(',')[9]
            target_five = _element.split(',')[10]
            target_six = _element.split(',')[11]
            print _element
            # billvalue = billvalue_make(bill_one, bill_two, bill_three, bill_four, bill_five, bill_six)
            # targetvalue = targetvalue_make(target_one, target_two, target_three, target_four, target_five, target_six)
            billvalue = billvalue_make1(bill_one, bill_two, bill_three, bill_four)
            targetvalue = targetvalue_make1(target_one, target_two, target_three, target_four)
            yangka_data[index_number][0] = billvalue
            yangka_data[index_number][1] = targetvalue
            index_number = index_number + 1
        for (itemset, support) in find_frequent_itemsets(yangka_data, 100, True):
            print >> file_yangka, itemset, support
    # temp_array: temporary array used for reading the initial data generated by fp_growth,
    # then adjusting the data order.
    # Example: [T1001,'BYYYY'] 528 -> ['BYYYY', 'T1001'] 528
    temp_array = ['0' for i in range(0, 10000)]
    temp_array_indexer = 0
    file_data = open(yangka_txt)
    for data in file_data:
        print 'data ' + str(data.replace('\n', ' ').replace('\r', ''))
        if len(str(data.split(']')[0])) > 8:
            # adjust_write_sequence: [T1001,'BYYYY'] 528 -> ['BYYYY', 'T1001'] 528
            # .replace('\n', ' ').replace('\r', '') strips the trailing newline from each line of yangka.txt
            temp_array[temp_array_indexer] = adjust_write_sequence(data).replace('\n', ' ').replace('\r', '')
            temp_array_indexer = temp_array_indexer + 1
            counter = counter + 1
    # Read the data from temp_array, sort it, and write it to yangka_sort.txt.
    yanka_sort_file = current_dir + '/yangka_sort.txt'
    yangka_sort = codecs.open(yanka_sort_file, "w")
    yangka_data_mining = [['0' for i in range(0, 2)] for j in range(0, counter)]
    index_counter = 0
    for m in range(0, len(temp_array)):
        data = str(temp_array[m])  # read data from temp_array
        if data != '0':
            yangka_data_mining[index_counter][0] = data.split(']')[0] + ']'  # pattern, such as "[T1001,'BYYYY']"
            yangka_data_mining[index_counter][1] = int(data.split(']')[1])  # number of occurrences of the pattern, such as 528
            index_counter = index_counter + 1
    yangka_data_mining.sort(key=operator.itemgetter(1), reverse=True)  # sort by occurrence count
    for yangka_data in yangka_data_mining:
        yangka_data[1] = str(yangka_data[1])
    for data in yangka_data_mining:
        print >> yangka_sort, data  # write to yangka_sort.txt
#with open("./training_filenames", "r") as f:
#    for line in f.readlines():
#        trainFileName_lang[line.split()[0]] = line.split()[1]
#f = codecs.open("./allLangFiles/" + str(trainFileName_lang[lang]), 'r', encoding='utf-8')
#filewords = f.read()
#corpus = [SnowballStemmer(languageMapping[lang]).stem(word) for word in re.split(';.?', filewords)]
#train = corpus
#fdist = nltk.FreqDist(w for w in corpus)
#vocabulary = set(map(lambda x: x[0], filter(lambda x: x[1] >= 5, fdist.iteritems())))
#train = map(lambda x: x if x in vocabulary else "*unknown*", train)
#estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
#lm2 = NgramModel(2, train, estimator=estimator)
#lm3 = NgramModel(3, train, estimator=estimator)

g = codecs.open('./fpg' + "_" + str(lang) + ".txt", 'w', encoding='utf-8')
frequent_pattern_list = []
for itemset, support in find_frequent_itemsets(line_list_swr_stem_ofwr, 100, True):
    #if len(itemset) == 2:
    #    print lm2.prob(itemset[0], itemset[1:])
    #    print str(itemset[0]) + ' ####' + str(itemset[1:])
    for item in itemset:
        g.write(item + " ")
    g.write(': ' + str(support) + ' \n')
    #elif len(itemset) == 3:
    #    print lm3.prob(itemset[0], itemset[1:])
    #    print str(itemset[0]) + ' ####' + str(itemset[1:])
    #    g.write(str(itemset) + ' : ' + str(support) + ' \n')  # + str(lm3.prob(itemset[0], itemset[1:])) + ' \n')
    #else:
    #    g.write(str(itemset) + ' : ' + str(support) + ' \n')
    #frequent_pattern_list.append(str(itemset))

#dictionary = corpora.Dictionary(frequent_pattern_list)
#corpus = [dictionary.doc2bow(text) for text in frequent_pattern_list]