def process(data_path="../data/DBLP.pkl.gz", dir_path="../result/group_of_year/"): ''' Function:寻找关联项集。 Principle: 运行Apirori算法,寻找频繁项集,然后进行关联项集的挖掘 Use: 用于寻找关联项集 ''' _data, stat = load_data() s_data = split_by_item(_data, stat, "year") # 加载数据,并且按照author、year等进行划分 for s in stat["year"]: print("year = %d" % s) data = s_data[s] _input = [] for d in data: _input.append([]) for author in d['author']: _input[-1].append(author) #加载Apriori模型 miner = apriori() #以最小支持度为3个进行一阶频繁项集的挖掘 miner.L_1(_input, 3) #循环直到没有更多频繁项被挖掘出 while not miner.L[miner.k] == []: print("iter %d, size of C = %d, size of L = %d" % (miner.k, len(miner.C[miner.k]), len(miner.L[miner.k]))) miner.C_k_p_1() miner.L_k_p_1(_input, 3) #保存结果 path = dir_path + str(s) f = gzip.open(path, 'wb') pickle.dump(miner.L, f) f.close()
def main(): """ Step 0: makeFileName() will take the directory where medical records are stored and ... returns path to text file that stores file paths to Medical records. """ filenames = makeFileName() print "\n\nstep 0 done... 6 more to go " print "From the directory of data files : a file containing their path is generated." print "=========================================================================\n\n" """ Step 1: extractSent(filename) will take the text file containing file paths to Medical records.. writes 3 files and returns their paths : -- file1 = Candidate sentences (stop words included) -- file2 = Candidate sentences (stop words removed and Record Id added) -- file3 = Run record file -that writes summary of the run """ fullSentFile, sentFile, runFile = extractSent(filenames) print "step 1 done... 5 more to go " print " This step writes three files: " print "1) candidate sentences (full i.e including stop words)" print "2) candidate sentences (no stop words and contains Id -indicating medical record)" print "3) Run file that summarizes the run process" print "================================================================================\n\n" """ Step 2: makeBagOfWords() takes the sentence file with ids and makes bag of words file This is a csv file. Returns the path to the file. """ bagFile = makeBagOfWords(sentFile) print "step 2 done... 4 more to go " print "This step creates bag of words-csv file for Apriori alg." print "=================================================================================\n\n" """ Step 3: apriori - takes the bag of the word calculates the sets that satisfy the support . apriori(dataset, idList, min_support=5, verbose=True) Returns : file path as string """ mydirectory = os.path.dirname(bagFile) # get the directory of bagFile apOut = apriori(bagFile, mydirectory, min_support=5, verbose=False) print apOut print "step 3 done... 3 more to go " print "This step takes bag of words with ids and runs apriori algorithm." print "All association sets that satisfy support> 5 are generated ." print "ID's are used to count the support at the Report level" print "=================================================================================\n\n" """ Step 4: checkspan(): """ filespan3 = checkSpan(fullSentFile, apOut, mydirectory, 3 ) print "3-span satisfying sets stored in "+ filespan3 filespan5 = checkSpan(fullSentFile, apOut, mydirectory, 5 ) print "5-span satisfying sets stored in "+ filespan5 filespan10 = checkSpan(fullSentFile, apOut, mydirectory, 10 ) print "10-span satisfying sets stored in "+ filespan10 print "step 4 done... 2 more to go " print "Checks if association words lie in k span. " print "K values 3, 5 and 10 are checked - generates 3 files" print "=================================================================================\n\n" """ Step 5: OrderCheck(): """ freq_3 = checkOrder(fullSentFile, filespan3, mydirectory, 3) freq_5 = checkOrder(fullSentFile, filespan5, mydirectory, 5) freq_10 = checkOrder(fullSentFile, filespan10, mydirectory, 10) print "step 5 done... 1 more to go " print " The association sets permutations for e.g. {mother, diabetes} and {diabetes, mother} are checked for frequency in candidate sentences" print " Two files for each run : " print "1) one with all permutations" print "2) ordered Wordlists " print "for each span (3,5,10) are generated." 
print "=================================================================================\n\n" getTopLists(freq_3, 3, mydirectory) getTopLists(freq_5, 5, mydirectory) getTopLists(freq_10, 10, mydirectory) print "Step 6 part 1 done : " print "5 files are generated , each answering the respective questions: " print " a.txt : contain at least one family member?" print " b.txt : contain at least one of the diseases? (disease file in package)" print " c.txt : one family member but no disease? " print " d.txt : both a family member and a disease? " print " e.txt : neither a family nor a disease?" print "check out Put files in : " + mydirectory print "=================================================================================\n\n" familyReport(freq_3, 3, mydirectory) familyReport(freq_5, 5, mydirectory) familyReport(freq_10, 10, mydirectory) print "Step 6 part 2 done : PROCESS COMPLETE !! " print "=================================================================================\n\n"
__author__ = 'Mentu'

from Apriori import loadDataset
from Apriori import apriori
from Apriori import generateRules

dataset = loadDataset()
L, supportdata = apriori(dataset)
rules = generateRules(L, supportdata)
print(rules)
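# A hedged variant of the same calls on a hand-written dataset, assuming the
# project's apriori() accepts any list of transactions shaped like the value
# returned by loadDataset() (a list of item lists); the items are illustrative.
from Apriori import apriori, generateRules

toy_dataset = [["milk", "bread"],
               ["milk", "diapers", "beer"],
               ["bread", "diapers"],
               ["milk", "bread", "diapers"]]
L, support_data = apriori(toy_dataset)
rules = generateRules(L, support_data)
print(rules)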
__author__ = 'simon.hughes'

import GwData
import WordTokenizer
from Apriori import apriori, print_rules

code = "50"

data = GwData.GwData()
xs = WordTokenizer.tokenize(data.documents, spelling_correct=False)
ys = ["CODE_50" if y == 1 else "NOT_50" for y in data.labels_for(code)]

# append the class label to each token list so label-bearing rules can be mined
inputs = [x + [y] for x, y in zip(xs, ys)]

rules, support_data = apriori(inputs, min_support=0.025, max_k=5)
print_rules(rules, support_data, inputs, min_size=2)
#print len(inputs)
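# Hedged illustration of what a mined rule such as {word} -> {CODE_50} expresses,
# computed directly from the label-augmented transactions built above; the helper
# name rule_stats and the example word "pollution" are assumptions, not project code.
def rule_stats(transactions, antecedent, consequent):
    n = len(transactions)
    both = sum(1 for t in transactions if antecedent <= t and consequent <= t)
    ante = sum(1 for t in transactions if antecedent <= t)
    support = float(both) / n if n else 0.0
    confidence = float(both) / ante if ante else 0.0
    return support, confidence

# e.g. rule_stats([set(t) for t in inputs], {"pollution"}, {"CODE_50"})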
def RunningExpr02():
    OurputRslt = apriori(MainFunc4(), 0.02)
    from operator import itemgetter
    # sort by support (descending), breaking ties on the itemset, and print itemsets of size > 1
    for item in sorted(OurputRslt, key=itemgetter(1, 0), reverse=True):
        if len(item[0]) > 1:
            print str(item[0]).replace('[', '').replace(']', '').replace('\'', '') + '\t' + str(item[1])
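# Hedged illustration of the sort used in RunningExpr02(): assuming apriori()
# returns (itemset, support) pairs, itemgetter(1, 0) with reverse=True puts the
# highest-support itemsets first; the toy data below is illustrative only.
from operator import itemgetter

toy_result = [(["beer", "diapers"], 0.4), (["milk"], 0.6), (["bread", "milk"], 0.4)]
for itemset, support in sorted(toy_result, key=itemgetter(1, 0), reverse=True):
    print(", ".join(itemset) + "\t" + str(support))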
# -*- coding: utf-8 -*-
"""
Created on Wed May 20 09:26:13 2020

@author: bululu
"""
from Apriori import apriori, generateRules
import pandas as pd

datafile = 'visit-patterns-by-census-block-group/cbg_patterns.csv'

# load the data
data = pd.read_csv(datafile, encoding='utf-8')
data.info()
brand = data['top_brands']
print(brand)

dataSet = brand
L, suppData = apriori(dataSet)

i = 0
for one in L:
    print("Frequent itemsets of size %s:" % (i + 1), one, "\n")
    i += 1

print("minConf=0.7:")
rules = generateRules(L, suppData, minConf=0.7)
print("\nminConf=0.5:")
rules = generateRules(L, suppData, minConf=0.5)

# plot(rules, method='graph')  # plotting helper is not defined in this script
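# Hedged preprocessing sketch: the project's apriori() is assumed to expect a list
# of transactions (one item list per row), while each top_brands cell appears to be
# a string-encoded list; ast.literal_eval is one way to unpack it. The cell format
# of cbg_patterns.csv and the helper name to_transactions are assumptions.
import ast

def to_transactions(brand_series):
    transactions = []
    for cell in brand_series.dropna():
        try:
            items = ast.literal_eval(cell) if isinstance(cell, str) else cell
        except (ValueError, SyntaxError):
            items = [cell]
        if items:
            transactions.append(list(items))
    return transactions

# e.g. dataSet = to_transactions(brand); L, suppData = apriori(dataSet)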
def process(data_path = "../data/DBLP.pkl.gz", dir_path = "../result/topic_of_year/"): ''' Function:寻找关联项集。 Principle: 运行Apirori算法,寻找频繁项集,然后进行关联项集的挖掘 Use: 用于寻找关联项集 ''' _data, stat = load_data() s_data = split_by_item(_data, stat, "year") for s in s_data.keys(): # 加载数据,并且按照author、year等进行划分 s_data[s] = split_by_item(s_data[s], stat, "Conference") for s_ in s_data[s].keys(): print ("year = %d, Conference = %s" % (s, s_)) data = s_data[s][s_] _input = [] #使用停止词去除一些数据 for d in data: _input.append([]) str_ = re.sub("[\:\.\!\/_,$%^*(+\"\')]+|[+——()?【】“”!,。?、~@#¥%……&*()]+", "", d["title"]) del_list = ["a", "an", "the", "in", "of", "with", "on", "for", "and", "to", "under", "beyond", "as", "from", "-", "by", "using", "when", "is", "are", "about", "at", "via", "aaai", "papers", "usa", "arizona"] words = str_.split() for word in words: if word.lower() not in del_list: _input[-1].append(word.lower()) #加载Apriori模型 miner = apriori() #选择最小支持度为5篇论文 miner.L_1(_input, 5) while not miner.L[miner.k] == []: print ("iter %d, size of C = %d, size of L = %d" % (miner.k, len(miner.C[miner.k]), len(miner.L[miner.k]))) miner.C_k_p_1() miner.L_k_p_1(_input, 5) #保存结果 path = dir_path + str(s) + "_" + str(s_) f = gzip.open(path, 'wb') pickle.dump(miner.L, f) print ("result saved at %s" % path) f.close() def get_result(dir_path="../result/topic_of_year/"): ''' Function:从文件中读取结果 Pinciple:已经执行完之后,从文件中读取结果,进行显示 ''' files = os.listdir(dir_path) ret = {} for file in files: #print(dir_path + file) year, conf = file.split("_") if conf not in ret.keys(): ret[conf] = {} ret[conf][int(year)] = get_L(dir_path + file) return ret def get_topic(res): ''' Funtion:已经执行完之后,获得topic信息 ''' ret = {} for k in res.keys(): ret[k] = {} for k_ in res[k].keys(): ret[k][k_] = res[k][k_][-2] return ret def save_topic_txt(ts, path="../result/topic.txt"): ''' Function:保存分组信息到txt文件 ''' f = open(path, "w", encoding="utf8") for key in ts.keys(): f.write("\n\nConference %s:\n" % key) for key_ in ts[key].keys(): f.write("year %d:\n" % key_) for g in range(len(ts[key][key_])): f.write(("%d, " % (g+1))) for words in ts[key][key_][g]: f.write(words + " ") f.write("\n") f.close() if __name__ == "__main__": #执行关联规则挖掘 process() Ls = get_result() ts = get_topic(Ls) save_topic_txt(ts) #显示结果 for key in ts.keys(): print ("\nConference %s:\n" % key) for key_ in ts[key].keys(): print ("year %d:" % key_) for g in range(len(ts[key][key_])): print (("%d" % (g+1)), end=". ") for words in ts[key][key_][g]: print (words, end=', ') print ("")