def fi(data):
    print("Using apriori for fim : ")
    freq_list = fim.apriori(tracts=data, supp=5)
    print("The frequent item list is : ")
    print(freq_list)
    rules = fim.apriori(tracts=data, target='r', eval='c', report='c')
    print("The rules are : ")
    print(rules)
    rules = fim.apriori(tracts=data, target='r', eval='l', report='l')
    print("The rules are (evaluated with lift): ")
    print(rules)
    print("lfi using apriori : ")
    lfi(freq_list)

    print("Using fp-growth for fim : ")
    freq_list = fim.fpgrowth(tracts=data, supp=5)
    print("The frequent item list is : ")
    print(freq_list)
    rules = fim.fpgrowth(tracts=data, target='r', eval='c', report='c', conf=60)
    print("The rules are (evaluated with confidence): ")
    print(rules)
    rules = fim.fpgrowth(tracts=data, target='r', eval='l', report='l', conf=60)
    print("The rules are (evaluated with lift): ")
    print(rules)
    print("lfi using fpgrowth is : ")
    lfi(freq_list)
def generate_associations(transactions, min_sup, problems, methods, datasets):
    apriori_patterns = apriori(transactions, supp=-min_sup)
    print '-------- Apriori --------'
    output = []
    for (pattern, support) in sorted(apriori_patterns, key=lambda x: -x[1]):
        print pattern, support
    print 'Number of patterns:', len(apriori_patterns)

    rules = apriori(transactions, target='r', supp=-5, conf=90, report='sc')
    print '-------- One-to-Many Association Rules --------'
    counter = 0
    for (ruleleft, ruleright, support, confidence) in sorted(rules, key=lambda x: x[0]):
        if ruleleft in datasets:
            for rule in ruleright:
                if rule not in datasets:
                    counter += 1
                    print ruleleft, '-->', ruleright, support, confidence
        elif ruleleft in problems:
            for rule in ruleright:
                if rule not in problems:
                    counter += 1
                    print ruleleft, '-->', ruleright, support, confidence
        elif ruleleft in methods:
            for rule in ruleright:
                if rule not in methods:
                    counter += 1
                    print ruleleft, '-->', ruleright, support, confidence
        #print ruleleft,'-->',ruleright,support,confidence
    print 'Number of rules:', len(rules)
def main():
    ###########################################################################
    # Some basic data analysis
    ###########################################################################
    data = read_file("/Users/zxj/cs535/data/marketing.data", lambda x: x.split(","))
    frequent_itemset = apriori(data, supp=-3, zmin=2, target='s', report='a')
    rules = apriori(data, supp=-3, zmin=2, target='r', report='rCL')
    print("Frequent itemsets are: ")
    print(frequent_itemset)
    print("Rules are:")
    print(rules)
def mine_frequent_itemsets(self, pandas_df, minsup):
    txns_classless = TransactionDB.from_DataFrame(pandas_df.iloc[:, :-1])
    frequent_itemsets = fim.apriori(txns_classless.string_representation,
                                    supp=minsup * 100, report="s")
    return frequent_itemsets
def get_onetomany_rules(transactions, output_file):
    # One-to-Many Association Rules (ResponseBot 8)
    rules = apriori(transactions, target='r', supp=-1000, conf=70, report='sc')
    # output
    for (ruleleft, ruleright, support, confidence) in sorted(rules, key=lambda x: x[0]):
        #p = ','.join(pattern)
        output_file.write('{} --> {} {} {}\n'.format(
            ruleleft, ruleright, str(support), str(confidence)))
def run_fim_apriori(df, minsup):
    print("running fim apriori function")
    processed_df = process_dataset(df)
    print("dataset processed")
    result_raw = fim.apriori(processed_df, supp=(minsup * 100))
    print("apriori run")
    result = list(map(lambda i: list(i[0]), result_raw))
    print("apriori results processed")
    return result
def Apriori_main(data_fname, minSupport, out_fname='Apriori_results.txt'):
    lines, tid = readDataset(data_fname)
    t1 = clock()
    temp_freq = apriori(tid, target='s', supp=float(minSupport * 100), conf=100)
    CPU_time = clock() - t1
    freq_items = convert2dic(temp_freq, lines)
    printResults(data_fname, minSupport, CPU_time, freq_items, out_fname)
    return (freq_items, CPU_time)
def do_apriori(transactions, output_file):
    print("Performing Apriori...")
    # http://www.borgelt.net/pyfim.html
    patterns = apriori(transactions, supp=-1000)  # +: percentage, -: absolute number
    for (pattern, support) in sorted(patterns, key=lambda x: -x[1]):
        # pattern is a tuple
        if len(set(pattern)) <= 1:
            continue
        p = ','.join(pattern)
        output_file.write('{} {} \n'.format(p, str(support)))
    print('Number of patterns: {}'.format(len(patterns)))
def run_apriori(transactions, min_sup, author_number):
    apriori_patterns = apriori(transactions, supp=-min_sup)
    print '-------- Apriori --------'
    output = []
    for (pattern, support) in sorted(apriori_patterns, key=lambda x: -x[1]):
        if len(pattern) < author_number:
            continue
        print pattern, support
        output.append([pattern, support])
    print 'Number of patterns:', len(apriori_patterns)
    return output
def find_by_year(years):
    print 'For year %s:' % years
    t_years = get_transactions('%s.csv' % years)
    report_years = fim.apriori(t_years, supp=0, conf=0, zmax=4)
    result_years = {}
    for i in range(1, 5):
        result_years[i] = {}
    for words, counts in report_years:
        result_years[len(words)][words] = counts[0]
    for i in result_years:
        print 'Top 20 most frequent patterns with length %d' % i
        print_top_n(result_years[i], 20)
def _generate_rules_for_user(recs, settings):
    tracts = []
    for rec in recs:
        if rec.selected and rec.forecast:
            tracts.append(_process_recommendations(rec, settings))
    metadata = {'total_trips': len(tracts)}
    if not tracts:
        return None, metadata
    return [listify(rule) for rule in apriori(tracts, supp=-1)], metadata
def pass_one(inputfile, support):
    result = []
    p = 0.05  # 0.6
    baskets = generate_baskets(inputfile)
    sample_baskets = sampling(baskets, p)
    result.append(len(sample_baskets) * 1. / len(baskets))
    adjusted_support = 0.9 * p * support  # use lower threshold
    frequent_items = [x[0] for x in fim.apriori(sample_baskets, supp=adjusted_support)]
    result.append(frequent_items)
    print frequent_items
    negative_border_items = generate_negative_border(frequent_items, sample_baskets)
    print negative_border_items
    result.append(negative_border_items)
    return result
def mine(self, df, supp, zmin, zmax, is_count=False):
    """Mine maximal frequent itemsets from a netflow dataframe.

    Arguments:
        df {dataframe} -- netflow dataframe
        supp {number} -- minimum support (percentage, or absolute count when is_count is True)
        zmin {number} -- minimum number of items in an itemset
        zmax {number} -- maximum number of items in an itemset

    Keyword Arguments:
        is_count {bool} -- True if minsup is an absolute number, else a percentage (default: {False})
    """
    transacts = self.netflow_to_transc(df)
    supp = -supp if is_count else supp  # positive: percentage, negative: absolute number
    return apriori(transacts, target='m', supp=supp, zmin=zmin, zmax=zmax)
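# Minimal standalone sketch (not taken from any snippet above) illustrating the
# pyfim supp sign convention relied on throughout these examples: positive
# values are percentages, negative values are absolute transaction counts.
# The toy transactions below are invented for illustration.
import fim

toy_tracts = [[1, 2, 3], [1, 2], [2, 3], [1, 3], [1, 2, 3]]

# itemsets of size >= 2 contained in at least 40% of the 5 transactions (i.e. 2 of them)
print(fim.apriori(toy_tracts, supp=40, zmin=2))

# the same threshold expressed as an absolute count of 2 transactions
print(fim.apriori(toy_tracts, supp=-2, zmin=2))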
def run_fim_apriori(df: pd.DataFrame, min_support_thr: float) -> List[Transaction]:
    try:
        import fim
    except Exception as e:
        raise e
    print("running fim apriori function")
    dataset_transactions: List[Transaction] = dataframe_to_list_of_transactions(df)
    print("dataset processed")
    frequent_itemsets_raw = fim.apriori(dataset_transactions, supp=(min_support_thr * 100))  # List[Tuple]
    print("apriori run")
    frequent_itemsets: List[Transaction] = list(map(lambda i: list(i[0]), frequent_itemsets_raw))
    print("apriori results processed")
    return frequent_itemsets
def pass_one(transactions, support, epsilon, delta, mu):
    result = []
    n = calculate_bound(epsilon, delta)
    adjusted_support = support - lower_by(n, mu)
    sampled_transactions = random.sample(transactions, n)  # TODO why?
    result.append(len(sampled_transactions) * 1. / len(transactions))
    frequent_items = [x[0] for x in apriori(sampled_transactions, supp=adjusted_support * 100)]
    result.append(frequent_items)
    negative_border_items = generate_negative_border(frequent_items, sampled_transactions)
def generateCARs(transactionDB: TransactionDB, support: float = 1, confidence: float = 50, maxlen: int = 10, **kwargs):
    """Function for generating ClassAssociationRules from a TransactionDB

    Parameters
    ----------
    transactionDB : TransactionDB
    support : float
        minimum support in percent if positive,
        absolute minimum support if negative
    confidence : float
        minimum confidence in percent if positive,
        absolute minimum confidence if negative
    maxlen : int
        maximum length of mined rules
    **kwargs :
        arbitrary number of arguments that will be
        provided to the fim.apriori function

    Returns
    -------
    list of CARs
    """
    appear = transactionDB.appeardict
    rules = fim.apriori(transactionDB.string_representation, supp=support, conf=confidence,
                        mode="o", target="r", report="sc", appear=appear, zmax=maxlen, **kwargs)
    return createCARs(rules)
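# Hedged usage sketch for generateCARs, assuming the same module also provides
# the TransactionDB class with from_DataFrame() used in mine_frequent_itemsets
# above; the toy DataFrame, its column names and the thresholds are invented
# for illustration only.
import pandas as pd

toy_df = pd.DataFrame({
    "outlook": ["sunny", "rainy", "sunny", "rainy"],
    "windy":   ["yes",   "no",    "no",    "yes"],
    "play":    ["no",    "yes",   "yes",   "no"],  # class label in the last column
})

txns = TransactionDB.from_DataFrame(toy_df)
cars = generateCARs(txns, support=25, confidence=50, maxlen=3)
for car in cars:
    print(car)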
tid = int(argv[1])
if tid < -2:
    print(fpgrowth.__doc__)
elif tid < -1:
    print(eclat.__doc__)
elif tid < 0:
    print(apriori.__doc__)
else:
    tracts = [[1, 2, 3], [1, 4, 5], [2, 3, 4], [1, 2, 3, 4], [2, 3],
              [1, 2, 4], [4, 5], [1, 2, 3, 4], [3, 4, 5], [1, 2, 3]]
    print('transactions:')
    for t in tracts:
        print(t)
    if tid < 1:
        print('apriori(tracts, supp=-3, zmin=2):')
        for r in apriori(tracts, supp=-3, zmin=2):
            print r
    elif tid < 2:
        print('eclat(tracts, supp=-3, zmin=2):')
        for r in eclat(tracts, supp=-3, zmin=2):
            print r
    elif tid < 3:
        print('fpgrowth(tracts, supp=-3, zmin=2):')
        for r in fpgrowth(tracts, supp=-3, zmin=2):
            print r
    else:
        print('fim(tracts, supp=-3, zmin=2, report=\'#\'):')
        for r in fim(tracts, supp=-3, zmin=2, report='#'):
            print r
#SUPPORT = -3
#ZMIN = 2
SUPPORT = -0.01 * len(data)
ZMIN = 1
#ZMAX = 5
CONF = 80
LIFT = 1.03
ITEMSET_REPORT = 'a'
#RULES_REPORT = 'rCL'
RULES_REPORT = 'C'

#frequent_itemset = apriori(data, supp=SUPPORT, zmin=ZMIN, conf=CONF, eval='l', thresh=LIFT, target='s', report=ITEMSET_REPORT)
frequent_itemset = apriori(data, supp=SUPPORT, zmin=ZMIN, target='s', report=ITEMSET_REPORT)
#rules = apriori(data, supp=SUPPORT, zmin=ZMIN, conf=CONF, eval='l', thresh=LIFT, target='r', report=RULES_REPORT)
rules = apriori(data, supp=SUPPORT, zmin=ZMIN, target='r', report=RULES_REPORT)

#for item in data:
#    print(item)
print('========================')
#for itemset in frequent_itemset: print(itemset)
for i in range(5):
    print(random.choice(frequent_itemset))
print('------------------------')
#for rule in rules: print(rule)
for i in range(5):
    transactions.append([x for x in line.rstrip('\n').split(',') if filter_fun_str(x)])

trcount = len(transactions)
print(trcount)
transactions = filter(lambda x: len(x) >= 1, transactions)
print(len(transactions))

def ele_to_str(ele):
    global db
    return util.ele_to_str(db, ele)

sets = map(set, transactions)

print('running algorithm', file=sys.stderr)
before = time.time()
if algo == "apriori":
    s = fim.apriori(transactions, supp=2)
    s = sorted(s, key=lambda x: x[1])
    for items, supp in s:
        items = map(ele_to_str, items)
        print(u"{} items: {}".format(supp / float(trcount), "|".join(items)).encode('utf-8'))
if algo == "eclat":
    s = fim.eclat(transactions, supp=2)
    s = sorted(s, key=lambda x: x[1])
    for items, supp in s:
        items = map(ele_to_str, items)
        print(u"{} items: {}".format(supp, "|".join(items)).encode('utf-8'))
elif algo == "eclat-rules":
    rules = fim.eclat(transactions, target='r', report='aC')
    rules = sorted(rules, key=lambda x: x[3])
    for consequence, condition, support_count, confidence_percent in rules:
        condition = map(ele_to_str, condition)
__author__ = 'ssatpati'

import re
from fim import apriori

baskets = []
with open('ProductPurchaseData.txt', 'r') as f:
    for line in f:
        items = re.split(r'\s', line)
        items.sort()
        baskets.append(items)

for r in apriori(baskets, target='r', zmax=2, supp=-100, report='c', eval='c', conf=90):
    if r[0]:
        print '%s\t%s\t%s' % (r[0], r[1], r[2])
import fim
import sys

from data import load_transactions

if __name__ == '__main__':
    filename = sys.argv[1]
    threshold = int(sys.argv[2])
    transactions = load_transactions(filename)
    fis = fim.apriori(transactions)
    print fis
__author__ = 'Daniel Bernardes, Mamadou Diaby, Raphael Fournier, Francoise Fogelman Soulie and Emmanuel Viennet'
__version__ = '1.0'

import sys, fim

if __name__ == "__main__":

    if len(sys.argv) > 1:
        threshold = 100 * float(sys.argv[1])
    else:
        threshold = 1  # default: 1%

    fin = sys.stdin
    profiles = []
    for line in fin:
        tokens = line.split()
        profiles.append(map(int, tokens[1:]))
    fin.close()

    confidence = fim.apriori(profiles, max=2, supp=threshold, report='e', eval='c', thresh=threshold)

    for triplet in confidence:
        print '{0:d} {1:d} {2:.5f}'.format(triplet[0][0], triplet[0][1], triplet[1][0])
# edges
pos = {}
pos.update(pos_a)
pos.update(pos_b)
nx.draw_networkx_edges(g, pos, edgelist=nx.edges(g), width=1, alpha=0.8, edge_color='g')
nx.draw_networkx_labels(g, pos, font_size=10, font_family='sans-serif')
plt.title('Graph representation')
plt.show()

###############################################################################
# Now do rule finding
###############################################################################
frequent_itemset = apriori(data, supp=-3, zmin=2, target='s', report='a')
rules = apriori(data, supp=-3, zmin=2, target='r', report='rCL')
print(frequent_itemset)
print(rules)

frequent_itemset_1 = []
frequent_itemset_2 = []
frequent_itemset_3 = []

'''
original apriori, generate 1 item
'''
def apriori(data, supp=3, zmin=2, target='s', report='a'):
    frequent_itemset = []
            transaction.add(words[i] + '_' + words[i + 1])
            i += 2
            continue
        #if words[i] in stopwords or len(words[i]) == 1 or words[i].isdigit() or not words[i].isalnum() or len(words[i]) > 25:
        #    i += 1
        #    continue
        if not check_string_guality(words[i]):
            i += 1
            continue
        transaction.add(words[i])
        i += 1
    if not transaction:
        continue
    author_transactions.append(list(transaction))

# output
#for transaction in author_transactions:
#    print transaction

print "Done with author transactions"

#patterns = apriori(author_lists, supp=-12)
patterns = apriori(author_transactions, supp=-10)

print '-------- Author Affiliation Apriori --------'
#for (pattern,support) in sorted(patterns,key=lambda x:-x[1]):
#    if len(pattern) <= 1: continue
#    print pattern,support
for (pattern, support) in sorted(patterns, key=lambda x: -x[1]):
    # pattern is a tuple
    if len(pattern) <= 1:
        continue
    output_file.write('{} {} \n'.format(pattern, str(support)))

print 'Number of patterns:', len(patterns)
plt.subplot(121)
nx.draw_networkx_nodes(g, pos_a, nodelist=a, node_color='r', node_size=300, alpha=0.8)
nx.draw_networkx_nodes(g, pos_b, nodelist=b, node_color='b', node_size=300, alpha=0.8)

# edges
pos = {}
pos.update(pos_a)
pos.update(pos_b)
nx.draw_networkx_edges(g, pos, edgelist=nx.edges(g), width=1, alpha=0.8, edge_color='g')
nx.draw_networkx_labels(g, pos, font_size=10, font_family='sans-serif')
plt.title('Graph representation')
plt.show()
'''

###############################################################################
# Now do rule finding
###############################################################################
#frequent_itemset = apriori(data, supp=10, zmin=2, target='s', report='a')
rules = apriori(data, supp=10, zmin=2, zmax=5, target='r', report='SCl')
#print(frequent_itemset)
#print(rules)

###############################################################################
# sort the result
r = sorted(rules, reverse=True, key=lambda x: x[4])
print(r)
print(len(r))
def build(self):
    """
    Takes variables from constructor and outputs anomaly scores
    for each row/observation as a pandas data frame
    """
    # create variables which hold number of rows and columns
    rows = len(self.data.index)
    cols = len(self.data.columns)

    # default value of mlen parameter is equal to number of columns
    if self.mlen == 0.5:
        self.mlen = cols

    # adding column name to each row
    data2 = pd.DataFrame({col: str(col) + '=' for col in self.data},
                         index=self.data.index) + self.data.astype(str)

    # transforming dataset to list of lists
    records = []
    for i in range(0, rows):
        records.append([str(data2.values[i, j]) for j in range(0, cols)])

    # creating transaction dataset
    print("Creating transactions from a dataset")
    t = time.process_time()
    te = TransactionEncoder()
    oht_ary = te.fit(records).transform(records, sparse=True)
    elapsed_time = time.process_time() - t
    print("Transactions created in: " + str(elapsed_time))

    # creating sparse data frame from transaction encoder
    sparse_df = pd.SparseDataFrame(oht_ary, columns=te.columns_, default_fill_value=False)

    # using apriori to find frequent itemsets
    supp = self.support / 100
    print("Running apriori with settings: support={}, maxlen={}".format(supp, self.mlen))
    t = time.process_time()
    apr = fim.apriori(records, target="s", supp=self.support, zmax=self.mlen, report="s")
    elapsed_time = time.process_time() - t
    print("Apriori finished in: " + str(elapsed_time))

    # adding new column: length of the rule
    frequent_itemsets = pd.DataFrame(apr)
    frequent_itemsets['length'] = frequent_itemsets[0].apply(lambda x: len(x))
    print(frequent_itemsets.index)

    # creating numpy arrays of lengths and qualities so operations such as multiplication can be done
    fiLenghts = np.array([frequent_itemsets['length']], np.int8)
    fiQualities = np.array([frequent_itemsets[1]], np.float16)

    # converting itemsets to frozensets so subsetting can be done
    print("Converting datasets to frozensets and computing coverages")
    t = time.process_time()
    items_list = []
    fi = frequent_itemsets[0]
    for i in fi:
        items_frozen = frozenset(i)
        items_list.append(items_frozen)

    # converting transactions to frozensets
    transactions = []
    for i in records:
        i = frozenset(i)
        transactions.append(i)

    # list that will temporarily store coverages
    tmp = []
    print("Computing coverages")
    # comparing each transaction with itemsets
    for i in items_list:
        for i2 in transactions:
            if i.issubset(i2):
                tmp.append(1)
            else:
                tmp.append(0)

    # converting coverages to numpy array
    coverages = np.array([tmp])
    elapsed_time = time.process_time() - t
    print("Computing coverages finished in: " + str(elapsed_time))

    # converting coverages to valid shape and creating transpose matrix
    fiCoverages = coverages.reshape(len(frequent_itemsets), rows)
    fiCoveragesT = np.array(np.transpose(fiCoverages))
    fiQualitiesT = np.transpose(fiQualities)

    # compute basic score for each coverage
    t = time.process_time()
    print("Computing results for each coverage")
    result = np.array(1 / (fiLenghts * np.transpose(fiQualities)), dtype=np.float16)
    print(result)
    elapsed_time = time.process_time() - t
    print("Computing results finished in: " + str(elapsed_time))

    # create matrix with results on diagonal
    result2 = np.diagonal(result)
    shape = (len(frequent_itemsets), len(frequent_itemsets))
    # a zero matrix of the right shape is needed so the values sit only on the diagonal
    diagonalHelper = np.zeros(shape)
    np.fill_diagonal(diagonalHelper, result2)

    # matrix multiplication
    print("Computing individual scores")
    scores = np.array(np.matmul(fiCoveragesT, diagonalHelper))
    print("Done")

    # prepare items for subsetting
    data_items = sparse_df.columns.values.tolist()
    dataItems = pd.DataFrame(data_items)

    # coverage of each data item
    dataItemsList = []
    # converting to frozenset so subsetting can be done
    for i in range(0, len(dataItems.values)):
        dataItemsList.append(frozenset([str(dataItems.values[i, j]) for j in range(0, 1)]))

    dataItemsCoverage = []
    # subsetting columns with items
    for i in dataItemsList:
        for i2 in items_list:
            if i2.issubset(i):
                dataItemsCoverage.append(1)
            else:
                dataItemsCoverage.append(0)

    # converting coverages to numpy array
    dataItemsCoverageArr = np.array([dataItemsCoverage])
    tmp4 = dataItemsCoverageArr.reshape(len(dataItems.values), len(frequent_itemsets))

    # variable that stores sum of columns
    print("Computing penalizations")
    t = time.process_time()
    colSums = np.array(self.data.count(axis=1))
    # variable that stores sum of rows
    rowSums = np.array([fiCoveragesT.sum(axis=1)])

    # preparing parts of the equation
    part1 = np.matmul(fiCoveragesT, np.transpose(tmp4))
    part2 = part1.sum(axis=1)
    # compute how many items of each transaction are not covered by appropriate frequent itemsets
    fiC = colSums - part2
    elapsed_time = time.process_time() - t
    print("Computing penalizations finished in: " + str(elapsed_time))

    # compute final score as a mean value of scores and penalizations:
    # (sum of scores + penalization*number of transactions) / (number of scores + penalization)
    print("Computing scores for each row")
    t = time.process_time()
    scorings = (scores.sum(axis=1) + fiC * rows) / (rowSums + fiC)
    elapsed_time = time.process_time() - t
    print("Computing final scores finished in: " + str(elapsed_time))

    # creating pandas data frame with Scores column
    columnOutput = ["Scores"]
    output = pd.DataFrame(data=np.transpose(scorings), index=data2.values,
                          columns=columnOutput, dtype=object)

    # print anomaly scores for each row/observation
    print(output)
    # print the row(s) with the maximum anomaly score
    print(output[output['Scores'] == output['Scores'].max()])
    print(fiC)
    return output
hotel = json.load(jsonHotel)
hotelAddress = hotel.get('HotelInfo').get('Address').encode('ascii', 'ignore').lower()
hotelAddress = re.split(r'<.[^>]+>([^<]*)<.[^>]+>', hotelAddress)
hotelName = hotel.get('HotelInfo').get('Name').encode('ascii', 'ignore').lower()
nameTokens = tokenizer.tokenize(hotelName)
stopset = set().union(stopset, nameTokens)
stopset = set().union(stopset, hotelAddress)
#tokens = word_tokenize(hotel.get('Reviews')[0].get('Content').encode('ascii', 'ignore'))
#print removeStopWords(tokens)

test = [('l1', 'l2', 'l5'), ('l2', 'l4'), ('l2', 'l3'), ('l1', 'l2', 'l4'),
        ('l1', 'l3'), ('l2', 'l3'), ('l1', 'l3'), ('l1', 'l2', 'l3', 'l5'),
        ('l1', 'l2', 'l3')]

#trans = []
#
#for review in hotel.get('Reviews'):
#    content = review.get('Content').encode('ascii', 'ignore')
#    allTokens = tokenizer.tokenize(content.lower())
#    strippedTokens = removeStopWords(allTokens)
#    trans += [(strippedTokens)]

setmax = 2
support = 35
confidence = 50

#feats = apriori(trans, zmax=setmax, supp=support, conf=confidence, target='r', report='CS')
testFeats = apriori(test, zmin=2, zmax=setmax, supp=support, conf=confidence, target='r', report='Cabh')

#file = open('Hotel'+fileNum+'_z'+str(setmax)+'_s'+str(support)+'_c'+str(confidence)+'.txt', 'w')
#file.write(str(feats))
#file.close()

print testFeats
        if words[i] in stopwordset or len(words[i]) == 1 or words[i].isdigit():
            i += 1
            continue
        transaction.add(words[i])
        i += 1
    transactions.append(list(transaction))

# output
for transaction in transactions:
    print transaction


# In[5]:

# http://www.borgelt.net/pyfim.html
from fim import apriori, fpgrowth

patterns = apriori(transactions, supp=-3)  # +: percentage -: absolute number

# output
print '-------- Apriori --------'
for (pattern, support) in sorted(patterns, key=lambda x: -x[1]):
    print pattern, support
print 'Number of patterns:', len(patterns)


# In[6]:

patterns = fpgrowth(transactions, supp=-3)

# output
print '-------- FP-Growth --------'
for (pattern, support) in sorted(patterns, key=lambda x: -x[1]):
    print pattern, support
print 'Number of patterns:', len(patterns)
__author__ = 'chengmin'

import fim, os

#with open(os.getcwd()+'/APRIORIREADME.TXT', "w") as fout:
#    fout.write(fim.apriori.__doc__)
#    fout.write(str(help(fim.apriori)))
#    fout.write(fim.apriori)

fin = open("FIMtest").read()
print(fim.apriori(fin))
    df[i] = i + " " + df[i].astype(str)

df["credit_default"] = "default " + df["credit_default"].astype(str)
baskets = df.values.tolist()

'''
itemsets = apriori(baskets, supp=10, zmin=2, target='m')
print(len(itemsets))
foo = open("itemsets", "w")
for itemset in itemsets:
    foo.write("" + str(itemset))
    foo.write("\n\n")
'''

rules = apriori(baskets, supp=10, zmin=1, target='r', conf=40, report='ascl')
f = open("rules", "w")
count = 0
lista = list()
for rule in rules:
    if rule[5] > 2 and "female" in rule[1]:
        count += 1
        f.write("" + str(rule))
        f.write("\n\n")
        lista.append(rule[0])
print(count)
print(set(lista))

'''
#FREQUENT ITEMSET
def mbasket(self, data_p, support_par, confidence_par, method='apriori', lift_par=1.2):
    """Mine frequent itemsets with the chosen algorithm, derive association
    rules and build per-user recommendations.

    :param data_p: tuple of (binary user/product pivot table, transaction list)
    :param support_par: minimum support as a fraction
    :param confidence_par: minimum confidence
    :param method: one of 'apriori', 'fpgrowth', 'eclat', 'relim', 'sam', 'ista'
    :param lift_par: minimum lift
    :return: [rule_cons, recom_new, mba_time, sum_recom_already_satisfied, run_time]
    """
    start0 = time()

    ## Apriori analysis + association rules creation
    # find association rules with default settings
    rules = pd.DataFrame()
    if method == 'fpgrowth':
        start = time()
        frequent_itemsets = pd.DataFrame(
            fpgrowth(data_p[1], supp=support_par * 100, zmin=1, target='s', report='s', mode='o'),
            columns=['itemsets', 'support'])
        run_time = round(time() - start, 3)
        print("fpgrowth() -", run_time, "s")
    elif method == 'eclat':
        start = time()
        frequent_itemsets = pd.DataFrame(
            eclat(data_p[1], supp=support_par * 100, zmin=1, target='s', report='s', mode='o'),
            columns=['itemsets', 'support'])
        run_time = round(time() - start, 3)
        print("eclat() -", run_time, "s")
    elif method == 'relim':
        start = time()
        frequent_itemsets = pd.DataFrame(
            relim(data_p[1], supp=support_par * 100, zmin=1, target='s', report='s'),
            columns=['itemsets', 'support'])
        run_time = round(time() - start, 3)
        print("relim() -", run_time, "s")
    elif method == 'sam':
        start = time()
        frequent_itemsets = pd.DataFrame(
            sam(data_p[1], supp=support_par * 100, zmin=1, target='s', report='s'),
            columns=['itemsets', 'support'])
        run_time = round(time() - start, 3)
        print("sam() -", run_time, "s")
    elif method == 'ista':
        start = time()
        frequent_itemsets = pd.DataFrame(
            ista(data_p[1], supp=support_par * 100, zmin=1, report='s'),
            columns=['itemsets', 'support'])
        run_time = round(time() - start, 3)
        print("ista() -", run_time, "s")
    else:
        start = time()
        frequent_itemsets = pd.DataFrame(
            apriori(data_p[1], supp=support_par * 100, zmin=1, target='s', report='s', mode='o'),
            columns=['itemsets', 'support'])
        run_time = round(time() - start, 3)
        rules = self.find_rules(frequent_itemsets, lift_par, confidence_par)
        print("apriori() -", run_time, "s")

    # users with antecedents from the rules calculated above
    if rules.shape[0] > 0:
        pivot_binary_tr = data_p[0].transpose()
        recom = {}
        pb = {}
        rules['antecedents'] = rules['antecedents'].apply(lambda x: frozenset(x))
        for user in pivot_binary_tr.columns:
            products_bought = pivot_binary_tr.index[pivot_binary_tr[user] == 1]
            pb[user] = products_bought
            suitable_rules = []
            for ante in rules['antecedents'].iteritems():
                if ante[1].issubset(products_bought):  # to be improved
                    suitable_rules.append(ante[0])
            recom[user] = suitable_rules

        recom = pd.DataFrame.from_dict(recom, orient='index').stack().reset_index(
            level=1, drop=True).reset_index()
        recom.columns = ['review_profilename', 'Rule']

        # products bought - to exclude these products from recommendations
        pb2 = pd.DataFrame.from_dict(pb, orient='index').stack().reset_index(
            level=1, drop=True).reset_index()
        pb2.columns = ['review_profilename', 'antecedents1']

        rule_cons = rules[['antecedents', 'consequents']].reset_index()
        rule_cons['consequents'] = [i for i, *_ in rule_cons['consequents']]  # change format from frozensets to normal
        rule_cons['antecedents'] = [list(i) for i in rule_cons['antecedents']]
        rule_cons.columns = ['Rule', 'antecedents', 'consequents']

        recom = recom.merge(rule_cons, on='Rule')
        recom.drop_duplicates(['review_profilename', 'consequents'], keep='first', inplace=True)

        # exclude from recommendations products already bought
        recom_already_satisfied = pb2.merge(
            recom,
            left_on=['review_profilename', 'antecedents1'],
            right_on=['review_profilename', 'consequents'])
        recom_already_satisfied['beer_already_known'] = 1
        sum_recom_already_satisfied = recom_already_satisfied['beer_already_known'].sum()

        recom_new = recom.merge(
            recom_already_satisfied[['review_profilename', 'Rule', 'consequents', 'beer_already_known']],
            on=['review_profilename', 'Rule', 'consequents'],
            how='left')
        recom_new = recom_new[recom_new['beer_already_known'] != 1][[
            'review_profilename', 'Rule', 'antecedents', 'consequents']]
    else:
        rule_cons = 0
        recom_new = 0
        sum_recom_already_satisfied = 0

    mba_time = round(time() - start0, 2)
    print("mbasket() -", mba_time, "s")

    return [rule_cons, recom_new, mba_time, sum_recom_already_satisfied, run_time]
import fim

item_counts = {}
frequencies = []
rules = []

# read input file by line and split to
# store each line as list of items
# fim apriori expects this data structure as input
baskets = [line.split()
           for line in open('ProductPurchaseData.txt').read().strip().split('\n')]

# target = 's'    -> frequent item sets
# supp = negative -> minimum support of an item set as an absolute count
# zmax = number   -> maximum number of items per item set
item_sets = fim.apriori(baskets, target='s', supp=-100, zmax=2)

for r in item_sets:
    # apriori reports in the format ((itemset), support)
    item_set, item_count = r
    # k = 1
    if len(item_set) == 1:
        item_counts[item_set[0]] = item_count
    # k = 2
    elif len(item_set) == 2:
        item1, item2 = item_set
        # lexicographical ordering of the rules:
        # report the rule a->b but not the rule b->a
        if item1 < item2:
            frequencies.append(((item1, item2), float(item_count)))
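# Hypothetical continuation (not part of the original snippet): derive the
# confidence of each ordered pair rule a -> b as count(a, b) / count(a),
# reusing the k=1 counts and pair counts gathered above.
for (item1, item2), pair_count in frequencies:
    rules.append(((item1, item2), pair_count / item_counts[item1]))
    rules.append(((item2, item1), pair_count / item_counts[item2]))

# highest-confidence rules first
rules.sort(key=lambda r: -r[1])
print(rules[:5])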
import sqlite3
from fim import apriori, fpgrowth


def extract_data(db):
    command = 'SELECT keyword, paper_id from Keywords'
    output = db.execute(command)
    transactions = {}
    for t in output:
        if t[1] not in transactions:
            transactions[t[1]] = [t[0]]
        else:
            transactions[t[1]].append(t[0])
    list_tran = []
    for t in transactions:
        list_tran.append(transactions[t])
    return list_tran


if __name__ == '__main__':
    conn = sqlite3.connect('data/database.db')
    c = conn.cursor()
    transactions = extract_data(c)
    print 'loaded data'

    apriori_patterns = apriori(transactions, supp=-7)
    print '-------- Apriori --------'
    output = []
    for (pattern, support) in sorted(apriori_patterns, key=lambda x: -x[1]):
        if len(pattern) > 1:
            print pattern, support
    print 'Number of patterns:', len(apriori_patterns)
def BBMax_Accuracy_main(fname1, fname2, fname3, sup, m_time):
    global tid
    global lines
    change_raw_data = 0
    lines, tid = readDataset(fname3)
    abs_supp = ceil(sup * lines - 0.5)
    F = readLargeData(fname1)
    S = minSet(readSensitiveSet(fname2))
    SS = supersets(S, F.keys())
    Rev_Fd = list(set(F) - SS)
    start_time = clock()
    Rev_pos_bord = convert2frozen_m(apriori(Rev_Fd, target='m', supp=float(0.0), conf=100))

    sens_ind = []
    for i in xrange(lines):
        flag = True
        for itemset in S:
            if itemset.issubset(tid[i]):
                sens_ind.append(i)
                flag = False
                break
        if flag:
            for itemset in Rev_pos_bord:
                if itemset.issubset(tid[i]):
                    sens_ind.append(i)
                    break
    sens_ind = list(set(sens_ind))
    N = len(sens_ind)

    cpx = cplex.Cplex()
    cpx.set_results_stream(None)
    cpx.objective.set_sense(cpx.objective.sense.minimize)
    cpx.variables.add(obj=(1,) * N + (lines,) * len(Rev_pos_bord),
                      lb=(0,) * (N + len(Rev_pos_bord)),
                      ub=(1,) * N + (cplex.infinity,) * len(Rev_pos_bord),
                      types=(cpx.variables.type.integer,) * (N + len(Rev_pos_bord)))

    for itemset in S:
        ind = []
        cur_supp = 0
        for i in xrange(N):
            if itemset.issubset(tid[sens_ind[i]]):
                ind.append(i)
                cur_supp += 1
##        print(ind)
##        print(itemset)
##        print("GreaterEq than ", cur_supp - abs_supp + 1)
        cpx.linear_constraints.add(lin_expr=[SparsePair(ind=ind, val=(1,) * len(ind))],
                                   senses=["G"], rhs=[cur_supp - abs_supp + 1])

    rpb_c = 0
    for itemset in Rev_pos_bord:
        ind = []
        cur_supp = 0
        for i in range(N):
            if itemset.issubset(tid[sens_ind[i]]):
                ind.append(i)
                cur_supp += 1
        ind.append(N + rpb_c)
        rpb_c += 1
##        print(ind)
##        print(itemset)
##        print("LessEq than ", cur_supp - abs_supp)
        cpx.linear_constraints.add(lin_expr=[SparsePair(ind=ind, val=(1,) * (len(ind) - 1) + (-1,))],
                                   senses=["L"], rhs=[cur_supp - abs_supp])

    cpx.parameters.mip.pool.relgap.set(0)
##    cpx.parameters.preprocessing.presolve.set(cpx.parameters.preprocessing.presolve.values.off)
##    cpx.populate_solution_pool()
    cpx.solve()

    if any([i for i in map(int, cpx.solution.get_values())[lines:(lines + len(Rev_pos_bord))]]):
        print("System would be infeasible!!")
    print("Number of solutions: ", cpx.solution.pool.get_num())
##    print(map(int, cpx.solution.get_values()))
##    print("Objective: ", cpx.solution.get_objective_value())

    for i in get_indices(map(int, cpx.solution.get_values())[0:N], 1):
        temp_set = set()
        for itemset in S:
            if itemset.issubset(tid[sens_ind[i]]):
                temp_set.add(itemset)
        while len(temp_set) > 0:
            item_dic = {}
            for itemset in temp_set:
                for item in itemset:
                    if item not in item_dic:
                        item_dic[item] = 0
                    item_dic[item] += 1
            max_val = 0
            for item, freq in item_dic.items():
                if max_val < freq:
                    max_val = freq
                    element = frozenset([item])
            if item_dic.values().count(max_val) > 1:
                candidates = [frozenset([item]) for item, freq in item_dic.items() if freq == max_val]
                element = candidates[randrange(0, len(candidates))]
            tid[sens_ind[i]] = tid[sens_ind[i]] - element
            change_raw_data += 1
            for itemset in temp_set:
                if element.issubset(itemset):
                    temp_set = temp_set - set([itemset])

    exec_time = clock() - start_time
    total_time = exec_time + m_time, "sec"
    exec_time = exec_time, "sec"
    cpx = None

    ######----create out files-----######
    out_file = open('BBMax_Accuracy_results.txt', 'w')
    out_file2 = open('BBMax_Accuracy_visible.txt', 'w')
    print('Border-Based Max-Accuracy Results\n---------------\n', file=out_file2)
    print('\nThe Sanitized DB is:\n', file=out_file2)
    for i in xrange(lines):
        k = ' '.join(sorted(tid[i]))
        z = '{' + k + '}'
        print(k, file=out_file)
        print(z, file=out_file2)
    out_file.close()
    print(file=out_file2)
    m_time = m_time, "sec"
    print('changes in raw data:', change_raw_data, file=out_file2)
    print('data min. alg. time = ', m_time, file=out_file2)
    print('hiding alg. time = ', exec_time, file=out_file2)
    print('total execution time = ', total_time, file=out_file2)
    out_file2.close()
    return (tid, change_raw_data, Rev_Fd)
    for sentences in content:
        for word, pos in sentences:
            if word.isalpha() and word.lower() not in stopset:
                trimmedTokens += [(word.lower(), pos)]
    trans += [lemmatize.getLemmas(trimmedTokens)]
    trimmedTokens = []

lemmatize.saveLemmaDict()
freqDict = cityFreq(city, corpus_path, files)
TotalHotels = freqDict['TotalHotels']
TotalReviews = freqDict['TotalReviews']
support = ceil(0.01 * TotalReviews) / len(trans) * 25

feats = apriori(trans, zmin=setmin, zmax=setmax, supp=support, conf=confidence, target='r', report='CS')

for rule in feats:
    ruleSupp = rule[2][1]
    ruleConf = rule[2][0]
    word1 = rule[0]
    word2 = rule[1][0]
    if not wordLookup.has_key(word1):
        addWord(word1, i)
        i += 1
    if not wordLookup.has_key(word2):
        addWord(word2, i)
        i += 1
    if len(rule[1]) == 2:
        word3 = rule[1][1]
        if not wordLookup.has_key(word3):
def associationMining(papers):
    textFolder = 'data/text/'
    support = 9
    confidence = 10
    rules = {}

    # Create stopwords list
    stopwordsFile = open('stopwords.txt', 'r')
    stopwords = set()
    for line in stopwordsFile:
        word = line.strip('\r\n').lower()
        stopwords.add(word)
    stopwordsFile.close()

    transactions = []
    for key, value in papers.items():
        if 'folder' in papers[key] and 'filename' in papers[key]:
            # Get candidates
            candidates = []
            dataFile = open(textFolder + papers[key]['folder'] + papers[key]['filename'])
            for line in dataFile:
                text = line.strip('\r\n')
                words = easy_tokenizer(text)
                candidates.append(words)
            dataFile.close()

            # Compute words dict
            wordDict = {}
            for words in candidates:
                for word in words:
                    if word in stopwords or len(word) == 1 or word.isdigit():
                        continue
                    if word not in wordDict:
                        wordDict[word] = 0
                    wordDict[word] += 1

            # Compute bigrams
            bigrams = {}
            L = 0
            for words in candidates:
                n = len(words)
                L += n
                for i in range(0, n - 1):
                    if words[i] in wordDict and words[i + 1] in wordDict:
                        bigram = words[i] + '_' + words[i + 1]
                        if bigram not in bigrams:
                            # bigram's count, first word's count, second word's count, significance score
                            bigrams[bigram] = [0, wordDict[words[i]], wordDict[words[i + 1]], 0.0]
                        bigrams[bigram][0] += 1

            # Readjust bigram scores
            for bigram in bigrams:
                bigrams[bigram][3] = (1.0 * bigrams[bigram][0] -
                                      1.0 * bigrams[bigram][1] * bigrams[bigram][2] / L) / \
                                     ((1.0 * bigrams[bigram][0]) ** 0.5)

            # Compute transactions
            bigramDict = {}
            for bigram in bigrams:
                if bigrams[bigram][0] > 1:
                    first, second = bigram.split('_')
                    if first not in bigramDict:
                        bigramDict[first] = set()
                    bigramDict[first].add(second)

            # Compute quality entities
            transactions = []
            for words in candidates:
                transaction = set()  # set of words/bigrams
                n = len(words)
                i = 0
                while i < n:
                    if words[i] in bigramDict and i + 1 < n and words[i + 1] in bigramDict[words[i]]:
                        transaction.add(words[i] + '_' + words[i + 1])
                        i += 2
                        continue
                    if words[i] in stopwords or len(words[i]) == 1 or words[i].isdigit():
                        i += 1
                        continue
                    transaction.add(words[i])
                    i += 1
                transactions.append(list(transaction))

    rules = apriori(transactions, target='r', supp=support, conf=confidence, report='sc')

    print '--------- One-to-Many Association Rules ------------'
    for left, right, support, confidence in sorted(rules, key=lambda x: x[0]):
        print left, '-->', right, support, confidence
    print 'Number of rules: ', len(rules)