for item in transaction: if not [item] in c1: c1.append([item]) c1.sort() return map(frozenset, c1) cd /Users/KarenJung/Documents/loscanadienses import Apriori.apriori as apriori %paste globals() dataSet ds suppData reload(apriori) dataSet = ds dataSet L, suppData = apriori.apriori(dataSet,minSupport=0.5) L suppData rules=apriori.generateRules(L,suppData,minConf=0.7) ds rules=apriori.generateRules(L,suppData,minConf=0.4) import pandas as pd sant_df = pd.read_csv('Data/train_ver2.csv') sant_df.head() sant_df.describe() len(sant_df) sant_df.columns zip(range(len(sant_df.columns)),sant_df.columns) baskets_df = sant_df.iloc[:10000,24:] len(baskets_df) baskets_df.head()
# Toy driver for the project's Apriori package: builds a tiny hard-coded data
# set, mines it with apriori.apriori, and prints whatever
# generateRule.generateRules returns for the result.
from Apriori import apriori, generateRule


def loadDataSet():
    """Return the sample transactions as a new list of item lists.

    A fresh outer list and fresh inner lists are built on every call, so a
    caller may mutate the result without affecting later calls.
    """
    sample_rows = (
        (1, 3, 4),
        (3, 4, 5),
        (1, 3, 4, 5),
        (1, 4, 6),
    )
    return [list(row) for row in sample_rows]


if __name__ == "__main__":
    dataSet = loadDataSet()

    # The 0.5 arguments are presumably the minimum-support and
    # minimum-confidence thresholds -- confirm against the Apriori package.
    L, supportData = apriori.apriori(dataSet, 0.5)
    M = generateRule.generateRules(L, supportData, 0.5)

    for t in M:
        print(t)
# IPython session transcript (one input per line). Not plain Python: it uses
# the shell-style `pwd` / `cd` commands and the `%%time` cell magic.

# Check the current directory, then switch to the loscanadienses directory.
pwd
cd /Users/KarenJung/Documents/loscanadienses

# Import the project's Apriori and Santander.baskets modules under short aliases.
import Apriori.apriori as apriori
import Santander.baskets as sbaskets

# NOTE(review): `pd` is used here but never imported in this transcript --
# presumably it was already bound in the session namespace; confirm.
sant_df = pd.read_csv('Data/train_ver2.csv')
customers = sbaskets.customers(sant_df)

# The same onlyMonth call is repeated for the full customer list, the first
# 10 customers and the first 1000 customers, with sbaskets reloaded in
# between (presumably after editing the module source -- confirm).
april = sbaskets.onlyMonth(sant_df,'2016-04-28',customers)
april = sbaskets.onlyMonth(sant_df,'2016-04-28',customers[:10])
reload(sbaskets)
april = sbaskets.onlyMonth(sant_df,'2016-04-28',customers[:10])
april = sbaskets.onlyMonth(sant_df,'2016-04-28',customers[:1000])
reload(sbaskets)
april = sbaskets.onlyMonth(sant_df,'2016-04-28',customers[:1000])

# Timed runs of makeBaskets followed by apriori. The second argument shrinks
# from [0,1000000] to [0,100000] to [0,10000] and minSupport rises from 0.01
# to 0.05 over successive attempts.
%%time
sb = sbaskets.makeBaskets(sant_df,[0,1000000],[24,48])
Ls, supp = apriori.apriori(sb,minSupport=0.01)
%%time
sb = sbaskets.makeBaskets(sant_df,[0,100000],[24,48])
%%time
Ls, supp = apriori.apriori(sb,minSupport=0.01)
%%time
sb = sbaskets.makeBaskets(sant_df,[0,10000],[24,48])
%%time
Ls, supp = apriori.apriori(sb,minSupport=0.05)

# Display the first ten entries of sb.
sb[:10]

# NOTE(review): withNcodpers is passed to apriori.apriori here but to
# makeBaskets two inputs later, after a help() lookup on makeBaskets --
# this call looks like a misdirected keyword argument; confirm.
%%time
Ls, supp = apriori.apriori(sb,minSupport=0.05,withNcodpers=False)
help(sbaskets.makeBaskets)
%%time
sb = sbaskets.makeBaskets(sant_df,[0,10000],[24,48],withNcodpers=False)

# The body of this final timed cell is not visible in this excerpt.
%%time
# Mine association rules from several random batches of rows of sant_df and
# start combining the confidence of rules that appear in more than one batch.
#
# Relies on names bound elsewhere: np (numpy), sant_df (a DataFrame),
# sbaskets and apriori (project modules).

min_supp = 0.02   # minSupport handed to apriori.apriori
min_conf = 0.25   # minConf handed to apriori.generateRules
rule_batches = []  # one list of rules per processed batch
nbatches = 20
batchsize = 10000
seed = 666

# A single RandomState seeded once and shared by every sample() call, so the
# whole sequence of batches is repeatable from run to run.
ran_state = np.random.RandomState(seed)

# range (not the Python-2-only xrange) so the loop runs on Python 2 and 3.
for i in range(nbatches):
    print('Training with batch number ... %d' % i)
    sample_df = sant_df.sample(n=batchsize, random_state=ran_state)

    # Convert the batch of customer data into "baskets" of financial products.
    sb = sbaskets.makeBaskets(sample_df, (0, batchsize), (24, 48),
                              withNcodpers=False)

    # Find all sets of products that occur in at least min_supp of the baskets.
    Ls, supp = apriori.apriori(sb, minSupport=min_supp)

    # Find all rules with confidence at least min_conf, i.e. a rule A=>B is a
    # pair of subsets A and B such that B occurs in at least a fraction
    # minConf of the baskets in which A occurs.
    rule_batches.append(apriori.generateRules(Ls, supp, minConf=min_conf))

# Flatten the per-batch lists into pairs of the form ((ls, rs), conf), where
# ls and rs are *sorted* lists of the rule's left- and right-hand sides, so
# the same rule found in different batches compares equal.
rules = [
    ((sorted(rule[0]), sorted(rule[1])), rule[2])
    for batch in rule_batches
    for rule in batch
]

# Loop over the rules and accumulate the confidence of each repeated rule.
# NOTE(review): as visible here nothing ever appends to rules_unique or
# confs_average, so the branch below cannot fire on its own -- the handling
# of a rule's first occurrence presumably follows this excerpt; confirm.
rules_unique = []
confs_average = []
for rule in rules:
    if rule[0] in rules_unique:
        confs_average[rules_unique.index(rule[0])] += rule[1]