Example no. 1
def get_freqitemsets(fname,minsupport,maxlhs):
  #minsupport is an integer percentage (e.g. 10 for 10%)
  #maxlhs is the maximum size of the lhs
  #first load the data
  data,Ydata = load_data(fname)
  #Now find frequent itemsets
  #Mine separately for each class
  data_pos = [x for i,x in enumerate(data) if Ydata[i,0]==0]
  data_neg = [x for i,x in enumerate(data) if Ydata[i,0]==1]
  assert len(data_pos)+len(data_neg) == len(data)
  Y = [0,0]
  Y[0] = sum([1<<i for i,x in enumerate(data) if Ydata[i,0]==1])
  Y[1] = sum([1<<i for i,x in enumerate(data) if Ydata[i,1]==1])
  itemsets = [r[0] for r in fpgrowth(data_pos,supp=minsupport,zmax=maxlhs)]
  itemsets.extend([r[0] for r in fpgrowth(data_neg,supp=minsupport,zmax=maxlhs)])
  itemsets = list(set(itemsets))
  print len(itemsets),'rules mined'

  #Now form the data-vs.-lhs set
  #X[j] is the bit vector of data points that contain itemset j (that is, satisfy rule j)
  X = [ 0 for j in range(len(itemsets)+1)]
  global trainingSize
  trainingSize = len(data)
  X[0] = (1<<trainingSize) - 1 #the default rule satisfies all data, so all bits are 1's
  for (j,lhs) in enumerate(itemsets):
    X[j+1] = sum([1<<i for (i,xi) in enumerate(data) if set(lhs).issubset(xi)])
  #now form lhs_len
  lhs_len = [0]
  for lhs in itemsets:
    lhs_len.append(len(lhs))
  nruleslen = Counter(lhs_len)
  lhs_len = array(lhs_len)
  itemsets_all = ['null']
  itemsets_all.extend(itemsets)
  return X,Y,nruleslen,lhs_len,itemsets_all
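A quick aside on the fpgrowth call above (a minimal sketch, not taken from the repository): in the pyfim module, each result is a tuple whose first element is the itemset, and a positive supp is read as a minimum support percentage while a negative supp means an absolute transaction count. The toy transactions below are invented for illustration.

# Hedged sketch of the fpgrowth return format assumed by get_freqitemsets.
from fim import fpgrowth

transactions = [['a', 'b'], ['a', 'c'], ['a', 'b', 'c'], ['b']]
# supp=50 -> itemsets present in at least 50% of the transactions; supp=-2
# would mean "at least 2 transactions". zmax bounds the itemset size (maxlhs).
for result in fpgrowth(transactions, supp=50, zmax=2):
    itemset = result[0]          # e.g. ('a', 'b')
    print(itemset, result[1:])   # trailing values depend on the report string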
Example no. 2
def construct_fp_train(filename):
    setfile = open(filename)
    reader = csv.reader(setfile, delimiter=";")

    #General Statistics
    set_ids = []
    measures = []
    notes_fp = []
    results = []

    for row in reader:
        id_file = row[0]

        set_ids.append(id_file)
        measures.append(get_measures(id_file))

    measures = itertools.chain(*measures)
    notes_fp = fpgrowth(list(measures), report='S', zmin=2)
    print len(notes_fp)
    setfile = open(filename)
    reader = csv.reader(setfile, delimiter=";")
    for row in reader:
        id_file = row[0]
        measure = get_measures(id_file)
        fp_song = fpgrowth(measure, report='S', zmin=2)
        result = compare_fp(notes_fp, fp_song)
        results.append(result)

    return results, notes_fp
Example no. 3
def get_freqitemsets_1(dataset, Y, minsupport, maxlhs):
    #minsupport is an integer percentage (e.g. 10 for 10%)
    #maxlhs is the maximum size of the lhs
    #first load the data
    #Now find frequent itemsets
    #Mine separately for each class
    data_pos = [x for i, x in enumerate(dataset) if Y[i] == 0]
    data_neg = [x for i, x in enumerate(dataset) if Y[i] == 1]
    print('ok')
    #data_pos = dataset[0:72]
    #data_neg = dataset[72:144]
    assert len(data_pos) + len(data_neg) == len(dataset)
    try:
        itemsets = [r[0] for r in fpgrowth(data_pos, supp=minsupport, zmax=maxlhs)]
        itemsets.extend([r[0] for r in fpgrowth(data_neg, supp=minsupport, zmax=maxlhs)])
    except TypeError:
        itemsets = [r[0] for r in fpgrowth(data_pos, supp=minsupport, max=maxlhs)]
        itemsets.extend([r[0] for r in fpgrowth(data_neg, supp=minsupport, max=maxlhs)])
    itemsets = list(set(itemsets))
    print(len(itemsets), 'rules mined')
    #Now form the data-vs.-lhs set
    #X[j] is the set of data points that contain itemset j (that is, satisfy rule j)
    X = [set() for j in range(len(itemsets) + 1)]
    X[0] = set(range(len(dataset)))  #the default rule satisfies all data
    for (j, lhs) in enumerate(itemsets):
        X[j + 1] = set([i for (i, xi) in enumerate(dataset) if set(lhs).issubset(xi)])
    #now form lhs_len
    lhs_len = [0]
    for lhs in itemsets:
        lhs_len.append(len(lhs))
    nruleslen = Counter(lhs_len)
    lhs_len = np.array(lhs_len)
    itemsets_all = ['null']
    itemsets_all.extend(itemsets)
    return X, Y, nruleslen, lhs_len, itemsets_all
Example no. 4
def construct_fp_train(filename):
    setfile = open(filename)
    reader = csv.reader(setfile, delimiter=";")

    #General Statistics
    set_ids = []
    measures = []
    notes_fp = []
    results = []

    for row in reader:
        id_file = row[0]

        set_ids.append(id_file)
        measures.append(get_measures(id_file))

    measures = itertools.chain(*measures)
    notes_fp = fpgrowth(list(measures), report='S', zmin=2)
    print len(notes_fp)
    setfile = open(filename)
    reader = csv.reader(setfile, delimiter=";")
    for row in reader:
        id_file = row[0]
        measure = get_measures(id_file)
        fp_song = fpgrowth(measure, report='S', zmin=2)
        result = compare_fp(notes_fp, fp_song)
        results.append(result)

    return results, notes_fp
Example no. 5
def fi(data):
    print("Using apriori for fim : ")
    freq_list = fim.apriori(tracts=data, supp=5)
    print("The frequent item list is : ")
    print(freq_list)
    rules = fim.apriori(tracts=data, target='r', eval='c', report='c')
    print("The rules are : ")
    print(rules)
    rules = fim.apriori(tracts=data, target='r', eval='l', report='l')
    print("The rules are (evaluated with lift): ")
    print(rules)
    print("lfi using apriori : ")
    lfi(freq_list)

    print("Using fp-growth for fim : ")
    freq_list = fim.fpgrowth(tracts=data, supp=5)
    print("The frequent item list is : ")
    print(freq_list)
    rules = fim.fpgrowth(tracts=data,
                         target='r',
                         eval='c',
                         report='c',
                         conf=60)
    print("The rules are (evaluated with confidence): ")
    print(rules)
    rules = fim.fpgrowth(tracts=data,
                         target='r',
                         eval='l',
                         report='l',
                         conf=60)
    print("The rules are (evaluated with lift): ")
    print(rules)

    print("lfi using fpgrowth is : ")
    lfi(freq_list)
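For the rule-mining calls above (target='r'), a rough sketch of unpacking the returned tuples; the layout assumed here, (head, body, one value per report character), should be double-checked against fim.apriori.__doc__ for the installed version, and the transactions are made up.

# Hedged sketch: unpacking association rules reported by pyfim.
from fim import fpgrowth

transactions = [['a', 'b'], ['a', 'c'], ['a', 'b', 'c'], ['b', 'c']]
rules = fpgrowth(transactions, target='r', supp=25, conf=50, report='ac')
for rule in rules:
    head, body = rule[0], rule[1]
    print(' & '.join(body), '->', head, rule[2:])  # e.g. absolute support, confidence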
Example no. 6
def gen_rule(df_combine, Y, Supp, Maxlen, N):

    # generate rules using FP-growth algorithm

    df_combine = 1 - df_combine

    itemMatrix = [[item for item in df_combine.columns if row[item] == 1]
                  for i, row in df_combine.iterrows()]

    pindex = np.where(Y == 1)[0]
    nindex = np.where(Y != 1)[0]

    prules = fpgrowth([itemMatrix[i] for i in pindex],
                      supp=Supp,
                      zmin=1,
                      zmax=Maxlen)
    prules = [np.sort(x[0]).tolist() for x in prules]

    nrules = fpgrowth([itemMatrix[i] for i in nindex],
                      supp=Supp,
                      zmin=1,
                      zmax=Maxlen)
    nrules = [np.sort(x[0]).tolist() for x in nrules]

    prules, pRMatrix, psupp, pprecision, perror = screen_rules(
        prules, df_combine, Y, N, Supp)
    nrules, nRMatrix, nsupp, nprecision, nerror = screen_rules(
        nrules, df_combine, 1 - np.array(Y), N, Supp)

    premined_rules = prules
    premined_rules.extend(nrules)

    return premined_rules
Example no. 7
def get_freqitemsets(fname, minsupport, maxlhs, verbose=True):
    # minsupport is an integer percentage (e.g. 10 for 10%)
    # maxlhs is the maximum size of the lhs
    # first load the data
    data, Y = load_data(fname)
    # Now find frequent itemsets
    # Mine separately for each class
    data_pos = [x for i, x in enumerate(data) if Y[i, 0] == 0]
    data_neg = [x for i, x in enumerate(data) if Y[i, 0] == 1]
    assert len(data_pos) + len(data_neg) == len(data)
    try:
        itemsets = [r[0] for r in fpgrowth(data_pos, supp=minsupport, zmax=maxlhs)]
        itemsets.extend([r[0] for r in fpgrowth(data_neg, supp=minsupport, zmax=maxlhs)])
    except TypeError:
        itemsets = [r[0] for r in fpgrowth(data_pos, supp=minsupport, max=maxlhs)]
        itemsets.extend([r[0] for r in fpgrowth(data_neg, supp=minsupport, max=maxlhs)])
    itemsets = list(set(itemsets))
    if verbose:
        print(len(itemsets), 'rules mined')
    # Now form the data-vs.-lhs set
    # X[j] is the set of data points that contain itemset j (that is, satisfy rule j)
    X = [set() for j in range(len(itemsets) + 1)]
    X[0] = set(range(len(data)))  # the default rule satisfies all data
    for (j, lhs) in enumerate(itemsets):
        X[j + 1] = set([i for (i, xi) in enumerate(data) if set(lhs).issubset(xi)])
    # now form lhs_len
    lhs_len = [0]
    for lhs in itemsets:
        lhs_len.append(len(lhs))
    nruleslen = Counter(lhs_len)
    lhs_len = array(lhs_len)
    itemsets_all = ['null']
    itemsets_all.extend(itemsets)
    return X, Y, nruleslen, lhs_len, itemsets_all
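The try/except TypeError above is a version shim: some pyfim releases named the maximum-itemset-size keyword max rather than zmax. The same fallback can be factored into a small helper; mine_itemsets below is a sketch, not a function from this codebase.

def mine_itemsets(transactions, minsupport, maxlhs):
    # Prefer the newer zmax keyword and fall back if this pyfim build predates it.
    try:
        results = fpgrowth(transactions, supp=minsupport, zmax=maxlhs)
    except TypeError:
        results = fpgrowth(transactions, supp=minsupport, max=maxlhs)
    return [r[0] for r in results]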
Example no. 8
 def generate_rulespace(self,
                        supp,
                        maxlen,
                        N,
                        need_negcode=False,
                        njobs=5,
                        method='fpgrowth',
                        criteria='IG',
                        add_rules=[]):
     if method == 'fpgrowth':
         if need_negcode:
             df = 1 - self.df
             df.columns = [name.strip() + 'neg' for name in self.df.columns]
             df = pd.concat([self.df, df], axis=1)
         else:
             df = 1 - self.df
         pindex = np.where(self.Y == 1)[0]
         nindex = np.where(self.Y != 1)[0]
         itemMatrix = [[item for item in df.columns if row[item] == 1]
                       for i, row in df.iterrows()]
         prules = fpgrowth([itemMatrix[i] for i in pindex],
                           supp=supp,
                           zmin=1,
                           zmax=maxlen)
         prules = [np.sort(x[0]).tolist() for x in prules]
         nrules = fpgrowth([itemMatrix[i] for i in nindex],
                           supp=supp,
                           zmin=1,
                           zmax=maxlen)
         nrules = [np.sort(x[0]).tolist() for x in nrules]
     else:
         print('Using random forest to generate rules ...')
         prules = []
         for length in range(2, maxlen + 1, 1):
             n_estimators = 250 * length  # min(5000,int(min(comb(df.shape[1], length, exact=True),10000/maxlen)))
             clf = RandomForestClassifier(n_estimators=n_estimators,
                                          max_depth=length)
             clf.fit(self.df, self.Y)
             for n in range(n_estimators):
                 prules.extend(
                     extract_rules(clf.estimators_[n], self.df.columns))
         prules = [list(x) for x in set(tuple(np.sort(x)) for x in prules)]
         nrules = []
         for length in range(2, maxlen + 1, 1):
             n_estimators = 250 * length  # min(5000,int(min(comb(df.shape[1], length, exact=True),10000/maxlen)))
             clf = RandomForestClassifier(n_estimators=n_estimators,
                                          max_depth=length)
             clf.fit(self.df, 1 - self.Y)
             for n in range(n_estimators):
                 nrules.extend(
                     extract_rules(clf.estimators_[n], self.df.columns))
         nrules = [list(x) for x in set(tuple(np.sort(x)) for x in nrules)]
         df = 1 - self.df
         df.columns = [name.strip() + 'neg' for name in self.df.columns]
         df = pd.concat([self.df, df], axis=1)
     self.prules, self.pRMatrix, self.psupp, self.pprecision, self.perror = self.screen_rules(
         prules, df, self.Y, N, supp)
     self.nrules, self.nRMatrix, self.nsupp, self.nprecision, self.nerror = self.screen_rules(
         nrules, df, 1 - self.Y, N, supp)
Example no. 9
def get_freqitemsets(fname,minsupport,maxlhs):
    #minsupport is an integer percentage (e.g. 10 for 10%)
    #maxlhs is the maximum size of the lhs
    #first load the data
    data,Y = load_data(fname)
    #Now find frequent itemsets
    #Mine separately for each class
    if Y.shape[-1] == 2:
        # currently only mine itemsets for binary classification
        data_pos = [x for i,x in enumerate(data) if Y[i,0]==0]
        data_neg = [x for i,x in enumerate(data) if Y[i,0]==1]
        assert len(data_pos)+len(data_neg) == len(data)

        try:
            itemsets = [r[0] for r in fpgrowth(data_pos,supp=minsupport,zmax=maxlhs)]
            itemsets.extend([r[0] for r in fpgrowth(data_neg,supp=minsupport,zmax=maxlhs)])
        except TypeError:
            print("TypeError in fpgrowth")
            itemsets = [r[0] for r in fpgrowth(data_pos,supp=minsupport,max=maxlhs)]
            itemsets.extend([r[0] for r in fpgrowth(data_neg,supp=minsupport,max=maxlhs)])

    else:
        data_classes = [[] for _ in range(Y.shape[-1])]
        for row, y in zip(data, Y):
            i = list(y).index(1)
            data_classes[i].append(row)

        assert sum([len(x) for x in data_classes]) == len(data)

        itemsets = [
            [r[0] for r in fpgrowth(data_class, supp=minsupport, zmax=maxlhs)]
            for data_class in data_classes
        ]
        # flatten
        itemsets = [x for class_itemset in itemsets for x in class_itemset]

    itemsets = list(set(itemsets))
    print(len(itemsets),'rules mined')
    #Now form the data-vs.-lhs set
    #X[j] is the set of data points that contain itemset j (that is, satisfy rule j)
    X = [ set() for j in range(len(itemsets)+1)]
    X[0] = set(range(len(data))) #the default rule satisfies all data
    for (j,lhs) in enumerate(itemsets):
        X[j+1] = set([i for (i,xi) in enumerate(data) if set(lhs).issubset(xi)])
    #now form lhs_len
    lhs_len = [0]
    for lhs in itemsets:
        lhs_len.append(len(lhs))
    nruleslen = Counter(lhs_len)
    lhs_len = array(lhs_len)
    itemsets_all = ['null']
    itemsets_all.extend(itemsets)
    return X,Y,nruleslen,lhs_len,itemsets_all
Example no. 10
def get_freqitemsets(fname, minsupport=10, maxlhs=2):
    # Load the data
    data, Y = load_data(fname)

    # Open output file
    fout = open(fname + ".out", "w")

    # Now find frequent itemsets, mining separately for each class

    data_pos = [x for i, x in enumerate(data) if Y[i, 0] == 0]
    data_neg = [x for i, x in enumerate(data) if Y[i, 0] == 1]

    assert len(data_pos) + len(data_neg) == len(data)

    print "About to calculate positive itemsets"
    itemsets = [r[0] for r in fpgrowth(data_pos, supp=minsupport, zmax=maxlhs)]
    print "About to calculate negative itemsets"
    itemsets.extend(
        [r[0] for r in fpgrowth(data_neg, supp=minsupport, zmax=maxlhs)])
    itemsets = list(set(itemsets))
    print "Done"

    n_rules = len(itemsets)

    # Now for each rule we want to write out a line of output
    # containing the rule and a bit for each training sample
    # indicating if the sample satisfies the rule or not.

    for lhs in itemsets:
        print lhs
        fout.write('&'.join(lhs) + '\t')
        for (j, attrs) in enumerate(data):
            if set(lhs).issubset(attrs):
                fout.write('1 ')
            else:
                fout.write('0 ')
        fout.write('\n')

    fout.close()
    fout = open(fname + ".label", "w")
    for label in xrange(2):
        if label == 0:
            fout.write('negative' + '\t')
        else:
            fout.write('positive' + '\t')
        for i, x in enumerate(data):
            if Y[i, label] == 1:
                fout.write('1 ')
            else:
                fout.write('0 ')
        fout.write('\n')
    fout.close()
Example no. 11
def get_freqitemsets(fname, minsupport=10, maxlhs = 2):
    # Load the data
    data,Y = load_data(fname)
        
    # Open output file
    fout = open (fname+".out", "w")
        
    # Now find frequent itemsets, mining separately for each class
        
    data_pos = [x for i,x in enumerate(data) if Y[i,0]==0]
    data_neg = [x for i,x in enumerate(data) if Y[i,0]==1]
        
    assert len(data_pos)+len(data_neg) == len(data)
        
    print "About to calculate positive itemsets"
    itemsets = [r[0] for r in fpgrowth(data_pos,supp=minsupport,zmax=maxlhs)]
    print "About to calculate negative itemsets"
    itemsets.extend([r[0] for r in fpgrowth(data_neg,supp=minsupport,zmax=maxlhs)])
    itemsets = list(set(itemsets))
    print "Done"
        
    n_rules = len(itemsets)
        
    # Now for each rule we want to write out a line of output
    # containing the rule and a bit for each training sample
    # indicating if the sample satisfies the rule or not.
        
    for lhs in itemsets :
        print lhs
        fout.write('&'.join(lhs) + '\t')
        for (j, attrs) in enumerate(data) :
            if set(lhs).issubset(attrs) :
                fout.write('1 ')
            else :
                fout.write('0 ')
        fout.write('\n')
    
    fout.close()
    fout = open (fname+".label", "w")
    for label in xrange(2):
        if label==0:
            fout.write('negative' + '\t')
        else:
            fout.write('positive' + '\t')
        for i,x in enumerate(data):
            if Y[i,label] == 1:
                fout.write('1 ')
            else:
                fout.write('0 ')
        fout.write('\n')
    fout.close()
Example no. 12
 def generate_rules(self,supp,maxlen,N, method = 'randomforest'):
     self.maxlen = maxlen
     self.supp = supp
     df = 1-self.df #df has negative associations
     df.columns = [name.strip() + '_neg' for name in self.df.columns]
     df = pd.concat([self.df,df],axis = 1)
     if method =='fpgrowth' and maxlen<=3:
         itemMatrix = [[item for item in df.columns if row[item] ==1] for i,row in df.iterrows() ]  
         pindex = np.where(self.Y==1)[0]
         nindex = np.where(self.Y!=1)[0]
         print 'Generating rules using fpgrowth'
         start_time = time.time()
         rules= fpgrowth([itemMatrix[i] for i in pindex],supp = supp,zmin = 1,zmax = maxlen)
         rules = [tuple(np.sort(rule[0])) for rule in rules]
         rules = list(set(rules))
         print '\tTook %0.3fs to generate %d rules' % (time.time() - start_time, len(rules))
     else:
         rules = []
         start_time = time.time()
         for length in xrange(1,maxlen+1,1):
             n_estimators = min(pow(df.shape[1],length),4000)
             clf = RandomForestClassifier(n_estimators = n_estimators,max_depth = length)
             clf.fit(self.df,self.Y)
             for n in xrange(n_estimators):
                 rules.extend(extract_rules(clf.estimators_[n],df.columns))
         rules = [list(x) for x in set(tuple(x) for x in rules)]
         print '\tTook %0.3fs to generate %d rules' % (time.time() - start_time, len(rules))
     self.screen_rules(rules,df,N) # select the top N rules using secondary criteria, information gain
     self.getPatternSpace()
Example no. 13
def find_freq_mining_pred_deps(infile,
                               outfile,
                               mode='fim.fpgrowth',
                               support=10,
                               confidence=80,
                               zmin=2,
                               zmax=2,
                               sample_to_print=5):
    with open(infile) as f:
        transactions = pickle.load(f)

    if mode == 'fim.fpgrowth':
        import fim
        patterns = fim.fpgrowth(transactions,
                                zmin=zmin,
                                zmax=zmax,
                                supp=support,
                                conf=confidence)
        print "## Sample of rules ({} total): ##".format(len(patterns))
        print patterns[0:sample_to_print]

    elif mode == 'fim.carpenter':
        import fim
        patterns = fim.carpenter(transactions, zmin=2, zmax=2)
        print "## Sample of rules ({} total): ##".format(len(patterns))
        print patterns[0:sample_to_print]

    with open(outfile, 'w') as f:
        pickle.dump(patterns, f, -1)
Example no. 14
def get_nonsingle_itemsets(transactions, output_file):
    # Closed Non-Single Itemsets (ResponseBot 7)
    print("Building non-single itemsets with FP-Growth...")
    patterns = fpgrowth(transactions, target='c', supp=-1000, zmin=2)
    #output
    for (pattern, support) in sorted(patterns, key=lambda x: -x[1]):
        p = ','.join(pattern)
        output_file.write('{} {} \n'.format(p, str(support)))
    print('Number of patterns:', len(patterns))
Example no. 15
 def freq_set_mining(self, community_list_1, community_list_2):
     tracts = []
     for cl in [community_list_1, community_list_2]:
         for c in cl:
             tracts.append(c)
     fip = fim.fpgrowth(tracts, "m", -2)
     # size of overlap weighted by number of clusters in which this overlap occurs
     distance = np.sum([len(tup[0]) * tup[1] for tup in fip]) / np.array([community_list_1, community_list_2]).size
     return distance
Example no. 16
    def generate_rules(self, supp=2, maxlen=10, N=10000, method='fpgrowth'):
        '''
        fp-growth, apriori, or a tree-based method.
        Note that, for frequent itemset mining, the data needs to be discretized first.
        '''
        self.maxlen = maxlen
        self.supp = supp
        df = 1 - self.df  #df has negative associations
        df.columns = [name.strip() + '_neg' for name in self.df.columns]
        df = pd.concat([self.df, df], axis=1)
        # if method =='fpgrowth' and maxlen<=3:
        if method == 'fpgrowth' and self.maxlen <= 3:
            # if method =='fpgrowth':
            print('Generating rules using fpgrowth, of support', self.supp,
                  "max len", self.maxlen)
            # itemMatrix = [[item for item in df.columns if row[item] ==1] for i,row in df.iterrows() ]
            cols = df.columns.values.astype(str)
            R, C = np.where(df.values == 1)
            itemMatrix = np.split(cols[C],
                                  np.unique(R, return_index=True)[1])[1:]
            itemMatrix = [item.tolist() for item in itemMatrix]

            pindex = np.where(self.Y == self.target_class_idx)[0]
            nindex = np.where(self.Y != self.target_class_idx)[0]
            # pindex = np.where(self.Y==1)[0]
            # nindex = np.where(self.Y!=1)[0]
            start_time = time.time()
            rules = fpgrowth([itemMatrix[i] for i in pindex],
                             supp=supp,
                             zmin=1,
                             zmax=self.maxlen)
            rules = [tuple(np.sort(rule[0])) for rule in rules]
            rules = list(set(rules))
            print('\tTook %0.3fs to generate %d rules' %
                  (time.time() - start_time, len(rules)))
        else:
            rules = []
            print('Generating rules using tree-based method ')
            start_time = time.time()
            for length in range(1, maxlen + 1, 1):
                n_estimators = min(pow(df.shape[1], length), 300)
                clf = RandomForestClassifier(n_estimators=n_estimators,
                                             max_depth=length)
                clf.fit(self.df, self.Y)
                for n in range(n_estimators):
                    rules.extend(
                        extract_rules(clf.estimators_[n],
                                      df.columns,
                                      target_class_idx=self.target_class_idx))
            rules = [list(x) for x in set(tuple(x) for x in rules)]
            print('\tTook %0.3fs to generate %d rules' %
                  (time.time() - start_time, len(rules)))
        self.screen_rules(
            rules, df, N
        )  # select the top N rules using secondary criteria, information gain
        self.getPatternSpace()
Example no. 17
def mine_antecedents(data, Y, minsupport, max_predicates_per_antecedent):
    # data is the training data
    # Y is the training labels: 1 for positive and 0 for negative
    # minsupport is an integer percentage (e.g. 10 for 10%)
    # max_predicates_per_antecedent is the maximum number of predicates in a rule
    # mine the rule set
    n = len(data)
    data_pos = [x for i, x in enumerate(data) if Y[i] == 1]
    data_neg = [x for i, x in enumerate(data) if Y[i] == 0]
    assert len(data_pos) + len(data_neg) == n

    antecedent_set = [
        r[0] for r in fpgrowth(
            data_pos, supp=minsupport, zmax=max_predicates_per_antecedent)
    ]
    antecedent_set.extend([
        r[0] for r in fpgrowth(
            data_neg, supp=minsupport, zmax=max_predicates_per_antecedent)
    ])
    antecedent_set = list(set(antecedent_set))
    print len(antecedent_set), 'rules mined'
    # form the rule-versus-data set
    # X_pos[j] is the set of positive data points that satisfy rule j
    # X_neg[j] is the set of negative data points that satisfy rule j
    X_pos = [0 for j in range(len(antecedent_set) + 1)]
    X_neg = [0 for j in range(len(antecedent_set) + 1)]
    # X_pos[0] (X_neg[0]) is the set of all positive (negative) data points
    X_pos[0] = sum([1 << i for i, x in enumerate(data) if Y[i] == 1])
    X_neg[0] = sum([1 << i for i, x in enumerate(data) if Y[i] == 0])
    for (j, antecedent) in enumerate(antecedent_set):
        X_pos[j+1] = sum([1<<i for (i,xi) in enumerate(data) \
                          if Y[i] == 1 and set(antecedent).issubset(xi)])
        X_neg[j+1] = sum([1<<i for (i,xi) in enumerate(data) \
                          if Y[i] == 0 and set(antecedent).issubset(xi)])
    # form antecedent_len and nantecedents
    antecedent_len = [0]
    for antecedent in antecedent_set:
        antecedent_len.append(len(antecedent))
    nantecedents = Counter(antecedent_len)
    antecedent_len = np.array(antecedent_len)
    antecedent_set_all = ['null']
    antecedent_set_all.extend(antecedent_set)
    return X_pos, X_neg, nantecedents, antecedent_len, antecedent_set_all
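The X_pos/X_neg bookkeeping above packs rule coverage into plain Python integers used as bit vectors: bit i is set when data point i satisfies the antecedent, so set intersection becomes a bitwise AND and cardinality becomes a popcount. A toy illustration with made-up indices:

# Bit-vector encoding: which points a rule covers, and how many are positive.
covered_by_rule = sum(1 << i for i in [0, 2, 5])   # rule covers points 0, 2, 5
positive_points = sum(1 << i for i in [2, 3, 5])   # points with label 1
covered_positives = covered_by_rule & positive_points
print(bin(covered_positives).count('1'))           # -> 2 covered positive points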
Example no. 18
def write_frequent_itemsets(input_path, output_path, support=-10, min_set_size=1, max_set_size=3):
  # parse transactions from file
  transactions = parser.parse_csv_to_mat(input_path)
  
  # mine frequent itemsets
  frequent_itemsets = fpgrowth(transactions, supp=support, min=min_set_size, max=max_set_size)
  
  # write result to file
  with open(output_path, 'w+') as fd:
    pickle.dump(frequent_itemsets, fd)
Example no. 19
def compute_item_distribution_in_trans(fin_str):
    item_distribution_dict = dict()
    print fin_str
    base = 5
    for fre_item in fpgrowth(bca.iter_trans_data(fin_str).values(),supp=0,zmax=1,report='[a'):
        print fre_item[0][0],fre_item[1][0]
        key = int(fre_item[1][0])/base
        item_distribution_dict.setdefault(key,0)
        item_distribution_dict[key] +=1
    for key in item_distribution_dict:
        print >> sys.stdout,'[%d,%d) itemnums is %d' %(key*base , (key+1) *base, item_distribution_dict[key])
Example no. 20
    def __call__(self, x_ns, feat_names=None):
        import pandas as pd

        def which_are_1(v):
            return list(pd.Series(range(len(v)))[map(bool, v)])

        feat_names = np.array(feat_names)
        length = float(x_ns.shape[0])
        raw = fim.fpgrowth([which_are_1(x_n) for x_n in x_ns],
                           supp=self.supp,
                           zmax=self.zmax)
        return [binary_rule(list(r), 1, feat_names[list(r)]) for (r, s) in raw]
Example no. 21
def get_freqitemsets(fname, minsupport, maxlhs):
    #minsupport is an integer percentage (e.g. 10 for 10%)
    #maxlhs is the maximum size of the lhs
    #first load the data
    data, Ydata = load_data(fname)
    #Now find frequent itemsets
    #Mine separately for each class
    data_pos = [x for i, x in enumerate(data) if Ydata[i, 0] == 0]
    data_neg = [x for i, x in enumerate(data) if Ydata[i, 0] == 1]
    assert len(data_pos) + len(data_neg) == len(data)
    Y = [0, 0]
    Y[0] = sum([1 << i for i, x in enumerate(data) if Ydata[i, 0] == 1])
    Y[1] = sum([1 << i for i, x in enumerate(data) if Ydata[i, 1] == 1])
    itemsets = [r[0] for r in fpgrowth(data_pos, supp=minsupport, zmax=maxlhs)]
    itemsets.extend(
        [r[0] for r in fpgrowth(data_neg, supp=minsupport, zmax=maxlhs)])
    itemsets = list(set(itemsets))
    print len(itemsets), 'rules mined'

    #Now form the data-vs.-lhs set
    #X[j] is the bit vector of data points that contain itemset j (that is, satisfy rule j)
    X = [0 for j in range(len(itemsets) + 1)]
    global trainingSize
    trainingSize = len(data)
    X[0] = (1 << trainingSize
            ) - 1  #the default rule satisfies all data, so all bits are 1's
    for (j, lhs) in enumerate(itemsets):
        X[j + 1] = sum(
            [1 << i for (i, xi) in enumerate(data) if set(lhs).issubset(xi)])
    #now form lhs_len
    lhs_len = [0]
    for lhs in itemsets:
        lhs_len.append(len(lhs))
    nruleslen = Counter(lhs_len)
    lhs_len = array(lhs_len)
    itemsets_all = ['null']
    itemsets_all.extend(itemsets)
    return X, Y, nruleslen, lhs_len, itemsets_all
Example no. 22
def construct_fp_test(filename, notes_fp):
    setfile = open(filename)
    reader = csv.reader(setfile, delimiter=";")
    results = []
    ids = []
    for row in reader:
        id_file = row[0]
        measure = get_measures(id_file)
        fp_song = fpgrowth(measure, report='S', zmin=2)
        result = compare_fp(notes_fp, fp_song)
        results.append(result)
        ids.append(id_file)

    return results, ids
Example no. 23
def construct_fp_test(filename, notes_fp):
    setfile= open(filename)
    reader = csv.reader(setfile, delimiter=";")
    results = []
    ids = []
    for row in reader:
        id_file = row[0]
        measure = get_measures(id_file)
        fp_song = fpgrowth(measure, report='S', zmin=2)
        result = compare_fp(notes_fp, fp_song)
        results.append(result)
        ids.append(id_file)

    return results, ids
Example no. 24
def mining_rule_and_add_candidate(trans_dict, fout_str, min_supp=-5):
    candidate_dict = dict()
    for r in fpgrowth(trans_dict.values(), target="r", supp=min_supp, zmin=2, report="[ac"):
        # for r in fpgrowth(trans_dict.values(),supp=-5,zmin = 1,report='[a'):
        # r:(item_body,(item_head1,item_head2),[values])
        print >>sys.stdout, r
        item_heads_set = set(r[1])
        for user in trans_dict:
            buy_items_set = set(trans_dict[user])
            if item_heads_set.issubset(buy_items_set):
                if r[0] not in buy_items_set:
                    candidate_dict.setdefault(user, set())
                    candidate_dict[user].add(r[0])
    fout = open(fout_str, "w")
    for user in candidate_dict:
        print >> fout, "%s,%s" % (user, "#".join(candidate_dict[user]))
    fout.close()
Example no. 25
def item_stats():
    """
    Plot stats on frequent itemset occurences
    """
    transactions = parser.parse_csv_to_mat('/Users/ahkj/Dropbox/SAAS/data/csv/sample-big/customers.txt')
    frequent_itemsets = fpgrowth(transactions, supp=0.0005, max=3 ) 
    frequencies_1=[]
    frequencies_2 =[]
    frequencies_3 = []
    for frequent_itemset in frequent_itemsets:
        if len(frequent_itemset[0]) == 1:
            frequencies_1.append(frequent_itemset[1][0])
        elif len(frequent_itemset[0]) == 2:
            frequencies_2.append(frequent_itemset[1][0])
        elif len(frequent_itemset[0]) == 3:
            frequencies_3.append(frequent_itemset[1][0])

    frequencies_counts_1 = [0 for x in range(max(frequencies_1)+1)]
    frequencies_counts_2 = [0 for x in range(max(frequencies_2)+1)]
    frequencies_counts_3 = [0 for x in range(max(frequencies_3)+1)]

    for frequencie in frequencies_1:
        frequencies_counts_1[frequencie]+=1

    for frequencie in frequencies_2:
        frequencies_counts_2[frequencie]+=1

    for frequencie in frequencies_3:
        frequencies_counts_3[frequencie]+=1


    cleaned_ys_1 = frequencies_counts_1[0:30]
    xs_1 =[x for x in range(len(cleaned_ys_1))]
    plt.scatter(xs_1, cleaned_ys_1)
    plot_item_stats(xs_1, cleaned_ys_1, '../tmp/plots/item_stats/signletons.png')

    cleaned_ys_2 = frequencies_counts_2[0:30]
    xs_2 =[x for x in range(len(cleaned_ys_2))]
    plot_item_stats(xs_2, cleaned_ys_2, '../tmp/plots/item_stats/pairs.png')
    

    cleaned_ys_3 = frequencies_counts_3[0:30]
    xs_3 =[x for x in range(len(cleaned_ys_3))]
    plot_item_stats(xs_3, cleaned_ys_3, '../tmp/plots/item_stats/triples.png')

# item_stats()
Example no. 26
def queryGene(D1, thre):
    """
    Use fpgrowth to generate a finite queries pool

    :param D1: local database {'uniqueid':['database'. 'laboratory']}
    :param thre: threshold of queries' frequency
    :return: a closed frequency itemset of local database
    """
    D1bags = []
    for k, v in D1.iteritems():
        D1bags.append(v)
    queries_old = fim.fpgrowth(D1bags, 'c', 100.0 * thre / len(D1bags))
    queries = {}
    for i in queries_old:
        queries[frozenset(i[0])] = i[1]
    print >> perr, len(queries), 'queries generated in total.'
    return queries
Example no. 27
    def __call__(self, data):
        def which_are_1(v):
            return list(pd.Series(range(len(v)))[map(bool,v)])
        length = float(len(data))
        import pdb
        raw = fim.fpgrowth([which_are_1(x_n) for x_n in data.x_ns], supp = self.supp, zmax = self.zmax)
        data_idx = hash(data)
#        for (r,s) in raw:
#            try:
#                print data.x_names[r]
#            except:
#                pdb.set_trace()


        if data.x_names != None:
            return [rule_f((data_idx,i), r, s[0]/length, list(data.x_names[list(r)])) for (i, (r, s)) in enumerate(raw)]
        else:
            return [rule_f((data_idx,i), r, s[0]/length) for (i, (r, s)) in enumerate(raw)]
Example no. 28
def collaborationDiscovery(papers):

    support = 9
    frequentCollaborators = []

    allAuthorsPerPaper = []
    for key, value in papers.items():
        if 'affiliations' in papers[key]:
            authorsPerPaper = set()
            for affiliation in papers[key]['affiliations']:
                authorsPerPaper.add(authors[affiliation['aid']])
            allAuthorsPerPaper.append(authorsPerPaper)

    patterns = fpgrowth(allAuthorsPerPaper, supp=-support)

    for pattern, support in sorted(patterns, key=lambda x: -x[1]):
        if len(pattern) > 1:
            frequentCollaborators.append((pattern, support))

    return frequentCollaborators
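Note the sign convention in supp=-support: pyfim treats a negative support as an absolute threshold, so the call above asks for author sets that co-occur on at least `support` papers rather than a percentage. A minimal sketch with invented baskets:

# supp=-2: keep itemsets appearing in at least 2 transactions (absolute count).
from fim import fpgrowth

baskets = [['alice', 'bob'], ['alice', 'bob'], ['alice', 'carol']]
print(fpgrowth(baskets, supp=-2, zmin=2))   # only the alice/bob pair survives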
Example no. 29
    def generate_rules(self,supp,maxlen,N, need_negcode = False,njobs = 5, method = 'fpgrowth',criteria = 'IG',add_rules = []):
        self.maxlen = maxlen
        self.supp = supp
        if method =='fpgrowth':
            print('Using fpgrowth to generate rules with support {} and max length {}'.format(supp,maxlen))
            itemMatrix = [[item for item in self.df.columns if row[item] ==1] for i,row in self.df.iterrows() ]  
            pindex = np.where(self.Y==1)[0]
            nindex = np.where(self.Y!=1)[0]
            start_time = time.time()
            rules= fpgrowth([itemMatrix[i] for i in pindex],supp = supp,zmin = 1,zmax = maxlen)
            rules = [np.sort(x[0]).tolist() for x in rules]
            df = self.df
        else:
            print('Using random forest to generate rules ...')
            rules = []
            start_time = time.time()

            for length in range(2,maxlen+1,1):
                n_estimators = 500*length# min(5000,int(min(comb(df.shape[1], length, exact=True),10000/maxlen)))
                clf = RandomForestClassifier(n_estimators = n_estimators,max_depth = length)
                clf.fit(self.df.iloc[:,list(range(int(self.df.shape[1]/2)))],self.Y)
                for n in range(n_estimators):
                    rules.extend(extract_rules(clf.estimators_[n],self.df.columns[:int(self.df.shape[1]/2)]))
            rules = [list(x) for x in set(tuple(np.sort(x)) for x in rules)]   
            df = 1-self.df 
            df.columns = [name.strip() + 'neg' for name in self.df.columns]
            df = pd.concat([self.df,df],axis = 1)

        self.generate_time = time.time() - start_time
        print('\tTook %0.3fs to generate %d rules' % (self.generate_time, len(rules)))
        count = 0
        index = []
        for rule in add_rules:
            if np.sort(rule).tolist()  not in rules:
                rules.append(rule)
                index.append(len(rules)-1)
            else:
                index.append(rules.index(rule))
        self.rulespace = [len(rules)]
        self.all_rulelen = np.array([len(rule) for rule in rules])
        self.screen_rules(rules,df,N,supp,criteria,njobs,index) # select the top N rules using secondary criteria, information gain
Example no. 30
def compute_frequent_transactions(lsynchs, sup, lsensors):
    """
    Applies FP-growth for finding the frequent transactions in the synchronizations

    :return:
    """
    ltrans = []

    for synch in lsynchs:
        trans = []
        for sn, _, cl in synch:
            trans.append('%s-C%s'%(lsensors[sn],str(cl)))
        ltrans.append(trans)

    lfreq = []
    cnt_len = np.zeros(len(lsensors))
    for itemset, sval in fpgrowth(ltrans, supp=-sup, zmin=2, target='m'):
        lfreq.append((itemset, sval))
        cnt_len[len(itemset)-2] += 1

    return lfreq, cnt_len
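Here target='m' restricts the output to maximal frequent itemsets (those with no frequent superset), and supp=-sup is again an absolute count, which is what keeps lfreq compact. A small hedged example with invented sensor-cluster labels:

# target='m' -> maximal itemsets only; zmin=2 drops singletons.
from fim import fpgrowth

trans = [['s1-C0', 's2-C1', 's3-C0'], ['s1-C0', 's2-C1'], ['s2-C1', 's3-C0']]
for itemset, support in fpgrowth(trans, supp=-2, zmin=2, target='m'):
    print(itemset, support)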
Example no. 31
    def generate_rules(self, X_trans, y):

        itemNames = dict()
        for i, item in enumerate(X_trans.columns):
            itemNames[i + 1] = item
        self.itemNames = itemNames

        if self.method == 'fpgrowth':
            from fim import fpgrowth, fim
            items = np.arange(1, len(X_trans.columns) + 1)
            itemMatrix = (X_trans * items).to_numpy()
            itemMatrix_numerical = np.array(
                [row[row > 0] for row in itemMatrix])
            rules = fpgrowth(itemMatrix_numerical[np.where(y == 1)].tolist(),
                             supp=self.support,
                             zmin=1,
                             zmax=self.maxlen)
            self.rules = [sorted(rule[0]) for rule in rules]
        else:
            items = np.arange(1, len(X_trans.columns) + 1)
            rules = []
            for length in range(1, self.maxlen + 1):
                # if the dataset is too small, this will complain that
                # n_estimators cannot be larger than the number of
                # possible trees
                n_estimators = self.forest_size * length
                clf = RandomForestClassifier(n_estimators=n_estimators,
                                             max_depth=length)
                clf.fit(X_trans, y)
                for n in range(n_estimators):
                    rules.extend(extract_rules(clf.estimators_[n], items))
            # To-do: not sure which one is faster, needs to test on a large dataset
            rules = [list(x) for x in set(tuple(np.sort(y)) for y in rules)]
            # rules = [list(x) for x in remove_duplicates(tuple(np.sort(y)) for y in rules)]
            self.rules = rules
        # this needs to be modified, because
        # it needs user to add numerical version of the rules
        for add_rule in self.add_rules:
            if np.sort(add_rule).tolist() not in self.rules:
                self.rules.append(add_rule)
Example no. 32
def getfrequentitems(data, cl, minsupp, maxlhs):
    # returns all patterns/frequent itemsets given a certain minsupp and max length (maxlhs)
    itemsets = []
    for c in cl:
        #start_time = time.time()
        data_aux = [t.difference(c) for t in data if c <= t]
        #print("Time for set difference: " +str(time.time()-start_time))
        start_time = time.time()
        itemsets.extend([
            r[0] for r in fpgrowth(data_aux, supp=minsupp, zmin=2, zmax=maxlhs)
        ])
        #print("Time for set fpgrowth: " +str(time.time()-start_time))

    #remove repeated sets
    #start_time = time.time()
    itemsets = list(set(itemsets))
    #print("Time for list set transform: " +str(time.time()-start_time))

    #start_time = time.time()
    itemsets.sort(key=len)
    #print("Time for sorting list: " +str(time.time()-start_time))
    return itemsets
Example no. 33
 def generate_rules(self, supp, maxlen, N, method='randomforest'):
     self.maxlen = maxlen
     self.supp = supp
     df = 1 - self.df  #df has negative associations
     df.columns = [name.strip() + '_neg' for name in self.df.columns]
     df = pd.concat([self.df, df], axis=1)
     if method == 'fpgrowth' and maxlen <= 3:
         itemMatrix = [[item for item in df.columns if row[item] == 1]
                       for i, row in df.iterrows()]
         pindex = np.where(self.Y == 1)[0]
         nindex = np.where(self.Y != 1)[0]
         print 'Generating rules using fpgrowth'
         start_time = time.time()
         rules = fpgrowth([itemMatrix[i] for i in pindex],
                          supp=supp,
                          zmin=1,
                          zmax=maxlen)
         rules = [tuple(np.sort(rule[0])) for rule in rules]
         rules = list(set(rules))
         print '\tTook %0.3fs to generate %d rules' % (
             time.time() - start_time, len(rules))
     else:
         rules = []
         start_time = time.time()
         for length in xrange(1, maxlen + 1, 1):
             n_estimators = min(pow(df.shape[1], length), 4000)
             clf = RandomForestClassifier(n_estimators=n_estimators,
                                          max_depth=length)
             clf.fit(self.df, self.Y)
             for n in xrange(n_estimators):
                 rules.extend(extract_rules(clf.estimators_[n], df.columns))
         rules = [list(x) for x in set(tuple(x) for x in rules)]
         print '\tTook %0.3fs to generate %d rules' % (
             time.time() - start_time, len(rules))
     self.screen_rules(
         rules, df, N
     )  # select the top N rules using secondary criteria, information gain
     self.getPatternSpace()
Example no. 34
def get_freq_itemsets(data, y, min_support=50, max_lhs=2):
    """
    Xtrain,Ytrain,nruleslen,lhs_len,itemsets = get_freqitemsets(fname+'_train',minsupport,maxlhs) #Do frequent itemset mining from the training data
    """

    if y.shape[-1] == 2:
        # currently only mine itemsets for binary classification
        data_pos = [x for i, x in enumerate(data) if y[i, 0] == 0]
        data_neg = [x for i, x in enumerate(data) if y[i, 0] == 1]

        print(len(data_pos))
        print(len(data_neg))
        print(len(data))
        assert len(data_pos) + len(data_neg) == len(data)

        itemsets = [
            r[0] for r in fpgrowth(data_pos, supp=min_support, zmax=max_lhs)
        ]
        itemsets.extend(
            [r[0] for r in fpgrowth(data_neg, supp=min_support, zmax=max_lhs)])
    else:
        raise NotImplementedError

    itemsets = list(set(itemsets))
    print("{} rules mined".format(len(itemsets)))

    # build S (antecedent vs. datapoint matrix)
    # S[i] is the i-th antecedent
    # S[0] is for the default rule (which satisfies all data)

    print("Building S...")
    """
    S = [set() for _ in range(len(itemsets) + 1)]
    S[0] = set(range(len(data)))

    for j, lhs in enumerate(itemsets):
        s_lhs = set(lhs)
        S[j+1] = set([i for i, xi in enumerate(data) if s_lhs.issubset(xi)])
    """

    n_antes = len(itemsets)

    S = np.zeros((n_antes + 1, len(data)))
    S[0] = 1.
    for j, lhs in enumerate(itemsets):
        s_lhs = set(lhs)
        for i, xi in enumerate(data):
            S[j + 1, i] = s_lhs.issubset(xi)

    S = S.transpose()
    print("S built.")

    # get the cardinality of each antecendent
    # default rule has cardinality 0
    lhs_len = [0]
    lhs_len.extend([len(lhs) for lhs in itemsets])

    lhs_len = np.array(lhs_len)
    itemsets = ['null'] + itemsets

    return S, lhs_len, itemsets
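The S matrix above is filled by scanning every data point once per antecedent. A hedged alternative (not part of the original code): precompute, for each item, the set of data points containing it, then intersect those sets per antecedent; this builds the data-by-antecedent matrix directly in its final (transposed) orientation.

import numpy as np

def build_S(data, itemsets):
    # item -> indices of the data points that contain it
    item_cover = {}
    for i, xi in enumerate(data):
        for item in xi:
            item_cover.setdefault(item, set()).add(i)
    S = np.zeros((len(data), len(itemsets) + 1))
    S[:, 0] = 1.0  # the default rule covers every data point
    for j, lhs in enumerate(itemsets):
        covered = set.intersection(*(item_cover.get(item, set()) for item in lhs))
        S[list(covered), j + 1] = 1.0
    return S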
#     for row in range(filtered.shape[0]):
#         to_append = list(filtered.indices[filtered.indptr[row]:filtered.indptr[row + 1]]
#                          [np.argsort(filtered.data[filtered.indptr[row]:filtered.indptr[row + 1]])])
#         sequences_spm.append(to_append)
#     save_obj(name="sequences_cat1_" + str(i), obj=sequences_spm, path=ROOT_DIR + '/data/cat1/')

costante_di_popolarita = 15

pred_lil = sps.lil_matrix((10000, 2262292))

for i in tqdm(range(1000,2000)):
    sequences = load_obj(path=ROOT_DIR+'/data/cat1/', name='sequences_cat1_'+str(i))
    popularity = len(sequences)
    preds_line = np.zeros(2262292)

    for seq in fpgrowth(sequences,supp= -popularity/costante_di_popolarita, target='m'):
        for song in seq[0]:
            preds_line[song]+= seq[1]*(len(seq[0])-1)*(len(seq[0])-1)
    vals = fast_argpart(preds_line)

    pred_lil[i,vals] = preds_line[vals]


eurm = sps.csr_matrix(pred_lil)
eurm = eurm_remove_seed(eurm , dr )
rec_list = eurm_to_recommendation_list(eurm)
ev.evaluate(rec_list, "cat2_spm_max",verbose=True, do_plot=True, show_plot=True, save=True )

exit()

# # parallel association rule.
def association_rule(i):
    sequences = load_obj(path=ROOT_DIR + '/data/cat1/', name='sequences_cat1_' + str(i))
    popularity_iniziale = len(sequences)
    preds_line = np.zeros(2262292)

    if popularity_iniziale > 2000:
        mean_len = 0
        for seq in sequences:
            mean_len += len(seq)
        mean_len = mean_len / len(sequences)

        count = 0
        for j in range(len(sequences)):
            if len(sequences[j]) > (mean_len * 2) or len(sequences[j]) < (mean_len / 2):
                sequences[j] = []
                count += 1
        popularity = popularity_iniziale - count

        print(i, "iniziale",popularity_iniziale, "new_pop", popularity, "rimosse", count, " mean_l", mean_len, "num_seq", len(sequences))

        if popularity > 2000:
            mean_len = 0
            for seq in sequences:
                mean_len += len(seq)
            mean_len = mean_len / len(sequences)

            count = 0
            for j in range(len(sequences)):
                if len(sequences[j]) > (mean_len * 2) or len(sequences[j]) < (mean_len / 2):
                    sequences[j] = []
                    count += 1
            popularity -= count

            print(i, popularity_iniziale, "new_pop", popularity, "rimosse", count, " mean_l", mean_len, "num_seq",
                  len(sequences))

        if popularity > 2000:
            mean_len = 0
            for seq in sequences:
                mean_len += len(seq)
            mean_len = mean_len / len(sequences)

            count = 0
            for j in range(len(sequences)):
                if len(sequences[j]) > (mean_len * 2) or len(sequences[j]) < (mean_len / 2):
                    sequences[j] = []
                    count += 1
            popularity -= count
            print(i, popularity_iniziale, "new_pop", popularity, "rimosse", count, " mean_l", mean_len, "num_seq",
                  len(sequences))

    sequences = [seq for seq in sequences if len(seq) > 0]
    const = costante_di_pop

    sequences = fpgrowth(sequences, supp=-popularity / const, target=target)

    for seq in sequences:
        for song in seq[0]:
            preds_line[song] += seq[1] * (len(seq[0]) - 1) * (len(seq[0]) - 1)
    indices = fast_argpart(preds_line)

    preds_line_lil = sps.lil_matrix((1, 2262292))
    vals = fast_argpart(preds_line)
    preds_line_lil[0, vals] = preds_line[vals]

    del sequences, indices, preds_line, vals,
    gc.collect()
    print("nnz", preds_line_lil.nnz)

    return preds_line_lil
Example no. 37
def cross_validation_compact(transactions, sample_pct=0.50, support=-3, all_frequent_items=None):
    from fim import fpgrowth
    """
    Cross validation. Using compact representation from
    Forward.
    """
    # init
    _id = str(time()).replace('.','')
    # if all_frequent_items is None:
    #     all_frequent_items = fpgrowth(transactions, supp=support, min=1, max=3)

    cv_start = time()
    print "\n### Running cross validation {}###".format(_id)
    print "Total transactions:{}".format(len(transactions))
    # print "Total frequest items:{}".format(len(all_frequent_items))

    # run results
    avg_errors = []
    var_errors = []

    # all_triangles, all_triples = filter_items(all_frequent_items)

    for chunk, index, rest in chunks(transactions, int(len(transactions) * sample_pct)):# TODO insert proper sampling

        all_frequent_items = fpgrowth(rest, supp=support, min=1, max=3)
        all_triangles, all_triples = Forward.forward_compact(all_frequent_items)

        # Get triples for estimates
        frequent_items = fpgrowth(chunk, supp=support, min=1, max=3)
        if len(frequent_items) > 0:
            print 'frequent items: {}'.format(len(frequent_items))
        else:
            print 'No frequent items in chunk: {}'.format(index)
            continue
        triangle_tree, triples = Forward.forward_compact(frequent_items)
        print 'triangle roots: {}'.format(len(triangle_tree))

        estimates = []
        observations = []
        abs_errors = []
        max_est = 0
        max_obs = 0

        # DFS of the tree holding all triangles
        for n1 in triangle_tree.keys():
            s1, s2_dict = triangle_tree[n1]
            for n2 in s2_dict.keys():
                s2, s12, s3_dict = s2_dict[n2]
                for n3 in s3_dict.keys():
                    s3, s13, s23, s123 = s3_dict[n3]

                    est = ent.maxent_est_rosa(s1, s2, s3, s12, s23, s13, float(len(transactions)-len(chunk)), num=int(math.log(len(transactions), 2))+1)

                    # maxumum estiamte seen (for plotting)
                    max_est = max(max_est, est)

                    # record the estimate
                    estimates.append(est)

                    # from all observed triples get the actual observed number of triples
                    observed = 0
                    if all_triples.has_key((n1, n2, n3)):
                        observed = all_triples[(n1, n2, n3)]

                    # maximum observation of the triple (for plotting)
                    max_obs = max(max_obs, observed)

                    # record the observed
                    observations.append(observed)

                    # record abs error
                    error = abs(observed-est) / float(observed) * 100
                    abs_errors.append(error)


        if len(abs_errors) > 0: #TODO handle this, probably when nothing has been found
            # evaluation
            min_error = min(abs_errors)
            max_error = max(abs_errors)
            avg_error = sum(abs_errors) / float(len(abs_errors))
            avg_errors.append(avg_error)
            var_error = 0
            if len(abs_errors) > 1:
                var_error = tvar(abs_errors) #tvar is the sample variance
            var_errors.append(var_error)

            res_string = "\nResult:\nSample size:{} min_error:{} max_error:{} avg_error:{} var_error:{}".format(len(chunk), min_error, max_error, avg_errors[-1], var_error)
            print res_string
        else:
            print 'No abs errors!'

    print "Cross validation done!"
    print "time: ", (time() - cv_start)
    total_avg_error = sum(avg_errors)/float(len(avg_errors))
    total_res_string = "Avg error:{}".format(total_avg_error)
Example no. 38
# In[5]:

# http://www.borgelt.net/pyfim.html
from fim import apriori, fpgrowth

patterns = apriori(transactions, supp=-3)  # +: percentage -: absolute number
# output
print '-------- Apriori --------'
for (pattern, support) in sorted(patterns, key=lambda x: -x[1]):
    print pattern, support
print 'Number of patterns:', len(patterns)

# In[6]:

patterns = fpgrowth(transactions, supp=-3)
# output
print '-------- FP-Growth --------'
for (pattern, support) in sorted(patterns, key=lambda x: -x[1]):
    print pattern, support
print 'Number of patterns:', len(patterns)

# In[7]:

patterns = fpgrowth(transactions, target='c', supp=-2, zmin=2)
# output
print '-------- Closed Non-single Itemsets --------'
for (pattern, support) in sorted(patterns, key=lambda x: -x[1]):
    print pattern, support
print 'Number of patterns:', len(patterns)
Example no. 39
def cross_validation(transactions, sample_pct=0.50, support=-3, all_frequent_items=None):
    from fim import fpgrowth
    """
    Cross validation, 'old' version not using compatct
    triangle representation from Forward.
    """
    # init
    _id = str(time()).replace('.','')
    # if all_frequent_items is None:
    #     all_frequent_items = fpgrowth(transactions, supp=support, min=1, max=3)

    cv_start = time()
    print "\n### Running cross validation {}###".format(_id)
    print "Total transactions:{}".format(len(transactions))
    # print "Total frequest items:{}".format(len(all_frequent_items))

    # run results
    avg_errors = []
    var_errors = []

    # all_triangles, all_triples = filter_items(all_frequent_items)

    for chunk, index, rest in chunks(transactions, int(len(transactions) * sample_pct)):# TODO insert proper sampling

        all_frequent_items = fpgrowth(rest, supp=support, min=1, max=3)
        all_triangles, all_triples = Forward.forward(all_frequent_items)

        # Get triples for estimates
        frequent_items = fpgrowth(chunk, supp=support, min=1, max=3)
        if len(frequent_items) > 0:
            print 'frequent items: {}'.format(len(frequent_items))
        else:
            print 'No frequent items in chunk: {}'.format(index)
            continue
        triangles, triples = Forward.forward(frequent_items)
        print 'triangles: {}'.format(len(triangles))

        estimates = []
        observations = []
        abs_errors = []
        max_est = 0
        max_obs = 0

        for (s1, s2, s3, s12, s23, s13, s123) in triangles:

            # if s123[1] != 0:
            #     continue
            # maxent estimate from the sample.
            # Index [1] of the tuples hold the # occurences in the sample
            est = ent.maxent_est_rosa(s1[1], s2[1], s3[1], s12[1], s23[1], s13[1], float(len(transactions)-len(chunk)), num=int(math.log(len(transactions), 2))+1)

            # maxumum estiamte seen (for plotting)
            max_est = max(max_est, est)

            # record the estimate
            estimates.append(est)

            # from all observed triples get the actual observed number of triples
            observed = 0
            if all_triples.has_key(s123[0]):
                observed = all_triples[s123[0]]

            # maximum observation of the triple (for plotting)
            max_obs = max(max_obs, observed)

            # record the observed
            observations.append(observed)

            # record abs error
            error = abs(observed-est) / float(observed) * 100
            abs_errors.append(error)



        if len(abs_errors) > 0: #TODO handle this, probably when nothing has been found
            # evaluation
            min_error = min(abs_errors)
            max_error = max(abs_errors)
            avg_error = sum(abs_errors) / float(len(abs_errors))
            avg_errors.append(avg_error)
            var_error = 0
            if len(abs_errors) > 1:
                var_error = tvar(abs_errors) #tvar is the sample variance
            var_errors.append(var_error)

            # TODO histogram of the average errors. max-ent, extrapolation, heurestic
            # TODO print average error og the average errors to the log.

            res_string = "\nResult:\nSample size:{} min_error:{} max_error:{} avg_error:{} var_error:{}".format(len(chunk), min_error, max_error, avg_errors[-1], var_error)
            print res_string
        else:
            print 'No abs errors!'

    print "Cross validation done!"
    print "time: ", (time() - cv_start)
    total_avg_error = sum(avg_errors)/float(len(avg_errors))
    total_res_string = "Avg error:{}".format(total_avg_error)
    return path
]


word_counts = Counter()
for w in wymagania:
    word_counts.update(w)

word_counts.most_common(100)

wymagania = [ w for w in wymagania if w ]


itemsets = fim.fpgrowth(wymagania, 
                        target = 'r',
                        zmin = 2, zmax = 2,
                        supp = 0.40,
                        conf = 20,
                        eval = 'l', 
                        report = '(acl')

'''
nodes = [
    {"id": id, "group":1} for id, count in word_counts.items()
]

links = [
    {"source":left, "target":right[0], "value": math.log(numbers[2] + 1) }
    for left, right, numbers in itemsets
]

graph = {
Example no. 41
# Read the csv file
data_frame = pd.read_csv("./data/newTrainSet.csv")

# Get all unique order ids
array = data_frame.order_id.unique()

# Build a list with all orders
data_list = []
for p in data_frame.order_id.unique():
    data_list.append(
        (data_frame[data_frame['order_id'] == p].product_id).tolist())

# Run the FP-Growth algorithm
result = fpgrowth(data_list,
                  supp=min_sup,
                  conf=min_conf,
                  target='r',
                  report='XC')

# Write the results to a file
i = 0
for p in result:
    filename = "res"
    filename = filename + str(i) + ".out"
    i = i + 1
    f = open(filename, 'w')
    f.write(repr(p))
    f.close()

concat_files(0, i)
id_to_name_pyfim()
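# A hedged alternative sketch for building data_list above: pandas groupby scans the
# frame once instead of re-filtering it for every order_id, which is usually faster on
# large order tables. Column names are taken from the snippet above; min_sup, min_conf
# and the fpgrowth import are assumed to be defined elsewhere in the script.
import pandas as pd

data_frame = pd.read_csv("./data/newTrainSet.csv")
data_list = (data_frame.groupby('order_id')['product_id']
             .apply(list)
             .tolist())
# result = fpgrowth(data_list, supp=min_sup, conf=min_conf, target='r', report='XC')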
Esempio n. 42
0
def get_association_rules(transactions):
    result = fpgrowth(transactions, target='r', conf=80, eval='c', report='hbalc')
    result = sorted(result, key=lambda x: (-x[-1], -x[-2]))[:10]
    return [(x[1], x[0]) for x in result]
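# A small usage sketch for get_association_rules above (toy transactions, illustrative
# only). It assumes fpgrowth comes from the pyfim package (from fim import fpgrowth)
# and prints up to the 10 strongest (body, head) pairs, ranked by confidence and lift.
toy_transactions = [
    ['bread', 'butter', 'milk'],
    ['bread', 'butter'],
    ['bread', 'butter', 'eggs'],
    ['bread', 'milk'],
    ['butter', 'bread'],
]
for body, head in get_association_rules(toy_transactions):
    print('{} -> {}'.format(body, head))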
Esempio n. 43
0
tid = int(argv[1])
if   tid < -2:
    print(fpgrowth.__doc__)
elif tid < -1:
    print(eclat.__doc__)
elif tid <  0:
    print(apriori.__doc__)
else:
    tracts = [ [ 1, 2, 3 ],
               [ 1, 4, 5 ],
               [ 2, 3, 4 ],
               [ 1, 2, 3, 4 ],
               [ 2, 3 ],
               [ 1, 2, 4 ],
               [ 4, 5 ],
               [ 1, 2, 3, 4 ],
               [ 3, 4, 5 ],
               [ 1, 2, 3 ] ]
    print('transactions:')
    for t in tracts: print(t)
    if   tid < 1:
        print  ('apriori(tracts, supp=-3, zmin=2):')
        for r in apriori(tracts, supp=-3, zmin=2): print r
    elif tid < 2:
        print  ('eclat(tracts, supp=-3, zmin=2):')
        for r in eclat(tracts, supp=-3, zmin=2): print r
    elif tid < 3:
        print  ('fpgrowth(tracts, supp=-3, zmin=2):')
        for r in fpgrowth(tracts, supp=-3, zmin=2): print r
    else:
        print  ('fim(tracts, supp=-3, zmin=2, report=\'#\'):')
        for r in fim(tracts, supp=-3, zmin=2, report='#'): print r
Esempio n. 44
0
    def fit(self,
            X,
            y,
            feature_labels=[],
            undiscretized_features=[],
            verbose=False):
        """Fit rule lists to data

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Training data 

        y : array_like, shape = [n_samples]
            Labels
            
        feature_labels : array_like, shape = [n_features], optional (default: [])
            String labels for each feature. If empty and X is a DataFrame, column 
            labels are used. If empty and X is not a DataFrame, then features are  
            simply enumerated
            
        undiscretized_features : array_like, shape = [n_features], optional (default: [])
            String labels for each feature which is NOT to be discretized. If empty, all numeric features are discretized
            
        verbose : bool
            Currently doesn't do anything

        Returns
        -------
        self : returns an instance of self.
        """
        self.seed()

        if len(set(y)) != 2:
            raise Exception(
                "Only binary classification is supported at this time!")

        # deal with pandas data
        if type(X) in [pd.DataFrame, pd.Series]:
            X = X.values
        if type(y) in [pd.DataFrame, pd.Series]:
            y = y.values

        X, y = self._setdata(X, y, feature_labels, undiscretized_features)

        permsdic = defaultdict(
            default_permsdic)  # the MCMC results will be stored here

        data = list(X[:])
        # Now find frequent itemsets
        # Mine separately for each class
        data_pos = [x for i, x in enumerate(data) if y[i] == 0]
        data_neg = [x for i, x in enumerate(data) if y[i] == 1]
        assert len(data_pos) + len(data_neg) == len(data)
        try:
            itemsets = [
                r[0] for r in fpgrowth(data_pos,
                                       supp=self.minsupport,
                                       zmin=self._zmin,
                                       zmax=self.maxcardinality)
            ]
            itemsets.extend([
                r[0] for r in fpgrowth(data_neg,
                                       supp=self.minsupport,
                                       zmin=self._zmin,
                                       zmax=self.maxcardinality)
            ])
        except TypeError:
            itemsets = [
                r[0] for r in fpgrowth(data_pos,
                                       supp=self.minsupport,
                                       min=self._zmin,
                                       max=self.maxcardinality)
            ]
            itemsets.extend([
                r[0] for r in fpgrowth(data_neg,
                                       supp=self.minsupport,
                                       min=self._zmin,
                                       max=self.maxcardinality)
            ])
        itemsets = list(set(itemsets))
        if self.verbose:
            print(len(itemsets), 'rules mined')
        # Now form the data-vs.-lhs set
        # X[j] is the set of data points that contain itemset j (that is, satisfy rule j)
        X = [set() for j in range(len(itemsets) + 1)]
        X[0] = set(range(len(data)))  # the default rule satisfies all data
        for (j, lhs) in enumerate(itemsets):
            X[j + 1] = set(
                [i for (i, xi) in enumerate(data) if set(lhs).issubset(xi)])
        # now form lhs_len
        lhs_len = [0]
        for lhs in itemsets:
            lhs_len.append(len(lhs))
        nruleslen = Counter(lhs_len)
        lhs_len = array(lhs_len)
        itemsets_all = ['null']
        itemsets_all.extend(itemsets)

        Xtrain, Ytrain, nruleslen, lhs_len, self.itemsets = (
            X, np.vstack((1 - np.array(y),
                          y)).T.astype(int), nruleslen, lhs_len, itemsets_all)

        # Do MCMC
        res, Rhat = run_bdl_multichain_serial(self.max_iter,
                                              self.thinning,
                                              self.alpha,
                                              self.listlengthprior,
                                              self.listwidthprior,
                                              Xtrain,
                                              Ytrain,
                                              nruleslen,
                                              lhs_len,
                                              self.maxcardinality,
                                              permsdic,
                                              self.burnin,
                                              self.n_chains,
                                              [None] * self.n_chains,
                                              verbose=self.verbose,
                                              seed=self.random_state)

        # Merge the chains
        permsdic = merge_chains(res)

        ###The point estimate, BRL-point
        self.d_star = get_point_estimate(
            permsdic,
            lhs_len,
            Xtrain,
            Ytrain,
            self.alpha,
            nruleslen,
            self.maxcardinality,
            self.listlengthprior,
            self.listwidthprior,
            verbose=self.verbose)  # get the point estimate

        if self.d_star:
            # Compute the rule consequent
            self.theta, self.ci_theta = get_rule_rhs(Xtrain, Ytrain,
                                                     self.d_star, self.alpha,
                                                     True)

        return self
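# A minimal standalone sketch (toy data, not from the source) of the "data-vs.-lhs"
# construction used in fit() above: for each mined itemset (rule LHS), collect the set
# of row indices whose feature set contains it, with index 0 reserved for the default
# rule that matches every row.
def build_rule_satisfaction_sets(data, itemsets):
    X = [set(range(len(data)))]  # X[0]: the default rule satisfies all rows
    for lhs in itemsets:
        X.append(set(i for i, xi in enumerate(data) if set(lhs).issubset(xi)))
    return X

toy_data = [{'age<30', 'smoker'}, {'age<30'}, {'age>=30', 'smoker'}]
toy_itemsets = [('smoker',), ('age<30', 'smoker')]
# build_rule_satisfaction_sets(toy_data, toy_itemsets) -> [{0, 1, 2}, {0, 2}, {0}]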
Esempio n. 45
0
def minging_rule(trans_dict, fout_str, min_supp=-5):
    fout = open(fout_str, "w")
    for r in fpgrowth(trans_dict.values(), target="r", supp=min_supp, conf=50, zmin=2, report="[ac"):
        print >> fout, "%s,%s,%d#%.2f" % (r[0], "#".join(r[1]), r[2][0], r[2][1])
    fout.close()
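# A small usage sketch for minging_rule above (toy data, illustrative only). The
# transaction database is a dict whose values are the transactions; each rule is
# written to the output file as "head,body-items-joined-by-#,support#confidence".
toy_trans = {
    'order-1': ['bread', 'butter', 'milk'],
    'order-2': ['bread', 'butter'],
    'order-3': ['bread', 'butter', 'eggs'],
    'order-4': ['butter', 'milk'],
}
# minging_rule(toy_trans, "rules.out", min_supp=-2)  # -2: absolute support of 2 transactions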
Esempio n. 46
0
def fpgrowth(tracts,
             target='s',
             min_c=2,
             min_z=2,
             max=None,
             report='a',
             algo='s'):
    '''
    Find frequent item sets with the fpgrowth algorithm.

    INPUT:
        tracts [list of lists]
            transaction database to mine. The database must be an iterable of
            transactions; each transaction must be an iterable of items; each
            item must be a hashable object. If the database is a dictionary,
            the transactions are the keys, the values their (integer)
            multiplicities.
        target [str. Default: 's']
            type of frequent item sets to find
            s/a:   sets/all   all     frequent item sets
            c  :   closed     closed  frequent item sets
            m  :   maximal    maximal frequent item sets
            g  :   gens       generators
        min_c [int. Default: 2]
            minimum support of an item set
            (positive: absolute number, negative: percentage)
        min_z  [int. Default: 2]
            minimum number of items per item set
        max  [int. Default: no limit]
            maximum number of items per item set
        report  [str. Default: 'a']
            values to report with an item set
            a     absolute item set support (number of transactions)
            s     relative item set support as a fraction
            S     relative item set support as a percentage
            e     value of item set evaluation measure
            E     value of item set evaluation measure as a percentage
            #     pattern spectrum instead of full pattern set
        algo [str. Default: 's']
            algorithm variant to use:
            s     simple     simple  tree nodes with only link and parent
            c     complex    complex tree nodes with children and siblings
            d     single     top-down processing on a single prefix tree
            t     topdown    top-down processing of the prefix trees
            Variant d does not support closed/maximal item set mining.

    OUTPUT:
        * If *report* == 'a'/'s'/'S'/'e'/'E' return a list of pairs, each
          consisting of a frequent itemset (as a tuple of unit IDs) and a
          value representing that itemset's support or evaluation measure
        * If *report* == '#', return a pattern spectrum as a list of triplets
          (size, supp, cnt), representing pattern size, pattern support, and
          number of patterns with that size and that support found in *tracts*
    '''
    import fim
    import numpy

    # By default, set the maximum pattern size to the number of spike trains
    if max is None:
        max = numpy.max([len(t) for t in tracts]) + 1

    # Run the original fpgrowth
    fpgrowth_output = fim.fpgrowth(tracts=tracts,
                                   target=target,
                                   supp=-min_c,
                                   min=min_z,
                                   max=max,
                                   report=report,
                                   algo='s')
    # Return the output
    if report != '#':
        return [(cfis, s[0]) for (cfis, s) in fpgrowth_output]
    else:
        return fpgrowth_output
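# A brief usage sketch for the fpgrowth wrapper above (toy transactions, illustrative
# only). Here min_c is an absolute transaction count: the wrapper passes supp=-min_c,
# and negative supp values in fim.fpgrowth denote absolute support. With the default
# report='a' it returns (itemset, absolute_support) pairs.
toy_tracts = [[1, 2, 3], [1, 2], [2, 3], [1, 2, 3], [2, 3, 4]]
# for itemset, support in fpgrowth(toy_tracts, target='s', min_c=2, min_z=2):
#     print(itemset, support)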
Esempio n. 47
0
tid = int(argv[1])
if tid < -2:
    print(fpgrowth.__doc__)
elif tid < -1:
    print(eclat.__doc__)
elif tid < 0:
    print(apriori.__doc__)
else:
    tracts = [[1, 2, 3], [1, 4, 5], [2, 3, 4], [1, 2, 3, 4], [2, 3], [1, 2, 4],
              [4, 5], [1, 2, 3, 4], [3, 4, 5], [1, 2, 3]]
    print('transactions:')
    for t in tracts:
        print(t)
    if tid < 1:
        print('apriori(tracts, supp=-3, zmin=2):')
        for r in apriori(tracts, supp=-3, zmin=2):
            print r
    elif tid < 2:
        print('eclat(tracts, supp=-3, zmin=2):')
        for r in eclat(tracts, supp=-3, zmin=2):
            print r
    elif tid < 3:
        print('fpgrowth(tracts, supp=-3, zmin=2):')
        for r in fpgrowth(tracts, supp=-3, zmin=2):
            print r
    else:
        print('fim(tracts, supp=-3, zmin=2, report=\'#\'):')
        for r in fim(tracts, supp=-3, zmin=2, report='#'):
            print r