def mine_fds(ctx, closure):
    """Enumerate the functional dependencies of a formal context.

    Implements Ganter-style next-closure enumeration of the canonical
    (Duquenne-Guigues-like) implication basis: candidate attribute sets are
    visited in lectic order, each is closed with the supplied `closure`
    operator, and every proper expansion A -> A'' is stored as an FD.

    :param ctx: formal context; only `ctx.M` (the ordered attribute set) is
        read here.
    :param closure: callable mapping an attribute set to its closure in the
        data (the "expensive" closure the mined FDs approximate).
    :return: an FDTree holding all discovered dependencies.
    """
    U = set(ctx.M)
    fdt = FDTree(U)
    # Closure of the empty set: if it is non-empty we already have the FD
    # {} -> closure({}).
    A = closure(set([]))
    if bool(A):
        fdt.add_fd(set([]), A)
    i = max(ctx.M)  # lectic counter: last attribute toggled
    while len(A) < len(ctx.M):
        # next-closure step: find the lectically next set closed under the
        # implications discovered so far (fdt.l_close).
        for j in reversed(ctx.M):
            if j in A:
                A.remove(j)
            else:
                B = fdt.l_close(A.union([j]))
                # accept B only if it adds no attribute smaller than j
                # (the canonicity test of next-closure)
                if not bool(B - A) or j <= min(B - A):
                    A = B
                    i = j
                    break
        AII = closure(A)
        # print sorted(A), sorted(AII)
        if len(A) < len(AII):
            # A is not closed in the data: record A -> AII \ A
            fdt.add_fd(A, AII - A)
        if not bool(AII - A) or i <= min(AII - A):
            # closure is canonical: jump directly to it
            A = AII
            i = max(ctx.M)
        else:
            # closure violates canonicity: truncate A to attributes <= i
            A = A.intersection(set([i for i in range(i + 1)]))
    return fdt
L.append((ant, set([x]))) L.sort(key=lambda (a, c): len(a)) # right-saturate pc = PreClosure(L, len(U)) classes = {} classes_inv = {} for ri, (ant, con) in enumerate(L): ant_ask = frozenset(pc.l_close(ant)) classes[ri] = ant_ask classes_inv.setdefault(ant_ask, []).append(ri) # print classes_inv # left-saturate real_L = {closed: [] for closed in classes_inv.keys()} new_L = FDTree(U) for ri, (ant, con) in enumerate(L): Hant = classes_inv[classes[ri]] #Hant = range(100) ant_circ = pc.l_close_ask(ant, Hant) # print '' # print '::', '({}=>{})'.format(ant, con),ant_circ, classes[ri], [L[i] for i in Hant] # print '::', '({}=>{})'.format(ant_circ, classes[ri])#, [L[i] for i in Hant] if ant_circ != classes[ri]: if not any( previous.issubset(ant_circ) for previous in real_L[classes[ri]]): for x in range(len(real_L[classes[ri]]) - 1, -1, -1): if ant_circ.issubset(real_L[classes[ri]][x]):
def attribute_exploration_pps(tuples):
    """Mine functional dependencies from a relation via attribute exploration.

    Python-3 variant instrumented with a `Stats` object.  Builds PLIs
    (position list indexes) per attribute, reorders attributes
    lexicographically by PLI, recodes the tuples, then runs a next-closure
    style exploration where closures are first estimated through a growing
    "representation" context (g_prime / m_prime) and only verified against
    the data (`check`) when the estimate differs from the candidate.

    :param tuples: list of equal-length rows (the relation instance).
    Side effects: prints progress and final statistics to stdout.
    """
    U = range(len(tuples[0]))  # Attributes
    n_atts = len(U)
    m_prime = [set([]) for i in range(len(U))]
    g_prime = []
    stats = Stats()
    fctx = FormalContext(g_prime, m_prime)
    # Sentinel column so m_prime[-1] (used when m_i == -1) is always empty.
    m_prime.append(set([]))  # THIS SHOULD BE AFTER DECLARING THE FORMAL CONTEXT
    print("Processing data... ", end='')
    sys.stdout.flush()
    # Column-major view of the relation: one value list per attribute.
    representations = [[row[j] for row in tuples] for j in U]
    print("done")
    # ATTRIBUTE ORDERING
    print("Building representations... ", end='')
    sys.stdout.flush()
    plis = [(build_pli(r), ri) for ri, r in enumerate(representations)]
    print("done")
    print("Ordering... ", end='')
    sys.stdout.flush()
    # ATTRIBUTE ORDERING
    # ex_order = [290, 17, 7, 7, 489, 14, 10, 31, 509, 6, 341, 151, 16, 28, 49, 4, 1, 19, 571, 810, 6, 8, 17]
    # plis.sort(key=lambda k: ex_order[k[1]], reverse=False)  # Lexicographic
    plis.sort(key=lambda k: k[0], reverse=False)  # Lexicographic
    order = {j[1]: i for i, j in enumerate(plis)}  # Original order -> new order
    inv_order = {i: j for j, i in order.items()}  # At position i should be attribute j
    # print(order)
    # print(inv_order)
    # exit()
    # reco_order = { }
    plis = [i[0] for i in plis]
    # build_pli(representations[ inv_order[i] ]) for i in range(n_atts) ]
    print("done")
    print("Reconverting... ", end='')
    # Recode every row: cell value becomes the index of its PLI cluster,
    # under the new attribute order.
    tuples = [[None] * n_atts for i in range(len(tuples))]
    # print(plis)
    # not_none = [0 for i in range(len(tuples))]
    for att in range(n_atts):
        att = inv_order[att]
        for i, cluster in enumerate(plis[att]):
            for row in cluster:
                tuples[row][att] = i
    print("done")
    # print(records)
    # print(tuples)
    # for ti, t in enumerate(tuples):
    #     tuples[ti] = [t[inv_order[i]] if any(ti in part for part in plis[i]) else None for i in range(len(t))]
    #     tuples[ti] = [t[inv_order[i]] for i in range(len(t))]
    #     print(tuples[ti], ti, )
    # print (tuples)
    # tuples=records
    # records[]
    # print(plis)
    # # END ORDERING
    # VARIABLES FOR FAST STACKED NEXT CLOSURE
    Mjs = [set() for i in range(n_atts)]
    stack = [[None, m_prime[-1]], [None, set([]), Mjs]]
    # INITIALIZATION VARIABLES
    X = set([])
    fdt = FDTree(U)
    m_i = -1  # WE START WITH THE EMPTY INTENT REPRESENTED BY THIS
    # COUNTERS TO KEEP SOME PERFORMANCE STATISTICS
    cycles = 0
    cycles2 = 0
    avoided_closures = 0
    ncls = 0
    # NOTE(review): X is a set and U is a range, so `X != U` is always True in
    # Python 3; termination presumably relies on something else (e.g.
    # fast_next_closure) — confirm, or compare against set(U).
    while X != U:
        cycles += 1
        if cycles % 1000 == 0:
            print("\rFDs:{}/{}/{}/{}/{} - {: <100}".format(
                fdt.n_fds, cycles, cycles2, len(g_prime),
                round((sum([len(mp) for mp in m_prime])) / len(m_prime)),
                ','.join(map(str, sorted(X)))), end='')  # stack
            sys.stdout.flush()
        # Objects of the sampling context that have all attributes of X ∪ {m_i}.
        XJ = stack[-2][1].intersection(m_prime[m_i])
        if bool(XJ):
            # XJJ = reduce(set.intersection, (g_prime[g] for g in XJ))
            XJJ = set.intersection(*[g_prime[g] for g in XJ])
            # if len(XJ) == 1:
            #     XJJ = set(XJJ)
        else:
            XJJ = set(U)
        # AT THIS POINT WE HAVE XJJ WHICH IS OUR ESTIMATION OF THE CLOSURE
        # USING THE REPRESENTATION CONTEXT CALCULATED SO FAR
        # THE ACTUAL CLOSURE SHOULD BE XSS, HOWEVER IF
        # X = XJJ WE KNOW THAT XSS = XJJ AND WE CAN AVOID ITS
        # CALCULATION
        # XSS = None
        n_x = len(X)
        avoided_closures += n_x == len(XJJ)
        if n_x < len(XJJ):  # CHECKS WHETHER X==XJJ
            cycles2 += 1
            cache = []
            # `check` validates the estimate against the data, pruning XJJ in
            # place and collecting violating row-pairs in `cache`.
            check(X, XJJ, tuples, n_atts, cache, plis, stats)
            if n_x < len(XJJ):
                # Estimate survived: X -> XJJ is a real FD.
                fdt.add_fd(X, XJJ)
                # break
            else:
                # Estimate refuted: add the strongest counterexample row-pair
                # as a new object of the sampling context.
                cache.sort(key=len)
                gp = cache.pop()
                n_gp = len(g_prime)
                XJ.add(n_gp)
                for i in stack[1:]:
                    i[1].add(n_gp)
                for x in gp:
                    m_prime[x].add(n_gp)
                g_prime.append(gp)
                # XJJ.intersection_update(gp)
        new_atts = XJJ - X
        if not bool(new_atts) or m_i <= min(new_atts):
            # Closure is canonical: jump to it and reset the counter.
            m_i = U[-1]
            X = XJJ
        else:
            # print(stack)
            stack[-2][2][m_i] = XJJ
            # print('\t',m_i, XJJ)
            X.difference_update([m for m in X if m > m_i])
            stack[-1][1] = XJ
            X, m_i = fast_next_closure(X, U, fdt.l_close, m_i, stats, stack)
            # ncls += c
            stack[-1][0] = m_i
    # print ('--')
    # for g in g_prime:
    #     print (g)
    L = list(fdt.read_fds())
    print("\nNUMBER OF FDS:{}".format(len(L)))
    print("SAMPLING CONTEXT SIZE:{}".format(len(g_prime)))
    print("CYCLES:", cycles)
    print("DB CHECKS:", cycles2)
    print("GOOD CLOSURES:", avoided_closures)
    print("Closures:", stats.closures)
    print("Failures:", stats.failures)
    print("Row check:", stats.row_check)
    print("Conflicting Attributes:",
          [stats.conflicting_attributes[order[i]] for i in range(n_atts)])
    print("Non Conflicting Attributes:",
          [stats.non_conflicting[order[i]] for i in range(n_atts)])
    # print("EFF:", [abs(stats.conflicting_attributes[order[i]]-stats.non_conflicting[order[i]]) for i in range(n_atts)])
    print(order)
def attribute_exploration_pps(tuples):
    """Mine functional dependencies from a relation (simple py3 variant).

    Builds a PLI per attribute, reorders attributes lexicographically by PLI,
    recodes the tuples, then enumerates candidates in lectic order with
    `fast_next_closure`, validating each candidate closure directly against
    the data with `check` (no sampling context in this variant).

    :param tuples: list of equal-length rows (the relation instance).
    Side effects: prints progress and final statistics to stdout.
    """
    U = range(len(tuples[0]))  # Attributes
    n_atts = len(U)  # Number of attributes
    # rand_tuples = list(range(len(tuples)))
    # rand_tuples.sort(key=lambda i: len(set(tuples[i])))
    print("Processing data... ", end='')
    sys.stdout.flush()
    # Column-major view of the relation.
    representations = [[row[j] for row in tuples] for j in U]
    print("done")
    plis = [(build_pli(r), ri) for ri, r in enumerate(representations)]
    stats = Stats()
    # ORDERING
    print("Ordering... ", end='')
    sys.stdout.flush()
    # ATTRIBUTE ORDERING
    plis.sort(key=lambda k: k[0], reverse=False)  # Lexicographic
    order = {j[1]: i for i, j in enumerate(plis)}  # Original order -> new order
    inv_order = {i: j for j, i in order.items()}  # At position i should be attribute j
    plis = [i[0] for i in plis]
    print("done")
    print("Reconverting... ", end='')
    # Recode every row: a cell becomes the index of its PLI cluster,
    # under the new attribute order.
    tuples = [[None] * n_atts for i in range(len(tuples))]
    # print(plis)
    # not_none = [0 for i in range(len(tuples))]
    for att in range(n_atts):
        att = inv_order[att]
        for i, cluster in enumerate(plis[att]):
            for row in cluster:
                tuples[row][att] = i
    print("done")
    # # END ORDERING
    Mjs = [set() for i in range(n_atts)]  # Needed by fast version of next_closure
    stack = [[None, None], [None, set([]), Mjs]]  # Stack for next_closure
    X = set([])
    fdt = FDTree(U)
    m_i = -1
    cycles = 0
    cycles2 = 0
    XJ = set([])
    ncls = 0
    sU = set(U)
    # BUGFIX: the loop used to test `X != U`, comparing a set against a range
    # object — always True in Python 3, so the condition could never end the
    # loop.  `sU` was computed above precisely for this comparison: stop once
    # the candidate has grown to the full attribute set.
    while X != sU:
        # Feedback Output
        cycles += 1
        if cycles % 1 == 0:
            print("\rFDs:{}/{}".format(fdt.n_fds, cycles),
                  ','.join(map(str, sorted(X))), end='')  # stack
            sys.stdout.flush()
        # Stack re-use
        cache = []
        # Start from the full attribute set and let `check` prune XSS down to
        # the actual closure of X in the data.
        XSS = set(U)
        check(X, XSS, tuples, n_atts, cache, plis, stats)
        if len(X) != len(XSS):
            # X is not closed: record the FD X -> XSS.
            fdt.add_fd(X, XSS)
            stack[-1][-1][m_i] = XSS
        if not bool(XSS - X) or m_i <= min(XSS - X):
            # Closure is canonical w.r.t. m_i: jump to it, reset the counter.
            m_i = U[-1]
            X = XSS
        else:
            # Canonicity violated: truncate X and advance lectically.
            X.difference_update([m for m in X if m > m_i])
            stack[-1][1] = XJ
            X, m_i = fast_next_closure(X, U, fdt.l_close, m_i, stats, stack)
            stack[-1][0] = m_i
    L = list(fdt.read_fds())
    print("\nN_FDS:{}".format(len(L)))
    print("CYCLES:", cycles)
    print("Closures:", ncls)
    print(fdt.recursions)
def attribute_exploration_pps(tuples):
    """Mine functional dependencies from a relation (Python-2, partition variant).

    Orders attributes by number of distinct values, recodes the tuples, then
    explores candidates in lectic order.  Closures are estimated through a
    growing sampling context (g_prime / m_prime) and verified with stripped
    partitions (`Partition`) cached on the next-closure stack; refuted
    estimates are repaired by sampling counterexample row-pairs into the
    context.

    :param tuples: list of equal-length rows (the relation instance).
    Side effects: prints progress and final statistics to stdout.
    """
    U = range(len(tuples[0]))  # Attributes
    m_prime = [set([]) for i in range(len(U))]
    g_prime = []
    # How often each row took part in a sampled counterexample.
    dist = {t: 0 for t in range(len(tuples))}
    # non_fds_cache = BooleanTree()
    fctx = FormalContext(g_prime, m_prime)
    sampled_tuples = []
    representations = [[row[j] for row in tuples] for j in U]
    # ORDERING
    order = [(len(set(r)), ri) for ri, r in enumerate(representations)]
    order.sort(key=lambda k: k[0], reverse=False)
    print order
    order = {j[1]: i for i, j in enumerate(order)}  # Original order -> new order
    inv_order = {i: j for j, i in order.items()}
    for ti, t in enumerate(tuples):
        tuples[ti] = [t[inv_order[i]] for i in range(len(t))]
    # END ORDERING
    representations = [[row[j] for row in tuples] for j in U]
    partitions = map(Partition.from_lst, representations)
    # partition_signatures[j][t] = index of the cluster of row t in
    # attribute j's partition.
    partition_signatures = []
    for partition in partitions:
        T = {}
        for ki, k in enumerate(partition):
            for t in k:
                T[t] = ki
        partition_signatures.append(T)
    stack = [[None, None, None], [None, set([]), Partition.top()]]
    X = set([])
    # L = []
    # pc = PreClosure(L, len(U))
    fdt = FDTree(U)
    m_i = None
    # m_top = frozenset(range(len(m_prime)))
    cycles = 0
    cycles2 = 0
    XJ = set([])
    XJJ = fctx.closed_set(X)
    count_good_points = 0
    USet = set(U)
    # NOTE(review): `X != U` compares a set to a list in Python 2 and is
    # always True; `USet` above looks like the intended comparand — confirm.
    while X != U:
        cycles += 1
        # if cycles%100==0:
        print "\rFDs:{}/{}/{}/{} - {: <100}".format(
            fdt.n_fds, cycles, cycles2, len(g_prime),
            ','.join(map(str, sorted(X)))),  # stack
        sys.stdout.flush()
        if m_i is not None:
            XJ = stack[-2][1].intersection(m_prime[m_i])
        # Estimate the closure XJJ from the sampling context, choosing the
        # cheaper of two derivations depending on |XJ|.
        if bool(XJ) and len(XJ) < len(U) - len(X):
            SXJ = sorted(XJ, key=lambda g: len(g_prime[g]))
            XJJ = copy.copy(g_prime[SXJ[0]])
            for g in SXJ[1:]:
                # print '\n\t', XJJ,'::', X, m_i
                XJJ.intersection_update(g_prime[g])
                if len(XJJ) == len(X):
                    # print 'x'
                    break
        elif bool(XJ) and len(XJ) >= len(USet) - len(X):
            XJJ = X.union([m for m in USet - X if XJ.issubset(m_prime[m])])
            # XJJ = fctx.derive_extent(XJ)
        else:
            XJJ = set(range(len(m_prime)))
        # print '\t=>', X, m_i, '::', XJ, XJJ
        cache = {}
        XSS = None
        XS = None
        # X_match = [i in X for i in U]
        count_good_points += len(X) == len(XJJ)
        # if len(XJJ) == len(X):
        #     print m_i
        #     print XJJ, X, XJ
        #     exit()
        # Verify the estimate; shrink XJJ with counterexamples until it
        # matches the real closure XSS.
        while X != XJJ:
            cycles2 += 1
            # print '.',
            sys.stdout.flush()
            if XSS is None:
                # Compute the partition XS of X, reusing (or rebuilding)
                # the partitions cached on the next-closure stack.
                if stack[-2][2] is not None:
                    XS = Partition.intersection(stack[-2][2],
                                                partition_signatures[m_i])
                else:
                    si = len(stack)
                    # Deepest stack frame that still has a cached partition.
                    for si in range(len(stack) - 2, 0, -1):
                        if stack[si][2] is not None:
                            break
                    # Rebuild the missing partitions upwards from it.
                    for i in range(si + 1, len(stack) - 1):
                        stack[i][2] = Partition.intersection(
                            stack[i - 1][2],
                            partition_signatures[stack[i][0]])  # partitions[stack[i][0]])
                    if m_i is not None:
                        XS = Partition.intersection(stack[-2][2],
                                                    partition_signatures[m_i])
                    else:
                        XS = square(X, partitions)
                # True closure: attributes of XJJ whose partition refines XS.
                XSS = X.union([
                    m for m in sorted(XJJ - X, reverse=True)
                    if all(m in atts for atts in cache.values())
                    and Partition.leq(XS, partition_signatures[m], cache, dist)
                ])
                cache = sorted(cache.items(),
                               key=lambda ((t1, t2), atts): len(atts))
                # print cache
            # print '.',
            sys.stdout.flush()
            if XJJ == XSS:
                # Estimate confirmed: record the FD.
                # L.append((set(X), set(XJJ)))
                fdt.add_fd(X, XJJ)
                break
            else:
                # Estimate refuted: sample the strongest counterexample
                # row-pair as a new object of the context.
                # for sample in cache:
                #     non_fds_cache.append([i in sample[1] for i in U])
                sampled_tuple, gp = cache.pop()
                for t in sampled_tuple:
                    dist[t] += 1
                sampled_tuples.append(sampled_tuple)
                # for sampled_tuple, gp in cache:
                XJ.add(len(g_prime))
                for i in stack[1:]:
                    i[1].add(len(g_prime))
                for x in gp:
                    m_prime[x].add(len(g_prime))
                g_prime.append(gp)
                XJJ.intersection_update(gp)
        if not bool(XJJ - X) or m_i <= min(XJJ - X):
            # Canonical closure: jump to it.
            m_i = U[-1]
            X = XJJ
        else:
            # Advance lectically.
            X.difference_update([m for m in X if m > m_i])
            stack[-1][1] = XJ
            stack[-1][2] = XS
            # X, m_i = next_closure(X, U, pc.l_close, m_i, stack)
            X, m_i = next_closure(X, U, fdt.l_close, m_i, stack)
            stack[-1][0] = m_i
    L = list(fdt.read_fds())
    print "\nN_FDS:{}".format(len(L))
    print "SAMPLING CONTEXT SIZE:{}".format(len(g_prime))
    print "CYCLES:", cycles
    print "GOOD CLOSURES:", count_good_points
def mine_fds(U, tuples, partitions, fctx, rand_tuples):
    """Mine functional dependencies with a sampled representation context (py2).

    Candidates are enumerated in lectic order with `next_closure`; the
    closure of each candidate X is first estimated from the sampling context
    held in `fctx` (g_prime / m_prime) and then verified against the data by
    `check`, which also proposes counterexample row-pairs that are added to
    the context when the estimate is refuted.

    :param U: ordered attribute set.
    :param tuples: the (recoded) relation rows, read by `check`.
    :param partitions: attribute partitions (passed through to callees).
    :param fctx: FormalContext; its g_prime / m_prime are grown in place.
    :param rand_tuples: row-sampling order used by `check`.
    :return: an FDTree holding all discovered dependencies.
    """
    n_atts = len(U)
    stack = [[None, set([])], [None, set([])]]
    fdt = FDTree(U)  # FD store
    # sampled_tuples = []
    m_i = None  # First X \oplus m_i is none
    # Counters
    cycles = 0
    cycles2 = 0
    X = set([])  # First candidate
    XJ = set([])  # First derivation
    XJJ = fctx.closed_set(X)  # First closure
    count_good_points = 0
    USet = set(U)
    # NOTE(review): `X != U` compares a set to a list (always True in py2);
    # `USet` above looks like the intended comparand — confirm.
    while X != U:
        cycles += 1
        # print "\rFDs:{}/{}/{}/{} - {: <100}".format(fdt.n_fds, cycles, cycles2, len(fctx.g_prime), ','.join(map(str, sorted(X)))),#stack
        sys.stdout.flush()
        # Estimate the closure XJJ of X from the sampling context: intersect
        # the attribute sets of all sampled objects containing X ∪ {m_i}.
        if bool(XJ):
            SXJ = sorted(XJ, key=lambda g: len(fctx.g_prime[g]))
            XJJ = copy.copy(fctx.g_prime[SXJ[0]])
            for g in SXJ[1:]:
                XJJ.intersection_update(fctx.g_prime[g])
                if len(XJJ) == len(X):
                    # Estimate cannot shrink below X: stop early.
                    break
        else:
            XJJ = set(range(len(fctx.m_prime)))
        cache = {}
        XSS = None
        count_good_points += len(X) == len(XJJ)
        # Verify the estimate against the data, shrinking XJJ with sampled
        # counterexamples until it equals the real closure XSS.
        while X != XJJ:
            cycles2 += 1
            sys.stdout.flush()
            if XSS is None:
                XSS = check(m_i, X, XJJ, tuples, n_atts, cache, rand_tuples)
                # Order counterexamples by strength (size of agreeing atts).
                cache = sorted(cache.items(),
                               key=lambda ((t1, t2), atts): len(atts))
            if XJJ == XSS:
                # Estimate confirmed: record the FD X -> XJJ.
                fdt.add_fd(X, XJJ)
                break
            else:
                # Refuted: add the strongest counterexample row-pair as a new
                # object of the sampling context.
                sampled_tuple, gp = cache.pop()
                XJ.add(len(fctx.g_prime))
                for i in stack[1:]:
                    i[1].add(len(fctx.g_prime))
                for x in gp:
                    fctx.m_prime[x].add(len(fctx.g_prime))
                fctx.g_prime.append(gp)
                XJJ.intersection_update(gp)
        if not bool(XJJ - X) or m_i <= min(XJJ - X):
            # Canonical closure (note: on the first pass m_i is None and
            # None <= int holds in Python 2): jump to it.
            m_i = U[-1]
            X = XJJ
        else:
            # Advance lectically.
            X.difference_update([m for m in X if m > m_i])
            stack[-1][1] = XJ
            X, m_i = next_closure(X, U, fdt.l_close, m_i, stack)
            stack[-1][0] = m_i
        # Derivation for the next candidate X ∪ {m_i}.
        # NOTE(review): reconstructed at loop-body level (mirrors the
        # per-iteration recompute in the sibling variant) — confirm indent.
        XJ = stack[-2][1].intersection(fctx.m_prime[m_i])
    return fdt
def attribute_exploration_pps(tuples):
    """Mine functional dependencies, pruning the sampling context with AddIntent.

    Python-3 variant: like the other sampling-context versions, but each new
    sampled object is also inserted into an `AddIntentAlgorithm` lattice, and
    objects that are no longer join-irreducible (`non_jip_objects`) are
    removed from m_prime to keep the context small.

    :param tuples: list of equal-length rows (the relation instance).
    Side effects: prints progress and final statistics to stdout.
    """
    alg = AddIntentAlgorithm()
    U = range(len(tuples[0]))  # Attributes
    n_atts = len(U)
    m_prime = [set([]) for i in range(len(U))]
    g_prime = []
    rand_tuples = list(range(len(tuples)))
    # random.shuffle(rand_tuples)
    rand_tuples.sort(key=lambda i: len(set(tuples[i])))
    fctx = FormalContext(g_prime, m_prime)
    sampled_tuples = []
    representations = [[row[j] for row in tuples] for j in U]
    # ATTRIBUTE ORDERING: by number of distinct values per attribute.
    order = [(len(set(r)), ri) for ri, r in enumerate(representations)]
    order.sort(key=lambda k: k[0], reverse=False)
    # print (order)
    order = {j[1]: i for i, j in enumerate(order)}  # Original order -> new order
    inv_order = {i: j for j, i in order.items()}
    for ti, t in enumerate(tuples):
        tuples[ti] = [t[inv_order[i]] for i in range(len(t))]
    # # END ORDERING
    Mjs = [set() for i in range(n_atts)]
    stack = [[None, None], [None, set([]), Mjs]]
    X = set([])
    fdt = FDTree(U)
    m_i = -1
    cycles = 0
    cycles2 = 0
    XJ = set([])
    XJJ = fctx.closed_set(X)
    avoided_closures = 0
    ncls = 0
    g_sub = []
    # NOTE(review): `X != U` compares a set to a range — always True in
    # Python 3; compare against set(U)?  Confirm how the loop terminates.
    while X != U:
        cycles += 1
        if cycles % 1000 == 0:
            print("\rFDs:{}/{}/{}/{}/{}/{}/{} - {: <100}".format(
                fdt.n_fds, cycles, cycles2, len(g_prime),
                len(alg.jip_objects), len(alg.elements),
                (sum([len(mp) for mp in m_prime])) / len(m_prime),
                ','.join(map(str, sorted(X)))), end='')  # stack
            sys.stdout.flush()
        if m_i >= 0:
            XJ = stack[-2][1].intersection(m_prime[m_i])
        # Estimate the closure from the sampling context.
        # NOTE(review): bare `reduce` requires functools.reduce in Python 3 —
        # verify the file's imports.
        if bool(XJ):
            XJJ = reduce(set.intersection, (g_prime[g] for g in XJ))
            if len(XJ) == 1:
                XJJ = set(XJJ)
        else:
            XJJ = set(range(len(m_prime)))
        # cache = {}
        XSS = None
        n_x = len(X)
        avoided_closures += n_x == len(XJJ)
        # Verify the estimate, shrinking XJJ with counterexamples.
        while n_x < len(XJJ):
            cycles2 += 1
            # sys.stdout.flush()
            if XSS is None:
                cache = []
                XSS = check(X, XJJ, tuples, n_atts, cache, rand_tuples)
                cache.sort(key=len)
                # cache = sorted(cache.items(), key=lambda k: len(k[1]))
                # sys.stdout.flush()
            if len(XJJ) == len(XSS):
                # Estimate confirmed: record the FD.
                fdt.add_fd(X, XJJ)
                break
            else:
                # Refuted: add the strongest counterexample as a new object.
                gp = cache.pop()
                n_gp = len(g_prime)
                XJ.add(n_gp)
                for i in stack[1:]:
                    i[1].add(n_gp)
                for x in gp:
                    m_prime[x].add(n_gp)
                # print ('\t', gp)
                g_prime.append(gp)
                # Maintain the concept lattice and drop objects that are no
                # longer join-irreducible from the context columns.
                nid = alg.add_intent_iteration(gp)
                alg.add_object(n_gp, nid)
                # alg.objects[nid].append(n_gp)
                # print(len(list(alg.get_jips())), '/', n_gp+1)
                rem = set(alg.non_jip_objects())
                for m in range(n_atts):
                    m_prime[m].difference_update(rem)
                # print()
                # print (alg.inv_lat[nid])
                XJJ.intersection_update(gp)
        if not bool(XJJ - X) or m_i <= min(XJJ - X):
            # Canonical closure: jump to it.
            m_i = U[-1]
            X = XJJ
        else:
            # Advance lectically.
            X.difference_update([m for m in X if m > m_i])
            stack[-1][1] = XJ
            X, m_i, c = fast_next_closure(X, U, fdt.l_close, m_i, stack)
            ncls += c
            stack[-1][0] = m_i
    # print ('--')
    # for g in g_prime:
    #     print (g)
    L = list(fdt.read_fds())
    print("\nN_FDS:{}".format(len(L)))
    print("SAMPLING CONTEXT SIZE:{}".format(len(g_prime)))
    print("CYCLES:", cycles)
    print("GOOD CLOSURES:", avoided_closures)
    print("Closures:", ncls)
    print(fdt.recursions)
    # print(alg.elements)
    jip = alg.jip_objects
    print(jip)
    # print ([alg.objects[i] for i in jip])
    print("JIP", len(jip))
def attribute_exploration_pps(tuples):
    """Mine functional dependencies (py3 variant with stack invalidation flags).

    Like the other sampling-context versions, but each stack frame carries a
    trailing boolean that is cleared whenever a new FD is recorded — a
    validity flag for the cached data on that frame.

    :param tuples: list of equal-length rows (the relation instance).
    Side effects: prints per-cycle progress and final statistics to stdout.
    """
    U = range(len(tuples[0]))  # Attributes
    n_atts = len(U)
    m_prime = [set([]) for i in range(len(U))]
    g_prime = []
    rand_tuples = list(range(len(tuples)))
    # random.shuffle(rand_tuples)
    rand_tuples.sort(key=lambda i: len(set(tuples[i])))
    fctx = FormalContext(g_prime, m_prime)
    # Sentinel column so m_prime[-1] (used when m_i == -1) is always empty.
    m_prime.append(set([]))  # THIS SHOULD BE AFTER DECLARING THE FORMAL CONTEXT
    representations = [[row[j] for row in tuples] for j in U]
    # ATTRIBUTE ORDERING: by number of distinct values per attribute.
    order = [(len(set(r)), ri) for ri, r in enumerate(representations)]
    order.sort(key=lambda k: k[0], reverse=False)
    # print (order)
    order = {j[1]: i for i, j in enumerate(order)}  # Original order -> new order
    inv_order = {i: j for j, i in order.items()}
    for ti, t in enumerate(tuples):
        tuples[ti] = [t[inv_order[i]] for i in range(len(t))]
    # # END ORDERING
    # VARIABLES FOR FAST STACKED NEXT CLOSURE
    Mjs = [set() for i in range(n_atts)]
    stack = [[None, m_prime[-1]], [None, set([]), Mjs, True]]
    # INITIALIZATION VARIABLES
    X = set([])
    fdt = FDTree(U)
    m_i = -1  # WE START WITH THE EMPTY INTENT REPRESENTED BY THIS
    # COUNTERS TO KEEP SOME PERFORMANCE STATISTICS
    cycles = 0
    cycles2 = 0
    avoided_closures = 0
    ncls = 0
    # NOTE(review): `X != U` compares a set to a range — always True in
    # Python 3; compare against set(U)?  Confirm how the loop terminates.
    while X != U:
        cycles += 1
        # if cycles%1000 == 0:
        print("{}||FDs:{}/{}/{}/{}/{} - {: <100}".format(
            stack[-2][-1], fdt.n_fds, cycles, cycles2, len(g_prime),
            (sum([len(mp) for mp in m_prime])) / len(m_prime),
            ','.join(map(str, sorted(X)))), end='\n')  # stack
        sys.stdout.flush()
        # Objects of the sampling context containing X ∪ {m_i}.
        XJ = stack[-2][1].intersection(m_prime[m_i])
        if bool(XJ):
            # XJJ = reduce(set.intersection, (g_prime[g] for g in XJ))
            XJJ = set.intersection(*[g_prime[g] for g in XJ])
            if len(XJ) == 1:
                XJJ = set(XJJ)
        else:
            XJJ = set(U)
        # AT THIS POINT WE HAVE XJJ WHICH IS OUR ESTIMATION OF THE CLOSURE
        # USING THE REPRESENTATION CONTEXT CALCULATED SO FAR
        # THE ACTUAL CLOSURE SHOULD BE XSS, HOWEVER IF
        # X = XJJ WE KNOW THAT XSS = XJJ AND WE CAN AVOID ITS
        # CALCULATION
        XSS = None
        n_x = len(X)
        avoided_closures += n_x == len(XJJ)
        while n_x < len(XJJ):  # CHECKS WHETHER X==XJJ
            cycles2 += 1
            if XSS is None:
                cache = []
                XSS = check(X, XJJ, tuples, n_atts, cache, rand_tuples)
                cache.sort(key=len)
            if len(XJJ) == len(XSS):
                # Estimate confirmed: record the FD and invalidate the cached
                # stack frames (trailing flag set to False).
                fdt.add_fd(X, XJJ)
                print('\t', X, XJJ - X)
                for i in stack[1:]:
                    i[-1] = False
                # print(stack)
                break
            else:
                # Refuted: add the strongest counterexample as a new object.
                gp = cache.pop()
                n_gp = len(g_prime)
                XJ.add(n_gp)
                for i in stack[1:]:
                    i[1].add(n_gp)
                for x in gp:
                    m_prime[x].add(n_gp)
                g_prime.append(gp)
                XJJ.intersection_update(gp)
        new_atts = XJJ - X
        if not bool(new_atts) or m_i <= min(new_atts):
            # Canonical closure: jump to it and reset the counter.
            m_i = U[-1]
            X = XJJ
        else:
            # print(stack)
            stack[-2][2][m_i] = XJJ
            X.difference_update([m for m in X if m > m_i])
            stack[-1][1] = XJ
            X, m_i, c = fast_next_closure(X, U, fdt.l_close, m_i, stack)
            # Fresh frame starts valid.
            stack[-1].append(True)
            ncls += c
            stack[-1][0] = m_i
    # print ('--')
    # for g in g_prime:
    #     print (g)
    L = list(fdt.read_fds())
    print("\nNUMBER OF FDS:{}".format(len(L)))
    print("SAMPLING CONTEXT SIZE:{}".format(len(g_prime)))
    print("CYCLES:", cycles)
    print("GOOD CLOSURES:", avoided_closures)
    print("Closures:", ncls)
    print(fdt.recursions)