def supergraphs_in_eq(g, g2, rate=1):
    '''Find all supergraphs of g that are also in the same equivalence
    class with respect to g2 and the rate.
    Currently works only for bfu.undersample by 1
    '''
    if bfu.undersample(g, rate) != g2:
        raise ValueError('g is not in equivalence class of g2')
    s = set()

    def addnodes(g, g2, edges):
        if edges:
            masks = []
            for e in edges:
                if ok2addanedge(e[0], e[1], g, g2, rate=rate):
                    masks.append(True)
                else:
                    masks.append(False)
            nedges = [edges[i] for i in range(len(edges)) if masks[i]]
            n = len(nedges)
            if n:
                for i in range(n):
                    mask = addanedge(g, nedges[i])
                    s.add(g2num(g))
                    addnodes(g, g2, nedges[:i] + nedges[i + 1:])
                    delanedge(g, nedges[i], mask)

    edges = gk.edgelist(gk.complement(g))
    addnodes(g, g2, edges)
    return s
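# Usage sketch (hedged): with the gk/bfu helpers this module already
# relies on, g2 is typically produced by undersampling a known g, which
# guarantees the precondition above, e.g.
#   g = gk.ringmore(5, 2)        # 5-node ring plus 2 extra edges
#   g2 = bfu.undersample(g, 1)
#   supers = supergraphs_in_eq(g, g2)
# Each element of `supers` is the g2num() integer encoding of a
# supergraph of g that undersamples to the same g2.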
def estOE(d):
    gt = d['gt']['graph']
    gt = bfu.undersample(gt, 1)
    e = gk.OCE(d['estimate'], gt)
    N = np.double(len(gk.edgelist(gt))) +\
        np.double(len(gk.bedgelist(gt)))
    return (e['directed'][0] + e['bidirected'][0]) / N
def estCOE(d):
    gt = d['gt']['graph']
    gt = bfu.undersample(gt, 1)
    e = gk.OCE(d['estimate'], gt)
    n = len(gt)
    # total possible edges: n**2 directed plus n*(n-1)/2 bidirected
    # (the same count used by udensity below)
    N = np.double(n ** 2 + n * (n - 1) / 2.0
                  - len(gk.edgelist(gt))
                  - len(gk.bedgelist(gt)))
    return (e['directed'][1] + e['bidirected'][1]) / N
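# Note on the two error rates above (a hedged reading of gk.OCE): the
# returned 'directed' and 'bidirected' entries appear to hold
# (omission, commission) counts, so estOE normalizes the omission
# counts (index 0) by the number of edges present in the undersampled
# ground truth, while estCOE normalizes the commission counts (index 1)
# by the number of absent edges.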
def g22g1(g2, capsize=None):
    '''
    computes all g1 that are in the equivalence class for g2
    '''
    if bfu.isSclique(g2):
        print 'Superclique - any SCC with GCD = 1 fits'
        return set([-1])

    single_cache = {}

    @memo  # memoize the search
    def nodesearch(g, g2, edges, s):
        if edges:
            if bfu.increment(g) == g2:
                s.add(g2num(g))
                if capsize and len(s) > capsize:
                    raise ValueError('Too many elements')
                return g
            e = edges[0]
            for n in g2:
                if (n, e) in single_cache:
                    continue
                if not edge_increment_ok(e[0], n, e[1], g, g2):
                    continue
                mask = add2edges(g, e, n)
                r = nodesearch(g, g2, edges[1:], s)
                del2edges(g, e, n, mask)
        elif bfu.increment(g) == g2:
            s.add(g2num(g))
            if capsize and len(s) > capsize:
                raise ValueError('Too many elements in eqclass')
            return g

    # find all directed g1's not conflicting with g2
    n = len(g2)
    edges = gk.edgelist(g2)
    random.shuffle(edges)
    g = cloneempty(g2)

    for e in edges:
        for n in g2:
            mask = add2edges(g, e, n)
            if not gk.isedgesubset(bfu.increment(g), g2):
                single_cache[(n, e)] = False
            del2edges(g, e, n, mask)

    s = set()
    try:
        nodesearch(g, g2, edges, s)
    except ValueError:
        s.add(0)
    return s
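# Usage sketch (hedged): g22g1 searches at undersampling rate 1, so g2
# should itself be the increment of some g1, e.g.
#   g1 = gk.ringmore(4, 1)
#   eqc = g22g1(bfu.increment(g1), capsize=1000)
# A result of set([-1]) is the superclique shortcut above; a result
# containing 0 means capsize was exceeded and the search was cut short.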
def checker(n, ee):
    g = gk.ringmore(n, ee)
    g2 = bfu.increment(g)
    d = checkable(g2)
    t = [len(d[x]) for x in d]
    r = []
    n = len(g2)
    ee = len(gk.edgelist(g2))
    for i in range(1, len(t)):
        r.append(sum(np.log10(t[:i])) - ee * np.log10(n))
    return r
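# Each entry of r is a log10-scale ratio: sum(np.log10(t[:i])) is the
# log of the product of per-edge candidate-set sizes (the constrained
# search space), while ee * np.log10(n) is the log of n**ee, the
# brute-force number of ways to pick one of n candidate nodes for each
# of the ee edges of g2.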
def checkerDS(n, ee):
    g = gk.ringmore(n, ee)
    g2 = bfu.increment(g)
    gg = checkable(g2)
    d, p, idx = conformanceDS(g2, gg, gg.keys())
    t = [len(x) for x in p]
    r = []
    n = len(g2)
    ee = len(gk.edgelist(g2))
    for i in range(1, len(t)):
        r.append(sum(np.log10(t[:i])) - ee * np.log10(n))
    return r
def makediscrete(graph, data, numvalues, ss):
    n = len(data)
    data = DiscretizeDataQuantiles(data, numvalues)
    ddata = MakeDataDictForBNPackage(data)
    result = getBNparams(graph, ddata, n)
    r = GenProbTable(result)
    initialVdata, bnV_data = alterinputsforBNtoDynBN(result, r, n, numvalues)
    trueEdges = gk.edgelist(graph)
    vertices = map(str, range(1, n + 1))
    d = CreateDynDiscBN(vertices, trueEdges, initialVdata, bnV_data)
    data = sampleBN(d, ss)
    return data
def edge_backtrack2g1_directed(g2, capsize=None):
    '''
    computes all g1 that are in the equivalence class for g2
    '''
    if bfu.isSclique(g2):
        print 'Superclique - any SCC with GCD = 1 fits'
        return set([-1])

    single_cache = {}

    def edgeset(g):
        return set(gk.edgelist(g))

    @memo  # memoize the search
    def nodesearch(g, g2, edges, s):
        if edges:
            e = edges.pop()
            ln = [n for n in g2]
            for n in ln:
                if (n, e) in single_cache:
                    continue
                mask = add2edges(g, e, n)
                if gk.isedgesubset(bfu.increment(g), g2):
                    r = nodesearch(g, g2, edges, s)
                    if r and edgeset(bfu.increment(r)) == edgeset(g2):
                        s.add(g2num(r))
                        if capsize and len(s) > capsize:
                            raise ValueError('Too many elements in eqclass')
                del2edges(g, e, n, mask)
            edges.append(e)
        else:
            return g

    # find all directed g1's not conflicting with g2
    n = len(g2)
    edges = gk.edgelist(g2)
    random.shuffle(edges)
    g = cloneempty(g2)

    for e in edges:
        for n in g2:
            mask = add2edges(g, e, n)
            if not gk.isedgesubset(bfu.increment(g), g2):
                single_cache[(n, e)] = False
            del2edges(g, e, n, mask)

    s = set()
    try:
        nodesearch(g, g2, edges, s)
    except ValueError:
        s.add(0)
    return s
def getBNparams(graph, ddata, n):
    # Gets Disc. BN parameters given a graph skeleton
    # skeleton should include t-1 and t nodes for each variable
    nodes = range(1, (n * 2) + 1)
    nodes = map(str, nodes)
    edges = gk.edgelist(graph)
    for i in range(len(edges)):
        edges[i] = list([edges[i][0], str(n + int(edges[i][1]))])
    skel = GraphSkeleton()
    skel.V = nodes
    skel.E = edges
    learner = PGMLearner()
    result = learner.discrete_mle_estimateparams(skel, ddata)
    return result
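# Illustration (hedged): for n = 3 the skeleton nodes are '1'..'6', with
# '1'..'3' as the t-1 slice and '4'..'6' as the t slice; an edge
# ('2', '1') of `graph` is unrolled to ['2', '4'], i.e. variable 2 at
# t-1 pointing at variable 1 at t.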
def BackwardDirected():
    # operates on `g` (graph dict) and `fr` (adjacency matrix), which are
    # presumably defined in an enclosing or module scope
    edges = gk.edgelist(g)
    CandidateSet = []
    for e in edges:
        score = Comparescore('backward', g, int(e[0]), int(e[1]))
        if score < 0:
            CandidateSet.append((score, (int(e[0]), int(e[1]))))
    while CandidateSet:
        CandidateSet.sort(reverse=True)
        edgetoDel = CandidateSet.pop()[1]
        fr[edgetoDel[1]][edgetoDel[0]] = 0
        g[str(edgetoDel[0])][str(edgetoDel[1])] = set()
        reeval_node = edgetoDel[1]
        newCandidateSet = []
        # iterate over a copy, since CandidateSet is mutated in the loop
        for a, e in CandidateSet[:]:
            if e[1] == reeval_node:
                CandidateSet.remove((a, e))
                score = Comparescore('backward', g, e[0], e[1])
                if score < 0:
                    newCandidateSet.append((score, (e[0], e[1])))
        CandidateSet = newCandidateSet + CandidateSet
    return g, fr
def vedgelist(g, pathtoo=False):
    """ Return a list of tuples for edges of g and forks
    a superugly organically grown function that badly needs refactoring
    """
    l = []
    el = gk.edgelist(g)
    bl = gk.bedgelist(g)

    if pathtoo:
        l.extend(make_longpaths(g, el))
    l2, r = make_allforks_and_rest(g, el, bl, dofullforks=True)
    l.extend(l2)

    A, singles = makechains(r)

    if singles:
        B, singles = makesinks(singles)
    else:
        B, singles = [], []

    l = longpaths_pick(l) + threedges_pick(l) + A + B + singles
    return l
def eqsearch(g2, rate=1):
    '''Find all g that are also in the equivalence class with
    respect to g2 and the rate.
    '''
    s = set()
    noop = set()

    @memo1
    def addnodes(g, g2, edges):
        if edges:
            masks = []
            for e in edges:
                if ok2addanedge_(e[0], e[1], g, g2, rate=rate):
                    masks.append(True)
                else:
                    masks.append(False)
            nedges = [edges[i] for i in range(len(edges)) if masks[i]]
            n = len(nedges)
            if n:
                for i in range(n):
                    mask = addanedge(g, nedges[i])
                    if bfu.undersample(g, rate) == g2:
                        s.add(g2num(g))
                    addnodes(g, g2, nedges[:i] + nedges[i + 1:])
                    delanedge(g, nedges[i], mask)
                return s
            else:
                return noop
        else:
            return noop

    g = cloneempty(g2)
    edges = gk.edgelist(gk.complement(g))
    addnodes(g, g2, edges)
    return s
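# Usage sketch (hedged): unlike supergraphs_in_eq, eqsearch grows
# candidates edge by edge from an empty graph, so only g2 is needed:
#   eqc = eqsearch(bfu.undersample(gk.ringmore(4, 1), 1))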
def density(g):
    return len(gk.edgelist(g)) / np.double(len(g) ** 2)
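# Density over all len(g) ** 2 ordered node pairs (self-loops included):
# a 5-node graph with 10 directed edges has density 10 / 25. = 0.4.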
def backtrack_more(g2, rate=1, capsize=None):
    '''
    computes all g1 that are in the equivalence class for g2
    '''
    if bfu.isSclique(g2):
        print 'Superclique - any SCC with GCD = 1 fits'
        return set([-1])

    single_cache = {}
    if rate == 1:
        ln = [n for n in g2]
    else:
        ln = []
        for x in itertools.combinations_with_replacement(g2.keys(), rate):
            ln.extend(itertools.permutations(x, rate))
        ln = set(ln)

    @memo  # memoize the search
    def nodesearch(g, g2, edges, s):
        if edges:
            if bfu.undersample(g, rate) == g2:
                s.add(g2num(g))
                if capsize and len(s) > capsize:
                    raise ValueError('Too many elements')
                return g
            e = edges[0]
            for n in ln:
                if (n, e) in single_cache:
                    continue
                if not ok2addaVpath(e, n, g, g2, rate=rate):
                    continue
                mask = addaVpath(g, e, n)
                r = nodesearch(g, g2, edges[1:], s)
                delaVpath(g, e, n, mask)
        elif bfu.undersample(g, rate) == g2:
            s.add(g2num(g))
            if capsize and len(s) > capsize:
                raise ValueError('Too many elements in eqclass')
            return g

    # find all directed g1's not conflicting with g2
    n = len(g2)
    edges = gk.edgelist(g2)
    random.shuffle(edges)
    g = cloneempty(g2)

    for e in edges:
        for n in ln:
            mask = addaVpath(g, e, n)
            if not gk.isedgesubset(bfu.undersample(g, rate), g2):
                single_cache[(n, e)] = False
            delaVpath(g, e, n, mask)

    s = set()
    try:
        nodesearch(g, g2, edges, s)
    except ValueError:
        s.add(0)
    return s
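# Usage sketch (hedged): for rate > 1 each edge of g2 is explained by a
# length-`rate` node sequence (a V-path) rather than a single node, e.g.
#   g1 = gk.ringmore(4, 1)
#   eqc = backtrack_more(bfu.undersample(g1, 2), rate=2, capsize=1000)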
def backtrack_more2(g2, rate=2, capsize=None):
    '''
    computes all g1 that are in the equivalence class for g2
    '''
    if bfu.isSclique(g2):
        print 'Superclique - any SCC with GCD = 1 fits'
        return set([-1])

    f = [(addaVpath, delaVpath, maskaVpath)]
    c = [ok2addaVpath]

    def predictive_check(g, g2, pool, checks_ok, key):
        s = set()
        for u in pool:
            if not checks_ok(key, u, g, g2, rate=rate):
                continue
            s.add(u)
        return s

    @memo2  # memoize the search
    def nodesearch(g, g2, order, inlist, s, cds, pool, pc):
        if order:
            if bfu.undersample(g, rate) == g2:
                s.add(g2num(g))
                if capsize and len(s) > capsize:
                    raise ValueError('Too many elements')
                s.update(supergraphs_in_eq(g, g2, rate=rate))
                return g

            key = order[0]
            if pc:
                tocheck = [x for x in pc
                           if x in cds[len(inlist) - 1][inlist[0]]]
            else:
                tocheck = cds[len(inlist) - 1][inlist[0]]

            if len(order) > 1:
                kk = order[1]
                pc = predictive_check(g, g2, pool[len(inlist)],
                                      c[edge_function_idx(kk)], kk)
            else:
                pc = set()

            adder, remover, masker = f[edge_function_idx(key)]
            checks_ok = c[edge_function_idx(key)]

            for n in tocheck:
                if not checks_ok(key, n, g, g2, rate=rate):
                    continue
                masked = np.prod(masker(g, key, n))
                if masked:
                    nodesearch(g, g2, order[1:], [n] + inlist, s, cds,
                               pool, pc)
                else:
                    mask = adder(g, key, n)
                    nodesearch(g, g2, order[1:], [n] + inlist, s, cds,
                               pool, pc)
                    remover(g, key, n, mask)

        elif bfu.undersample(g, rate) == g2:
            s.add(g2num(g))
            if capsize and len(s) > capsize:
                raise ValueError('Too many elements')
        return g

    # find all directed g1's not conflicting with g2
    startTime = int(round(time.time() * 1000))

    ln = [x for x in itertools.permutations(g2.keys(), rate)] + \
         [(n, n) for n in g2]
    gg = {x: ln for x in gk.edgelist(g2)}
    keys = gg.keys()
    cds, order, idx = conformanceDS(g2, gg, gg.keys(), f=f, c=c)
    endTime = int(round(time.time() * 1000))
    print "precomputed in {:10} seconds".format(round((endTime - startTime) / 1000., 3))
    if 0 in [len(x) for x in order]:
        return set()

    g = cloneempty(g2)
    s = set()
    try:
        nodesearch(g, g2, [keys[i] for i in idx], ['0'], s, cds, order,
                   set())
    except ValueError, e:
        print e
        s.add(0)
    return s
def dpc(data, varst, pval=0.1):
    n = data.shape[0]
    # if n<200:
    #     pval=.05
    # if n <1000:
    #     pval=.1
    # elif n<2000:
    #     pval=.1

    # stack the data: first n rows is t-1 slice, the next n are slice t
    data = np.asarray(np.r_[data[:, :-1], data[:, 1:]])

    def cindependent(y, x, counter, parents=[], pval=pval):
        for S in [j for j in combinations(parents, counter)]:
            print S
            if ChiSquaredTest(x, y, condset=list(S)):
                return True
        return False

    def bindependent(y, x, parents=[], pval=pval):
        print "done"
        return ChiSquaredTest(x, y, condset=parents, shift=n)

    def dir_prune(elist, mask, g):
        for e in mask:
            sett = copy.deepcopy(g[e[0]][e[1]])
            sett.remove((0, 1))
            g[e[0]][e[1]] = sett
            elist.remove(e)

    def bi_prune(mask, g):
        for e in mask:
            sett = copy.deepcopy(g[e[0]][e[1]])
            sett.remove((2, 0))
            g[e[0]][e[1]] = sett
            g[e[1]][e[0]] = sett

    def chisq_of_df_cols(df, c1, c2):
        groupsizes = df.groupby([c1, c2]).size()
        ctsum = groupsizes.unstack(c1)
        # fillna(0) is necessary to remove any NAs which will cause exceptions
        return (chi2_contingency(ctsum.fillna(0)))

    def ChiSquaredTest(x, y, condset, shift=0):
        if condset:
            X = data[[shift + int(x) - 1] + [n + int(y) - 1] + condset, :].T
            df = makeDF(X)
            condnum = df.shape[1] - 2
            for i in range(condnum):
                if i == 0:
                    v = pd.Series.unique(df[i + 2])
                else:
                    v = np.vstack([v, pd.Series.unique(df[i + 2])])
            if condnum == 1:
                condvalues = [v]
            else:
                condvalues = list(itertools.product(*v))
            chis = 0
            dofs = 0
            for i in condvalues:
                count = 1
                for j in range(condnum):
                    if count == 1:
                        newdf = df[df[j + 2] == i[j]]
                        count = 2
                    else:
                        newdf = newdf[newdf[j + 2] == i[j]]
                try:
                    chi2, p, dof, ex = chisq_of_df_cols(newdf, 0, 1)
                except:
                    chi2 = 0
                    dof = (varst - 1) ** 2
                chis += chi2
                dofs += dof
            val = chisqprob(chis, dofs)
        else:
            X = data[[shift + int(x) - 1] + [n + int(y) - 1], :].T
            df = makeDF(X)
            chi2, p, dof, ex = chisq_of_df_cols(df, 0, 1)
            val = chisqprob(chi2, dof)
        return val > pval

    def stringify(array):
        d = []
        for i in array:
            d.append((str(i[0]), str(i[1])))
        return d

    num_g = gk.superclique(n)
    el = gk.edgelist(num_g)
    el = stringify(el)
    print(el)
    num_gtr = gk.gtranspose(num_g)
    gtr = conv.ian2g(num_gtr)
    g = conv.ian2g(num_g)
    for counter in range(n):
        to_remove = []
        for e in el:
            ppp = [int(k) - 1 for k in gtr[e[1]] if k != e[0]]
            if counter <= len(ppp):
                if cindependent(e[1], e[0], counter, parents=ppp, pval=pval):
                    to_remove.append(e)
                    gtr[e[1]].pop(e[0], None)
        dir_prune(el, to_remove, g)
    print(g)
    bel = [map(lambda k: str(k + 1), x) for x in combinations(range(n), 2)]
    bi_list = []
    for e in bel:
        ppp = list(set(gtr[e[0]].keys()) | set(gtr[e[1]].keys()))
        ppp = map(lambda x: int(x) - 1, ppp)
        if bindependent(e[0], e[1], parents=ppp, pval=pval):
            bi_list.append(e)
    bi_prune(bi_list, g)
    g = conv.dict_format_converter(g)
    gk.clean_leaf_nodes(g)
    return g
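# Input sketch (hedged): `data` is an n-by-T array with one row per
# variable and one column per time step; the np.r_ stacking above turns
# it into a 2n-row array where row i is variable i in the t-1 slice and
# row n + i is variable i in the t slice, and all independence tests in
# dpc index into those two slices.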
def udensity(g):
    return (len(gk.edgelist(g)) + len(gk.bedgelist(g)) / 2.) /\
        np.double(len(g) ** 2 + len(g) * (len(g) - 1) / 2.)
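# gk.bedgelist appears to list each bidirected edge in both directions,
# hence the division by 2; the denominator counts all len(g) ** 2
# possible directed edges plus len(g) * (len(g) - 1) / 2 possible
# bidirected pairs.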
def edgeset(g):
    return set(gk.edgelist(g))
def checkcedge(c, g2):
    """ Nodes to check to merge the virtual nodes of c ( a->b->c )
    """
    l = gk.edgelist(g2)
    return list(set(l))