def timer(inputfile, trials, datalength):
    """Benchmark discrete MLE parameter estimation on sampled data.

    Loads a ``NodeData`` and a ``GraphSkeleton`` from *inputfile*, builds a
    ``DiscreteBayesianNetwork`` from them, and then, *trials* times, draws
    *datalength* random samples from the network and times one call to
    ``PGMLearner.discrete_mle_estimateparams`` on those samples. Drawing the
    samples is not part of the timed region. The ``Vdata`` learned in the
    final trial is printed as JSON.

    Arguments:
        1. *inputfile* -- Path handed to ``NodeData.load`` and
           ``GraphSkeleton.load``.
        2. *trials* -- Number of timed repetitions; must be at least 1.
        3. *datalength* -- Number of samples drawn for each repetition.

    Returns the mean clock reading difference per trial (seconds).

    Raises ``ValueError`` if *trials* is smaller than 1, instead of failing
    later with a division by zero.
    """
    if trials < 1:
        raise ValueError("trials must be at least 1, got %r" % (trials,))

    # time.clock() was removed in Python 3.8. Keep using it wherever it still
    # exists so timings stay comparable with earlier runs, and fall back to
    # time.perf_counter() on interpreters that no longer provide it.
    clock = getattr(time, "clock", None) or time.perf_counter

    # load nodedata and graphskeleton
    nd = NodeData()
    skel = GraphSkeleton()
    nd.load(inputfile)
    skel.load(inputfile)

    # topologically order graphskeleton
    skel.toporder()

    # load bayesian network
    bn = DiscreteBayesianNetwork(skel, nd)

    # instantiate pgm learner
    learner = PGMLearner()

    # drop the local reference to the node data; it is not used below
    del nd

    totaltime = 0.0
    for _ in range(trials):
        data = bn.randomsample(datalength)
        start = clock()
        ret = learner.discrete_mle_estimateparams(skel, data)
        totaltime += clock() - start
    totaltime /= trials

    # single-argument parenthesised form prints identically on Python 2 and 3
    print(json.dumps(ret.Vdata, indent=1))
    return totaltime
def timer(inputfile, trials, datalength=100):
    """Benchmark random sampling from a discrete Bayesian network.

    Loads a ``NodeData`` and a ``GraphSkeleton`` from *inputfile*, builds a
    ``DiscreteBayesianNetwork`` from them, and times *trials* calls to
    ``randomsample``.

    Arguments:
        1. *inputfile* -- Path handed to ``NodeData.load`` and
           ``GraphSkeleton.load``.
        2. *trials* -- Number of timed repetitions; must be at least 1.
        3. *datalength* -- (Optional, default is 100) Number of samples
           requested from ``randomsample`` in each repetition. The default
           matches the value that used to be hard-coded.

    Returns the mean clock reading difference per trial (seconds).

    Raises ``ValueError`` if *trials* is smaller than 1, instead of failing
    later with a division by zero.
    """
    if trials < 1:
        raise ValueError("trials must be at least 1, got %r" % (trials,))

    # time.clock() was removed in Python 3.8. Keep using it wherever it still
    # exists so timings stay comparable with earlier runs, and fall back to
    # time.perf_counter() on interpreters that no longer provide it.
    clock = getattr(time, "clock", None) or time.perf_counter

    # load nodedata and graphskeleton
    nd = NodeData()
    skel = GraphSkeleton()
    nd.load(inputfile)
    skel.load(inputfile)

    # topologically order graphskeleton
    skel.toporder()

    # load bayesian network
    bn = DiscreteBayesianNetwork(skel, nd)

    totaltime = 0.0
    for _ in range(trials):
        start = clock()
        bn.randomsample(datalength)  # result discarded; only the cost matters
        totaltime += clock() - start
    return totaltime / trials
def discrete_constraint_estimatestruct(self, data, pvalparam=0.05, indegree=1):
    '''
    Learn a Bayesian network structure from discrete data given by *data*,
    using constraint-based approaches.

    This function first calculates all the independencies and conditional
    independencies present between variables in the data. To calculate
    dependencies, it uses the *discrete_condind* method on each pair of
    variables, conditioned on other sets of variables of size *indegree* or
    smaller, to generate a chi-squared result and a p-value. If this p-value
    is greater than *pvalparam*, the pair of variables is considered
    independent given that variable set; otherwise the pair is considered
    dependent conditioned on the variable set. Once all true dependencies --
    pairs of variables that are dependent no matter what they are conditioned
    by -- are found, the algorithm uses these dependencies to construct a
    directed acyclic graph. It returns this DAG in the form of a
    :doc:`GraphSkeleton <graphskeleton>` class.

    Arguments:
        1. *data* -- An array of dicts containing samples from the network in {vertex: value} format. Example::

                [
                    {
                        'Grade': 'B',
                        'SAT': 'lowscore',
                        ...
                    },
                    ...
                ]

        2. *pvalparam* -- (Optional, default is 0.05) The p-value below which to consider something significantly unlikely. A common number used is 0.05. This is passed to *discrete_condind* when it is called.
        3. *indegree* -- (Optional, default is 1) The upper bound on the size of a witness set (see Koller et al. 85). If this is larger than 1, a huge amount of samples in *data* are required to avoid a divide-by-zero error.

    Returns a ``GraphSkeleton`` whose ``V`` holds the variable names taken
    from the first sample, whose ``E`` holds the learned directed edges, and
    on which ``toporder()`` has been called.

    Raises ``AssertionError`` if *data* is not a non-empty list whose first
    element is a dict.

    Usage example: this would learn structure from a set of 8000 discrete samples::

        import json

        from libpgm.nodedata import NodeData
        from libpgm.graphskeleton import GraphSkeleton
        from libpgm.discretebayesiannetwork import DiscreteBayesianNetwork
        from libpgm.pgmlearner import PGMLearner

        # generate some data to use
        nd = NodeData()
        nd.load("../tests/unittestdict.txt")    # an input file
        skel = GraphSkeleton()
        skel.load("../tests/unittestdict.txt")
        skel.toporder()
        bn = DiscreteBayesianNetwork(skel, nd)
        data = bn.randomsample(8000)

        # instantiate my learner
        learner = PGMLearner()

        # estimate structure
        result = learner.discrete_constraint_estimatestruct(data)

        # output
        print(json.dumps(result.E, indent=2))

    '''
    assert (isinstance(data, list) and data and isinstance(data[0], dict)), "Arg must be a list of dicts."

    # list() so the names can be indexed and reused on Python 3, where
    # dict.keys() is a view rather than a list
    variables = list(data[0].keys())

    # every unordered pair of variables starts out as a potential dependency
    dependencies = [list(pair) for pair in itertools.combinations(variables, 2)]

    # candidate witness sets: all subsets of the variables with at most
    # *indegree* members, smallest first. They do not depend on the pair
    # being tested, so they are built once instead of once per pair.
    candidate_sets = [
        subset
        for size in range(indegree + 1)
        for subset in itertools.combinations(variables, size)
    ]

    witnesses = []

    # for each pair of variables X, Y:
    for X, Y in itertools.combinations(variables, 2):

        # consider all witness sets that contain neither X nor Y
        for subset in candidate_sets:
            if X in subset or Y in subset:
                continue

            # determine conditional independence; a fresh list is passed on
            # every call so the callee never shares state between pairs
            _, pv, witness = self.discrete_condind(data, X, Y, list(subset))
            if pv > pvalparam:
                # X and Y were found independent given this witness set:
                # drop the candidate dependency (in whichever orientation it
                # was stored) and remember the witness
                for pair in ([X, Y], [Y, X]):
                    if pair in dependencies:
                        dependencies.remove(pair)
                witnesses.append([X, Y, witness])
                break

    # now that we have found our dependencies, run build PDAG (cf. Koller p. 89)
    # with the stored set of independencies:

    # assemble undirected graph skeleton
    pdag = GraphSkeleton()
    pdag.E = dependencies
    pdag.V = variables

    # working edge list: [a, b, 'u'] is an undirected edge a - b,
    # [a, b] is a directed edge a -> b
    dedges = [edge + ['u'] for edge in pdag.E]

    def undirected_edge_between(one_end, the_other_end):
        # return the undirected edge joining the two vertices, or None
        for edge in dedges:
            if len(edge) == 3 and (
                (edge[0] == one_end and edge[1] == the_other_end)
                or (edge[1] == one_end and edge[0] == the_other_end)
            ):
                return edge
        return None

    def exists_undirected_edge(one_end, the_other_end):
        return undirected_edge_between(one_end, the_other_end) is not None

    def exists_edge(one_end, the_other_end):
        # true for an undirected edge or a directed edge in either direction
        return (
            exists_undirected_edge(one_end, the_other_end)
            or [one_end, the_other_end] in dedges
            or [the_other_end, one_end] in dedges
        )

    # adjust for immoralities (cf. Koller 86): whenever two edges share a
    # vertex ("center"), their other ends a and b are not adjacent, and
    # [center] was not recorded as the witness separating a and b, orient
    # both edges into the center.
    # dedges is edited while it is being walked; the membership test below
    # skips edges that an earlier step already replaced.
    for edge1 in reversed(dedges):
        for edge2 in reversed(dedges):
            if edge1 not in dedges or edge2 not in dedges:
                continue

            # the first matching configuration wins, as in an if/elif chain
            if edge1[0] == edge2[1] and not exists_edge(edge1[1], edge2[0]):
                center, a, b = edge1[0], edge1[1], edge2[0]
            elif edge1[1] == edge2[0] and not exists_edge(edge1[0], edge2[1]):
                center, a, b = edge1[1], edge1[0], edge2[1]
            elif (edge1[1] == edge2[1] and edge1[0] != edge2[0]
                  and not exists_edge(edge1[0], edge2[0])):
                center, a, b = edge1[1], edge1[0], edge2[0]
            elif (edge1[0] == edge2[0] and edge1[1] != edge2[1]
                  and not exists_edge(edge1[1], edge2[1])):
                center, a, b = edge1[0], edge1[1], edge2[1]
            else:
                continue

            if ([a, b, [center]] in witnesses) or ([b, a, [center]] in witnesses):
                continue

            dedges.append([a, center])
            dedges.append([b, center])
            dedges.remove(edge1)
            dedges.remove(edge2)

    # use right hand rules to improve graph until convergence (Koller 89).
    # Every rule is anchored on a directed edge tail -> head and only ever
    # swaps one undirected edge for its directed version, so the list keeps
    # its length while it is being walked and the loop ends once no
    # undirected edge can be oriented any more.
    olddedges = []
    while olddedges != dedges:
        olddedges = [edge[:] for edge in dedges]
        for edge1 in reversed(dedges):
            if len(edge1) != 2:
                continue
            tail, head = edge1

            for edge2 in reversed(dedges):

                if len(edge2) == 3:
                    # rule 1: tail -> head, head - z, tail and z not
                    # adjacent  =>  head -> z. The undirected edge may be
                    # stored with head at either position.
                    if head == edge2[0]:
                        far_end = edge2[1]
                    elif head == edge2[1]:
                        far_end = edge2[0]
                    else:
                        continue
                    if not exists_edge(tail, far_end):
                        dedges.append([head, far_end])
                        dedges.remove(edge2)
                    continue

                # rule 2: tail -> head -> z and tail - z  =>  tail -> z
                if head == edge2[0]:
                    bridge = undirected_edge_between(tail, edge2[1])
                    if bridge is not None:
                        dedges.append([tail, edge2[1]])
                        dedges.remove(bridge)

                # rule 3: y1 -> head <- y2 (y1 != y2) and some v with
                # v - y1, v - y2, v - head  =>  v -> head
                # NOTE(review): Meek's formulation also requires y1 and y2
                # to be non-adjacent; the original code did not test that,
                # so it is not tested here either -- confirm intent.
                if head == edge2[1] and tail != edge2[0]:
                    for v in variables:
                        link = undirected_edge_between(v, head)
                        if (link is not None
                                and exists_undirected_edge(v, tail)
                                and exists_undirected_edge(v, edge2[0])):
                            dedges.append([v, head])
                            dedges.remove(link)

    # return one possible graph skeleton from the pdag class found: any edge
    # still undirected is read as pointing from its first to its second vertex
    pdag.E = [edge[:2] for edge in dedges]
    pdag.toporder()
    return pdag