Beispiel #1
0
def build_bayesnet(graph, data):
    """
    Build a BayesNet from an undirected graph and a dataset.

    The graph is first passed through ``direct_edges`` and the result
    is turned into an edge dictionary with ``generate_edge_dict``; the
    value dictionary is produced from *data* by ``generate_value_dict``.
    Both dictionaries are printed before the network is constructed.

    Arguments
    ---------
    *graph* : the structure handed to ``direct_edges``

    *data* : the dataset handed to ``generate_value_dict``

    Returns
    -------
    *bn* : a BayesNet object built from the two dictionaries
    """
    edges = generate_edge_dict(direct_edges(graph))
    print(edges)

    values = generate_value_dict(data)
    print(values)

    return BayesNet(edges, values)
Beispiel #2
0
def chow_liu(data,edges_only=False):
	"""
	Perform Chow-Liu structure learning algorithm
	over an entire dataset, and return the BN-tree.

	The tree is a maximum-weight spanning tree over the complete
	graph of the columns of *data*, where the weight of the edge
	(i, j) is the value returned by ``mi_test`` on columns i and j
	(called with ``chi2_test=False``). The tree is grown with Prim's
	algorithm, starting from the first endpoint of the heaviest edge.
	Every tree edge is directed from the vertex that was already in
	the tree to the vertex that was newly attached.

	Arguments
	---------
	*data* : a nested numpy array
		The data from which we will learn. It should be
		the entire dataset.

	*edges_only* : a boolean
		If True, return the raw structure instead of
		a BayesNet object.

	Returns
	-------
	*bn* : a BayesNet object
		The structure-learned BN. When *edges_only* is True the
		tuple ``(mst, value_dict)`` is returned instead, where *mst*
		maps every column index to the list of its children and
		*value_dict* maps every column index to the list of its
		unique values.

	Effects
	-------
	None

	Notes
	-----
	A single pass over the sorted edge list is NOT sufficient: an
	edge whose two endpoints are both outside the tree when it is
	visited can become the heaviest edge crossing the cut later on.
	The sorted edge list is therefore rescanned from the top each
	time a vertex is attached.

	"""
	n_rv = data.shape[1]

	value_dict = dict(zip(range(n_rv),
		[list(np.unique(col)) for col in data.T]))

	# one weighted, undirected edge per unordered pair of columns
	edge_list = [(i,j,mi_test(data[:,(i,j)],chi2_test=False))
					for i in range(n_rv) for j in range(i+1,n_rv)]

	edge_list.sort(key=operator.itemgetter(2), reverse=True) # heaviest first
	mst = dict((rv, []) for rv in range(n_rv))

	# with fewer than two columns there is no edge and nothing to connect
	if edge_list:
		vertex_cache = {edge_list[0][0]} # start with first vertex..

		while len(vertex_cache) < n_rv:
			for i,j,w in edge_list:
				i_in = i in vertex_cache
				j_in = j in vertex_cache
				# the list is sorted, so the first edge with exactly one
				# endpoint in the tree is the heaviest crossing (safe) edge
				if i_in != j_in:
					parent, child = (i,j) if i_in else (j,i)
					mst[parent].append(child)
					vertex_cache.add(child)
					break
			else:
				# no crossing edge left; cannot happen on a complete
				# graph, but guarantees termination
				break

	if edges_only == True:
		return mst, value_dict

	bn=BayesNet(mst,value_dict)
	return bn
Beispiel #3
0
def naive_bayes(data, target, estimator='mle'):
	"""
	Build and fit a Naive Bayes network from data.

	In a Naive Bayes network the *target* variable is the only
	parent of every other variable, i.e. all features are taken
	to be conditionally independent of one another given the
	target. The structure is therefore fixed by *target* alone;
	this function creates it and then also learns the parameters.

	Arguments
	---------
	*data* : a nested numpy array

	*target* : an integer
		The target variable column in *data*

	*estimator* : a string
		'bayes' fits the parameters with ``bayes_estimator``;
		any other value fits them with ``mle_estimator``.

	Returns
	-------
	*bn* : a BayesNet object,
		with the structure instantiated and the chosen
		estimator applied to it.

	Effects
	-------
	None

	Notes
	-----

	"""
	# column index -> list of the unique values seen in that column
	value_dict = {
		idx: list(np.unique(col))
		for idx, col in zip(range(data.shape[1]), data.T)
	}

	# the target points at every other variable; the others are leaves
	children = [rv for rv in value_dict if rv != target]
	edge_dict = {target: children}
	for rv in children:
		edge_dict[rv] = []

	bn = BayesNet(edge_dict, value_dict)

	fit = bayes_estimator if estimator == 'bayes' else mle_estimator
	fit(bn, data)

	return bn
Beispiel #4
0
def chow_liu(data, edges_only=False):
    """
    Perform Chow-Liu structure learning algorithm
    over an entire dataset, and return the BN-tree.

    The tree is a maximum-weight spanning tree over the complete
    graph of the columns of *data*, where the weight of the edge
    (i, j) is the value returned by ``mi_test`` on columns i and j
    (called with ``chi2_test=False``). The tree is grown with Prim's
    algorithm, starting from the first endpoint of the heaviest edge.
    Every tree edge is directed from the vertex that was already in
    the tree to the vertex that was newly attached.

    Arguments
    ---------
    *data* : a nested numpy array
        The data from which we will learn. It should be
        the entire dataset.

    *edges_only* : a boolean
        If True, return the raw structure instead of
        a BayesNet object.

    Returns
    -------
    *bn* : a BayesNet object
        The structure-learned BN. When *edges_only* is True the
        tuple ``(mst, value_dict)`` is returned instead, where *mst*
        maps every column index to the list of its children and
        *value_dict* maps every column index to the list of its
        unique values.

    Effects
    -------
    None

    Notes
    -----
    A single pass over the sorted edge list is NOT sufficient: an
    edge whose two endpoints are both outside the tree when it is
    visited can become the heaviest edge crossing the cut later on.
    The sorted edge list is therefore rescanned from the top each
    time a vertex is attached.

    """
    n_rv = data.shape[1]

    value_dict = dict(
        zip(range(n_rv), [list(np.unique(col)) for col in data.T]))

    # one weighted, undirected edge per unordered pair of columns
    edge_list = [(i, j, mi_test(data[:, (i, j)], chi2_test=False))
                 for i in range(n_rv) for j in range(i + 1, n_rv)]

    edge_list.sort(key=operator.itemgetter(2), reverse=True)  # heaviest first
    mst = dict((rv, []) for rv in range(n_rv))

    # with fewer than two columns there is no edge and nothing to connect
    if edge_list:
        vertex_cache = {edge_list[0][0]}  # start with first vertex..

        while len(vertex_cache) < n_rv:
            for i, j, w in edge_list:
                i_in = i in vertex_cache
                j_in = j in vertex_cache
                # the list is sorted, so the first edge with exactly one
                # endpoint in the tree is the heaviest crossing (safe) edge
                if i_in != j_in:
                    parent, child = (i, j) if i_in else (j, i)
                    mst[parent].append(child)
                    vertex_cache.add(child)
                    break
            else:
                # no crossing edge left; cannot happen on a complete
                # graph, but guarantees termination
                break

    if edges_only == True:
        return mst, value_dict

    bn = BayesNet(mst, value_dict)
    return bn
Beispiel #5
0
def read_mat(path, delim=' '):
    """
    Read an adjacency matrix into a BayesNet object.

    NOTE: This is for reading the structure only, and
    therefore no parameters for the BayesNet object will
    be set - they must be learned by calling "mle_estimator"
    or "bayes_estimator" on the object.

    Arguments
    ---------
    *path* : a string
        The file path

    *delim* : a string
        The separator between the fields of one line.

    Returns
    -------
    *bn* : a BayesNet object
        Built from a dict that maps the first field of every
        non-blank line to an empty list of children.
    """
    # NOTE(review): only the first field of each line (the vertex name) is
    # used; the remaining fields are not parsed into edges, so every vertex
    # ends up with an empty child list.
    _E = {}
    with open(path, 'r') as f:
        for line in f:
            # drop the line terminator so it cannot leak into the vertex
            # name when the line holds a single field
            line = line.rstrip('\r\n')
            if not line.strip():
                continue  # blank line -> no vertex
            rv = line.split(delim)[0]
            _E[rv] = []

    bn = BayesNet(_E)

    return bn
Beispiel #6
0
def bridge(c_bn, f_bn, data):
	"""
	Make a Multi-Dimensional Bayesian Network by
	bridging two Bayesian network structures. This happens by
	placing edges from c_bn -> f_bn using a heuristic
	optimization procedure (``hc_rr`` restricted to the
	c_bn -> f_bn edges).

	This can be used to create a Multi-Dimensional Bayesian
	Network classifier from two already-learned Bayesian networks -
	one of which is a BN containing all the class variables, the other
	containing all the feature variables.

	Arguments
	---------
	*c_bn* : a BayesNet object with known structure

	*f_bn* : a BayesNet object with known structure.

	*data* : the dataset passed on to ``hc_rr``

	Returns
	-------
	*m_bn* : a merged/bridge BayesNet object,
		whose structure contains *c_bn*, *f_bn*, and some bridge
		edges between them.
	"""
	restrict = []
	for u in c_bn:
		for v in f_bn:
			restrict.append((u,v)) # only allow edges from c_bn -> f_bn

	bridge_bn = hc_rr(data, restriction=restrict)

	# hc_rr may hand back either a BayesNet or a bare children-dict;
	# accept both
	bridge_edges = bridge_bn.E if hasattr(bridge_bn, 'E') else bridge_bn

	# Merge the child lists of the three structures. A plain dict.update
	# would replace the bridge edges of every class node with the node's
	# c_bn children and thereby drop the bridge. A fresh dict is used so
	# none of the input structures is mutated.
	m_bn = {}
	for edges in (bridge_edges, c_bn.E, f_bn.E):
		for parent, children in edges.items():
			merged_children = m_bn.setdefault(parent, [])
			for child in children:
				if child not in merged_children:
					merged_children.append(child)

	mbc_bn = BayesNet(E=m_bn)
	return mbc_bn
Beispiel #7
0
def gs(data, alpha=0.05, feature_selection=None, debug=False):
	"""
	Perform growshink algorithm over dataset to learn
	Bayesian network structure.

	This algorithm is clearly a good candidate for
	numba JIT compilation...

	STEPS
	-----
	1. Compute Markov Blanket
	2. Compute Graph Structure
	3. Orient Edges
	4. Remove Cycles
	5. Reverse Edges
	6. Propagate Directions

	Arguments
	---------
	*data* : a nested numpy array
		Data from which you wish to learn structure

	*alpha* : a float
		Type I error rate for independence test

	*feature_selection* : None or a single column index
		If given, only the Markov blanket of that one
		variable is computed and returned. Must not be a list.

	*debug* : a boolean
		Whether to print the intermediate results.

	Returns
	-------
	*bn* : a BayesNet object
		When *feature_selection* is given, the Markov blanket
		of that variable (a list of column indices) is
		returned instead.

	Effects
	-------
	None

	Notes
	-----

	Speed Test:
		*** 5 variables, 624 observations ***
		- 63.7 ms

	"""
	n_rv = data.shape[1]
	data, value_dict = replace_strings(data, return_values=True)

	if feature_selection is None:
		_T = range(n_rv)
	else:
		assert (not isinstance(feature_selection, list)), 'feature_selection must be only one value'
		_T = [feature_selection]

	# STEP 1 : COMPUTE MARKOV BLANKETS
	Mb = dict([(rv,[]) for rv in range(n_rv)])

	for X in _T:
		S = []

		# GROW PHASE: keep sweeping until a full pass adds nothing
		grow_condition = True
		while grow_condition:

			grow_condition=False
			for Y in range(n_rv):
				if X!=Y and Y not in S:
					# if there exists some Y such that Y is dependent on X given S,
					# add Y to S
					cols = (X,Y) + tuple(S)
					pval = mi_test(data[:,cols])
					if pval < alpha: # dependent
						grow_condition=True # dependent -> continue searching
						S.append(Y)

		# SHRINK PHASE: keep sweeping until a full pass removes nothing.
		# Y is removed from S as soon as it tests independent, so each
		# repeated pass works on a strictly smaller S and the loop ends.
		shrink_condition = True
		while shrink_condition:

			shrink_condition=False
			for Y in list(S): # iterate a snapshot, S is edited below
				s_rest = [Z for Z in S if Z != Y] # condition on S-{Y}
				cols = (X,Y) + tuple(s_rest)
				pval = mi_test(data[:,cols])
				if pval < alpha: # dependent -> keep Y
					continue
				# independent of X given S-{Y} -> drop Y and sweep again
				S.remove(Y)
				shrink_condition=True

		Mb[X] = S
		if debug:
			print('Markov Blanket for %s : %s' % (X, str(S)))

	if feature_selection is None:
		# STEP 2: COMPUTE GRAPH STRUCTURE
		# i.e. Resolve Markov Blanket
		edge_dict = resolve_markov_blanket(Mb,data)
		if debug:
			print('Unoriented edge dict:\n %s' % str(edge_dict))

		# STEP 3: ORIENT EDGES
		oriented_edge_dict = orient_edges_MB(edge_dict,Mb,data,alpha)
		if debug:
			print('Oriented edge dict:\n %s' % str(oriented_edge_dict))

		# CREATE BAYESNET OBJECT
		bn=BayesNet(oriented_edge_dict,value_dict)

		return bn
	else:
		# _T is a one-element list and cannot be used as a dict key
		return Mb[feature_selection]
Beispiel #8
0
def tabu(data, k=5, metric='AIC', max_iter=100, debug=False, restriction=None):
    """
    Tabu search for score-based structure learning.

    The algorithm maintains a list called "tabu_list",
    which consists of 3-tuples, where the first two
    elements constitute the edge which is tabued, and
    the third element is a string - either 'Addition',
    'Deletion', or 'Reversal' denoting the operation
    associated with the edge. The list is used as a ring
    buffer of length *k*: the move made in iteration i is
    stored in slot ``i % k``.

    In every iteration all non-tabu arc additions, deletions
    and reversals are scored and the single move with the
    largest positive delta score is applied.

    Arguments
    ---------
    *data* : a nested numpy array
        The data from which the Bayesian network
        structure will be learned.

    *k* : an integer
        The length of the tabu list.

    *metric* : a string
        Which score metric to use.
        Options:
            - AIC
            - BIC / MDL
            - LL (log-likelihood)

    *max_iter* : an integer
        The maximum number of iterations of the
        hill-climbing algorithm to run. Note that
        the algorithm will terminate on its own if no
        improvement is made in a given iteration.

    *debug* : boolean
        Whether to print(the scores/moves of the)
        algorithm as its happening.

    *restriction* : a list of 2-tuples
        For MMHC algorithm, the list of allowable edge additions.

    Returns
    -------
    *bn* : a BayesNet object

    """
    nrow = data.shape[0]
    ncol = data.shape[1]

    names = range(ncol)

    # INITIALIZE NETWORK W/ NO EDGES
    # maintain children and parents dict for fast lookups
    c_dict = dict([(n, []) for n in names])
    p_dict = dict([(n, []) for n in names])

    # COMPUTE INITIAL LIKELIHOOD SCORE
    bn = BayesNet(c_dict)
    mle_estimator(bn, data)
    # NOTE(review): the result is not used by the search below (moves are
    # ranked by delta scores only); the call is kept because it is not
    # visible here whether info_score has effects of its own.
    info_score(bn, nrow, metric)

    tabu_list = [None] * k

    _iter = 0
    improvement = True

    while improvement:
        improvement = False
        max_delta = 0
        max_operation = None
        max_arc = None

        if debug:
            print('ITERATION: ', _iter)

        ### TEST ARC ADDITIONS ###
        for u in bn.nodes():
            for v in bn.nodes():
                # CHECK TABU LIST - can't add back a deletion on the tabu list
                if (u, v, 'Deletion') not in tabu_list:
                    # CHECK EDGE EXISTENCE AND CYCLICITY
                    if v not in c_dict[u] and u != v and not would_cause_cycle(
                            c_dict, u, v):
                        # FOR MMHC ALGORITHM -> Edge Restrictions
                        if restriction is None or (u, v) in restriction:
                            # SCORE FOR 'V' -> gaining a parent
                            old_cols = (v, ) + tuple(
                                p_dict[v])  # without 'u' as parent
                            mi_old = mutual_information(data[:, old_cols])
                            new_cols = old_cols + (u, )  # with'u' as parent
                            mi_new = mutual_information(data[:, new_cols])
                            delta_score = nrow * (mi_old - mi_new)

                            if delta_score > max_delta:
                                if debug:
                                    print('Improved Arc Addition: ', (u, v))
                                    print('Delta Score: ', delta_score)
                                max_delta = delta_score
                                max_operation = 'Addition'
                                max_arc = (u, v)

        ### TEST ARC DELETIONS ###
        for u in bn.nodes():
            for v in bn.nodes():
                # CHECK TABU LIST - can't delete an addition on the tabu list
                if (u, v, 'Addition') not in tabu_list:
                    if v in c_dict[u]:
                        # SCORE FOR 'V' -> losing a parent
                        old_cols = (v, ) + tuple(
                            p_dict[v])  # with 'u' as parent
                        mi_old = mutual_information(data[:, old_cols])
                        new_cols = tuple([i for i in old_cols
                                          if i != u])  # without 'u' as parent
                        mi_new = mutual_information(data[:, new_cols])
                        delta_score = nrow * (mi_old - mi_new)

                        if delta_score > max_delta:
                            if debug:
                                print('Improved Arc Deletion: ', (u, v))
                                print('Delta Score: ', delta_score)
                            max_delta = delta_score
                            max_operation = 'Deletion'
                            max_arc = (u, v)

        ### TEST ARC REVERSALS ###
        for u in bn.nodes():
            for v in bn.nodes():
                # CHECK TABU LIST - can't reverse back a reversal on the tabu list
                if (u, v, 'Reversal') not in tabu_list:
                    if v in c_dict[u] and not would_cause_cycle(
                            c_dict, v, u, reverse=True):
                        # SCORE FOR 'U' -> gaining 'v' as parent
                        # (u's family is u together with u's own parents)
                        old_cols = (u, ) + tuple(
                            p_dict[u])  # without 'v' as parent
                        mi_old = mutual_information(data[:, old_cols])
                        new_cols = old_cols + (v, )  # with 'v' as parent
                        mi_new = mutual_information(data[:, new_cols])
                        delta1 = nrow * (mi_old - mi_new)
                        # SCORE FOR 'V' -> losing 'u' as parent
                        old_cols = (v, ) + tuple(
                            p_dict[v])  # with 'u' as parent
                        mi_old = mutual_information(data[:, old_cols])
                        new_cols = tuple([i for i in old_cols
                                          if i != u])  # without 'u' as parent
                        mi_new = mutual_information(data[:, new_cols])
                        delta2 = nrow * (mi_old - mi_new)
                        # COMBINED DELTA-SCORES
                        delta_score = delta1 + delta2

                        if delta_score > max_delta:
                            if debug:
                                print('Improved Arc Reversal: ', (u, v))
                                print('Delta Score: ', delta_score)
                            max_delta = delta_score
                            max_operation = 'Reversal'
                            max_arc = (u, v)

        ### DETERMINE IF/WHERE IMPROVEMENT WAS MADE ###
        if max_delta != 0:
            improvement = True
            u, v = max_arc
            if max_operation == 'Addition':
                if debug:
                    print('ADDING: ', max_arc, '\n')
                c_dict[u].append(v)
                p_dict[v].append(u)
                tabu_entry = (u, v, 'Addition')
            elif max_operation == 'Deletion':
                if debug:
                    print('DELETING: ', max_arc, '\n')
                c_dict[u].remove(v)
                p_dict[v].remove(u)
                tabu_entry = (u, v, 'Deletion')
            else:  # 'Reversal'
                if debug:
                    print('REVERSING: ', max_arc, '\n')
                # the structure update must happen regardless of `debug`
                c_dict[u].remove(v)
                p_dict[v].remove(u)
                c_dict[v].append(u)
                p_dict[u].append(v)
                # The arc is now v -> u; record it in that direction so the
                # reversal check above blocks flipping it straight back.
                tabu_entry = (v, u, 'Reversal')

            # ring buffer over the k tabu slots (no-op when k <= 0)
            if tabu_list:
                tabu_list[_iter % len(tabu_list)] = tabu_entry
        else:
            if debug:
                print('No Improvement on Iter: ', _iter)

        ### TEST FOR MAX ITERATION ###
        _iter += 1
        if _iter > max_iter:
            if debug:
                print('Max Iteration Reached')
            break

    # NOTE(review): `bn` was built from c_dict before the search and is not
    # rebuilt here (the rebuild below is commented out) - confirm that
    # BayesNet reflects the in-place changes made to c_dict.
    # bn = BayesNet(c_dict)

    return bn
Beispiel #9
0
def read_bif(path):
    """
    This function reads a .bif file into a
    BayesNet object. It's probably not the
    fastest or prettiest but it gets the job
    done.

    Arguments
    ---------
    *path* : a string
        The path

    Returns
    -------
    *bn* : a BayesNet object

    Effects
    -------
    None

    Notes
    -----
    *V* : a list of strings
    *E* : a dict, where key = vertex, val = list of its children
    *F* : a dict, where key = rv, val = another dict with
                keys = 'parents', 'values', cpt'

    The parser is line based: a line starting with "variable" opens
    a variable whose values are read from the following line, and a
    line starting with "probability" opens the table of the variable
    named in it.

    """
    _parents = {}  # key = vertex, value = list of its parents
    _cpt = {}  # key = vertex, value = list of rows (lists of floats)
    _vals = {}  # key=vertex, val=list of its possible values

    with open(path, 'r') as bif:
        while True:
            line = bif.readline()
            if line == '':
                break  # end of file

            # Match the keyword at the start of the line only, so that a
            # variable whose NAME contains "variable" does not turn its
            # probability line into a bogus declaration.
            head = line.lstrip()

            if head.startswith('variable'):
                new_vertex = line.split()[1]

                _parents[new_vertex] = []
                _cpt[new_vertex] = []

                # the next line carries the type and the list of values;
                # the first six tokens and the closing token are skipped
                type_line = bif.readline()
                _vals[new_vertex] = type_line.replace(',', ' ').split()[6:-1]
            elif head.startswith('probability'):
                tokens = line.replace(',', ' ').split()
                child_rv = tokens[2]
                parent_rvs = tokens[4:-2]

                if len(parent_rvs) == 0:  # prior
                    row = bif.readline().replace(';', ' ').replace(
                        ',', ' ').split()
                    # first token is the row label, the rest are numbers
                    _cpt[child_rv].append([float(p) for p in row[1:]])
                else:  # not a prior
                    _parents[child_rv].extend(parent_rvs)
                    # Each row ends with one probability per value of the
                    # CHILD variable (not of the most recently declared
                    # variable, which is generally a different one).
                    n_outcomes = len(_vals[child_rv])
                    while True:
                        new_line = bif.readline()
                        # also stop at EOF so a missing '}' cannot hang
                        if new_line == '' or '}' in new_line:
                            break
                        row = new_line.replace(',', ' ').replace(
                            ';', ' ').replace('(', ' ').replace(')',
                                                                ' ').split()
                        _cpt[child_rv].append(
                            [float(p) for p in row[-n_outcomes:]])

    # CREATE FACTORS
    _F = {}
    _E = {}
    for rv in _vals:
        # children of rv = every vertex that lists rv among its parents
        _E[rv] = [c for c in _vals if rv in _parents[c]]
        _F[rv] = {
            'parents': _parents[rv],
            'values': _vals[rv],
            # flatten the rows into one list
            'cpt': [item for sublist in _cpt[rv] for item in sublist]
        }

    bn = BayesNet()
    bn.F = _F
    bn.E = _E
    bn.V = list(topsort(_E))

    return bn
Beispiel #10
0
def read_json(path):
    """
    Read a BayesNet object from the json format. This
    format has the ".bn" extension and is completely
    unique to pyBN.

    Arguments
    ---------
    *path* : a string
        The file path

    Returns
    -------
    *bn* : a BayesNet object
        Its E and F attributes are taken from the "E" and "F"
        entries of the file and its V attribute is computed
        as ``topsort(E)``. If the file is not valid JSON, a
        message is printed and the BayesNet is returned as
        created by ``BayesNet()``.

    Effects
    -------
    - Instantiates and sets a new BayesNet object

    Notes
    -----

    The top-level JSON object must provide the keys "E" and "F".

    NOTE(review): the example below is the libpgm-style layout
    from the original documentation; it uses "Vdata", which this
    function does not read - confirm the intended format.

    File Format:
        {
            "V": ["Letter", "Grade", "Intelligence", "SAT", "Difficulty"],
            "E": [["Intelligence", "Grade"],
                ["Difficulty", "Grade"],
                ["Intelligence", "SAT"],
                ["Grade", "Letter"]],
            "Vdata": {
                "Letter": {
                    "ord": 4,
                    "numoutcomes": 2,
                    "vals": ["weak", "strong"],
                    "parents": ["Grade"],
                    "children": None,
                    "cprob": [[.1, .9],[.4, .6],[.99, .01]]
                },
                ...
        }


    """
    # `unicode` only exists on Python 2. On Python 3 json already yields
    # native str objects and no re-encoding is done.
    try:
        text_type = unicode
    except NameError:
        text_type = None

    def byteify(obj):
        # recursively encode Python 2 unicode strings to utf-8 byte strings
        if isinstance(obj, dict):
            return {byteify(key): byteify(value)
                    for key, value in obj.items()}
        elif isinstance(obj, list):
            return [byteify(element) for element in obj]
        elif text_type is not None and isinstance(obj, text_type):
            return obj.encode('utf-8')
        else:
            return obj

    bn = BayesNet()

    # context manager so the handle is closed even if reading fails
    with open(path, 'r') as f:
        ftxt = f.read()

    try:
        data = byteify(json.loads(ftxt))
    except ValueError:
        print("Could not read file - check format")
        # nothing was loaded, so there is no structure to sort
        return bn

    bn.E = data['E']
    bn.F = data['F']
    bn.V = topsort(bn.E)

    return bn
Beispiel #11
0
def hc_rr(data,
          M=5,
          R=3,
          metric='AIC',
          max_iter=100,
          debug=False,
          restriction=None):
    """
    Hill climbing with random restarts for structure learning.

    In every iteration all arc additions, deletions and reversals
    are scored and the single move with the largest positive delta
    score is applied. When no move improves the score, up to *R*
    restarts are made; each restart applies up to *M* randomly
    chosen moves before hill climbing resumes.

    Arguments
    ---------
    *data* : a nested numpy array
        The data from which the Bayesian network
        structure will be learned.

    *M* : an integer
        The number of random moves attempted per restart.

    *R* : an integer
        The maximum number of restarts.

    *metric* : a string
        Which score metric to use.
        Options:
            - AIC
            - BIC / MDL
            - LL (log-likelihood)

    *max_iter* : an integer
        The maximum number of iterations of the
        hill-climbing algorithm to run. The counter is
        reset on every restart.

    *debug* : boolean
        Whether to print(the scores/moves of the)
        algorithm as its happening.

    *restriction* : a list of 2-tuples
        For MMHC algorithm, the list of allowable edge additions.
        It is honoured by the random restart additions as well.

    Returns
    -------
    *c_dict* : a dict
        key = node, val = list of that node's children in the
        learned structure.
        NOTE(review): a BayesNet is built internally but the
        children dict is what gets returned - confirm callers
        expect the dict.
    """
    nrow = data.shape[0]
    ncol = data.shape[1]

    names = range(ncol)

    # INITIALIZE NETWORK W/ NO EDGES
    # maintain children and parents dict for fast lookups
    c_dict = dict([(n, []) for n in names])
    p_dict = dict([(n, []) for n in names])

    # COMPUTE INITIAL LIKELIHOOD SCORE
    bn = BayesNet(c_dict)
    mle_estimator(bn, data)
    # NOTE(review): the result is not used by the search below (moves are
    # ranked by delta scores only); the call is kept because it is not
    # visible here whether info_score has effects of its own.
    info_score(bn, nrow, metric)

    _iter = 0
    improvement = True
    _restarts = 0

    while improvement:
        improvement = False
        max_delta = 0
        max_operation = None
        max_arc = None

        if debug:
            print('ITERATION: ', _iter)

        ### TEST ARC ADDITIONS ###
        for u in bn.nodes():
            for v in bn.nodes():
                if v not in c_dict[u] and u != v and not would_cause_cycle(
                        c_dict, u, v):
                    # FOR MMHC ALGORITHM -> Edge Restrictions
                    if restriction is None or (u, v) in restriction:
                        # SCORE FOR 'V' -> gaining a parent
                        old_cols = (v, ) + tuple(
                            p_dict[v])  # without 'u' as parent
                        mi_old = mutual_information(data[:, old_cols])
                        new_cols = old_cols + (u, )  # with'u' as parent
                        mi_new = mutual_information(data[:, new_cols])
                        delta_score = nrow * (mi_old - mi_new)

                        if delta_score > max_delta:
                            if debug:
                                print('Improved Arc Addition: ', (u, v))
                                print('Delta Score: ', delta_score)
                            max_delta = delta_score
                            max_operation = 'Addition'
                            max_arc = (u, v)

        ### TEST ARC DELETIONS ###
        for u in bn.nodes():
            for v in bn.nodes():
                if v in c_dict[u]:
                    # SCORE FOR 'V' -> losing a parent
                    old_cols = (v, ) + tuple(p_dict[v])  # with 'u' as parent
                    mi_old = mutual_information(data[:, old_cols])
                    new_cols = tuple([i for i in old_cols
                                      if i != u])  # without 'u' as parent
                    mi_new = mutual_information(data[:, new_cols])
                    delta_score = nrow * (mi_old - mi_new)

                    if delta_score > max_delta:
                        if debug:
                            print('Improved Arc Deletion: ', (u, v))
                            print('Delta Score: ', delta_score)
                        max_delta = delta_score
                        max_operation = 'Deletion'
                        max_arc = (u, v)

        ### TEST ARC REVERSALS ###
        for u in bn.nodes():
            for v in bn.nodes():
                if v in c_dict[u] and not would_cause_cycle(
                        c_dict, v, u, reverse=True):
                    # SCORE FOR 'U' -> gaining 'v' as parent
                    # (u's family is u together with u's own parents)
                    old_cols = (u, ) + tuple(
                        p_dict[u])  # without 'v' as parent
                    mi_old = mutual_information(data[:, old_cols])
                    new_cols = old_cols + (v, )  # with 'v' as parent
                    mi_new = mutual_information(data[:, new_cols])
                    delta1 = nrow * (mi_old - mi_new)
                    # SCORE FOR 'V' -> losing 'u' as parent
                    old_cols = (v, ) + tuple(p_dict[v])  # with 'u' as parent
                    mi_old = mutual_information(data[:, old_cols])
                    new_cols = tuple([i for i in old_cols
                                      if i != u])  # without 'u' as parent
                    mi_new = mutual_information(data[:, new_cols])
                    delta2 = nrow * (mi_old - mi_new)
                    # COMBINED DELTA-SCORES
                    delta_score = delta1 + delta2

                    if delta_score > max_delta:
                        if debug:
                            print('Improved Arc Reversal: ', (u, v))
                            print('Delta Score: ', delta_score)
                        max_delta = delta_score
                        max_operation = 'Reversal'
                        max_arc = (u, v)

        ### DETERMINE IF/WHERE IMPROVEMENT WAS MADE ###
        if max_delta != 0:
            improvement = True
            u, v = max_arc
            if max_operation == 'Addition':
                if debug:
                    print('ADDING: ', max_arc, '\n')
                c_dict[u].append(v)
                p_dict[v].append(u)
            elif max_operation == 'Deletion':
                if debug:
                    print('DELETING: ', max_arc, '\n')
                c_dict[u].remove(v)
                p_dict[v].remove(u)
            elif max_operation == 'Reversal':
                if debug:
                    print('REVERSING: ', max_arc, '\n')
                # the structure update must happen regardless of `debug`
                c_dict[u].remove(v)
                p_dict[v].remove(u)
                c_dict[v].append(u)
                p_dict[u].append(v)
        else:
            if debug:
                print('No Improvement on Iter: ', _iter)
            #### RESTART WITH RANDOM MOVES ####
            if _restarts < R:
                improvement = True  # make another pass of hill climbing
                _iter = 0  # reset iterations
                if debug:
                    print('Restart - ', _restarts)
                _restarts += 1
                nodes = list(bn.nodes())
                for _ in range(M):
                    # 0 = Addition, 1 = Deletion, 2 = Reversal
                    operation = np.random.choice([0, 1, 2])

                    # Enumerate the currently valid moves of the drawn kind
                    # and pick one of them. Re-drawing random node pairs
                    # until one happens to be valid never terminates when
                    # no valid move exists (e.g. deleting from an empty
                    # graph).
                    if operation == 0:
                        candidates = [
                            (a, b) for a in nodes for b in nodes
                            if a != b and a not in p_dict[b] and
                            (restriction is None or (a, b) in restriction)
                            and not would_cause_cycle(c_dict, a, b)
                        ]
                    elif operation == 1:
                        candidates = [(a, b) for a in nodes
                                      for b in c_dict[a]]
                    else:
                        candidates = [
                            (a, b) for a in nodes for b in c_dict[a]
                            if not would_cause_cycle(
                                c_dict, b, a, reverse=True)
                        ]

                    if not candidates:
                        continue  # no valid move of this kind right now

                    u, v = candidates[np.random.choice(len(candidates))]

                    if operation == 0:
                        if debug:
                            print('RESTART - ADDING: ', (u, v))
                        c_dict[u].append(v)
                        p_dict[v].append(u)
                    elif operation == 1:
                        if debug:
                            print('RESTART - DELETING: ', (u, v))
                        c_dict[u].remove(v)
                        p_dict[v].remove(u)
                    else:
                        if debug:
                            print('RESTART - REVERSING: ', (u, v))
                        c_dict[u].remove(v)
                        p_dict[v].remove(u)
                        c_dict[v].append(u)
                        p_dict[u].append(v)

        ### TEST FOR MAX ITERATION ###
        _iter += 1
        if _iter > max_iter:
            if debug:
                print('Max Iteration Reached')
            break

    # bn = BayesNet(c_dict)

    return c_dict
Beispiel #12
0
    def hc(self,
           metric='AIC',
           max_iter=100,
           debug=False,
           restriction=None,
           whitelist=None):
        """
        Learn a network structure by greedy hill climbing.

        The search starts from the graph made of the whitelisted arcs
        (an arc-free graph when no whitelist is given). Every round
        runs ``self.test_arc_additions`` and ``self.test_arc_deletions``
        in two worker processes, collects the candidates they put on a
        shared queue, and applies the one with the largest strictly
        positive delta. The loop ends after the first round in which no
        candidate has a positive delta. A 'Reversal' candidate would be
        applied as well, but the reversal worker is not started here.

        Besides returning the network, the method prints progress to
        stdout and, once the search is over, shows a plot of the score
        per iteration.

        Arguments
        ---------
        *metric* : a string
            Accepted but not read by this method.

        *max_iter* : an integer
            Accepted but not read by this method: the iteration cap is
            disabled, so the search stops only when no move improves
            the score.

        *debug* : boolean
            Whether to print the iteration number and the applied move.

        *restriction* : a list of 2-tuples
            For MMHC algorithm, the list of allowable edge additions.
            Stored on ``self.restriction``.

        *whitelist* : a list of 2-tuples
            Arcs ``(u, v)`` placed in the initial graph. Stored on
            ``self.whitelist`` (an empty list when None is given).

        Returns
        -------
        *bn* : a BayesNet object
            Built from ``self.c_dict`` after the search.

        """
        # Children and parents are tracked in separate dicts for fast
        # lookups; both start without any arc.
        self.c_dict = {n: [] for n in self.nodes}
        self.p_dict = {n: [] for n in self.nodes}

        self.restriction = restriction
        self.whitelist = [] if whitelist is None else whitelist

        # Seed the graph with the whitelisted arcs.
        for parent, child in self.whitelist:
            if parent in self.c_dict:
                self.c_dict[parent].append(child)
            if child in self.p_dict:
                self.p_dict[child].append(parent)
        print("Whitelist", self.whitelist)

        self.bn = BayesNet(self.c_dict)
        print("Nodes:", list(self.bn.nodes()))

        # The complexity term is not part of the score for continuous
        # variables, so only the model score is used.
        score = model_score(self.data, self.bn)
        print("Initial Score:", score)

        # Caches shared with the worker processes.
        manager = Manager()
        mut_inf_cache = manager.dict()
        configs_cache = manager.dict()

        # Score trajectory, plotted at the end.
        iterations = []
        scores = []

        move_labels = {
            'Addition': 'ADDING: ',
            'Deletion': 'DELETING: ',
            'Reversal': 'REVERSING: ',
        }

        _iter = 0
        improvement = True
        while improvement:
            iterations.append(_iter)
            scores.append(score)
            start_t = time.time()

            if debug:
                print('ITERATION: ', _iter)

            # One worker per move type; each puts its best candidate
            # on the queue.
            return_queue = Queue()
            worker_args = (configs_cache, mut_inf_cache, return_queue)
            workers = [
                Process(target=self.test_arc_additions, args=worker_args),
                Process(target=self.test_arc_deletions, args=worker_args),
            ]
            for worker in workers:
                worker.start()
            for worker in workers:
                worker.join()

            # Keep the candidate with the largest strictly positive delta.
            max_delta = 0
            max_operation = None
            while not return_queue.empty():
                results = return_queue.get()
                if results[1] > max_delta:
                    max_arc, max_delta, max_operation = results[:3]

            improvement = bool(max_operation)
            if not improvement:
                if debug:
                    print('No Improvement on Iter: ', _iter)
            else:
                score += max_delta
                u, v = max_arc
                str_arc = list(max_arc)
                if debug and max_operation in move_labels:
                    print("delta:", max_delta)
                    print(move_labels[max_operation], str_arc, '\n')

                if max_operation == 'Addition':
                    self.p_dict[v].append(u)
                    self.bn.add_edge(u, v)
                elif max_operation in ('Deletion', 'Reversal'):
                    self.p_dict[v].remove(u)
                    self.bn.remove_edge(u, v)
                    if max_operation == 'Reversal':
                        self.p_dict[u].append(v)
                        self.bn.add_edge(v, u)
                # TODO: improve so only changed elements get an update
                print("Model score:", score)
            print("Time for iteration:", time.time() - start_t)

            # NOTE: there is no max_iter test here; the loop condition
            # alone ends the search.
            _iter += 1

        bn = BayesNet(self.c_dict)
        print("Size of Cache", len(mut_inf_cache))
        print("SCORE =", score)

        plt.plot(iterations, scores)
        plt.show()

        return bn
Beispiel #13
0
class hill_climbing:
    """
    Score-based structure learning by greedy hill climbing.

    An instance holds the data set and the node list. ``hc`` runs the
    search; the ``test_*`` methods score candidate moves and report
    their best candidate through a queue (``hc`` starts them as worker
    processes).
    """

    def __init__(self, data, nodes):
        """
        Store the data set and derive its dimensions.

        *data* and *nodes* must both support ``len()``; *nodes* is also
        iterated to build the children/parents dicts in ``hc``.
        """
        self.data = data
        self.nodes = nodes

        self.nrow = len(self.data)
        self.ncol = len(self.nodes)
        self.names = range(self.ncol)

        # From Density Estimation for Statistics and Data Analysis, Bernard. W. Silverman, CRC ,1986
        #   (chapter Required sample size for given accuracy)
        self.sample_size = [
            4, 19, 67, 223, 768, 2790, 10700, 43700, 187000, 842000
        ]

    def hc(self,
           metric='AIC',
           max_iter=100,
           debug=False,
           restriction=None,
           whitelist=None):
        """
        Learn a network structure by greedy hill climbing.

        The search starts from the graph made of the whitelisted arcs
        (an arc-free graph when no whitelist is given). Every round
        runs ``test_arc_additions`` and ``test_arc_deletions`` in two
        worker processes and applies the reported candidate with the
        largest strictly positive delta. The loop ends after the first
        round in which no candidate has a positive delta. Whitelisted
        arcs are never proposed for deletion. ``test_arc_reversals`` is
        not started, so reversals are currently never proposed.

        Besides returning the network, the method prints progress to
        stdout and, once the search is over, shows a plot of the score
        per iteration.

        Arguments
        ---------
        *metric* : a string
            Accepted but not read by this method.

        *max_iter* : an integer
            Accepted but not read by this method: the iteration cap is
            disabled, so the search stops only when no move improves
            the score.

        *debug* : boolean
            Whether to print the iteration number and the applied move.

        *restriction* : a list of 2-tuples
            For MMHC algorithm, the list of allowable edge additions.
            When None, every addition is allowed.

        *whitelist* : a list of 2-tuples
            Arcs ``(u, v)`` placed in the initial graph and protected
            from deletion.

        Returns
        -------
        *bn* : a BayesNet object
            Built from ``self.c_dict`` after the search.

        """
        # Children and parents are tracked in separate dicts for fast
        # lookups; both start without any arc.
        self.c_dict = {n: [] for n in self.nodes}
        self.p_dict = {n: [] for n in self.nodes}

        self.restriction = restriction
        self.whitelist = [] if whitelist is None else whitelist

        # Seed the graph with the whitelisted arcs.
        for parent, child in self.whitelist:
            if parent in self.c_dict:
                self.c_dict[parent].append(child)
            if child in self.p_dict:
                self.p_dict[child].append(parent)
        print("Whitelist", self.whitelist)

        self.bn = BayesNet(self.c_dict)
        print("Nodes:", list(self.bn.nodes()))

        # The complexity term is not part of the score for continuous
        # variables, so only the model score is used.
        score = model_score(self.data, self.bn)
        print("Initial Score:", score)

        # Caches shared with the worker processes.
        manager = Manager()
        mut_inf_cache = manager.dict()
        configs_cache = manager.dict()

        # Score trajectory, plotted at the end.
        iterations = []
        scores = []

        move_labels = {
            'Addition': 'ADDING: ',
            'Deletion': 'DELETING: ',
            'Reversal': 'REVERSING: ',
        }

        _iter = 0
        improvement = True
        while improvement:
            iterations.append(_iter)
            scores.append(score)
            start_t = time.time()

            if debug:
                print('ITERATION: ', _iter)

            # One worker per move type; each puts its best candidate
            # on the queue.
            return_queue = Queue()
            worker_args = (configs_cache, mut_inf_cache, return_queue)
            workers = [
                Process(target=self.test_arc_additions, args=worker_args),
                Process(target=self.test_arc_deletions, args=worker_args),
            ]
            for worker in workers:
                worker.start()
            for worker in workers:
                worker.join()

            # Keep the candidate with the largest strictly positive delta.
            max_delta = 0
            max_operation = None
            while not return_queue.empty():
                results = return_queue.get()
                if results[1] > max_delta:
                    max_arc, max_delta, max_operation = results[:3]

            improvement = bool(max_operation)
            if not improvement:
                if debug:
                    print('No Improvement on Iter: ', _iter)
            else:
                score += max_delta
                u, v = max_arc
                str_arc = list(max_arc)
                if debug and max_operation in move_labels:
                    print("delta:", max_delta)
                    print(move_labels[max_operation], str_arc, '\n')

                # NOTE(review): self.c_dict is not touched directly here;
                # presumably BayesNet keeps a reference to it and
                # add_edge/remove_edge update it - confirm.
                if max_operation == 'Addition':
                    self.p_dict[v].append(u)
                    self.bn.add_edge(u, v)
                elif max_operation in ('Deletion', 'Reversal'):
                    self.p_dict[v].remove(u)
                    self.bn.remove_edge(u, v)
                    if max_operation == 'Reversal':
                        self.p_dict[u].append(v)
                        self.bn.add_edge(v, u)
                # TODO: improve so only changed elements get an update
                print("Model score:", score)
            print("Time for iteration:", time.time() - start_t)

            # NOTE: there is no max_iter test here; the loop condition
            # alone ends the search.
            _iter += 1

        bn = BayesNet(self.c_dict)
        print("Size of Cache", len(mut_inf_cache))
        print("SCORE =", score)

        plt.plot(iterations, scores)
        plt.show()

        return bn

    def _family_score(self, l_inf_cache, child, parents):
        """
        Return the score of *child* given the tuple *parents*.

        The value is ``score((child,) + parents) - score(parents)``, or
        just ``score((child,))`` when *parents* is empty. Scores come
        from ``calc_score`` and are memoised in *l_inf_cache* under the
        column tuple they were computed for.
        """
        family = (child, ) + parents
        if family not in l_inf_cache:
            l_inf_cache[family] = calc_score(self.data, family)
        if not parents:
            return l_inf_cache[family]
        if parents not in l_inf_cache:
            l_inf_cache[parents] = calc_score(self.data, parents)
        return l_inf_cache[family] - l_inf_cache[parents]

    def _sample_size_penalty(self, n_cols):
        """
        Return the ``sample_size`` entry used to penalise a family of
        *n_cols* columns, clamped to the last entry of the table.
        """
        return self.sample_size[min(n_cols, len(self.sample_size) - 1)]

    def test_arc_reversals(self, configs_cache, mut_inf_cache, return_queue):
        """
        Score every reversible arc and report the best reversal.

        Puts ``(arc, delta, operation, qi)`` on *return_queue*; *arc*
        and *operation* are None when no reversal has a positive delta.
        ``hc`` does not currently start this worker.
        """
        print("Test Reversals")
        ### TEST ARC REVERSALS ###
        max_delta = 0
        max_operation = None
        max_arc = None
        max_qi = 0
        for u in self.bn.nodes():
            for v in self.c_dict[u]:
                if not would_cause_cycle(self.c_dict, v, u, reverse=True) and (
                        self.restriction is None or
                    (v, u) in self.restriction):
                    # SCORE FOR 'U' -> gaining 'v' as parent
                    old_cols = (u, ) + tuple(
                        self.p_dict[u])  # without 'v' as parent
                    if old_cols not in mut_inf_cache:
                        mut_inf_cache[old_cols] = mutual_information(
                            self.data[list(old_cols)])
                    mi_old = mut_inf_cache[old_cols]

                    new_cols = old_cols + (v, )  # with 'v' as parent
                    if new_cols not in mut_inf_cache:
                        mut_inf_cache[new_cols] = mutual_information(
                            self.data[list(new_cols)])
                    mi_new = mut_inf_cache[new_cols]

                    delta1 = self.nrow * (mi_new - mi_old)

                    # SCORE FOR 'V' -> losing 'u' as parent
                    old_cols = (v, ) + tuple(
                        self.p_dict[v])  # with 'u' as parent
                    if old_cols not in mut_inf_cache:
                        mut_inf_cache[old_cols] = mutual_information(
                            self.data[list(old_cols)])
                    mi_old = mut_inf_cache[old_cols]

                    new_cols = tuple([i for i in old_cols
                                      if i != u])  # without 'u' as parent
                    if new_cols not in mut_inf_cache:
                        mut_inf_cache[new_cols] = mutual_information(
                            self.data[list(new_cols)])
                    mi_new = mut_inf_cache[new_cols]

                    delta2 = self.nrow * (mi_new - mi_old)

                    # COMBINED DELTA-SCORES
                    ri1 = self.bn.F[u]['ri']
                    qi1 = self.bn.F[u]['qi']
                    qi_new1 = calc_num_parent_configs(self.data,
                                                      self.bn.parents(u) + [v],
                                                      configs_cache)
                    ri2 = self.bn.F[v]['ri']
                    qi2 = self.bn.F[v]['qi']
                    qi_new2 = calc_num_parent_configs(
                        self.data, [x for x in self.bn.parents(v) if x != u],
                        configs_cache)

                    # Likelihood change minus the change in complexity
                    # for both nodes whose parent sets change.
                    delta_score = delta1 + delta2 - (
                        ri2 * (qi_new2 - qi2) - (qi_new2 - qi2)
                    ) - (
                        ri1 * (qi_new1 - qi1) - (qi_new1 - qi1)
                    )

                    if delta_score - max_delta > 10**(-10):
                        max_delta = delta_score
                        max_operation = 'Reversal'
                        max_arc = (u, v)
                        max_qi = (qi_new1, qi_new2)
        return_queue.put((max_arc, max_delta, max_operation, max_qi))

    def test_arc_deletions(self, configs_cache, l_inf_cache, return_queue):
        """
        Score every deletable arc and report the best deletion.

        Whitelisted arcs are skipped. For an arc ``u -> v`` the delta
        is the score of ``v`` given its parents without ``u`` minus the
        score of ``v`` given its current parents. Puts
        ``(arc, delta, operation, 0)`` on *return_queue*; *arc* and
        *operation* are None when no deletion has a positive delta.
        *configs_cache* is not used.
        """
        print("Test Deletions")
        ### TEST ARC DELETIONS ###
        max_delta = 0
        max_operation = None
        max_arc = None
        max_qi = 0
        for u in self.bn.nodes():
            for v in [
                    n for n in self.c_dict[u] if (u, n) not in self.whitelist
            ]:
                parents = tuple(self.p_dict[v])  # with 'u' as parent
                l_old = self._family_score(l_inf_cache, v, parents)

                # The new score must be taken over the REDUCED parent
                # set; scoring the unchanged family here would make
                # every delta zero.
                remaining = tuple(p for p in parents if p != u)
                l_new = self._family_score(l_inf_cache, v, remaining)

                delta_score = l_new - l_old

                if delta_score - max_delta > 10**(-10):
                    max_delta = delta_score
                    max_operation = 'Deletion'
                    max_arc = (u, v)
        return_queue.put((max_arc, max_delta, max_operation, max_qi))

    def test_arc_additions(self, configs_cache, l_inf_cache, return_queue):
        """
        Score all arc additions and report the best one.

        Starts one ``test_arcs`` process per node, waits for all of
        them, and puts the best ``(arc, delta, operation)`` on
        *return_queue*; *arc* and *operation* are None when no addition
        has a positive delta.
        """
        print("Test Additions")
        ### TEST ARC ADDITIONS ###
        max_delta = 0
        max_operation = None
        max_arc = None
        procs = []
        result_queue = Queue()
        for u in self.bn.nodes():
            p = Process(target=self.test_arcs,
                        args=(configs_cache, l_inf_cache, u, result_queue))
            procs.append(p)
            p.start()

        for p in procs:
            p.join()

        while not result_queue.empty():
            results = result_queue.get()

            if results[1] - max_delta > 10**(-10):
                max_arc = results[0]
                max_delta = results[1]
                max_operation = results[2]
        return_queue.put((max_arc, max_delta, max_operation))

    def test_arcs(self, configs_cache, l_inf_cache, u, result_queue):
        """
        Score every arc ``u -> v`` that could be added and report the
        best one.

        Candidates are nodes ``v`` that are not ``u``, not already a
        child of ``u``, would not close a cycle, and (when a
        restriction list is set) for which ``(u, v)`` is allowed. The
        delta is the gain in the score of ``v`` from adding ``u`` as a
        parent, minus a sample-size penalty for the enlarged family.
        Puts ``(arc, delta, operation)`` on *result_queue*.
        *configs_cache* is not used.
        """
        max_delta = 0
        max_operation = None
        max_arc = None
        for v in [
                n for n in self.bn.nodes()
                if u != n and n not in self.c_dict[u]
                and not would_cause_cycle(self.c_dict, u, n)
        ]:
            # FOR MMHC ALGORITHM -> Edge Restrictions
            if self.restriction is None or (u, v) in self.restriction:
                # SCORE FOR 'V' -> gaining a parent
                parents = tuple(self.p_dict[v])  # without 'u' as parent
                l_old = self._family_score(l_inf_cache, v, parents)
                l_new = self._family_score(l_inf_cache, v, parents + (u, ))

                # Family size after the addition: v, its parents, and u.
                n_cols = len(parents) + 2
                delta_score = (l_new - l_old) - self._sample_size_penalty(
                    n_cols)

                if delta_score - max_delta > 10**(-10):
                    max_delta = delta_score
                    max_operation = 'Addition'
                    max_arc = (u, v)
        result_queue.put((max_arc, max_delta, max_operation))
Beispiel #14
0
def hc_rr(data, M=5, R=3, metric='AIC', max_iter=100, debug=False, restriction=None):
	"""
	Arguments
	---------
	*data* : a nested numpy array
		The data from which the Bayesian network
		structure will be learned.

	*metric* : a string
		Which score metric to use.
		Options:
			- AIC
			- BIC / MDL
			- LL (log-likelihood)

	*max_iter* : an integer
		The maximum number of iterations of the
		hill-climbing algorithm to run. Note that
		the algorithm will terminate on its own if no
		improvement is made in a given iteration.

	*debug* : boolean
		Whether to print the scores/moves of the
		algorithm as its happening.

	*restriction* : a list of 2-tuples
		For MMHC algorithm, the list of allowable edge additions.

	Returns
	-------
	*bn* : a BayesNet object
	"""
	nrow = data.shape[0]
	ncol = data.shape[1]
	
	names = range(ncol)

	# INITIALIZE NETWORK W/ NO EDGES
	# maintain children and parents dict for fast lookups
	c_dict = dict([(n,[]) for n in names])
	p_dict = dict([(n,[]) for n in names])
	
	# COMPUTE INITIAL LIKELIHOOD SCORE	
	value_dict = dict([(n, np.unique(data[:,i])) for i,n in enumerate(names)])
	bn = BayesNet(c_dict)
	mle_estimator(bn, data)
	max_score = info_score(bn, nrow, metric)
	

	_iter = 0
	improvement = True
	_restarts = 0

	while improvement:
		improvement = False
		max_delta = 0

		if debug:
			print 'ITERATION: ' , _iter

		### TEST ARC ADDITIONS ###
		for u in bn.nodes():
			for v in bn.nodes():
				if v not in c_dict[u] and u!=v and not would_cause_cycle(c_dict, u, v):
					# FOR MMHC ALGORITHM -> Edge Restrictions
					if restriction is None or (u,v) in restriction:
						# SCORE FOR 'V' -> gaining a parent
						old_cols = (v,) + tuple(p_dict[v]) # without 'u' as parent
						mi_old = mutual_information(data[:,old_cols])
						new_cols = old_cols + (u,) # with'u' as parent
						mi_new = mutual_information(data[:,new_cols])
						delta_score = nrow * (mi_old - mi_new)

						if delta_score > max_delta:
							if debug:
								print 'Improved Arc Addition: ' , (u,v)
								print 'Delta Score: ' , delta_score
							max_delta = delta_score
							max_operation = 'Addition'
							max_arc = (u,v)

		### TEST ARC DELETIONS ###
		for u in bn.nodes():
			for v in bn.nodes():
				if v in c_dict[u]:
					# SCORE FOR 'V' -> losing a parent
					old_cols = (v,) + tuple(p_dict[v]) # with 'u' as parent
					mi_old = mutual_information(data[:,old_cols])
					new_cols = tuple([i for i in old_cols if i != u]) # without 'u' as parent
					mi_new = mutual_information(data[:,new_cols])
					delta_score = nrow * (mi_old - mi_new)

					if delta_score > max_delta:
						if debug:
							print 'Improved Arc Deletion: ' , (u,v)
							print 'Delta Score: ' , delta_score
						max_delta = delta_score
						max_operation = 'Deletion'
						max_arc = (u,v)

		### TEST ARC REVERSALS ###
		for u in bn.nodes():
			for v in bn.nodes():
				if v in c_dict[u] and not would_cause_cycle(c_dict,v,u, reverse=True):
					# SCORE FOR 'U' -> gaining 'v' as parent
					old_cols = (u,) + tuple(p_dict[v]) # without 'v' as parent
					mi_old = mutual_information(data[:,old_cols])
					new_cols = old_cols + (v,) # with 'v' as parent
					mi_new = mutual_information(data[:,new_cols])
					delta1 = nrow * (mi_old - mi_new)
					# SCORE FOR 'V' -> losing 'u' as parent
					old_cols = (v,) + tuple(p_dict[v]) # with 'u' as parent
					mi_old = mutual_information(data[:,old_cols])
					new_cols = tuple([u for i in old_cols if i != u]) # without 'u' as parent
					mi_new = mutual_information(data[:,new_cols])
					delta2 = nrow * (mi_old - mi_new)
					# COMBINED DELTA-SCORES
					delta_score = delta1 + delta2

					if delta_score > max_delta:
						if debug:
							print 'Improved Arc Reversal: ' , (u,v)
							print 'Delta Score: ' , delta_score
						max_delta = delta_score
						max_operation = 'Reversal'
						max_arc = (u,v)


		### DETERMINE IF/WHERE IMPROVEMENT WAS MADE ###
		if max_delta != 0:
			improvement = True
			u,v = max_arc
			if max_operation == 'Addition':
				if debug:
					print 'ADDING: ' , max_arc , '\n'
				c_dict[u].append(v)
				p_dict[v].append(u)
			elif max_operation == 'Deletion':
				if debug:
					print 'DELETING: ' , max_arc , '\n'
				c_dict[u].remove(v)
				p_dict[v].remove(u)
			elif max_operation == 'Reversal':
				if debug:
					print 'REVERSING: ' , max_arc, '\n'
					c_dict[u].remove(v)
					p_dict[v].remove(u)
					c_dict[v].append(u)
					p_dict[u].append(v)
		else:
			if debug:
				print 'No Improvement on Iter: ' , _iter
			#### RESTART WITH RANDOM MOVES ####
			if _restarts < R:
				improvement = True # make another pass of hill climbing
				_iter=0 # reset iterations
				if debug:
					print 'Restart - ' , _restarts
				_restarts+=1
				for _ in range(M):
					# 0 = Addition, 1 = Deletion, 2 = Reversal
					operation = np.random.choice([0,1,2])
					if operation == 0:
						while True:
							u,v = np.random.choice(list(bn.nodes()), size=2, replace=False)
							# IF EDGE DOESN'T EXIST, ADD IT
							if u not in p_dict[v] and u!=v and not would_cause_cycle(c_dict,u,v):
								if debug:
									print 'RESTART - ADDING: ', (u,v)
								c_dict[u].append(v)
								p_dict[v].append(u)
								break
					elif operation == 1:
						while True:
							u,v = np.random.choice(list(bn.nodes()), size=2, replace=False)
							# IF EDGE EXISTS, DELETE IT
							if u in p_dict[v]:
								if debug:
									print 'RESTART - DELETING: ', (u,v)
								c_dict[u].remove(v)
								p_dict[v].remove(u)
								break
					elif operation == 2:
						while True:
							u,v = np.random.choice(list(bn.nodes()), size=2, replace=False)
							# IF EDGE EXISTS, REVERSE IT
							if u in p_dict[v] and not would_cause_cycle(c_dict,v,u, reverse=True):
								if debug:
									print 'RESTART - REVERSING: ', (u,v)
								c_dict[u].remove(v)
								p_dict[v].remove(u)
								c_dict[v].append(u)
								p_dict[u].append(v)
								break

		### TEST FOR MAX ITERATION ###
		_iter += 1
		if _iter > max_iter:
			if debug:
				print 'Max Iteration Reached'
			break

	
	bn = BayesNet(c_dict)

	return bn
Beispiel #15
0
def hc(data, metric='AIC', max_iter=100, debug=False, restriction=None):
	"""
	Greedy Hill Climbing search proceeds by choosing the move
	which maximizes the increase in fitness of the
	network at the current step. It continues until
	it reaches a point where there does not exist any
	feasible single move that increases the network fitness.

	It is called "greedy" because it simply does what is
	best at the current iteration only, and thus does not
	look ahead to what may be better later on in the search.

	For computational saving, a Priority Queue (python's heapq) 
	can be used	to maintain the best operators and reduce the
	complexity of picking the best operator from O(n^2) to O(nlogn).
	This works by maintaining the heapq of operators sorted by their
	delta score, and each time a move is made, we only have to recompute
	the O(n) delta-scores which were affected by the move. The rest of
	the operator delta-scores are not affected.

	For additional computational efficiency, we can cache the
	sufficient statistics for various families of distributions - 
	therefore, computing the mutual information for a given family
	only needs to happen once.

	The possible moves are the following:
		- add edge
		- delete edge
		- invert edge

	Arguments
	---------
	*data* : a nested numpy array
		The data from which the Bayesian network
		structure will be learned.

	*metric* : a string
		Which score metric to use.
		Options:
			- AIC
			- BIC / MDL
			- LL (log-likelihood)

	*max_iter* : an integer
		The maximum number of iterations of the
		hill-climbing algorithm to run. Note that
		the algorithm will terminate on its own if no
		improvement is made in a given iteration.

	*debug* : boolean
		Whether to print the scores/moves of the
		algorithm as its happening.

	*restriction* : a list of 2-tuples
		For MMHC algorithm, the list of allowable edge additions.

	Returns
	-------
	*bn* : a BayesNet object

	"""
	nrow = data.shape[0]
	ncol = data.shape[1]
	
	names = range(ncol)

	# INITIALIZE NETWORK W/ NO EDGES
	# maintain children and parents dict for fast lookups
	c_dict = dict([(n,[]) for n in names])
	p_dict = dict([(n,[]) for n in names])
	
	# COMPUTE INITIAL LIKELIHOOD SCORE	
	value_dict = dict([(n, np.unique(data[:,i])) for i,n in enumerate(names)])
	bn = BayesNet(c_dict)
	mle_estimator(bn, data)
	max_score = info_score(bn, nrow, metric)

	# CREATE EMPIRICAL DISTRIBUTION OBJECT FOR CACHING
	#ED = EmpiricalDistribution(data,names)

	

	_iter = 0
	improvement = True

	while improvement:
		improvement = False
		max_delta = 0

		if debug:
			print 'ITERATION: ' , _iter

		### TEST ARC ADDITIONS ###
		for u in bn.nodes():
			for v in bn.nodes():
				if v not in c_dict[u] and u!=v and not would_cause_cycle(c_dict, u, v):
					# FOR MMHC ALGORITHM -> Edge Restrictions
					if restriction is None or (u,v) in restriction:
						# SCORE FOR 'V' -> gaining a parent
						old_cols = (v,) + tuple(p_dict[v]) # without 'u' as parent
						mi_old = mutual_information(data[:,old_cols])
						new_cols = old_cols + (u,) # with'u' as parent
						mi_new = mutual_information(data[:,new_cols])
						delta_score = nrow * (mi_old - mi_new)

						if delta_score > max_delta:
							#if debug:
							#	print 'Improved Arc Addition: ' , (u,v)
							#	print 'Delta Score: ' , delta_score
							max_delta = delta_score
							max_operation = 'Addition'
							max_arc = (u,v)

		### TEST ARC DELETIONS ###
		for u in bn.nodes():
			for v in bn.nodes():
				if v in c_dict[u]:
					# SCORE FOR 'V' -> losing a parent
					old_cols = (v,) + tuple(p_dict[v]) # with 'u' as parent
					mi_old = mutual_information(data[:,old_cols])
					new_cols = tuple([i for i in old_cols if i != u]) # without 'u' as parent
					mi_new = mutual_information(data[:,new_cols])
					delta_score = nrow * (mi_old - mi_new)

					if delta_score > max_delta:
						#if debug:
						#	print 'Improved Arc Deletion: ' , (u,v)
						#	print 'Delta Score: ' , delta_score
						max_delta = delta_score
						max_operation = 'Deletion'
						max_arc = (u,v)

		### TEST ARC REVERSALS ###
		for u in bn.nodes():
			for v in bn.nodes():
				if v in c_dict[u] and not would_cause_cycle(c_dict,v,u, reverse=True):
					# SCORE FOR 'U' -> gaining 'v' as parent
					old_cols = (u,) + tuple(p_dict[v]) # without 'v' as parent
					mi_old = mutual_information(data[:,old_cols])
					new_cols = old_cols + (v,) # with 'v' as parent
					mi_new = mutual_information(data[:,new_cols])
					delta1 = nrow * (mi_old - mi_new)
					# SCORE FOR 'V' -> losing 'u' as parent
					old_cols = (v,) + tuple(p_dict[v]) # with 'u' as parent
					mi_old = mutual_information(data[:,old_cols])
					new_cols = tuple([u for i in old_cols if i != u]) # without 'u' as parent
					mi_new = mutual_information(data[:,new_cols])
					delta2 = nrow * (mi_old - mi_new)
					# COMBINED DELTA-SCORES
					delta_score = delta1 + delta2

					if delta_score > max_delta:
						#if debug:
						#	print 'Improved Arc Reversal: ' , (u,v)
						#	print 'Delta Score: ' , delta_score
						max_delta = delta_score
						max_operation = 'Reversal'
						max_arc = (u,v)


		### DETERMINE IF/WHERE IMPROVEMENT WAS MADE ###
		if max_delta != 0:
			improvement = True
			u,v = max_arc
			if max_operation == 'Addition':
				if debug:
					print 'ADDING: ' , max_arc , '\n'
				c_dict[u].append(v)
				p_dict[v].append(u)
			elif max_operation == 'Deletion':
				if debug:
					print 'DELETING: ' , max_arc , '\n'
				c_dict[u].remove(v)
				p_dict[v].remove(u)
			elif max_operation == 'Reversal':
				if debug:
					print 'REVERSING: ' , max_arc, '\n'
					c_dict[u].remove(v)
					p_dict[v].remove(u)
					c_dict[v].append(u)
					p_dict[u].append(v)
		else:
			if debug:
				print 'No Improvement on Iter: ' , _iter

		### TEST FOR MAX ITERATION ###
		_iter += 1
		if _iter > max_iter:
			if debug:
				print 'Max Iteration Reached'
			break

	
	bn = BayesNet(c_dict)

	return bn
Beispiel #16
0
def tabu(data, k=5, metric='AIC', max_iter=100, debug=False, restriction=None):
	"""
	Tabu search for score-based structure learning.

	Starting from a network with no edges, every iteration
	scores each admissible single-arc move (addition, deletion,
	reversal) and applies the one with the largest strictly
	positive delta score. The search stops when no move improves
	the score or when *max_iter* is exceeded.

	The algorithm maintains a list called "tabu_list" of
	length *k*, which consists of 3-tuples, where the first two
	elements constitute the edge which is tabued, and
	the third element is a string - either 'Addition',
	'Deletion', or 'Reversal' denoting the operation
	that was applied to the edge. A move that would undo
	an operation still on the list is not considered.

	Arguments
	---------
	*data* : a nested numpy array
		The data from which the Bayesian network
		structure will be learned.

	*k* : an integer
		The length of the tabu list, i.e. how many of the
		most recent moves are protected from being undone.

	*metric* : a string
		Which score metric to use.
		Options:
			- AIC
			- BIC / MDL
			- LL (log-likelihood)

	*max_iter* : an integer
		The maximum number of iterations of the
		search to run. Note that the algorithm
		will terminate on its own if no
		improvement is made in a given iteration.

	*debug* : boolean
		Whether to print the scores/moves of the
		algorithm as its happening.

	*restriction* : a list of 2-tuples
		For MMHC algorithm, the list of allowable edge additions.

	Returns
	-------
	*bn* : a BayesNet object
		Built from the learned children dictionary.

	"""
	nrow = data.shape[0]
	ncol = data.shape[1]

	names = range(ncol)

	# INITIALIZE NETWORK W/ NO EDGES
	# maintain children and parents dict for fast lookups
	c_dict = dict([(n,[]) for n in names])
	p_dict = dict([(n,[]) for n in names])

	# COMPUTE INITIAL LIKELIHOOD SCORE
	# NOTE(review): max_score is not read below (moves are ranked by
	# their delta score only); the calls are kept in case they have
	# side effects on `bn` - confirm before removing.
	bn = BayesNet(c_dict)
	mle_estimator(bn, data)
	max_score = info_score(bn, nrow, metric)

	def family_delta(old_cols, new_cols):
		# change in score when one node's family switches from
		# the columns `old_cols` to the columns `new_cols`
		mi_old = mutual_information(data[:,old_cols])
		mi_new = mutual_information(data[:,new_cols])
		return nrow * (mi_old - mi_new)

	tabu_list = [None]*k

	_iter = 0
	improvement = True

	while improvement:
		improvement = False
		max_delta = 0
		max_operation = None
		max_arc = None
		nodes = list(bn.nodes())

		if debug:
			print('ITERATION: ', _iter)

		### TEST ARC ADDITIONS ###
		for u in nodes:
			for v in nodes:
				# CHECK TABU LIST - can't add back an arc whose deletion is on the tabu list
				if (u,v,'Deletion') in tabu_list:
					continue
				# CHECK EDGE EXISTENCE
				if u == v or v in c_dict[u]:
					continue
				# FOR MMHC ALGORITHM -> Edge Restrictions
				if restriction is not None and (u,v) not in restriction:
					continue
				# CHECK CYCLICITY
				if would_cause_cycle(c_dict, u, v):
					continue
				# SCORE FOR 'V' -> gaining a parent
				old_cols = (v,) + tuple(p_dict[v]) # without 'u' as parent
				new_cols = old_cols + (u,) # with 'u' as parent
				delta_score = family_delta(old_cols, new_cols)

				if delta_score > max_delta:
					if debug:
						print('Improved Arc Addition: ', (u,v))
						print('Delta Score: ', delta_score)
					max_delta = delta_score
					max_operation = 'Addition'
					max_arc = (u,v)

		### TEST ARC DELETIONS ###
		for u in nodes:
			for v in nodes:
				# CHECK TABU LIST - can't delete an arc whose addition is on the tabu list
				if (u,v,'Addition') in tabu_list:
					continue
				if v not in c_dict[u]:
					continue
				# SCORE FOR 'V' -> losing a parent
				old_cols = (v,) + tuple(p_dict[v]) # with 'u' as parent
				new_cols = tuple([i for i in old_cols if i != u]) # without 'u' as parent
				delta_score = family_delta(old_cols, new_cols)

				if delta_score > max_delta:
					if debug:
						print('Improved Arc Deletion: ', (u,v))
						print('Delta Score: ', delta_score)
					max_delta = delta_score
					max_operation = 'Deletion'
					max_arc = (u,v)

		### TEST ARC REVERSALS ###
		for u in nodes:
			for v in nodes:
				# CHECK TABU LIST - can't reverse back a reversal on the tabu list.
				# Reversing u->v is recorded as (u,v,'Reversal') and leaves the arc
				# v->u, so undoing it shows up here with the roles swapped.
				if (v,u,'Reversal') in tabu_list:
					continue
				if v not in c_dict[u]:
					continue
				if would_cause_cycle(c_dict,v,u, reverse=True):
					continue
				# SCORE FOR 'U' -> gaining 'v' as parent
				old_cols = (u,) + tuple(p_dict[u]) # without 'v' as parent
				new_cols = old_cols + (v,) # with 'v' as parent
				delta1 = family_delta(old_cols, new_cols)
				# SCORE FOR 'V' -> losing 'u' as parent
				old_cols = (v,) + tuple(p_dict[v]) # with 'u' as parent
				new_cols = tuple([i for i in old_cols if i != u]) # without 'u' as parent
				delta2 = family_delta(old_cols, new_cols)
				# COMBINED DELTA-SCORES
				delta_score = delta1 + delta2

				if delta_score > max_delta:
					if debug:
						print('Improved Arc Reversal: ', (u,v))
						print('Delta Score: ', delta_score)
					max_delta = delta_score
					max_operation = 'Reversal'
					max_arc = (u,v)

		### DETERMINE IF/WHERE IMPROVEMENT WAS MADE ###
		if max_arc is not None:
			improvement = True
			u,v = max_arc
			if max_operation == 'Addition':
				if debug:
					print('ADDING: ', max_arc, '\n')
				c_dict[u].append(v)
				p_dict[v].append(u)
			elif max_operation == 'Deletion':
				if debug:
					print('DELETING: ', max_arc, '\n')
				c_dict[u].remove(v)
				p_dict[v].remove(u)
			elif max_operation == 'Reversal':
				if debug:
					print('REVERSING: ', max_arc, '\n')
				# the structure change must happen whether or not debug is on
				c_dict[u].remove(v)
				p_dict[v].remove(u)
				c_dict[v].append(u)
				p_dict[u].append(v)
			# remember the move in a ring buffer of length k
			# (nothing to remember when the list is empty)
			if tabu_list:
				tabu_list[_iter % len(tabu_list)] = (u,v,max_operation)
		else:
			if debug:
				print('No Improvement on Iter: ', _iter)

		### TEST FOR MAX ITERATION ###
		_iter += 1
		if _iter > max_iter:
			if debug:
				print('Max Iteration Reached')
			break

	bn = BayesNet(c_dict)

	return bn
Beispiel #17
0
def iamb(data, alpha=0.05, feature_selection=None, debug=False):
    """
    IAMB Algorithm for learning the structure of a
    Discrete Bayesian Network from data.

    For every target variable a Markov blanket is grown by
    repeatedly adding the candidate with the highest
    conditional mutual information with the target, as long as
    that candidate is found dependent on the target given the
    current blanket. The blanket is then shrunk by removing
    members found independent of the target given the rest.

    Arguments
    ---------
    *data* : a nested numpy array
        One column per random variable.

    *alpha* : a float
        Threshold handed to the independence test
        (presumably its significance level - confirm
        against `are_independent`).

    *feature_selection* : None or a single column index
        If None, learn the whole network structure.
        Otherwise only the Markov blanket of this one
        variable is learned and returned. Must not be a list.

    *debug* : boolean
        Whether to print the blanket changes and the
        intermediate edge dictionaries.

    Returns
    -------
    *bn* : a BayesNet object (feature_selection is None) or
    *mb* : a list, the markov blanket of the selected node

    Effects
    -------
    None

    Notes
    -----
    Speed Test:
        *** 5 vars, 624 obs ***
            - 196 ms
    """
    n_rv = data.shape[1]
    Mb = dict([(rv, []) for rv in range(n_rv)])

    if feature_selection is None:
        _T = range(n_rv)
    else:
        assert (not isinstance(feature_selection, list)
                ), 'feature_selection must be only one value'
        _T = [feature_selection]

    # LEARN MARKOV BLANKET
    for T in _T:

        V = set(range(n_rv)) - {T}
        Mb_change = True

        # GROWING PHASE
        while Mb_change:
            Mb_change = False
            # find X_max in V-Mb(T)-{T} that maximizes
            # mutual information of X,T|Mb(T)
            # i.e. max of mi_test(data[:,(X,T,Mb(T))])
            max_val = -1
            max_x = None
            for X in V - set(Mb[T]) - {T}:
                cols = (X, T) + tuple(Mb[T])
                mi_val = mi_test(data[:, cols], test=False)
                if mi_val > max_val:
                    max_val = mi_val
                    max_x = X
            if max_x is None:
                # no candidates left outside the blanket
                break
            # add X_max only if it is dependent on T given Mb(T)
            cols = (max_x, T) + tuple(Mb[T])
            if not are_independent(data[:, cols], alpha):
                Mb[T].append(max_x)
                Mb_change = True
                if debug:
                    print('Adding %s to MB of %s' % (str(max_x), str(T)))

        # SHRINKING PHASE
        # iterate over a snapshot: members are removed from Mb[T] in the loop
        for X in list(Mb[T]):
            # if x is independent of t given Mb(T) - {x}
            cols = (X, T) + tuple(set(Mb[T]) - {X})
            if are_independent(data[:, cols], alpha):
                Mb[T].remove(X)
                if debug:
                    print('Removing %s from MB of %s' % (str(X), str(T)))

    if feature_selection is not None:
        # feature-selection mode: only the blanket of the one target
        return Mb[feature_selection]

    # RESOLVE GRAPH STRUCTURE
    edge_dict = resolve_markov_blanket(Mb, data)
    if debug:
        print('Unoriented edge dict:\n %s' % str(edge_dict))
        print('MB: %s' % str(Mb))
    # ORIENT EDGES
    oriented_edge_dict = orient_edges_gs2(edge_dict, Mb, data, alpha)
    if debug:
        print('Oriented edge dict:\n %s' % str(oriented_edge_dict))

    # CREATE BAYESNET OBJECT
    value_dict = dict(
        zip(range(data.shape[1]),
            [list(np.unique(col)) for col in data.T]))
    bn = BayesNet(oriented_edge_dict, value_dict)

    return bn
Beispiel #18
0
 def setUp(self):
     """Create an empty network and load the two sample networks from the data folder."""
     # The data folder sits next to the directory four levels above this file.
     base_dir = __file__
     for _ in range(4):
         base_dir = dirname(base_dir)
     data_dir = os.path.join(base_dir, 'data')

     self.bn = BayesNet()
     self.dpath = data_dir
     self.bn_bif = read_bn(os.path.join(data_dir, 'cancer.bif'))
     self.bn_bn = read_bn(os.path.join(data_dir, 'cmu.bn'))
Beispiel #19
0
def hc(data, metric='AIC', max_iter=100, debug=False, restriction=None):
	"""
	Greedy Hill Climbing search proceeds by choosing the move
	which maximizes the increase in fitness of the
	network at the current step. It continues until
	it reaches a point where there does not exist any
	feasible single move that increases the network fitness.

	It is called "greedy" because it simply does what is
	best at the current iteration only, and thus does not
	look ahead to what may be better later on in the search.

	For computational saving, a Priority Queue (python's heapq)
	can be used	to maintain the best operators and reduce the
	complexity of picking the best operator from O(n^2) to O(nlogn).
	This works by maintaining the heapq of operators sorted by their
	delta score, and each time a move is made, we only have to recompute
	the O(n) delta-scores which were affected by the move. The rest of
	the operator delta-scores are not affected.
	(This implementation does not do so: it rescores every move
	in every iteration.)

	For additional computational efficiency, we can cache the
	sufficient statistics for various families of distributions - 
	therefore, computing the mutual information for a given family
	only needs to happen once.

	The possible moves are the following:
		- add edge
		- delete edge
		- invert edge

	Arguments
	---------
	*data* : a nested numpy array
		The data from which the Bayesian network
		structure will be learned.

	*metric* : a string
		Which score metric to use.
		Options:
			- AIC
			- BIC / MDL
			- LL (log-likelihood)

	*max_iter* : an integer
		The maximum number of iterations of the
		hill-climbing algorithm to run. Note that
		the algorithm will terminate on its own if no
		improvement is made in a given iteration.

	*debug* : boolean
		Whether to print the moves of the
		algorithm as its happening.

	*restriction* : a list of 2-tuples
		For MMHC algorithm, the list of allowable edge additions.

	Returns
	-------
	*c_dict* : a dictionary
		The learned structure, where key = node and
		value = list of that node's children.

	"""
	nrow = data.shape[0]
	ncol = data.shape[1]

	names = range(ncol)

	# INITIALIZE NETWORK W/ NO EDGES
	# maintain children and parents dict for fast lookups
	c_dict = dict([(n, []) for n in names])
	p_dict = dict([(n, []) for n in names])

	# COMPUTE INITIAL LIKELIHOOD SCORE
	# NOTE(review): max_score is not read below (moves are ranked by
	# their delta score only); the calls are kept in case they have
	# side effects on `bn` - confirm before removing.
	bn = BayesNet(c_dict)
	mle_estimator(bn, data)
	max_score = info_score(bn, nrow, metric)

	def family_delta(old_cols, new_cols):
		# change in score when one node's family switches from
		# the columns `old_cols` to the columns `new_cols`
		mi_old = mutual_information(data[:, old_cols])
		mi_new = mutual_information(data[:, new_cols])
		return nrow * (mi_old - mi_new)

	_iter = 0
	improvement = True

	while improvement:
		improvement = False
		max_delta = 0
		max_operation = None
		max_arc = None
		nodes = list(bn.nodes())

		if debug:
			print('ITERATION: ', _iter)

		### TEST ARC ADDITIONS ###
		for u in nodes:
			for v in nodes:
				# CHECK EDGE EXISTENCE
				if u == v or v in c_dict[u]:
					continue
				# FOR MMHC ALGORITHM -> Edge Restrictions
				if restriction is not None and (u, v) not in restriction:
					continue
				# CHECK CYCLICITY
				if would_cause_cycle(c_dict, u, v):
					continue
				# SCORE FOR 'V' -> gaining a parent
				old_cols = (v,) + tuple(p_dict[v])  # without 'u' as parent
				new_cols = old_cols + (u,)  # with 'u' as parent
				delta_score = family_delta(old_cols, new_cols)

				if delta_score > max_delta:
					max_delta = delta_score
					max_operation = 'Addition'
					max_arc = (u,v)

		### TEST ARC DELETIONS ###
		for u in nodes:
			for v in nodes:
				if v not in c_dict[u]:
					continue
				# SCORE FOR 'V' -> losing a parent
				old_cols = (v,) + tuple(p_dict[v])  # with 'u' as parent
				new_cols = tuple([i for i in old_cols if i != u])  # without 'u' as parent
				delta_score = family_delta(old_cols, new_cols)

				if delta_score > max_delta:
					max_delta = delta_score
					max_operation = 'Deletion'
					max_arc = (u,v)

		### TEST ARC REVERSALS ###
		for u in nodes:
			for v in nodes:
				if v not in c_dict[u]:
					continue
				if would_cause_cycle(c_dict,v,u, reverse=True):
					continue
				# SCORE FOR 'U' -> gaining 'v' as parent
				old_cols = (u,) + tuple(p_dict[u])  # without 'v' as parent
				new_cols = old_cols + (v,)  # with 'v' as parent
				delta1 = family_delta(old_cols, new_cols)
				# SCORE FOR 'V' -> losing 'u' as parent
				old_cols = (v,) + tuple(p_dict[v])  # with 'u' as parent
				new_cols = tuple([i for i in old_cols if i != u])  # without 'u' as parent
				delta2 = family_delta(old_cols, new_cols)
				# COMBINED DELTA-SCORES
				delta_score = delta1 + delta2

				if delta_score > max_delta:
					max_delta = delta_score
					max_operation = 'Reversal'
					max_arc = (u,v)

		### DETERMINE IF/WHERE IMPROVEMENT WAS MADE ###
		if max_arc is not None:
			improvement = True
			u,v = max_arc
			if max_operation == 'Addition':
				if debug:
					print('ADDING: ' , max_arc , '\n')
				c_dict[u].append(v)
				p_dict[v].append(u)
			elif max_operation == 'Deletion':
				if debug:
					print('DELETING: ' , max_arc , '\n')
				c_dict[u].remove(v)
				p_dict[v].remove(u)
			elif max_operation == 'Reversal':
				if debug:
					print('REVERSING: ' , max_arc, '\n')
				# the structure change must happen whether or not debug is on
				c_dict[u].remove(v)
				p_dict[v].remove(u)
				c_dict[v].append(u)
				p_dict[u].append(v)
		else:
			if debug:
				print('No Improvement on Iter: ' , _iter)

		### TEST FOR MAX ITERATION ###
		_iter += 1
		if _iter > max_iter:
			if debug:
				print('Max Iteration Reached')
			break

	return c_dict
Beispiel #20
0
def read_json(path):
    """
    Read a BayesNet object from the json format. This
    format has the ".bn" extension and is completely
    unique to pyBN.

    Arguments
    ---------
    *path* : a string
        The file path

    Returns
    -------
    *bn* : a BayesNet object
        Its E and F attributes are taken from the file's
        top-level "E" and "F" entries; V is the result of
        `topsort` applied to E.

    Effects
    -------
    - Prints a message if the file content raises a
      ValueError while being parsed; the network attributes
      are then left as `BayesNet()` created them before the
      final `topsort` call.

    Notes
    -----

    The parser reads exactly three top-level keys: "V", "E" and "F".
    NOTE(review): the libpgm-style sample below shows a "Vdata"
    key instead of "F" - confirm which layout real files use.

    File Format:
        {
            "V": ["Letter", "Grade", "Intelligence", "SAT", "Difficulty"],
            "E": [["Intelligence", "Grade"],
                ["Difficulty", "Grade"],
                ["Intelligence", "SAT"],
                ["Grade", "Letter"]],
            "Vdata": {
                "Letter": {
                    "ord": 4,
                    "numoutcomes": 2,
                    "vals": ["weak", "strong"],
                    "parents": ["Grade"],
                    "children": None,
                    "cprob": [[.1, .9],[.4, .6],[.99, .01]]
                },
                ...
        }


    """
    # json.loads produces unicode text. Only where `str` is the byte
    # string type (Python 2) does that differ from the native string,
    # so only there is the text re-encoded.
    encode_text = str is bytes

    def byteify(obj):
        # recursively convert text inside nested dicts/lists to native str
        if isinstance(obj, dict):
            return {
                byteify(key): byteify(value)
                for key, value in obj.items()
            }
        if isinstance(obj, list):
            return [byteify(element) for element in obj]
        if encode_text and isinstance(obj, type(u'')):
            return obj.encode('utf-8')
        return obj

    bn = BayesNet()

    # context manager so the handle is closed even if reading fails
    with open(path, 'r') as f:
        ftxt = f.read()

    try:
        data = byteify(json.loads(ftxt))
        bn.V = data['V']
        bn.E = data['E']
        bn.F = data['F']
    except ValueError:
        print("Could not read file - check format")
    # the vertex list from the file is replaced by a topological order
    bn.V = topsort(bn.E)

    return bn
Beispiel #21
0
def read_bif(path):
    """
    This function reads a .bif file into a
    BayesNet object. It is a line-oriented reader
    rather than a full parser: it reacts to lines
    containing the words 'variable' and 'probability'
    and relies on whitespace-separated tokens.

    Arguments
    ---------
    *path* : a string
        The path

    Returns
    -------
    *bn* : a BayesNet object

    Effects
    -------
    None

    Notes
    -----
    *V* : a list of strings
    *E* : a dict, where key = vertex, val = list of its children
    *F* : a dict, where key = rv, val = another dict with
                keys = 'parents', 'values', cpt'

    """
    _parents = {} # key = vertex, value = list of its parents
    _cpt = {} # key = vertex, value = list of rows of probabilities
    _vals = {} # key = vertex, val = list of its possible values

    with open(path, 'r') as f:
        while True:
            line = f.readline()
            if line == '':
                # end of file
                break
            if 'variable' in line:
                new_vertex = line.split()[1]

                _parents[new_vertex] = []
                _cpt[new_vertex] = []

                # the values are listed on the line after the declaration;
                # the first six tokens and the last one are skipped
                new_line = f.readline()
                _vals[new_vertex] = new_line.replace(',', ' ').split()[6:-1]
            elif 'probability' in line:
                tokens = line.replace(',', ' ').split()
                child_rv = tokens[2]
                parent_rvs = tokens[4:-2]

                if len(parent_rvs) == 0: # prior
                    new_line = f.readline().replace(';', ' ').replace(',',' ').split()
                    # first token is skipped, the rest are the probabilities
                    _cpt[child_rv].append([float(p) for p in new_line[1:]])
                else: # not a prior
                    _parents[child_rv].extend(parent_rvs)
                    # every row ends with one probability per value of the
                    # child variable (not of the last variable declared)
                    n_outcomes = len(_vals[child_rv])
                    while True:
                        new_line = f.readline()
                        # stop at the closing brace, or at end of file so a
                        # truncated file cannot loop forever
                        if new_line == '' or '}' in new_line:
                            break
                        row = new_line.replace(',',' ').replace(';',' ').replace('(', ' ').replace(')', ' ').split()
                        _cpt[child_rv].append([float(p) for p in row[-n_outcomes:]])

    # CREATE FACTORS
    _F = {}
    _E = {}
    for rv in _vals.keys():
        # children of rv = every vertex that lists rv among its parents
        _E[rv] = [c for c in _vals.keys() if rv in _parents[c]]
        _F[rv] = {
            'parents' : _parents[rv],
            'values' : _vals[rv],
            # rows are flattened into one list
            'cpt' : [item for sublist in _cpt[rv] for item in sublist]
        }

    bn = BayesNet()
    bn.F = _F
    bn.E = _E
    bn.V = list(topsort(_E))

    return bn
Beispiel #22
0
def train_model(data: np.ndarray,
                clusters: int = 5,
                init_nodes: list = None) -> BayesianNetwork:
    """
    Fit a BayesianNetwork on *data* extended by one extra (hidden) column.

    Arguments
    ---------
    *data* : a numpy array
        The observed data, one row per sample.

    *clusters* : an integer
        Number of KMeans clusters used to initialise the hidden column.

    *init_nodes* : a list or None
        Forwarded unchanged to `hc_rr`.

    Returns
    -------
    *bn* : a BayesianNetwork object
        Built from the structure found by `hc_rr`, then fitted on the
        data with the hidden column set to NaN.
    """
    n_samples = data.shape[0]

    # instance is replaced by the hc_rr result below
    structure_net = BayesNet()

    # Cluster the observed data, then draw the hidden column from the
    # distribution of the cluster labels
    cluster_labels = KMeans(n_clusters=clusters, random_state=0).fit(data).labels_
    label_dist = DiscreteDistribution.from_samples(cluster_labels)
    sampled_hidden = np.array(label_dist.sample(n_samples))
    augmented = np.column_stack((data, sampled_hidden))
    latent_idx = augmented.shape[1] - 1  # hidden variable is the last column

    # Learn the structure on the augmented data
    structure_net = hc_rr(augmented, latent=latent_idx, init_nodes=init_nodes)
    parent_sets = tuple(
        tuple(structure_net.F[node]['parents'])
        for node in sorted(structure_net.nodes())
    )
    model = BayesianNetwork.from_structure(augmented, parent_sets)
    model.bake()

    # Refit with the hidden column unknown (all NaN)
    masked = np.column_stack((data, np.full(n_samples, np.nan)))
    model.predict(masked)
    model.fit(masked)
    model.bake()
    return model