Example #1
def hc_rr(data,
          M=5,
          R=3,
          metric='AIC',
          max_iter=100,
          debug=False,
          restriction=None):
    """
	Arguments
	---------
	*data* : a nested numpy array
		The data from which the Bayesian network
		structure will be learned.

	*M* : an integer
		The number of random moves to make during
		each random restart.

	*R* : an integer
		The maximum number of random restarts to
		perform when the greedy search stalls.

	*metric* : a string
		Which score metric to use.
		Options:
			- AIC
			- BIC / MDL
			- LL (log-likelihood)

	*max_iter* : an integer
		The maximum number of iterations of the
		hill-climbing algorithm to run. Note that
		the algorithm will terminate on its own if no
		improvement is made in a given iteration.

	*debug* : boolean
		Whether to print the scores/moves of the
		algorithm as it runs.

	*restriction* : a list of 2-tuples
		For MMHC algorithm, the list of allowable edge additions.

	Returns
	-------
	*bn* : a BayesNet object
	"""
    nrow = data.shape[0]
    ncol = data.shape[1]

    names = range(ncol)

    # INITIALIZE NETWORK W/ NO EDGES
    # maintain children and parents dict for fast lookups
    c_dict = {n: [] for n in names}
    p_dict = {n: [] for n in names}

    # COMPUTE INITIAL LIKELIHOOD SCORE
    value_dict = {n: np.unique(data[:, i]) for i, n in enumerate(names)}
    bn = BayesNet(c_dict)
    mle_estimator(bn, data)
    max_score = info_score(bn, nrow, metric)
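    # NOTE: AIC/BIC/LL are decomposable scores (they sum over each node's
    # family), so each candidate move below is evaluated by a local delta
    # score rather than rescoring the entire network.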

    _iter = 0
    improvement = True
    _restarts = 0

    while improvement:
        improvement = False
        max_delta = 0

        if debug:
            print('ITERATION: ', _iter)

        ### TEST ARC ADDITIONS ###
        for u in bn.nodes():
            for v in bn.nodes():
                if v not in c_dict[u] and u != v and not would_cause_cycle(
                        c_dict, u, v):
                    # FOR MMHC ALGORITHM -> Edge Restrictions
                    if restriction is None or (u, v) in restriction:
                        # SCORE FOR 'V' -> gaining a parent
                        old_cols = (v, ) + tuple(
                            p_dict[v])  # without 'u' as parent
                        mi_old = mutual_information(data[:, old_cols])
                        new_cols = old_cols + (u, )  # with 'u' as parent
                        mi_new = mutual_information(data[:, new_cols])
                        delta_score = nrow * (mi_old - mi_new)

                        if delta_score > max_delta:
                            if debug:
                                print('Improved Arc Addition: ', (u, v))
                                print('Delta Score: ', delta_score)
                            max_delta = delta_score
                            max_operation = 'Addition'
                            max_arc = (u, v)

        ### TEST ARC DELETIONS ###
        for u in bn.nodes():
            for v in bn.nodes():
                if v in c_dict[u]:
                    # SCORE FOR 'V' -> losing a parent
                    old_cols = (v, ) + tuple(p_dict[v])  # with 'u' as parent
                    mi_old = mutual_information(data[:, old_cols])
                    new_cols = tuple([i for i in old_cols
                                      if i != u])  # without 'u' as parent
                    mi_new = mutual_information(data[:, new_cols])
                    delta_score = nrow * (mi_old - mi_new)

                    if delta_score > max_delta:
                        if debug:
                            print('Improved Arc Deletion: ', (u, v))
                            print('Delta Score: ', delta_score)
                        max_delta = delta_score
                        max_operation = 'Deletion'
                        max_arc = (u, v)

        ### TEST ARC REVERSALS ###
        for u in bn.nodes():
            for v in bn.nodes():
                if v in c_dict[u] and not would_cause_cycle(
                        c_dict, v, u, reverse=True):
                    # SCORE FOR 'U' -> gaining 'v' as parent
                    old_cols = (u, ) + tuple(
                        p_dict[u])  # parents of 'u', without 'v'
                    mi_old = mutual_information(data[:, old_cols])
                    new_cols = old_cols + (v, )  # with 'v' as parent
                    mi_new = mutual_information(data[:, new_cols])
                    delta1 = nrow * (mi_old - mi_new)
                    # SCORE FOR 'V' -> losing 'u' as parent
                    old_cols = (v, ) + tuple(p_dict[v])  # with 'u' as parent
                    mi_old = mutual_information(data[:, old_cols])
                    new_cols = tuple([i for i in old_cols
                                      if i != u])  # without 'u' as parent
                    mi_new = mutual_information(data[:, new_cols])
                    delta2 = nrow * (mi_old - mi_new)
                    # COMBINED DELTA-SCORES
                    delta_score = delta1 + delta2

                    if delta_score > max_delta:
                        if debug:
                            print('Improved Arc Reversal: ', (u, v))
                            print('Delta Score: ', delta_score)
                        max_delta = delta_score
                        max_operation = 'Reversal'
                        max_arc = (u, v)

        ### DETERMINE IF/WHERE IMPROVEMENT WAS MADE ###
        if max_delta != 0:
            improvement = True
            u, v = max_arc
            if max_operation == 'Addition':
                if debug:
                    print('ADDING: ', max_arc, '\n')
                c_dict[u].append(v)
                p_dict[v].append(u)
            elif max_operation == 'Deletion':
                if debug:
                    print('DELETING: ', max_arc, '\n')
                c_dict[u].remove(v)
                p_dict[v].remove(u)
            elif max_operation == 'Reversal':
                if debug:
                    print('REVERSING: ', max_arc, '\n')
                c_dict[u].remove(v)
                p_dict[v].remove(u)
                c_dict[v].append(u)
                p_dict[u].append(v)
        else:
            if debug:
                print('No Improvement on Iter: ', _iter)
            #### RESTART WITH RANDOM MOVES ####
            if _restarts < R:
                improvement = True  # make another pass of hill climbing
                _iter = 0  # reset iterations
                if debug:
                    print('Restart - ', _restarts)
                _restarts += 1
                for _ in range(M):
                    # 0 = Addition, 1 = Deletion, 2 = Reversal
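                    # NOTE: each branch below retries until it samples a legal
                    # move; deletion and reversal therefore assume the graph
                    # currently has at least one edge.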
                    operation = np.random.choice([0, 1, 2])
                    if operation == 0:
                        while True:
                            u, v = np.random.choice(list(bn.nodes()),
                                                    size=2,
                                                    replace=False)
                            # IF EDGE DOESN'T EXIST, ADD IT
                            if u not in p_dict[
                                    v] and u != v and not would_cause_cycle(
                                        c_dict, u, v):
                                if debug:
                                    print('RESTART - ADDING: ', (u, v))
                                c_dict[u].append(v)
                                p_dict[v].append(u)
                                break
                    elif operation == 1:
                        while True:
                            u, v = np.random.choice(list(bn.nodes()),
                                                    size=2,
                                                    replace=False)
                            # IF EDGE EXISTS, DELETE IT
                            if u in p_dict[v]:
                                if debug:
                                    print('RESTART - DELETING: ', (u, v))
                                c_dict[u].remove(v)
                                p_dict[v].remove(u)
                                break
                    elif operation == 2:
                        while True:
                            u, v = np.random.choice(list(bn.nodes()),
                                                    size=2,
                                                    replace=False)
                            # IF EDGE EXISTS, REVERSE IT
                            if u in p_dict[v] and not would_cause_cycle(
                                    c_dict, v, u, reverse=True):
                                if debug:
                                    print('RESTART - REVERSING: ', (u, v))
                                c_dict[u].remove(v)
                                p_dict[v].remove(u)
                                c_dict[v].append(u)
                                p_dict[u].append(v)
                                break

        ### TEST FOR MAX ITERATION ###
        _iter += 1
        if _iter > max_iter:
            if debug:
                print('Max Iteration Reached')
            break

    bn = BayesNet(c_dict)

    return bn
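
A minimal usage sketch (not from the original source), assuming the hc_rr function above and its BayesNet backend are in scope; the synthetic dataset and the parameter values are illustrative only.

import numpy as np

# synthetic discrete data: 1000 observations of 4 variables with 3 states each
rng = np.random.default_rng(0)
data = rng.integers(0, 3, size=(1000, 4))

# five random moves per restart, at most three restarts
bn = hc_rr(data, M=5, R=3, metric='BIC', max_iter=100)
print(bn.nodes())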
Example #2
def iamb(data, alpha=0.05, feature_selection=None, debug=False):
    """
	IAMB Algorithm for learning the structure of a
	Discrete Bayesian Network from data.

	Arguments
	---------
	*data* : a nested numpy array

	*alpha* : a float
		The type I error rate (significance level)
		for the conditional independence tests.

	*feature_selection* : None or an integer
		If None, learn the full network structure;
		otherwise, the index of the single variable
		whose Markov blanket will be returned.

	Returns
	-------
	*bn* : a BayesNet object, or
	*mb* : the Markov blanket of a node

	Effects
	-------
	None

	Notes
	-----
	- Works but there are definitely some bugs.

	Speed Test:
		*** 5 vars, 624 obs ***
			- 196 ms
	"""
    n_rv = data.shape[1]
    Mb = {rv: [] for rv in range(n_rv)}

    if feature_selection is None:
        _T = range(n_rv)
    else:
        assert (not isinstance(feature_selection, list)
                ), 'feature_selection must be only one value'
        _T = [feature_selection]

    # LEARN MARKOV BLANKET
    for T in _T:

        V = set(range(n_rv)) - {T}
        Mb_change = True

        # GROWING PHASE
        while Mb_change:
            Mb_change = False
            # find X_max in V-Mb(T)-{T} that maximizes
            # mutual information of X,T|Mb(T)
            # i.e. max of mi_test(data[:,(X,T,Mb(T))])
            max_val = -1
            max_x = None
            for X in V - set(Mb[T]) - {T}:
                cols = (X, T) + tuple(Mb[T])
                mi_val = mi_test(data[:, cols], test=False)
                if mi_val > max_val:
                    max_val = mi_val
                    max_x = X
            # if max_x is dependent on T given Mb(T), add it to the blanket
            if max_x is not None:
                cols = (max_x, T) + tuple(Mb[T])
                if not are_independent(data[:, cols], alpha):
                    Mb[T].append(max_x)
                    Mb_change = True
                    if debug:
                        print('Adding %s to MB of %s' % (str(max_x), str(T)))

        # SHRINKING PHASE
        for X in list(Mb[T]):  # iterate over a copy; Mb[T] is mutated below
            # if X is independent of T given Mb(T) - {X}, drop it
            cols = (X, T) + tuple(set(Mb[T]) - {X})
            if are_independent(data[:, cols], alpha):
                Mb[T].remove(X)
                if debug:
                    print('Removing %s from MB of %s' % (str(X), str(T)))

    if feature_selection is None:
        # RESOLVE GRAPH STRUCTURE
        edge_dict = resolve_markov_blanket(Mb, data)
        if debug:
            print('Unoriented edge dict:\n %s' % str(edge_dict))
            print('MB: %s' % str(Mb))
        # ORIENT EDGES
        oriented_edge_dict = orient_edges_gs2(edge_dict, Mb, data, alpha)
        if debug:
            print('Oriented edge dict:\n %s' % str(oriented_edge_dict))

        # CREATE BAYESNET OBJECT
        value_dict = dict(
            zip(range(data.shape[1]),
                [list(np.unique(col)) for col in data.T]))
        bn = BayesNet(oriented_edge_dict, value_dict)

        return bn
    else:
        return Mb[_T[0]]
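
A minimal usage sketch (not from the original source) showing both modes, assuming the iamb function above and its independence-test helpers are in scope; the binary dataset mirrors the 5-variable, 624-observation shape quoted in the docstring's speed test.

import numpy as np

rng = np.random.default_rng(1)
data = rng.integers(0, 2, size=(624, 5))

# structure learning: returns a BayesNet object
bn = iamb(data, alpha=0.05)

# feature selection: returns the Markov blanket of variable 2
mb = iamb(data, alpha=0.05, feature_selection=2)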
Example #3
	def setUp(self):
		self.bn = BayesNet()
		self.dpath = os.path.join(dirname(dirname(dirname(dirname(__file__)))), 'data')
		self.bn_bif = read_bn(os.path.join(self.dpath, 'cancer.bif'))
		self.bn_bn = read_bn(os.path.join(self.dpath, 'cmu.bn'))
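
A sketch of how this setUp might be exercised as a complete unittest.TestCase (the class and test names are hypothetical; it assumes BayesNet and read_bn are importable as in the snippet above):

import os
import unittest
from os.path import dirname

class ReadBNTestCase(unittest.TestCase):
	def setUp(self):
		self.bn = BayesNet()
		self.dpath = os.path.join(dirname(dirname(dirname(dirname(__file__)))), 'data')
		self.bn_bif = read_bn(os.path.join(self.dpath, 'cancer.bif'))
		self.bn_bn = read_bn(os.path.join(self.dpath, 'cmu.bn'))

	def test_read_bif(self):
		self.assertIsInstance(self.bn_bif, BayesNet)

	def test_read_bn(self):
		self.assertIsInstance(self.bn_bn, BayesNet)

if __name__ == '__main__':
	unittest.main()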
Example #4
def hc(data, metric='AIC', max_iter=100, debug=False, restriction=None):
    """
	Greedy Hill Climbing search proceeds by choosing the move
	which maximizes the increase in fitness of the
	network at the current step. It continues until
	it reaches a point where there does not exist any
	feasible single move that increases the network fitness.

	It is called "greedy" because it simply does what is
	best at the current iteration only, and thus does not
	look ahead to what may be better later on in the search.

	For computational savings, a priority queue (Python's heapq)
	can be used to maintain the best operators and reduce the
	complexity of picking the best operator from O(n^2) to O(nlogn).
	This works by keeping the operators in a heap sorted by their
	delta score; each time a move is made, only the O(n) delta scores
	affected by the move have to be recomputed, while the rest remain
	valid. A minimal sketch of this queue follows the example below.

	For additional computational efficiency, we can cache the
	sufficient statistics for various families of distributions - 
	therefore, computing the mutual information for a given family
	only needs to happen once.

	The possible moves are the following:
		- add edge
		- delete edge
		- invert edge

	Arguments
	---------
	*data* : a nested numpy array
		The data from which the Bayesian network
		structure will be learned.

	*metric* : a string
		Which score metric to use.
		Options:
			- AIC
			- BIC / MDL
			- LL (log-likelihood)

	*max_iter* : an integer
		The maximum number of iterations of the
		hill-climbing algorithm to run. Note that
		the algorithm will terminate on its own if no
		improvement is made in a given iteration.

	*debug* : boolean
		Whether to print the scores/moves of the
		algorithm as it runs.

	*restriction* : a list of 2-tuples
		For MMHC algorithm, the list of allowable edge additions.

	Returns
	-------
	*bn* : a BayesNet object

	"""
    nrow = data.shape[0]
    ncol = data.shape[1]

    names = range(ncol)

    # INITIALIZE NETWORK W/ NO EDGES
    # maintain children and parents dict for fast lookups
    c_dict = {n: [] for n in names}
    p_dict = {n: [] for n in names}

    # COMPUTE INITIAL LIKELIHOOD SCORE
    value_dict = {n: np.unique(data[:, i]) for i, n in enumerate(names)}
    bn = BayesNet(c_dict)
    mle_estimator(bn, data)
    max_score = info_score(bn, nrow, metric)

    # CREATE EMPIRICAL DISTRIBUTION OBJECT FOR CACHING
    #ED = EmpiricalDistribution(data,names)

    _iter = 0
    improvement = True

    while improvement:
        improvement = False
        max_delta = 0

        if debug:
            print('ITERATION: ', _iter)

        ### TEST ARC ADDITIONS ###
        for u in bn.nodes():
            for v in bn.nodes():
                if v not in c_dict[u] and u != v and not would_cause_cycle(
                        c_dict, u, v):
                    # FOR MMHC ALGORITHM -> Edge Restrictions
                    if restriction is None or (u, v) in restriction:
                        # SCORE FOR 'V' -> gaining a parent
                        old_cols = (v, ) + tuple(
                            p_dict[v])  # without 'u' as parent
                        mi_old = mutual_information(data[:, old_cols])
                        new_cols = old_cols + (u, )  # with 'u' as parent
                        mi_new = mutual_information(data[:, new_cols])
                        delta_score = nrow * (mi_old - mi_new)

                        if delta_score > max_delta:
                            if debug:
                                print('Improved Arc Addition: ', (u, v))
                                print('Delta Score: ', delta_score)
                            max_delta = delta_score
                            max_operation = 'Addition'
                            max_arc = (u, v)

        ### TEST ARC DELETIONS ###
        for u in bn.nodes():
            for v in bn.nodes():
                if v in c_dict[u]:
                    # SCORE FOR 'V' -> losing a parent
                    old_cols = (v, ) + tuple(p_dict[v])  # with 'u' as parent
                    mi_old = mutual_information(data[:, old_cols])
                    new_cols = tuple([i for i in old_cols
                                      if i != u])  # without 'u' as parent
                    mi_new = mutual_information(data[:, new_cols])
                    delta_score = nrow * (mi_old - mi_new)

                    if delta_score > max_delta:
                        if debug:
                            print('Improved Arc Deletion: ', (u, v))
                            print('Delta Score: ', delta_score)
                        max_delta = delta_score
                        max_operation = 'Deletion'
                        max_arc = (u, v)

        ### TEST ARC REVERSALS ###
        for u in bn.nodes():
            for v in bn.nodes():
                if v in c_dict[u] and not would_cause_cycle(
                        c_dict, v, u, reverse=True):
                    # SCORE FOR 'U' -> gaining 'v' as parent
                    old_cols = (u, ) + tuple(
                        p_dict[u])  # parents of 'u', without 'v'
                    mi_old = mutual_information(data[:, old_cols])
                    new_cols = old_cols + (v, )  # with 'v' as parent
                    mi_new = mutual_information(data[:, new_cols])
                    delta1 = nrow * (mi_old - mi_new)
                    # SCORE FOR 'V' -> losing 'u' as parent
                    old_cols = (v, ) + tuple(p_dict[v])  # with 'u' as parent
                    mi_old = mutual_information(data[:, old_cols])
                    new_cols = tuple([i for i in old_cols
                                      if i != u])  # without 'u' as parent
                    mi_new = mutual_information(data[:, new_cols])
                    delta2 = nrow * (mi_old - mi_new)
                    # COMBINED DELTA-SCORES
                    delta_score = delta1 + delta2

                    if delta_score > max_delta:
                        if debug:
                            print('Improved Arc Reversal: ', (u, v))
                            print('Delta Score: ', delta_score)
                        max_delta = delta_score
                        max_operation = 'Reversal'
                        max_arc = (u, v)

        ### DETERMINE IF/WHERE IMPROVEMENT WAS MADE ###
        if max_delta != 0:
            improvement = True
            u, v = max_arc
            if max_operation == 'Addition':
                if debug:
                    print('ADDING: ', max_arc, '\n')
                c_dict[u].append(v)
                p_dict[v].append(u)
            elif max_operation == 'Deletion':
                if debug:
                    print('DELETING: ', max_arc, '\n')
                c_dict[u].remove(v)
                p_dict[v].remove(u)
            elif max_operation == 'Reversal':
                if debug:
                    print('REVERSING: ', max_arc, '\n')
                c_dict[u].remove(v)
                p_dict[v].remove(u)
                c_dict[v].append(u)
                p_dict[u].append(v)
        else:
            if debug:
                print('No Improvement on Iter: ', _iter)

        ### TEST FOR MAX ITERATION ###
        _iter += 1
        if _iter > max_iter:
            if debug:
                print('Max Iteration Reached')
            break

    bn = BayesNet(c_dict)

    return bn
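
The docstring above proposes replacing the exhaustive operator scan with a priority queue. The following is a minimal sketch of that idea (an illustration, not code from the library): operators are pushed keyed on their negated delta score so the best move pops first, and after a move is applied only the operators touching the changed family would need to be rescored and re-pushed.

import heapq

heap = []

def push_operator(delta, operation, arc):
    # heapq is a min-heap, so negate the delta to pop the largest first
    heapq.heappush(heap, (-delta, operation, arc))

def pop_best_operator():
    neg_delta, operation, arc = heapq.heappop(heap)
    return -neg_delta, operation, arc

push_operator(12.4, 'Addition', (0, 2))
push_operator(3.1, 'Deletion', (1, 2))
print(pop_best_operator())  # -> (12.4, 'Addition', (0, 2))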