def naive_bayes(data, target, estimator='mle'):
    """
	Learn naive bayes model from data.

	The naive Bayes model is a tree-based
	model in which all random variables have
	the same parent (the "target" variable).
	From a probabilistic standpoint, this model
	implies that every random variable
	(i.e. feature) is conditionally independent
	of every other feature, given the single
	parent (target) variable.

	It turns out that this model performs quite well
	as a classifier, and can be used as such. Moreover,
	it is computationally fast and simple to learn.

	Note that this function not only learns the structure,
	but ALSO learns the parameters.

	Arguments
	---------
	*data* : a nested numpy array

	*target* : an integer
		The target variable column in *data*

	*estimator* : a string, either 'mle' or 'bayes'
		Which parameter estimator to use once
		the structure has been built.

	Returns
	-------
	*bn* : a BayesNet object,
		with the structure and parameters instantiated.

	Effects
	-------
	None

	Notes
	-----

	"""
    value_dict = dict(
        zip(range(data.shape[1]), [list(np.unique(col)) for col in data.T]))

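    # the target variable is made the sole parent of every other variable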
    edge_dict = {target: [v for v in value_dict if v != target]}
    edge_dict.update(dict([(rv, []) for rv in value_dict if rv != target]))

    bn = BayesNet(edge_dict, value_dict)
    if estimator == 'bayes':
        bayes_estimator(bn, data)
    else:
        mle_estimator(bn, data)
    return bn
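
# A hedged usage sketch for naive_bayes, assuming its module context (numpy
# as np, BayesNet, mle_estimator, bayes_estimator in scope). Columns must
# hold discrete codes, since the estimators work from value counts:
def _demo_naive_bayes():
    rng = np.random.RandomState(0)
    y = rng.randint(0, 2, size=200)             # target variable (column 0)
    f1 = (y + rng.randint(0, 2, size=200)) % 2  # feature correlated with y
    f2 = rng.randint(0, 3, size=200)            # unrelated discrete feature
    data = np.column_stack([y, f1, f2])
    # learns the star-shaped structure 0 -> 1, 0 -> 2 plus its parameters
    return naive_bayes(data, target=0, estimator='mle')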
def K2(bn, data, ed=None):
    """
	K2 is bayesian posterior probability of structure given the data,
	where N'ijk = 1.
	"""
    counts_dict = mle_estimator(bn, data)
    k2 = 1.0
    for rv, value in counts_dict.items():
        nijk = np.asarray(value['cpt'], dtype=float)  # counts N_ijk
        nijk_prime = 1.0  # the K2 prior: N'_ijk = 1
        # prod_k Gamma(N_ijk + N'_ijk) / Gamma(N'_ijk), taken over all cells
        k2 *= np.prod(gamma(nijk + nijk_prime) / gamma(nijk_prime))
        # N'_ij = sum_k N'_ijk = card(rv); N_ij sums the counts over rv's
        # values for each parent configuration (one row per configuration)
        nij_prime = nijk_prime * bn.card(rv)
        nij = np.sum(nijk.reshape(-1, bn.card(rv)), axis=1)
        k2 *= np.prod(gamma(nij_prime) / gamma(nij + nij_prime))

    return k2
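
# The Gamma products in K2 above underflow/overflow for all but tiny
# datasets, so in practice the score is computed in log space. A minimal
# sketch, assuming scipy is available and the same counts_dict layout as
# above (each 'cpt' is a flat count array with one block of card(rv)
# entries per parent configuration):
def log_K2(bn, data):
    from scipy.special import gammaln  # log-Gamma, numerically stable
    counts_dict = mle_estimator(bn, data)
    log_k2 = 0.0
    for rv, value in counts_dict.items():
        nijk = np.asarray(value['cpt'], dtype=float)
        card = bn.card(rv)
        nij = nijk.reshape(-1, card).sum(axis=1)
        # sum_k [log Gamma(N_ijk + 1) - log Gamma(1)]; gammaln(1) == 0
        log_k2 += np.sum(gammaln(nijk + 1.0))
        # sum_j [log Gamma(r_i) - log Gamma(N_ij + r_i)]
        log_k2 += np.sum(gammaln(card) - gammaln(nij + card))
    return log_k2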
def hc_rr(data,
          M=5,
          R=8,
          metric='AIC',
          max_iter=300,
          debug=False,
          init_nodes=None,
          restriction=None):
    """
    Arguments
    ---------
    *data* : a nested numpy array
        The data from which the Bayesian network
        structure will be learned.

    *metric* : a string
        Which score metric to use.
        Options:
            - AIC
            - BIC / MDL
            - LL (log-likelihood)

    *max_iter* : an integer
        The maximum number of iterations of the
        hill-climbing algorithm to run. Note that
        the algorithm will terminate on its own if no
        improvement is made in a given iteration.

    *debug* : boolean
        Whether to print the scores/moves of the
        algorithm as it runs.

    *init_nodes* : a list of node indices
        Nodes which are not allowed to gain a parent (no edge
        into them is added, and no reversal may create one).

    *restriction* : a list of 2-tuples
        For the MMHC algorithm, the list of allowable edge additions.

    Returns
    -------
    *bn* : a BayesNet object
    """
    nrow = data.shape[0]
    ncol = data.shape[1]

    names = range(ncol)

    # INITIALIZE NETWORK W/ NO EDGES
    # maintain children and parents dict for fast lookups
    c_dict = dict([(n, []) for n in names])
    p_dict = dict([(n, []) for n in names])

    # COMPUTE INITIAL LIKELIHOOD SCORE
    value_dict = dict([(n, np.unique(data[:, i]))
                       for i, n in enumerate(names)])
    bn = BayesNet(c_dict)
    mle_estimator(bn, data)
    max_score = info_score(bn, data, metric)

    _iter = 0
    improvement = True
    _restarts = 0

    while improvement:
        improvement = False
        max_delta = 0

        if debug:
            print('ITERATION: ', _iter)

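        # Each candidate move is scored by its delta:
        # nrow * (mi_old - mi_new), where mutual_information() is evaluated
        # over the child column together with its old/new parent columns;
        # the move with the largest positive delta wins this iteration.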
        ### TEST ARC ADDITIONS ###
        for u in bn.nodes():
            for v in bn.nodes():
                if v not in c_dict[u] and u != v and not would_cause_cycle(
                        c_dict, u, v):
                    # FOR MMHC ALGORITHM -> Edge Restrictions
                    if (init_nodes is None or not (v in init_nodes)):
                        if restriction is None or (u, v) in restriction:
                            # SCORE FOR 'V' -> gaining a parent
                            old_cols = (v, ) + tuple(
                                p_dict[v])  # without 'u' as parent
                            mi_old = mutual_information(data[:, old_cols])
                            new_cols = old_cols + (u, )  # with 'u' as parent
                            mi_new = mutual_information(data[:, new_cols])
                            delta_score = nrow * (mi_old - mi_new)
                            if delta_score > max_delta:
                                if debug:
                                    print('Improved Arc Addition: ', (u, v))
                                    print('Delta Score: ', delta_score)
                                max_delta = delta_score
                                max_operation = 'Addition'
                                max_arc = (u, v)

        ### TEST ARC DELETIONS ###
        for u in bn.nodes():
            for v in bn.nodes():
                if v in c_dict[u]:
                    # SCORE FOR 'V' -> losing a parent
                    old_cols = (v, ) + tuple(p_dict[v])  # with 'u' as parent
                    mi_old = mutual_information(data[:, old_cols])
                    new_cols = tuple([i for i in old_cols
                                      if i != u])  # without 'u' as parent
                    mi_new = mutual_information(data[:, new_cols])
                    delta_score = nrow * (mi_old - mi_new)

                    if delta_score > max_delta:
                        if debug:
                            print('Improved Arc Deletion: ', (u, v))
                            print('Delta Score: ', delta_score)
                        max_delta = delta_score
                        max_operation = 'Deletion'
                        max_arc = (u, v)

        ### TEST ARC REVERSALS ###
        for u in bn.nodes():
            for v in bn.nodes():
                if v in c_dict[u] and not would_cause_cycle(
                        c_dict, v, u, reverse=True) and (
                            init_nodes is None or
                            not (u in init_nodes)) and (restriction is None or
                                                        (v, u) in restriction):
                    # SCORE FOR 'U' -> gaining 'v' as parent
                    old_cols = (u, ) + tuple(
                        p_dict[u])  # without 'v' as parent
                    mi_old = mutual_information(data[:, old_cols])
                    new_cols = old_cols + (v, )  # with 'v' as parent
                    mi_new = mutual_information(data[:, new_cols])
                    delta1 = nrow * (mi_old - mi_new)
                    # SCORE FOR 'V' -> losing 'u' as parent
                    old_cols = (v, ) + tuple(p_dict[v])  # with 'u' as parent
                    mi_old = mutual_information(data[:, old_cols])
                    new_cols = tuple([i for i in old_cols
                                      if i != u])  # without 'u' as parent
                    mi_new = mutual_information(data[:, new_cols])
                    delta2 = nrow * (mi_old - mi_new)
                    # COMBINED DELTA-SCORES
                    delta_score = delta1 + delta2

                    if delta_score > max_delta:
                        if debug:
                            print('Improved Arc Reversal: ', (u, v))
                            print('Delta Score: ', delta_score)
                        max_delta = delta_score
                        max_operation = 'Reversal'
                        max_arc = (u, v)

        ### DETERMINE IF/WHERE IMPROVEMENT WAS MADE ###
        if max_delta != 0:
            improvement = True
            u, v = max_arc
            if max_operation == 'Addition':
                if debug:
                    print('ADDING: ', max_arc, '\n')
                c_dict[u].append(v)
                p_dict[v].append(u)
            elif max_operation == 'Deletion':
                if debug:
                    print('DELETING: ', max_arc, '\n')
                c_dict[u].remove(v)
                p_dict[v].remove(u)
            elif max_operation == 'Reversal':
                if debug:
                    print('REVERSING: ', max_arc, '\n')
                c_dict[u].remove(v)
                p_dict[v].remove(u)
                c_dict[v].append(u)
                p_dict[u].append(v)
        else:
            if debug:
                print('No Improvement on Iter: ', _iter)
            #### RESTART WITH RANDOM MOVES ####
            if _restarts < R:
                improvement = True  # make another pass of hill climbing
                _iter = 0  # reset iterations
                if debug:
                    print('Restart - ', _restarts)
                _restarts += 1
                for _ in range(M):
                    # 0 = Addition, 1 = Deletion, 2 = Reversal
                    operation = np.random.choice([0, 1, 2])
                    if operation == 0:

                        u, v = np.random.choice(list(bn.nodes()),
                                                size=2,
                                                replace=False)
                        # IF EDGE DOESN'T EXIST, ADD IT
                        if u not in p_dict[
                                v] and u != v and not would_cause_cycle(
                                    c_dict, u,
                                    v) and (init_nodes is None
                                            or not (v in init_nodes)) and (
                                                restriction is None or
                                                (u, v) in restriction):
                            if debug:
                                print('RESTART - ADDING: ', (u, v))
                            c_dict[u].append(v)
                            p_dict[v].append(u)

                    elif operation == 1:
                        u, v = np.random.choice(list(bn.nodes()),
                                                size=2,
                                                replace=False)
                        # IF EDGE EXISTS, DELETE IT
                        if u in p_dict[v]:
                            if debug:
                                print('RESTART - DELETING: ', (u, v))
                            c_dict[u].remove(v)
                            p_dict[v].remove(u)

                    elif operation == 2:
                        u, v = np.random.choice(list(bn.nodes()),
                                                size=2,
                                                replace=False)
                        # IF EDGE EXISTS, REVERSE IT
                        if u in p_dict[v] and not would_cause_cycle(
                                c_dict, v, u, reverse=True) and (
                                    init_nodes is None or not (u in init_nodes)
                                ) and (restriction is None or
                                       (v, u) in restriction):
                            if debug:
                                print('RESTART - REVERSING: ', (u, v))
                            c_dict[u].remove(v)
                            p_dict[v].remove(u)
                            c_dict[v].append(u)
                            p_dict[u].append(v)
                            break
        ### TEST FOR MAX ITERATION ###
        _iter += 1
        if _iter > max_iter:
            if debug:
                print('Max Iteration Reached')
            break

    bn = BayesNet(c_dict)

    return bn
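
# A hedged usage sketch under the same module-context assumption as above.
# hc_rr climbs until no move helps, then applies M random moves and climbs
# again, for up to R restarts:
def _demo_hc_rr():
    rng = np.random.RandomState(1)
    x0 = rng.randint(0, 2, size=500)
    x1 = (x0 + rng.randint(0, 2, size=500)) % 2  # depends on x0
    x2 = rng.randint(0, 3, size=500)             # independent noise
    data = np.column_stack([x0, x1, x2])
    return hc_rr(data, M=5, R=3, metric='AIC', max_iter=100)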
def tabu(data, k=5, metric='AIC', max_iter=100, debug=False, restriction=None):
	"""
	Tabu search for score-based structure learning.

	The algorithm maintains a list called "tabu_list",
	which consists of 3-tuples, where the first two
	elements constitute the tabu-ed edge and the third
	element is a string - 'Addition', 'Deletion', or
	'Reversal' - denoting the operation associated
	with the edge.

	Arguments
	---------
	*data* : a nested numpy array
		The data from which the Bayesian network
		structure will be learned.

	*k* : an integer
		The length of the tabu list.

	*metric* : a string
		Which score metric to use.
		Options:
			- AIC
			- BIC / MDL
			- LL (log-likelihood)

	*max_iter* : an integer
		The maximum number of iterations of the
		hill-climbing algorithm to run. Note that
		the algorithm will terminate on its own if no
		improvement is made in a given iteration.

	*debug* : boolean
		Whether to print the scores/moves of the
		algorithm as it runs.

	*restriction* : a list of 2-tuples
		For MMHC algorithm, the list of allowable edge additions.

	Returns
	-------
	*bn* : a BayesNet object
	
	"""
	nrow = data.shape[0]
	ncol = data.shape[1]
	
	names = range(ncol)

	# INITIALIZE NETWORK W/ NO EDGES
	# maintain children and parents dict for fast lookups
	c_dict = dict([(n,[]) for n in names])
	p_dict = dict([(n,[]) for n in names])
	
	# COMPUTE INITIAL LIKELIHOOD SCORE	
	value_dict = dict([(n, np.unique(data[:,i])) for i,n in enumerate(names)])
	bn = BayesNet(c_dict)
	mle_estimator(bn, data)
	max_score = info_score(bn, nrow, metric)

	tabu_list = [None]*k
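	# the tabu list acts as a fixed-size ring buffer: slot _iter % k is
	# overwritten whenever a move is applied, so a move stays tabu for at
	# most k iterations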


	_iter = 0
	improvement = True

	while improvement:
		improvement = False
		max_delta = 0

		if debug:
			print('ITERATION: ' , _iter)

		### TEST ARC ADDITIONS ###
		for u in bn.nodes():
			for v in bn.nodes():
				# CHECK TABU LIST - can't delete an addition on the tabu list
				if (u,v,'Deletion') not in tabu_list:
					# CHECK EDGE EXISTENCE AND CYCLICITY
					if v not in c_dict[u] and u!=v and not would_cause_cycle(c_dict, u, v):
						# FOR MMHC ALGORITHM -> Edge Restrictions
						if restriction is None or (u,v) in restriction:
							# SCORE FOR 'V' -> gaining a parent
							old_cols = (v,) + tuple(p_dict[v]) # without 'u' as parent
							mi_old = mutual_information(data[:,old_cols])
							new_cols = old_cols + (u,) # with 'u' as parent
							mi_new = mutual_information(data[:,new_cols])
							delta_score = nrow * (mi_old - mi_new)

							if delta_score > max_delta:
								if debug:
									print('Improved Arc Addition: ' , (u,v))
									print('Delta Score: ' , delta_score)
								max_delta = delta_score
								max_operation = 'Addition'
								max_arc = (u,v)

		### TEST ARC DELETIONS ###
		for u in bn.nodes():
			for v in bn.nodes():
				# CHECK TABU LIST - can't add back a deletion on the tabu list
				if (u,v,'Addition') not in tabu_list:
					if v in c_dict[u]:
						# SCORE FOR 'V' -> losing a parent
						old_cols = (v,) + tuple(p_dict[v]) # with 'u' as parent
						mi_old = mutual_information(data[:,old_cols])
						new_cols = tuple([i for i in old_cols if i != u]) # without 'u' as parent
						mi_new = mutual_information(data[:,new_cols])
						delta_score = nrow * (mi_old - mi_new)

						if delta_score > max_delta:
							if debug:
								print('Improved Arc Deletion: ' , (u,v))
								print('Delta Score: ' , delta_score)
							max_delta = delta_score
							max_operation = 'Deletion'
							max_arc = (u,v)

		### TEST ARC REVERSALS ###
		for u in bn.nodes():
			for v in bn.nodes():
				# CHECK TABU LIST - can't reverse back a reversal on the tabu list
				if (u,v,'Reversal') not in tabu_list:
					if v in c_dict[u] and not would_cause_cycle(c_dict,v,u, reverse=True):
						# SCORE FOR 'U' -> gaining 'v' as parent
						old_cols = (u,) + tuple(p_dict[u]) # without 'v' as parent
						mi_old = mutual_information(data[:,old_cols])
						new_cols = old_cols + (v,) # with 'v' as parent
						mi_new = mutual_information(data[:,new_cols])
						delta1 = nrow * (mi_old - mi_new)
						# SCORE FOR 'V' -> losing 'u' as parent
						old_cols = (v,) + tuple(p_dict[v]) # with 'u' as parent
						mi_old = mutual_information(data[:,old_cols])
						new_cols = tuple([i for i in old_cols if i != u]) # without 'u' as parent
						mi_new = mutual_information(data[:,new_cols])
						delta2 = nrow * (mi_old - mi_new)
						# COMBINED DELTA-SCORES
						delta_score = delta1 + delta2

						if delta_score > max_delta:
							if debug:
								print('Improved Arc Reversal: ' , (u,v))
								print('Delta Score: ' , delta_score)
							max_delta = delta_score
							max_operation = 'Reversal'
							max_arc = (u,v)


		### DETERMINE IF/WHERE IMPROVEMENT WAS MADE ###
		if max_delta != 0:
			improvement = True
			u,v = max_arc
			if max_operation == 'Addition':
				if debug:
					print('ADDING: ' , max_arc , '\n')
				c_dict[u].append(v)
				p_dict[v].append(u)
				tabu_list[_iter % k] = (u,v,'Addition')
			elif max_operation == 'Deletion':
				if debug:
					print('DELETING: ' , max_arc , '\n')
				c_dict[u].remove(v)
				p_dict[v].remove(u)
				tabu_list[_iter % k] = (u,v,'Deletion')
			elif max_operation == 'Reversal':
				if debug:
					print('REVERSING: ' , max_arc, '\n')
				c_dict[u].remove(v)
				p_dict[v].remove(u)
				c_dict[v].append(u)
				p_dict[u].append(v)
				tabu_list[_iter % k] = (u,v,'Reversal')
		else:
			if debug:
				print('No Improvement on Iter: ' , _iter)

		### TEST FOR MAX ITERATION ###
		_iter += 1
		if _iter > max_iter:
			if debug:
				print('Max Iteration Reached')
			break

	
	bn = BayesNet(c_dict)

	return bn
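
# All three searches above score moves via mutual_information(data[:, cols]),
# a helper from the surrounding module that is not shown here. One plausible
# implementation consistent with the delta-score convention (larger
# nrow * (mi_old - mi_new) = better move) is the empirical conditional
# entropy of the first column given the remaining columns -- a sketch, not
# the host library's actual code:
def _conditional_entropy(cols_data):
    # H(X | Z) = H(X, Z) - H(Z), estimated from joint value counts
    n = cols_data.shape[0]

    def _entropy(arr2d):
        _, counts = np.unique(arr2d, axis=0, return_counts=True)
        p = counts / float(n)
        return -np.sum(p * np.log(p))

    if cols_data.ndim == 1 or cols_data.shape[1] == 1:
        return _entropy(cols_data.reshape(n, -1))
    return _entropy(cols_data) - _entropy(cols_data[:, 1:])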
def hc(data,
       metric='LL',
       max_iter=100,
       debug=False,
       init_nodes=None,
       restriction=None,
       init_edges=None,
       remove_geo_edges=True,
       black_list=None):
    """
    Greedy Hill Climbing search proceeds by choosing the move
    which maximizes the increase in fitness of the
    network at the current step. It continues until
    it reaches a point where there does not exist any
    feasible single move that increases the network fitness.

    It is called "greedy" because it simply does what is
    best at the current iteration only, and thus does not
    look ahead to what may be better later on in the search.

    For computational savings, a priority queue (python's heapq)
    can be used to maintain the best operators and reduce the
    complexity of picking the best operator from O(n^2) to O(nlogn).
    This works by keeping the operators in a heap sorted by their
    delta score; each time a move is made, only the O(n) delta scores
    affected by the move need to be recomputed, while the rest are
    unchanged. (A generic sketch of this pattern follows the function.)

    For additional computational efficiency, we can cache the
    sufficient statistics for various families of distributions - 
    therefore, computing the mutual information for a given family
    only needs to happen once.

    The possible moves are the following:
        - add edge
        - delete edge
        - invert edge

    Arguments
    ---------
    *data* : a nested numpy array
        The data from which the Bayesian network
        structure will be learned.

    *metric* : a string
        Which score metric to use.
        Options:
            - AIC
            - BIC / MDL
            - LL (log-likelihood)

    *max_iter* : an integer
        The maximum number of iterations of the
        hill-climbing algorithm to run. Note that
        the algorithm will terminate on its own if no
        improvement is made in a given iteration.

    *debug* : boolean
        Whether to print the scores/moves of the
        algorithm as it runs.

    *init_nodes* : a list of node indices
        Nodes which are not allowed to gain a parent.

    *restriction* : a list of 2-tuples
        For the MMHC algorithm, the list of allowable edge additions.

    *init_edges* : a list of 2-tuples
        Edges with which to seed the network before the search begins.

    *remove_geo_edges* : boolean
        Whether edges in *init_edges* may later be deleted or reversed.

    *black_list* : a list of 2-tuples
        Edges which may never be added, nor created by a reversal.

    Returns
    -------
    *bn* : a BayesNet object

    """
    nrow = data.shape[0]
    ncol = data.shape[1]

    names = range(ncol)

    # INITIALIZE NETWORK W/ NO EDGES
    # maintain children and parents dict for fast lookups
    c_dict = dict([(n, []) for n in names])
    p_dict = dict([(n, []) for n in names])
    if init_edges:
        for edge in init_edges:
            c_dict[edge[0]].append(edge[1])
            p_dict[edge[1]].append(edge[0])

    score_list = []
    # COMPUTE INITIAL LIKELIHOOD SCORE
    value_dict = dict([(n, np.unique(data[:, i]))
                       for i, n in enumerate(names)])
    bn = BayesNet(c_dict)
    mle_estimator(bn, data)
    max_score = info_score(bn, data, metric)

    # CREATE EMPIRICAL DISTRIBUTION OBJECT FOR CACHING
    #ED = EmpiricalDistribution(data,names)

    _iter = 0
    improvement = True

    while improvement:
        improvement = False
        max_delta = 0

        if debug:
            print('ITERATION: ', _iter)

        ### TEST ARC ADDITIONS ###
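        # NOTE: this variant caps the number of parents per node: a child
        # that already has exactly 3 parents is skipped here, and likewise
        # in the reversal test below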
        for u in bn.nodes():
            for v in bn.nodes():
                if v not in c_dict[u] and u != v and not would_cause_cycle(
                        c_dict, u, v) and len(p_dict[v]) != 3:
                    # FOR MMHC ALGORITHM -> Edge Restrictions
                    if (init_nodes is None or not (v in init_nodes)) and (
                            restriction is None or
                        (u, v) in restriction) and (black_list is None or not (
                            (u, v) in black_list)):
                        # SCORE FOR 'V' -> gaining a parent
                        old_cols = (v, ) + tuple(
                            p_dict[v])  # without 'u' as parent
                        mi_old = mutual_information(data[:, old_cols])
                        new_cols = old_cols + (u, )  # with 'u' as parent
                        mi_new = mutual_information(data[:, new_cols])
                        delta_score = nrow * (mi_old - mi_new)

                        if delta_score > max_delta:
                            if debug:
                                print('Improved Arc Addition: ', (u, v))
                                print('Delta Score: ', delta_score)
                            max_delta = delta_score
                            max_operation = 'Addition'
                            max_arc = (u, v)

        # ### TEST ARC DELETIONS ###
        for u in bn.nodes():
            for v in bn.nodes():
                if v in c_dict[u]:
                    # SCORE FOR 'V' -> losing a parent
                    old_cols = (v, ) + tuple(p_dict[v])  # with 'u' as parent
                    mi_old = mutual_information(data[:, old_cols])
                    new_cols = tuple([i for i in old_cols
                                      if i != u])  # without 'u' as parent
                    mi_new = mutual_information(data[:, new_cols])
                    delta_score = nrow * (mi_old - mi_new)

                    if (delta_score > max_delta):
                        if init_edges is None:
                            if debug:
                                print('Improved Arc Deletion: ', (u, v))
                                print('Delta Score: ', delta_score)
                            max_delta = delta_score
                            max_operation = 'Deletion'
                            max_arc = (u, v)
                        else:
                            if (u, v) in init_edges:
                                if remove_geo_edges:
                                    if debug:
                                        print('Improved Arc Deletion: ',
                                              (u, v))
                                        print('Delta Score: ', delta_score)
                                    max_delta = delta_score
                                    max_operation = 'Deletion'
                                    max_arc = (u, v)
                            else:
                                if debug:
                                    print('Improved Arc Deletion: ', (u, v))
                                    print('Delta Score: ', delta_score)
                                max_delta = delta_score
                                max_operation = 'Deletion'
                                max_arc = (u, v)

        # ### TEST ARC REVERSALS ###
        for u in bn.nodes():
            for v in bn.nodes():
                if v in c_dict[u] and not would_cause_cycle(
                        c_dict, v, u,
                        reverse=True) and len(p_dict[u]) != 3 and (
                            init_nodes is None or not (u in init_nodes)) and (
                                restriction is None or
                                (v, u) in restriction) and (
                                    black_list is None
                                    or not ((v, u) in black_list)):
                    # SCORE FOR 'U' -> gaining 'v' as parent
                    old_cols = (u, ) + tuple(
                        p_dict[u])  # without 'v' as parent
                    mi_old = mutual_information(data[:, old_cols])
                    new_cols = old_cols + (v, )  # with 'v' as parent
                    mi_new = mutual_information(data[:, new_cols])
                    delta1 = nrow * (mi_old - mi_new)  # same sign convention as the addition test
                    # SCORE FOR 'V' -> losing 'u' as parent
                    old_cols = (v, ) + tuple(p_dict[v])  # with 'u' as parent
                    mi_old = mutual_information(data[:, old_cols])
                    new_cols = tuple([i for i in old_cols
                                      if i != u])  # without 'u' as parent
                    mi_new = mutual_information(data[:, new_cols])
                    delta2 = nrow * (mi_old - mi_new)
                    # COMBINED DELTA-SCORES
                    delta_score = delta1 + delta2

                    if (delta_score > max_delta):
                        if init_edges is None:
                            if debug:
                                print('Improved Arc Reversal: ', (u, v))
                                print('Delta Score: ', delta_score)
                            max_delta = delta_score
                            max_operation = 'Reversal'
                            max_arc = (u, v)
                        else:
                            if (u, v) in init_edges:
                                if remove_geo_edges:
                                    if debug:
                                        print('Improved Arc Reversal: ',
                                              (u, v))
                                        print('Delta Score: ', delta_score)
                                    max_delta = delta_score
                                    max_operation = 'Reversal'
                                    max_arc = (u, v)
                            else:
                                if debug:
                                    print('Improved Arc Reversal: ', (u, v))
                                    print('Delta Score: ', delta_score)
                                max_delta = delta_score
                                max_operation = 'Reversal'
                                max_arc = (u, v)

        if max_delta != 0:
            improvement = True
            u, v = max_arc
            if max_operation == 'Addition':
                if debug:
                    print('ADDING: ', max_arc, '\n')
                c_dict[u].append(v)
                p_dict[v].append(u)

            elif max_operation == 'Deletion':
                if debug:
                    print('DELETING: ', max_arc, '\n')
                c_dict[u].remove(v)
                p_dict[v].remove(u)

            elif max_operation == 'Reversal':
                if debug:
                    print('REVERSING: ', max_arc, '\n')
                c_dict[u].remove(v)
                p_dict[v].remove(u)
                c_dict[v].append(u)
                p_dict[u].append(v)

        else:
            if debug:
                print('No Improvement on Iter: ', _iter)

        ### TEST FOR MAX ITERATION ###
        _iter += 1
        if _iter > max_iter:
            if debug:
                print('Max Iteration Reached')
            break

    bn = BayesNet(c_dict)

    return bn
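
# The hc docstring above describes a heapq-based optimization without
# implementing it. A generic, library-agnostic sketch of that pattern (the
# operator and rescore types here are hypothetical, not the host library's
# API): operators sit in a max-heap keyed by delta score, and entries
# invalidated by the last applied move are rescored lazily when they surface.
import heapq

def pop_best_operator(heap, stale, rescore):
    """heap holds (-delta, op) pairs; stale is the set of ops whose scores
    were invalidated by the last move; rescore maps op -> fresh delta.
    Returns (delta, op) for the best fresh operator, or None if exhausted."""
    while heap:
        neg_delta, op = heapq.heappop(heap)
        if op in stale:
            stale.discard(op)  # its score is about to be refreshed
            heapq.heappush(heap, (-rescore(op), op))
        else:
            return -neg_delta, op
    return None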