Ejemplo n.º 1
0
def iamb(data, alpha=0.05, feature_selection=None, debug=False):
    """
    IAMB (Incremental Association Markov Blanket) algorithm for
    learning the structure of a Discrete Bayesian Network from data,
    or for extracting the Markov blanket of a single variable.

    Arguments
    ---------
    *data* : a nested numpy array
        One column per random variable; columns are addressed by
        their integer index.

    *alpha* : a float
        Threshold forwarded to `are_independent` and to
        `orient_edges_gs2` (presumably the significance level,
        i.e. type I error rate, of the independence test --
        confirm against those helpers).

    *feature_selection* : None or a single column index
        If None, learn the Markov blanket of every column and build
        a full network. Otherwise learn only the Markov blanket of
        this one column and return it. Must not be a list.

    *debug* : a boolean
        If True, print every addition/removal and the intermediate
        edge dictionaries.

    Returns
    -------
    *bn* : a BayesNet object (when *feature_selection* is None) or
    *mb* : a list with the column indices in the Markov blanket of
        *feature_selection*

    Effects
    -------
    None

    Notes
    -----
    - In the growing phase the best candidate is added only when it
      is NOT independent of the target given the current blanket.
    - The shrinking phase iterates over a snapshot of the blanket so
      that removals do not cause members to be skipped.

    Speed Test (original author's measurement):
        *** 5 vars, 624 obs ***
            - 196 ms
    """
    n_rv = data.shape[1]
    Mb = {rv: [] for rv in range(n_rv)}

    if feature_selection is None:
        _T = range(n_rv)
    else:
        assert (not isinstance(feature_selection, list)
                ), 'feature_selection must be only one value'
        _T = [feature_selection]

    # LEARN MARKOV BLANKET
    for T in _T:

        V = set(range(n_rv)) - {T}
        Mb_change = True

        # GROWING PHASE
        while Mb_change:
            Mb_change = False
            # find X_max in V-Mb(T) that maximizes the mutual
            # information of X,T|Mb(T)
            max_val = -1
            max_x = None
            for X in V - set(Mb[T]):
                cols = (X, T) + tuple(Mb[T])
                mi_val = mi_test(data[:, cols], test=False)
                if mi_val > max_val:
                    max_val = mi_val
                    max_x = X

            if max_x is None:
                # no candidate left outside the blanket
                break

            # add X_max only if it is dependent on T given Mb(T)
            cols = (max_x, T) + tuple(Mb[T])
            if not are_independent(data[:, cols], alpha):
                Mb[T].append(max_x)
                Mb_change = True
                if debug:
                    print('Adding %s to MB of %s' % (str(max_x), str(T)))

        # SHRINKING PHASE
        # iterate over a snapshot: Mb[T] is mutated inside the loop
        for X in list(Mb[T]):
            # if X is independent of T given Mb(T) - {X}, drop it
            cols = (X, T) + tuple(set(Mb[T]) - {X})
            if are_independent(data[:, cols], alpha):
                Mb[T].remove(X)
                if debug:
                    print('Removing %s from MB of %s' % (str(X), str(T)))

    if feature_selection is None:
        # RESOLVE GRAPH STRUCTURE
        edge_dict = resolve_markov_blanket(Mb, data)
        if debug:
            print('Unoriented edge dict:\n %s' % str(edge_dict))
            print('MB: %s' % str(Mb))
        # ORIENT EDGES
        oriented_edge_dict = orient_edges_gs2(edge_dict, Mb, data, alpha)
        if debug:
            print('Oriented edge dict:\n %s' % str(oriented_edge_dict))

        # CREATE BAYESNET OBJECT
        # map each column index to the list of its distinct values
        value_dict = dict(
            zip(range(data.shape[1]),
                [list(np.unique(col)) for col in data.T]))
        bn = BayesNet(oriented_edge_dict, value_dict)

        return bn
    else:
        # _T is a one-element list and cannot be used as a dict key;
        # index by the requested column itself
        return Mb[feature_selection]
Ejemplo n.º 2
0
def gs(data, alpha=0.05, feature_selection=None, debug=False):
    """
    Perform the grow-shrink algorithm over a dataset to learn
    Bayesian network structure, or to extract the Markov blanket
    of a single variable.

    STEPS (as performed in this function)
    -----
    1. Compute Markov Blanket (grow phase, then shrink phase)
    2. Compute Graph Structure (`resolve_markov_blanket`)
    3. Orient Edges (`orient_edges_MB`)

    Arguments
    ---------
    *data* : a nested numpy array
        Data from which you wish to learn structure. It is passed
        through `replace_strings` before any test is run.

    *alpha* : a float
        Type I error rate for independence test: a pair is treated
        as dependent when the value returned by `mi_test` is below
        *alpha*.

    *feature_selection* : None or a single column index
        If None, learn the whole network. Otherwise only the Markov
        blanket of this column is learned and returned. Must not be
        a list.

    *debug* : a boolean
        If True, print the Markov blankets and edge dictionaries.

    Returns
    -------
    *bn* : a BayesNet object (when *feature_selection* is None) or
    *mb* : a list with the column indices in the Markov blanket of
        *feature_selection*

    Effects
    -------
    None

    Notes
    -----
    - Each shrink pass conditions on the blanket produced by the
      previous pass, so the shrink loop always terminates.

    Speed Test (original author's measurement):
        *** 5 variables, 624 observations ***
        - 63.7 ms

    """
    n_rv = data.shape[1]
    data, value_dict = replace_strings(data, return_values=True)

    if feature_selection is None:
        _T = range(n_rv)
    else:
        assert (not isinstance(feature_selection, list)), 'feature_selection must be only one value'
        _T = [feature_selection]

    # STEP 1 : COMPUTE MARKOV BLANKETS
    Mb = {rv: [] for rv in range(n_rv)}

    for X in _T:
        S = []

        # GROW: keep sweeping until a full pass adds nothing
        grow_condition = True
        while grow_condition:

            grow_condition = False
            for Y in range(n_rv):
                if X != Y and Y not in S:
                    # if there exists some Y such that Y is dependent
                    # on X given S, add Y to S
                    cols = (X, Y) + tuple(S)
                    pval = mi_test(data[:, cols])
                    if pval < alpha:  # dependent
                        grow_condition = True  # keep searching
                        S.append(Y)

        # SHRINK: keep sweeping until a full pass removes nothing
        shrink_condition = True
        while shrink_condition:

            TEMP_S = []
            shrink_condition = False
            for Y in S:
                # condition on S - {Y}
                rest = [Z for Z in S if Z != Y]
                cols = (X, Y) + tuple(rest)
                pval = mi_test(data[:, cols])
                if pval < alpha:  # dependent -> keep Y
                    TEMP_S.append(Y)
                else:  # independent -> drop Y and sweep again
                    shrink_condition = True
            # carry the reduced set into the next pass; without this
            # the same S would be re-tested forever
            S = TEMP_S

        Mb[X] = S
        if debug:
            print('Markov Blanket for %s : %s' % (X, str(S)))

    if feature_selection is None:
        # STEP 2: COMPUTE GRAPH STRUCTURE
        # i.e. Resolve Markov Blanket
        edge_dict = resolve_markov_blanket(Mb, data)
        if debug:
            print('Unoriented edge dict:\n %s' % str(edge_dict))

        # STEP 3: ORIENT EDGES
        oriented_edge_dict = orient_edges_MB(edge_dict, Mb, data, alpha)
        if debug:
            print('Oriented edge dict:\n %s' % str(oriented_edge_dict))

        # CREATE BAYESNET OBJECT
        bn = BayesNet(oriented_edge_dict, value_dict)

        return bn
    else:
        # _T is a one-element list and cannot be used as a dict key;
        # index by the requested column itself
        return Mb[feature_selection]
Ejemplo n.º 3
0
def gs(data, alpha=0.05, feature_selection=None, debug=False):
    """
    Perform the grow-shrink algorithm over a dataset to learn
    Bayesian network structure, or to extract the Markov blanket
    of a single variable.

    STEPS (as performed in this function)
    -----
    1. Compute Markov Blanket (grow phase, then shrink phase)
    2. Compute Graph Structure (`resolve_markov_blanket`)
    3. Orient Edges (`orient_edges_MB`)

    Arguments
    ---------
    *data* : a nested numpy array
        Data from which you wish to learn structure. It is passed
        through `replace_strings` before any test is run.

    *alpha* : a float
        Type I error rate for independence test: a pair is treated
        as dependent when the value returned by `mi_test` is below
        *alpha*.

    *feature_selection* : None or a single column index
        If None, learn the whole network. Otherwise only the Markov
        blanket of this column is learned and returned. Must not be
        a list.

    *debug* : a boolean
        If True, print the Markov blankets and edge dictionaries.

    Returns
    -------
    *bn* : a BayesNet object (when *feature_selection* is None) or
    *mb* : a list with the column indices in the Markov blanket of
        *feature_selection*

    Effects
    -------
    None

    Notes
    -----
    - Each shrink pass conditions on the blanket produced by the
      previous pass, so the shrink loop always terminates.

    Speed Test (original author's measurement):
        *** 5 variables, 624 observations ***
        - 63.7 ms

    """
    n_rv = data.shape[1]
    data, value_dict = replace_strings(data, return_values=True)

    if feature_selection is None:
        _T = range(n_rv)
    else:
        assert (not isinstance(feature_selection, list)), 'feature_selection must be only one value'
        _T = [feature_selection]

    # STEP 1 : COMPUTE MARKOV BLANKETS
    Mb = {rv: [] for rv in range(n_rv)}

    for X in _T:
        blanket = []

        # GROW: repeat full sweeps until one adds no new member
        grew = True
        while grew:
            grew = False
            for Y in range(n_rv):
                if Y == X or Y in blanket:
                    continue
                # Y joins the blanket when it is dependent on X
                # given the current blanket
                cols = (X, Y) + tuple(blanket)
                if mi_test(data[:, cols]) < alpha:
                    blanket.append(Y)
                    grew = True

        # SHRINK: repeat full sweeps until one removes no member
        shrunk = True
        while shrunk:
            shrunk = False
            kept = []
            for Y in blanket:
                # condition on blanket - {Y}
                others = tuple(Z for Z in blanket if Z != Y)
                cols = (X, Y) + others
                if mi_test(data[:, cols]) < alpha:
                    # still dependent -> keep Y
                    kept.append(Y)
                else:
                    # independent -> drop Y and sweep again
                    shrunk = True
            # feed the reduced blanket into the next sweep; otherwise
            # the identical set is re-tested and the loop never ends
            blanket = kept

        Mb[X] = blanket
        if debug:
            print('Markov Blanket for %s : %s' % (X, str(blanket)))

    if feature_selection is not None:
        # _T is a one-element list and cannot be used as a dict key;
        # index by the requested column itself
        return Mb[feature_selection]

    # STEP 2: COMPUTE GRAPH STRUCTURE
    # i.e. Resolve Markov Blanket
    edge_dict = resolve_markov_blanket(Mb, data)
    if debug:
        print('Unoriented edge dict:\n %s' % str(edge_dict))

    # STEP 3: ORIENT EDGES
    oriented_edge_dict = orient_edges_MB(edge_dict, Mb, data, alpha)
    if debug:
        print('Oriented edge dict:\n %s' % str(oriented_edge_dict))

    # CREATE BAYESNET OBJECT
    return BayesNet(oriented_edge_dict, value_dict)
Ejemplo n.º 4
0
def iamb(data, alpha=0.05, feature_selection=None, debug=False):
    """
    IAMB (Incremental Association Markov Blanket) algorithm for
    learning the structure of a Discrete Bayesian Network from data,
    or for extracting the Markov blanket of a single variable.

    Arguments
    ---------
    *data* : a nested numpy array
        One column per random variable; columns are addressed by
        their integer index.

    *alpha* : a float
        Threshold forwarded to `are_independent` and to
        `orient_edges_gs2` (presumably the significance level,
        i.e. type I error rate, of the independence test --
        confirm against those helpers).

    *feature_selection* : None or a single column index
        If None, learn the Markov blanket of every column and build
        a full network. Otherwise learn only the Markov blanket of
        this one column and return it. Must not be a list.

    *debug* : a boolean
        If True, print every addition/removal and the intermediate
        edge dictionaries.

    Returns
    -------
    *bn* : a BayesNet object (when *feature_selection* is None) or
    *mb* : a list with the column indices in the Markov blanket of
        *feature_selection*

    Effects
    -------
    None

    Notes
    -----
    - In the growing phase the best candidate is added only when it
      is NOT independent of the target given the current blanket.
    - The shrinking phase iterates over a snapshot of the blanket so
      that removals do not cause members to be skipped.

    Speed Test (original author's measurement):
        *** 5 vars, 624 obs ***
            - 196 ms
    """
    n_rv = data.shape[1]
    Mb = {rv: [] for rv in range(n_rv)}

    if feature_selection is None:
        _T = range(n_rv)
    else:
        assert (not isinstance(feature_selection, list)), 'feature_selection must be only one value'
        _T = [feature_selection]

    # LEARN MARKOV BLANKET
    for T in _T:
        blanket = Mb[T]  # same list object stored in Mb, grown in place
        V = set(range(n_rv)) - {T}

        # GROWING PHASE
        while True:
            # find the candidate outside the blanket with the largest
            # mutual information with T given the current blanket
            max_val = -1
            max_x = None
            for X in V - set(blanket):
                cols = (X, T) + tuple(blanket)
                mi_val = mi_test(data[:, cols], test=False)
                if mi_val > max_val:
                    max_val = mi_val
                    max_x = X

            if max_x is None:
                # no candidate left outside the blanket
                break

            # stop growing once the best candidate is independent of T
            cols = (max_x, T) + tuple(blanket)
            if are_independent(data[:, cols], alpha):
                break

            blanket.append(max_x)
            if debug:
                print('Adding %s to MB of %s' % (str(max_x), str(T)))

        # SHRINKING PHASE
        # iterate over a snapshot: the blanket is mutated in the loop
        for X in tuple(blanket):
            # if X is independent of T given Mb(T) - {X}, drop it
            cols = (X, T) + tuple(set(blanket) - {X})
            if are_independent(data[:, cols], alpha):
                blanket.remove(X)
                if debug:
                    print('Removing %s from MB of %s' % (str(X), str(T)))

    if feature_selection is not None:
        # _T is a one-element list and cannot be used as a dict key;
        # index by the requested column itself
        return Mb[feature_selection]

    # RESOLVE GRAPH STRUCTURE
    edge_dict = resolve_markov_blanket(Mb, data)
    if debug:
        print('Unoriented edge dict:\n %s' % str(edge_dict))
        print('MB: %s' % str(Mb))
    # ORIENT EDGES
    oriented_edge_dict = orient_edges_gs2(edge_dict, Mb, data, alpha)
    if debug:
        print('Oriented edge dict:\n %s' % str(oriented_edge_dict))

    # CREATE BAYESNET OBJECT
    # map each column index to the list of its distinct values
    value_dict = dict(zip(range(data.shape[1]),
                          [list(np.unique(col)) for col in data.T]))
    return BayesNet(oriented_edge_dict, value_dict)