Example #1
0
def iamb(data, alpha=0.05, feature_selection=None, debug=False):
	"""
	IAMB Algorithm for learning the structure of a
	Discrete Bayesian Network from data, or for finding
	the Markov blanket of a single variable.

	Arguments
	---------
	*data* : a nested numpy array
		One column per random variable.

	*alpha* : a float
		Threshold passed to the independence tests and
		to the edge orientation step.

	*feature_selection* : None or a single column index
		If None, a Markov blanket is learned for every
		column and a network is built from them. Otherwise
		only the blanket of that one column is learned
		and returned. Must not be a list.

	*debug* : a boolean
		Whether to print progress messages.

	Returns
	-------
	*bn* : a BayesNet object (when feature_selection is None) or
	*mb* : a list of column indices, the markov blanket
		of the requested column

	Effects
	-------
	None

	Notes
	-----
	- Assumes are_independent returns True when the first
	  two columns are independent given the remaining ones,
	  and that mi_test(..., test=False) returns a score where
	  larger means more strongly associated. Confirm against
	  those helpers.
	"""
	n_rv = data.shape[1]
	Mb = dict([(rv, []) for rv in range(n_rv)])

	if feature_selection is None:
		_T = range(n_rv)
	else:
		assert (not isinstance(feature_selection, list)), 'feature_selection must be only one value'
		_T = [feature_selection]

	# LEARN MARKOV BLANKET
	for T in _T:

		V = set(range(n_rv)) - {T}
		Mb_change = True

		# GROWING PHASE
		while Mb_change:
			Mb_change = False
			# find X_max in V-Mb(T) with the highest
			# mi_test score for X,T|Mb(T)
			max_val = -1
			max_x = None
			for X in V - set(Mb[T]):
				cols = (X, T) + tuple(Mb[T])
				mi_val = mi_test(data[:, cols], test=False)
				if mi_val > max_val:
					max_val = mi_val
					max_x = X
			if max_x is None:
				# every other variable is already in the blanket
				break
			# add X_max only if it is dependent on T given Mb(T);
			# use max_x, not the leftover loop variable X
			cols = (max_x, T) + tuple(Mb[T])
			if not are_independent(data[:, cols], alpha):
				Mb[T].append(max_x)
				Mb_change = True
				if debug:
					print('Adding %s to MB of %s' % (str(max_x), str(T)))

		# SHRINKING PHASE
		# iterate over a snapshot: removing from the list being
		# iterated would silently skip the following element
		for X in list(Mb[T]):
			# if x is independent of t given Mb(T) - {x}
			cols = (X, T) + tuple(m for m in Mb[T] if m != X)
			if are_independent(data[:, cols], alpha):
				Mb[T].remove(X)
				if debug:
					print('Removing %s from MB of %s' % (str(X), str(T)))

	if feature_selection is None:
		# RESOLVE GRAPH STRUCTURE
		edge_dict = resolve_markov_blanket(Mb, data)
		if debug:
			print('Unoriented edge dict:\n %s' % str(edge_dict))
			print('MB: %s' % str(Mb))
		# ORIENT EDGES
		oriented_edge_dict = orient_edges_gs2(edge_dict, Mb, data, alpha)
		if debug:
			print('Oriented edge dict:\n %s' % str(oriented_edge_dict))

		# CREATE BAYESNET OBJECT
		value_dict = dict(zip(range(n_rv),
			[list(np.unique(col)) for col in data.T]))
		bn = BayesNet(oriented_edge_dict, value_dict)

		return bn
	else:
		# _T is a one-element list and cannot be used as a dict key
		return Mb[feature_selection]
Example #2
0
def fast_iamb(data, k=5, alpha=0.05, feature_selection=None, debug=False):
	"""
	From [1]:
		"A novel algorithm for the induction of
		Markov blankets from data, called Fast-IAMB, that employs
		a heuristic to quickly recover the Markov blanket. Empirical
		results show that Fast-IAMB performs in many cases
		faster and more reliably than existing algorithms without
		adversely affecting the accuracy of the recovered Markov
		blankets."

	Arguments
	---------
	*data* : a nested numpy array
		One column per random variable.

	*k* : a number
		Minimum average number of observations per cell
		(rows divided by the product of the cardinalities of
		the candidate, the target and the current blanket)
		required before a candidate is added in the grow phase.

	*alpha* : a float
		Threshold passed to the independence tests and
		to the edge orientation step.

	*feature_selection* : None or a single column index
		If None, a Markov blanket is learned for every
		column and a network is built from them. Otherwise
		only the blanket of that one column is learned
		and returned. Must not be a list.

	*debug* : a boolean
		Whether to print progress messages.

	Returns
	-------
	*bn* : a BayesNet object (when feature_selection is None) or
	*mb* : a list of column indices, the markov blanket
		of the requested column

	Effects
	-------
	None

	Notes
	-----
	- Assumes are_independent returns True when the first
	  two columns are independent given the remaining ones,
	  and that mi_test(..., test=False) returns a score where
	  larger means more strongly associated. Confirm against
	  those helpers.

	"""
	# get values (before strings are replaced)
	value_dict = dict(zip(range(data.shape[1]),
			[list(np.unique(col)) for col in data.T]))
	# replace strings
	data = replace_strings(data)

	n_rv = data.shape[1]
	Mb = dict([(rv, []) for rv in range(n_rv)])
	N = data.shape[0]
	card = dict(zip(range(n_rv), unique_bins(data)))

	if feature_selection is None:
		_T = range(n_rv)
	else:
		assert (not isinstance(feature_selection, list)), 'feature_selection must be only one value'
		_T = [feature_selection]

	# LEARN MARKOV BLANKET
	for T in _T:
		# candidates: variables marginally dependent on T.
		# Built in one pass because a set cannot be modified
		# while it is being iterated.
		S = set(A for A in range(n_rv)
			if A != T and not are_independent(data[:, (A, T)], alpha))
		while S:
			insufficient_data = False

			#### GROW PHASE ####
			# Score every candidate against the blanket as it stands now
			mi_dict = dict([(s, mi_test(data[:, (s, T) + tuple(Mb[T])], test=False))
				for s in S])
			for x_i in sorted(mi_dict, key=mi_dict.get, reverse=True):
				# Add top-scoring variables until there isn't enough data for bins.
				# The whole product is the divisor; float() avoids integer division.
				n_cells = card[x_i] * card[T] * np.prod([card[b] for b in Mb[T]])
				if float(N) / n_cells >= k:
					Mb[T].append(x_i)
				else:
					insufficient_data = True
					break

			#### SHRINK PHASE ####
			removed_vars = False
			# iterate over a snapshot so removals do not skip elements
			for A in list(Mb[T]):
				cols = (A, T) + tuple(m for m in Mb[T] if m != A)
				# if A is independent of T given Mb[T] - {A}, remove A
				if are_independent(data[:, cols], alpha):
					Mb[T].remove(A)
					removed_vars = True

			#### FINALIZE BLANKET FOR "T" OR MAKE ANOTHER PASS ####
			if insufficient_data and not removed_vars:
				if debug:
					print('Breaking..')
				break
			# next candidates: variables outside the blanket that are
			# still dependent on T given the blanket
			remaining = set(range(n_rv)) - {T} - set(Mb[T])
			S = set(a for a in remaining
				if not are_independent(data[:, (a, T) + tuple(Mb[T])], alpha))
		if debug:
			print('Done with %s' % T)

	if feature_selection is None:
		# RESOLVE GRAPH STRUCTURE
		edge_dict = resolve_markov_blanket(Mb, data)

		# ORIENT EDGES
		oriented_edge_dict = orient_edges_MB(edge_dict, Mb, data, alpha)

		# CREATE BAYESNET OBJECT
		bn = BayesNet(oriented_edge_dict, value_dict)

		return bn
	else:
		# _T is a one-element list and cannot be used as a dict key
		return Mb[feature_selection]
Example #3
0
def iamb(data, alpha=0.05, feature_selection=None, debug=False):
    """
    IAMB Algorithm for learning the structure of a
    Discrete Bayesian Network from data, or for finding
    the Markov blanket of a single variable.

    Arguments
    ---------
    *data* : a nested numpy array
        One column per random variable.

    *alpha* : a float
        Threshold passed to the independence tests and
        to the edge orientation step.

    *feature_selection* : None or a single column index
        If None, a Markov blanket is learned for every
        column and a network is built from them. Otherwise
        only the blanket of that one column is learned
        and returned. Must not be a list.

    *debug* : a boolean
        Whether to print progress messages.

    Returns
    -------
    *bn* : a BayesNet object (when feature_selection is None) or
    *mb* : a list of column indices, the markov blanket
        of the requested column

    Effects
    -------
    None

    Notes
    -----
    - Assumes are_independent returns True when the first
      two columns are independent given the remaining ones,
      and that mi_test(..., test=False) returns a score where
      larger means more strongly associated. Confirm against
      those helpers.
    """
    n_rv = data.shape[1]
    Mb = dict([(rv, []) for rv in range(n_rv)])

    if feature_selection is None:
        _T = range(n_rv)
    else:
        assert (not isinstance(feature_selection, list)
                ), 'feature_selection must be only one value'
        _T = [feature_selection]

    # LEARN MARKOV BLANKET
    for T in _T:

        V = set(range(n_rv)) - {T}
        Mb_change = True

        # GROWING PHASE
        while Mb_change:
            Mb_change = False
            # find X_max in V-Mb(T) with the highest
            # mi_test score for X,T|Mb(T)
            max_val = -1
            max_x = None
            for X in V - set(Mb[T]):
                cols = (X, T) + tuple(Mb[T])
                mi_val = mi_test(data[:, cols], test=False)
                if mi_val > max_val:
                    max_val = mi_val
                    max_x = X
            if max_x is None:
                # every other variable is already in the blanket
                break
            # add X_max only if it is dependent on T given Mb(T);
            # use max_x, not the leftover loop variable X
            cols = (max_x, T) + tuple(Mb[T])
            if not are_independent(data[:, cols], alpha):
                Mb[T].append(max_x)
                Mb_change = True
                if debug:
                    print('Adding %s to MB of %s' % (str(max_x), str(T)))

        # SHRINKING PHASE
        # iterate over a snapshot: removing from the list being
        # iterated would silently skip the following element
        for X in list(Mb[T]):
            # if x is independent of t given Mb(T) - {x}
            cols = (X, T) + tuple(m for m in Mb[T] if m != X)
            if are_independent(data[:, cols], alpha):
                Mb[T].remove(X)
                if debug:
                    print('Removing %s from MB of %s' % (str(X), str(T)))

    if feature_selection is None:
        # RESOLVE GRAPH STRUCTURE
        edge_dict = resolve_markov_blanket(Mb, data)
        if debug:
            print('Unoriented edge dict:\n %s' % str(edge_dict))
            print('MB: %s' % str(Mb))
        # ORIENT EDGES
        oriented_edge_dict = orient_edges_gs2(edge_dict, Mb, data, alpha)
        if debug:
            print('Oriented edge dict:\n %s' % str(oriented_edge_dict))

        # CREATE BAYESNET OBJECT
        value_dict = dict(
            zip(range(n_rv),
                [list(np.unique(col)) for col in data.T]))
        bn = BayesNet(oriented_edge_dict, value_dict)

        return bn
    else:
        # _T is a one-element list and cannot be used as a dict key
        return Mb[feature_selection]
Example #4
0
def lambda_iamb(data, L=1.5, alpha=0.05, feature_selection=None):
    """
    Lambda IAMB Algorithm for learning the structure of a
    Discrete Bayesian Network from data. This Algorithm
    is similar to the iamb algorithm, except that it allows
    for a "lambda" coefficient that helps avoid false positives.

    This algorithm was originally developed for use as a
    feature selection algorithm - discovering the markov
    blanket of a target variable is equivalent to discovering
    the relevant features for classifications.

    Arguments
    ---------
    *data* : a nested numpy array
        One column per random variable.

    *L* : a float
        The lambda hyperparameter. It weights the best
        candidate's entropy when deciding whether the two
        best candidates may be added together.

    *alpha* : a float
        Threshold passed to the independence tests, compared
        against the mi_test result in the shrinking phase, and
        passed to the edge orientation step.

    *feature_selection* : None or a single column index
        If None, a Markov blanket is learned for every
        column and a network is built from them. Otherwise
        only the blanket of that one column is learned
        and returned. Must not be a list.

    Returns
    -------
    *bn* : a BayesNet object (when feature_selection is None) or
    *mb* : a set of column indices, the markov blanket
        of the requested column

    Effects
    -------
    None

    Notes
    -----
    - Assumes are_independent returns True when the first
      two columns are independent given the remaining ones.
      Confirm against that helper.
    - NOTE(review): the grow-phase branching (add the best
      candidate when it is dependent on the target, and the
      runner-up as well when the lambda condition holds and
      it is dependent too) is a reconstruction of the
      apparent intent. Confirm against the reference paper.
    """
    n_rv = data.shape[1]
    # blankets are sets: the code below relies on add/remove and set difference
    Mb = dict([(rv, set()) for rv in range(n_rv)])

    if feature_selection is None:
        _T = range(n_rv)
    else:
        assert not isinstance(feature_selection, list), "feature_selection must be only one value"
        _T = [feature_selection]

    # LEARN MARKOV BLANKET
    for T in _T:

        V = set(range(n_rv)) - {T}
        Mb_change = True

        # GROWING PHASE
        while Mb_change:
            Mb_change = False
            cols = (T,) + tuple(Mb[T])
            H_tmb = entropy(data[:, cols])
            # find the two candidates in V-Mb[T] with the lowest
            # entropy(data[:,(T,X,Mb[T])]): min_x1 is the best,
            # min_x2 the runner-up
            min_val1, min_val2 = 1e7, 1e7
            min_x1, min_x2 = None, None
            for X in V - Mb[T]:
                cols = (T, X) + tuple(Mb[T])
                ent_val = entropy(data[:, cols])
                if ent_val < min_val1:
                    min_val2, min_x2 = min_val1, min_x1
                    min_val1, min_x1 = ent_val, X
                elif ent_val < min_val2:
                    min_val2, min_x2 = ent_val, X

            if min_x1 is None:
                # every other variable is already in the blanket
                break

            # if min_x1 is dependent on T given Mb[T]...
            cols = (min_x1, T) + tuple(Mb[T])
            if not are_independent(data[:, cols], alpha):
                add_both = False
                if (min_x2 is not None
                        and (min_val2 - L * min_val1) < ((1 - L) * H_tmb)):
                    # ...and the runner-up passes the lambda test and is
                    # dependent on T as well, take both at once
                    cols = (min_x2, T) + tuple(Mb[T])
                    add_both = not are_independent(data[:, cols], alpha)
                Mb[T].add(min_x1)
                if add_both:
                    Mb[T].add(min_x2)
                Mb_change = True

        # SHRINKING PHASE
        # iterate over a snapshot: a set cannot change size while iterated
        for X in list(Mb[T]):
            # if x is independent of t given Mb[T] - {x}
            cols = (X, T) + tuple(Mb[T] - {X})
            if mi_test(data[:, cols]) > alpha:
                Mb[T].remove(X)

    if feature_selection is None:
        # RESOLVE GRAPH STRUCTURE
        edge_dict = resolve_markov_blanket(Mb, data)

        # ORIENT EDGES
        # NOTE(review): sibling code calls orient_edges_MB; confirm this name exists
        oriented_edge_dict = orient_edges_Mb(edge_dict, Mb, data, alpha)

        # CREATE BAYESNET OBJECT
        value_dict = dict(zip(range(n_rv), [list(np.unique(col)) for col in data.T]))
        bn = BayesNet(oriented_edge_dict, value_dict)

        return bn
    else:
        # _T is a one-element list and cannot be used as a dict key
        return Mb[feature_selection]
Example #5
0
def lambda_iamb(data, L=1.5, alpha=0.05, feature_selection=None):
    """
    Lambda IAMB Algorithm for learning the structure of a
    Discrete Bayesian Network from data. This Algorithm
    is similar to the iamb algorithm, except that it allows
    for a "lambda" coefficient that helps avoid false positives.

    This algorithm was originally developed for use as a
    feature selection algorithm - discovering the markov
    blanket of a target variable is equivalent to discovering
    the relevant features for classifications.

    Arguments
    ---------
    *data* : a nested numpy array
        One column per random variable.

    *L* : a float
        The lambda hyperparameter. It weights the best
        candidate's entropy when deciding whether the two
        best candidates may be added together.

    *alpha* : a float
        Threshold passed to the independence tests, compared
        against the mi_test result in the shrinking phase, and
        passed to the edge orientation step.

    *feature_selection* : None or a single column index
        If None, a Markov blanket is learned for every
        column and a network is built from them. Otherwise
        only the blanket of that one column is learned
        and returned. Must not be a list.

    Returns
    -------
    *bn* : a BayesNet object (when feature_selection is None) or
    *mb* : a set of column indices, the markov blanket
        of the requested column

    Effects
    -------
    None

    Notes
    -----
    - Assumes are_independent returns True when the first
      two columns are independent given the remaining ones.
      Confirm against that helper.
    - NOTE(review): the grow-phase branching (add the best
      candidate when it is dependent on the target, and the
      runner-up as well when the lambda condition holds and
      it is dependent too) is a reconstruction of the
      apparent intent. Confirm against the reference paper.
    """
    n_rv = data.shape[1]
    # blankets are sets: the code below relies on add/remove and set difference
    Mb = dict([(rv, set()) for rv in range(n_rv)])

    if feature_selection is None:
        _T = range(n_rv)
    else:
        assert (not isinstance(feature_selection, list)
                ), 'feature_selection must be only one value'
        _T = [feature_selection]

    # LEARN MARKOV BLANKET
    for T in _T:

        V = set(range(n_rv)) - {T}
        Mb_change = True

        # GROWING PHASE
        while Mb_change:
            Mb_change = False
            cols = (T,) + tuple(Mb[T])
            H_tmb = entropy(data[:, cols])
            # find the two candidates in V-Mb[T] with the lowest
            # entropy(data[:,(T,X,Mb[T])]): min_x1 is the best,
            # min_x2 the runner-up
            min_val1, min_val2 = 1e7, 1e7
            min_x1, min_x2 = None, None
            for X in V - Mb[T]:
                cols = (T, X) + tuple(Mb[T])
                ent_val = entropy(data[:, cols])
                if ent_val < min_val1:
                    min_val2, min_x2 = min_val1, min_x1
                    min_val1, min_x1 = ent_val, X
                elif ent_val < min_val2:
                    min_val2, min_x2 = ent_val, X

            if min_x1 is None:
                # every other variable is already in the blanket
                break

            # if min_x1 is dependent on T given Mb[T]...
            cols = (min_x1, T) + tuple(Mb[T])
            if not are_independent(data[:, cols], alpha):
                add_both = False
                if (min_x2 is not None
                        and (min_val2 - L * min_val1) < ((1 - L) * H_tmb)):
                    # ...and the runner-up passes the lambda test and is
                    # dependent on T as well, take both at once
                    cols = (min_x2, T) + tuple(Mb[T])
                    add_both = not are_independent(data[:, cols], alpha)
                Mb[T].add(min_x1)
                if add_both:
                    Mb[T].add(min_x2)
                Mb_change = True

        # SHRINKING PHASE
        # iterate over a snapshot: a set cannot change size while iterated
        for X in list(Mb[T]):
            # if x is independent of t given Mb[T] - {x}
            cols = (X, T) + tuple(Mb[T] - {X})
            if mi_test(data[:, cols]) > alpha:
                Mb[T].remove(X)

    if feature_selection is None:
        # RESOLVE GRAPH STRUCTURE
        edge_dict = resolve_markov_blanket(Mb, data)

        # ORIENT EDGES
        # NOTE(review): sibling code calls orient_edges_MB; confirm this name exists
        oriented_edge_dict = orient_edges_Mb(edge_dict, Mb, data, alpha)

        # CREATE BAYESNET OBJECT
        value_dict = dict(
            zip(range(n_rv),
                [list(np.unique(col)) for col in data.T]))
        bn = BayesNet(oriented_edge_dict, value_dict)

        return bn
    else:
        # _T is a one-element list and cannot be used as a dict key
        return Mb[feature_selection]
Example #6
0
def fast_iamb(data, k=5, alpha=0.05, feature_selection=None, debug=False):
    """
    From [1]:
        "A novel algorithm for the induction of
        Markov blankets from data, called Fast-IAMB, that employs
        a heuristic to quickly recover the Markov blanket. Empirical
        results show that Fast-IAMB performs in many cases
        faster and more reliably than existing algorithms without
        adversely affecting the accuracy of the recovered Markov
        blankets."

    Arguments
    ---------
    *data* : a nested numpy array
        One column per random variable.

    *k* : a number
        Minimum average number of observations per cell
        (rows divided by the product of the cardinalities of
        the candidate, the target and the current blanket)
        required before a candidate is added in the grow phase.

    *alpha* : a float
        Threshold passed to the independence tests and
        to the edge orientation step.

    *feature_selection* : None or a single column index
        If None, a Markov blanket is learned for every
        column and a network is built from them. Otherwise
        only the blanket of that one column is learned
        and returned. Must not be a list.

    *debug* : a boolean
        Whether to print progress messages.

    Returns
    -------
    *bn* : a BayesNet object (when feature_selection is None) or
    *mb* : a list of column indices, the markov blanket
        of the requested column

    Effects
    -------
    None

    Notes
    -----
    - Assumes are_independent returns True when the first
      two columns are independent given the remaining ones,
      and that mi_test(..., test=False) returns a score where
      larger means more strongly associated. Confirm against
      those helpers.

    """
    # get values (before strings are replaced)
    value_dict = dict(
        zip(range(data.shape[1]), [list(np.unique(col)) for col in data.T]))
    # replace strings
    data = replace_strings(data)

    n_rv = data.shape[1]
    Mb = dict([(rv, []) for rv in range(n_rv)])
    N = data.shape[0]
    card = dict(zip(range(n_rv), unique_bins(data)))

    if feature_selection is None:
        _T = range(n_rv)
    else:
        assert (not isinstance(feature_selection, list)
                ), 'feature_selection must be only one value'
        _T = [feature_selection]

    # LEARN MARKOV BLANKET
    for T in _T:
        # candidates: variables marginally dependent on T.
        # Built in one pass because a set cannot be modified
        # while it is being iterated.
        S = set(A for A in range(n_rv)
                if A != T and not are_independent(data[:, (A, T)], alpha))
        while S:
            insufficient_data = False

            #### GROW PHASE ####
            # Score every candidate against the blanket as it stands now
            mi_dict = dict([(s, mi_test(data[:, (s, T) + tuple(Mb[T])],
                                        test=False))
                            for s in S])
            for x_i in sorted(mi_dict, key=mi_dict.get, reverse=True):
                # Add top-scoring variables until there isn't enough data for bins.
                # The whole product is the divisor; float() avoids integer division.
                n_cells = card[x_i] * card[T] * np.prod(
                    [card[b] for b in Mb[T]])
                if float(N) / n_cells >= k:
                    Mb[T].append(x_i)
                else:
                    insufficient_data = True
                    break

            #### SHRINK PHASE ####
            removed_vars = False
            # iterate over a snapshot so removals do not skip elements
            for A in list(Mb[T]):
                cols = (A, T) + tuple(m for m in Mb[T] if m != A)
                # if A is independent of T given Mb[T] - {A}, remove A
                if are_independent(data[:, cols], alpha):
                    Mb[T].remove(A)
                    removed_vars = True

            #### FINALIZE BLANKET FOR "T" OR MAKE ANOTHER PASS ####
            if insufficient_data and not removed_vars:
                if debug:
                    print('Breaking..')
                break
            # next candidates: variables outside the blanket that are
            # still dependent on T given the blanket
            remaining = set(range(n_rv)) - {T} - set(Mb[T])
            S = set(a for a in remaining
                    if not are_independent(data[:, (a, T) + tuple(Mb[T])],
                                           alpha))
        if debug:
            print('Done with %s' % T)

    if feature_selection is None:
        # RESOLVE GRAPH STRUCTURE
        edge_dict = resolve_markov_blanket(Mb, data)

        # ORIENT EDGES
        oriented_edge_dict = orient_edges_MB(edge_dict, Mb, data, alpha)

        # CREATE BAYESNET OBJECT
        bn = BayesNet(oriented_edge_dict, value_dict)

        return bn
    else:
        # _T is a one-element list and cannot be used as a dict key
        return Mb[feature_selection]