Ejemplo n.º 1
0
def gs(data, alpha=0.05, feature_selection=None, debug=False):
    """
    Learn a Bayesian network structure with the grow-shrink algorithm.

    STEPS
    -----
    1. Compute the Markov blanket of every variable (or only of
       *feature_selection* when it is given).
    2. Resolve the blankets into an unoriented edge dict
       (``resolve_markov_blanket``).
    3. Orient the edges (``orient_edges_MB``).

    Arguments
    ---------
    *data* : a nested numpy array
        Data from which you wish to learn structure; one column per
        random variable.

    *alpha* : a float
        Type I error rate for the independence test. Two variables are
        treated as dependent when ``mi_test`` returns a value below
        *alpha*.

    *feature_selection* : None or a single column index
        When given, only the Markov blanket of that column is computed
        and returned; no network is built. Must not be a list.

    *debug* : a boolean
        When true, intermediate results are printed.

    Returns
    -------
    *bn* : a BayesNet object
        When *feature_selection* is None.

    *mb* : a list of column indices
        The Markov blanket of *feature_selection* otherwise.

    Effects
    -------
    Prints to stdout when *debug* is true.

    Notes
    -----
    Speed Test (original author's measurement):
        *** 5 variables, 624 observations ***
        - 63.7 ms

    """
    n_rv = data.shape[1]
    data, value_dict = replace_strings(data, return_values=True)

    if feature_selection is None:
        targets = range(n_rv)
    else:
        assert (not isinstance(feature_selection, list)
                ), 'feature_selection must be only one value'
        targets = [feature_selection]

    # STEP 1 : COMPUTE MARKOV BLANKETS
    Mb = {rv: [] for rv in range(n_rv)}

    for X in targets:
        S = []

        # GROW: keep sweeping over the variables until a full sweep adds
        # nothing new to S.
        grow_condition = True
        while grow_condition:
            grow_condition = False
            for Y in range(n_rv):
                if X != Y and Y not in S:
                    # Y is dependent on X given the current S -> add Y to S
                    cols = (X, Y) + tuple(S)
                    pval = mi_test(data[:, cols])
                    if pval < alpha:  # dependent
                        grow_condition = True  # S changed -> sweep again
                        S.append(Y)

        # SHRINK: drop every Y that is independent of X given S - {Y}.
        # S must be replaced by the survivors after each pass; otherwise
        # the next pass would repeat the same tests on the same S and
        # the loop would never end once anything had been dropped.
        shrink_condition = True
        while shrink_condition:
            kept = []
            shrink_condition = False
            for Y in S:
                rest = [Z for Z in S if Z != Y]  # condition on S - {Y}
                cols = (X, Y) + tuple(rest)
                pval = mi_test(data[:, cols])
                if pval < alpha:  # dependent -> keep it in
                    kept.append(Y)
                else:  # independent -> leave it out and re-check the rest
                    shrink_condition = True
            S = kept

        Mb[X] = S
        if debug:
            print('Markov Blanket for %s : %s' % (X, str(S)))

    if feature_selection is not None:
        # Index with the single requested column, not the one-element
        # list wrapping it (a list is not a valid dict key).
        return Mb[feature_selection]

    # STEP 2: COMPUTE GRAPH STRUCTURE
    # i.e. Resolve Markov Blanket
    edge_dict = resolve_markov_blanket(Mb, data)
    if debug:
        print('Unoriented edge dict:\n %s' % str(edge_dict))

    # STEP 3: ORIENT EDGES
    oriented_edge_dict = orient_edges_MB(edge_dict, Mb, data, alpha)
    if debug:
        print('Oriented edge dict:\n %s' % str(oriented_edge_dict))

    # CREATE BAYESNET OBJECT
    bn = BayesNet(oriented_edge_dict, value_dict)

    return bn
Ejemplo n.º 2
0
def fast_iamb(data, k=5, alpha=0.05, feature_selection=None, debug=False):
    """
    Learn Markov blankets (and optionally a network) with Fast-IAMB.

    From [1]:
        "A novel algorithm for the induction of
        Markov blankets from data, called Fast-IAMB, that employs
        a heuristic to quickly recover the Markov blanket. Empirical
        results show that Fast-IAMB performs in many cases
        faster and more reliably than existing algorithms without
        adversely affecting the accuracy of the recovered Markov
        blankets."

    Arguments
    ---------
    *data* : a nested numpy array
        One column per random variable.

    *k* : a number
        Minimum average number of observations per contingency-table
        cell. A candidate is added to the blanket only while
        N / (card[candidate] * card[T] * prod(card[blanket])) >= k.

    *alpha* : a float
        Probability of Type I error. In this function it is only
        forwarded to ``orient_edges_MB``.

    *feature_selection* : None or a single column index
        When given, only the Markov blanket of that column is computed
        and returned; no network is built. Must not be a list.

    *debug* : a boolean
        When true, progress messages are printed.

    Returns
    -------
    *bn* : a BayesNet object
        When *feature_selection* is None.

    *mb* : a list of column indices
        The Markov blanket of *feature_selection* otherwise.

    Effects
    -------
    Prints to stdout when *debug* is true.

    Notes
    -----
    - Candidates for the blanket are the variables that test as
      *dependent* on the target. Selecting the independent ones instead
      makes grow and shrink undo each other indefinitely.
    - NOTE(review): ``are_independent`` is called without *alpha*, so it
      uses whatever threshold it defaults to -- confirm this is intended.

    """
    # get values
    value_dict = dict(zip(range(data.shape[1]),
                          [list(np.unique(col)) for col in data.T]))
    # replace strings
    data = replace_strings(data)

    n_rv = data.shape[1]
    N = data.shape[0]
    Mb = {rv: [] for rv in range(n_rv)}
    card = dict(zip(range(n_rv), unique_bins(data)))

    if feature_selection is None:
        targets = range(n_rv)
    else:
        assert (not isinstance(feature_selection, list)
                ), 'feature_selection must be only one value'
        targets = [feature_selection]

    # LEARN MARKOV BLANKET
    for T in targets:
        others = [A for A in range(n_rv) if A != T]
        # Initial candidates: variables marginally dependent on T.
        # Built as a new set rather than by removing from a set while
        # iterating over it, which raises RuntimeError.
        S = set(A for A in others if not are_independent(data[:, (A, T)]))

        while S:
            insufficient_data = False

            #### GROW PHASE ####
            # Score every candidate against T given the current blanket.
            scores = dict((s, mi_test(data[:, (s, T) + tuple(Mb[T])]))
                          for s in S)
            # NOTE(review): assumes mi_test returns a p-value, so the
            # smallest value is the strongest dependence -- confirm.
            for x_i in sorted(scores, key=scores.get):
                # Add candidates until there isn't enough data per cell.
                # The whole product is the divisor; float() avoids
                # Python 2 integer truncation.
                n_cells = (card[x_i] * card[T] *
                           np.prod([card[b] for b in Mb[T]]))
                if float(N) / n_cells >= k:
                    Mb[T].append(x_i)
                else:
                    insufficient_data = True
                    break

            #### SHRINK PHASE ####
            removed_vars = False
            # Iterate over a snapshot: removing from the list being
            # iterated would skip the element after each removal.
            for A in list(Mb[T]):
                rest = [b for b in Mb[T] if b != A]
                cols = (A, T) + tuple(rest)
                # if A is independent of T given Mb[T] - {A}, remove A
                if are_independent(data[:, cols]):
                    Mb[T].remove(A)
                    removed_vars = True

            #### FINALIZE BLANKET FOR "T" OR MAKE ANOTHER PASS ####
            if insufficient_data and not removed_vars:
                if debug:
                    print('Breaking..')
                break

            # Next candidates: variables outside the blanket that are
            # still dependent on T given the blanket.
            S = set()
            for a in others:
                if a in Mb[T]:
                    continue
                cols = (a, T) + tuple(Mb[T])
                if not are_independent(data[:, cols]):
                    S.add(a)
        if debug:
            print('Done with %s' % T)

    if feature_selection is not None:
        # Index with the single requested column, not the one-element
        # list wrapping it (a list is not a valid dict key).
        return Mb[feature_selection]

    # RESOLVE GRAPH STRUCTURE
    edge_dict = resolve_markov_blanket(Mb, data)

    # ORIENT EDGES
    oriented_edge_dict = orient_edges_MB(edge_dict, Mb, data, alpha)

    # CREATE BAYESNET OBJECT
    bn = BayesNet(oriented_edge_dict, value_dict)

    return bn
Ejemplo n.º 3
0
def gs(data, alpha=0.05, feature_selection=None, debug=False):
    """
    Learn a Bayesian network structure with the grow-shrink algorithm.

    STEPS
    -----
    1. Compute the Markov blanket of every variable (or only of
       *feature_selection* when it is given).
    2. Resolve the blankets into an unoriented edge dict
       (``resolve_markov_blanket``).
    3. Orient the edges (``orient_edges_MB``).

    Arguments
    ---------
    *data* : a nested numpy array
        Data from which you wish to learn structure; one column per
        random variable.

    *alpha* : a float
        Type I error rate for the independence test. Two variables are
        treated as dependent when ``mi_test`` returns a value below
        *alpha*.

    *feature_selection* : None or a single column index
        When given, only the Markov blanket of that column is computed
        and returned; no network is built. Must not be a list.

    *debug* : a boolean
        When true, intermediate results are printed.

    Returns
    -------
    *bn* : a BayesNet object
        When *feature_selection* is None.

    *mb* : a list of column indices
        The Markov blanket of *feature_selection* otherwise.

    Effects
    -------
    Prints to stdout when *debug* is true.

    Notes
    -----
    Speed Test (original author's measurement):
        *** 5 variables, 624 observations ***
        - 63.7 ms

    """
    n_rv = data.shape[1]
    data, value_dict = replace_strings(data, return_values=True)

    if feature_selection is None:
        targets = range(n_rv)
    else:
        assert not isinstance(feature_selection, list), "feature_selection must be only one value"
        targets = [feature_selection]

    # STEP 1 : COMPUTE MARKOV BLANKETS
    Mb = {rv: [] for rv in range(n_rv)}

    for X in targets:
        S = []

        # GROW: keep sweeping over the variables until a full sweep adds
        # nothing new to S.
        grow_condition = True
        while grow_condition:
            grow_condition = False
            for Y in range(n_rv):
                if X != Y and Y not in S:
                    # Y is dependent on X given the current S -> add Y to S
                    cols = (X, Y) + tuple(S)
                    pval = mi_test(data[:, cols])
                    if pval < alpha:  # dependent
                        grow_condition = True  # S changed -> sweep again
                        S.append(Y)

        # SHRINK: drop every Y that is independent of X given S - {Y}.
        # S must be replaced by the survivors after each pass; otherwise
        # the next pass would repeat the same tests on the same S and
        # the loop would never end once anything had been dropped.
        shrink_condition = True
        while shrink_condition:
            kept = []
            shrink_condition = False
            for Y in S:
                rest = [Z for Z in S if Z != Y]  # condition on S - {Y}
                cols = (X, Y) + tuple(rest)
                pval = mi_test(data[:, cols])
                if pval < alpha:  # dependent -> keep it in
                    kept.append(Y)
                else:  # independent -> leave it out and re-check the rest
                    shrink_condition = True
            S = kept

        Mb[X] = S
        if debug:
            print("Markov Blanket for %s : %s" % (X, str(S)))

    if feature_selection is not None:
        # Index with the single requested column, not the one-element
        # list wrapping it (a list is not a valid dict key).
        return Mb[feature_selection]

    # STEP 2: COMPUTE GRAPH STRUCTURE
    # i.e. Resolve Markov Blanket
    edge_dict = resolve_markov_blanket(Mb, data)
    if debug:
        print("Unoriented edge dict:\n %s" % str(edge_dict))

    # STEP 3: ORIENT EDGES
    oriented_edge_dict = orient_edges_MB(edge_dict, Mb, data, alpha)
    if debug:
        print("Oriented edge dict:\n %s" % str(oriented_edge_dict))

    # CREATE BAYESNET OBJECT
    bn = BayesNet(oriented_edge_dict, value_dict)

    return bn
Ejemplo n.º 4
0
def fast_iamb(data, k=5, alpha=0.05, feature_selection=None, debug=False):
    """
    Learn Markov blankets (and optionally a network) with Fast-IAMB.

    From [1]:
        "A novel algorithm for the induction of
        Markov blankets from data, called Fast-IAMB, that employs
        a heuristic to quickly recover the Markov blanket. Empirical
        results show that Fast-IAMB performs in many cases
        faster and more reliably than existing algorithms without
        adversely affecting the accuracy of the recovered Markov
        blankets."

    Arguments
    ---------
    *data* : a nested numpy array
        One column per random variable.

    *k* : a number
        Minimum average number of observations per contingency-table
        cell. A candidate is added to the blanket only while
        N / (card[candidate] * card[T] * prod(card[blanket])) >= k.

    *alpha* : a float
        Probability of Type I error. In this function it is only
        forwarded to ``orient_edges_MB``.

    *feature_selection* : None or a single column index
        When given, only the Markov blanket of that column is computed
        and returned; no network is built. Must not be a list.

    *debug* : a boolean
        When true, progress messages are printed.

    Returns
    -------
    *bn* : a BayesNet object
        When *feature_selection* is None.

    *mb* : a list of column indices
        The Markov blanket of *feature_selection* otherwise.

    Effects
    -------
    Prints to stdout when *debug* is true.

    Notes
    -----
    - Candidates for the blanket are the variables that test as
      *dependent* on the target. Selecting the independent ones instead
      makes grow and shrink undo each other indefinitely.
    - NOTE(review): ``are_independent`` is called without *alpha*, so it
      uses whatever threshold it defaults to -- confirm this is intended.

    """
    # get values
    value_dict = dict(
        zip(range(data.shape[1]), [list(np.unique(col)) for col in data.T]))
    # replace strings
    data = replace_strings(data)

    n_rv = data.shape[1]
    N = data.shape[0]
    Mb = {rv: [] for rv in range(n_rv)}
    card = dict(zip(range(n_rv), unique_bins(data)))

    if feature_selection is None:
        targets = range(n_rv)
    else:
        assert (not isinstance(feature_selection, list)
                ), 'feature_selection must be only one value'
        targets = [feature_selection]

    # LEARN MARKOV BLANKET
    for T in targets:
        others = [A for A in range(n_rv) if A != T]
        # Initial candidates: variables marginally dependent on T.
        # Built as a new set rather than by removing from a set while
        # iterating over it, which raises RuntimeError.
        S = set(A for A in others if not are_independent(data[:, (A, T)]))

        while S:
            insufficient_data = False

            #### GROW PHASE ####
            # Score every candidate against T given the current blanket.
            scores = dict((s, mi_test(data[:, (s, T) + tuple(Mb[T])]))
                          for s in S)
            # NOTE(review): assumes mi_test returns a p-value, so the
            # smallest value is the strongest dependence -- confirm.
            for x_i in sorted(scores, key=scores.get):
                # Add candidates until there isn't enough data per cell.
                # The whole product is the divisor; float() avoids
                # Python 2 integer truncation.
                n_cells = (card[x_i] * card[T] *
                           np.prod([card[b] for b in Mb[T]]))
                if float(N) / n_cells >= k:
                    Mb[T].append(x_i)
                else:
                    insufficient_data = True
                    break

            #### SHRINK PHASE ####
            removed_vars = False
            # Iterate over a snapshot: removing from the list being
            # iterated would skip the element after each removal.
            for A in list(Mb[T]):
                rest = [b for b in Mb[T] if b != A]
                cols = (A, T) + tuple(rest)
                # if A is independent of T given Mb[T] - {A}, remove A
                if are_independent(data[:, cols]):
                    Mb[T].remove(A)
                    removed_vars = True

            #### FINALIZE BLANKET FOR "T" OR MAKE ANOTHER PASS ####
            if insufficient_data and not removed_vars:
                if debug:
                    print('Breaking..')
                break

            # Next candidates: variables outside the blanket that are
            # still dependent on T given the blanket.
            S = set()
            for a in others:
                if a in Mb[T]:
                    continue
                cols = (a, T) + tuple(Mb[T])
                if not are_independent(data[:, cols]):
                    S.add(a)
        if debug:
            print('Done with %s' % T)

    if feature_selection is not None:
        # Index with the single requested column, not the one-element
        # list wrapping it (a list is not a valid dict key).
        return Mb[feature_selection]

    # RESOLVE GRAPH STRUCTURE
    edge_dict = resolve_markov_blanket(Mb, data)

    # ORIENT EDGES
    oriented_edge_dict = orient_edges_MB(edge_dict, Mb, data, alpha)

    # CREATE BAYESNET OBJECT
    bn = BayesNet(oriented_edge_dict, value_dict)

    return bn