import numpy as np
from scipy import stats


def chi2_test(data):
    """
	Test null hypothesis that P(X,Y,Z) = P(Z)P(X|Z)P(Y|Z)
	versus empirically observed P(X,Y,Z) in the data using
	the traditional chi-square test based on observed versus
	expected frequency bins.

	Steps
		- Calculate P(XYZ) empirically and expected
		- Compute ddof
		- Perform one-way chi-square test

	Arguments
	---------
	*data* : a nested numpy array
		The data from which to learn - must have at least three
		variables. All conditioned variables (i.e. Z) are compressed
		into one variable.

	Returns
	-------
	*chi2_statistic* : a float
		Chisquare statistic
	*p_val* : a float
		The pvalue from the chi2 and ddof

	Effects
	-------
	None

	Notes
	-----
	- Assuming for now that |Z| = 1... generalize later
	- Should generalize to let data be a Pandas DataFrame --> would
	encourage external use.

	"""
    # compress extra Z variables at the start.. not implemented yet
    #bins = np.amax(data, axis=0)+1
    bins = unique_bins(data)
    hist, _ = np.histogramdd(data, bins=bins)

    Pxyz = hist / hist.sum()  # joint probability distribution over X,Y,Z

    Pz = np.sum(Pxyz, axis=(0, 1))  # P(Z)
    Pxz = np.sum(Pxyz, axis=1)  # P(X,Z)
    Pyz = np.sum(Pxyz, axis=0)  # P(Y,Z)

    Px_z = Pxz / (Pz + 1e-7)  # P(X | Z) = P(X,Z) / P(Z)
    Py_z = Pyz / (Pz + 1e-7)  # P(Y | Z) = P(Y,Z) / P(Z)

    observed_dist = Pxyz  # Empirical distribution
    # expected distribution under the null: P(Z)P(X|Z)P(Y|Z)
    Px_y_z = np.empty(Pxyz.shape)
    for i in xrange(bins[0]):
        for j in xrange(bins[1]):
            for k in xrange(bins[2]):
                Px_y_z[i][j][k] = Px_z[i][k] * Py_z[j][k]
    Px_y_z *= Pz  # broadcast P(Z) along the last (Z) axis

    observed = observed_dist.flatten() * len(data)
    expected = Px_y_z.flatten() * len(data)

    # scipy's chisquare treats ddof as an adjustment: dof = k - 1 - ddof,
    # so convert the target dof (|X|-1)(|Y|-1)|Z| into that adjustment
    dof = (bins[0] - 1) * (bins[1] - 1) * bins[2]
    ddof = len(observed) - 1 - dof
    chi2_statistic, p_val = stats.chisquare(observed, expected, ddof=ddof)

    return chi2_statistic, p_val
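
# ``unique_bins`` is called throughout this listing but is not defined here. A
# minimal sketch consistent with how it is used (one bin count per column,
# suitable both for np.histogramdd and as variable cardinalities) is given
# below. This is an assumption about its behaviour, not the library's actual
# implementation; it presumes the data are non-negative integer codes.
def unique_bins(data):
    """Return the number of distinct values in each column of `data`."""
    if data.ndim == 1:
        return np.array([len(np.unique(data))])
    return np.array([len(np.unique(data[:, c])) for c in range(data.shape[1])])

# Hypothetical usage of chi2_test on synthetic discrete data in which X and Y
# are conditionally independent given Z, so the null should rarely be rejected:
#   Z = np.random.randint(0, 2, size=1000)
#   X = (Z + np.random.randint(0, 2, size=1000)) % 2
#   Y = (Z + np.random.randint(0, 2, size=1000)) % 2
#   chi2_stat, p_val = chi2_test(np.column_stack((X, Y, Z)))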
def entropy(data):
    """
	In the context of structure learning, and more specifically
	in constraint-based algorithms which rely on the mutual information
	test for conditional independence, it has been proven that the variable
	X in a set which MAXIMIZES mutual information is also the variable which
	MINIMIZES entropy. This fact can be used to reduce the computational
	requirements of tests based on the following relationship:

		Entropy is related to marginal mutual information as follows:
			MI(X;Y) = H(X) - H(X|Y)

		Entropy is related to conditional mutual information as follows:
			MI(X;Y|Z) = H(X|Z) - H(X|Y,Z)

		For one variable, H(X) is equal to the following:
			-1 * sum over x of p(x) * log(p(x))

		For two variables, H(X|Y) is equal to the following:
			sum over x,y of p(x,y) * log(p(y)/p(x,y))

		For three variables, H(X|Y,Z) is equal to the following:
			-1 * sum over x,y,z of p(x,y,z) * log(p(x|y,z)),
				where p(x|y,z) = p(x,y,z)/p(y,z)
	Arguments
	----------
	*data* : a nested numpy array
		The data from which to learn - must have at least three
		variables. All conditioned variables (i.e. Z) are compressed
		into one variable.

	Returns
	-------
	*H* : entropy value

	"""
    try:
        cols = data.shape[1]
    except IndexError:
        cols = 1

    #bins = np.amax(data,axis=0)
    bins = unique_bins(data)

    if cols == 1:
        hist, _ = np.histogramdd(data, bins=(bins))  # frequency counts
        Px = hist / hist.sum()
        Px += 1e-7  # avoid log(0), consistent with the other branches
        H = -1 * np.sum(Px * np.log(Px))

    elif cols == 2:  # two variables -> assume X then Y
        hist, _ = np.histogramdd(data, bins=bins[0:2])  # frequency counts

        Pxy = hist / hist.sum()  # joint probability distribution over X,Y
        Py = np.sum(Pxy, axis=0)  # P(Y)
        Py += 1e-7
        Pxy += 1e-7
        H = np.sum(Pxy * np.log(Py / Pxy))

    else:
        # CHECK FOR > 3 COLUMNS -> concatenate Z into one column
        if cols > 3:
            data = data.astype('str')
            ncols = len(bins)
            for i in xrange(len(data)):
                data[i, 2] = ''.join(data[i, 2:ncols])
            data = data.astype('int')[:, 0:3]

        bins = unique_bins(data)  # recompute bins after compressing Z
        hist, _ = np.histogramdd(data, bins=bins)  # frequency counts

        Pxyz = hist / hist.sum()  # joint probability distribution over X,Y,Z
        Pyz = np.sum(Pxyz, axis=0)

        Pxyz += 1e-7  # for log -inf
        Pyz += 1e-7
        H = -1 * np.sum(Pxyz * np.log(Pxyz)) + np.sum(Pyz * np.log(Pyz))

    return round(H, 4)
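
# A quick numerical check of the identity quoted in the docstring above,
# MI(X;Y) = H(X) - H(X|Y), using entropy() together with mutual_information()
# defined below. This is an illustrative sketch added for this listing, not
# part of the original module; it assumes two columns of integer codes.
def check_mi_entropy_identity(data_xy):
    h_x = entropy(data_xy[:, 0])    # H(X) from the single-variable branch
    h_x_given_y = entropy(data_xy)  # H(X|Y) from the two-variable branch
    mi_direct = mutual_information(data_xy)
    # the two values should agree up to rounding and the 1e-7 smoothing terms
    return mi_direct, h_x - h_x_given_y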
def mutual_information(data, conditional=False):
    #bins = np.amax(data, axis=0)+1 # read levels for each variable
    bins = unique_bins(data)
    if len(bins) == 1:
        hist, _ = np.histogramdd(data, bins=(bins))  # frequency counts
        Px = hist / hist.sum()
        Px += 1e-7  # avoid log(0)
        MI = -1 * np.sum(Px * np.log(Px))  # single variable: reduces to H(X)
        return round(MI, 4)

    if len(bins) == 2:
        hist, _ = np.histogramdd(data, bins=bins[0:2])  # frequency counts

        Pxy = hist / hist.sum()  # joint probability distribution over X,Y
        Px = np.sum(Pxy, axis=1)  # P(X)
        Py = np.sum(Pxy, axis=0)  # P(Y)

        PxPy = np.outer(Px, Py)
        Pxy += 1e-7
        PxPy += 1e-7
        MI = np.sum(Pxy * np.log(Pxy / (PxPy)))
        return round(MI, 4)
    elif len(bins) > 2 and conditional == True:
        # CHECK FOR > 3 COLUMNS -> concatenate Z into one column
        if len(bins) > 3:
            data = data.astype('str')
            ncols = len(bins)
            for i in xrange(len(data)):
                data[i, 2] = ''.join(data[i, 2:ncols])
            data = data.astype('int')[:, 0:3]

        bins = unique_bins(data)  # recompute bins after compressing Z
        hist, _ = np.histogramdd(data, bins=bins)  # frequency counts

        Pxyz = hist / hist.sum()  # joint probability distribution over X,Y,Z
        Pz = np.sum(Pxyz, axis=(0, 1))  # P(Z)
        Pxz = np.sum(Pxyz, axis=1)  # P(X,Z)
        Pyz = np.sum(Pxyz, axis=0)  # P(Y,Z)

        Pxy_z = Pxyz / (Pz + 1e-7)  # P(X,Y | Z) = P(X,Y,Z) / P(Z)
        Px_z = Pxz / (Pz + 1e-7)  # P(X | Z) = P(X,Z) / P(Z)
        Py_z = Pyz / (Pz + 1e-7)  # P(Y | Z) = P(Y,Z) / P(Z)

        Px_y_z = np.empty((Pxy_z.shape))  # P(X|Z)P(Y|Z)
        for i in xrange(bins[0]):
            for j in xrange(bins[1]):
                for k in xrange(bins[2]):
                    Px_y_z[i][j][k] = Px_z[i][k] * Py_z[j][k]
        Pxyz += 1e-7
        Pxy_z += 1e-7
        Px_y_z += 1e-7
        MI = np.sum(Pxyz * np.log(Pxy_z / (Px_y_z)))

        return round(MI, 4)
    elif len(bins) > 2 and conditional == False:
        data = data.astype('str')
        ncols = len(bins)
        for i in xrange(len(data)):
            data[i, 1] = ''.join(data[i, 1:ncols])
        data = data.astype('int')[:, 0:2]
        bins = unique_bins(data)  # recompute bins after merging the remaining columns into Y

        hist, _ = np.histogramdd(data, bins=bins[0:2])  # frequency counts

        Pxy = hist / hist.sum()  # joint probability distribution over X,Y
        Px = np.sum(Pxy, axis=1)  # P(X)
        Py = np.sum(Pxy, axis=0)  # P(Y)

        PxPy = np.outer(Px, Py)
        Pxy += 1e-7
        PxPy += 1e-7
        MI = np.sum(Pxy * np.log(Pxy / (PxPy)))
        return round(MI, 4)
def fast_iamb(data, k=5, alpha=0.05, feature_selection=None, debug=False):
	"""
	From [1]:
		"A novel algorithm for the induction of
		Markov blankets from data, called Fast-IAMB, that employs
		a heuristic to quickly recover the Markov blanket. Empirical
		results show that Fast-IAMB performs in many cases
		faster and more reliably than existing algorithms without
		adversely affecting the accuracy of the recovered Markov
		blankets."

	Arguments
	---------
	*data* : a nested numpy array

	*k* : an integer
		The max number of edges to add at each iteration of 
		the algorithm.

	*alpha* : a float
		Probability of Type I error

	Returns
	-------
	*bn* : a BayesNet object

	Effects
	-------
	None

	Notes
	-----
	- Currently does not work. I think it's stuck in an infinite loop...

	"""
	# get values
	value_dict = dict(zip(range(data.shape[1]),
			[list(np.unique(col)) for col in data.T]))
	# replace strings
	data = replace_strings(data)

	n_rv = data.shape[1]
	Mb = dict([(rv,[]) for rv in range(n_rv)])
	N = data.shape[0]
	card = dict(zip(range(n_rv),unique_bins(data)))
	#card = dict(zip(range(data.shape[1]),np.amax(data,axis=0)))

	if feature_selection is None:
		_T = range(n_rv)
	else:
		assert (not isinstance(feature_selection, list)), 'feature_selection must be only one value'
		_T = [feature_selection]
	# LEARN MARKOV BLANKET
	for T in _T:
		S = set(range(n_rv)) - {T}
		for A in list(S):  # iterate over a copy; S is modified inside the loop
			if not are_independent(data[:,(A,T)]):
				S.remove(A)
		s_h_dict = dict([(s,0) for s in S])
		while S:
			insufficient_data = False
			break_grow_phase = False
			
			#### GROW PHASE ####
			# Calculate mutual information for all variables
			mi_dict = dict([(s,mi_test(data[:,(s,T)+tuple(Mb[T])])) for s in S])
			for x_i in sorted(mi_dict, key=mi_dict.get,reverse=True):
				# Add top MI-score variables until there isn't enough data for bins
				if N / (card[x_i] * card[T] * np.prod([card[b] for b in Mb[T]])) >= k:
					Mb[T].append(x_i)
				else:
					insufficient_data = True
					break

			#### SHRINK PHASE ####
			removed_vars = False
			for A in list(Mb[T]):  # iterate over a copy; Mb[T] is modified inside the loop
				cols = (A,T) + tuple(set(Mb[T]) - {A})
				# if A is independent of T given Mb[T], remove A
				if are_independent(data[:,cols]):
					Mb[T].remove(A)
					removed_vars=True

			#### FINALIZE BLANKET FOR "T" OR MAKE ANOTHER PASS ####
			if insufficient_data and not removed_vars:
				if debug:
					print 'Breaking..'
				break
			else:
				A = set(range(n_rv)) - {T} - set(Mb[T])
				#A = set(nodes) - {T} - set(Mb[T])
				S = set()
				for a in A:
					cols = (a,T) + tuple(Mb[T])
					if are_independent(data[:,cols]):
						S.add(a)
		if debug:
			print 'Done with %s' % T
	
	if feature_selection is None:
		# RESOLVE GRAPH STRUCTURE
		edge_dict = resolve_markov_blanket(Mb, data)

		# ORIENT EDGES
		oriented_edge_dict = orient_edges_MB(edge_dict,Mb,data,alpha)

		# CREATE BAYESNET OBJECT
		bn=BayesNet(oriented_edge_dict,value_dict)

		return bn
	else:
		return Mb[feature_selection]
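
# The grow phase above only adds a candidate variable while the data can
# support the enlarged contingency table: N divided by the product of the
# cardinalities of T, the candidate, and the current blanket must stay >= k
# samples per cell. A small standalone illustration of that heuristic
# (hypothetical numbers, not tied to any dataset):
def enough_data_for_blanket(N, cards, k=5):
    # average number of samples per cell of the joint contingency table
    return N / float(np.prod(cards)) >= k
# e.g. enough_data_for_blanket(5000, [3, 3, 3], k=5) -> True (~185 samples/cell)

# Hypothetical usage of fast_iamb, assuming the helpers it calls
# (replace_strings, are_independent, mi_test, resolve_markov_blanket,
# orient_edges_MB, BayesNet) are importable from the same package -- they are
# not shown in this listing:
#   data = np.random.randint(0, 3, size=(5000, 4))
#   mb_of_col0 = fast_iamb(data, k=5, alpha=0.05, feature_selection=0)  # Markov blanket only
#   bn = fast_iamb(data, k=5, alpha=0.05)                               # full BayesNet object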
def mi_test(data, test=True):
    """
	This function performs the mutual information (cross entropy)-based
	independence test. With exactly two columns it performs the MARGINAL
	test; with three or more columns it performs the CONDITIONAL test,
	where every column after the second is treated as part of the
	conditioning set Z.

	We use the maximum likelihood estimators as probabilities. The
	mutual information value is computed, then the
	chi-square test is used, with degrees of freedom equal to
	(|X|-1) * (|Y|-1) * prod over z in Z of |z|.

	This function works on datasets that contain MORE than three
	columns by concatenating the extra columns into one. For that
	reason, it is a little slower in that case.

	For two variables only, this reduces to the MARGINAL independence
	test, which is the same as calculating the KL divergence between the
	joint distribution and the product of the marginals, i.e.
		I(X;Y) = sum over x,y of p(x,y) * log( p(x,y) / (p(x)*p(y)) )

	NOTE: pval < 0.05 means DEPENDENCE, pval > 0.05 means INDEPENDENCE.
	In other words, the pval represents the probability that this
	relationship could have arisen by chance. If the pval is very small,
	it means the two variables are likely dependent on one another.

	Steps:
		- Calculate the marginal/conditional probabilities
		- Compute the Mutual Information value
		- Calculate chi2 statistic = 2*N*MI
		- Compute the degrees of freedom
		- Compute the chi square p-value

	Arguments
	----------
	*data* : a nested numpy array
		The data from which to learn - must have at least two
		variables. All conditioning variables (i.e. Z) are compressed
		into one variable.

	Returns
	-------
	*p_val* : a float
		The pvalue from the chi2 and ddof

	Effects
	-------
	None

	Notes
	-----
	- Doesn't currently work with strings... 
	- Should generalize to let data be a Pandas DataFrame --> would
	encourage external use.

	"""

    #bins = np.amax(data, axis=0)+1 # read levels for each variable
    bins = unique_bins(data)
    if len(bins) == 2:
        hist, _ = np.histogramdd(data, bins=bins[0:2])  # frequency counts

        #Pxy = hist / hist.sum()# joint probability distribution over X,Y,Z
        Pxy = hist / data.shape[0]
        Px = np.sum(Pxy, axis=1)  # P(X)
        Py = np.sum(Pxy, axis=0)  # P(Y)

        PxPy = np.outer(Px, Py)
        Pxy += 1e-7
        PxPy += 1e-7
        MI = np.sum(Pxy * np.log(Pxy / (PxPy)))
        if not test:
            return round(MI, 4)
        else:
            chi2_statistic = 2 * len(data) * MI
            ddof = (bins[0] - 1) * (bins[1] - 1)
            p_val = stats.chi2.sf(chi2_statistic, ddof)  # upper-tail p-value
            return round(p_val, 4)
    else:
        # CHECK FOR > 3 COLUMNS -> concatenate Z into one column
        if len(bins) > 3:
            data = data.astype('str')
            ncols = len(bins)
            for i in xrange(len(data)):
                data[i, 2] = ''.join(data[i, 2:ncols])
            data = data.astype('int')[:, 0:3]

        #bins = np.amax(data,axis=0)
        bins = unique_bins(data)
        hist, _ = np.histogramdd(data, bins=bins)  # frequency counts

        #Pxyz = hist / hist.sum()# joint probability distribution over X,Y,Z
        Pxyz = hist / data.shape[0]
        Pz = np.sum(Pxyz, axis=(0, 1))  # P(Z)
        Pxz = np.sum(Pxyz, axis=1)  # P(X,Z)
        Pyz = np.sum(Pxyz, axis=0)  # P(Y,Z)

        Pxy_z = Pxyz / (Pz + 1e-7)  # P(X,Y | Z) = P(X,Y,Z) / P(Z)
        Px_z = Pxz / (Pz + 1e-7)  # P(X | Z) = P(X,Z) / P(Z)
        Py_z = Pyz / (Pz + 1e-7)  # P(Y | Z) = P(Y,Z) / P(Z)

        Px_y_z = np.empty((Pxy_z.shape))  # P(X|Z)P(Y|Z)
        for i in xrange(bins[0]):
            for j in xrange(bins[1]):
                for k in xrange(bins[2]):
                    Px_y_z[i][j][k] = Px_z[i][k] * Py_z[j][k]
        Pxyz += 1e-7
        Pxy_z += 1e-7
        Px_y_z += 1e-7
        MI = np.sum(Pxyz * np.log(Pxy_z / (Px_y_z)))
        if not test:
            return round(MI, 4)
        else:
            chi2_statistic = 2 * len(data) * MI
            ddof = (bins[0] - 1) * (bins[1] - 1) * bins[2]
            p_val = stats.chi2.sf(chi2_statistic, ddof)  # upper-tail p-value
            return round(p_val, 4)
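
# The docstring above notes that the test statistic is 2*N*MI. For the
# two-column (marginal) case this is exactly the G-test (log-likelihood ratio)
# statistic, which scipy can compute directly from a contingency table via
# chi2_contingency(..., lambda_="log-likelihood"). A small cross-check sketch,
# added for this listing; agreement is approximate because mi_test smooths the
# probabilities by 1e-7 and rounds its output:
def g_statistic_from_table(data_xy):
    # X-by-Y contingency table of raw counts
    table, _, _ = np.histogram2d(data_xy[:, 0], data_xy[:, 1],
                                 bins=[int(b) for b in unique_bins(data_xy)])
    g, p, dof, _ = stats.chi2_contingency(table, correction=False,
                                          lambda_="log-likelihood")
    return g, p, dof
# g should be close to 2 * len(data_xy) * mi_test(data_xy, test=False)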