Exemple #1
0
	def test_should_have_nan_entries_if_specified(self):
		"""Columns whose settings request missing_data=1.0 must be entirely NaN.

		Columns given no settings (None) or missing_data=0.0 must contain
		no NaN at all.
		"""
		def draw(columns, settings):
			# one predictive sample for the requested columns
			return sdg.predictive_columns(
				self.M_c, self.X_L, self.X_D, columns,
				optional_settings=settings)

		# a single column, every entry missing
		sample = draw([0], [{'missing_data': 1.0}])
		assert numpy.isnan(sample).all()

		# two columns, both completely missing
		sample = draw([0, 1], [{'missing_data': 1.0}] * 2)
		assert numpy.isnan(sample).all()

		# only the first of two columns is missing; None means no missing data
		sample = draw([0, 1], [{'missing_data': 1.0}, None])
		assert numpy.isnan(sample[:, 0]).all()
		assert not numpy.isnan(sample[:, 1]).any()

		# same expectation when the second column explicitly asks for 0.0
		sample = draw([0, 1], [{'missing_data': 1.0}, {'missing_data': 0.0}])
		assert numpy.isnan(sample[:, 0]).all()
		assert not numpy.isnan(sample[:, 1]).any()
Exemple #2
0
	def test_should_return_array_of_proper_size(self):
		"""The result is an ndarray with one row per data row and one column per request."""
		# exercise the one-column and the two-column request in turn
		for requested in ([0], [0, 1]):
			sample = sdg.predictive_columns(
				self.M_c, self.X_L, self.X_D, requested)
			assert isinstance(sample, numpy.ndarray)
			assert sample.shape[0] == self.num_rows
			assert sample.shape[1] == len(requested)
def check_predictive_sample_improvement(component_model_type, seed=0, show_plot=True):
	"""Check that predictive-sample error and KL divergence shrink over transitions.

	Generates 150 rows of single-column data from a two-component mixture.
	Then, for each of 10 freshly created states, runs 100 transitions one
	at a time; after every transition a predictive column is drawn and
	scored against the data, and the KL divergence is recorded.

	Inputs:
		- component_model_type: component model class; its cctype attribute
			selects the data type and the error measure
		- seed: int used to seed the random module
		- show_plot: if True, plot both curves with error bars and call
			pylab.show()
	Returns:
		- truthy when both the mean error and the mean KL divergence are
			lower at the last transition than at the first
	"""
	n_rows = 150
	n_chains = 10
	n_iters = 100
	n_components = 2
	cluster_separation = .9

	random.seed(seed)

	def next_seed():
		return random.randrange(2147483647)

	cctype = component_model_type.cctype

	# a single column of synthetic data drawn from the component model
	T, M_c, struc = sdg.gen_data(
		[cctype], n_rows, [0], [[.5,.5]], [cluster_separation],
		seed=next_seed(), distargs=[distargs[cctype]],
		return_structure=True)
	observed = numpy.array(T)
	component_params = struc['component_params'][0]

	# per-row error summed over chains, and KL divergence per chain
	error_sum = numpy.zeros((n_rows, n_iters))
	kl_trace = numpy.zeros((n_chains, n_iters))

	support = qtu.get_mixture_support(
		cctype, component_model_type, component_params,
		nbins=1000, support=.995)
	true_log_pdf = qtu.get_mixture_pdf(
		support, component_model_type, component_params, [.5,.5])

	for chain in range(n_chains):
		# a fresh state for every chain
		state = State.p_State(M_c, T, SEED=next_seed())

		for step in range(n_iters):
			state.transition()

			# current partitions, and a predictive column drawn from them
			X_L = state.get_X_L()
			X_D = state.get_X_D()
			predicted = sdg.predictive_columns(
				M_c, X_L, X_D, [0], seed=next_seed())

			if cctype != 'multinomial':
				# squared difference between data and predictive sample
				err = (observed-predicted)**2.0
			else:
				# Pearson statistic of the sampled counts against the
				# expected counts, repeated for every row
				K = distargs[cctype]['K']
				expected = numpy.zeros(numpy.array(K))
				for params in component_params:
					expected += numpy.array(params['weights'])*(1.0/n_components)
				expected *= float(n_rows)
				sampled_counts = qtu.bincount(predicted, bins=list(range(K)))
				statistic, _ = stats.power_divergence(
					sampled_counts, expected, lambda_='pearson')
				err = numpy.ones(n_rows)*statistic

			kl_trace[chain,step] = qtu.KL_divergence(
				component_model_type, component_params, [.5,.5],
				M_c, X_L, X_D,
				true_log_pdf=true_log_pdf, support=support)

			for row in range(n_rows):
				error_sum[row,step] += err[row]

	# average the accumulated error over chains
	error_sum /= n_chains

	# mean and standard error of both curves
	root_chains = float(n_chains)**.5
	X_mean = numpy.mean(error_sum, axis=0)
	X_err = numpy.std(error_sum, axis=0)/root_chains
	KL_mean = numpy.mean(kl_trace, axis=0)
	KL_err = numpy.std(kl_trace, axis=0)/root_chains

	if show_plot:
		panels = (
			(1, X_mean, X_err, 'error across each data point',
				'error of predictive sample over iterations, N=%i' % n_rows),
			(2, KL_mean, KL_err, 'KL divergence',
				'KL divergence, N=%i' % n_rows),
		)
		for position, curve, spread, y_label, heading in panels:
			pylab.subplot(1,2,position)
			pylab.errorbar(list(range(n_iters)), curve, yerr=spread)
			pylab.xlabel('iteration')
			pylab.ylabel(y_label)
			pylab.title(heading)

		pylab.show()

	# both measures should decrease over time
	error_decreased = X_mean[0] > X_mean[-1]
	kl_decreased = KL_mean[0] > KL_mean[-1]
	return error_decreased and kl_decreased
Exemple #4
0
def check_impute_vs_column_average_single(component_model_type,
                                          num_clusters,
                                          seed=0):
    """Compare a predictive sample column against the column average.

    Generates 100 rows of single-column data from `num_clusters`
    well-separated components, runs 100 transitions on a state built from
    that data, draws one predictive column and scores both it and a
    constant column-mean against the generated data.

    Note: This test does not make sense for categorical data.

    Inputs:
        - component_model_type: main class from datatype. Ex:
            ccmext.p_ContinuousComponentModel
        - num_clusters: the number of clusters in the data
        - seed: (optional) int to seed the RNG
    Returns:
        - the mean square error of the predictive sample column
        - the mean square error of the column average column
    """
    random.seed(seed)

    N = 100

    get_next_seed = lambda: random.randrange(2147483647)

    C = .9  # highly-separated clusters

    cctype = component_model_type.cctype

    component_model_parameters = sdg.generate_separated_model_parameters(
        cctype, C, num_clusters, get_next_seed, distargs=distargs[cctype])

    # Generate a partition of rows to clusters: each cluster gets one row up
    # front and the remaining rows are assigned uniformly at random.
    # Z has to be a real list: a Python 3 range object supports neither
    # append() nor random.shuffle().
    Z = list(range(num_clusters))
    for _ in range(N - num_clusters):
        Z.append(random.randrange(num_clusters))

    random.shuffle(Z)

    # generate the data, one value per row from that row's cluster
    T = numpy.array([[0]] * N, dtype=float)

    for x in range(N):
        z = Z[x]
        T[x] = component_model_type.generate_data_from_parameters(
            component_model_parameters[z], 1, gen_seed=get_next_seed())[0]

    T_list = T.tolist()

    # initialize the state
    M_c = du.gen_M_c_from_T(T_list, cctypes=[cctype])

    state = State.p_State(M_c, T)

    # transitions
    state.transition(n_steps=100)

    # get the sample
    X_L = state.get_X_L()
    X_D = state.get_X_D()

    # generate a predictive column from the sample
    T_generated = sdg.predictive_columns(M_c,
                                         X_L,
                                         X_D, [0],
                                         seed=get_next_seed())

    # a column in which every entry is the mean of the data
    T_colave = numpy.ones(T.shape) * numpy.mean(T)

    # get the mean squared error of each candidate against the data
    err_sample = numpy.mean((T_generated - T)**2.0)
    err_colave = numpy.mean((T_colave - T)**2.0)

    return err_sample, err_colave
def check_one_feature_mixture(component_model_type, num_clusters=3, show_plot=False, seed=None):
    """Goodness-of-fit check of predictive samples from a one-column mixture.

    Generates 300 rows of single-column data from an evenly weighted mixture
    of `num_clusters` components, runs 200 transitions on a state built from
    that data, draws one predictive column and tests it against the data.

    Inputs:
        - component_model_type: component model class; its cctype and
            model_type attributes are read here
        - num_clusters: number of mixture components in the generated data
        - show_plot: if True, save a comparison figure to
            "<model_type>_mixtrue.png"
        - seed: value handed to random.seed()
    Returns:
        - the p-value of the test: two-sample KS when the model type is not
            discrete, Pearson chi-square (power divergence) when it is
    """
    random.seed(seed)

    N = 300
    separation = .9

    get_next_seed = lambda : random.randrange(2147483647)

    cluster_weights = [[1.0/float(num_clusters)]*num_clusters]

    cctype = component_model_type.cctype
    T, M_c, structure = sdg.gen_data([cctype], N, [0], cluster_weights,
                        [separation], seed=get_next_seed(),
                        distargs=[distargs[cctype]],
                        return_structure=True)

    T_list = list(T)
    T = numpy.array(T)

    # create a crosscat state
    M_c = du.gen_M_c_from_T(T_list, cctypes=[cctype])

    state = State.p_State(M_c, T_list)

    # Get support over all component models
    discrete_support = qtu.get_mixture_support(cctype, component_model_type,
                         structure['component_params'][0], nbins=250)

    # one query per support point
    # NOTE(review): tuples look like (row, column, value) -- confirm
    Q = [(N,0,x) for x in discrete_support]

    # transitions
    state.transition(n_steps=200)

    # get the sample
    X_L = state.get_X_L()
    X_D = state.get_X_D()

    # generate samples
    # kstest doesn't compute the same answer with row and column vectors
    # so we flatten this column vector into a row vector. The order is
    # spelled 'F' because NumPy no longer accepts the integer 1 here; for a
    # single column the element order is the same either way.
    predictive_samples = sdg.predictive_columns(M_c, X_L, X_D, [0],
                            seed=get_next_seed()).flatten('F')

    # no constraints are passed (the original `[]*len(Q)` is just `[]`);
    # the result is exponentiated below, so it is presumably log-probability
    probabilities = su.simple_predictive_probability(M_c, X_L, X_D, [], Q)

    # Histogram and goodness-of-fit test differ for discrete and continuous
    # types.
    if is_discrete[component_model_type.model_type]:
        # normalise the counts by hand
        bins = list(range(len(discrete_support)))
        T_hist = numpy.array(qtu.bincount(T, bins=bins))
        S_hist = numpy.array(qtu.bincount(predictive_samples, bins=bins))
        T_hist = T_hist/float(numpy.sum(T_hist))
        S_hist = S_hist/float(numpy.sum(S_hist))
        edges = numpy.array(discrete_support,dtype=float)

        # Cressie-Read power divergence statistic and goodness of fit test.
        # This function gives a lot of flexibility in the method <lambda_> used.
        freq_obs = S_hist*N
        freq_exp = numpy.exp(probabilities)*N
        _, p = stats.power_divergence(freq_obs, freq_exp, lambda_='pearson')
        test_str = "Chi-square"
    else:
        # density=True replaces the removed `normed` keyword; the bins here
        # are equal-width, so the normalised values are the same.
        T_hist, edges = numpy.histogram(T, bins=min(50,len(discrete_support)), density=True)
        S_hist, _ =  numpy.histogram(predictive_samples, bins=edges, density=True)
        edges = edges[0:-1]

        # two-sample KS test of the predictive samples against the data
        _, p = stats.ks_2samp(predictive_samples, T[:,0])
        test_str = "KS"

    if show_plot:
        pylab.clf()
        lpdf = qtu.get_mixture_pdf(discrete_support, component_model_type,
                structure['component_params'][0], [1.0/num_clusters]*num_clusters)
        pylab.axes([0.1, 0.1, .8, .7])
        # bin widths
        width = (numpy.max(edges)-numpy.min(edges))/len(edges)
        pylab.bar(edges, T_hist, color='blue', alpha=.5, width=width, label='Original data', zorder=1)
        pylab.bar(edges, S_hist, color='red', alpha=.5, width=width, label='Predictive samples', zorder=2)

        # plot actual pdf of support given data params
        pylab.scatter(discrete_support,
            numpy.exp(lpdf),
            c="blue",
            edgecolor="none",
            s=100,
            label="true pdf",
            alpha=1,
            zorder=3)

        # plot predictive probability of support points
        pylab.scatter(discrete_support,
            numpy.exp(probabilities),
            c="red",
            edgecolor="none",
            s=100,
            label="predictive probability",
            alpha=1,
            zorder=4)

        pylab.legend()

        # anchor the y axis at zero
        ylimits = pylab.gca().get_ylim()
        pylab.ylim([0,ylimits[1]])

        title_string = "%i samples drawn from %i %s components: \ninference after 200 crosscat transitions\n%s test: p = %f" \
            % (N, num_clusters, component_model_type.cctype, test_str, round(p,4))

        pylab.title(title_string, fontsize=12)

        filename = component_model_type.model_type + "_mixtrue.png"
        pylab.savefig(filename)
        pylab.close()

    return p
def check_impute_vs_column_average_single(component_model_type, num_clusters, seed=0):
	"""Compare a predictive sample column against the column average.

	Generates 100 rows of single-column data from `num_clusters`
	well-separated components, runs 100 transitions on a state built from
	that data, draws one predictive column and scores both it and a
	constant column-mean against the generated data.

	Note: This test does not make sense for categorical data.

	Inputs:
		- component_model_type: main class from datatype. Ex:
			ccmext.p_ContinuousComponentModel
		- num_clusters: the number of clusters in the data
		- seed: (optional) int to seed the RNG
	Returns:
		- the mean square error of the predictive sample column
		- the mean square error of the column average column
	"""

	random.seed(seed)

	N = 100

	get_next_seed = lambda : random.randrange(2147483647)

	C = .9 # highly-separated clusters

	cctype = component_model_type.cctype

	component_model_parameters = sdg.generate_separated_model_parameters(
						cctype, C, num_clusters, get_next_seed,
						distargs=distargs[cctype])

	# Generate a partition of rows to clusters: each cluster gets one row up
	# front and the remaining rows are assigned uniformly at random.
	# Z has to be a real list: a Python 3 range object supports neither
	# append() nor random.shuffle().
	Z = list(range(num_clusters))
	for _ in range(N-num_clusters):
		Z.append(random.randrange(num_clusters))

	random.shuffle(Z)

	# generate the data, one value per row from that row's cluster
	T = numpy.array([[0]]*N, dtype=float)

	for x in range(N):
		z = Z[x]
		T[x] = component_model_type.generate_data_from_parameters(
				component_model_parameters[z], 1, gen_seed=get_next_seed())[0]

	T_list = T.tolist()

	# initialize the state
	M_c = du.gen_M_c_from_T(T_list, cctypes=[cctype])

	state = State.p_State(M_c, T)

	# transitions
	state.transition(n_steps=100)

	# get the sample
	X_L = state.get_X_L()
	X_D = state.get_X_D()

	# generate a predictive column from the sample
	T_generated = sdg.predictive_columns(M_c, X_L, X_D, [0], seed=get_next_seed())

	# a column in which every entry is the mean of the data
	T_colave = numpy.ones(T.shape)*numpy.mean(T)

	# get the mean squared error of each candidate against the data
	err_sample = numpy.mean( (T_generated-T)**2.0 )
	err_colave = numpy.mean( (T_colave-T)**2.0 )

	return err_sample, err_colave
Exemple #7
0
def check_one_feature_mixture(component_model_type,
                              num_clusters=3,
                              show_plot=False,
                              seed=None):
    """Goodness-of-fit check of predictive samples from a one-column mixture.

    Generates 300 rows of single-column data from an evenly weighted mixture
    of `num_clusters` components, runs 200 transitions on a state built from
    that data, draws one predictive column and tests it against the data.

    Inputs:
        - component_model_type: component model class; its cctype and
            model_type attributes are read here
        - num_clusters: number of mixture components in the generated data
        - show_plot: if True, save a comparison figure to
            "<model_type>_mixtrue.png"
        - seed: value handed to random.seed()
    Returns:
        - the p-value of the test: two-sample KS when the model type is not
            discrete, Pearson chi-square (power divergence) when it is
    """
    random.seed(seed)

    N = 300
    separation = .9

    get_next_seed = lambda: random.randrange(2147483647)

    cluster_weights = [[1.0 / float(num_clusters)] * num_clusters]

    cctype = component_model_type.cctype
    T, M_c, structure = sdg.gen_data([cctype],
                                     N, [0],
                                     cluster_weights, [separation],
                                     seed=get_next_seed(),
                                     distargs=[distargs[cctype]],
                                     return_structure=True)

    T_list = list(T)
    T = numpy.array(T)

    # create a crosscat state
    M_c = du.gen_M_c_from_T(T_list, cctypes=[cctype])

    state = State.p_State(M_c, T_list)

    # Get support over all component models
    discrete_support = qtu.get_mixture_support(
        cctype,
        component_model_type,
        structure['component_params'][0],
        nbins=250)

    # one query per support point
    # NOTE(review): tuples look like (row, column, value) -- confirm
    Q = [(N, 0, x) for x in discrete_support]

    # transitions
    state.transition(n_steps=200)

    # get the sample
    X_L = state.get_X_L()
    X_D = state.get_X_D()

    # generate samples
    # kstest doesn't compute the same answer with row and column vectors
    # so we flatten this column vector into a row vector. The order is
    # spelled 'F' because NumPy no longer accepts the integer 1 here; for a
    # single column the element order is the same either way.
    predictive_samples = sdg.predictive_columns(
        M_c, X_L, X_D, [0], seed=get_next_seed()).flatten('F')

    # no constraints are passed (the original `[] * len(Q)` is just `[]`);
    # the result is exponentiated below, so it is presumably log-probability
    probabilities = su.simple_predictive_probability(M_c, X_L, X_D, [], Q)

    # Histogram and goodness-of-fit test differ for discrete and continuous
    # types.
    if is_discrete[component_model_type.model_type]:
        # normalise the counts by hand
        bins = list(range(len(discrete_support)))
        T_hist = numpy.array(qtu.bincount(T, bins=bins))
        S_hist = numpy.array(qtu.bincount(predictive_samples, bins=bins))
        T_hist = T_hist / float(numpy.sum(T_hist))
        S_hist = S_hist / float(numpy.sum(S_hist))
        edges = numpy.array(discrete_support, dtype=float)

        # Cressie-Read power divergence statistic and goodness of fit test.
        # This function gives a lot of flexibility in the method <lambda_> used.
        freq_obs = S_hist * N
        freq_exp = numpy.exp(probabilities) * N
        _, p = stats.power_divergence(freq_obs, freq_exp, lambda_='pearson')
        test_str = "Chi-square"
    else:
        # density=True replaces the removed `normed` keyword; the bins here
        # are equal-width, so the normalised values are the same.
        T_hist, edges = numpy.histogram(T,
                                        bins=min(50, len(discrete_support)),
                                        density=True)
        S_hist, _ = numpy.histogram(predictive_samples,
                                    bins=edges,
                                    density=True)
        edges = edges[0:-1]

        # two-sample KS test of the predictive samples against the data
        _, p = stats.ks_2samp(predictive_samples, T[:, 0])
        test_str = "KS"

    if show_plot:
        pylab.clf()
        lpdf = qtu.get_mixture_pdf(discrete_support, component_model_type,
                                   structure['component_params'][0],
                                   [1.0 / num_clusters] * num_clusters)
        pylab.axes([0.1, 0.1, .8, .7])
        # bin widths
        width = (numpy.max(edges) - numpy.min(edges)) / len(edges)
        pylab.bar(edges,
                  T_hist,
                  color='blue',
                  alpha=.5,
                  width=width,
                  label='Original data',
                  zorder=1)
        pylab.bar(edges,
                  S_hist,
                  color='red',
                  alpha=.5,
                  width=width,
                  label='Predictive samples',
                  zorder=2)

        # plot actual pdf of support given data params
        pylab.scatter(discrete_support,
                      numpy.exp(lpdf),
                      c="blue",
                      edgecolor="none",
                      s=100,
                      label="true pdf",
                      alpha=1,
                      zorder=3)

        # plot predictive probability of support points
        pylab.scatter(discrete_support,
                      numpy.exp(probabilities),
                      c="red",
                      edgecolor="none",
                      s=100,
                      label="predictive probability",
                      alpha=1,
                      zorder=4)

        pylab.legend()

        # anchor the y axis at zero
        ylimits = pylab.gca().get_ylim()
        pylab.ylim([0, ylimits[1]])

        title_string = "%i samples drawn from %i %s components: \ninference after 200 crosscat transitions\n%s test: p = %f" \
            % (N, num_clusters, component_model_type.cctype, test_str, round(p,4))

        pylab.title(title_string, fontsize=12)

        filename = component_model_type.model_type + "_mixtrue.png"
        pylab.savefig(filename)
        pylab.close()

    return p