Example #1
0
	def test_should_return_list_of_params(self):
		"""generate_separated_model_parameters should return one parameter
		dict per cluster, keyed appropriately for the column type."""
		# Continuous component models are parameterized by a mean ('mu')
		# and a precision ('rho').
		params = sdg.generate_separated_model_parameters('continuous',
			.5, self.num_clusters, self.get_next_seed )

		assert isinstance(params, list)
		assert len(params) == self.num_clusters
		for model in params:
			assert isinstance(model, dict)
			# exactly the two expected keys, nothing more
			assert sorted(model.keys()) == ['mu', 'rho']

		# Multinomial component models are parameterized by a weight
		# vector only.
		params = sdg.generate_separated_model_parameters('multinomial',
			.5, self.num_clusters, self.get_next_seed,
			distargs=self.distargs_multinomial)

		assert isinstance(params, list)
		assert len(params) == self.num_clusters
		for model in params:
			assert isinstance(model, dict)
			# a single expected key
			assert list(model.keys()) == ['weights']
Example #2
0
	def test_normal_means_should_be_farther_apart_if_they_have_higer_separation(self):
		"""Higher separation values should yield normal means that are
		farther apart (measured in pooled standard deviations)."""
		def _scaled_sq_distance(separation):
			# Reseed so each call draws from the identical random stream;
			# only the separation parameter differs between calls.
			random.seed(0)
			params = sdg.generate_separated_model_parameters('continuous',
				separation, 2, self.get_next_seed )
			# rho is a precision, so std = rho**-.5
			pooled_std = params[0]['rho']**(-.5) + params[1]['rho']**(-.5)
			return ((params[0]['mu'] - params[1]['mu']) / pooled_std)**2.0

		distance_close = _scaled_sq_distance(.1)
		distance_far = _scaled_sq_distance(.5)
		distance_farthest = _scaled_sq_distance(1.0)

		assert distance_far > distance_close
		assert distance_farthest > distance_far
Example #3
0
def check_impute_vs_column_average_single(component_model_type,
                                          num_clusters,
                                          seed=0):
    """Compare predictive-sample imputation against a column-average baseline.

    Note: this test does not make sense for categorical data.

    Inputs:
        - component_model_type: main class from datatype, e.g.
            ccmext.p_ContinuousComponentModel
        - num_clusters: the number of clusters in the data
        - seed: (optional) int to seed the RNG

    Returns:
        - the mean squared error of the predictive sample column
        - the mean squared error of the column average column
    """
    random.seed(seed)

    N = 100  # number of rows in the synthetic column

    get_next_seed = lambda: random.randrange(2147483647)

    C = .9  # highly-separated clusters

    cctype = component_model_type.cctype

    component_model_parameters = sdg.generate_separated_model_parameters(
        cctype, C, num_clusters, get_next_seed, distargs=distargs[cctype])

    # generate a partition of rows to clusters (evenly-weighted): every
    # cluster appears at least once, remaining rows assigned at random.
    # NOTE: range() is materialized to a list so it can be appended to
    # (a bare range object has no .append under Python 3).
    Z = list(range(num_clusters))
    for _ in range(N - num_clusters):
        Z.append(random.randrange(num_clusters))

    random.shuffle(Z)

    # generate the data: one value per row, drawn from that row's cluster
    T = numpy.array([[0]] * N, dtype=float)

    for x in range(N):
        z = Z[x]
        T[x] = component_model_type.generate_data_from_parameters(
            component_model_parameters[z], 1, gen_seed=get_next_seed())[0]

    T_list = T.tolist()

    # initialize the state
    M_c = du.gen_M_c_from_T(T_list, cctypes=[cctype])

    state = State.p_State(M_c, T)

    # transitions (burn-in of the sampler)
    state.transition(n_steps=100)

    # get the sample
    X_L = state.get_X_L()
    X_D = state.get_X_D()

    # generate a column of predictive draws from the sample
    T_generated = sdg.predictive_columns(M_c,
                                         X_L,
                                         X_D, [0],
                                         seed=get_next_seed())

    # generate a column filled with the column average (baseline imputer)
    T_colave = numpy.ones(T.shape) * numpy.mean(T)

    # get the mean squared error of each imputation strategy
    err_sample = numpy.mean((T_generated - T)**2.0)
    err_colave = numpy.mean((T_colave - T)**2.0)

    return err_sample, err_colave
def check_impute_vs_column_average_single(component_model_type, num_clusters, seed=0):
	"""Compare predictive-sample imputation against a column-average baseline.

	Note: this test does not make sense for categorical data.

	Inputs:
		- component_model_type: main class from datatype, e.g.
			ccmext.p_ContinuousComponentModel
		- num_clusters: the number of clusters in the data
		- seed: (optional) int to seed the RNG

	Returns:
		- the mean squared error of the predictive sample column
		- the mean squared error of the column average column
	"""
	random.seed(seed)

	N = 100  # number of rows in the synthetic column

	get_next_seed = lambda : random.randrange(2147483647)

	C = .9 # highly-separated clusters

	cctype = component_model_type.cctype

	component_model_parameters = sdg.generate_separated_model_parameters(
						cctype, C, num_clusters, get_next_seed,
						distargs=distargs[cctype])

	# generate a partition of rows to clusters (evenly-weighted): every
	# cluster appears at least once, remaining rows assigned at random.
	# NOTE: range() is materialized to a list so it can be appended to
	# (a bare range object has no .append under Python 3).
	Z = list(range(num_clusters))
	for _ in range(N-num_clusters):
		Z.append(random.randrange(num_clusters))

	random.shuffle(Z)

	# generate the data: one value per row, drawn from that row's cluster
	T = numpy.array([[0]]*N, dtype=float)

	for x in range(N):
		z = Z[x]
		T[x] = component_model_type.generate_data_from_parameters(
				component_model_parameters[z], 1, gen_seed=get_next_seed())[0]

	T_list = T.tolist()

	# initialize the state
	M_c = du.gen_M_c_from_T(T_list, cctypes=[cctype])

	state = State.p_State(M_c, T)

	# transitions (burn-in of the sampler)
	state.transition(n_steps=100)

	# get the sample
	X_L = state.get_X_L()
	X_D = state.get_X_D()

	# generate a column of predictive draws from the sample
	T_generated = sdg.predictive_columns(M_c, X_L, X_D, [0], seed=get_next_seed())

	# generate a column filled with the column average (baseline imputer)
	T_colave = numpy.ones(T.shape)*numpy.mean(T)

	# get the mean squared error of each imputation strategy
	err_sample = numpy.mean( (T_generated-T)**2.0 )
	err_colave = numpy.mean( (T_colave-T)**2.0 )

	return err_sample, err_colave