def test_support(self):
    # A single observation: X1=True, X2=False, X3 unobserved (None).
    row = pd.DataFrame([[True, False, None, 'dataset-a', 2.5]],
                       columns=['X1', 'X2', 'X3', 'dataset_id', 'weight'])
    # Mixing coefficients: one row per dataset, one column per state.
    pi = pd.DataFrame([[0.6, 0.4], [0.2, 0.8], [0.5, 0.5]],
                      index=['dataset-a', 'dataset-b', 'dataset-c'],
                      columns=['K0', 'K1'])
    # Emission probabilities: one row per state, one column per data column.
    p = pd.DataFrame([[0.1, 0.2, 0.3], [0.9, 0.8, 0.7]],
                     index=['K0', 'K1'],
                     columns=['X1', 'X2', 'X3'])
    model = MultiDatasetMixtureModel(pi, p)
    # Expected support per state k:
    # pi[dataset, k] * p(X1=True | k) * p(X2=False | k);
    # the unobserved X3 contributes no factor.
    expected_support = pd.DataFrame([[
        pi.loc['dataset-a', 'K0'] * p.loc['K0', 'X1'] * (1 - p.loc['K0', 'X2']),
        pi.loc['dataset-a', 'K1'] * p.loc['K1', 'X1'] * (1 - p.loc['K1', 'X2']),
    ]], columns=pi.columns, index=row.index)
    expected_log_support = expected_support.apply(np.log)
    dataset_ids_as_ilocs = model._dataset_ids_as_pis_ilocs(row)
    actual_log_support = model._log_support(dataset_ids_as_ilocs,
                                            *model._to_bool(row))
    assert_frame_equal(expected_log_support, actual_log_support)
def test_mle_support_for_states_data(self):
    """The most likely state of an observation is the argmax of its support."""
    observation = pd.DataFrame(
        [[True, False, None, 'dataset-a', 2.5]],
        columns=['X1', 'X2', 'X3', 'dataset_id', 'weight'])
    mixing = pd.DataFrame(
        [[0.6, 0.4], [0.2, 0.8], [0.5, 0.5]],
        index=['dataset-a', 'dataset-b', 'dataset-c'],
        columns=['K0', 'K1'])
    emissions = pd.DataFrame(
        [[0.1, 0.2, 0.3], [0.9, 0.8, 0.7]],
        index=['K0', 'K1'],
        columns=['X1', 'X2', 'X3'])
    model = MultiDatasetMixtureModel(mixing, emissions)
    # see the test for log_support for how the expected support of:
    #       K0     K1
    # 0  0.048  0.072
    # has been estimated, making $K1$ the most likely state
    expected = pd.Series(['K1'], index=observation.index)
    actual = model.mle_states(observation)
    assert_series_equal(expected, actual, check_names=False)
def test_responsibilities_from_log_support(self):
    # Three observations, one per dataset; only the first row has a null.
    data = pd.DataFrame([[True, False, None, 'dataset-a', 2.5],
                         [False, True, True, 'dataset-b', 15],
                         [False, False, False, 'dataset-c', 20]],
                        columns=['X1', 'X2', 'X3', 'dataset_id', 'weight'])
    pi = pd.DataFrame([[0.6, 0.4], [0.2, 0.8], [0.5, 0.5]],
                      index=['dataset-a', 'dataset-b', 'dataset-c'],
                      columns=['K0', 'K1'])
    p = pd.DataFrame([[0.1, 0.2, 0.3], [0.9, 0.8, 0.7]],
                     index=['K0', 'K1'],
                     columns=['X1', 'X2', 'X3'])
    model = MultiDatasetMixtureModel(pi, p)
    dataset_ids_as_ilocs = model._dataset_ids_as_pis_ilocs(data)
    log_support = model._log_support(dataset_ids_as_ilocs,
                                     *model._to_bool(data))
    # The expected responsibilities are the row-normalised supports.
    support = np.exp(log_support)
    expected_responsibilities = support.divide(support.sum(axis=1), axis=0)
    actual_responsibilities = _responsibilities_from_log_support(log_support)
    assert_frame_equal(expected_responsibilities, actual_responsibilities)
def test_log_likelihood_weighs_data_correctly(self):
    """Total log likelihood equals the sum of the per-row log likelihoods."""
    data = pd.DataFrame(
        [[True, True, None, 'dataset-a', 2.5],
         [False, None, False, 'dataset-b', 1],
         [True, False, True, 'dataset-a', 10],
         [False, False, True, 'dataset-c', 3]],
        columns=['X1', 'X2', 'X3', 'dataset_id', 'weight'])
    mixing = pd.DataFrame(
        [[0.6, 0.4], [0.2, 0.8], [0.5, 0.5]],
        index=['dataset-a', 'dataset-b', 'dataset-c'],
        columns=['K0', 'K1'])
    emissions = pd.DataFrame(
        [[0.1, 0.2, 0.3], [0.9, 0.8, 0.7]],
        index=['K0', 'K1'],
        columns=['X1', 'X2', 'X3'])
    model = MultiDatasetMixtureModel(mixing, emissions)
    per_row = model._individual_log_likelihoods(data)
    expected = per_row.sum()
    actual = model.log_likelihood(data)
    self.assertEqual(expected, actual)
def test_individual_log_likelihood(self):
    # One observation with X3 unobserved; its weight is 2.5.
    row = pd.DataFrame([[True, False, None, 'dataset-a', 2.5]],
                       columns=['X1', 'X2', 'X3', 'dataset_id', 'weight'])
    pi = pd.DataFrame([[0.6, 0.4], [0.2, 0.8], [0.5, 0.5]],
                      index=['dataset-a', 'dataset-b', 'dataset-c'],
                      columns=['K0', 'K1'])
    p = pd.DataFrame([[0.1, 0.2, 0.3], [0.9, 0.8, 0.7]],
                     index=['K0', 'K1'],
                     columns=['X1', 'X2', 'X3'])
    model = MultiDatasetMixtureModel(pi, p)
    # Expected: weight * log( sum_k pi[dataset, k]
    #                               * p(X1=True | k) * p(X2=False | k) );
    # the unobserved X3 column contributes no factor.
    expected_log_likelihood = pd.Series([
        2.5 * (np.log(
            sum([
                pi.loc['dataset-a', k] * p.loc[k, 'X1'] * (1 - p.loc[k, 'X2'])
                for k in ['K0', 'K1']
            ])))
    ], index=row.index)
    actual_log_likelihood = model._individual_log_likelihoods(row)
    assert_series_equal(actual_log_likelihood, expected_log_likelihood,
                        check_names=False)
def test_p_update_from_data(self):
    sample_data = pd.DataFrame(
        [[True, True, None, 'dataset-a', 2.5],
         [False, None, False, 'dataset-b', 1.5],
         [True, False, True, 'dataset-a', 5],
         [False, False, True, 'dataset-c', 1]],
        columns=['X1', 'X2', 'X3', 'dataset_id', 'weight'])
    pi = pd.DataFrame([[0.6, 0.4], [0.2, 0.8], [0.5, 0.5]],
                      index=['dataset-a', 'dataset-b', 'dataset-c'],
                      columns=['K0', 'K1'])
    p = pd.DataFrame([[0.1, 0.2, 0.3], [0.9, 0.8, 0.7]],
                     index=['K0', 'K1'],
                     columns=['X1', 'X2', 'X3'])
    model = MultiDatasetMixtureModel(pi, p)
    # Fixed responsibilities: one row per observation, one column per state.
    zstar = pd.DataFrame([[0.9, 0.1], [0.1, 0.9], [0.5, 0.5], [0.4, 0.6]],
                         index=sample_data.index,
                         columns=pi.columns)
    # Per-state denominator: responsibility mass weighted by row weights.
    denominators = pd.Series([
        0.9 * 2.5 + 0.1 * 1.5 + 0.5 * 5 + 0.4 * 1,
        0.1 * 2.5 + 0.9 * 1.5 + 0.5 * 5 + 0.6 * 1
    ], index=p.index)
    # Replicate the per-state denominator across all data columns.
    denominators = pd.concat([denominators] * len(p.columns), axis=1)
    denominators.columns = p.columns
    # Numerators: weighted responsibilities of rows where the column is True.
    # Null observations contribute their weighted responsibility times the
    # CURRENT emission probability — hence the extra factors such as
    # 1.5 * 0.1 * 0.2 (row 2's X2 is None, p[K0, X2] == 0.2) and
    # 2.5 * 0.9 * 0.3 (row 1's X3 is None, p[K0, X3] == 0.3).
    expected_p = pd.DataFrame(
        [
            np.array([
                2.5 * 0.9 + 5 * 0.5, 2.5 * 0.9 + 1.5 * 0.1 * 0.2,
                2.5 * 0.9 * 0.3 + 5 * 0.5 + 1 * 0.4
            ]),
            np.array([
                2.5 * 0.1 + 5 * 0.5, 2.5 * 0.1 + 1.5 * 0.9 * 0.8,
                2.5 * 0.1 * 0.7 + 5 * 0.5 + 1 * 0.6
            ])
        ],
        index=p.index,
        columns=p.columns,
    )
    expected_p /= denominators
    data_as_bool, not_null_mask = model._to_bool(sample_data)
    zstar_times_weight = zstar.multiply(sample_data['weight'], axis=0)
    actual_p = model._p_update_from_data(zstar_times_weight.values,
                                         data_as_bool, not_null_mask)
    assert_frame_equal(expected_p, actual_p)
def test_posterior_calculation_from_log_likelihood(self):
    pi = pd.DataFrame([[0.6, 0.4], [0.2, 0.8], [0.5, 0.5]],
                      index=['dataset-a', 'dataset-b', 'dataset-c'],
                      columns=['K0', 'K1'])
    # Dirichlet prior over the mixing coefficients: one alpha per state.
    pi_priors = pd.Series([4, 5], index=pi.columns)
    p = pd.DataFrame([[0.1, 0.2, 0.3], [0.9, 0.8, 0.7]],
                     index=['K0', 'K1'],
                     columns=['X1', 'X2', 'X3'])
    # Beta priors over the emission probabilities: (alpha, beta) per column.
    p_priors = pd.DataFrame([[6, 7], [8, 9], [10, 11]],
                            index=p.columns,
                            columns=['alpha', 'beta'])
    model = MultiDatasetMixtureModel(pi, p,
                                     prior_mixing_coefficients=pi_priors,
                                     prior_emission_probabilities=p_priors)
    # An arbitrary data log likelihood the prior terms get added to.
    log_likelihood = np.log(1e-5)
    # Dirichlet log-density data term: sum over (alpha_k - 1) * log(pi_k).
    pi_prior_weight = (pi.apply(np.log) * (pi_priors - 1)).sum().sum()
    # Dirichlet normalising constant, counted once per dataset row of pi.
    pi_prior_gammas = len(pi.index) * (gammaln(pi_priors.sum()) -
                                       pi_priors.apply(gammaln).sum())
    # Beta log-density data terms:
    # (alpha - 1) * log(p) + (beta - 1) * log(1 - p), summed over all cells.
    p_prior_weight = (p.apply(np.log) * (p_priors['alpha'] - 1) +
                      (1 - p).apply(np.log) *
                      (p_priors['beta'] - 1)).sum().sum()
    # Beta normalising constants, counted once per state row of p.
    p_prior_gammas = len(
        p.index) * (p_priors.sum(axis=1).apply(gammaln) -
                    p_priors['alpha'].apply(gammaln) -
                    p_priors['beta'].apply(gammaln)).sum()
    expected_posterior_no_gammas = (log_likelihood + pi_prior_weight +
                                    p_prior_weight)
    expected_posterior_with_gammas = expected_posterior_no_gammas \
        + pi_prior_gammas + p_prior_gammas
    posterior_no_gammas = model._unnormalised_posterior(
        log_likelihood, compute_gammas=False)
    posterior_gammas = model._unnormalised_posterior(log_likelihood,
                                                     compute_gammas=True)
    self.assertAlmostEqual(expected_posterior_no_gammas,
                           posterior_no_gammas)
    self.assertAlmostEqual(expected_posterior_with_gammas,
                           posterior_gammas)
def test_log_likelihood_validates_weights_greater_than_zero(self):
    """
    log likelihood function should validate that dataset weights are > 0
    """
    columns = ['a', 'b', 'c', 'dataset_id', 'weight']
    weight_zero = pd.DataFrame(
        [[False, True, False, 'x', 1], [False, False, False, 'y', 0]],
        columns=columns)
    weight_nan = pd.DataFrame(
        [[False, True, False, 'x', None], [False, False, False, 'y', 1.0]],
        columns=columns)
    weight_negative = pd.DataFrame(
        [[False, True, False, 'x', -1], [False, False, False, 'y', 1.0]],
        columns=columns)
    emissions = pd.DataFrame(
        [[0.1, 0.2, 0.6], [0.3, 0.2, 0.1], [0.2, 0.1, 0.4]],
        columns=['a', 'b', 'c'])
    mixing = pd.DataFrame([[0.1, 0.5, 0.4], [0.1, 0.5, 0.4]],
                          columns=emissions.index,
                          index=['x', 'y'])
    mixture = MultiDatasetMixtureModel(mixing, emissions)
    # Zero, missing and negative weights must all be rejected.
    for invalid in (weight_zero, weight_nan, weight_negative):
        self.assertRaises(ValueError, mixture.log_likelihood, invalid)
def test_log_likelihood_checks_that_dataset_id_and_weight_columns_exist(
        self):
    """
    log likelihood function should validate that data has dataset_id and
    weight columns
    :return:
    """
    missing_weight = pd.DataFrame(
        [[False, True, False, 0], [False, False, False, 0]],
        columns=['a', 'b', 'c', 'dataset_id'])
    missing_dataset_id = pd.DataFrame(
        [[False, True, False, 1], [False, False, False, 1]],
        columns=['a', 'b', 'c', 'weight'])
    emissions = pd.DataFrame(
        [[0.1, 0.2, 0.6], [0.3, 0.2, 0.1], [0.2, 0.1, 0.4]],
        columns=['a', 'b', 'c'])
    mixing = pd.Series([0.1, 0.5, 0.4], index=emissions.index)
    mixture = MultiDatasetMixtureModel(mixing, emissions)
    # Either missing column should trigger validation.
    for incomplete in (missing_weight, missing_dataset_id):
        self.assertRaises(ValueError, mixture.log_likelihood, incomplete)
def test_pi_update_with_priors(self):
    """MAP update of the mixing coefficients: the Dirichlet prior adds
    (alpha_k - 1) pseudo-counts to each numerator and (sum(alpha) - K)
    to each dataset's denominator.
    """
    sample_data = pd.DataFrame(
        [[True, True, None, 'dataset-a', 2.5],
         [False, None, False, 'dataset-b', 1.5],
         [True, False, True, 'dataset-a', 5],
         [False, False, True, 'dataset-c', 1]],
        columns=['X1', 'X2', 'X3', 'dataset_id', 'weight'])
    pi = pd.DataFrame([[0.6, 0.4], [0.2, 0.8], [0.5, 0.5]],
                      index=['dataset-a', 'dataset-b', 'dataset-c'],
                      columns=['K0', 'K1'])
    pi_prior = pd.Series([2, 3], index=pi.columns)
    p = pd.DataFrame([[0.1, 0.2, 0.3], [0.9, 0.8, 0.7]],
                     index=['K0', 'K1'],
                     columns=['X1', 'X2', 'X3'])
    model = MultiDatasetMixtureModel(pi, p,
                                     prior_mixing_coefficients=pi_prior)
    # Fixed responsibilities: one row per observation, one column per state.
    zstar = pd.DataFrame([[0.9, 0.1], [0.1, 0.9], [0.5, 0.5], [0.4, 0.6]],
                         index=sample_data.index,
                         columns=pi.columns)
    # Numerators: weighted responsibilities summed within each dataset
    # (rows 1 and 3 belong to dataset-a, row 2 to -b, row 4 to -c).
    pi_expected = pd.DataFrame(
        [[(2.5 * 0.9 + 5 * 0.5), (2.5 * 0.1 + 5 * 0.5)],
         [(1.5 * 0.1), (1.5 * 0.9)],
         [(1 * 0.4), (1 * 0.6)]],
        index=pi.index,
        columns=pi.columns)
    pi_expected += pi_prior - 1  # Dirichlet MAP pseudo-counts
    # Denominators: total weight per dataset, plus the prior mass.
    denominators = pd.DataFrame([[2.5 + 5, 2.5 + 5], [1.5, 1.5], [1, 1]],
                                index=pi.index,
                                columns=pi.columns)
    denominators += pi_prior.sum() - len(pi_prior)
    pi_expected /= denominators
    pi_actual = model._pi_update_from_data(sample_data, zstar)
    assert_frame_equal(pi_expected, pi_actual)
def test_complete_likelihood_for_icl(self):
    """Complete-data MLE log likelihood: per row, log pi of the MLE state
    plus the log emission of every data column (nulls imputed to the
    state's most likely value), all scaled by the row weight.
    """
    data = pd.DataFrame([[False, False, None, 'dataset-a', 2.5],
                         [True, False, False, 'dataset-b', 3],
                         [True, None, False, 'dataset-c', 1],
                         [False, True, False, 'dataset-a', 2]],
                        columns=['X1', 'X2', 'X3', 'dataset_id', 'weight'])
    pi = pd.DataFrame([[0.6, 0.4], [0.2, 0.8], [0.5, 0.5]],
                      index=['dataset-a', 'dataset-b', 'dataset-c'],
                      columns=['K0', 'K1'])
    p = pd.DataFrame([[0.1, 0.2, 0.3], [0.98, 0.89, 0.51]],
                     index=['K0', 'K1'],
                     columns=['X1', 'X2', 'X3'])
    model = MultiDatasetMixtureModel(pi, p)
    mle_states = model.mle_states(data)
    expected_ans = 0
    # NOTE: Series.iteritems() was deprecated in pandas 1.5 and removed in
    # pandas 2.0; items() has identical semantics on all supported versions.
    for ix, state in mle_states.items():
        dataset = data.loc[ix, 'dataset_id']
        row_ans = np.log(pi.loc[dataset, state])
        for col, x_ik in data.loc[ix].items():
            if col in ['dataset_id', 'weight']:
                continue
            if x_ik is None:
                # Impute a missing observation with the state's most
                # likely value under the current emission probabilities.
                if p.loc[state, col] >= 0.5:
                    x_ik = 1
                else:
                    x_ik = 0
            if x_ik == 1:
                row_ans += np.log(p.loc[state, col])
            else:
                row_ans += np.log(1 - p.loc[state, col])
        expected_ans += row_ans * data.loc[ix, 'weight']
    actual_ans = model.complete_mle_log_likelihood(data)
    self.assertAlmostEqual(expected_ans, actual_ans)
def test_mle_estimate_for_states(self):
    """_mle_states picks, for every row, the column with maximal support."""
    log_support = pd.DataFrame(
        [[-1.0, -2.0, -3.0],
         [1.0, 2.0, 3.0],
         [0.0, 0.1, 0.0],
         [-10.0, -0.5, 0.0]],
        index=['a', 'b', 'c', 'd'],
        columns=['K1', 'K2', 'K3'])
    expected = pd.Series(['K1', 'K3', 'K2', 'K3'], index=log_support.index)
    actual = MultiDatasetMixtureModel._mle_states(log_support)
    assert_series_equal(expected, actual, check_names=False)
def test_log_likelihood_validates_all_data_columns_present(self):
    """
    log likelihood function should validate that data provided has all
    the data columns
    """
    data = pd.DataFrame(
        [[False, True, False, 0, 1], [False, False, False, 0, 1]],
        columns=['a', 'b', 'c', 'dataset_id', 'weight'])
    # The model expects column 'd', which the data above does not carry.
    emissions = pd.DataFrame(
        [[0.1, 0.2, 0.6], [0.3, 0.2, 0.1], [0.2, 0.1, 0.4]],
        columns=['a', 'b', 'd'])
    mixing = pd.Series([0.1, 0.5, 0.4], index=emissions.index)
    mixture = MultiDatasetMixtureModel(mixing, emissions)
    self.assertRaises(ValueError, mixture.log_likelihood, data)
def test_dataset_collapse(self):
    # Rows with identical (X1, X2, X3, dataset_id) must be merged into one
    # row whose weight is the sum of the merged weights; note row A is also
    # matched when its null is spelled np.nan rather than None.
    sample_data = pd.DataFrame(
        [
            [True, True, None, 'dataset-a', 2.5],  # A
            [False, None, False, 'dataset-b', 1.5],  # B
            [True, True, None, 'dataset-a', 3.5],  # A
            [True, False, True, 'dataset-a', 5],  # C
            [False, False, True, 'dataset-c', 1],  # D
            [True, False, True, 'dataset-a', 3.5],  # C
            [True, False, True, 'dataset-a', 2],  # C
            [False, False, True, 'dataset-c', 1.5],  # D
            [None, None, True, 'dataset-c', 1.5],  # E,
            [None, None, True, 'dataset-b', 1],  # F,
            [None, None, True, 'dataset-c', 2.5],  # E,
            [None, None, True, 'dataset-b', 2.1],  # F,
            [True, True, np.nan, 'dataset-a', 15],  # A (with nan)
        ],
        columns=['X1', 'X2', 'X3', 'dataset_id', 'weight'])
    expected_collapsed = pd.DataFrame(
        [
            [True, True, None, 'dataset-a', 2.5 + 3.5 + 15],  # A
            [False, None, False, 'dataset-b', 1.5],  # B
            [True, False, True, 'dataset-a', 5 + 3.5 + 2],  # C
            [False, False, True, 'dataset-c', 1 + 1.5],  # D
            [None, None, True, 'dataset-c', 1.5 + 2.5],  # E,
            [None, None, True, 'dataset-b', 1 + 2.1],  # F,
        ],
        columns=['X1', 'X2', 'X3', 'dataset_id', 'weight'])
    # Row order after collapsing is not part of the contract, so compare
    # both frames sorted by weight with their indices reset.
    expected_collapsed.sort_values(by='weight', inplace=True)
    # sorting messes up indices
    expected_collapsed.index = range(len(expected_collapsed))
    actual_collapsed = MultiDatasetMixtureModel.collapse_dataset(
        sample_data)
    actual_collapsed.sort_values(by='weight', inplace=True)
    actual_collapsed.index = range(
        len(actual_collapsed))  # Fix sorting index
    print('Expected')
    print(expected_collapsed)
    print('\nActual:')
    print(actual_collapsed)
    print()
    assert_frame_equal(expected_collapsed, actual_collapsed)
def test_log_likelihood_validates_dataset_id_index(self):
    """
    log likelihood function should validate that dataset index corresponds
    to the one provided
    """
    data = pd.DataFrame(
        [[False, True, False, 'x', 1], [False, False, False, 'y', 1]],
        columns=['a', 'b', 'c', 'dataset_id', 'weight'])
    emissions = pd.DataFrame(
        [[0.1, 0.2, 0.6], [0.3, 0.2, 0.1], [0.2, 0.1, 0.4]],
        columns=['a', 'b', 'c'])
    # The model only knows datasets 'a'/'b'; the data references 'x'/'y'.
    mixing = pd.DataFrame([[0.1, 0.5, 0.4], [0.1, 0.5, 0.4]],
                          columns=emissions.index,
                          index=['a', 'b'])
    mixture = MultiDatasetMixtureModel(mixing, emissions)
    self.assertRaises(ValueError, mixture.log_likelihood, data)
def random_mixture_generator(number_of_components,
                             dataset,
                             random_state=None,
                             epsilon=0.005,
                             alpha=0.75,
                             prior_mixing_coefficients=None,
                             prior_emission_probabilities=None):
    """
    Returns a generator for `MultiDatasetBernoulli` initialiser.
    The mixing coefficients are always chosen uniform.
    Emission probabilities are generated from
    alpha * rand_component + (1-alpha) random_row_from_data
    As described in "EM initialisation for Bernoulli Mixture learning"
    by A. Juan, et al.
    Probabilities are also smoothed to be within range
    [epsilon, 1-epsilon].

    :param number_of_components: number of components to generate samples for
    :param dataset: dataset to use for initialisation
    :param random_state: random seed
    :param epsilon: probabilities will be adjusted to be within range
        [epsilon, 1-epsilon]
    :param alpha: mixing coefficient for random coefficient and random row
        from data
    :param prior_mixing_coefficients: priors for MAP estimation of mixing
        coefficients
    :param prior_emission_probabilities: priors for MAP estimation of
        emission probabilities
    :return: an infinite generator of `MultiDatasetMixtureModel` instances
    """
    dataset = pd.DataFrame(dataset)
    assert DATASET_ID_COLUMN in dataset.columns
    assert WEIGHT_COLUMN in dataset.columns

    # Strip the metadata columns; only the data columns seed the emissions.
    data = dataset[dataset.columns.difference(
        [DATASET_ID_COLUMN, WEIGHT_COLUMN])]
    dataset_id_counts = dataset[DATASET_ID_COLUMN].value_counts()

    random = np.random.RandomState(random_state)
    components_index = ['K{}'.format(k) for k in range(number_of_components)]

    # Uniform mixing coefficients for every dataset.
    mixing_coefficients = pd.DataFrame(
        [np.repeat(1 / number_of_components, number_of_components)],
        index=dataset_id_counts.index,
        columns=components_index)

    random_domain = (0, 1)
    dataset_domain = (0, 1)
    expected_domain = _expected_domain(random_domain, dataset_domain,
                                       alpha=alpha)

    # Loop-invariant: the column count never changes between draws, so
    # compute it once (the row count was never used).
    _, n_columns = data.shape
    while True:
        random_emissions = _random_numbers_within_domain(
            random, random_domain, (number_of_components, n_columns))
        random_row_emissions = _random_rows_from_dataset(
            data, n_rows=number_of_components, random=random)
        random_row_emissions = np.asarray(random_row_emissions, dtype=bool)
        # Convex combination of pure noise and actual data rows.
        emissions = alpha * random_emissions + (
            1 - alpha) * random_row_emissions
        emissions = _adjust_probabilities(emissions, epsilon,
                                          domain=expected_domain)
        emissions = pd.DataFrame(emissions,
                                 index=components_index,
                                 columns=data.columns)
        yield MultiDatasetMixtureModel(
            mixing_coefficients,
            emissions,
            prior_emission_probabilities=prior_emission_probabilities,
            prior_mixing_coefficients=prior_mixing_coefficients)