def test_sweep_alpha():
    s_start = [0] * N
    N1_max = N
    bandwidth = 10
    adaptive = True
    for algorithm in ('M-space', 'N1-space'):
        # Reference: run band_search separately for each alpha.
        results = []
        for alpha_idx, alpha in enumerate(alpha_grid):
            results.append(
                fbmp2.band_search(alpha, A, y, A_fix, s_start, N1_max,
                                  bandwidth, adaptive, a, b, algorithm))
        # Under test: sweep_alpha runs the same searches over the grid.
        results_quest = fbmp2.sweep_alpha(alpha_grid, A, y, A_fix, s_start,
                                          N1_max, bandwidth, adaptive, a, b,
                                          algorithm, max_processes=None)
        results.sort(key=lambda res: res['alpha'])
        for alpha_idx, alpha in enumerate(alpha_grid):
            assert results_quest[alpha_idx]['alpha'] == \
                results[alpha_idx]['alpha']
            assert (results_quest[alpha_idx]['s'] ==
                    results[alpha_idx]['s'])
            assert is_approx(results_quest[alpha_idx]['logL'],
                             results[alpha_idx]['logL'])
def compute_likelihoods(self, alpha_grid=(0.001, 0.01, 0.1, 1), a=0, b=0,
                        max_active=None, bandwidth=1, adaptive=False,
                        algorithm='N1-space', processes=None):
    """Computes the likelihood for a large set of models.

    This is the main function of FBMP2; it takes about 5 us per node to
    run. The number of considered nodes is B * F * min(F, max_active)
    for non-adaptive runs, and about 0.5 * F times more for adaptive
    runs, where F is the number of features and B is the bandwidth.

    Running this function updates the internal variable
    `_sweep_alpha_results`, which enables calling
    :func:`compute_posterior` to obtain the final results.

    Args:
        alpha_grid (sequence of :obj:`float`): alpha values to consider
            (alpha = sigma / sigma_coeffs, where sigma is the
            uncorrelated noise strength, and sigma_coeffs is the stdev
            of the non-zero features, all understood to correspond to
            the normalized inputs, X_norm and y_norm)
        a (float): "a" parameter of the inverse-gamma prior of
            sigma_x^2 (must be >= 0)
        b (float): "b" parameter of the inverse-gamma prior of
            sigma_x^2 (must be >= 0)
        max_active (int): maximum number of active features to consider
            (must be >= 0, optional, default:
            [number of samples] - 2 - [number of fixed features])
        bandwidth (int): width of the band search algorithm; the higher
            the better, but runtime increases approximately linearly
            with bandwidth (must be > 0, optional)
        adaptive (bool): If True, the band search is performed in a way
            that ensures that the set of discovered models includes
            both active and inactive states of every feature at least
            `bandwidth` times. Setting it to True typically increases
            runtime by a factor of 0.5*[features], but improves the
            reliability of the result more than one would get by
            setting `bandwidth` to 0.5*[features].
        algorithm (str): Either "M-space" or "N1-space", indicating
            whether the updates should be computed in sample ("M")
            space or active feature ("N1") space
        processes (int): Max number of parallel computational processes
            (optional, default: number of CPUs)
    """
    M, N_fix = self.X_fix_norm.shape
    M, N = self.X_norm.shape
    if max_active is None:
        max_active = M - 2 - N_fix
    if max_active > M - 2 - N_fix:
        warn(f'Despite max_active being set to {max_active}, '
             f'only models with up to {M} - 2 - {N_fix} = {M - 2 - N_fix} '
             f'active features will be considered, because there are '
             f'only {M} samples and {N_fix} fixed features')
        max_active = M - 2 - N_fix
    if algorithm not in {'M-space', 'N1-space'}:
        raise ValueError('`algorithm` must be one of '
                         '{"M-space", "N1-space"}')
    if processes is None:
        processes = fbmp2.cpu_count()

    # Run FBMP2 on normalized inputs starting from s = (0, 0, ..., 0)
    s_start = [0] * N
    sweep_alpha_results = fbmp2.sweep_alpha(
        alpha_grid, self.X_norm, self.y_norm, self.X_fix_norm, s_start,
        max_active, bandwidth, adaptive, a, b, algorithm,
        max_processes=processes)
    self._sweep_alpha_results = sweep_alpha_results
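# Usage sketch for the compute_likelihoods -> compute_posterior workflow
# (hedged: the class name `FBMP2Model` and its constructor are assumptions
# made for illustration; the method names and the `_sweep_alpha_results`
# handoff come from the docstring above).
#
#     model = FBMP2Model(X, y, X_fix)              # hypothetical constructor
#     model.compute_likelihoods(alpha_grid=(0.001, 0.01, 0.1, 1),
#                               bandwidth=10, adaptive=True,
#                               algorithm='N1-space')
#     posteriors = model.compute_posterior()       # uses _sweep_alpha_results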
def main():
    N_full = 50  # number of features
    M = 20  # number of samples
    replicates = range(10)
    active_features_list = [0, 1, 2, 3]
    sigma_x_list = [1.0]
    sigma_list = [0.01, 0.1, 1, 10]

    run_id = 0
    with open('runs.tsv', 'w') as fout:
        header = ['run_id', 'N', 'M', 'replicate_id', 'active_features',
                  'sigma_x', 'sigma', 'data_file', 'result_file']
        fout.write('\t'.join(header) + '\n')

    total_runs = len(list(product(replicates, active_features_list,
                                  sigma_x_list, sigma_list)))
    for replicate_id, active_features, sigma_x, sigma in \
            product(replicates, active_features_list, sigma_x_list,
                    sigma_list):
        print(f'Run {run_id + 1} / {total_runs}', end=', ')
        try:
            data_file = f'simulated_data/data_{run_id}.pkl'
            result_file = f'results/result_{run_id}.pkl'

            # Simulate data, then drop uniform (zero-variance) features.
            A, y, s_true, x_true = generate_data(M, N_full, active_features,
                                                 sigma_x, sigma)
            A_norm, y_norm, feature_is_uniform = normalize_data(A, y)
            M, N = A_norm.shape

            # Append this run's metadata to the run index.
            with open('runs.tsv', 'a') as fout:
                # Recount active features among the retained columns.
                active_features = np.sum(s_true[~feature_is_uniform])
                record = [run_id, N, M, replicate_id, active_features,
                          sigma_x, sigma, data_file, result_file]
                fout.write('\t'.join(list(map(str, record))) + '\n')

            with open(data_file, 'wb') as fout:
                dump({'s_true': s_true[~feature_is_uniform],
                      'x_true': x_true[~feature_is_uniform],
                      'A_norm': A_norm,
                      'y_norm': y_norm}, fout)

            time_start = timer()
            s_start = (0,) * N
            N1_max = M - 1
            kappa = 2
            pi = 1.0 / N
            log_ps = calculate_s_logprior(N, kappa, pi)
            alpha_grid = np.logspace(np.log10(0.001), np.log10(10), 8)
            results = sweep_alpha(alpha_grid, A_norm, y_norm, s_start,
                                  N1_max, bandwidth=10, adaptive=True)
            posteriors = calculate_posteriors(results, log_ps, full=False)
            with open(result_file, 'wb') as fout:
                dump(posteriors, fout)
            print(f'{timer() - time_start} seconds')
        except Exception:
            print('failed')
        run_id += 1
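# --- Reading back the outputs of main() (sketch, assumptions flagged) -------
# A minimal sketch of how the files written by main() could be loaded for
# analysis. The runs.tsv layout and the pickle contents come from main()
# above; the helper names `load_run` and `load_run_index` are ours.

import csv
import pickle


def load_run(run_record):
    """Load the simulated data and posterior result of one run."""
    with open(run_record['data_file'], 'rb') as f:
        data = pickle.load(f)  # keys: s_true, x_true, A_norm, y_norm
    with open(run_record['result_file'], 'rb') as f:
        posteriors = pickle.load(f)
    return data, posteriors


def load_run_index(path='runs.tsv'):
    """Read the tab-separated run index written by main()."""
    with open(path) as f:
        return list(csv.DictReader(f, delimiter='\t'))

# usage:
#     runs = load_run_index()
#     data, posteriors = load_run(runs[0])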
def test_unnormalize_posterior():
    # Build inputs with uniform (constant) columns interleaved among
    # informative ones, so unnormalization has to re-insert NaN slots.
    A_nonuniform = np.arange(12).reshape((4, 3))
    uniform_col = np.ones((4, 1))
    A = np.concatenate(
        (-1 * uniform_col,
         A_nonuniform[:, 0:1],
         -2 * uniform_col,
         A_nonuniform[:, 1:3],
         -3 * uniform_col),
        axis=1)
    A_fix_nonuniform = np.arange(4).reshape((4, 1))
    A_fix = np.concatenate(
        (-1 * uniform_col, A_fix_nonuniform[:, 0:1], -2 * uniform_col),
        axis=1)
    y = np.arange(4)
    normalization_results = fbmp2.normalize_input(
        A, y, A_fix, normalize_feature_scales=True)
    A_norm = normalization_results['A_norm']
    y_norm = normalization_results['y_norm']
    A_fix_norm = normalization_results['A_fix_norm']

    M, N = A_norm.shape
    kappa = 2
    pi = 1.0 / N
    log_ps = fbmp2.calculate_s_logprior(N, kappa, pi)
    s_start = [0] * N
    N1_max = M - 1
    bandwidth = 1
    adaptive = False
    # Use the normalized fixed features here, consistent with A_norm and
    # y_norm (the original passed the unnormalized A_fix).
    sweep_alpha_results = fbmp2.sweep_alpha(alpha_grid, A_norm, y_norm,
                                            A_fix_norm, s_start, N1_max,
                                            bandwidth, adaptive, a, b,
                                            'N1-space', max_processes=None)
    post = fbmp2.calculate_posteriors(sweep_alpha_results, log_ps, A_norm,
                                      y_norm, A_fix_norm, True, True)
    unnormed_post = fbmp2.unnormalize_posteriors(post, normalization_results,
                                                 kappa, pi)

    assert is_approx(unnormed_post['alpha'], post['alpha'])
    assert is_approx(unnormed_post['logQ'], post['logQ'])
    assert len(unnormed_post['log_pn_0']) == 6
    assert is_approx(
        unnormed_post['log_pn_0'][~normalization_results['uniform_features']],
        post['log_pn_0'])
    s_matrix = unnormed_post['full_log_psy_df'][[f's_{n}'
                                                 for n in range(6)]].values
    assert (np.isnan(s_matrix[:, normalization_results['uniform_features']]))\
        .all()
    assert (s_matrix[:, ~normalization_results['uniform_features']] ==
            post['full_log_psy_df'][[f's_{n}' for n in range(3)]].values).all()
    assert is_approx(unnormed_post['full_log_psy_df']['log_psy'].values,
                     post['full_log_psy_df']['log_psy'].values)
    assert (np.isnan(unnormed_post['x_mmse']) == np.array(
        [True, False, True, False, False, True])).all()
    assert is_approx(
        unnormed_post['x_mmse'][[1, 3, 4]],
        post['x_mmse'] * normalization_results['y_scale'] /
        normalization_results['A_scale'][[1, 3, 4]])
    assert (np.isnan(unnormed_post['x_fix_mmse']) == np.array(
        [True, False, True])).all()
    assert is_approx(
        unnormed_post['x_fix_mmse'][[1]],
        post['x_fix_mmse'] * normalization_results['y_scale'] /
        normalization_results['A_fix_scale'][[1]])
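# --- Unnormalization relation checked above (toy sketch) --------------------
# The assertions above pin down how coefficient estimates map back to the
# original scale: for each non-uniform feature n,
#     x_unnormed[n] = x_normed[n] * y_scale / A_scale[n]
# and uniform features get NaN. A self-contained toy illustration (the scale
# values below are made up, not from fbmp2):

import numpy as np

y_scale = 2.0                          # hypothetical normalization scales
A_scale = np.array([0.5, 4.0])
uniform_features = np.array([False, True, False])

x_normed = np.array([1.2, -0.3])       # estimates for non-uniform features
x_unnormed = np.full(3, np.nan)        # NaN slots for uniform features
x_unnormed[~uniform_features] = x_normed * y_scale / A_scale
print(x_unnormed)                      # -> [ 4.8    nan -0.15]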
def test_calculate_posteriors():
    kappa = 2
    pi = 1.0 / N
    log_ps = fbmp2.calculate_s_logprior(N, kappa, pi)
    s_start = [0] * N
    N1_max = M - 1
    bandwidth = 10
    adaptive = True
    sweep_alpha_results = fbmp2.sweep_alpha(alpha_grid, A, y, A_fix, s_start,
                                            N1_max, bandwidth, adaptive, a, b,
                                            'N1-space', max_processes=None)

    # Reference computation: per-alpha evidence weights Q.
    Q = np.zeros(len(sweep_alpha_results))
    for alpha_idx, res in enumerate(sweep_alpha_results):
        res['s'] = np.frombuffer(b''.join(res['s']), dtype=np.uint8) \
            .reshape([len(res['s']), N])
        res['logL'] = np.array(res['logL'])
        res['N1'] = np.sum(res['s'], axis=1).astype(int)
        L = np.exp(res['logL'] - np.max(res['logL'])) * \
            np.exp(np.max(res['logL']))
        p = np.exp(log_ps[res['N1']] - np.max(log_ps)) * \
            np.exp(np.max(log_ps))
        pL = p * L
        res['pL'] = pL
        Q[alpha_idx] = np.sum(pL)
    Q = Q / np.sum(Q)

    # Per-feature activation posteriors and active-count posterior,
    # mixed over the alpha grid.
    p_sn0 = np.zeros([len(sweep_alpha_results), N])
    p_sn1 = np.zeros([len(sweep_alpha_results), N])
    p_N1 = np.zeros([len(sweep_alpha_results), N + 1])
    for alpha_idx, res in enumerate(sweep_alpha_results):
        pL = res['pL']
        Q_alpha = Q[alpha_idx]
        for n in range(N):
            p_sn0[alpha_idx, n] = Q_alpha * np.sum(pL[res['s'][:, n] == 0])
            p_sn1[alpha_idx, n] = Q_alpha * np.sum(pL[res['s'][:, n] == 1])
        for K in range(np.max(res['N1']) + 1):
            p_N1[alpha_idx, K] = Q_alpha * np.sum(pL[res['N1'] == K])
    pn_0 = np.sum(p_sn0, axis=0)
    pn_1 = np.sum(p_sn1, axis=0)
    pn_0 = pn_0 / (pn_0 + pn_1)
    pN1 = np.sum(p_N1, axis=0)
    pN1 = pN1 / np.sum(pN1)

    post_quest = fbmp2.calculate_posteriors(sweep_alpha_results, log_ps,
                                            A, y, A_fix, True, True)
    assert is_approx(alpha_grid, post_quest['alpha'])
    assert is_approx(Q, np.exp(post_quest['logQ']))
    assert is_approx(pn_0, np.exp(post_quest['log_pn_0']))
    assert is_approx(pN1, np.exp(post_quest['log_pN1']))

    # Coefficient estimates must match ridge-style solutions at the
    # Q-weighted geometric-mean alpha.
    selected_s, selected_x, selected_x_fix = \
        fbmp2.estimate_coefficients(sweep_alpha_results, post_quest['logQ'],
                                    log_ps, A, y, A_fix, test=True)
    best_alpha = np.exp(
        np.sum(np.log(post_quest['alpha']) * np.exp(post_quest['logQ'])))
    for s, x, x_fix in zip(selected_s, selected_x, selected_x_fix):
        A_active = np.concatenate([A_fix, A.dot(np.diag(s))], axis=1)
        expected = A_active.T.dot(
            np.linalg.inv(
                A_active.dot(A_active.T) + best_alpha**2 * np.eye(M))).dot(y)
        received = np.concatenate([x_fix, x], axis=0)
        assert is_approx(expected, received)
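# --- Posterior mixing verified above (toy sketch) ----------------------------
# The reference computation above mixes per-alpha model weights: for each
# alpha, pL[i] = p(s_i) * L(s_i | alpha), Q_alpha is proportional to
# sum_i pL[i], and p(s_n = 0 | y) is the Q-weighted share of pL mass on
# models with s_n = 0. A self-contained toy with two alphas and three
# 2-feature models (all numbers made up):

import numpy as np

# pL[alpha_idx, model_idx]: prior-times-likelihood weight of each model
pL = np.array([[0.2, 0.5, 0.3],
               [0.1, 0.1, 0.8]])
s = np.array([[0, 0],                  # model 0: both features inactive
              [1, 0],                  # model 1: feature 0 active
              [1, 1]])                 # model 2: both active

Q = pL.sum(axis=1)
Q = Q / Q.sum()                        # per-alpha evidence weights

# Posterior probability that feature n is inactive, mixed over alpha
weighted = Q[:, None] * pL             # (alphas, models)
mass_0 = weighted @ (s == 0).astype(float)   # (alphas, features)
mass_1 = weighted @ (s == 1).astype(float)
pn_0 = mass_0.sum(axis=0) / (mass_0.sum(axis=0) + mass_1.sum(axis=0))
print(pn_0)                            # -> [0.15 0.45]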