Example #1
def test_sweep_alpha():
    """sweep_alpha must reproduce one band_search call per alpha value."""
    s_start = [0] * N
    N1_max = N
    bandwidth = 10
    adaptive = True

    for algorithm in ('M-space', 'N1-space'):
        results = []
        for alpha_idx, alpha in enumerate(alpha_grid):
            results.append(
                fbmp2.band_search(alpha, A, y, A_fix, s_start, N1_max,
                                  bandwidth, adaptive, a, b, algorithm))

        results_quest = fbmp2.sweep_alpha(alpha_grid,
                                          A,
                                          y,
                                          A_fix,
                                          s_start,
                                          N1_max,
                                          bandwidth,
                                          adaptive,
                                          a,
                                          b,
                                          algorithm,
                                          max_processes=None)

        results.sort(key=lambda res: res['alpha'])
        for alpha_idx, alpha in enumerate(alpha_grid):
            assert results_quest[alpha_idx]['alpha'] == \
                results[alpha_idx]['alpha']
            assert (results_quest[alpha_idx]['s'] == results[alpha_idx]['s'])
            assert is_approx(results_quest[alpha_idx]['logL'],
                             results[alpha_idx]['logL'])
Example #2
    def compute_likelihoods(self,
                            alpha_grid=(0.001, 0.01, 0.1, 1),
                            a=0,
                            b=0,
                            max_active=None,
                            bandwidth=1,
                            adaptive=False,
                            algorithm='N1-space',
                            processes=None):
        """Computes the likelihood for a large set of models.

        This is the main function of FBMP2; it takes about 5 us per node to
        run. The number of considered nodes is B * F * min(F, max_active)
        for non-adaptive runs, and about 0.5 * F times more for adaptive
        runs, where F is the number of features and B is the bandwidth.
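
        For example (illustrative figures): with F = 1000 features,
        bandwidth B = 10 and max_active = 18, a non-adaptive run visits
        about 10 * 1000 * 18 = 180,000 nodes, i.e. roughly 0.9 seconds at
        5 us per node; an adaptive run costs about 0.5 * 1000 = 500 times
        more, i.e. on the order of minutes.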

        Running this function updates the internal variable
        `_sweep_alpha_results`, which enables calling
        :func:`compute_posterior` to obtain the final results.

        Args:
            alpha_grid (sequence of :obj:`float`): alpha values to consider
                (alpha = sigma / sigma_coeffs,
                where sigma is the uncorrelated noise strength, and
                sigma_coeffs is the stdev of the non-zero features,
                all understood to correspond to
                the normalized inputs, X_norm and y_norm)
            a (float): "a" parameter of the inverse-gamma prior of sigma_x^2
                (must be >= 0)
            b (float): "b" parameter of the inverse-gamma prior of sigma_x^2
                (must be >= 0)
            max_active (int): maximum number of active features to consider
                (must be >= 0, optional,
                default: [number of samples] - 2 - [number of fixed features])
            bandwidth (int): width of the band search; the higher, the
                better, but runtime increases approximately linearly with
                bandwidth (must be > 0, optional)
            adaptive (bool): If True, the band search is performed in a way
                that ensures that the set of discovered models includes both
                the active and the inactive state of every feature at least
                `bandwidth` times. Setting it to True typically increases
                runtime by a factor of 0.5*[features], but it improves the
                reliability of the result more than raising `bandwidth` to
                0.5*[features] would.
            algorithm (str): Either "M-space" or "N1-space", indicating
                whether the updates should be computed in sample("M") space
                or active feature ("N1") space
            processes (int): Max number of parallel computational processes
                (optional, default: number of CPUs)
        """
        M, N_fix = self.X_fix_norm.shape
        M, N = self.X_norm.shape
        if max_active is None:
            max_active = M - 2 - N_fix
        if max_active > M - 2 - N_fix:
            warn(f'Despite max_active being set to {max_active}, '
                 f'only models with up to {M} - 2 - {N_fix} = {M - 2 - N_fix} '
                 f'active features will be considered, because there are '
                 f'only {M} samples and {N_fix} fixed features')
            max_active = M - 2 - N_fix

        if algorithm not in {'M-space', 'N1-space'}:
            raise ValueError('`algorithm` must be one of '
                             '{"M-space", "N1-space"}')

        if processes is None:
            processes = fbmp2.cpu_count()

        # Run FBMP2 on normalized inputs starting from s = (0,0,0, ... 0)
        s_start = [0] * N
        sweep_alpha_results = fbmp2.sweep_alpha(alpha_grid,
                                                self.X_norm,
                                                self.y_norm,
                                                self.X_fix_norm,
                                                s_start,
                                                max_active,
                                                bandwidth,
                                                adaptive,
                                                a,
                                                b,
                                                algorithm,
                                                max_processes=processes)
        self._sweep_alpha_results = sweep_alpha_results
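
A usage sketch for the method above. The `FBMP2Model` class name, its
constructor, and the raw inputs X, y, X_fix are illustrative assumptions;
only `compute_likelihoods` and `compute_posterior` appear in the source.

import numpy as np

# Hypothetical wrapper object, assumed to normalize X, y and X_fix into
# the X_norm, y_norm and X_fix_norm attributes the method above expects.
model = FBMP2Model(X, y, X_fix)
model.compute_likelihoods(
    alpha_grid=np.logspace(-3, 0, 8),  # alpha = sigma / sigma_coeffs
    bandwidth=10,
    adaptive=True,  # ~0.5 * [features] times slower, but more reliable
    algorithm='N1-space',
)
# compute_likelihoods caches `_sweep_alpha_results`; compute_posterior
# turns that cache into the final posterior summaries.
posteriors = model.compute_posterior()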
Example #3
from itertools import product
from pickle import dump
from timeit import default_timer as timer

import numpy as np

# generate_data, normalize_data, calculate_s_logprior, sweep_alpha and
# calculate_posteriors are assumed to be importable from the fbmp2 package
# or from sibling modules of this script.


def main():
    N_full = 50  # number of features
    M = 20  # samples

    replicates = range(10)

    active_features_list = [0, 1, 2, 3]
    sigma_x_list = [1.0]
    sigma_list = [0.01, 0.1, 1, 10]

    run_id = 0

    with open('runs.tsv', 'w') as fout:
        header = ['run_id', 'N', 'M', 'replicate_id', 'active_features',
                  'sigma_x', 'sigma', 'data_file', 'result_file']
        fout.write('\t'.join(header) + '\n')

    total_runs = len(list(product(replicates, active_features_list,
                                  sigma_x_list, sigma_list)))
    for replicate_id, active_features, sigma_x, sigma in \
            product(replicates, active_features_list, sigma_x_list, sigma_list):
        print(f'Run {run_id+1} / {total_runs}', end=',  ')

        try:
            data_file = f'simulated_data/data_{run_id}.pkl'
            result_file = f'results/result_{run_id}.pkl'

            A, y, s_true, x_true = generate_data(M, N_full, active_features, sigma_x, sigma)
            A_norm, y_norm, feature_is_uniform = normalize_data(A, y)
            M, N = A_norm.shape

            with open('runs.tsv', 'a') as fout:
                active_features = np.sum(s_true[~feature_is_uniform])
                record = [run_id, N, M, replicate_id, active_features,
                          sigma_x, sigma, data_file, result_file]
                fout.write('\t'.join(list(map(str, record))) + '\n')
            with open(data_file, 'wb') as fout:
                dump({'s_true': s_true[~feature_is_uniform],
                      'x_true': x_true[~feature_is_uniform],
                      'A_norm': A_norm,
                      'y_norm': y_norm},
                     fout)

            time_start = timer()

            s_start = (0,) * N
            N1_max = M - 1
            kappa = 2
            pi = 1.0 / N

            log_ps = calculate_s_logprior(N, kappa, pi)

            alpha_grid = np.logspace(np.log10(0.001), np.log10(10), 8)

            results = sweep_alpha(alpha_grid, A_norm, y_norm, s_start, N1_max,
                                  bandwidth=10, adaptive=True)

            posteriors = calculate_posteriors(results, log_ps, full=False)

            with open(result_file, 'wb') as fout:
                dump(posteriors, fout)

            print(f'{timer() - time_start} seconds')

        except Exception as exc:
            print(f'failed: {exc}')

        run_id += 1
Example #4
def test_unnormalize_posterior():
    """Posteriors computed on normalized inputs must map back to the
    original feature space, with uniform features reported as NaN."""
    A_nonuniform = np.arange(12).reshape((4, 3))
    uniform_col = np.ones((4, 1))
    A = np.concatenate(
        (-1 * uniform_col, A_nonuniform[:, 0:1], -2 * uniform_col,
         A_nonuniform[:, 1:3], -3 * uniform_col),
        axis=1)
    A_fix_nonuniform = np.arange(4).reshape((4, 1))
    A_fix = np.concatenate(
        (-1 * uniform_col, A_fix_nonuniform[:, 0:1], -2 * uniform_col), axis=1)
    y = np.arange(4)

    normalization_results = fbmp2.normalize_input(
        A, y, A_fix, normalize_feature_scales=True)
    A_norm = normalization_results['A_norm']
    y_norm = normalization_results['y_norm']
    A_fix_norm = normalization_results['A_fix_norm']
    M, N = A_norm.shape

    kappa = 2
    pi = 1.0 / N
    log_ps = fbmp2.calculate_s_logprior(N, kappa, pi)

    s_start = [0] * N
    N1_max = M - 1
    bandwidth = 1
    adaptive = False
    sweep_alpha_results = fbmp2.sweep_alpha(alpha_grid,
                                            A_norm,
                                            y_norm,
                                            A_fix_norm,  # match A_norm/y_norm
                                            s_start,
                                            N1_max,
                                            bandwidth,
                                            adaptive,
                                            a,
                                            b,
                                            'N1-space',
                                            max_processes=None)

    post = fbmp2.calculate_posteriors(sweep_alpha_results, log_ps, A_norm,
                                      y_norm, A_fix_norm, True, True)
    unnormed_post = fbmp2.unnormalize_posteriors(post, normalization_results,
                                                 kappa, pi)

    assert is_approx(unnormed_post['alpha'], post['alpha'])
    assert is_approx(unnormed_post['logQ'], post['logQ'])
    assert len(unnormed_post['log_pn_0']) == 6
    assert is_approx(
        unnormed_post['log_pn_0'][~normalization_results['uniform_features']],
        post['log_pn_0'])
    s_matrix = unnormed_post['full_log_psy_df'][[f's_{n}'
                                                 for n in range(6)]].values
    assert (np.isnan(s_matrix[:, normalization_results['uniform_features']]))\
        .all()
    assert (s_matrix[:, ~normalization_results['uniform_features']] ==
            post['full_log_psy_df'][[f's_{n}'
                                     for n in range(3)]].values).all()
    assert is_approx(unnormed_post['full_log_psy_df']['log_psy'].values,
                     post['full_log_psy_df']['log_psy'].values)
    assert (np.isnan(unnormed_post['x_mmse']) == np.array(
        [True, False, True, False, False, True])).all()
    assert is_approx(
        unnormed_post['x_mmse'][[1, 3, 4]],
        post['x_mmse'] * normalization_results['y_scale'] /
        normalization_results['A_scale'][[1, 3, 4]])
    assert (np.isnan(unnormed_post['x_fix_mmse']) == np.array(
        [True, False, True])).all()
    assert is_approx(
        unnormed_post['x_fix_mmse'][[1]],
        post['x_fix_mmse'] * normalization_results['y_scale'] /
        normalization_results['A_fix_scale'][[1]])
Example #5
def test_calculate_posteriors():
    """Brute-force recomputation of the posteriors must match the library."""
    kappa = 2
    pi = 1.0 / N
    log_ps = fbmp2.calculate_s_logprior(N, kappa, pi)

    s_start = [0] * N
    N1_max = M - 1
    bandwidth = 10
    adaptive = True
    sweep_alpha_results = fbmp2.sweep_alpha(alpha_grid,
                                            A,
                                            y,
                                            A_fix,
                                            s_start,
                                            N1_max,
                                            bandwidth,
                                            adaptive,
                                            a,
                                            b,
                                            'N1-space',
                                            max_processes=None)

    # Recompute the evidence Q(alpha) by brute force over the visited models.
    Q = np.zeros(len(sweep_alpha_results))
    for alpha_idx, res in enumerate(sweep_alpha_results):
        # Decode the packed byte strings into an (n_models, N) 0/1 matrix.
        res['s'] = np.frombuffer(b''.join(res['s']), dtype=np.uint8) \
            .reshape([len(res['s']), N])
        res['logL'] = np.array(res['logL'])
        res['N1'] = np.sum(res['s'], axis=1).astype(int)
        # exp(x - max) * exp(max) == exp(x); the shift keeps the model-wise
        # factor in a safe range before the common rescaling.
        L = np.exp(res['logL'] - np.max(res['logL'])) * \
            np.exp(np.max(res['logL']))
        p = np.exp(log_ps[res['N1']] - np.max(log_ps)) * \
            np.exp(np.max(log_ps))
        pL = p * L  # unnormalized joint weight p(s) * L(y | s, alpha)
        res['pL'] = pL
        Q[alpha_idx] = np.sum(pL)
    Q = Q / np.sum(Q)

    p_sn0 = np.zeros([len(sweep_alpha_results), N])
    p_sn1 = np.zeros([len(sweep_alpha_results), N])
    p_N1 = np.zeros([len(sweep_alpha_results), N + 1])
    for alpha_idx, res in enumerate(sweep_alpha_results):
        pL = res['pL']
        Q_alpha = Q[alpha_idx]
        for n in range(N):
            p_sn0[alpha_idx, n] = Q_alpha * np.sum(pL[res['s'][:, n] == 0])
            p_sn1[alpha_idx, n] = Q_alpha * np.sum(pL[res['s'][:, n] == 1])

        for K in range(np.max(res['N1']) + 1):
            p_N1[alpha_idx, K] = Q_alpha * np.sum(pL[res['N1'] == K])

    pn_0 = np.sum(p_sn0, axis=0)
    pn_1 = np.sum(p_sn1, axis=0)
    pn_0 = pn_0 / (pn_0 + pn_1)

    pN1 = np.sum(p_N1, axis=0)
    pN1 = pN1 / np.sum(pN1)

    post_quest = fbmp2.calculate_posteriors(sweep_alpha_results, log_ps, A, y,
                                            A_fix, True, True)

    assert is_approx(alpha_grid, post_quest['alpha'])
    assert is_approx(Q, np.exp(post_quest['logQ']))
    assert is_approx(pn_0, np.exp(post_quest['log_pn_0']))
    assert is_approx(pN1, np.exp(post_quest['log_pN1']))

    selected_s, selected_x, selected_x_fix = \
        fbmp2.estimate_coefficients(sweep_alpha_results,
                                    post_quest['logQ'],
                                    log_ps,
                                    A, y, A_fix,
                                    test=True)
    # Q-weighted geometric mean of the alpha grid.
    best_alpha = np.exp(
        np.sum(np.log(post_quest['alpha']) * np.exp(post_quest['logQ'])))
    for s, x, x_fix in zip(selected_s, selected_x, selected_x_fix):
        # Ridge-form estimate x = A_active^T (A_active A_active^T +
        # alpha^2 I)^{-1} y, with inactive columns zeroed out via diag(s).
        A_active = np.concatenate([A_fix, A.dot(np.diag(s))], axis=1)
        expected = A_active.T.dot(
            np.linalg.inv(
                A_active.dot(A_active.T) + best_alpha**2 * np.eye(M))).dot(y)
        received = np.concatenate([x_fix, x], axis=0)
        assert is_approx(expected, received)