Example #1
def sample_precision_space(parameters, number=100):
    """Launch a large number of times the same estimation, with different
    starting points.

    number: int
        number of samples to generate.
    """
    # Estimation
    max_iter = 200

    # Generate signals
    next_num, cache_dir, gt = create_signals(parameters,
                                             output_dir="_gsc_sensitivity")
    precisions, topology, signals = (gt["precisions"], gt["topology"],
                                     gt["signals"])

    emp_covs, n_samples = empirical_covariances(signals)

    print("alpha max: %.3e" % compute_alpha_max(emp_covs, n_samples)[0])

    # Estimate a lot of precision matrices
    joblib.Parallel(n_jobs=7, verbose=1)(
        joblib.delayed(save_group_sparse_covariance)(
            emp_covs, n_samples, parameters["alpha"], max_iter=max_iter,
            tol=parameters["tol"], cache_dir=cache_dir, num=n)
        for n in range(next_num, next_num + number))
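
A minimal call sketch; the parameter values are placeholders and the key names are an assumption, mirrored from benchmark1 below:

# Hypothetical driver code, not part of the original module.
example_params = dict(n_var=50, n_tasks=5, density=0.15,
                      min_samples=100, max_samples=150,
                      alpha=0.02, tol=1e-2)
sample_precision_space(example_params, number=20)
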
def benchmark1():
    parameters = dict(n_var=200,
                      n_tasks=5,
                      density=0.15,

                      tol=1e-2,
                      n_alphas=5,
                      max_iter=50,
                      min_samples=100,
                      max_samples=150)

    next_num, cache_dir, gt = create_signals(parameters, output_dir=output_dir)

    emp_covs, n_samples = empirical_covariances(gt['signals'])
    max_alpha, _ = compute_alpha_max(emp_covs, n_samples)

    min_alpha = max_alpha / 100.
    print(min_alpha, max_alpha)
    alphas = np.logspace(np.log10(min_alpha), np.log10(max_alpha),
                         parameters['n_alphas'])[::-1]

    joblib.Parallel(n_jobs=1, verbose=1)(
        joblib.delayed(save_group_sparse_covariance)(
            emp_covs, n_samples, alpha, max_iter=parameters['max_iter'],
            tol=parameters['tol'], debug=False, cache_dir=cache_dir, num=num)
        for alpha, num in zip(alphas, itertools.count(next_num)))
def plot_benchmark1():
    """Plot various quantities obtained for varying values of alpha."""
    parameters = dict(n_var=200,
                      n_tasks=5,
                      density=0.15,

                      tol=1e-2,
#                      max_iter=50,
                      min_samples=100,
                      max_samples=150)

    cache_dir = get_cache_dir(parameters, output_dir=output_dir)
    gt = get_ground_truth(cache_dir)
    gt['precisions'] = np.dstack(gt['precisions'])

    emp_covs, n_samples = empirical_covariances(gt['signals'])
    n_samples /= n_samples.sum()

    alpha = []
    objective = []
    log_likelihood = []
    ll_penalized = []
    sparsity = []
    kl = []

    true_covs = np.empty(gt['precisions'].shape)
    for k in range(gt['precisions'].shape[-1]):
        true_covs[..., k] = np.linalg.inv(gt['precisions'][..., k])

    for out in iter_outputs(cache_dir):
        alpha.append(out['alpha'])
        objective.append(- out['objective'][-1])
        ll, llpen = group_sparse_scores(out['precisions'],
                                       n_samples, true_covs, out['alpha'])
        log_likelihood.append(ll)
        ll_penalized.append(llpen)
        sparsity.append(1. * (out['precisions'][..., 0] != 0).sum()
                        / out['precisions'].shape[0] ** 2)
        kl.append(distance(out['precisions'], gt['precisions']))

    gt["true_sparsity"] = (1. * (gt['precisions'][..., 0] != 0).sum()
                           / gt['precisions'].shape[0] ** 2)
    title = (("n_var: {n_var}, n_tasks: {n_tasks}, "
             + "true sparsity: {true_sparsity:.2f} "
             + "\ntol: {tol:.2e} samples: {min_samples}-{max_samples}").format(
                 true_sparsity=gt["true_sparsity"],
                 **parameters))

    plot(alpha, objective, label="objective", title=title)
    plot(alpha, log_likelihood, label="log-likelihood", new_figure=False)
    plot(alpha, ll_penalized, label="penalized L-L", new_figure=False)

    plot(alpha, sparsity, label="sparsity", title=title)
    pl.hlines(gt["true_sparsity"], min(alpha), max(alpha))

    plot(alpha, kl, label="distance", title=title)
    pl.show()
Example #4
def split_signals(signals, fold_n=0):
    """Split signals into train and test sets."""
    # The smallest signal is 77 samples long.
    # Former strategy: keep the first 50 samples for the train set, everything
    # else for the test set:
    #    train_test = [(s[:50, ...], s[50:, ...]) for s in signals]
    # Current strategy: keep two thirds for the train set, everything else for
    # the test set (3-fold split, selected by fold_n).

    # scikit-learn < 0.18 cross-validation API: KFold(n, n_folds) is directly
    # iterable over (train_indices, test_indices) pairs.
    folds = [tuple(KFold(s.shape[0], 3)) for s in signals]
    train_test = [(s[fold[fold_n][0], ...], s[fold[fold_n][1], ...])
                  for s, fold in zip(signals, folds)]
    signals, test_signals = zip(*train_test)

    emp_covs, n_samples = empirical_covariances(signals)
    test_emp_covs, test_n_samples = empirical_covariances(test_signals)

    n_samples_norm = n_samples.copy()
    n_samples_norm /= n_samples_norm.sum()

    return signals, test_signals, emp_covs, test_emp_covs, n_samples_norm
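
A sketch of how the three folds above might be consumed; the loop body is illustrative only:

def cross_validate_sketch(signals):
    # Iterate over the three folds produced by split_signals.
    for fold_n in range(3):
        train, test, emp_covs, test_emp_covs, n_norm = split_signals(
            signals, fold_n=fold_n)
        # emp_covs and test_emp_covs are (n_var, n_var, n_tasks) stacks;
        # n_norm holds the per-task sample fractions and sums to 1.
        print(fold_n, emp_covs.shape, n_norm.sum())
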
def benchmark(parameters, output_d="_convergence"):
    _, _, gt = create_signals(parameters, output_dir=output_d)

    emp_covs, n_samples = empirical_covariances(gt["signals"])
    print("alpha_max: %.3e, %.3e" % compute_alpha_max(emp_covs, n_samples))

    sp = ScoreProbe(duality_gap=True)
    _group_sparse_covariance(
        emp_covs, n_samples, alpha=parameters["alpha"], tol=parameters["tol"],
        max_iter=parameters["max_iter"], probe_function=sp, verbose=1)

    return {"log_lik": np.asarray(sp.log_lik),
            "objective": np.asarray(sp.objective),
            "precisions": np.asarray(sp.precisions),
            "duality_gap": np.asarray(sp.duality_gap),
            "time": np.asarray(sp.wall_clock)}, gt
def singular_cov_case():
    """Check behaviour of algorithm for singular input matrix."""
    parameters = {'n_tasks': 10, 'n_var': 40, 'density': 0.15,
                  'rho': .1, 'tol': 1e-2, 'max_iter': 50,
                  'min_samples': 10, 'max_samples': 15}

    _, _, gt = create_signals(parameters, output_dir=output_dir)
    signals = gt["signals"]

    emp_covs, _ = empirical_covariances(signals)

    # Check that all covariance matrices are singular.
    eps = np.finfo(float).eps
    for k in range(emp_covs.shape[-1]):
        eigvals = np.linalg.eigvalsh(emp_covs[..., k])
        assert abs(eigvals.min()) <= 50 * eps

    _, gsc_precisions = utils.timeit(group_sparse_covariance)(
        signals, parameters['rho'], max_iter=parameters['max_iter'],
        tol=parameters['tol'], verbose=1, debug=False)

    print('found sparsity: {0:.3f}'
          ''.format(1. * (gsc_precisions[..., 0] != 0).sum()
                    / gsc_precisions.shape[0] ** 2))
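
The singularity checked above is expected: a covariance matrix estimated from fewer samples than variables is rank-deficient. A standalone illustration, independent of the helpers used in this module:

import numpy as np

rng = np.random.RandomState(0)
x = rng.randn(12, 40)              # 12 samples, 40 variables
cov = np.cov(x, rowvar=False)      # 40 x 40 empirical covariance
print(np.linalg.matrix_rank(cov))  # at most 11 (n_samples - 1): singular
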
def benchmark1():
    """Plot different quantities for varying alpha."""
    # Signals
    min_samples, max_samples = 100, 150  # train signals length
    n_var = 50
    n_tasks = 40
    density = 0.1
    random_state = np.random.RandomState(0)

    test_samples = 4000  # number of samples for test signals

    # Estimation
    n_alphas = 10
    max_iter = 200
    tol = 1e-3

    # Generate signals
    signals, precisions, topology = (
        testing.generate_group_sparse_gaussian_graphs(
            n_subjects=n_tasks, n_features=n_var, density=density,
            random_state=random_state, min_n_samples=min_samples,
            max_n_samples=max_samples))

    emp_covs, n_samples = empirical_covariances(signals)

    # Estimate precision matrices
    alpha_1, _ = compute_alpha_max(emp_covs, n_samples)
    alpha_0 = 1e-2 * alpha_1
    ## alpha_1 = 0.067
    ## alpha_0 = 0.044

    alphas = np.logspace(np.log10(alpha_0), np.log10(alpha_1), n_alphas)[::-1]

    parameters = joblib.Parallel(n_jobs=7, verbose=1)(
        joblib.delayed(group_sparse_covariance)(emp_covs, n_samples, alpha,
                                                max_iter=max_iter, tol=tol)
        for alpha in alphas)

    # Compute scores
    test_signals = testing.generate_signals_from_precisions(
        precisions, min_n_samples=test_samples, max_n_samples=test_samples + 1,
        random_state=random_state)

    test_emp_covs, _ = empirical_covariances(test_signals)
    del test_signals

    for params in parameters:
        params["ll_score"], params["pen_score"] = group_sparse_scores(
            params["precisions"], n_samples, test_emp_covs, params["alpha"])

    # Plot graphs
    alpha, ll_score, pen_score = get_series(
        parameters, ("alpha", "ll_score", "pen_score"))
    non_zero = [(p["precisions"][..., 0] != 0).sum() for p in parameters]

    pl.figure()
    pl.semilogx(alpha, ll_score, "-+", label="log-likelihood")
    pl.semilogx(alpha, pen_score, "-+", label="penalized LL")
    pl.xlabel("alpha")
    pl.ylabel("score")
    pl.grid()

    pl.figure()
    pl.semilogx(alpha, non_zero, "-+")
    pl.xlabel("alpha")
    pl.ylabel("non_zero")
    pl.grid()

    pl.figure()
    pl.loglog(alpha, non_zero, "-+")
    pl.xlabel("alpha")
    pl.ylabel("non_zero")
    pl.grid()

    pl.figure()
    pl.imshow(topology, interpolation="nearest")
    pl.title("true topology")

    ## precisions = get_series(parameters, ("precisions", ))
    ## for prec, alpha in zip(precisions, alpha):
    ##     pl.figure()
    ##     pl.imshow(prec[..., 0] != 0, interpolation="nearest")
    ##     pl.title(alpha)

    pl.show()
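
One way to turn the score curves above into a choice of alpha is to keep the value maximizing the test log-likelihood; a sketch reusing the alpha and ll_score series already extracted via get_series:

def best_alpha_sketch(alpha, ll_score):
    # Illustrative model selection: pick the alpha with the highest
    # test-set log-likelihood.
    import numpy as np
    best = int(np.argmax(ll_score))
    print("best alpha: %.3e (log-likelihood: %.3f)"
          % (alpha[best], ll_score[best]))
    return alpha[best]
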
def benchmark3():
    """Compare group_sparse_covariance result for different initializations.
    """
    ## parameters = {'n_tasks': 10, 'n_var': 50, 'density': 0.15,
    ##               'alpha': .001, 'tol': 1e-2, 'max_iter': 100}
    parameters = {'n_var': 40, 'n_tasks': 10, 'density': 0.15,
                  'alpha': .01, 'tol': 1e-3, 'max_iter': 100}

    mem = joblib.Memory(".")

    _, _, gt = create_signals(parameters,
                              output_dir="_prof_group_sparse_covariance")
    signals = gt["signals"]

    emp_covs, n_samples = empirical_covariances(signals)
    print("alpha max: " + str(compute_alpha_max(emp_covs, n_samples)))

    # With diagonal elements initialization
    probe1 = ScoreProbe()
    est_precs1, probe1 = mem.cache(modified_gsc)(signals, parameters, probe1)
    probe1.comment = "diagonal"  # set after execution for joblib not to see it
    probe1.plot()

    # With Ledoit-Wolf initialization
    ld = np.empty(emp_covs.shape)
    for k in range(emp_covs.shape[-1]):
        ld[..., k] = np.linalg.inv(ledoit_wolf(signals[k])[0])

    probe1 = ScoreProbe()
    est_precs1, probe1 = utils.timeit(mem.cache(modified_gsc))(
        signals, parameters, probe=probe1)
    probe1.comment = "diagonal"  # for joblib to ignore this value

    probe2 = ScoreProbe()
    parameters["precisions_init"] = ld
    est_precs2, probe2 = utils.timeit(mem.cache(modified_gsc))(
        signals, parameters, probe=probe2)
    probe2.comment = "ledoit-wolf"

    print("difference between final estimates (max norm) %.2e"
          % abs(est_precs1 - est_precs2).max())

    pl.figure()
    pl.semilogy(probe1.timings[1:], probe1.max_norm,
                "+-", label=probe1.comment)
    pl.semilogy(probe2.timings[1:], probe2.max_norm,
                "+-", label=probe2.comment)
    pl.xlabel("Time [s]")
    pl.ylabel("Max norm")
    pl.grid()
    pl.legend(loc="best")

    pl.figure()
    pl.plot(probe1.timings, probe1.objective,
                "+-", label=probe1.comment)
    pl.plot(probe2.timings, probe2.objective,
                "+-", label=probe2.comment)
    pl.xlabel("Time [s]")
    pl.ylabel("objective")
    pl.grid()
    pl.legend(loc="best")

    pl.show()
def compute_stats(cache_dir):
    l = 0  # task to plot

    data_fnames = glob.glob(os.path.join(cache_dir, "precisions_*.pickle"))
    with open(data_fnames[0], "rb") as f:
        d = pickle.load(f)
    mean = np.zeros(d["precisions"].shape)
    acc2 = mean.copy()
##    topo_count = mean.copy()
    minimum = float("inf") * np.ones(mean.shape)
    maximum = float("-inf") * np.ones(mean.shape)

    data_fnames = data_fnames[:5]

    for data_fname in data_fnames:
        print(data_fname)
        d = pickle.load(open(data_fname, "rb"))
        mean += d["precisions"]
        acc2 += d["precisions"] ** 2
        minimum = np.where(d["precisions"] < minimum, d["precisions"], minimum)
        maximum = np.where(d["precisions"] > maximum, d["precisions"], maximum)
##        topo_count += d["precisions"] != 0

    mean /= len(data_fnames)
    acc2 /= len(data_fnames)
    var = acc2 - mean ** 2
    assert var.min() >= -1e-13
    var[var < 0] = 0  # remove very small negative values
    matshow(var[..., l], title="variance")

    std = np.sqrt(var)
    assert np.all(np.isreal(std))
    matshow(std[..., l], title="std")
    matshow(mean[..., l], title="mean")

#    matshow(mean[..., l] != 0, title="topology")
#    matshow(maximum[..., l], title="maximum")
#    matshow(minimum[..., l], title="minimum")
    matshow(maximum[..., l] - minimum[..., l], title="ptp")

    mean_no_diag = mean.copy()
    for k in range(mean_no_diag.shape[-1]):
        mean_no_diag[..., k].flat[::mean_no_diag.shape[0] + 1] = 0

    matshow(mean_no_diag[..., l], title="mean without diagonal")

    ratio = (std / abs(mean))[..., 0]
    ratio[np.logical_not(np.isfinite(ratio))] = 0
    matshow(ratio, title="ratio")

    # Load the ground truth
    with open(os.path.join(cache_dir, "ground_truth.pickle"), "rb") as f:
        gt = pickle.load(f)

    emp_covs, n_samples = empirical_covariances(gt["signals"])
    rho = 0.02

    # Estimate first-order sensitivity
    n_samples /= n_samples.sum()
    last_m = d["precisions"]
    last_m_inv = np.empty(last_m.shape)
    for k in range(last_m.shape[-1]):
        last_m_inv[..., k] = np.linalg.inv(last_m[..., k])

    norms = np.sqrt(np.sum(last_m ** 2, axis=-1))
    last_m_normed = np.empty(last_m.shape)
    for k in range(last_m.shape[-1]):
        last_m_normed[..., k] = last_m[..., k] / norms
        # set diagonal to zero
        last_m_normed[..., k].flat[::last_m_normed.shape[0] + 1] = 0

    derivative = (n_samples * (last_m_inv - emp_covs) - rho * last_m_normed)
    derivative[np.logical_not(np.isfinite(derivative))] = 0
    derivative = derivative ** 2

    # Estimate second-order sensitivity
    sens2 = np.empty(last_m.shape)
    for k in range(last_m.shape[-1]):
        sens2[..., k] = n_samples[k] * (
            np.dot(
                np.dot(last_m_inv[..., k],
                       derivative[..., k]),
                last_m_inv[..., k])
            )

    sens2 = np.abs(sens2 / 2.)

    matshow(np.sqrt(derivative[..., l]), title="sensitivity 1")
    matshow(np.sqrt(sens2[..., l]), title="sensitivity 2")
    matshow(np.sqrt(sens2[..., l] + derivative[..., l]),
            title="sensitivity 1+2")
    ## matshow((last_m - mean)[..., l], title="difference with mean")
    ## matshow(topo_count[..., l], title="non-zero count")
    pl.show()
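
For reference, one plausible reading of the first-order term in compute_stats (an interpretation, not stated in the original code): with S_k the empirical covariances, K_k the estimated precisions and w_k the normalized sample counts, the quantity that gets squared matches the gradient of the penalized group-sparse log-likelihood

\[
f(\{K_k\}) = \sum_k w_k \bigl(\log\det K_k - \operatorname{tr}(S_k K_k)\bigr)
             - \rho \sum_{i \neq j} \sqrt{\sum_k K_k[i,j]^2},
\]
\[
\frac{\partial f}{\partial K_k[i,j]}
  = w_k \bigl(K_k^{-1} - S_k\bigr)[i,j]
    - \rho\, \frac{K_k[i,j]}{\sqrt{\sum_l K_l[i,j]^2}}
  \qquad (i \neq j).
\]

Under this reading, the zeroed diagonal of last_m_normed corresponds to the penalty applying only to off-diagonal entries.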