Example #1
    def __init__(self, y):
        # Pre-cache a sparse LU decomposition of the FL matrix
        from pygfl.utils import get_1d_penalty_matrix
        from scipy.sparse.linalg import factorized
        from scipy.sparse import csc_matrix
        import numpy as np
        D = get_1d_penalty_matrix(y.shape[0])
        D = np.vstack([D, np.zeros(y.shape[0])])
        D[-1,-1] = 1e-6 # Small nugget term so the stacked matrix has full rank
        D = csc_matrix(D)
        self.invD = factorized(D)

        # Setup the fast GFL solver
        from pygfl.solver import TrailSolver
        from pygfl.trails import decompose_graph
        from pygfl.utils import hypercube_edges, chains_to_trails
        from networkx import Graph
        edges = hypercube_edges(y.shape)
        g = Graph()
        g.add_edges_from(edges)
        chains = decompose_graph(g, heuristic='greedy')
        ntrails, trails, breakpoints, edges = chains_to_trails(chains)
        self.solver = TrailSolver()
        self.solver.set_data(y, edges, ntrails, trails, breakpoints)

        # Alternatively, pygfl's convenience wrapper runs the whole pipeline in one call
        from pygfl.easy import solve_gfl
        self.beta = solve_gfl(y)
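
All of these examples are built around hypercube_edges from pygfl.utils, which returns the edge list of a grid graph over an array of the given shape, with nodes identified by row-major flat indices. Below is a minimal sketch of the grid-to-trails pipeline the examples share (the exact edge ordering may vary between pygfl versions):

from networkx import Graph
from pygfl.trails import decompose_graph
from pygfl.utils import hypercube_edges, chains_to_trails

# A 1-D chain of 3 nodes connects adjacent flat indices.
print(hypercube_edges((3,)))  # e.g. [(0, 1), (1, 2)]

# On a 2-D grid, each cell is linked to its right and down neighbors.
edges = hypercube_edges((2, 2))

# Decompose the grid graph into trails for the fast GFL solver.
g = Graph()
g.add_edges_from(edges)
chains = decompose_graph(g, heuristic='greedy')
ntrails, trails, breakpoints, edges = chains_to_trails(chains)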
Example #2
import numpy as np
from networkx import Graph
from pygfl.solver import TrailSolver
from pygfl.trails import decompose_graph
from pygfl.utils import hypercube_edges, chains_to_trails

def train_gtv(X, y, q, minlam=0.2, maxlam=10., numlam=30, verbose=1, tf_k=0, penalty='gfl', **kwargs):
    # Note: generate_grid is a helper defined elsewhere in the source module.
    if isinstance(q, int):
        q = (q, q)

    grid = generate_grid(X, q)

    # Divide the space into q^2 bins
    data = np.zeros(q[0]*q[1])
    weights = np.zeros(q[0]*q[1])
    i = 0
    for x1_left, x1_right in zip(grid[0][:-1], grid[0][1:]):
        for x2_left, x2_right in zip(grid[1][:-1], grid[1][1:]):
            vals = np.where((X[:,0] >= x1_left) * (X[:,0] < x1_right) * (X[:,1] >= x2_left) * (X[:,1] < x2_right))[0]
            weights[i] = len(vals)
            data[i] = y[vals].mean() if len(vals) > 0 else 0
            i += 1

    # Get the edges for a 2d grid
    edges = hypercube_edges(q)

    # Set up the graph
    if penalty == 'gfl':
        g = Graph()
        g.add_edges_from(edges)
        chains = decompose_graph(g, heuristic='greedy')
        ntrails, trails, breakpoints, edges = chains_to_trails(chains)
    elif penalty == 'dp' or penalty == 'gamlasso':
        trails = np.array(edges, dtype='int32').flatten()
        breakpoints = np.array(range(2, len(trails)+1, 2), dtype='int32')
        ntrails = len(breakpoints)

    print('\tSetting up trail solver')
    solver = TrailSolver(maxsteps=30000, penalty=penalty)

    # Set the data and pre-cache any necessary structures
    solver.set_data(data, edges, ntrails, trails, breakpoints, weights=weights)

    print('\tSolving')
    # Grid search to find the best lambda
    results = solver.solution_path(minlam, maxlam, numlam, verbose=verbose)
    results['grid'] = grid
    return results
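
A hypothetical call on synthetic scattered 2-D data might look like the following (the data, bin count, and lambda range are purely illustrative, and train_gtv is assumed to be importable along with its module-local generate_grid helper):

import numpy as np

# Synthetic scattered observations over the unit square.
rng = np.random.RandomState(0)
X = rng.uniform(0., 1., size=(500, 2))
y = (X[:, 0] > 0.5).astype(float) + rng.normal(0., 0.1, size=500)

# Bin into a 20x20 grid and sweep 30 lambda values over [0.2, 10].
results = train_gtv(X, y, q=20, minlam=0.2, maxlam=10., numlam=30)
print(sorted(results.keys()))  # 'grid' plus the solution-path output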
Example #3
def smooth_fdr(data,
               fdr_level,
               edges=None,
               initial_values=None,
               verbose=0,
               null_dist=None,
               signal_dist=None,
               num_sweeps=10,
               missing_val=None):
    flat_data = data.flatten()
    nonmissing_flat_data = flat_data

    if edges is None:
        if verbose:
            print(
                'Using default edge set of a grid of same shape as the data: {0}'
                .format(data.shape))
        edges = hypercube_edges(data.shape)
        if missing_val is not None:
            if verbose:
                print(
                    'Removing all data points whose data value is {0}'.format(
                        missing_val))
            edges = [(e1, e2) for (e1, e2) in edges
                     if flat_data[e1] != missing_val
                     and flat_data[e2] != missing_val]
            nonmissing_flat_data = flat_data[flat_data != missing_val]

    # Decompose the graph into trails
    g = Graph()
    g.add_edges_from(edges)
    chains = decompose_graph(g, heuristic='greedy')
    ntrails, trails, breakpoints, edges = chains_to_trails(chains)

    if null_dist is None:
        # empirical null estimation
        mu0, sigma0 = empirical_null(nonmissing_flat_data,
                                     verbose=max(0, verbose - 1))
    elif isinstance(null_dist, GaussianKnown):
        mu0, sigma0 = null_dist.mean, null_dist.stdev
    else:
        mu0, sigma0 = null_dist
    null_dist = GaussianKnown(mu0, sigma0)

    if verbose:
        print('Empirical null: {0}'.format(null_dist))

    # signal distribution estimation
    if verbose:
        print('Running predictive recursion for {0} sweeps'.format(num_sweeps))
    if signal_dist is None:
        grid_x = np.linspace(min(-20,
                                 nonmissing_flat_data.min() - 1),
                             max(nonmissing_flat_data.max() + 1, 20), 220)
        pr_results = predictive_recursion(nonmissing_flat_data,
                                          num_sweeps,
                                          grid_x,
                                          mu0=mu0,
                                          sig0=sigma0)
        signal_dist = GridDistribution(pr_results['grid_x'],
                                       pr_results['y_signal'])

    if verbose:
        print('Smoothing priors via solution path algorithm')

    solver = TrailSolver()
    solver.set_data(flat_data, edges, ntrails, trails, breakpoints)

    results = solution_path_smooth_fdr(flat_data,
                                       solver,
                                       null_dist,
                                       signal_dist,
                                       verbose=max(0, verbose - 1))

    results['discoveries'] = calc_fdr(results['posteriors'], fdr_level)
    results['null_dist'] = null_dist
    results['signal_dist'] = signal_dist

    # Reshape everything back to the original data shape
    results['betas'] = results['betas'].reshape(data.shape)
    results['priors'] = results['priors'].reshape(data.shape)
    results['posteriors'] = results['posteriors'].reshape(data.shape)
    results['discoveries'] = results['discoveries'].reshape(data.shape)
    results['beta_iters'] = np.array(
        [x.reshape(data.shape) for x in results['beta_iters']])
    results['prior_iters'] = np.array(
        [x.reshape(data.shape) for x in results['prior_iters']])
    results['posterior_iters'] = np.array(
        [x.reshape(data.shape) for x in results['posterior_iters']])

    return results
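
A usage sketch on a synthetic z-score grid (the shape, effect size, and FDR level are illustrative; smooth_fdr is assumed to be importable from its defining module):

import numpy as np

# 2-D z-score field: standard-normal background plus one elevated block.
rng = np.random.RandomState(42)
z = rng.normal(0., 1., size=(50, 50))
z[10:20, 10:20] += 3.

# Smooth FDR at a 10% false discovery rate; edges default to the grid graph.
results = smooth_fdr(z, 0.10)
print(results['discoveries'].sum(), 'grid cells flagged as discoveries')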
Example #4
def smooth_fdr_known_dists(data,
                           fdr_level,
                           null_dist,
                           signal_dist,
                           edges=None,
                           initial_values=None,
                           verbose=0,
                           missing_val=None):
    '''FDR smoothing where the null and alternative distributions are known
    (and not necessarily Gaussian). Both must define the function pdf.'''
    flat_data = data.flatten()
    nonmissing_flat_data = flat_data

    if edges is None:
        if verbose:
            print(
                'Using default edge set of a grid of same shape as the data: {0}'
                .format(data.shape))
        edges = hypercube_edges(data.shape)
        if missing_val is not None:
            if verbose:
                print(
                    'Removing all data points whose data value is {0}'.format(
                        missing_val))
            edges = [(e1, e2) for (e1, e2) in edges
                     if flat_data[e1] != missing_val
                     and flat_data[e2] != missing_val]
            nonmissing_flat_data = flat_data[flat_data != missing_val]

    # Decompose the graph into trails
    g = Graph()
    g.add_edges_from(edges)
    chains = decompose_graph(g, heuristic='greedy')
    ntrails, trails, breakpoints, edges = chains_to_trails(chains)

    if verbose:
        print('Smoothing priors via solution path algorithm')

    solver = TrailSolver()
    solver.set_data(flat_data, edges, ntrails, trails, breakpoints)

    results = solution_path_smooth_fdr(flat_data,
                                       solver,
                                       null_dist,
                                       signal_dist,
                                       verbose=max(0, verbose - 1))

    results['discoveries'] = calc_fdr(results['posteriors'], fdr_level)
    results['null_dist'] = null_dist
    results['signal_dist'] = signal_dist

    # Reshape everything back to the original data shape
    results['betas'] = results['betas'].reshape(data.shape)
    results['priors'] = results['priors'].reshape(data.shape)
    results['posteriors'] = results['posteriors'].reshape(data.shape)
    results['discoveries'] = results['discoveries'].reshape(data.shape)
    results['beta_iters'] = np.array(
        [x.reshape(data.shape) for x in results['beta_iters']])
    results['prior_iters'] = np.array(
        [x.reshape(data.shape) for x in results['prior_iters']])
    results['posterior_iters'] = np.array(
        [x.reshape(data.shape) for x in results['posterior_iters']])

    return results
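
Since only a pdf method is required of each distribution, any object exposing one can be passed in. A sketch with a known Gaussian null and a heavy-tailed alternative (the PdfDist wrapper is hypothetical, introduced here only for illustration):

import numpy as np
from scipy.stats import norm, cauchy

class PdfDist:
    # Hypothetical wrapper: anything with a .pdf(x) method is accepted.
    def __init__(self, dist):
        self.dist = dist
    def pdf(self, x):
        return self.dist.pdf(x)

z = np.random.normal(0., 1., size=(30, 30))
results = smooth_fdr_known_dists(z, 0.10, PdfDist(norm(0., 1.)), PdfDist(cauchy(0., 3.)))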
Example #5
def count_plateaus(data):
    # Assumes `data` lies on a 100x100 grid; the edge map encodes its adjacency
    shape = (100, 100)
    edges = edge_map_from_edge_list(hypercube_edges(shape))
    return len(calc_plateaus(data, edges))
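
For instance, a two-level piecewise-constant image should give exactly two plateaus (a sketch under the function's hard-coded 100x100 grid assumption; calc_plateaus is assumed to index the data by flat node id):

import numpy as np

# Two constant regions split down the middle -> two plateaus expected.
img = np.zeros((100, 100))
img[:, 50:] = 1.
print(count_plateaus(img.flatten()))  # expected: 2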
Example #6
        print(dataset)
        for i, (model, skips) in enumerate(zip(models, skiprows)):
            print('\t{0}'.format(model))
            for trial in range(numtrials):
                print('\t\t{0}'.format(trial))
                results[i, j, :2, trial] = np.loadtxt(
                    'data/uci/{0}/results/{1}/{2}.csv'.format(
                        dataset, model, trial),
                    delimiter=',',
                    skiprows=skips)[:2]
                sweep = np.loadtxt('data/uci/{0}/sweeps/{1}/{2}.csv'.format(
                    dataset, model, trial),
                                   delimiter=',',
                                   skiprows=skips)
                shape = (1000, 1000)
                edges = edge_map_from_edge_list(hypercube_edges(shape))
                results[i, j, 2, trial] = len(calc_plateaus(sweep, edges))
                # Information-criterion-style score from the error (index 0)
                # and the plateau count (index 2) computed above.
                results[i, j, 3, trial] = (-0.5 * n * results[i, j, 0, trial]**2
                    + results[i, j, 2, trial] * (np.log(n) - np.log(2 * np.pi)))

    agg_results = results.mean(axis=3)
    agg_std = results.std(axis=3)

    dargs = {}
    for j, (dataset, n) in enumerate(zip(datasets, N)):
        dargs['dataset'] = dataset
        dargs['N'] = n
        print(r'\multicolumn{1}{l}{} & \multicolumn{3}{c}{' +
              '{dataset} (N = {N})'.format(**dargs) + r'} \\')
Example #7
import numpy as np
from pygfl.utils import calc_plateaus, hypercube_edges, edge_map_from_edge_list

if __name__ == '__main__':
    cities = [('austin', 'austin2014', 100), ('chicago', 'chicago2015', 200)]
    for city, _, q in cities:
        models = ['cart', 'crisp', 'gapcrisp', 'gfl']
        names = ['CART', 'CRISP', 'GapCRISP', 'GapTV']
        skiprows = [1, 1, 1, 0]
        numtrials = 20
        data = np.loadtxt('data/crime/{0}/all.csv'.format(city), delimiter=',')
        shape = (q, q)
        N = shape[0] * shape[1]

        # Get all the nodes that exist in the grid
        nodeset = set([i * shape[0] + j for i, j, _ in data])
        edges = hypercube_edges(shape)
        edges = edge_map_from_edge_list(edges)

        results = np.zeros((len(models), 6, numtrials))
        for i, (model, skips) in enumerate(zip(models, skiprows)):
            print('\t{0}'.format(model))
            for trial in range(numtrials):
                results[i, :2, trial] = np.loadtxt(
                    'data/crime/{0}/results/{1}/{2}.csv'.format(
                        city, model, trial),
                    delimiter=',',
                    skiprows=skips)[:2]
                sweep = np.loadtxt('data/crime/{0}/sweeps/{1}/{2}.csv'.format(
                    city, model, trial),
                                   delimiter=',',
                                   skiprows=skips)