def __init__(self, y):
    import numpy as np

    # Pre-cache a sparse LU decomposition of the fused lasso (FL) matrix
    from pygfl.utils import get_1d_penalty_matrix
    from scipy.sparse.linalg import factorized
    from scipy.sparse import csc_matrix
    D = get_1d_penalty_matrix(y.shape[0])
    D = np.vstack([D, np.zeros(y.shape[0])])
    D[-1, -1] = 1e-6  # Nugget to make the matrix full rank
    D = csc_matrix(D)
    self.invD = factorized(D)

    # Set up the fast GFL solver: decompose the grid graph into trails
    from pygfl.solver import TrailSolver
    from pygfl.trails import decompose_graph
    from pygfl.utils import hypercube_edges, chains_to_trails
    from networkx import Graph
    edges = hypercube_edges(y.shape)
    g = Graph()
    g.add_edges_from(edges)
    chains = decompose_graph(g, heuristic='greedy')
    ntrails, trails, breakpoints, edges = chains_to_trails(chains)
    self.solver = TrailSolver()
    self.solver.set_data(y, edges, ntrails, trails, breakpoints)

    # Convenience wrapper: solve the GFL directly for an initial estimate
    from pygfl.easy import solve_gfl
    self.beta = solve_gfl(y)
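# Usage sketch for the convenience wrapper above (illustrative only; the
# step signal and noise level are made up, not from the source):
import numpy as np
from pygfl.easy import solve_gfl

y = np.concatenate([np.zeros(50), np.ones(50)]) + np.random.normal(0, 0.1, 100)
beta = solve_gfl(y)  # piecewise-constant fused-lasso estimate of the signal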
def train_gtv(X, y, q, minlam=0.2, maxlam=10., numlam=30, verbose=1,
              tf_k=0, penalty='gfl', **kwargs):
    if isinstance(q, int):
        q = (q, q)

    # Divide the space into a q[0] x q[1] grid of bins and average y in each
    grid = generate_grid(X, q)
    data = np.zeros(q[0] * q[1])
    weights = np.zeros(q[0] * q[1])
    i = 0
    for x1_left, x1_right in zip(grid[0][:-1], grid[0][1:]):
        for x2_left, x2_right in zip(grid[1][:-1], grid[1][1:]):
            vals = np.where((X[:, 0] >= x1_left) & (X[:, 0] < x1_right) &
                            (X[:, 1] >= x2_left) & (X[:, 1] < x2_right))[0]
            weights[i] = len(vals)
            data[i] = y[vals].mean() if len(vals) > 0 else 0
            i += 1

    # Get the edges for a 2d grid
    edges = hypercube_edges(q)

    # Set up the graph
    if penalty == 'gfl':
        g = Graph()
        g.add_edges_from(edges)
        chains = decompose_graph(g, heuristic='greedy')
        ntrails, trails, breakpoints, edges = chains_to_trails(chains)
    elif penalty == 'dp' or penalty == 'gamlasso':
        trails = np.array(edges, dtype='int32').flatten()
        breakpoints = np.array(range(2, len(trails) + 1, 2), dtype='int32')
        ntrails = len(breakpoints)

    print('\tSetting up trail solver')
    solver = TrailSolver(maxsteps=30000, penalty=penalty)

    # Set the data and pre-cache any necessary structures
    solver.set_data(data, edges, ntrails, trails, breakpoints, weights=weights)

    print('\tSolving')
    # Grid search along the solution path to find the best lambda
    results = solver.solution_path(minlam, maxlam, numlam, verbose=verbose)
    results['grid'] = grid
    return results
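# Hypothetical call to train_gtv (the generate_grid helper is assumed to be
# defined elsewhere in this module; the synthetic data is illustrative):
import numpy as np

X = np.random.uniform(0, 1, size=(5000, 2))
y = np.sin(4 * X[:, 0]) + np.cos(4 * X[:, 1]) + np.random.normal(0, 0.1, 5000)
results = train_gtv(X, y, q=20, penalty='gfl')
grid = results['grid']  # the bin boundaries used to discretize the space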
def smooth_fdr(data, fdr_level, edges=None, initial_values=None, verbose=0,
               null_dist=None, signal_dist=None, num_sweeps=10,
               missing_val=None):
    flat_data = data.flatten()
    nonmissing_flat_data = flat_data

    if edges is None:
        if verbose:
            print('Using default edge set of a grid of same shape as the data: {0}'.format(data.shape))
        edges = hypercube_edges(data.shape)
        if missing_val is not None:
            if verbose:
                print('Removing all data points whose data value is {0}'.format(missing_val))
            edges = [(e1, e2) for (e1, e2) in edges
                     if flat_data[e1] != missing_val and flat_data[e2] != missing_val]
            nonmissing_flat_data = flat_data[flat_data != missing_val]

    # Decompose the graph into trails
    g = Graph()
    g.add_edges_from(edges)
    chains = decompose_graph(g, heuristic='greedy')
    ntrails, trails, breakpoints, edges = chains_to_trails(chains)

    if null_dist is None:
        # Empirical null estimation
        mu0, sigma0 = empirical_null(nonmissing_flat_data, verbose=max(0, verbose - 1))
    elif isinstance(null_dist, GaussianKnown):
        mu0, sigma0 = null_dist.mean, null_dist.stdev
    else:
        mu0, sigma0 = null_dist
    null_dist = GaussianKnown(mu0, sigma0)

    if verbose:
        print('Empirical null: {0}'.format(null_dist))

    # Signal distribution estimation via predictive recursion
    if verbose:
        print('Running predictive recursion for {0} sweeps'.format(num_sweeps))
    if signal_dist is None:
        grid_x = np.linspace(min(-20, nonmissing_flat_data.min() - 1),
                             max(nonmissing_flat_data.max() + 1, 20), 220)
        pr_results = predictive_recursion(nonmissing_flat_data, num_sweeps,
                                          grid_x, mu0=mu0, sig0=sigma0)
        signal_dist = GridDistribution(pr_results['grid_x'], pr_results['y_signal'])

    if verbose:
        print('Smoothing priors via solution path algorithm')

    solver = TrailSolver()
    solver.set_data(flat_data, edges, ntrails, trails, breakpoints)

    results = solution_path_smooth_fdr(flat_data, solver, null_dist,
                                       signal_dist, verbose=max(0, verbose - 1))

    results['discoveries'] = calc_fdr(results['posteriors'], fdr_level)
    results['null_dist'] = null_dist
    results['signal_dist'] = signal_dist

    # Reshape everything back to the original data shape
    results['betas'] = results['betas'].reshape(data.shape)
    results['priors'] = results['priors'].reshape(data.shape)
    results['posteriors'] = results['posteriors'].reshape(data.shape)
    results['discoveries'] = results['discoveries'].reshape(data.shape)
    results['beta_iters'] = np.array([x.reshape(data.shape) for x in results['beta_iters']])
    results['prior_iters'] = np.array([x.reshape(data.shape) for x in results['prior_iters']])
    results['posterior_iters'] = np.array([x.reshape(data.shape) for x in results['posterior_iters']])

    return results
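# Illustrative smooth_fdr run on synthetic z-scores: a square of elevated
# signal in a field of nulls (all values made up for the example):
import numpy as np

z = np.random.normal(0, 1, size=(50, 50))
z[10:20, 10:20] += 3.0
results = smooth_fdr(z, fdr_level=0.05, verbose=1)
discoveries = results['discoveries']  # 0/1 discovery map, same shape as z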
def smooth_fdr_known_dists(data, fdr_level, null_dist, signal_dist,
                           edges=None, initial_values=None, verbose=0,
                           missing_val=None):
    '''FDR smoothing where the null and alternative distributions are known
    (and not necessarily Gaussian). Both must define the function pdf.'''
    flat_data = data.flatten()
    nonmissing_flat_data = flat_data

    if edges is None:
        if verbose:
            print('Using default edge set of a grid of same shape as the data: {0}'.format(data.shape))
        edges = hypercube_edges(data.shape)
        if missing_val is not None:
            if verbose:
                print('Removing all data points whose data value is {0}'.format(missing_val))
            edges = [(e1, e2) for (e1, e2) in edges
                     if flat_data[e1] != missing_val and flat_data[e2] != missing_val]
            nonmissing_flat_data = flat_data[flat_data != missing_val]

    # Decompose the graph into trails
    g = Graph()
    g.add_edges_from(edges)
    chains = decompose_graph(g, heuristic='greedy')
    ntrails, trails, breakpoints, edges = chains_to_trails(chains)

    if verbose:
        print('Smoothing priors via solution path algorithm')

    solver = TrailSolver()
    solver.set_data(flat_data, edges, ntrails, trails, breakpoints)

    results = solution_path_smooth_fdr(flat_data, solver, null_dist,
                                       signal_dist, verbose=max(0, verbose - 1))

    results['discoveries'] = calc_fdr(results['posteriors'], fdr_level)
    results['null_dist'] = null_dist
    results['signal_dist'] = signal_dist

    # Reshape everything back to the original data shape
    results['betas'] = results['betas'].reshape(data.shape)
    results['priors'] = results['priors'].reshape(data.shape)
    results['posteriors'] = results['posteriors'].reshape(data.shape)
    results['discoveries'] = results['discoveries'].reshape(data.shape)
    results['beta_iters'] = np.array([x.reshape(data.shape) for x in results['beta_iters']])
    results['prior_iters'] = np.array([x.reshape(data.shape) for x in results['prior_iters']])
    results['posterior_iters'] = np.array([x.reshape(data.shape) for x in results['posterior_iters']])

    return results
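# Since scipy's frozen distributions expose .pdf, they satisfy the contract
# in the docstring above; a sketch with a N(0,1) null and N(3,1) signal:
import numpy as np
from scipy.stats import norm

z = np.random.normal(0, 1, size=(50, 50))
z[10:20, 10:20] += 3.0
results = smooth_fdr_known_dists(z, 0.05, norm(0, 1), norm(3, 1))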
def count_plateaus(data):
    # Assumes data lies on (or has been flattened from) a 100x100 grid
    shape = (100, 100)
    edges = edge_map_from_edge_list(hypercube_edges(shape))
    return len(calc_plateaus(data, edges))
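# Toy check of the plateau count (assumes calc_plateaus treats a plateau as
# a maximal connected region of equal values over the grid edges):
import numpy as np

img = np.zeros((100, 100))
img[:50, :] = 1.0  # two constant halves of the grid
print(count_plateaus(img.flatten()))  # expected: 2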
# (This fragment runs inside a loop over datasets, indexed by j)
print(dataset)
for i, (model, skips) in enumerate(zip(models, skiprows)):
    print('\t{0}'.format(model))
    for trial in range(numtrials):
        print('\t\t{0}'.format(trial))
        results[i, j, :2, trial] = np.loadtxt(
            'data/uci/{0}/results/{1}/{2}.csv'.format(dataset, model, trial),
            delimiter=',', skiprows=skips)[:2]
        sweep = np.loadtxt(
            'data/uci/{0}/sweeps/{1}/{2}.csv'.format(dataset, model, trial),
            delimiter=',', skiprows=skips)
        # Count the plateaus of the fitted surface on the 1000x1000 grid
        shape = (1000, 1000)
        edges = edge_map_from_edge_list(hypercube_edges(shape))
        results[i, j, 2, trial] = len(calc_plateaus(sweep, edges))
        # Score: Gaussian fit term plus a plateau-count complexity term
        results[i, j, 3, trial] = (-0.5 * n * results[i, j, 0, trial]**2 +
                                   results[i, j, 2, trial] * (np.log(n) - np.log(2 * np.pi)))

agg_results = results.mean(axis=3)
agg_std = results.std(axis=3)

# Emit the LaTeX header row for each dataset
dargs = {}
for j, (dataset, n) in enumerate(zip(datasets, N)):
    dargs['dataset'] = dataset
    dargs['N'] = n
    print('''\\multicolumn{1}{l}{} & \\multicolumn{3}{c}{''' +
          '{dataset} (N = {N})'.format(**dargs) +
          '''} \\\\''')
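# Standalone version of the score computed in the loop above, with toy
# numbers (the RMSE and plateau count are made up for illustration):
import numpy as np

n, rmse, num_plateaus = 1000, 0.5, 12
score = -0.5 * n * rmse**2 + num_plateaus * (np.log(n) - np.log(2 * np.pi))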
import numpy as np
from pygfl.utils import calc_plateaus, hypercube_edges, edge_map_from_edge_list

if __name__ == '__main__':
    cities = [('austin', 'austin2014', 100), ('chicago', 'chicago2015', 200)]
    for city, _, q in cities:
        models = ['cart', 'crisp', 'gapcrisp', 'gfl']
        names = ['CART', 'CRISP', 'GapCRISP', 'GapTV']
        skiprows = [1, 1, 1, 0]
        numtrials = 20

        data = np.loadtxt('data/crime/{0}/all.csv'.format(city), delimiter=',')
        shape = (q, q)
        N = shape[0] * shape[1]

        # Get all the nodes that exist in the grid; cast to int since
        # loadtxt returns floats
        nodeset = set([int(i) * shape[0] + int(j) for i, j, _ in data])
        edges = hypercube_edges(shape)
        edges = edge_map_from_edge_list(edges)

        results = np.zeros((len(models), 6, numtrials))
        for i, (model, skips) in enumerate(zip(models, skiprows)):
            print('\t{0}'.format(model))
            for trial in range(numtrials):
                results[i, :2, trial] = np.loadtxt(
                    'data/crime/{0}/results/{1}/{2}.csv'.format(city, model, trial),
                    delimiter=',', skiprows=skips)[:2]
                sweep = np.loadtxt(
                    'data/crime/{0}/sweeps/{1}/{2}.csv'.format(city, model, trial),
                    delimiter=',', skiprows=skips)
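# Flat-index convention assumed by the nodeset above: cell (i, j) on a
# q x q grid maps to i * q + j (toy check with q = 3):
q = 3
assert [r * q + c for (r, c) in [(0, 0), (1, 2), (2, 2)]] == [0, 5, 8]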