def estimate_K(q_hat, R_pow_Y, X):
    # Newton-style iteration, one K per SNP: solve the estimating equation
    # sum_i (2 * K * R^y * q / Z - x) = 0 with Z = 1 - q + K * R^y * q,
    # holding q_hat and R fixed.
    n, p = X.shape
    TOL = 1e-13
    EPS = 1.e-10

    K = np.ones((1, p))
    R_pow_Y_q_hat = R_pow_Y * q_hat

    for iters in range(100):
        K_RtoY_qHat = K * R_pow_Y_q_hat
        Z = 1. - q_hat + K_RtoY_qHat

        s1 = 2. * K_RtoY_qHat
        T1 = np.nansum(s1 / Z - X, axis=0)

        s1 = 2. * (1. - q_hat) * R_pow_Y_q_hat
        T2 = np.nansum(s1 / (Z**2), axis=0)

        K_new = K - T1 / T2
        K_new = np.fmax(EPS, K_new)

        test = np.max(abs(K_new - K))
        K = K_new
        if test < TOL:
            break

    common.print_log("Num K iterations = {0}, err = {1}".format(iters, test))
    return K
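# Usage sketch (not repo code): the null-model fit for K, mirroring the call in
# association_test later in this section. Assumed shapes: X is (n, p) genotypes in
# {0, 1, 2} with np.nan for missing entries, Y is (n, 1) phenotypes, q_hat is the
# (n, p) smoothed allele-frequency estimate, and p is the number of SNPs.
Rn = np.ones((1, p))
Rn_pow_Y = Rn**Y                     # identically 1 under the null hypothesis R = 1
q_hat = estimate_Q(X, kernel)        # kernel from compute_smoothing_kernel(loc, threshold=1e-4)
Kn = estimate_K(q_hat, Rn_pow_Y, X)  # one fitted K per SNP, shape (1, p)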
def simulate_square(allele_freq_fn, n, p, pc, **kwargs):
    loc, Q, X = simulate_localization.simulate_square(allele_freq_fn, n, p, **kwargs)

    pr = p - pc
    common.print_log("Number of non-zero effect size SNPs = {0}".format(pc))
    common.print_log("Total number of SNPs = {0}".format(p))

    null_snps = np.arange(0, pr)
    causal_snps = np.arange(pr, p)

    geno_h = float(kwargs.get("geno_h", 0.90))
    anc_h = float(kwargs.get("anc_h", 0.05))
    env_h = float(kwargs.get("env_h", 0.05))
    is_discrete = "discrete" in kwargs

    common.print_log("Genetic heritability proportion = {0}".format(geno_h))
    common.print_log("Ancestry heritability proportion = {0}".format(anc_h))
    if not is_discrete:
        common.print_log("Environment heritability proportion = {0}".format(env_h))

    # generate genotype contribution to phenotype
    betas = np.zeros(p)
    betas[causal_snps] = np.random.normal(0, 1, pc)
    geno_contribution = np.dot(X, betas)
    geno_contribution = geno_contribution * np.sqrt(geno_h) / np.std(geno_contribution)

    # generate location-dependent (ancestry) contribution to phenotype
    # if alleleFreqFn == simulate.logisticDirectionalExpDecayCovAlleleFreqFn:
    theta = float(kwargs.get("theta", 0.))
    u = np.array([np.cos(theta), np.sin(theta)])
    anc_contribution = np.dot(loc, u)
    anc_contribution = anc_contribution * np.sqrt(anc_h) / np.std(anc_contribution)

    # phenotype is the sum of the genotype and ancestry contributions
    Y = geno_contribution + anc_contribution

    if is_discrete:
        # phenotype is discrete
        prob = 1. / (1. + np.exp(-Y))
        rnd = np.random.rand(n)
        Y = np.array(rnd <= prob, dtype=int)
    else:
        # generate independent env/noise contribution to phenotype for continuous phenotype
        env_contribution = np.random.normal(0, 1, n)
        env_contribution = env_contribution * np.sqrt(env_h) / np.std(env_contribution)
        Y = Y + env_contribution

    if is_discrete:
        return loc, Q, X, Y, null_snps, causal_snps, geno_contribution, anc_contribution
    else:
        return loc, Q, X, Y, null_snps, causal_snps, geno_contribution, anc_contribution, env_contribution
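# Usage sketch (not repo code): a continuous phenotype with 90% genetic, 5% ancestry
# and 5% environmental variance, using the allele-frequency models registered in
# simulate_localization.allele_freq_fns (as the main driver later in this section does).
allele_freq_fn = simulate_localization.allele_freq_fns["isotropic"]
loc, Q, X, Y, null_snps, causal_snps, geno_c, anc_c, env_c = simulate_square(
    allele_freq_fn, n=1000, p=50000, pc=10, geno_h=0.90, anc_h=0.05, env_h=0.05)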
def localize_pca(X, out_prefix, inds_df, inds_training_df, **kwargs):
    n, p = X.shape

    common.print_log()
    common.print_log("Running PCA")

    loc_PCA, variance_explained, reconstruction_proportion = localization.pca(X, **kwargs)

    df_inferred = pd.DataFrame({"ind_id": inds_df.ind_id,
                                "coord1": loc_PCA[:, 0],
                                "coord2": loc_PCA[:, 1]})

    if inds_training_df is not None:
        df_inferred, training_rmse = rescale_locations(df_inferred, inds_training_df)
        common.print_log("RMSE on training data: {0}".format(training_rmse))

    pca_output_path = "{0}.pca".format(out_prefix)
    df_inferred.to_csv(pca_output_path, sep="\t", header=False, index=False,
                       columns=["ind_id", "coord1", "coord2"])
    common.print_log("Wrote PCA inferred locations to {0}".format(pca_output_path))
def get_candidate_taus_above_threshold(Ds, thresh, **kwargs):
    upper_tri_Ds = Ds[np.triu_indices_from(Ds, k=1)]

    if "nz_frac" in kwargs:
        nz_frac = float(kwargs["nz_frac"])
        common.print_log("Setting tau so that fraction of distances below threshold = {0}".format(nz_frac))

        all_taus = np.array(sorted(upper_tri_Ds))
        n_all_taus = len(all_taus)
        idx = min(max(int(nz_frac * n_all_taus), 0), n_all_taus - 1)
        tau = all_taus[idx]
        if tau < thresh:
            common.print_log("Parameter tau was set below the minimum value which makes the graph connected. Changing it to {0}".format(thresh))
            tau = thresh
        candidate_taus = np.array([tau])
    else:
        grid_size = int(kwargs.get("grid_size", 20))
        linspace_tau = bool(kwargs.get("linspace_tau", False))

        if linspace_tau:
            candidate_taus = np.linspace(thresh, np.max(Ds[Ds > 0]), grid_size)
        else:
            all_taus = np.array(sorted(upper_tri_Ds[upper_tri_Ds > thresh]))
            n_all_taus = len(all_taus)
            tau_indices = np.asarray(np.concatenate([np.linspace(0, 1, grid_size)]) * (n_all_taus - 1), dtype=int)
            candidate_taus = sorted(all_taus[tau_indices])

    nz_fracs = [100. * np.sum(upper_tri_Ds <= tau) / len(upper_tri_Ds) for tau in candidate_taus]

    common.print_log("Found {0} candidate thresholds:".format(len(candidate_taus)), candidate_taus)
    common.print_log("Percentage of distances below threshold:", nz_fracs)

    return candidate_taus
def simulate_square(allele_freq_fn, n=1000, p=50000, **kwargs):
    """simulate individual locations in a unit square with coordinates drawn
    independently from a Beta(b, b) distribution, using allele frequencies drawn
    from the stochastic process encoded in function allele_freq_fn
    """
    beta = float(kwargs.get("beta", 1.0))

    common.print_log("Simulating from the unit square, n = {0}, p = {1}".format(n, p))
    common.print_log("Coordinate distribution, beta =", beta)

    loc = np.random.beta(beta, beta, size=(n, 2)) - 0.5
    Q = allele_freq_fn(loc, p, **kwargs)
    X = generate_genotypes(Q)

    return loc, Q, X
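# Usage sketch (not repo code): allele_freq_fns maps the model names used by the
# driver scripts ("isotropic", "directional") to allele-frequency functions.
allele_freq_fn = allele_freq_fns["isotropic"]
loc, Q, X = simulate_square(allele_freq_fn, n=500, p=10000, beta=1.0)
# loc: (500, 2) coordinates in [-0.5, 0.5]^2; Q: allele frequencies; X: genotype matrix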
def mds(dist_mat, verbose=True):
    n = dist_mat.shape[0]

    sparse_graph = csr_matrix(dist_mat)
    n_components, _ = connected_components(sparse_graph)
    if n_components != 1:
        common.print_log("Choose larger threshold tau!")
        # callers unpack two values (locations, reconstruction proportion)
        return None, None

    shortest_path_dist = shortest_path(sparse_graph, method='D')
    if verbose:
        common.print_log("Shortest path distance matrix entries, mean = %f, std dev = %f, max = %f"
                         % (np.mean(shortest_path_dist), np.std(shortest_path_dist), np.max(shortest_path_dist)))

    shortest_path_dist_sq = shortest_path_dist**2

    # double centering of the squared shortest-path distances
    C = np.eye(n) - 1. / n * np.ones((n, n))
    tmp = -0.5 * np.dot(np.dot(C, shortest_path_dist_sq), C)
    tmp = (tmp + tmp.T) / 2.

    S, U = np.linalg.eigh(tmp)
    S_MDS, U_MDS = S[[-1, -2]], U[:, [-1, -2]]
    assert np.all(S_MDS > 0.)

    loc_MDS = np.sqrt(S_MDS) * U_MDS

    if verbose:
        common.print_log("Num positive eigenvalues of MDS matrix =", np.sum(S >= 0.))
        common.print_log("Num negative eigenvalues of MDS matrix =", np.sum(S < 0.))

    reconstruction_proportion = np.sum(S_MDS) / np.sum(np.abs(S))  # (||LD'L||_F / ||LDL||_F)
    if verbose:
        common.print_log("Distance matrix reconstruction proportion = {0}".format(reconstruction_proportion))

    return loc_MDS, reconstruction_proportion
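# Pipeline sketch (not repo code): how the localization pieces in this section fit
# together; compute_genetic_distance is the helper referenced by localize_gap below.
gdm = compute_genetic_distance(X)                          # pairwise genetic distances
thresh = compute_connectivity_threshold(gdm)               # smallest tau keeping the graph connected
taus = get_candidate_taus_above_threshold(gdm, thresh, nz_frac=0.05)
thresholded_gdm = gdm * (gdm <= taus[0])                   # keep only short-range distances
loc_GAP, reconstruction_proportion = mds(thresholded_gdm)  # embed via shortest-path MDS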
def main(argv):
    parser = argparse.ArgumentParser()
    parser.add_argument("-b", "--prefix", required=True,
                        help="genotype file (without extension) in plink bed file format")
    parser.add_argument("-l", "--out_prefix", required=True,
                        help="prefix for file names where localization outputs will be stored")
    parser.add_argument("-t", "--training_file", required=False, default=None,
                        help="file containing a subset of the individuals with known locations")
    parser.add_argument("-c", "--cv_folds", required=False, type=int, default=1,
                        help="number of folds for cross-validation")
    parser.add_argument("args", nargs=argparse.REMAINDER,
                        help="specify either gap or pca (or both) for the localization algorithm to run")
    args = parser.parse_args(argv[1:])

    genotype_files_prefix = args.prefix
    out_prefix = args.out_prefix
    training_file = args.training_file if args.training_file else None
    cv_folds = args.cv_folds
    kwargs = common.make_kwargs(args.args)

    common.print_log(" ".join(argv))
    common.print_log("args: ", args)
    common.print_log("kwargs: ", kwargs)

    fam_file_path = "{0}.fam".format(genotype_files_prefix)
    inds_df = pd.read_table(fam_file_path, header=None, delim_whitespace=True,
                            names=["ind_id"], usecols=[1])

    inds_training_df = None
    if training_file:
        inds_training_df = pd.read_table(training_file, delim_whitespace=True, header=None,
                                         names=["ind_id", "coord1", "coord2"])
        inds_training_df = pd.merge(inds_df, inds_training_df, how="inner", on=["ind_id"])
        cv_folds = min(cv_folds, inds_training_df.shape[0])

    bed_file_path = "{0}.bed".format(genotype_files_prefix)
    X = common.read_bed_file(bed_file_path)
    n, p = X.shape
    assert len(inds_df) == n

    X = np.asarray(X, dtype=float)
    X[(X < 0) | (X > 2)] = np.nan

    common.print_log("Input matrix dimensions, n = {0}, p = {1}".format(n, p))

    if "pca" in kwargs:
        localize_pca(X, out_prefix, inds_df, inds_training_df, **kwargs)

    if "gap" in kwargs:
        localize_gap(X, out_prefix, inds_df, inds_training_df, cv_folds, **kwargs)
def compute_connectivity_threshold(Ds, **kwargs):
    n, _ = Ds.shape
    Ds_nz = Ds[Ds > 0]

    # find the range of values of the distance for which the graph is connected
    lo = np.min(Ds_nz)
    hi = np.max(Ds_nz)
    eps = 0.01 * (hi - lo)

    while (hi - lo) >= eps:
        mid = (lo + hi) / 2.
        sparse_graph = csr_matrix(Ds <= mid)
        n_components, _ = connected_components(sparse_graph)
        if n_components != 1:
            # graph is not connected
            lo = mid
        else:
            hi = mid

    thresh = hi
    common.print_log("Smallest threshold tau which makes graph connected = {0}".format(thresh))
    return thresh
def run_experiment(info, experiments_dir, tmp_data_dir, exp_name):
    to_local_time = lambda sec: time.asctime(time.localtime(sec))
    exp_dir = os.path.join(experiments_dir, exp_name)
    exp_conf = info.exp_config_dir(exp_name)

    # set up a temporary data directory for that experiment
    exp_data_dir = os.path.join(tmp_data_dir, exp_name)
    idemp_mkdir(exp_data_dir)

    # Mark the start and the end of an experiment
    start_time = time.time()
    start_msg = f'Experiment {exp_name} starts @ {to_local_time(start_time)}'
    print_log(start_msg)

    # run the run.sh file on the configs directory and the destination directory
    subprocess.call([os.path.join(exp_dir, 'run.sh'), exp_conf, exp_data_dir],
                    cwd=exp_dir)

    end_time = time.time()
    delta = datetime.timedelta(seconds=end_time - start_time)

    # collect the status file from the destination directory, copy to status dir
    status = validate_status(exp_data_dir)

    # show experiment status to terminal
    if status['success']:
        end_msg = f'Experiment {exp_name} ends @ {to_local_time(end_time)}\nTime Delta: {delta}'
        print_log(end_msg)
    else:
        print_log(f'*** {exp_name} FAILED ***\n*** Reason: {status["message"]} ***')

    # record start & end & duration of an experiment
    status['start_time'] = to_local_time(start_time)
    status['end_time'] = to_local_time(end_time)
    status['time_delta'] = str(delta)

    # not literally copying because validate may have produced a status that generated an error
    info.report_exp_status(exp_name, 'run', status)
    return status['success']
def pca(X, **kwargs):
    col_sums = np.nansum(X, axis=0)
    n_inds = np.sum(~np.isnan(X), axis=0)
    mu_hat = col_sums / n_inds    # dimension p (or n if rowNormalize is true)

    # normalization by estimated std deviation
    sd_hat = (1. + col_sums) / (2. + 2. * n_inds)
    sd_hat = np.sqrt(sd_hat * (1. - sd_hat))

    Xn = X.copy()
    Xn -= mu_hat
    Xn[np.isnan(Xn)] = 0.
    Xn /= sd_hat

    grm = np.dot(Xn, Xn.T)

    eig_indices = kwargs.get("eig_indices", [1, 2])
    eig_indices = np.array(sorted([int(x) for x in eig_indices]))
    common.print_log("Computing principal components:", eig_indices)

    S, U = np.linalg.eigh(grm)
    S_PCA, U_PCA = S[-eig_indices], U[:, -eig_indices]
    assert np.all(S_PCA > 0.)

    loc_PCA = np.sqrt(S_PCA) * U_PCA

    variance_explained = np.sum(S_PCA) / np.trace(grm) * 100.
    reconstruction_proportion = np.sum(S_PCA) / np.sum(np.abs(S))  # (||LD'L||_F / ||LDL||_F)

    common.print_log("Percent variance explained by PCA projection = {0}".format(variance_explained))
    common.print_log("Distance matrix reconstruction proportion = {0}".format(reconstruction_proportion))

    return loc_PCA, variance_explained, reconstruction_proportion
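# Usage sketch (not repo code): top-two principal components as 2-D locations, on a
# genotype matrix whose missing entries have been set to np.nan by the driver scripts.
loc_PCA, variance_explained, reconstruction_proportion = pca(X, eig_indices=[1, 2])
# loc_PCA has shape (n, 2); eig_indices=[1, 2] selects the two largest eigenvalues of the GRM.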
def main(argv):
    parser = argparse.ArgumentParser()
    parser.add_argument("-b", "--out_prefix", required=True,
                        help="file (without extension) to write output")
    parser.add_argument("-f", "--allele_freq_model", required=False, default="isotropic",
                        help="Either 'isotropic' or 'directional'. Default, 'isotropic'")
    parser.add_argument("-n", "--n", required=True, type=int,
                        help="Number of individuals n to simulate")
    parser.add_argument("-p", "--p", required=True, type=int,
                        help="Number of SNPs p to simulate")
    parser.add_argument("--pc", required=False, type=int, default=10,
                        help="Number of SNPs with non-zero effect sizes")
    parser.add_argument("-g", "--geno_h", required=False, type=float, default=0.95,
                        help="Genetic heritability contribution (fraction between 0 and 1)")
    parser.add_argument("-a", "--anc_h", required=False, type=float, default=0.05,
                        help="Ancestry heritability contribution (fraction between 0 and 1)")
    parser.add_argument("-e", "--env_h", required=False, type=float, default=0.05,
                        help="Environment heritability contribution (fraction between 0 and 1)")
    parser.add_argument("args", nargs=argparse.REMAINDER)
    args = parser.parse_args(argv[1:])

    kwargs = common.make_kwargs(args.args)

    common.print_log(" ".join(argv))
    common.print_log("args: ", args)
    common.print_log("kwargs: ", kwargs)

    out_prefix = args.out_prefix
    allele_freq_model_name = args.allele_freq_model
    n = args.n
    p = args.p
    pc = args.pc

    allele_freq_fn = simulate_localization.allele_freq_fns[allele_freq_model_name]

    is_discrete = "discrete" in kwargs
    if is_discrete:
        loc, Q, X, Y, null_snps, causal_snps, geno_contribution, anc_contribution = simulate_square(
            allele_freq_fn, n, p, pc, geno_h=args.geno_h, anc_h=args.anc_h, **kwargs)
    else:
        loc, Q, X, Y, null_snps, causal_snps, geno_contribution, anc_contribution, env_contribution = simulate_square(
            allele_freq_fn, n, p, pc, geno_h=args.geno_h, anc_h=args.anc_h, env_h=args.env_h, **kwargs)

    assert X.shape == (n, p)

    # output true locations to file
    output_locations_file = "{0}.loc".format(out_prefix)
    df = pd.DataFrame(loc)
    df.to_csv(output_locations_file, sep="\t", header=False)
    common.print_log("Wrote ancestry information to {0}".format(output_locations_file))

    # output allele frequencies to file if needed
    if "save_allele_frequencies" in kwargs:
        allele_frequency_file = "{0}.allelefreq.npy".format(out_prefix)
        np.save(allele_frequency_file, Q)
        common.print_log("Wrote allele frequencies to binary file {0}".format(allele_frequency_file))

    # output genotype data to bed/fam/bim file
    bed_file = "{0}.bed".format(out_prefix)
    common.write_bed_file_dims(X, bed_file, n, p)
    common.print_log("Wrote genotypes to file {0}".format(bed_file))

    # output phenotype contribution from different components
    fam_file = "{0}.fam".format(out_prefix)
    pheno_dict = {"fam_id": range(n), "ind_id": range(n), "pat_id": [0] * n,
                  "mat_id": [0] * n, "sex": [0] * n, "phenotype": Y,
                  "geno_contribution": geno_contribution,
                  "anc_contribution": anc_contribution}
    if not is_discrete:
        pheno_dict["env_contribution"] = env_contribution
    pheno_df = pd.DataFrame(pheno_dict)
    if is_discrete:
        pheno_df.to_csv(fam_file, sep="\t", header=False, index=False,
                        columns=["fam_id", "ind_id", "pat_id", "mat_id", "sex", "phenotype",
                                 "geno_contribution", "anc_contribution"])
    else:
        pheno_df.to_csv(fam_file, sep="\t", header=False, index=False,
                        columns=["fam_id", "ind_id", "pat_id", "mat_id", "sex", "phenotype",
                                 "geno_contribution", "anc_contribution", "env_contribution"])
    common.print_log("Wrote phenotype to file {0}".format(fam_file))

    bim_file = "{0}.bim".format(out_prefix)
    # null SNPs are simulated to be on chr 1, causal on chr 2
    null_snps = set(null_snps)
    chrs = [1 if snp_idx in null_snps else 2 for snp_idx in range(p)]
    df = pd.DataFrame({"chr": chrs, "snp": range(p), "dist": range(p), "pos": range(p),
                       "allele1": [0] * p, "allele2": [1] * p})
    df.to_csv(bim_file, sep="\t", header=False, index=False,
              columns=["chr", "snp", "dist", "pos", "allele1", "allele2"])
    common.print_log("Wrote snp list to file {0}".format(bim_file))
def main(argv):
    parser = argparse.ArgumentParser()
    parser.add_argument("-b", "--out_prefix", required=True,
                        help="file (without extension) to write output")
    parser.add_argument("-f", "--allele_freq_model", required=False, default="isotropic",
                        help="Either 'isotropic' or 'directional'. Default, 'isotropic'")
    parser.add_argument("-n", "--n", required=True, type=int,
                        help="Number of individuals n to simulate")
    parser.add_argument("-p", "--p", required=True, type=int,
                        help="Number of SNPs p to simulate")
    parser.add_argument("args", nargs=argparse.REMAINDER)
    args = parser.parse_args(argv[1:])

    kwargs = common.make_kwargs(args.args)

    common.print_log(" ".join(argv))
    common.print_log("args: ", args)
    common.print_log("kwargs: ", kwargs)

    out_prefix = args.out_prefix
    allele_freq_model_name = args.allele_freq_model
    n = args.n
    p = args.p

    allele_freq_fn = allele_freq_fns[allele_freq_model_name]
    loc, Q, X = simulate_square(allele_freq_fn, n, p, **kwargs)
    assert X.shape == (n, p)

    # output true locations to file
    output_locations_file = "{0}.loc".format(out_prefix)
    df = pd.DataFrame(loc)
    df.to_csv(output_locations_file, sep="\t", header=False)
    common.print_log("Wrote ind locations to {0}".format(output_locations_file))

    # output allele frequencies to file if needed
    if "save_allele_frequencies" in kwargs:
        allele_frequency_file = "{0}.allelefreq.npy".format(out_prefix)
        np.save(allele_frequency_file, Q)
        common.print_log("Wrote allele frequencies to binary file {0}".format(allele_frequency_file))

    # output genotype data to bed/fam/bim file
    bed_file = "{0}.bed".format(out_prefix)
    common.write_bed_file_dims(X, bed_file, n, p)
    common.print_log("Wrote genotypes to file {0}".format(bed_file))

    fam_file = "{0}.fam".format(out_prefix)
    df = pd.DataFrame({"fam_id": range(n), "ind_id": range(n), "pat_id": [0] * n,
                       "mat_id": [0] * n, "sex": [0] * n, "status": [0] * n})
    df.to_csv(fam_file, sep="\t", header=False, index=False,
              columns=["fam_id", "ind_id", "pat_id", "mat_id", "sex", "status"])
    common.print_log("Wrote ind list to file {0}".format(fam_file))

    bim_file = "{0}.bim".format(out_prefix)
    df = pd.DataFrame({"chr": [1] * p, "snp": range(p), "dist": range(p), "pos": range(p),
                       "allele1": [0] * p, "allele2": [1] * p})
    df.to_csv(bim_file, sep="\t", header=False, index=False,
              columns=["chr", "snp", "dist", "pos", "allele1", "allele2"])
    common.print_log("Wrote snp list to file {0}".format(bim_file))
def main(argv):
    parser = argparse.ArgumentParser()
    parser.add_argument("-b", "--prefix", required=True,
                        help="genotype file (without extension) in plink bed file format")
    parser.add_argument("-o", "--out_prefix", required=True,
                        help="prefix for association test output file")
    parser.add_argument("-l", "--locations_file", required=True,
                        help="PCA or GAP coordinates that will be used for allele frequency "
                             "estimation and smoothing")
    parser.add_argument("args", nargs=argparse.REMAINDER)
    args = parser.parse_args(argv[1:])

    genotype_files_prefix = args.prefix
    out_prefix = args.out_prefix
    locations_file = args.locations_file
    kwargs = common.make_kwargs(args.args)

    common.print_log(" ".join(argv))
    common.print_log("args: ", args)
    common.print_log("kwargs: ", kwargs)

    bed_file_path = "{0}.bed".format(genotype_files_prefix)
    X = common.read_bed_file(bed_file_path)
    X = np.asarray(X, dtype=float)
    X[(X < 0) | (X > 2)] = np.nan

    fam_file_path = "{0}.fam".format(genotype_files_prefix)
    phenotype_df = pd.read_table(fam_file_path, header=None, delim_whitespace=True,
                                 names=["ind_id", "phenotype"], usecols=[1, 5])
    assert X.shape[0] == len(phenotype_df)

    loc_df = pd.read_table(locations_file, delim_whitespace=True, header=None,
                           names=["ind_id", "coord1", "coord2"])
    loc = np.array(loc_df[["coord1", "coord2"]])
    assert X.shape[0] == loc.shape[0]

    bim_file_path = "{0}.bim".format(genotype_files_prefix)
    snp_df = common.read_bim_file(bim_file_path)
    assert X.shape[1] == len(snp_df)

    n, p = X.shape
    common.print_log("Input genotype matrix dimensions, n = {0}, p = {1}".format(n, p))

    Y = np.array(phenotype_df["phenotype"])

    # remove all inds with nan phenotypes
    non_missing_pheno_inds = ~np.isnan(Y)
    Y = Y[non_missing_pheno_inds]
    X = X[non_missing_pheno_inds, :]
    loc = loc[non_missing_pheno_inds, :]
    common.print_log("Found {0} individuals with phenotype".format(np.sum(non_missing_pheno_inds)))

    llr, K, R = association_test.association_test(X, Y, loc)
    llr[llr < 0] = np.nan
    p_vals = 1. - stats.chi2.cdf(llr, df=1)

    output_df = pd.DataFrame({"snp": snp_df["snp"], "llr": llr, "K": K, "R": R, "p": p_vals})
    output_file_path = "{0}.scgap".format(out_prefix)
    output_df.to_csv(output_file_path, sep="\t", header=False, index=False, na_rep="nan",
                     columns=["snp", "llr", "K", "R", "p"])
    common.print_log("Output of association test written to {0}".format(output_file_path))
def optimize_R_K(q_hat, R, K, X, Y):
    # optimize K and R simultaneously by Newton's method
    n, p = X.shape
    TOL = 1e-6
    EPS = 1.e-10

    for iters in range(100):
        R_pow_Y_q_hat = R**Y * q_hat
        K_R_pow_Y_q_hat = K * R_pow_Y_q_hat
        Z = 1. - q_hat + K_R_pow_Y_q_hat

        s1 = 2. * K_R_pow_Y_q_hat
        s2 = s1 / Z - X
        F1 = np.nansum(s2 * Y, axis=0)
        F2 = np.nansum(s2, axis=0)

        tmp = 2. * R_pow_Y_q_hat * (1. - q_hat) / Z**2
        J22 = np.nansum(tmp, axis=0)

        tmp = Y * tmp
        J12 = np.nansum(tmp, axis=0)

        tmp = K * tmp / R
        J21 = np.nansum(tmp, axis=0)

        tmp = Y * tmp
        J11 = np.nansum(tmp, axis=0)

        det = J11 * J22 - J12 * J21
        R_new = R - (J22 * F1 - J12 * F2) / det
        K_new = K - (-J21 * F1 + J11 * F2) / det

        R_new = np.fmax(EPS, R_new)
        K_new = np.fmax(EPS, K_new)

        test_R = np.max(abs((R_new - R) / np.fmax(R_new, R)))
        test_K = np.max(abs((K_new - K) / np.fmax(K_new, K)))

        R = R_new
        K = K_new

        if iters % 10 == 0:
            common.print_log("Iteration {0} of joint R, K optimization".format(iters))
            common.print_log("Max rel err in R = {0}".format(test_R))
            common.print_log("Max rel err in K = {0}".format(test_K))

        if test_R < TOL and test_K < TOL:
            break

    common.print_log("Num iterations of joint R, K optimization", iters)
    common.print_log("Max rel err in R", test_R)
    common.print_log("Max rel err in K", test_K)

    return R, K
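# For reference (sketch, not repo code): the closed-form update above is the 2x2 Newton
# solve done elementwise; for a single SNP index j it is equivalent to
J = np.array([[J11[j], J12[j]],
              [J21[j], J22[j]]])
F = np.array([F1[j], F2[j]])
dR, dK = np.linalg.solve(J, F)        # same result as the explicit inverse with det = J11*J22 - J12*J21
R_new_j, K_new_j = R[0, j] - dR, K[0, j] - dK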
def localize_gap(X, out_prefix, inds_df, inds_training_df, cv_folds, **kwargs):
    gdm = localization.compute_genetic_distance(X, **kwargs)

    common.print_log()
    common.print_log("Running GAP")

    cand_taus = localization.get_candidate_taus(gdm, **kwargs)
    upper_tri_Ds = gdm[np.triu_indices_from(gdm, k=1)]

    training_df_folds = None
    if inds_training_df is not None:
        training_df_folds = split_folds(inds_training_df, cv_folds)
        common.print_log("Using {0}-fold cross-validation to optimize threshold tau".format(cv_folds))
    else:
        common.print_log("No training data provided. Using distance matrix reconstruction proportion to optimize threshold tau.")

    best_cv_rmse = np.inf
    best_reconstruction_proportion = 0.0

    for tau_idx, tau in enumerate(cand_taus):
        thresholded_gdm = gdm * (gdm <= tau)

        common.print_log()
        common.print_log("tau_idx = {0}, tau = {1}, percentage of distances <= tau = {2}".format(
            tau_idx, tau, 100. * np.sum(upper_tri_Ds <= tau) / len(upper_tri_Ds)))

        loc_GAP, reconstruction_proportion = localization.mds(thresholded_gdm)
        if loc_GAP is None:
            continue

        df_inferred = pd.DataFrame({"ind_id": inds_df.ind_id,
                                    "coord1": loc_GAP[:, 0],
                                    "coord2": loc_GAP[:, 1]})

        if training_df_folds is not None:
            cv_rmse = 0.
            for fold_idx in range(cv_folds):
                _, fold_rmse = rescale_locations(df_inferred, training_df_folds[fold_idx])
                cv_rmse += fold_rmse
            cv_rmse /= cv_folds
            common.print_log("Cross-validation RMSE = {0}".format(cv_rmse))

            if cv_rmse <= best_cv_rmse:
                best_tau_idx, best_tau, best_cv_rmse = tau_idx, tau, cv_rmse
        else:
            if reconstruction_proportion >= best_reconstruction_proportion:
                best_tau_idx, best_tau, best_reconstruction_proportion = tau_idx, tau, reconstruction_proportion
                output_df = df_inferred

    common.print_log()
    common.print_log("Optimal tau_idx = {0}, tau = {1}, percentage of distances <= tau = {2}".format(
        best_tau_idx, best_tau, 100. * np.sum(upper_tri_Ds <= best_tau) / len(upper_tri_Ds)))

    if training_df_folds is not None:
        # compute RMSE on all training data
        thresholded_gdm = gdm * (gdm <= best_tau)
        loc_GAP, reconstruction_proportion = localization.mds(thresholded_gdm, verbose=False)
        df_inferred = pd.DataFrame({"ind_id": inds_df.ind_id,
                                    "coord1": loc_GAP[:, 0],
                                    "coord2": loc_GAP[:, 1]})
        output_df, training_rmse = rescale_locations(df_inferred, inds_training_df)
        common.print_log("Best cross-validation RMSE = {0}".format(best_cv_rmse))
        common.print_log("RMSE on training data = {0}".format(training_rmse))
    else:
        common.print_log("Best reconstruction proportion = {0}".format(best_reconstruction_proportion))

    gap_output_path = "{0}.gap".format(out_prefix)
    output_df.to_csv(gap_output_path, sep="\t", header=False, index=False,
                     columns=["ind_id", "coord1", "coord2"])
    common.print_log()
    common.print_log("Wrote GAP locations to {0}".format(gap_output_path))
def association_test(X, Y, loc):
    """ performs SCGAP association test """
    n, p = X.shape
    Y = Y[:, np.newaxis]

    kernel = compute_smoothing_kernel(loc, threshold=1e-4)

    # estimation under null hypothesis
    Rn = np.ones((1, p))
    Kn = np.ones((1, p))
    Rn_pow_y = Rn**Y

    common.print_log("Null hypothesis optimization")
    q_hat = estimate_Q(X, kernel)
    Kn = estimate_K(q_hat, Rn_pow_y, X)
    loglik_null = likelihood(Rn_pow_y, Kn, X, q_hat)

    # estimation under alternate hypothesis
    max_num_restarts = 5
    R = np.ones((1, p))
    K = np.ones((1, p))

    best_loglik_alt = -np.ones(p) * np.inf
    best_R_alt = np.ones(p)
    best_K_alt = np.ones(p)

    X_cur = X
    q_cur = q_hat
    neg_llr_inds = range(p)
    restart_idx = 0

    while True:
        if restart_idx > 0:
            # random restart
            R = np.random.uniform(1.2**(-restart_idx), 1.2**restart_idx, (1, len(neg_llr_inds)))
            K = np.random.uniform(1.2**(-restart_idx), 1.2**restart_idx, (1, len(neg_llr_inds)))
            X_cur = X[:, neg_llr_inds]
            q_cur = q_hat[:, neg_llr_inds]

        R_pow_Y = R**Y

        common.print_log("Alternate hypothesis optimization")
        R, K = optimize_R_K(q_cur, R, K, X_cur, Y)

        R_pow_Y = R**Y
        loglik_alt = likelihood(R_pow_Y, K, X_cur, q_cur)

        best_loglik_alt[neg_llr_inds] = np.fmax(loglik_alt, best_loglik_alt[neg_llr_inds])
        best_R_alt[neg_llr_inds] = R.flatten()
        best_K_alt[neg_llr_inds] = K.flatten()

        neg_llr_inds, = np.where(best_loglik_alt < loglik_null)

        if len(neg_llr_inds) == 0:
            common.print_log("")
            common.print_log("Alternate hypotheses optimization needed", restart_idx, "restarts")
            break

        if restart_idx == max_num_restarts:
            common.print_log("Terminating restart procedure after", restart_idx, "restarts")
            break

        restart_idx = restart_idx + 1
        common.print_log("")
        common.print_log("Restart idx", restart_idx)
        common.print_log("Num SNPs", len(neg_llr_inds))

    llr = 2. * (best_loglik_alt - loglik_null)
    return llr, best_K_alt, best_R_alt
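# Usage sketch (not repo code): end-to-end invocation as in the driver main above;
# per-SNP p-values come from a chi-squared approximation with one degree of freedom.
llr, K_alt, R_alt = association_test(X, Y, loc)   # loc: (n, 2) PCA or GAP coordinates
llr[llr < 0] = np.nan                             # negative LLRs from failed optimizations are marked missing
p_vals = 1. - stats.chi2.cdf(llr, df=1)           # scipy.stats imported as stats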
import string

# NOTE: truncated fragment -- the enclosing create_random_file(repo) definition, the
# file-name generation, and the open(...) call that binds `f` and `file_name` are
# missing from this excerpt; `random` is imported in the omitted part.
    text = "".join(
        random.choice(string.ascii_letters + string.digits)
        for _ in range(64))
    f.write(text)
    return file_name, file_name


if __name__ == '__main__':
    from common import get_repo, print_log

    repo = get_repo()
    print(repo)
    print()

    print_log()
    print()

    new_file_name = create_random_file(repo)[1]
    message = 'Create: ' + new_file_name
    print(message)

    repo.index.add([new_file_name])
    # # or:
    # repo.index.add(['*'])
    # repo.git.add(new_file_name)
    # repo.git.add('-A')

    repo.index.commit(message)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

__author__ = 'ipetrash'


if __name__ == '__main__':
    from common import print_log
    print_log(reverse=True)