Example #1
def estimate_K(q_hat, R_pow_Y, X):
    n, p = X.shape
    TOL = 1e-13
    EPS = 1.e-10

    K = np.ones((1, p))
    R_pow_Y_q_hat = R_pow_Y * q_hat
    for iters in range(100):
        K_RtoY_qHat = K * R_pow_Y_q_hat
        Z = 1. - q_hat + K_RtoY_qHat
        s1 = 2. * K_RtoY_qHat
        T1 = np.nansum(s1/Z - X, axis=0)

        s1 = 2. * (1.-q_hat) * R_pow_Y_q_hat
        T2 = np.nansum(s1/(Z**2), axis=0)

        K_new = K - T1/T2
        K_new = np.fmax(EPS, K_new)
        test = np.max(abs(K_new-K))
        K = K_new
        if test < TOL:
            break

    common.print_log("Num K iterations = {0}, err = {1}".format(iters, test))
    return K
Example #2
def estimate_K(q_hat, R_pow_Y, X):
    n, p = X.shape
    TOL = 1e-13
    EPS = 1.e-10

    K = np.ones((1, p))
    R_pow_Y_q_hat = R_pow_Y * q_hat
    for iters in range(100):
        K_RtoY_qHat = K * R_pow_Y_q_hat
        Z = 1. - q_hat + K_RtoY_qHat
        s1 = 2. * K_RtoY_qHat
        T1 = np.nansum(s1 / Z - X, axis=0)

        s1 = 2. * (1. - q_hat) * R_pow_Y_q_hat
        T2 = np.nansum(s1 / (Z**2), axis=0)

        K_new = K - T1 / T2
        K_new = np.fmax(EPS, K_new)
        test = np.max(abs(K_new - K))
        K = K_new
        if test < TOL:
            break

    common.print_log("Num K iterations = {0}, err = {1}".format(iters, test))
    return K
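
A minimal usage sketch for estimate_K on synthetic data. It assumes numpy is imported as np and that the common.print_log logger used in the snippet is importable; under the null hypothesis R = 1, so R_pow_Y is just a matrix of ones.

import numpy as np

np.random.seed(0)
n, p = 200, 5
q_hat = np.random.uniform(0.1, 0.9, size=(n, p))   # smoothed per-individual allele frequencies
X = np.random.binomial(2, q_hat).astype(float)     # genotype counts in {0, 1, 2}
R_pow_Y = np.ones((n, p))                          # R**Y under the null (R = 1)

K = estimate_K(q_hat, R_pow_Y, X)                  # shape (1, p): one multiplicative factor per SNP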
Example #3
def simulate_square(allele_freq_fn, n, p, pc, **kwargs):
    loc, Q, X = simulate_localization.simulate_square(allele_freq_fn, n, p,
                                                      **kwargs)

    pr = p - pc
    common.print_log("Number of non-zero effect size SNPs = {0}".format(pc))
    common.print_log("Total number of SNPs = {0}".format(p))

    null_snps = np.arange(0, pr)
    causal_snps = np.arange(pr, p)

    geno_h = float(kwargs.get("geno_h", 0.90))
    anc_h = float(kwargs.get("anc_h", 0.05))
    env_h = float(kwargs.get("env_h", 0.05))

    is_discrete = "discrete" in kwargs

    common.print_log("Genetic heritability proportion = {0}".format(geno_h))
    common.print_log("Ancestry heritability proportion = {0}".format(anc_h))
    if not is_discrete:
        common.print_log(
            "Environment heritability proportion = {0}".format(env_h))

    # generate genotype contribution to phenotype
    betas = np.zeros(p)
    betas[causal_snps] = np.random.normal(0, 1, pc)
    geno_contribution = np.dot(X, betas)
    geno_contribution = geno_contribution * np.sqrt(geno_h) / np.std(
        geno_contribution)

    # generate location-dependent contribution to phenotype
    # if alleleFreqFn == simulate.logisticDirectionalExpDecayCovAlleleFreqFn:
    theta = float(kwargs.get("theta", 0.))
    u = np.array([np.cos(theta), np.sin(theta)])
    anc_contribution = np.dot(loc, u)
    anc_contribution = anc_contribution * np.sqrt(anc_h) / np.std(
        anc_contribution)

    # phenotype is the sum of the genotype and ancestry contributions
    Y = geno_contribution + anc_contribution

    if is_discrete:  # phenotype is discrete
        prob = 1. / (1. + np.exp(-Y))
        rnd = np.random.rand(n)
        Y = np.array(rnd <= prob, dtype=int)
    else:
        # generate independent env/noise contribution to phenotype for continuous phenotype
        env_contribution = np.random.normal(0, 1, n)
        env_contribution = env_contribution * np.sqrt(env_h) / np.std(
            env_contribution)
        Y = Y + env_contribution

    if is_discrete:
        return loc, Q, X, Y, null_snps, causal_snps, geno_contribution, anc_contribution
    else:
        return loc, Q, X, Y, null_snps, causal_snps, geno_contribution, anc_contribution, env_contribution
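
A quick check of the variance decomposition used above (numpy assumed): each contribution is rescaled so its standard deviation equals the square root of the requested heritability share, so the empirical variances recover geno_h, anc_h, and env_h by construction.

import numpy as np

rng = np.random.RandomState(4)
geno_h, anc_h, env_h = 0.90, 0.05, 0.05

g = rng.normal(0, 1, 1000); g = g * np.sqrt(geno_h) / np.std(g)
a = rng.normal(0, 1, 1000); a = a * np.sqrt(anc_h) / np.std(a)
e = rng.normal(0, 1, 1000); e = e * np.sqrt(env_h) / np.std(e)

print(np.var(g), np.var(a), np.var(e))   # 0.90, 0.05, 0.05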
Example #4
def simulate_square(allele_freq_fn, n, p, pc, **kwargs):
    loc, Q, X = simulate_localization.simulate_square(allele_freq_fn, n, p, **kwargs)

    pr = p - pc
    common.print_log("Number of non-zero effect size SNPs = {0}".format(pc))
    common.print_log("Total number of SNPs = {0}".format(p))

    null_snps = np.arange(0, pr)
    causal_snps = np.arange(pr, p)

    geno_h = float(kwargs.get("geno_h", 0.90))
    anc_h = float(kwargs.get("anc_h", 0.05))
    env_h = float(kwargs.get("env_h", 0.05))

    is_discrete = "discrete" in kwargs

    common.print_log("Genetic heritability proportion = {0}".format(geno_h))
    common.print_log("Ancestry heritability proportion = {0}".format(anc_h))
    if not is_discrete:
        common.print_log("Environment heritability proportion = {0}".format(env_h))

    # generate genotype contribution to phenotype
    betas = np.zeros(p)
    betas[causal_snps] = np.random.normal(0, 1, pc)
    geno_contribution = np.dot(X, betas)
    geno_contribution = geno_contribution * np.sqrt(geno_h) / np.std(geno_contribution)

    # generate location-dependent contribution to phenotype
    # if alleleFreqFn == simulate.logisticDirectionalExpDecayCovAlleleFreqFn:
    theta = float(kwargs.get("theta", 0.))
    u = np.array([np.cos(theta), np.sin(theta)])
    anc_contribution = np.dot(loc, u)
    anc_contribution = anc_contribution * np.sqrt(anc_h) / np.std(anc_contribution)

    # phenotype is the sum of the genotype and ancestry contributions
    Y = geno_contribution + anc_contribution

    if is_discrete:     # phenotype is discrete
        prob = 1. / (1. + np.exp(- Y))
        rnd = np.random.rand(n)
        Y = np.array(rnd <= prob, dtype=int)
    else:
        # generate independent env/noise contribution to phenotype for continuous phenotype
        env_contribution = np.random.normal(0, 1, n)
        env_contribution = env_contribution * np.sqrt(env_h) / np.std(env_contribution)
        Y = Y + env_contribution

    if is_discrete:
        return loc, Q, X, Y, null_snps, causal_snps, geno_contribution, anc_contribution
    else:
        return loc, Q, X, Y, null_snps, causal_snps, geno_contribution, anc_contribution, env_contribution
Example #5
def localize_pca(X, out_prefix, inds_df, inds_training_df, **kwargs):
    n, p = X.shape

    common.print_log()
    common.print_log("Running PCA")
    loc_PCA, variance_explained, reconstruction_proportion = localization.pca(
        X, **kwargs)

    df_inferred = pd.DataFrame({
        "ind_id": inds_df.ind_id,
        "coord1": loc_PCA[:, 0],
        "coord2": loc_PCA[:, 1]
    })
    if inds_training_df is not None:
        df_inferred, training_rmse = rescale_locations(df_inferred,
                                                       inds_training_df)
        common.print_log("RMSE on training data: {0}".format(training_rmse))

    pca_output_path = "{0}.pca".format(out_prefix)
    df_inferred.to_csv(pca_output_path,
                       sep="\t",
                       header=False,
                       index=False,
                       columns=["ind_id", "coord1", "coord2"])
    common.print_log(
        "Wrote PCA inferred locations to {0}".format(pca_output_path))
Example #6
def get_candidate_taus_above_threshold(Ds, thresh, **kwargs):
    upper_tri_Ds = Ds[np.triu_indices_from(Ds, k=1)]
    if "nz_frac" in kwargs:
        nz_frac = float(kwargs["nz_frac"])
        common.print_log("Setting tau so that fraction of distances below threshold = {0}".format(nz_frac))

        all_taus = np.array(sorted(upper_tri_Ds))
        n_all_taus = len(all_taus)
        idx = min(max(int(nz_frac*n_all_taus), 0), n_all_taus-1)
        tau = all_taus[idx]
        if tau < thresh:
            common.print_log("Parameter tau was set below the minimum value which makes the graph connected. Changing it to {0}".format(thresh))
            tau = thresh
        candidate_taus = np.array([tau])
    else:
        grid_size = int(kwargs.get("grid_size", 20))

        linspace_tau = bool(kwargs.get("linspace_tau", False))
        if linspace_tau:
            candidate_taus = np.linspace(thresh, np.max(Ds[Ds > 0]), grid_size)
        else:
            all_taus = np.array(sorted(upper_tri_Ds[upper_tri_Ds > thresh]))
            n_all_taus = len(all_taus)
            tau_indices = np.asarray(np.concatenate([np.linspace(0, 1, grid_size)]) * (n_all_taus - 1), dtype=int)
            candidate_taus = sorted(all_taus[tau_indices])

    nz_fracs = [100. * np.sum(upper_tri_Ds <= tau) / len(upper_tri_Ds) for tau in candidate_taus]

    common.print_log("Found {0} candidate thresholds:".format(len(candidate_taus)), candidate_taus)
    common.print_log("Percentage of distances below threshold:", nz_fracs)
    return candidate_taus
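
A small usage sketch on a toy Euclidean distance matrix (numpy assumed, plus the common.print_log logger). Passing nz_frac pins a single tau so that roughly that fraction of pairwise distances fall below it; omitting it returns a grid of candidate thresholds.

import numpy as np

rng = np.random.RandomState(0)
pts = rng.rand(30, 2)
Ds = np.sqrt(((pts[:, None, :] - pts[None, :, :]) ** 2).sum(axis=-1))

# any connectivity threshold works for the sketch; here the median pairwise distance
thresh = np.median(Ds[np.triu_indices_from(Ds, k=1)])

single_tau = get_candidate_taus_above_threshold(Ds, thresh, nz_frac=0.6)
tau_grid = get_candidate_taus_above_threshold(Ds, thresh, grid_size=10)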
Example #7
def simulate_square(allele_freq_fn, n=1000, p=50000, **kwargs):
    """simulate individual locations in a unit square with coordinates drawn
    independently from a Beta(b, b) distribution, using allele frequencies drawn from
    the stochastic process encoded in function allele_freq_fn
    """
    beta = float(kwargs.get("beta", 1.0))

    common.print_log("Simulating from the unit square, n = {0}, p = {1}".format(n, p))
    common.print_log("Coordinate distribution, beta =", beta)

    loc = np.random.beta(beta, beta, size=(n, 2)) - 0.5
    Q = allele_freq_fn(loc, p, **kwargs)
    X = generate_genotypes(Q)
    return loc, Q, X
Example #8
def simulate_square(allele_freq_fn, n=1000, p=50000, **kwargs):
    """simulate individual locations in a unit square with coordinates drawn
    independently from a Beta(b, b) distribution, using allele frequencies drawn from
    the stochastic process encoded in function allele_freq_fn
    """
    beta = float(kwargs.get("beta", 1.0))

    common.print_log(
        "Simulating from the unit square, n = {0}, p = {1}".format(n, p))
    common.print_log("Coordinate distribution, beta =", beta)

    loc = np.random.beta(beta, beta, size=(n, 2)) - 0.5
    Q = allele_freq_fn(loc, p, **kwargs)
    X = generate_genotypes(Q)
    return loc, Q, X
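
simulate_square delegates to allele_freq_fn and generate_genotypes, which live elsewhere in the repository. The sketch below is an illustrative stand-in for both (an assumption, not the actual implementations), mirroring the body of the function: locations in the unit square, smoothly varying allele frequencies, and binomially sampled genotypes.

import numpy as np

def toy_allele_freq_fn(loc, p, **kwargs):
    # each SNP's frequency varies smoothly with location through a random linear map
    w = np.random.normal(0., 2., size=(2, p))
    return 1. / (1. + np.exp(-np.dot(loc, w)))     # (n, p) frequencies in (0, 1)

def toy_generate_genotypes(Q):
    # diploid genotype counts X_ij ~ Binomial(2, Q_ij)
    return np.random.binomial(2, Q).astype(float)

n, p = 100, 500
loc = np.random.beta(1.0, 1.0, size=(n, 2)) - 0.5  # same location model as above
Q = toy_allele_freq_fn(loc, p)
X = toy_generate_genotypes(Q)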
Example #9
def mds(dist_mat, verbose=True):
    n = dist_mat.shape[0]

    sparse_graph = csr_matrix(dist_mat)
    n_components, _ = connected_components(sparse_graph)
    if n_components != 1:
        common.print_log("Choose larger threshold tau!")
        return None, None

    shortest_path_dist = shortest_path(sparse_graph, method='D')
    if verbose:
        common.print_log(
            "Shortest path distance matrix entries, mean = %f, std dev = %f, max = %f"
            % (np.mean(shortest_path_dist), np.std(shortest_path_dist),
               np.max(shortest_path_dist)))

    shortest_path_dist_sq = shortest_path_dist**2

    C = np.eye(n) - 1. / n * np.ones((n, n))
    tmp = -0.5 * np.dot(np.dot(C, shortest_path_dist_sq), C)
    tmp = (tmp + tmp.T) / 2.

    S, U = np.linalg.eigh(tmp)
    S_MDS, U_MDS = S[[-1, -2]], U[:, [-1, -2]]
    assert np.all(S_MDS > 0.)

    loc_MDS = np.sqrt(S_MDS) * U_MDS

    if verbose:
        common.print_log("Num positive eigenvalues of MDS matrix =",
                         np.sum(S >= 0.))
        common.print_log("Num negative eigenvalues of MDS matrix =",
                         np.sum(S < 0.))

    reconstruction_proportion = np.sum(S_MDS) / np.sum(np.abs(S))

    if verbose:
        # (||LD'L||_F / ||LDL||_F)
        common.print_log(
            "Distance matrix reconstruction proportion = {0}".format(
                reconstruction_proportion))

    return loc_MDS, reconstruction_proportion
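
A minimal end-to-end sketch of the graph-then-MDS pipeline on synthetic 2-D points (numpy plus the scipy.sparse imports used by mds are assumed): threshold a Euclidean distance matrix at some tau, then embed. Entries above tau are zeroed, which csr_matrix treats as missing edges.

import numpy as np

rng = np.random.RandomState(1)
pts = rng.rand(50, 2)
D = np.sqrt(((pts[:, None, :] - pts[None, :, :]) ** 2).sum(axis=-1))

tau = np.percentile(D[np.triu_indices_from(D, k=1)], 30)   # keep the closest 30% of pairs
loc_mds, recon = mds(D * (D <= tau), verbose=False)
if loc_mds is not None:                                     # None means the graph was disconnected
    print(loc_mds.shape, recon)                             # (50, 2) embedding and its quality score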
Example #10
def main(argv):
    parser = argparse.ArgumentParser()
    parser.add_argument("-b", "--prefix", required=True,
        help="genotype file (without extension) in plink bed file format")
    parser.add_argument("-l", "--out_prefix", required=True,
        help="prefix for file names where localization outputs will be stored")
    parser.add_argument("-t", "--training_file", required=False, default=None,
        help="file containing a subset of the individuals with known locations")
    parser.add_argument("-c", "--cv_folds", required=False, type=int, default=1,
        help="number of folds for cross-validation")
    parser.add_argument("args", nargs=argparse.REMAINDER,
        help="specify either gap or pca (or both) for the localization algorithm to run")

    args = parser.parse_args(argv[1:])

    genotype_files_prefix = args.prefix
    out_prefix = args.out_prefix
    training_file = args.training_file if args.training_file else None
    cv_folds = args.cv_folds

    kwargs = common.make_kwargs(args.args)

    common.print_log(" ".join(argv))
    common.print_log("args: ", args)
    common.print_log("kwargs: ", kwargs)

    fam_file_path = "{0}.fam".format(genotype_files_prefix)
    inds_df = pd.read_table(fam_file_path, header=None, delim_whitespace=True, names=["ind_id"], usecols=[1])

    inds_training_df = None
    if training_file:
        inds_training_df = pd.read_table(training_file, delim_whitespace=True, header=None, names=["ind_id", "coord1", "coord2"])
        inds_training_df = pd.merge(inds_df, inds_training_df, how="inner", on=["ind_id"])
        cv_folds = min(cv_folds, inds_training_df.shape[0])

    bed_file_path = "{0}.bed".format(genotype_files_prefix)
    X = common.read_bed_file(bed_file_path)

    n, p = X.shape
    assert len(inds_df) == n

    X = np.asarray(X, dtype=float)
    X[(X < 0) | (X > 2)] = np.nan

    common.print_log("Input matrix dimensions, n = {0}, p = {1}".format(n, p))

    if "pca" in kwargs:
        localize_pca(X, out_prefix, inds_df, inds_training_df, **kwargs)

    if "gap" in kwargs:
        localize_gap(X, out_prefix, inds_df, inds_training_df, cv_folds, **kwargs)
Example #11
def compute_connectivity_threshold(Ds, **kwargs):
    n, _ = Ds.shape

    Ds_nz = Ds[Ds > 0]
    # find the range of values of the distance for which the graph is connected
    lo = np.min(Ds_nz)
    hi = np.max(Ds_nz)

    eps = 0.01*(hi - lo)
    while (hi - lo) >= eps:
        mid = (lo + hi) / 2.
        sparse_graph = csr_matrix(Ds <= mid)
        n_components, _ = connected_components(sparse_graph)
        if n_components != 1:   # graph is not connected
            lo = mid
        else:
            hi = mid

    thresh = hi
    common.print_log("Smallest threshold tau which makes graph connected = {0}".format(thresh))
    return thresh
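
A quick sketch of the bisection on a toy distance matrix (numpy and the scipy.sparse graph imports assumed): the returned thresh is, up to the eps tolerance, the smallest edge-inclusion radius for which the graph stays connected.

import numpy as np

rng = np.random.RandomState(2)
pts = rng.rand(40, 2)
Ds = np.sqrt(((pts[:, None, :] - pts[None, :, :]) ** 2).sum(axis=-1))

thresh = compute_connectivity_threshold(Ds)
# connecting every pair at distance <= thresh yields a single connected component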
Example #12
def run_experiment(info, experiments_dir, tmp_data_dir, exp_name):

    to_local_time = lambda sec: time.asctime(time.localtime(sec))
    exp_dir = os.path.join(experiments_dir, exp_name)
    exp_conf = info.exp_config_dir(exp_name)

    # set up a temporary data directory for that experiment
    exp_data_dir = os.path.join(tmp_data_dir, exp_name)
    idemp_mkdir(exp_data_dir)

    # Mark the start and the end of an experiment
    start_time = time.time()
    start_msg = f'Experiment {exp_name} starts @ {to_local_time(start_time)}'
    print_log(start_msg)
    # run the run.sh file on the configs directory and the destination directory
    subprocess.call([os.path.join(exp_dir, 'run.sh'), exp_conf, exp_data_dir],
                    cwd=exp_dir)
    end_time = time.time()
    delta = datetime.timedelta(seconds=end_time - start_time)
    # collect the status file from the destination directory, copy to status dir
    status = validate_status(exp_data_dir)
    # show experiment status to terminal
    if status['success']:
        end_msg = f'Experiment {exp_name} ends @ {to_local_time(end_time)}\nTime Delta: {delta}'
        print_log(end_msg)
    else:
        print_log(f'*** {exp_name} FAILED ***\n*** Reason: {status["message"]} ***')
    # record start & end & duration of an experiment
    status['start_time'] = to_local_time(start_time)
    status['end_time'] = to_local_time(end_time)
    status['time_delta'] = str(delta)
    # not literally copying because validate may have produced a status that generated an error
    info.report_exp_status(exp_name, 'run', status)
    return status['success']
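
run_experiment leans on helpers from its surrounding project (idemp_mkdir, validate_status, print_log). A plausible, illustrative stand-in for the idempotent mkdir is shown below; it is an assumption, not the project's actual code.

import os

def idemp_mkdir(path):
    # create the directory if needed; do nothing if it already exists
    os.makedirs(path, exist_ok=True)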
Example #13
def localize_pca(X, out_prefix, inds_df, inds_training_df, **kwargs):
    n, p = X.shape

    common.print_log()
    common.print_log("Running PCA")
    loc_PCA, variance_explained, reconstruction_proportion = localization.pca(X, **kwargs)

    df_inferred = pd.DataFrame({"ind_id": inds_df.ind_id, "coord1": loc_PCA[:, 0], "coord2": loc_PCA[:, 1]})
    if inds_training_df is not None:
        df_inferred, training_rmse = rescale_locations(df_inferred, inds_training_df)
        common.print_log("RMSE on training data: {0}".format(training_rmse))

    pca_output_path = "{0}.pca".format(out_prefix)
    df_inferred.to_csv(pca_output_path, sep="\t", header=False, index=False, columns=["ind_id", "coord1", "coord2"])
    common.print_log("Wrote PCA inferred locations to {0}".format(pca_output_path))
Example #14
def compute_connectivity_threshold(Ds, **kwargs):
    n, _ = Ds.shape

    Ds_nz = Ds[Ds > 0]
    # find the range of values of the distance for which the graph is connected
    lo = np.min(Ds_nz)
    hi = np.max(Ds_nz)

    eps = 0.01 * (hi - lo)
    while (hi - lo) >= eps:
        mid = (lo + hi) / 2.
        sparse_graph = csr_matrix(Ds <= mid)
        n_components, _ = connected_components(sparse_graph)
        if n_components != 1:  # graph is not connected
            lo = mid
        else:
            hi = mid

    thresh = hi
    common.print_log(
        "Smallest threshold tau which makes graph connected = {0}".format(
            thresh))
    return thresh
Example #15
def mds(dist_mat, verbose=True):
    n = dist_mat.shape[0]

    sparse_graph = csr_matrix(dist_mat)
    n_components, _ = connected_components(sparse_graph)
    if n_components != 1:
        common.print_log("Choose larger threshold tau!")
        return None, None

    shortest_path_dist = shortest_path(sparse_graph, method='D')
    if verbose:
        common.print_log("Shortest path distance matrix entries, mean = %f, std dev = %f, max = %f" % (np.mean(shortest_path_dist), np.std(shortest_path_dist), np.max(shortest_path_dist)))

    shortest_path_dist_sq = shortest_path_dist**2

    C = np.eye(n) - 1./n * np.ones((n, n))
    tmp = - 0.5 * np.dot(np.dot(C, shortest_path_dist_sq), C)
    tmp = (tmp + tmp.T) / 2.

    S, U = np.linalg.eigh(tmp)
    S_MDS, U_MDS = S[[-1, -2]], U[:, [-1, -2]]
    assert np.all(S_MDS > 0.)

    loc_MDS = np.sqrt(S_MDS) * U_MDS

    if verbose:
        common.print_log("Num positive eigenvalues of MDS matrix =", np.sum(S >= 0.))
        common.print_log("Num negative eigenvalues of MDS matrix =", np.sum(S < 0.))

    reconstruction_proportion = np.sum(S_MDS) / np.sum(np.abs(S))

    if verbose:
        # (||LD'L||_F / ||LDL||_F)
        common.print_log("Distance matrix reconstruction proportion = {0}".format(reconstruction_proportion))

    return loc_MDS, reconstruction_proportion
Example #16
def get_candidate_taus_above_threshold(Ds, thresh, **kwargs):
    upper_tri_Ds = Ds[np.triu_indices_from(Ds, k=1)]
    if "nz_frac" in kwargs:
        nz_frac = float(kwargs["nz_frac"])
        common.print_log(
            "Setting tau so that fraction of distances below threshold = {0}".
            format(nz_frac))

        all_taus = np.array(sorted(upper_tri_Ds))
        n_all_taus = len(all_taus)
        idx = min(max(int(nz_frac * n_all_taus), 0), n_all_taus - 1)
        tau = all_taus[idx]
        if tau < thresh:
            common.print_log(
                "Parameter tau was set below the minimum value which makes the graph connected. Changing it to {0}"
                .format(thresh))
            tau = thresh
        candidate_taus = np.array([tau])
    else:
        grid_size = int(kwargs.get("grid_size", 20))

        linspace_tau = bool(kwargs.get("linspace_tau", False))
        if linspace_tau:
            candidate_taus = np.linspace(thresh, np.max(Ds[Ds > 0]), grid_size)
        else:
            all_taus = np.array(sorted(upper_tri_Ds[upper_tri_Ds > thresh]))
            n_all_taus = len(all_taus)
            tau_indices = np.asarray(
                np.concatenate([np.linspace(0, 1, grid_size)]) *
                (n_all_taus - 1),
                dtype=int)
            candidate_taus = sorted(all_taus[tau_indices])

    nz_fracs = [
        100. * np.sum(upper_tri_Ds <= tau) / len(upper_tri_Ds)
        for tau in candidate_taus
    ]

    common.print_log(
        "Found {0} candidate thresholds:".format(len(candidate_taus)),
        candidate_taus)
    common.print_log("Percentage of distances below threshold:", nz_fracs)
    return candidate_taus
Example #17
def pca(X, **kwargs):
    col_sums = np.nansum(X, axis=0)
    n_inds = np.sum(~np.isnan(X), axis=0)
    mu_hat = col_sums / n_inds  # dimension p (or n if rowNormalize is true)

    # normalization by estimated std deviation
    sd_hat = (1. + col_sums) / (2. + 2. * n_inds)
    sd_hat = np.sqrt(sd_hat * (1. - sd_hat))

    Xn = X.copy()
    Xn -= mu_hat
    Xn[np.isnan(Xn)] = 0.

    Xn /= sd_hat

    grm = np.dot(Xn, Xn.T)

    eig_indices = kwargs.get("eig_indices", [1, 2])
    eig_indices = np.array(sorted([int(x) for x in eig_indices]))

    common.print_log("Computing principal components:", eig_indices)

    S, U = np.linalg.eigh(grm)
    S_PCA, U_PCA = S[-eig_indices], U[:, -eig_indices]
    assert np.all(S_PCA > 0.)

    loc_PCA = np.sqrt(S_PCA) * U_PCA
    variance_explained = np.sum(S_PCA) / np.trace(grm) * 100.

    reconstruction_proportion = np.sum(S_PCA) / np.sum(np.abs(S))

    # (||LD'L||_F / ||LDL||_F)
    common.print_log(
        "Percent variance explained by PCA projection = {0}".format(
            variance_explained))
    common.print_log("Distance matrix reconstruction proportion = {0}".format(
        reconstruction_proportion))

    return loc_PCA, variance_explained, reconstruction_proportion
Example #18
def pca(X, **kwargs):
    col_sums = np.nansum(X, axis=0)
    n_inds = np.sum(~ np.isnan(X), axis=0)
    mu_hat = col_sums / n_inds      # dimension p (or n if rowNormalize is true)

    # normalization by estimated std deviation
    sd_hat = (1. + col_sums) / (2. + 2.*n_inds)
    sd_hat = np.sqrt(sd_hat * (1. - sd_hat))

    Xn = X.copy()
    Xn -= mu_hat
    Xn[np.isnan(Xn)] = 0.

    Xn /= sd_hat

    grm = np.dot(Xn, Xn.T)

    eig_indices = kwargs.get("eig_indices", [1, 2])
    eig_indices = np.array(sorted([int(x) for x in eig_indices]))

    common.print_log("Computing principal components:", eig_indices)

    S, U = np.linalg.eigh(grm)
    S_PCA, U_PCA = S[-eig_indices], U[:, -eig_indices]
    assert np.all(S_PCA > 0.)

    loc_PCA = np.sqrt(S_PCA) * U_PCA
    variance_explained = np.sum(S_PCA) / np.trace(grm) * 100.

    reconstruction_proportion = np.sum(S_PCA) / np.sum(np.abs(S))

    # (||LD'L||_F / ||LDL||_F)
    common.print_log("Percent variance explained by PCA projection = {0}".format(variance_explained))
    common.print_log("Distance matrix reconstruction proportion = {0}".format(reconstruction_proportion))

    return loc_PCA, variance_explained, reconstruction_proportion
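
A minimal usage sketch of pca on a toy genotype matrix with a small fraction of missing calls (numpy and the common.print_log logger assumed). Missing genotypes are encoded as np.nan, matching the driver scripts in this listing.

import numpy as np

rng = np.random.RandomState(3)
X = rng.binomial(2, 0.3, size=(100, 1000)).astype(float)
X[rng.rand(100, 1000) < 0.01] = np.nan                 # roughly 1% missing genotypes

loc_pca, var_explained, recon = pca(X)                 # top two PCs by default
print(loc_pca.shape, var_explained, recon)             # (100, 2), a percentage, a proportion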
Example #19
def main(argv):
    parser = argparse.ArgumentParser()
    parser.add_argument("-b", "--out_prefix", required=True,
        help="file (without extension) to write output")
    parser.add_argument("-f", "--allele_freq_model", required=False, default="isotropic",
        help="Either 'isotropic' or 'directional'. Default, 'isotropic'")
    parser.add_argument("-n", "--n", required=True, type=int,
        help="Number of individuals n to simulate")
    parser.add_argument("-p", "--p", required=True, type=int,
        help="Number of SNPs p to simulate")
    parser.add_argument("--pc", required=False, type=int, default=10,
        help="Number of SNPs with non-zero effect sizes")
    parser.add_argument("-g", "--geno_h", required=False, type=float, default=0.95,
        help="Genetic heritability contribution (fraction between 0 and 1)")
    parser.add_argument("-a", "--anc_h", required=False, type=float, default=0.05,
        help="Ancestry heritability contribution (fraction between 0 and 1)")
    parser.add_argument("-e", "--env_h", required=False, type=float, default=0.05,
        help="Environment heritability contribution (fraction between 0 and 1)")
    parser.add_argument("args", nargs=argparse.REMAINDER)

    args = parser.parse_args(argv[1:])
    kwargs = common.make_kwargs(args.args)

    common.print_log(" ".join(argv))
    common.print_log("args: ", args)
    common.print_log("kwargs: ", kwargs)

    out_prefix = args.out_prefix
    allele_freq_model_name = args.allele_freq_model
    n = args.n
    p = args.p
    pc = args.pc

    allele_freq_fn = simulate_localization.allele_freq_fns[allele_freq_model_name]

    is_discrete = "discrete" in kwargs

    if is_discrete:
        loc, Q, X, Y, null_snps, causal_snps, geno_contribution, anc_contribution = simulate_square(allele_freq_fn, n, p, pc, geno_h=args.geno_h, anc_h=args.anc_h, **kwargs)
    else:
        loc, Q, X, Y, null_snps, causal_snps, geno_contribution, anc_contribution, env_contribution = simulate_square(allele_freq_fn, n, p, pc, geno_h=args.geno_h, anc_h=args.anc_h, env_h=args.env_h, **kwargs)

    assert X.shape == (n, p)

    # output true locations to file
    output_locations_file = "{0}.loc".format(out_prefix)
    df = pd.DataFrame(loc)
    df.to_csv(output_locations_file, sep="\t", header=False)
    common.print_log("Wrote ancestry information to {0}".format(output_locations_file))

    # output allele frequencies to file if needed
    if "save_allele_frequencies" in kwargs:
        allele_frequency_file = "{0}.allelefreq.npy".format(out_prefix)
        np.save(allele_frequency_file, Q)
        common.print_log("Wrote allele frequencies to binary file {0}".format(allele_frequency_file))

    # output genotype data to bed/fam/bim file
    bed_file = "{0}.bed".format(out_prefix)
    common.write_bed_file_dims(X, bed_file, n, p)
    common.print_log("Wrote genotypes to file {0}".format(bed_file))

    # output phenotype contribution from different components
    fam_file = "{0}.fam".format(out_prefix)
    pheno_dict = {"fam_id": range(n), "ind_id": range(n), "pat_id": [0]*n, "mat_id": [0]*n, "sex": [0]*n, "phenotype": Y, "geno_contribution": geno_contribution, "anc_contribution": anc_contribution}
    if not is_discrete:
        pheno_dict["env_contribution"] = env_contribution
    pheno_df = pd.DataFrame(pheno_dict)
    if is_discrete:
        pheno_df.to_csv(fam_file, sep="\t", header=False, index=False, columns=["fam_id", "ind_id", "pat_id", "mat_id", "sex", "phenotype", "geno_contribution", "anc_contribution"])
    else:
        pheno_df.to_csv(fam_file, sep="\t", header=False, index=False, columns=["fam_id", "ind_id", "pat_id", "mat_id", "sex", "phenotype", "geno_contribution", "anc_contribution", "env_contribution"])
    common.print_log("Wrote phenotype to file {0}".format(fam_file))

    bim_file = "{0}.bim".format(out_prefix)
    # null SNPs are simulated to be on chr 1, causal on chr 2
    null_snps = set(null_snps)
    chrs = [1 if snp_idx in null_snps else 2 for snp_idx in range(p)]
    df = pd.DataFrame({"chr": chrs, "snp": range(p), "dist": range(p), "pos": range(p), "allele1": [0]*p, "allele2": [1]*p})
    df.to_csv(bim_file, sep="\t", header=False, index=False, columns=["chr", "snp", "dist", "pos", "allele1", "allele2"])
    common.print_log("Wrote snp list to file {0}".format(bim_file))
Example #20
def main(argv):
    parser = argparse.ArgumentParser()
    parser.add_argument("-b", "--out_prefix", required=True,
        help="file (without extension) to write output")
    parser.add_argument("-f", "--allele_freq_model", required=False, default="isotropic",
        help="Either 'isotropic' or 'directional'. Default, 'isotropic'")
    parser.add_argument("-n", "--n", required=True, type=int,
        help="Number of individuals n to simulate")
    parser.add_argument("-p", "--p", required=True, type=int,
        help="Number of SNPs p to simulate")
    parser.add_argument("args", nargs=argparse.REMAINDER)

    args = parser.parse_args(argv[1:])
    kwargs = common.make_kwargs(args.args)

    common.print_log(" ".join(argv))
    common.print_log("args: ", args)
    common.print_log("kwargs: ", kwargs)

    out_prefix = args.out_prefix
    allele_freq_model_name = args.allele_freq_model
    n = args.n
    p = args.p

    allele_freq_fn = allele_freq_fns[allele_freq_model_name]

    loc, Q, X = simulate_square(allele_freq_fn, n, p, **kwargs)
    assert X.shape == (n, p)

    # output true locations to file
    output_locations_file = "{0}.loc".format(out_prefix)
    df = pd.DataFrame(loc)
    df.to_csv(output_locations_file, sep="\t", header=False)
    common.print_log("Wrote ind locations to {0}".format(output_locations_file))

    # output allele frequencies to file if needed
    if "save_allele_frequencies" in kwargs:
        allele_frequency_file = "{0}.allelefreq.npy".format(out_prefix)
        np.save(allele_frequency_file, Q)
        common.print_log("Wrote allele frequencies to binary file {0}".format(allele_frequency_file))

    # output genotype data to bed/fam/bim file
    bed_file = "{0}.bed".format(out_prefix)
    common.write_bed_file_dims(X, bed_file, n, p)
    common.print_log("Wrote genotypes to file {0}".format(bed_file))

    fam_file = "{0}.fam".format(out_prefix)
    df = pd.DataFrame({"fam_id": range(n), "ind_id": range(n), "pat_id": [0]*n, "mat_id": [0]*n, "sex": [0]*n, "status": [0]*n})
    df.to_csv(fam_file, sep="\t", header=False, index=False, columns=["fam_id", "ind_id", "pat_id", "mat_id", "sex", "status"])
    common.print_log("Wrote ind list to file {0}".format(fam_file))

    bim_file = "{0}.bim".format(out_prefix)
    df = pd.DataFrame({"chr": [1]*p, "snp": range(p), "dist": range(p), "pos": range(p), "allele1": [0]*p, "allele2": [1]*p})
    df.to_csv(bim_file, sep="\t", header=False, index=False, columns=["chr", "snp", "dist", "pos", "allele1", "allele2"])
    common.print_log("Wrote snp list to file {0}".format(bim_file))
Example #21
def main(argv):
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-b",
        "--prefix",
        required=True,
        help="genotype file (without extension) in plink bed file format")
    parser.add_argument("-o",
                        "--out_prefix",
                        required=True,
                        help="prefix for association test output file")
    parser.add_argument(
        '-l',
        '--locations_file',
        required=True,
        help="PCA or GAP coordinates that will be used for allele frequency \
             estimation and smoothing")
    parser.add_argument('args', nargs=argparse.REMAINDER)
    args = parser.parse_args(argv[1:])

    genotype_files_prefix = args.prefix
    out_prefix = args.out_prefix
    locations_file = args.locations_file

    kwargs = common.make_kwargs(args.args)

    common.print_log(" ".join(argv))
    common.print_log("args: ", args)
    common.print_log("kwargs: ", kwargs)

    bed_file_path = "{0}.bed".format(genotype_files_prefix)
    X = common.read_bed_file(bed_file_path)
    X = np.asarray(X, dtype=float)
    X[(X < 0) | (X > 2)] = np.nan

    fam_file_path = "{0}.fam".format(genotype_files_prefix)
    phenotype_df = pd.read_table(fam_file_path,
                                 header=None,
                                 delim_whitespace=True,
                                 names=["ind_id", "phenotype"],
                                 usecols=[1, 5])
    assert X.shape[0] == len(phenotype_df)

    loc_df = pd.read_table(locations_file,
                           delim_whitespace=True,
                           header=None,
                           names=["ind_id", "coord1", "coord2"])
    loc = np.array(loc_df[["coord1", "coord2"]])
    assert X.shape[0] == loc.shape[0]

    bim_file_path = "{0}.bim".format(genotype_files_prefix)
    snp_df = common.read_bim_file(bim_file_path)
    assert X.shape[1] == len(snp_df)

    n, p = X.shape
    common.print_log(
        "Input genotype matrix dimensions, n = {0}, p = {1}".format(n, p))

    Y = np.array(phenotype_df["phenotype"])

    # remove all inds with nan phenotypes
    non_missing_pheno_inds = ~np.isnan(Y)
    Y = Y[non_missing_pheno_inds]
    X = X[non_missing_pheno_inds, :]
    loc = loc[non_missing_pheno_inds, :]
    common.print_log("Found {0} individuals with phenotype".format(
        np.sum(non_missing_pheno_inds)))

    llr, K, R = association_test.association_test(X, Y, loc)
    llr[llr < 0] = np.nan
    p_vals = 1. - stats.chi2.cdf(llr, df=1)
    output_df = pd.DataFrame({
        "snp": snp_df["snp"],
        "llr": llr,
        "K": K,
        "R": R,
        "p": p_vals
    })

    output_file_path = "{0}.scgap".format(out_prefix)
    output_df.to_csv(output_file_path,
                     sep="\t",
                     header=False,
                     index=False,
                     na_rep="nan",
                     columns=["snp", "llr", "K", "R", "p"])
    common.print_log(
        "Output of association test written to {0}".format(output_file_path))
Example #22
def optimize_R_K(q_hat, R, K, X, Y):
    # optimize K and R simultaneously by Newton's method
    n, p = X.shape
    TOL = 1e-6
    EPS = 1.e-10

    for iters in range(100):
        R_pow_Y_q_hat = R**Y * q_hat
        K_R_pow_Y_q_hat = K * R_pow_Y_q_hat

        Z = 1. - q_hat + K_R_pow_Y_q_hat
        s1 = 2. * K_R_pow_Y_q_hat
        s2 = s1 / Z - X
        F1 = np.nansum(s2 * Y, axis=0)
        F2 = np.nansum(s2, axis=0)

        tmp = 2. * R_pow_Y_q_hat * (1. - q_hat) / Z**2
        J22 = np.nansum(tmp, axis=0)

        tmp = Y * tmp
        J12 = np.nansum(tmp, axis=0)

        tmp = K * tmp / R
        J21 = np.nansum(tmp, axis=0)

        tmp = Y * tmp
        J11 = np.nansum(tmp, axis=0)

        det = J11 * J22 - J12 * J21
        R_new = R - (J22 * F1 - J12 * F2) / det
        K_new = K - (-J21 * F1 + J11 * F2) / det

        R_new = np.fmax(EPS, R_new)
        K_new = np.fmax(EPS, K_new)

        test_R = np.max(abs((R_new - R) / np.fmax(R_new, R)))
        test_K = np.max(abs((K_new - K) / np.fmax(K_new, K)))

        R = R_new
        K = K_new

        if iters % 10 == 0:
            common.print_log(
                "Iteration {0} of joint R, K optimization".format(iters))
            common.print_log("Max rel err in R = {0}".format(test_R))
            common.print_log("Max rel err in K = {0}".format(test_K))

        if test_R < TOL and test_K < TOL:
            break

    common.print_log("Num iterations of joint R, K optimization", iters)
    common.print_log("Max rel err in R", test_R)
    common.print_log("Max rel err in K", test_K)
    return R, K
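
The R/K update above is a per-SNP 2x2 Newton step written in closed form via Cramer's rule. A self-contained check with arbitrary numbers, confirming it matches np.linalg.solve applied to the Jacobian:

import numpy as np

J11, J12, J21, J22 = 4.0, 1.5, 0.5, 3.0    # Jacobian entries for one SNP
F1, F2 = 0.8, -0.2                         # gradient entries for one SNP

det = J11 * J22 - J12 * J21
closed_form = np.array([(J22 * F1 - J12 * F2) / det,
                        (-J21 * F1 + J11 * F2) / det])
solved = np.linalg.solve(np.array([[J11, J12], [J21, J22]]),
                         np.array([F1, F2]))
print(np.allclose(closed_form, solved))    # True: same (R, K) update direction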
Example #23
def main(argv):
    parser = argparse.ArgumentParser()
    parser.add_argument("-b",
                        "--out_prefix",
                        required=True,
                        help="file (without extension) to write output")
    parser.add_argument(
        "-f",
        "--allele_freq_model",
        required=False,
        default="isotropic",
        help="Either 'isotropic' or 'directional'. Default, 'isotropic'")
    parser.add_argument("-n",
                        "--n",
                        required=True,
                        type=int,
                        help="Number of individuals n to simulate")
    parser.add_argument("-p",
                        "--p",
                        required=True,
                        type=int,
                        help="Number of SNPs p to simulate")
    parser.add_argument("args", nargs=argparse.REMAINDER)

    args = parser.parse_args(argv[1:])
    kwargs = common.make_kwargs(args.args)

    common.print_log(" ".join(argv))
    common.print_log("args: ", args)
    common.print_log("kwargs: ", kwargs)

    out_prefix = args.out_prefix
    allele_freq_model_name = args.allele_freq_model
    n = args.n
    p = args.p

    allele_freq_fn = allele_freq_fns[allele_freq_model_name]

    loc, Q, X = simulate_square(allele_freq_fn, n, p, **kwargs)
    assert X.shape == (n, p)

    # output true locations to file
    output_locations_file = "{0}.loc".format(out_prefix)
    df = pd.DataFrame(loc)
    df.to_csv(output_locations_file, sep="\t", header=False)
    common.print_log(
        "Wrote ind locations to {0}".format(output_locations_file))

    # output allele frequencies to file if needed
    if "save_allele_frequencies" in kwargs:
        allele_frequency_file = "{0}.allelefreq.npy".format(out_prefix)
        np.save(allele_frequency_file, Q)
        common.print_log("Wrote allele frequencies to binary file {0}".format(
            allele_frequency_file))

    # output genotype data to bed/fam/bim file
    bed_file = "{0}.bed".format(out_prefix)
    common.write_bed_file_dims(X, bed_file, n, p)
    common.print_log("Wrote genotypes to file {0}".format(bed_file))

    fam_file = "{0}.fam".format(out_prefix)
    df = pd.DataFrame({
        "fam_id": range(n),
        "ind_id": range(n),
        "pat_id": [0] * n,
        "mat_id": [0] * n,
        "sex": [0] * n,
        "status": [0] * n
    })
    df.to_csv(
        fam_file,
        sep="\t",
        header=False,
        index=False,
        columns=["fam_id", "ind_id", "pat_id", "mat_id", "sex", "status"])
    common.print_log("Wrote ind list to file {0}".format(fam_file))

    bim_file = "{0}.bim".format(out_prefix)
    df = pd.DataFrame({
        "chr": [1] * p,
        "snp": range(p),
        "dist": range(p),
        "pos": range(p),
        "allele1": [0] * p,
        "allele2": [1] * p
    })
    df.to_csv(bim_file,
              sep="\t",
              header=False,
              index=False,
              columns=["chr", "snp", "dist", "pos", "allele1", "allele2"])
    common.print_log("Wrote snp list to file {0}".format(bim_file))
Example #24
def localize_gap(X, out_prefix, inds_df, inds_training_df, cv_folds, **kwargs):
    gdm = localization.compute_genetic_distance(X, **kwargs)

    common.print_log()
    common.print_log("Running GAP")
    cand_taus = localization.get_candidate_taus(gdm, **kwargs)

    upper_tri_Ds = gdm[np.triu_indices_from(gdm, k=1)]

    training_df_folds = None
    if inds_training_df is not None:
        training_df_folds = split_folds(inds_training_df, cv_folds)
        common.print_log(
            "Using {0}-fold cross-validation to optimize threshold tau".format(
                cv_folds))
    else:
        common.print_log(
            "No training data provided. Using distance matrix reconstruction proportion to optimize threshold tau."
        )

    best_cv_rmse = np.inf
    best_reconstruction_proportion = 0.0
    for tau_idx, tau in enumerate(cand_taus):
        thresholded_gdm = gdm * (gdm <= tau)
        common.print_log()
        common.print_log(
            "tau_idx = {0}, tau = {1}, percentage of distances <= tau = {2}".
            format(tau_idx, tau,
                   100. * np.sum(upper_tri_Ds <= tau) / len(upper_tri_Ds)))

        loc_GAP, reconstruction_proportion = localization.mds(thresholded_gdm)
        if loc_GAP is None:
            continue

        df_inferred = pd.DataFrame({
            "ind_id": inds_df.ind_id,
            "coord1": loc_GAP[:, 0],
            "coord2": loc_GAP[:, 1]
        })
        if training_df_folds is not None:
            cv_rmse = 0.
            for fold_idx in range(cv_folds):
                _, fold_rmse = rescale_locations(df_inferred,
                                                 training_df_folds[fold_idx])
                cv_rmse += fold_rmse
            cv_rmse /= cv_folds
            common.print_log("Cross-validation RMSE = {0}".format(cv_rmse))

            if cv_rmse <= best_cv_rmse:
                best_tau_idx, best_tau, best_cv_rmse = tau_idx, tau, cv_rmse
        else:
            if reconstruction_proportion >= best_reconstruction_proportion:
                best_tau_idx, best_tau, best_reconstruction_proportion = tau_idx, tau, reconstruction_proportion
                output_df = df_inferred

    common.print_log()
    common.print_log(
        "Optimal tau_idx = {0}, tau = {1}, percentage of distances <= tau = {2}"
        .format(best_tau_idx, best_tau,
                100. * np.sum(upper_tri_Ds <= best_tau) / len(upper_tri_Ds)))

    if training_df_folds is not None:
        # compute RMSE on all training data
        thresholded_gdm = gdm * (gdm <= best_tau)
        loc_GAP, reconstruction_proportion = localization.mds(thresholded_gdm,
                                                              verbose=False)
        df_inferred = pd.DataFrame({
            "ind_id": inds_df.ind_id,
            "coord1": loc_GAP[:, 0],
            "coord2": loc_GAP[:, 1]
        })
        output_df, training_rmse = rescale_locations(df_inferred,
                                                     inds_training_df)

        common.print_log(
            "Best cross-validation RMSE = {0}".format(best_cv_rmse))
        common.print_log("RMSE on training data = {0}".format(training_rmse))
    else:
        common.print_log("Best reconstruction proportion = {0}".format(
            best_reconstruction_proportion))

    gap_output_path = "{0}.gap".format(out_prefix)
    output_df.to_csv(gap_output_path,
                     sep="\t",
                     header=False,
                     index=False,
                     columns=["ind_id", "coord1", "coord2"])
    common.print_log()
    common.print_log("Wrote GAP locations to {0}".format(gap_output_path))
Example #25
def association_test(X, Y, loc):
    """ performs SCGAP association test
    """
    n, p = X.shape
    Y = Y[:, np.newaxis]

    kernel = compute_smoothing_kernel(loc, threshold=1e-4)

    # estimation under null hypothesis
    Rn = np.ones((1, p))
    Kn = np.ones((1, p))
    Rn_pow_y = Rn**Y

    common.print_log("Null hypothesis optimization")
    q_hat = estimate_Q(X, kernel)
    Kn = estimate_K(q_hat, Rn_pow_y, X)

    loglik_null = likelihood(Rn_pow_y, Kn, X, q_hat)

    # estimation under alternate hypothesis
    max_num_restarts = 5
    R = np.ones((1, p))
    K = np.ones((1, p))
    best_loglik_alt = - np.ones(p) * np.inf
    best_R_alt = np.ones(p)
    best_K_alt = np.ones(p)

    X_cur = X
    q_cur = q_hat
    neg_llr_inds = range(p)
    restart_idx = 0
    while True:
        if restart_idx > 0:     # random restart
            R = np.random.uniform(1.2**(-restart_idx), 1.2**restart_idx, (1, len(neg_llr_inds)))
            K = np.random.uniform(1.2**(-restart_idx), 1.2**restart_idx, (1, len(neg_llr_inds)))
            X_cur = X[:, neg_llr_inds]
            q_cur = q_hat[:, neg_llr_inds]

        R_pow_Y = R**Y
        common.print_log("Alternate hypothesis optimization")
        R, K = optimize_R_K(q_cur, R, K, X_cur, Y)

        R_pow_Y = R**Y
        loglik_alt = likelihood(R_pow_Y, K, X_cur, q_cur)
        best_loglik_alt[neg_llr_inds] = np.fmax(loglik_alt, best_loglik_alt[neg_llr_inds])
        best_R_alt[neg_llr_inds] = R.flatten()
        best_K_alt[neg_llr_inds] = K.flatten()

        neg_llr_inds, = np.where(best_loglik_alt < loglik_null)
        if len(neg_llr_inds) == 0:
            common.print_log("")
            common.print_log("Alternate hypotheses optimization needed", restart_idx, "restarts")
            break

        if restart_idx == max_num_restarts:
            common.print_log("Terminating restart procedure after", restart_idx, "restarts")
            break

        restart_idx = restart_idx + 1

        common.print_log("")
        common.print_log("Restart idx", restart_idx)
        common.print_log("Num SNPs", len(neg_llr_inds))

    llr = 2.*(best_loglik_alt - loglik_null)

    return llr, best_K_alt, best_R_alt
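
The returned llr values are scored against a 1-degree-of-freedom chi-squared distribution to obtain per-SNP p-values, as the driver scripts in this listing do. A minimal sketch (scipy assumed); chi2.sf is equivalent to 1 - cdf but more accurate in the tail.

import numpy as np
from scipy import stats

llr = np.array([0.5, 3.84, 10.8])
p_vals = stats.chi2.sf(llr, df=1)
print(p_vals)    # roughly [0.48, 0.05, 0.001]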
Example #26
def main(argv):
    parser = argparse.ArgumentParser()
    parser.add_argument("-b", "--prefix", required=True,
            help="genotype file (without extension) in plink bed file format")
    parser.add_argument("-o", "--out_prefix", required=True,
        help="prefix for association test output file")
    parser.add_argument('-l', '--locations_file', required=True,
        help="PCA or GAP coordinates that will be used for allele frequency \
             estimation and smoothing")
    parser.add_argument('args', nargs=argparse.REMAINDER)
    args = parser.parse_args(argv[1:])

    genotype_files_prefix = args.prefix
    out_prefix = args.out_prefix
    locations_file = args.locations_file

    kwargs = common.make_kwargs(args.args)

    common.print_log(" ".join(argv))
    common.print_log("args: ", args)
    common.print_log("kwargs: ", kwargs)

    bed_file_path = "{0}.bed".format(genotype_files_prefix)
    X = common.read_bed_file(bed_file_path)
    X = np.asarray(X, dtype=float)
    X[(X < 0) | (X > 2)] = np.nan

    fam_file_path = "{0}.fam".format(genotype_files_prefix)
    phenotype_df = pd.read_table(fam_file_path, header=None, delim_whitespace=True, names=["ind_id", "phenotype"], usecols=[1, 5])
    assert X.shape[0] == len(phenotype_df)

    loc_df = pd.read_table(locations_file, delim_whitespace=True, header=None, names=["ind_id", "coord1", "coord2"])
    loc = np.array(loc_df[["coord1", "coord2"]])
    assert X.shape[0] == loc.shape[0]

    bim_file_path = "{0}.bim".format(genotype_files_prefix)
    snp_df = common.read_bim_file(bim_file_path)
    assert X.shape[1] == len(snp_df)

    n, p = X.shape
    common.print_log("Input genotype matrix dimensions, n = {0}, p = {1}".format(n, p))

    Y = np.array(phenotype_df["phenotype"])

    # remove all inds with nan phenotypes
    non_missing_pheno_inds = ~np.isnan(Y)
    Y = Y[non_missing_pheno_inds]
    X = X[non_missing_pheno_inds, :]
    loc = loc[non_missing_pheno_inds, :]
    common.print_log("Found {0} individuals with phenotype".format(np.sum(non_missing_pheno_inds)))

    llr, K, R = association_test.association_test(X, Y, loc)
    llr[llr < 0] = np.nan
    p_vals = 1. - stats.chi2.cdf(llr, df=1)
    output_df = pd.DataFrame({"snp": snp_df["snp"], "llr": llr, "K": K, "R": R, "p": p_vals})

    output_file_path = "{0}.scgap".format(out_prefix)
    output_df.to_csv(output_file_path, sep="\t", header=False, index=False, na_rep="nan", columns=["snp", "llr", "K", "R", "p"])
    common.print_log("Output of association test written to {0}".format(output_file_path))
Example #27

def create_random_file(repo):
    # NOTE: the signature, file name, and open() call here are assumptions,
    # reconstructed for illustration around the random-text body below.
    import os, random, string

    file_name = 'file_{}.txt'.format(random.randrange(10 ** 6))
    with open(os.path.join(repo.working_tree_dir, file_name), 'w') as f:
        text = "".join(
            random.choice(string.ascii_letters + string.digits)
            for _ in range(64))
        f.write(text)

    return file_name, file_name


if __name__ == '__main__':
    from common import get_repo, print_log
    repo = get_repo()
    print(repo)
    print()

    print_log()

    print()

    new_file_name = create_random_file(repo)[1]
    message = 'Create: ' + new_file_name
    print(message)

    repo.index.add([new_file_name])
    # # or:
    # repo.index.add(['*'])
    # repo.git.add(new_file_name)
    # repo.git.add('-A')

    repo.index.commit(message)
Example #28
def main(argv):
    parser = argparse.ArgumentParser()
    parser.add_argument("-b",
                        "--out_prefix",
                        required=True,
                        help="file (without extension) to write output")
    parser.add_argument(
        "-f",
        "--allele_freq_model",
        required=False,
        default="isotropic",
        help="Either 'isotropic' or 'directional'. Default, 'isotropic'")
    parser.add_argument("-n",
                        "--n",
                        required=True,
                        type=int,
                        help="Number of individuals n to simulate")
    parser.add_argument("-p",
                        "--p",
                        required=True,
                        type=int,
                        help="Number of SNPs p to simulate")
    parser.add_argument("--pc",
                        required=False,
                        type=int,
                        default=10,
                        help="Number of SNPs with non-zero effect sizes")
    parser.add_argument(
        "-g",
        "--geno_h",
        required=False,
        type=float,
        default=0.95,
        help="Genetic heritability contribution (fraction between 0 and 1)")
    parser.add_argument(
        "-a",
        "--anc_h",
        required=False,
        type=float,
        default=0.05,
        help="Ancestry heritability contribution (fraction between 0 and 1)")
    parser.add_argument(
        "-e",
        "--env_h",
        required=False,
        type=float,
        default=0.05,
        help="Environment heritability contribution (fraction between 0 and 1)"
    )
    parser.add_argument("args", nargs=argparse.REMAINDER)

    args = parser.parse_args(argv[1:])
    kwargs = common.make_kwargs(args.args)

    common.print_log(" ".join(argv))
    common.print_log("args: ", args)
    common.print_log("kwargs: ", kwargs)

    out_prefix = args.out_prefix
    allele_freq_model_name = args.allele_freq_model
    n = args.n
    p = args.p
    pc = args.pc

    allele_freq_fn = simulate_localization.allele_freq_fns[
        allele_freq_model_name]

    is_discrete = "discrete" in kwargs

    if is_discrete:
        loc, Q, X, Y, null_snps, causal_snps, geno_contribution, anc_contribution = simulate_square(
            allele_freq_fn,
            n,
            p,
            pc,
            geno_h=args.geno_h,
            anc_h=args.anc_h,
            **kwargs)
    else:
        loc, Q, X, Y, null_snps, causal_snps, geno_contribution, anc_contribution, env_contribution = simulate_square(
            allele_freq_fn,
            n,
            p,
            pc,
            geno_h=args.geno_h,
            anc_h=args.anc_h,
            env_h=args.env_h,
            **kwargs)

    assert X.shape == (n, p)

    # output true locations to file
    output_locations_file = "{0}.loc".format(out_prefix)
    df = pd.DataFrame(loc)
    df.to_csv(output_locations_file, sep="\t", header=False)
    common.print_log(
        "Wrote ancestry information to {0}".format(output_locations_file))

    # output allele frequencies to file if needed
    if "save_allele_frequencies" in kwargs:
        allele_frequency_file = "{0}.allelefreq.npy".format(out_prefix)
        np.save(allele_frequency_file, Q)
        common.print_log("Wrote allele frequencies to binary file {0}".format(
            allele_frequency_file))

    # output genotype data to bed/fam/bim file
    bed_file = "{0}.bed".format(out_prefix)
    common.write_bed_file_dims(X, bed_file, n, p)
    common.print_log("Wrote genotypes to file {0}".format(bed_file))

    # output phenotype contribution from different components
    fam_file = "{0}.fam".format(out_prefix)
    pheno_dict = {
        "fam_id": range(n),
        "ind_id": range(n),
        "pat_id": [0] * n,
        "mat_id": [0] * n,
        "sex": [0] * n,
        "phenotype": Y,
        "geno_contribution": geno_contribution,
        "anc_contribution": anc_contribution
    }
    if not is_discrete:
        pheno_dict["env_contribution"] = env_contribution
    pheno_df = pd.DataFrame(pheno_dict)
    if is_discrete:
        pheno_df.to_csv(fam_file,
                        sep="\t",
                        header=False,
                        index=False,
                        columns=[
                            "fam_id", "ind_id", "pat_id", "mat_id", "sex",
                            "phenotype", "geno_contribution",
                            "anc_contribution"
                        ])
    else:
        pheno_df.to_csv(fam_file,
                        sep="\t",
                        header=False,
                        index=False,
                        columns=[
                            "fam_id", "ind_id", "pat_id", "mat_id", "sex",
                            "phenotype", "geno_contribution",
                            "anc_contribution", "env_contribution"
                        ])
    common.print_log("Wrote phenotype to file {0}".format(fam_file))

    bim_file = "{0}.bim".format(out_prefix)
    # null SNPs are simulated to be on chr 1, causal on chr 2
    null_snps = set(null_snps)
    chrs = [1 if snp_idx in null_snps else 2 for snp_idx in range(p)]
    df = pd.DataFrame({
        "chr": chrs,
        "snp": range(p),
        "dist": range(p),
        "pos": range(p),
        "allele1": [0] * p,
        "allele2": [1] * p
    })
    df.to_csv(bim_file,
              sep="\t",
              header=False,
              index=False,
              columns=["chr", "snp", "dist", "pos", "allele1", "allele2"])
    common.print_log("Wrote snp list to file {0}".format(bim_file))
Example #29
def main(argv):
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-b",
        "--prefix",
        required=True,
        help="genotype file (without extension) in plink bed file format")
    parser.add_argument(
        "-l",
        "--out_prefix",
        required=True,
        help="prefix for file names where localization outputs will be stored")
    parser.add_argument(
        "-t",
        "--training_file",
        required=False,
        default=None,
        help="file containing a subset of the individuals with known locations"
    )
    parser.add_argument("-c",
                        "--cv_folds",
                        required=False,
                        type=int,
                        default=1,
                        help="number of folds for cross-validation")
    parser.add_argument(
        "args",
        nargs=argparse.REMAINDER,
        help=
        "specify either gap or pca (or both) for the localization algorithm to run"
    )

    args = parser.parse_args(argv[1:])

    genotype_files_prefix = args.prefix
    out_prefix = args.out_prefix
    training_file = args.training_file if args.training_file else None
    cv_folds = args.cv_folds

    kwargs = common.make_kwargs(args.args)

    common.print_log(" ".join(argv))
    common.print_log("args: ", args)
    common.print_log("kwargs: ", kwargs)

    fam_file_path = "{0}.fam".format(genotype_files_prefix)
    inds_df = pd.read_table(fam_file_path,
                            header=None,
                            delim_whitespace=True,
                            names=["ind_id"],
                            usecols=[1])

    inds_training_df = None
    if training_file:
        inds_training_df = pd.read_table(training_file,
                                         delim_whitespace=True,
                                         header=None,
                                         names=["ind_id", "coord1", "coord2"])
        inds_training_df = pd.merge(inds_df,
                                    inds_training_df,
                                    how="inner",
                                    on=["ind_id"])
        cv_folds = min(cv_folds, inds_training_df.shape[0])

    bed_file_path = "{0}.bed".format(genotype_files_prefix)
    X = common.read_bed_file(bed_file_path)

    n, p = X.shape
    assert len(inds_df) == n

    X = np.asarray(X, dtype=float)
    X[(X < 0) | (X > 2)] = np.nan

    common.print_log("Input matrix dimensions, n = {0}, p = {1}".format(n, p))

    if "pca" in kwargs:
        localize_pca(X, out_prefix, inds_df, inds_training_df, **kwargs)

    if "gap" in kwargs:
        localize_gap(X, out_prefix, inds_df, inds_training_df, cv_folds,
                     **kwargs)
Example #30
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

__author__ = 'ipetrash'

if __name__ == '__main__':
    from common import print_log
    print_log(reverse=True)
Example #31
def optimize_R_K(q_hat, R, K, X, Y):
    # optimize K and R simultaneously by Newton's method
    n, p = X.shape
    TOL = 1e-6
    EPS = 1.e-10

    for iters in range(100):
        R_pow_Y_q_hat = R**Y * q_hat
        K_R_pow_Y_q_hat = K * R_pow_Y_q_hat

        Z = 1. - q_hat + K_R_pow_Y_q_hat
        s1 = 2. * K_R_pow_Y_q_hat
        s2 = s1/Z - X
        F1 = np.nansum(s2 * Y, axis=0)
        F2 = np.nansum(s2, axis=0)

        tmp = 2. * R_pow_Y_q_hat * (1. - q_hat) / Z**2
        J22 = np.nansum(tmp, axis=0)
        
        tmp = Y * tmp
        J12 = np.nansum(tmp, axis=0)

        tmp = K * tmp / R
        J21 = np.nansum(tmp, axis=0)

        tmp = Y * tmp
        J11 = np.nansum(tmp, axis=0)
        
        det = J11 * J22 - J12 * J21
        R_new = R - (J22 * F1 - J12 * F2) / det
        K_new = K - (- J21 * F1 + J11 * F2) / det

        R_new = np.fmax(EPS, R_new)
        K_new = np.fmax(EPS, K_new)

        test_R = np.max(abs((R_new - R) / np.fmax(R_new, R)))
        test_K = np.max(abs((K_new - K) / np.fmax(K_new, K)))
        
        R = R_new
        K = K_new

        if iters % 10 == 0:
            common.print_log("Iteration {0} of joint R, K optimization".format(iters))
            common.print_log("Max rel err in R = {0}".format(test_R))
            common.print_log("Max rel err in K = {0}".format(test_K))

        if test_R < TOL and test_K < TOL:
            break

    common.print_log("Num iterations of joint R, K optimization", iters)
    common.print_log("Max rel err in R", test_R)
    common.print_log("Max rel err in K", test_K)
    return R, K
Example #32
def localize_gap(X, out_prefix, inds_df, inds_training_df, cv_folds, **kwargs):
    gdm = localization.compute_genetic_distance(X, **kwargs)

    common.print_log()
    common.print_log("Running GAP")
    cand_taus = localization.get_candidate_taus(gdm, **kwargs)

    upper_tri_Ds = gdm[np.triu_indices_from(gdm, k=1)]

    training_df_folds = None
    if inds_training_df is not None:
        training_df_folds = split_folds(inds_training_df, cv_folds)
        common.print_log("Using {0}-fold cross-validation to optimize threshold tau".format(cv_folds))
    else:
        common.print_log("No training data provided. Using distance matrix reconstruction proportion to optimize threshold tau.")

    best_cv_rmse = np.inf
    best_reconstruction_proportion = 0.0
    for tau_idx, tau in enumerate(cand_taus):
        thresholded_gdm = gdm * (gdm <= tau)
        common.print_log()
        common.print_log("tau_idx = {0}, tau = {1}, percentage of distances <= tau = {2}".format(tau_idx, tau, 100.*np.sum(upper_tri_Ds <= tau)/len(upper_tri_Ds)))

        loc_GAP, reconstruction_proportion = localization.mds(thresholded_gdm)
        if loc_GAP is None:
            continue

        df_inferred = pd.DataFrame({"ind_id": inds_df.ind_id, "coord1": loc_GAP[:, 0], "coord2": loc_GAP[:, 1]})
        if training_df_folds is not None:
            cv_rmse = 0.
            for fold_idx in range(cv_folds):
                _, fold_rmse = rescale_locations(df_inferred, training_df_folds[fold_idx])
                cv_rmse += fold_rmse
            cv_rmse /= cv_folds
            common.print_log("Cross-validation RMSE = {0}".format(cv_rmse))

            if cv_rmse <= best_cv_rmse:
                best_tau_idx, best_tau, best_cv_rmse = tau_idx, tau, cv_rmse
        else:
            if reconstruction_proportion >= best_reconstruction_proportion:
                best_tau_idx, best_tau, best_reconstruction_proportion = tau_idx, tau, reconstruction_proportion
                output_df = df_inferred

    common.print_log()
    common.print_log("Optimal tau_idx = {0}, tau = {1}, percentage of distances <= tau = {2}".format 
        (best_tau_idx, best_tau, 100.*np.sum(upper_tri_Ds <= best_tau)/len(upper_tri_Ds)))

    if training_df_folds is not None:
        # compute RMSE on all training data
        thresholded_gdm = gdm * (gdm <= best_tau)
        loc_GAP, reconstruction_proportion = localization.mds(thresholded_gdm, verbose=False)
        df_inferred = pd.DataFrame({"ind_id": inds_df.ind_id, "coord1": loc_GAP[:, 0], "coord2": loc_GAP[:, 1]})
        output_df, training_rmse = rescale_locations(df_inferred, inds_training_df)

        common.print_log("Best cross-validation RMSE = {0}".format(best_cv_rmse))
        common.print_log("RMSE on training data = {0}".format(training_rmse))
    else:
        common.print_log("Best reconstruction proportion = {0}".format(best_reconstruction_proportion))

    gap_output_path = "{0}.gap".format(out_prefix)
    output_df.to_csv(gap_output_path, sep="\t", header=False, index=False, columns=["ind_id", "coord1", "coord2"])
    common.print_log()
    common.print_log("Wrote GAP locations to {0}".format(gap_output_path))
Example #33
def association_test(X, Y, loc):
    """ performs SCGAP association test
    """
    n, p = X.shape
    Y = Y[:, np.newaxis]

    kernel = compute_smoothing_kernel(loc, threshold=1e-4)

    # estimation under null hypothesis
    Rn = np.ones((1, p))
    Kn = np.ones((1, p))
    Rn_pow_y = Rn**Y

    common.print_log("Null hypothesis optimization")
    q_hat = estimate_Q(X, kernel)
    Kn = estimate_K(q_hat, Rn_pow_y, X)

    loglik_null = likelihood(Rn_pow_y, Kn, X, q_hat)

    # estimation under alternate hypothesis
    max_num_restarts = 5
    R = np.ones((1, p))
    K = np.ones((1, p))
    best_loglik_alt = -np.ones(p) * np.inf
    best_R_alt = np.ones(p)
    best_K_alt = np.ones(p)

    X_cur = X
    q_cur = q_hat
    neg_llr_inds = range(p)
    restart_idx = 0
    while True:
        if restart_idx > 0:  # random restart
            R = np.random.uniform(1.2**(-restart_idx), 1.2**restart_idx,
                                  (1, len(neg_llr_inds)))
            K = np.random.uniform(1.2**(-restart_idx), 1.2**restart_idx,
                                  (1, len(neg_llr_inds)))
            X_cur = X[:, neg_llr_inds]
            q_cur = q_hat[:, neg_llr_inds]

        R_pow_Y = R**Y
        common.print_log("Alternate hypothesis optimization")
        R, K = optimize_R_K(q_cur, R, K, X_cur, Y)

        R_pow_Y = R**Y
        loglik_alt = likelihood(R_pow_Y, K, X_cur, q_cur)
        best_loglik_alt[neg_llr_inds] = np.fmax(loglik_alt,
                                                best_loglik_alt[neg_llr_inds])
        best_R_alt[neg_llr_inds] = R.flatten()
        best_K_alt[neg_llr_inds] = K.flatten()

        neg_llr_inds, = np.where(best_loglik_alt < loglik_null)
        if len(neg_llr_inds) == 0:
            common.print_log("")
            common.print_log("Alternate hypotheses optimization needed",
                             restart_idx, "restarts")
            break

        if restart_idx == max_num_restarts:
            common.print_log("Terminating restart procedure after",
                             restart_idx, "restarts")
            break

        restart_idx = restart_idx + 1

        common.print_log("")
        common.print_log("Restart idx", restart_idx)
        common.print_log("Num SNPs", len(neg_llr_inds))

    llr = 2. * (best_loglik_alt - loglik_null)

    return llr, best_K_alt, best_R_alt