Esempio n. 1
0
def test_genotypes_exact():
    """
    Test whether genotype knockoffs are accurate when the data follow the
    same HMM that the knockoffs are built from.

    Training data are sampled from a randomly generated HMM and written to a
    fastPhase input file. fastPhase is run on that file and the fitted
    parameters are loaded twice: in compact form, to build the genotype
    knockoff generator, and in explicit form, to build an HMM from which
    fresh test data are sampled. Knockoffs of the test data are then checked
    for exchangeability.

    Requires an executable named "fastphase" to be available.
    """
    import os

    p = 10
    K = 3
    M = 3
    n_train = 1000
    n_test = 100000
    pInit, Q, pEmit = generate_HMM(p, K, M)
    modelX = models.HMM(pInit, Q, pEmit)
    X = modelX.sample(n_train)
    fastphase = "fastphase"  # Name of fastPhase executable

    # Keep the input file and every fastPhase output file inside a temporary
    # directory that is removed when the block exits. The previous
    # tempfile.mkstemp() calls discarded the open file descriptors they
    # returned and left all files behind.
    with tempfile.TemporaryDirectory() as tmp_dir:
        Xfp_file = os.path.join(tmp_dir, "X.inp")
        out_path = os.path.join(tmp_dir, "fastphase_out")
        fp.writeXtoInp(X, Xfp_file)
        fp.runFastPhase(Xfp_file, out_path, fastphase=fastphase, K=5, numit=25)

        # Parameter files read back using the output prefix
        r_file = out_path + "_rhat.txt"
        alpha_file = out_path + "_alphahat.txt"
        theta_file = out_path + "_thetahat.txt"
        char_file = out_path + "_origchars"

        # Compact parameters (r, alpha, theta) for the knockoff generator
        hmm = fp.loadHMM(r_file, alpha_file, theta_file, char_file)
        # Explicit parameters (pInit, Q, pEmit) for sampling test data
        hmm_hat = fp.loadHMM(r_file,
                             alpha_file,
                             theta_file,
                             char_file,
                             compact=False)

    knockoffs = knockoffGenotypes(hmm["r"],
                                  hmm["alpha"],
                                  hmm["theta"],
                                  seed=123)
    modelX_hat = models.HMM(hmm_hat["pInit"], hmm_hat["Q"], hmm_hat["pEmit"])
    X_new = modelX_hat.sample(n_test)
    Xk_new = knockoffs.sample(X_new)
    verify_exchangeability(X_new, Xk_new, tolerance=1e-3)
Esempio n. 2
0
def test_haplotypes_fastphase():
    """
    Test whether haplotype knockoffs with HMM fitted by fastPHASE are accurate

    Data are sampled from a randomly generated HMM and written to a fastPhase
    input file as phased data. fastPhase is run on that file, the fitted
    compact parameters are loaded, and knockoffs of the same data are checked
    for exchangeability with a loose tolerance.

    Requires an executable named "fastphase" to be available.
    """
    import os

    p = 10
    K = 3
    M = 2
    n = 1000
    pInit, Q, pEmit = generate_HMM(p, K, M)
    modelX = models.HMM(pInit, Q, pEmit)
    X = modelX.sample(n)
    fastphase = "fastphase"  # Name of fastPhase executable

    # Keep the input file and every fastPhase output file inside a temporary
    # directory that is removed when the block exits. The previous
    # tempfile.mkstemp() calls discarded the open file descriptors they
    # returned and left all files behind.
    with tempfile.TemporaryDirectory() as tmp_dir:
        Xfp_file = os.path.join(tmp_dir, "X.inp")
        out_path = os.path.join(tmp_dir, "fastphase_out")
        fp.writeXtoInp(X, Xfp_file, phased=True)
        fp.runFastPhase(Xfp_file,
                        out_path,
                        fastphase=fastphase,
                        phased=True,
                        K=5,
                        numit=25)

        # Parameter files read back using the output prefix
        r_file = out_path + "_rhat.txt"
        alpha_file = out_path + "_alphahat.txt"
        theta_file = out_path + "_thetahat.txt"
        char_file = out_path + "_origchars"
        hmm = fp.loadHMM(r_file,
                         alpha_file,
                         theta_file,
                         char_file,
                         phased=True)

    knockoffs = knockoffHaplotypes(hmm["r"],
                                   hmm["alpha"],
                                   hmm["theta"],
                                   seed=123)
    Xk = knockoffs.sample(X)
    verify_exchangeability(X, Xk, tolerance=1e-1)
Esempio n. 3
0
def test_haplotypes_hmm():
    """
    Test whether specialized haplotype knockoff algorithm agrees with special case

    The same fastPhase fit is loaded in compact form (r, alpha, theta) and in
    explicit form (pInit, Q, pEmit). Knockoffs of the same data are generated
    with knockoffHaplotypes from the former and with knockoffHMM from the
    latter, using identical groups and seed, and the two outputs must be
    exactly equal.

    Requires an executable named "fastphase" to be available.
    """
    import os

    p = 10
    K = 5
    M = 2
    n_train = 1000
    pInit, Q, pEmit = generate_HMM(p, K, M)
    modelX = models.HMM(pInit, Q, pEmit)
    X = modelX.sample(n_train)
    fastphase = "fastphase"  # Name of fastPhase executable

    # Keep the input file and every fastPhase output file inside a temporary
    # directory that is removed when the block exits. The previous
    # tempfile.mkstemp() calls discarded the open file descriptors they
    # returned and left all files behind.
    with tempfile.TemporaryDirectory() as tmp_dir:
        Xfp_file = os.path.join(tmp_dir, "X.inp")
        out_path = os.path.join(tmp_dir, "fastphase_out")
        fp.writeXtoInp(X, Xfp_file, phased=True)
        fp.runFastPhase(Xfp_file,
                        out_path,
                        fastphase=fastphase,
                        phased=True,
                        K=5,
                        numit=25)

        # Parameter files read back using the output prefix
        r_file = out_path + "_rhat.txt"
        alpha_file = out_path + "_alphahat.txt"
        theta_file = out_path + "_thetahat.txt"
        char_file = out_path + "_origchars"

        # NOTE(review): unlike the explicit load below, the compact load does
        # not pass phased=True. Presumably the flag has no effect on the
        # compact parameters -- confirm against fp.loadHMM.
        hmm_compact = fp.loadHMM(r_file, alpha_file, theta_file, char_file)
        hmm = fp.loadHMM(r_file,
                         alpha_file,
                         theta_file,
                         char_file,
                         compact=False,
                         phased=True)

    # Consecutive groups of three variables, truncated to p labels:
    # [0, 0, 0, 1, 1, 1, 2, 2, 2, 3] for p = 10
    groups = np.repeat(np.arange(p), 3)[:p]

    knockoffs = knockoffHMM(hmm["pInit"],
                            hmm["Q"],
                            hmm["pEmit"],
                            groups=groups,
                            seed=123)
    knockoffs_hap = knockoffHaplotypes(hmm_compact["r"],
                                       hmm_compact["alpha"],
                                       hmm_compact["theta"],
                                       groups=groups,
                                       seed=123)
    Xk = knockoffs.sample(X)
    Xk_compact = knockoffs_hap.sample(X)
    assert np.array_equal(
        Xk, Xk_compact), "Knockoffs with trivial groups do not match"
Esempio n. 4
0
 def _estimate(self, Xfp_file='./X.inp',
               path_to_fp="/home/roquero/Software/fastPHASE",
               out_path="./example"):
     """
     Fit an HMM to self.X with fastPhase and build the knockoff generator.

     Writes self.X to Xfp_file, runs fastPhase with self.hidden_states
     hidden states and self.numit iterations, loads the fitted parameters
     from the files written under the out_path prefix, and stores the
     result in self.hmm and self.knockoffHMM.

     :param Xfp_file: path of the fastPhase input file to write.
     :param path_to_fp: path to the fastPhase executable. The default is the
                        absolute, machine-specific path that was previously
                        hard-coded; pass the local path on other machines.
     :param out_path: prefix of the output files produced by fastPhase.
     """
     fp.writeX(self.X, Xfp_file)
     fp.runFastPhase(path_to_fp,
                     Xfp_file,
                     out_path,
                     K=self.hidden_states,
                     numit=self.numit)

     # Parameter files read back using the output prefix
     r_file = out_path + "_rhat.txt"
     alpha_file = out_path + "_alphahat.txt"
     theta_file = out_path + "_thetahat.txt"

     # NOTE(review): the first row of self.X is passed to loadFit; its role
     # is not visible from here -- confirm against fp.loadFit.
     self.hmm = fp.loadFit(r_file, theta_file, alpha_file, self.X[0, :])
     self.knockoffHMM = knockoffHMM(self.hmm["pInit"], self.hmm["Q"],
                                    self.hmm["pEmit"])
Esempio n. 5
0
def _genotype_count_matrix(df_geno, SNPs, SNP_to_wild_type):
    """Return a (len(df_geno), len(SNPs)) array of non-wild-type counts."""
    X = np.empty((len(df_geno), len(SNPs)))
    for j, SNP in enumerate(SNPs):
        X[:, j] = utils.genotype_to_nonwild_type_count(
            df_geno[SNP].values, SNP_to_wild_type[SNP])
    return X


def make_knockoff(chromosome=None, grouped_by_chromosome=None, df_SNP=None,
                  df_geno_experiment=None, df_geno_ensembl=None,
                  SNP_to_wild_type=None, cache_dir=None, path_to_fp=None,
                  em_iterations=25, random_seed=123, num_hidden_states=12):
    """
    Generate HMM knockoffs for the SNPs of one chromosome.

    The SNPs on the chromosome are ordered by 'chromosome_position'. Both
    genotype tables are converted to matrices of non-wild-type counts. An
    HMM is fitted with fastPhase on the ensembl matrix, unless all expected
    output files are already present in cache_dir. Knockoffs are then
    sampled for the experiment matrix.

    The cache check only tests whether the files exist. Cached results are
    reused even if em_iterations, num_hidden_states or the data differ from
    the run that produced them.

    :param chromosome: chromosome identifier; a key of
                       grouped_by_chromosome.groups, formatted as an integer
                       in file names.
    :param grouped_by_chromosome: grouping whose .groups[chromosome] gives
                                  the positional row indices of df_SNP on
                                  this chromosome.
    :param df_SNP: table with 'SNP' and 'chromosome_position' columns.
    :param df_geno_experiment: genotype table with one column per SNP, for
                               which knockoffs are sampled.
    :param df_geno_ensembl: genotype table with one column per SNP, used to
                            fit the HMM.
    :param SNP_to_wild_type: mapping from SNP name to its wild type.
    :param cache_dir: directory holding the fastPhase input/output files.
    :param path_to_fp: path to the fastPhase executable.
    :param em_iterations: value passed to fastPhase as numit.
    :param random_seed: seed passed to the knockoff generator.
    :param num_hidden_states: value passed to fastPhase as K (default 12,
                              the value that was previously hard-coded).
    :return: tuple (X_knockoffs, X_experiment, SNPs_on_chromosome).
    """
    assert chromosome is not None
    assert grouped_by_chromosome is not None
    assert df_SNP is not None

    logger.debug("################")
    logger.debug("Chromosome %2d #", chromosome)
    logger.debug("################")

    indices = grouped_by_chromosome.groups[chromosome]
    df_SNP_chromo = df_SNP.iloc[indices].sort_values('chromosome_position')
    SNPs_on_chromosome = df_SNP_chromo['SNP'].values

    X_experiment = _genotype_count_matrix(
        df_geno_experiment, SNPs_on_chromosome, SNP_to_wild_type)
    X_ensembl = _genotype_count_matrix(
        df_geno_ensembl, SNPs_on_chromosome, SNP_to_wild_type)

    # Single prefix used for the cache check, the fastPhase run and the
    # reads below, so all three always refer to the same files.
    out_path = os.path.join(cache_dir, 'chrom_%d' % chromosome)

    # If all relevant files are found in cache, skip EM recomputation;
    # otherwise, redo the whole thing.
    target_file_suffix_list = [
        'alphahat.txt', 'finallikelihoods', 'origchars', 'rhat.txt',
        'thetahat.txt']
    already_in_cache = all(
        os.path.exists('%s_%s' % (out_path, suffix))
        for suffix in target_file_suffix_list)

    if already_in_cache:
        logger.debug("Found chrom %d HMM in cache", chromosome)
    else:
        # Write array to file
        Xfp_file = os.path.join(cache_dir, 'X_%d.inp' % chromosome)
        fp.writeX(X_ensembl, Xfp_file)

        # Run fastPhase on data (which runs EM)
        fp.runFastPhase(path_to_fp, Xfp_file, out_path,
                        K=num_hidden_states, numit=em_iterations)

    # Read in fastPhase results (i.e., HMM parameters) from file:
    r_file = out_path + "_rhat.txt"
    alpha_file = out_path + "_alphahat.txt"
    theta_file = out_path + "_thetahat.txt"
    # NOTE(review): the first row of X_ensembl is passed to loadFit; its
    # role is not visible from here -- confirm against fp.loadFit.
    hmm = fp.loadFit(r_file, theta_file, alpha_file, X_ensembl[0, :])

    # Actually produce the knockoffs
    knockoffs = knockoffHMM(hmm["pInit"], hmm["Q"], hmm["pEmit"],
                            seed=random_seed)
    X_knockoffs = knockoffs.sample(X_experiment)

    return X_knockoffs, X_experiment, SNPs_on_chromosome