def test_genotypes_exact():
    """
    Test whether genotype knockoffs with true HMM are accurate.

    Training genotypes are sampled from the HMM returned by generate_HMM and
    passed to fastPHASE. The knockoffs are then checked on fresh samples drawn
    from the *fitted* HMM, so the model given to the knockoff generator is the
    same model the test data come from.
    """
    import os  # local import: only needed to release the mkstemp descriptors

    p = 10
    K = 3
    M = 3
    n_train = 1000
    n_test = 100000

    # Sample the training data
    pInit, Q, pEmit = generate_HMM(p, K, M)
    modelX = models.HMM(pInit, Q, pEmit)
    X = modelX.sample(n_train)

    # mkstemp() hands back an already-open OS-level descriptor together with
    # the path. Only the path is used here, so close the descriptor at once
    # instead of leaking it for the lifetime of the test process.
    inp_fd, Xfp_file = tempfile.mkstemp()
    os.close(inp_fd)
    fp.writeXtoInp(X, Xfp_file)

    fastphase = "fastphase"  # Name of fastPhase executable
    out_fd, out_path = tempfile.mkstemp()
    os.close(out_fd)
    fp.runFastPhase(Xfp_file, out_path, fastphase=fastphase, K=5, numit=25)

    # Files read back from fastPhase, all sharing the out_path prefix
    r_file = out_path + "_rhat.txt"
    alpha_file = out_path + "_alphahat.txt"
    theta_file = out_path + "_thetahat.txt"
    char_file = out_path + "_origchars"

    # Knockoff generator built from the (r, alpha, theta) parameters
    hmm = fp.loadHMM(r_file, alpha_file, theta_file, char_file)
    knockoffs = knockoffGenotypes(hmm["r"], hmm["alpha"], hmm["theta"], seed=123)

    # Same fit loaded with compact=False to obtain (pInit, Q, pEmit), from
    # which the test samples are drawn
    hmm_hat = fp.loadHMM(r_file, alpha_file, theta_file, char_file, compact=False)
    modelX_hat = models.HMM(hmm_hat["pInit"], hmm_hat["Q"], hmm_hat["pEmit"])
    X_new = modelX_hat.sample(n_test)
    Xk_new = knockoffs.sample(X_new)

    verify_exchangeability(X_new, Xk_new, tolerance=1e-3)
def test_haplotypes_fastphase():
    """
    Test whether haplotype knockoffs with HMM fitted by fastPHASE are accurate.

    Unlike the exact test, the knockoffs are sampled for the training data
    itself, so the fitted model is only an approximation of the data
    distribution and a looser tolerance (1e-1) is used.
    """
    import os  # local import: only needed to release the mkstemp descriptors

    p = 10
    K = 3
    M = 2
    n = 1000

    # Sample the data
    pInit, Q, pEmit = generate_HMM(p, K, M)
    modelX = models.HMM(pInit, Q, pEmit)
    X = modelX.sample(n)

    # mkstemp() returns an open descriptor plus the path; only the path is
    # needed, so close the descriptor immediately to avoid leaking it.
    inp_fd, Xfp_file = tempfile.mkstemp()
    os.close(inp_fd)
    fp.writeXtoInp(X, Xfp_file, phased=True)

    fastphase = "fastphase"  # Name of fastPhase executable
    out_fd, out_path = tempfile.mkstemp()
    os.close(out_fd)
    fp.runFastPhase(Xfp_file, out_path, fastphase=fastphase, phased=True, K=5, numit=25)

    # Files read back from fastPhase, all sharing the out_path prefix
    r_file = out_path + "_rhat.txt"
    alpha_file = out_path + "_alphahat.txt"
    theta_file = out_path + "_thetahat.txt"
    char_file = out_path + "_origchars"

    hmm = fp.loadHMM(r_file, alpha_file, theta_file, char_file, phased=True)
    knockoffs = knockoffHaplotypes(hmm["r"], hmm["alpha"], hmm["theta"], seed=123)
    Xk = knockoffs.sample(X)

    verify_exchangeability(X, Xk, tolerance=1e-1)
def test_haplotypes_hmm():
    """
    Test whether specialized haplotype knockoff algorithm agrees with special case.

    The same fastPHASE fit is loaded twice: once as (r, alpha, theta) for
    knockoffHaplotypes and once as (pInit, Q, pEmit) for the generic
    knockoffHMM. With identical groups and seed the two generators are
    required to return exactly the same knockoffs.
    """
    import os  # local import: only needed to release the mkstemp descriptors

    p = 10
    K = 5
    M = 2
    n_train = 1000

    # Sample the training data
    pInit, Q, pEmit = generate_HMM(p, K, M)
    modelX = models.HMM(pInit, Q, pEmit)
    X = modelX.sample(n_train)

    # mkstemp() returns an open descriptor plus the path; only the path is
    # needed, so close the descriptor immediately to avoid leaking it.
    inp_fd, Xfp_file = tempfile.mkstemp()
    os.close(inp_fd)
    fp.writeXtoInp(X, Xfp_file, phased=True)

    fastphase = "fastphase"  # Name of fastPhase executable
    out_fd, out_path = tempfile.mkstemp()
    os.close(out_fd)
    fp.runFastPhase(Xfp_file, out_path, fastphase=fastphase, phased=True, K=5, numit=25)

    # Files read back from fastPhase, all sharing the out_path prefix
    r_file = out_path + "_rhat.txt"
    alpha_file = out_path + "_alphahat.txt"
    theta_file = out_path + "_thetahat.txt"
    char_file = out_path + "_origchars"

    # Consecutive variables are put in groups of three: 0,0,0,1,1,1,2,2,2,3
    groups = np.repeat(np.arange(p), 3)[:p]

    # NOTE(review): the compact parameters are loaded without phased=True,
    # unlike in test_haplotypes_fastphase -- confirm this is intended.
    hmm_compact = fp.loadHMM(r_file, alpha_file, theta_file, char_file)
    hmm = fp.loadHMM(r_file, alpha_file, theta_file, char_file, compact=False, phased=True)

    knockoffs = knockoffHMM(hmm["pInit"], hmm["Q"], hmm["pEmit"], groups=groups, seed=123)
    knockoffs_hap = knockoffHaplotypes(hmm_compact["r"], hmm_compact["alpha"], hmm_compact["theta"],
                                       groups=groups, seed=123)

    Xk = knockoffs.sample(X)
    Xk_compact = knockoffs_hap.sample(X)

    assert np.array_equal(Xk, Xk_compact), "Knockoffs with trivial groups do not match"
def _estimate(self, Xfp_file='./X.inp', path_to_fp="/home/roquero/Software/fastPHASE",
              out_path="./example"):
    """
    Fit an HMM to self.X with fastPhase and build the knockoff generator.

    Writes self.X to ``Xfp_file``, runs fastPhase on it, loads the fitted
    parameters and stores them in ``self.hmm``; the resulting generator is
    stored in ``self.knockoffHMM``.

    Parameters
    ----------
    Xfp_file : str
        Path of the fastPhase input file that will be written.
    path_to_fp : str
        Path to the fastPhase executable. The default is the location that
        used to be hard-coded here (an absolute, machine-specific path);
        pass the path for your own installation.
    out_path : str
        Prefix of the output files produced by fastPhase.
    """
    fp.writeX(self.X, Xfp_file)

    fp.runFastPhase(path_to_fp, Xfp_file, out_path, K=self.hidden_states, numit=self.numit)

    # Files read back from fastPhase, all sharing the out_path prefix
    r_file = out_path + "_rhat.txt"
    alpha_file = out_path + "_alphahat.txt"
    theta_file = out_path + "_thetahat.txt"

    # NOTE(review): the first row of X is passed to loadFit as its last
    # argument; its role is not visible from here -- confirm against fp.loadFit.
    self.hmm = fp.loadFit(r_file, theta_file, alpha_file, self.X[0, :])
    self.knockoffHMM = knockoffHMM(self.hmm["pInit"], self.hmm["Q"], self.hmm["pEmit"])
def make_knockoff(chromosome=None, grouped_by_chromosome=None, df_SNP=None,
                  df_geno_experiment=None, df_geno_ensembl=None,
                  SNP_to_wild_type=None, cache_dir=None, path_to_fp=None,
                  em_iterations=25, random_seed=123, num_hidden_states=12):
    """
    Generate knockoff genotypes for the SNPs of one chromosome.

    An HMM is fitted with fastPhase on the ensembl genotypes (or reused from
    ``cache_dir`` when all of its output files are already present) and then
    used to sample knockoffs for the experiment genotypes.

    Parameters
    ----------
    chromosome : int
        Chromosome to process; used as a key into ``grouped_by_chromosome.groups``.
    grouped_by_chromosome : object
        Grouping whose ``.groups[chromosome]`` gives the row positions of this
        chromosome's SNPs in ``df_SNP``.
    df_SNP : DataFrame
        Table with (at least) the columns 'SNP' and 'chromosome_position'.
    df_geno_experiment, df_geno_ensembl : DataFrame
        Genotypes, one column per SNP name, for the experiment population and
        for the population the HMM is fitted on.
    SNP_to_wild_type : mapping
        Maps each SNP name to its wild type.
    cache_dir : str
        Directory holding the fastPhase input and output files.
    path_to_fp : str
        Path to the fastPhase executable (only used when the cache is incomplete).
    em_iterations : int
        Number of EM iterations passed to fastPhase.
    random_seed : int
        Seed passed to the knockoff generator.
    num_hidden_states : int
        Number of hidden states (``K``) passed to fastPhase; defaults to the
        previously hard-coded value of 12.

    Returns
    -------
    tuple
        ``(X_knockoffs, X_experiment, SNPs_on_chromosome)``.
    """
    assert chromosome is not None
    assert grouped_by_chromosome is not None
    assert df_SNP is not None

    logger.debug("################")
    logger.debug("Chromosome %2d #" % chromosome)
    logger.debug("################")

    num_experiment_people = len(df_geno_experiment)
    num_ensembl_people = len(df_geno_ensembl)

    # SNPs of this chromosome, ordered by their position on it
    indices = grouped_by_chromosome.groups[chromosome]
    df_SNP_chromo = df_SNP.iloc[indices].sort_values('chromosome_position')
    SNPs_on_chromosome = df_SNP_chromo['SNP'].values

    # Fill both genotype matrices column by column (one column per SNP)
    X_experiment = np.empty((num_experiment_people, len(SNPs_on_chromosome)))
    X_ensembl = np.empty((num_ensembl_people, len(SNPs_on_chromosome)))
    for X, df in [(X_experiment, df_geno_experiment), (X_ensembl, df_geno_ensembl)]:
        for j, SNP in enumerate(SNPs_on_chromosome):
            X[:, j] = utils.genotype_to_nonwild_type_count(
                df[SNP].values, SNP_to_wild_type[SNP])

    # Build every cache path with os.path.join so that the existence check
    # below looks at exactly the files that are loaded afterwards.
    out_path = os.path.join(cache_dir, 'chrom_%d' % chromosome)

    # If all relevant files are found in cache, skip EM recomputation;
    # otherwise, redo the whole thing.
    target_file_suffix_list = [
        'alphahat.txt', 'finallikelihoods', 'origchars', 'rhat.txt', 'thetahat.txt']
    already_in_cache = all(
        os.path.exists(os.path.join(cache_dir, 'chrom_%d_%s' % (chromosome, suffix)))
        for suffix in target_file_suffix_list)

    if already_in_cache:
        logger.debug("Found chrom %d HMM in cache" % chromosome)
    else:
        # Write array to file
        Xfp_file = os.path.join(cache_dir, 'X_%d.inp' % chromosome)
        fp.writeX(X_ensembl, Xfp_file)

        # Run fastPhase on data (which runs EM)
        fp.runFastPhase(path_to_fp, Xfp_file, out_path,
                        K=num_hidden_states, numit=em_iterations)

    # Read in fastPhase results (i.e., HMM parameters) from file:
    r_file = out_path + "_rhat.txt"
    alpha_file = out_path + "_alphahat.txt"
    theta_file = out_path + "_thetahat.txt"
    # NOTE(review): the role of X_ensembl[0, :] as the last loadFit argument is
    # not visible from here -- confirm against fp.loadFit.
    hmm = fp.loadFit(r_file, theta_file, alpha_file, X_ensembl[0, :])

    # Actually produce the knockoffs
    knockoffs = knockoffHMM(hmm["pInit"], hmm["Q"], hmm["pEmit"], seed=random_seed)
    X_knockoffs = knockoffs.sample(X_experiment)

    return X_knockoffs, X_experiment, SNPs_on_chromosome