def test_load(self):
    """Check ModelDB.load_extra() and ModelDB.load_weights() against the sample data.

    The sample rows are transposed with zip(*rows) so that each field constant
    (WDBEQF.* / WDBQF.*) can be used as a positional index into both the loaded
    result and the expected data.
    """
    t = PredictionModel.ModelDB("tests/_td/dbs/test_1.db")

    extra = t.load_extra()
    self.assertEqual(len(extra), 6)
    # zip() is a lazy, non-subscriptable iterator on Python 3; materialize it so
    # the columns can be indexed (on Python 2 this is a harmless copy).
    e_e = list(zip(*(SampleData.sample_extra_2())))
    extra_fields = PredictionModel.WDBEQF
    self.assertEqual(extra[extra_fields.GENE], e_e[extra_fields.GENE])
    self.assertEqual(extra[extra_fields.GENE_NAME], e_e[extra_fields.GENE_NAME])
    self.assertEqual(extra[extra_fields.N_SNP_IN_MODEL], e_e[extra_fields.N_SNP_IN_MODEL])
    self.assertEqual(extra[extra_fields.PRED_PERF_R2], e_e[extra_fields.PRED_PERF_R2])
    self.assertEqual(extra[extra_fields.PRED_PERF_PVAL], e_e[extra_fields.PRED_PERF_PVAL])
    self.assertEqual(extra[extra_fields.PRED_PERF_QVAL], e_e[extra_fields.PRED_PERF_QVAL])

    weights = t.load_weights()
    self.assertEqual(len(weights), 5)
    # Same materialization as above, for the weights table.
    e_w = list(zip(*(SampleData.sample_weights_2())))
    weight_fields = PredictionModel.WDBQF
    self.assertEqual(weights[weight_fields.RSID], e_w[weight_fields.RSID])
    self.assertEqual(weights[weight_fields.GENE], e_w[weight_fields.GENE])
    self.assertEqual(weights[weight_fields.WEIGHT], e_w[weight_fields.WEIGHT])
    self.assertEqual(weights[weight_fields.REF_ALLELE], e_w[weight_fields.REF_ALLELE])
    self.assertEqual(weights[weight_fields.EFF_ALLELE], e_w[weight_fields.EFF_ALLELE])
def test_model_manager(self):
    """Cross-check the model manager loaded from tests/_td/dbs_2 against its weights.

    The reference weights come from get_weights_in_models() on the same folder.
    """
    folder = "tests/_td/dbs_2"
    model_manager = PredictionModel.load_model_manager(folder)
    weights = get_weights_in_models(folder)

    # Every distinct (model, gene) pair must agree with the manager's contents.
    model_gene_pairs = weights[["model", "gene"]].drop_duplicates()
    for pair in model_gene_pairs.itertuples():
        compare_model_manager_models_to_weights(
            self, model_manager, weights, pair.model, pair.gene)

    # Global key sets.
    expected_genes = {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K'}
    self.assertEqual(model_manager.get_genes(), expected_genes)
    self.assertEqual(model_manager.get_rsids(), set(weights.rsid))
    expected_labels = {"model_sim_1", "model_sim_2"}
    self.assertEqual(model_manager.get_model_labels(), expected_labels)

    # Each rsid must map to exactly the genes that use it.
    for rsid in set(weights.rsid):
        genes_for_rsid = set(weights[weights.rsid == rsid].gene)
        self.assertEqual(model_manager.snp_keys[rsid], genes_for_rsid)

    # Per-gene weights, compared one model label at a time.
    allele_renames = {
        "effect_allele": "eff_allele",
        "non_effect_allele": "ref_allele",
    }
    value_columns = ["weight", "eff_allele", "ref_allele"]
    for gene in set(weights.gene):
        gene_rows = weights[weights.gene == gene]
        expected = gene_rows.set_index(["model", "rsid"])[value_columns]
        loaded = model_manager.get_models(gene).rename(columns=allele_renames)
        for label in set(loaded.index.get_level_values(0)):
            _compare(self, loaded.loc[label], expected.loc[label])

    # Per-gene model labels.
    for gene in set(weights.gene):
        labels_for_gene = set(weights[weights.gene == gene].model)
        self.assertEqual(model_manager.get_model_labels(gene), labels_for_gene)
def run(args):
    """Build beta tables from one or more GWAS files.

    Input selection:
        * If args.gwas_folder is set, the names returned by
          Utilities.contentsWithRegexpFromFolder (optionally filtered with
          args.gwas_file_pattern) are processed in sorted order.
        * Otherwise only args.gwas_file is processed.

    Output:
        * If args.output_folder is set, each result of build_betas is written
          there as a gzip-compressed, tab-separated file. A ".gz" suffix is
          appended when the path does not already contain one, and outputs
          that already exist are skipped. Nothing is returned.
        * Otherwise the per-file results are concatenated and returned.

    Raises:
        Exceptions.ReportableException: if the GWAS folder yields no names.
    """
    start = timer()
    validate(args)

    if args.gwas_folder:
        regexp = re.compile(args.gwas_file_pattern) if args.gwas_file_pattern else None
        names = Utilities.contentsWithRegexpFromFolder(args.gwas_folder, regexp)
        # cosmetic, because different filesystems/OS yield folders in different order
        names.sort()
        if not names:
            msg = "No GWAS files found on %s with pattern %s" % (
                args.gwas_folder,
                args.gwas_file_pattern,
            )
            raise Exceptions.ReportableException(msg)
    else:
        names = [args.gwas_file]

    gwas_format = GWASUtilities.gwas_format_from_args(args)
    GWAS.validate_format_basic(gwas_format)
    GWAS.validate_format_for_strict(gwas_format)

    # The prediction model is optional; build_betas receives None without one.
    model = PredictionModel.load_model(
        args.model_db_path, args.model_db_snp_key) if args.model_db_path else None

    if args.output_folder:
        if not os.path.exists(args.output_folder):
            os.makedirs(args.output_folder)

        for name in names:
            output_path = os.path.join(args.output_folder, name)
            if ".gz" not in output_path:
                output_path += ".gz"
            if os.path.exists(output_path):
                logging.info(
                    "%s already exists, delete it if you want it to be done again",
                    output_path)
                continue

            b = build_betas(args, model, gwas_format, name)
            # The output path always carries ".gz" at this point, so always
            # compress. Choosing compression from the input name (as before)
            # wrote plain text into "*.gz" files for uncompressed inputs.
            b.to_csv(output_path, sep="\t", index=False, compression="gzip")

        end = timer()
        logging.info("Successfully ran GWAS input processing in %s seconds", str(end - start))
    else:
        # names is never empty here (either validated above or a single file),
        # so a single concat over the collected frames is safe.
        frames = [build_betas(args, model, gwas_format, name) for name in names]
        r = pandas.concat(frames)
        end = timer()
        logging.info("Successfully parsed input gwas in %s seconds", str(end - start))
        return r
def test_optimized_model_manager_gtex(self):
    """Check the _ModelManager built from tests/_td/dbs_3 with versioned gene ids.

    The reference weights' model column is reduced to the group captured by
    the "TW_(.*)_0.5" pattern before the comparison helper runs.
    """
    folder = "tests/_td/dbs_3"
    model_manager = PredictionModel.load_model_manager(
        folder, Klass=PredictionModel._ModelManager)

    weights = get_weights_in_models(folder)
    weights.model = weights.model.str.extract("TW_(.*)_0.5", expand=False)

    expected_genes = {
        "ENSG00000107937.14",
        "ENSG00000107959.11",
        "ENSG00000234745.5",
    }
    _assert_optimized_manager(self, model_manager, weights, expected_genes)
def test_snps_in_db(self):
    """snps_in_db() must return exactly the rsids listed for test_2.db."""
    db_path = "tests/_td/dbs/test_2.db"
    expected_rsids = {
        "rs245915",
        "rs245913",
        "rs245909",
        "rs245906",
        "rs10486599",
        "rs144012121",
        "rs117887801",
        "rs542000",
        "rs544632",
        "rs498475",
        "rs849327",
        "rs849336",
        "rs849335",
        "rs1513272",
        "rs849135",
        "rs849134",
        "rs860262",
        "rs849133",
        "rs1635852",
        "rs864745",
        "rs112751321",
        "rs144273091",
        "rs117462481",
        "rs149305679",
        "rs643036",
        "rs1937888",
        "rs17155745",
        "rs62626328",
    }
    found_rsids = PredictionModel.snps_in_db(db_path)
    self.assertEqual(found_rsids, expected_rsids)
def test_optimized_model_manager_gtex_trimmed(self):
    """Check the _ModelManager built from tests/_td/dbs_3 with trim_ensemble_version=True.

    The reference weights are adjusted to match: the model column keeps only
    the group captured by "TW_(.*)_0.5", and the gene column keeps only the
    text before the first ".".
    """
    folder = "tests/_td/dbs_3"
    model_manager = PredictionModel.load_model_manager(
        folder,
        trim_ensemble_version=True,
        Klass=PredictionModel._ModelManager)

    weights = get_weights_in_models(folder)
    weights.model = weights.model.str.extract("TW_(.*)_0.5", expand=False)
    gene_parts = weights.gene.str.split(".")
    weights.gene = gene_parts.str.get(0)

    expected_genes = {"ENSG00000107937", "ENSG00000107959", "ENSG00000234745"}
    _assert_optimized_manager(self, model_manager, weights, expected_genes)
def test_load_model(self):
    """load_model() on test_1.db must reproduce the sample extra and weights columns."""
    snp_model = PredictionModel.load_model("tests/_td/dbs/test_1.db")

    # Extra table: compare column by column against the sample frame.
    expected_extra = SampleData.dataframe_from_extra(SampleData.sample_extra_2())
    extra_keys = PredictionModel.WDBEQF
    for column in (
            extra_keys.K_GENE,
            extra_keys.K_GENE_NAME,
            extra_keys.K_N_SNP_IN_MODEL,
            extra_keys.K_PRED_PERF_R2,
            extra_keys.K_PRED_PERF_PVAL,
            extra_keys.K_PRED_PERF_QVAL):
        numpy.testing.assert_array_equal(
            snp_model.extra[column], expected_extra[column])

    # Weights table: same column-wise comparison.
    expected_weights = SampleData.dataframe_from_weights(SampleData.sample_weights_2())
    weight_keys = PredictionModel.WDBQF
    for column in (
            weight_keys.K_RSID,
            weight_keys.K_GENE,
            weight_keys.K_WEIGHT,
            weight_keys.K_NON_EFFECT_ALLELE,
            weight_keys.K_EFFECT_ALLELE):
        numpy.testing.assert_array_equal(
            snp_model.weights[column], expected_weights[column])
def run(args):
    """Process the selected GWAS file(s) into beta tables.

    The inputs are either every name found in args.gwas_folder (filtered by
    args.gwas_file_pattern when given, sorted for a stable order) or the
    single args.gwas_file.

    With args.output_folder set, each build_betas result is saved as a
    gzip-compressed TSV in that folder, skipping files that already exist,
    and the function returns None. Without it, the results for all inputs are
    concatenated with pandas.concat and returned.

    Raises:
        Exceptions.ReportableException: when the folder lookup finds nothing.
    """
    start = timer()
    validate(args)

    if args.gwas_folder:
        regexp = None
        if args.gwas_file_pattern:
            regexp = re.compile(args.gwas_file_pattern)
        names = Utilities.contentsWithRegexpFromFolder(args.gwas_folder, regexp)
        names.sort()  # cosmetic, because different filesystems/OS yield folders in different order
        if not names:
            msg = "No GWAS files found on %s with pattern %s" % (args.gwas_folder, args.gwas_file_pattern,)
            raise Exceptions.ReportableException(msg)
    else:
        names = [args.gwas_file]

    gwas_format = GWASUtilities.gwas_format_from_args(args)
    GWAS.validate_format_basic(gwas_format)
    GWAS.validate_format_for_strict(gwas_format)

    # A model is only loaded when a database path was supplied.
    model = None
    if args.model_db_path:
        model = PredictionModel.load_model(args.model_db_path, args.model_db_snp_key)

    if not args.output_folder:
        # In-memory mode. names holds at least one entry here, so concat never
        # receives an empty list.
        r = pandas.concat([build_betas(args, model, gwas_format, name) for name in names])
        end = timer()
        logging.info("Successfully parsed input gwas in %s seconds", str(end - start))
        return r

    if not os.path.exists(args.output_folder):
        os.makedirs(args.output_folder)

    for name in names:
        output_path = os.path.join(args.output_folder, name)
        if ".gz" not in output_path:
            output_path += ".gz"
        if os.path.exists(output_path):
            logging.info("%s already exists, delete it if you want it to be done again", output_path)
            continue

        b = build_betas(args, model, gwas_format, name)
        # output_path is guaranteed to contain ".gz" by now, so the content
        # must be gzip too. Previously compression followed the *input* name,
        # leaving uncompressed text behind a ".gz" suffix.
        b.to_csv(output_path, sep="\t", index=False, compression="gzip")

    end = timer()
    logging.info("Successfully ran GWAS input processing in %s seconds", str(end - start))
def model_structure(args):
    """Load a prediction model and index its weights by rsid.

    The model's weights/extra tables are optionally restricted, first to a
    sub-batch (when both args.sub_batches and args.sub_batch are given) and
    then to the genes listed in args.only_entries.

    Returns:
        A tuple (m, weights, extra) where m maps each rsid to
        (non_effect_allele, effect_allele, {gene: weight}); the alleles are
        taken from the first row seen for that rsid.
    """
    model = PredictionModel.load_model(args.model_db_path, args.model_db_snp_key)
    weights = model.weights
    extra = model.extra

    slicing_requested = args.sub_batches is not None and args.sub_batch is not None
    if slicing_requested:
        logging.info("slicing models")
        extra = Utilities.sub_batch(extra, args.sub_batches, args.sub_batch)
        in_batch = weights.gene.isin(extra.gene)
        weights = weights[in_batch].reset_index(drop=True)

    selected = args.only_entries
    if selected:
        extra = extra[extra.gene.isin(set(selected))]
        weights = weights[weights.gene.isin(set(selected))]

    m = {}
    for row in weights.itertuples():
        # setdefault keeps the existing entry when the rsid was seen before.
        entry = m.setdefault(
            row.rsid, (row.non_effect_allele, row.effect_allele, {}))
        entry[2][row.gene] = row.weight

    return m, weights, extra
def test_optimized_model_manager(self):
    """Check the _ModelManager built from tests/_td/dbs_2 against the raw weights."""
    folder = "tests/_td/dbs_2"
    model_manager = PredictionModel.load_model_manager(
        folder, Klass=PredictionModel._ModelManager)
    weights = get_weights_in_models(folder)

    # Global key sets.
    expected_genes = {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K'}
    self.assertEqual(model_manager.get_genes(), expected_genes)
    self.assertEqual(model_manager.get_rsids(), set(weights.rsid))
    expected_labels = {"model_sim_1", "model_sim_2"}
    self.assertEqual(model_manager.get_model_labels(), expected_labels)

    # Per-gene model contents.
    for gene in set(weights.gene):
        gene_weights = weights[weights.gene == gene]
        gene_models = model_manager.get_models(gene)
        _compare_o(self, gene_models, gene_weights)

    # Per-gene model labels.
    for gene in set(weights.gene):
        labels_for_gene = set(weights[weights.gene == gene].model)
        self.assertEqual(model_manager.get_model_labels(gene), labels_for_gene)
def test_load_model(self):
    """Verify that the model loaded from test_1.db matches the sample data frames."""
    snp_model = PredictionModel.load_model("tests/_td/dbs/test_1.db")
    same = numpy.testing.assert_array_equal

    # Extra (per-gene) table.
    EQF = PredictionModel.WDBEQF
    e_e = SampleData.dataframe_from_extra(SampleData.sample_extra_2())
    same(snp_model.extra[EQF.K_GENE], e_e[EQF.K_GENE])
    same(snp_model.extra[EQF.K_GENE_NAME], e_e[EQF.K_GENE_NAME])
    same(snp_model.extra[EQF.K_N_SNP_IN_MODEL], e_e[EQF.K_N_SNP_IN_MODEL])
    same(snp_model.extra[EQF.K_PRED_PERF_R2], e_e[EQF.K_PRED_PERF_R2])
    same(snp_model.extra[EQF.K_PRED_PERF_PVAL], e_e[EQF.K_PRED_PERF_PVAL])
    same(snp_model.extra[EQF.K_PRED_PERF_QVAL], e_e[EQF.K_PRED_PERF_QVAL])

    # Weights table.
    QF = PredictionModel.WDBQF
    e_w = SampleData.dataframe_from_weights(SampleData.sample_weights_2())
    same(snp_model.weights[QF.K_RSID], e_w[QF.K_RSID])
    same(snp_model.weights[QF.K_GENE], e_w[QF.K_GENE])
    same(snp_model.weights[QF.K_WEIGHT], e_w[QF.K_WEIGHT])
    same(snp_model.weights[QF.K_NON_EFFECT_ALLELE], e_w[QF.K_NON_EFFECT_ALLELE])
    same(snp_model.weights[QF.K_EFFECT_ALLELE], e_w[QF.K_EFFECT_ALLELE])
def run(args):
    """Compute per-gene SNP covariances and save them chromosome by chromosome.

    Does nothing (beyond logging) when args.snp_covariance_output already
    exists. Otherwise the models are loaded from args.models_folder, the
    genotype is read in per-chromosome chunks, and for each chunk the flattened
    covariance of every gene in the chunk is collected into one data frame and
    saved to args.snp_covariance_output through Utilities.save_dataframe.

    The first chunk is written with mode "w" and a header; every later chunk
    is appended without one.
    """
    if os.path.exists(args.snp_covariance_output):
        logging.info("%s already exists, you have to move it or delete it if you want it done again", args.snp_covariance_output)
        return

    start = timer()

    logging.info("Loading models...")
    model_manager = PredictionModel.load_model_manager(args.models_folder, name_pattern=args.models_pattern)
    all_snps = model_manager.get_rsids()

    logging.info("processing genotype")
    # Header and write mode follow "first chunk written", not
    # "chromosome == 1": the latter never emits a header when chromosome 1 is
    # missing, is not the first one yielded, or is not reported as an int.
    first_chunk = True
    for chromosome, metadata, dosage in GenotypeUtilities.genotype_by_chromosome_from_args(args, all_snps):
        logging.log(9, "Processing chromosome %s", str(chromosome))

        context = GenotypeAnalysis.GenotypeAnalysisContext(metadata, dosage, model_manager)
        genes = context.get_genes()

        progress_message = "%d %% of genes processed so far in chromosome " + str(chromosome)
        reporter = Utilities.PercentReporter(9, len(genes))
        reporter.update(0, progress_message)

        # Collect the per-gene frames and concatenate once, rather than
        # growing a data frame inside the loop.
        frames = []
        for i, gene in enumerate(genes):
            logging.log(6, "%d/%d:%s", i+1, len(genes), gene)
            cov_data = GenotypeAnalysis.get_prediction_covariance(context, gene)
            cov_data = MatrixManager._flatten_matrix_data([cov_data])
            cov_data = Utilities.to_dataframe(cov_data, GenotypeAnalysis.COVARIANCE_COLUMNS, to_numeric="ignore", fill_na="NA")
            frames.append(cov_data)
            reporter.update(i, progress_message)
        reporter.update(len(genes), progress_message)

        # pandas.concat rejects an empty list; a chunk without genes yields an
        # empty frame, as before.
        covariance_results = pandas.concat(frames) if frames else pandas.DataFrame()

        logging.log(9, "writing chromosome results")
        Utilities.save_dataframe(
            covariance_results,
            args.snp_covariance_output,
            mode="w" if first_chunk else "a",
            header=first_chunk)
        first_chunk = False

    end = timer()
    logging.info("Ran covariance builder in %s seconds" % (str(end - start)))
def run(args):
    """Compute per-gene SNP covariances and stream them into a gzip file.

    Does nothing (beyond logging) when args.snp_covariance_output already
    exists. Otherwise the models are loaded from args.models_folder (selected
    by args.models_pattern / args.models_filter), the genotype is read in
    per-chromosome chunks, and one tab-separated line is written for every
    flattened covariance entry of every gene, after a
    "GENE / RSID1 / RSID2 / VALUE" header line.
    """
    if os.path.exists(args.snp_covariance_output):
        logging.info("%s already exists, you have to move it or delete it if you want it done again", args.snp_covariance_output)
        return

    start = timer()

    logging.info("Loading models...")
    model_manager = PredictionModel.load_model_manager(args.models_folder, name_pattern=args.models_pattern, name_filter=args.models_filter)
    all_snps = model_manager.get_rsids()

    Utilities.ensure_requisite_folders(args.snp_covariance_output)
    # gzip.open(..., "w") is a binary stream: on Python 3 it only accepts
    # bytes, so every line is encoded before it is written. Encoding an ASCII
    # str is also valid on Python 2.
    with gzip.open(args.snp_covariance_output, "w") as o:
        o.write("GENE\tRSID1\tRSID2\tVALUE\n".encode("utf-8"))

        logging.info("processing genotype")
        for chromosome, metadata, dosage in GenotypeUtilities.genotype_by_chromosome_from_args(args, all_snps):
            logging.log(9, "Processing chromosome %s", str(chromosome))

            context = GenotypeAnalysis.GenotypeAnalysisContext(metadata, dosage, model_manager)
            genes = context.get_genes()

            progress_message = "%d %% of genes processed so far in chromosome " + str(chromosome)
            reporter = Utilities.PercentReporter(9, len(genes))
            reporter.update(0, progress_message)

            for i, gene in enumerate(genes):
                logging.log(6, "%d/%d:%s", i+1, len(genes), gene)
                cov_data = GenotypeAnalysis.get_prediction_covariance(context, gene)
                cov_data = MatrixManager._flatten_matrix_data([cov_data])
                for e in cov_data:
                    line = "{}\t{}\t{}\t{}\n".format(e[0], e[1], e[2], e[3])
                    o.write(line.encode("utf-8"))
                reporter.update(i, progress_message)

            reporter.update(len(genes), progress_message)

    end = timer()
    logging.info("Ran covariance builder in %s seconds" % (str(end - start)))
def _prediction_model():
    """Build a PredictionModel.Model from the first set of sample extra and weights rows."""
    extra_frame = SampleData.dataframe_from_extra(SampleData.sample_extra_1())
    weights_frame = SampleData.dataframe_from_weights(SampleData.sample_weights_1())
    return PredictionModel.Model(weights_frame, extra_frame)
def test_snps_in_db(self):
    """Compare the rsids reported by snps_in_db() for test_2.db with the known set."""
    actual = PredictionModel.snps_in_db("tests/_td/dbs/test_2.db")
    expected = {
        "rs245915", "rs245913", "rs245909", "rs245906",
        "rs10486599", "rs144012121", "rs117887801", "rs542000",
        "rs544632", "rs498475", "rs849327", "rs849336",
        "rs849335", "rs1513272", "rs849135", "rs849134",
        "rs860262", "rs849133", "rs1635852", "rs864745",
        "rs112751321", "rs144273091", "rs117462481", "rs149305679",
        "rs643036", "rs1937888", "rs17155745", "rs62626328",
    }
    self.assertEqual(actual, expected)