def test_invalid_data(self):
    """Loading malformed covariance files must raise InvalidInputFormat.

    Each input file is paired with a keyword that has to appear in the
    lower-cased ``msg`` of the raised exception.
    """
    cases = (
        ("tests/_td/cov/cov.duplicate.txt.gz", "duplicate"),
        ("tests/_td/cov/cov.uncontiguous.txt.gz", "contiguous"),
    )
    for path, keyword in cases:
        with self.assertRaises(Exceptions.InvalidInputFormat) as raised:
            MatrixManager.load_matrix_manager(path)
        # Checked after the `with` block so the captured exception is available.
        self.assertTrue(keyword in raised.exception.msg.lower())
def test_invalid_data(self):
    """Check that broken covariance inputs are rejected with a telling message."""

    def expect_rejection(path, keyword):
        # The load has to fail with InvalidInputFormat, and the lower-cased
        # exception `msg` has to mention `keyword`.
        with self.assertRaises(Exceptions.InvalidInputFormat) as failure:
            MatrixManager.load_matrix_manager(path)
        message = failure.exception.msg.lower()
        self.assertTrue(keyword in message)

    expect_rejection("tests/_td/cov/cov.duplicate.txt.gz", "duplicate")
    expect_rejection("tests/_td/cov/cov.uncontiguous.txt.gz", "contiguous")
def test_from_load(self):
    """Query a MatrixManager loaded from ``tests/_td/cov/cov.txt.gz``.

    Covers plain lookups, the id count reported by ``n_snps``, a whitelisted
    lookup, and the InvalidArguments error raised for a rejected whitelist.
    """
    manager = MatrixManager.load_matrix_manager("tests/_td/cov/cov.txt.gz")

    # Plain lookup plus id count for the first gene.
    first_gene = "ENSG00000239789.1"
    ids, matrix = manager.get(first_gene)
    self.assertEqual(ids, cov_data.SNPS_ENSG00000239789_1)
    numpy.testing.assert_array_almost_equal(matrix, cov_data.COV_ENSG00000239789_1)
    count = manager.n_snps(first_gene)
    self.assertEqual(count, len(cov_data.SNPS_ENSG00000239789_1))

    # A whitelist the manager rejects for this gene.
    whitelisted_gene = "ENSG00000183742.8"
    with self.assertRaises(Exceptions.InvalidArguments) as failure:
        manager.get(whitelisted_gene, ["rs7806506", "rs12718973"])
    # NOTE(review): the original author marked this check with "#?" --
    # confirm that InvalidArguments really exposes a `message` attribute.
    self.assertTrue("whitelist" in failure.exception.message)

    # A whitelist that is accepted; compared against the "_w" expectations.
    accepted = ["rs3094989", "rs7806506", "rs12536095", "rs10226814"]
    ids, matrix = manager.get(whitelisted_gene, accepted)
    self.assertEqual(ids, cov_data.SNPS_ENSG00000183742_8_w)
    numpy.testing.assert_array_almost_equal(matrix, cov_data.COV_ENSG00000183742_8_w)

    # Plain lookup plus id count for another gene.
    last_gene = "ENSG00000004766.11"
    ids, matrix = manager.get(last_gene)
    self.assertEqual(ids, cov_data.SNPS_ENSG00000004766_11)
    numpy.testing.assert_array_almost_equal(matrix, cov_data.COV_ENSG00000004766_11)
    count = manager.n_snps(last_gene)
    # The count is compared with the length of the expected matrix here.
    self.assertEqual(count, len(cov_data.COV_ENSG00000004766_11))
def test_from_load(self):
    """Query a MatrixManager loaded from ``tests/_td/cov/cov.txt.gz``.

    Covers plain lookups, the id count reported by ``n_ids``, a whitelisted
    lookup, and the InvalidArguments error raised for a rejected whitelist.
    """
    manager = MatrixManager.load_matrix_manager("tests/_td/cov/cov.txt.gz")

    # Plain lookup plus id count for the first gene.
    first_gene = "ENSG00000239789.1"
    ids, matrix = manager.get(first_gene)
    self.assertEqual(ids, cov_data.SNPS_ENSG00000239789_1)
    numpy.testing.assert_array_almost_equal(matrix, cov_data.COV_ENSG00000239789_1)
    count = manager.n_ids(first_gene)
    self.assertEqual(count, len(cov_data.SNPS_ENSG00000239789_1))

    # A whitelist the manager rejects for this gene.
    whitelisted_gene = "ENSG00000183742.8"
    with self.assertRaises(Exceptions.InvalidArguments) as failure:
        manager.get(whitelisted_gene, ["rs7806506", "rs12718973"])
    # NOTE(review): the original author marked this check with "#?" --
    # confirm that InvalidArguments really exposes a `message` attribute.
    self.assertTrue("whitelist" in failure.exception.message)

    # A whitelist that is accepted; compared against the "_w" expectations.
    accepted = ["rs3094989", "rs7806506", "rs12536095", "rs10226814"]
    ids, matrix = manager.get(whitelisted_gene, accepted)
    self.assertEqual(ids, cov_data.SNPS_ENSG00000183742_8_w)
    numpy.testing.assert_array_almost_equal(matrix, cov_data.COV_ENSG00000183742_8_w)

    # Plain lookup plus id count for another gene.
    last_gene = "ENSG00000004766.11"
    ids, matrix = manager.get(last_gene)
    self.assertEqual(ids, cov_data.SNPS_ENSG00000004766_11)
    numpy.testing.assert_array_almost_equal(matrix, cov_data.COV_ENSG00000004766_11)
    count = manager.n_ids(last_gene)
    # The count is compared with the length of the expected matrix here.
    self.assertEqual(count, len(cov_data.COV_ENSG00000004766_11))
def _context():
    """Assemble a context from the sample GWAS, prediction model and covariance.

    The covariance side is a MatrixManager built from the sample covariance
    ``s_1`` data frame together with the module-level ``D``.
    """
    gwas_data = _gwas()
    prediction_model = _prediction_model()
    sample_covariance = SampleData.sample_covariance_s_1()
    covariance_frame = SampleData.dataframe_from_covariance(sample_covariance)
    covariance_manager = MatrixManager.MatrixManager(covariance_frame, D)
    return Utilities._build_context(prediction_model, covariance_manager, gwas_data)
def test_flatten(self):
    """Check ``_flatten_matrix_data`` on a 3-id matrix and on a single value."""
    # Named matrix: the expected rows list each id pair once, diagonal included.
    entry = ("test", cov_data.SNPS_ENSG00000183742_8_w, cov_data.COV_ENSG00000183742_8_w)
    flattened = MatrixManager._flatten_matrix_data([entry])
    expected_rows = [
        ('test', 'rs7806506', 'rs7806506', 0.28428631),
        ('test', 'rs7806506', 'rs12536095', -0.01636001),
        ('test', 'rs7806506', 'rs10226814', -0.00157224),
        ('test', 'rs12536095', 'rs12536095', 0.35760734),
        ('test', 'rs12536095', 'rs10226814', 0.00815426),
        ('test', 'rs10226814', 'rs10226814', 0.44923289),
    ]
    numpy.testing.assert_array_equal(flattened, expected_rows)

    # Covariance of a single variable, labelled with the plain string "b".
    samples = [0, 1, 3]
    single = numpy.cov([samples])
    flattened = MatrixManager._flatten_matrix_data([("a", "b", single)])
    expected_rows = [('a', 'b', 'b', 2.33333333333333)]
    numpy.testing.assert_array_equal(flattened, expected_rows)
def test_flatten(self):
    """Flattening matrix data must yield the expected (name, id, id, value) rows."""

    def flatten_one(tag, ids, values):
        # Wrap a single (name, labels, matrix) triple the way the API expects.
        return MatrixManager._flatten_matrix_data([(tag, ids, values)])

    # Three-id matrix taken from the shared covariance test data.
    result = flatten_one(
        "test",
        cov_data.SNPS_ENSG00000183742_8_w,
        cov_data.COV_ENSG00000183742_8_w,
    )
    wanted = [
        ('test', 'rs7806506', 'rs7806506', 0.28428631),
        ('test', 'rs7806506', 'rs12536095', -0.01636001),
        ('test', 'rs7806506', 'rs10226814', -0.00157224),
        ('test', 'rs12536095', 'rs12536095', 0.35760734),
        ('test', 'rs12536095', 'rs10226814', 0.00815426),
        ('test', 'rs10226814', 'rs10226814', 0.44923289),
    ]
    numpy.testing.assert_array_equal(result, wanted)

    # Single-variable covariance with a plain string as label.
    observations = [0, 1, 3]
    result = flatten_one("a", "b", numpy.cov([observations]))
    wanted = [('a', 'b', 'b', 2.33333333333333)]
    numpy.testing.assert_array_equal(result, wanted)
def run(args):
    """Compute per-gene prediction covariances and save them as one table.

    Does nothing (apart from logging) when ``args.snp_covariance_output``
    already exists. Otherwise the models are loaded from
    ``args.models_folder`` (filtered by ``args.models_pattern``), the genotype
    is walked chromosome by chromosome, and the flattened covariance of every
    gene in each chromosome is appended to the output file.

    Args:
        args: object providing ``snp_covariance_output``, ``models_folder``
            and ``models_pattern``, plus whatever
            ``GenotypeUtilities.genotype_by_chromosome_from_args`` reads.
    """
    if os.path.exists(args.snp_covariance_output):
        logging.info("%s already exists, you have to move it or delete it if you want it done again", args.snp_covariance_output)
        return

    start = timer()

    logging.info("Loading models...")
    model_manager = PredictionModel.load_model_manager(args.models_folder, name_pattern=args.models_pattern)
    all_snps = model_manager.get_rsids()

    logging.info("processing genotype")
    # Header and write mode depend on iteration order, not on the chromosome
    # number: the header is written exactly once and nothing is overwritten,
    # even when the first chromosome yielded is not the integer 1.
    first_chunk = True
    for chromosome, metadata, dosage in GenotypeUtilities.genotype_by_chromosome_from_args(args, all_snps):
        logging.log(9, "Processing chromosome %s", str(chromosome))
        progress_message = "%d %% of genes processed so far in chromosome " + str(chromosome)

        context = GenotypeAnalysis.GenotypeAnalysisContext(metadata, dosage, model_manager)
        genes = context.get_genes()
        reporter = Utilities.PercentReporter(9, len(genes))
        reporter.update(0, progress_message)

        # Collect the per-gene frames and concatenate once per chromosome;
        # concatenating inside the loop re-copies all earlier rows each time.
        frames = []
        for i, gene in enumerate(genes):
            logging.log(6, "%d/%d:%s", i + 1, len(genes), gene)
            cov_data = GenotypeAnalysis.get_prediction_covariance(context, gene)
            cov_data = MatrixManager._flatten_matrix_data([cov_data])
            cov_data = Utilities.to_dataframe(cov_data, GenotypeAnalysis.COVARIANCE_COLUMNS, to_numeric="ignore", fill_na="NA")
            frames.append(cov_data)
            reporter.update(i, progress_message)
        reporter.update(len(genes), progress_message)

        # pandas.concat rejects an empty list, so fall back to an empty frame.
        covariance_results = pandas.concat(frames) if frames else pandas.DataFrame()

        logging.log(9, "writing chromosome results")
        Utilities.save_dataframe(covariance_results, args.snp_covariance_output, mode="w" if first_chunk else "a", header=first_chunk)
        first_chunk = False

    end = timer()
    logging.info("Ran covariance builder in %s seconds", str(end - start))
def test_from_data(self):
    """Query a MatrixManager built directly from the sample covariance frame."""
    frame = SampleData.dataframe_from_covariance(SampleData.sample_covariance_s_1())
    manager = MatrixManager.MatrixManager(frame)

    # Plain lookups for every sample key.
    expectations = (
        ("A", cov_data.SNPS_A, cov_data.COV_A),
        ("B", cov_data.SNPS_B, cov_data.COV_B),
        ("C", cov_data.SNPS_C, cov_data.COV_C),
    )
    for key, expected_ids, expected_matrix in expectations:
        ids, matrix = manager.get(key)
        self.assertEqual(ids, expected_ids)
        numpy.testing.assert_array_almost_equal(matrix, expected_matrix)

    # This whitelist must give the same result for "C" as the plain lookup.
    ids, matrix = manager.get("C", ['rs100', 'rs101', 'rs102'])
    self.assertEqual(ids, cov_data.SNPS_C)
    numpy.testing.assert_array_almost_equal(matrix, cov_data.COV_C)

    # A whitelist the manager rejects must raise and mention "whitelist".
    with self.assertRaises(Exceptions.InvalidArguments) as failure:
        manager.get("C", ["rs100", "rs12718973"])
    # NOTE(review): relies on InvalidArguments exposing `message` -- confirm.
    self.assertTrue("whitelist" in failure.exception.message)
def run(args):
    """Compute per-gene prediction covariances and stream them to a gzip file.

    Does nothing (apart from logging) when ``args.snp_covariance_output``
    already exists. Otherwise the models are loaded, the genotype is walked
    chromosome by chromosome, and one tab-separated line per flattened
    covariance entry is written under a ``GENE RSID1 RSID2 VALUE`` header.

    Args:
        args: object providing ``snp_covariance_output``, ``models_folder``,
            ``models_pattern`` and ``models_filter``, plus whatever
            ``GenotypeUtilities.genotype_by_chromosome_from_args`` reads.
    """
    if os.path.exists(args.snp_covariance_output):
        logging.info("%s already exists, you have to move it or delete it if you want it done again", args.snp_covariance_output)
        return

    start = timer()

    logging.info("Loading models...")
    model_manager = PredictionModel.load_model_manager(args.models_folder, name_pattern=args.models_pattern, name_filter=args.models_filter)
    all_snps = model_manager.get_rsids()

    Utilities.ensure_requisite_folders(args.snp_covariance_output)
    # gzip.open(..., "w") is a binary stream, so text is encoded before it is
    # written; passing str directly raises TypeError on Python 3.
    with gzip.open(args.snp_covariance_output, "w") as o:
        o.write("GENE\tRSID1\tRSID2\tVALUE\n".encode("utf-8"))
        logging.info("processing genotype")

        for chromosome, metadata, dosage in GenotypeUtilities.genotype_by_chromosome_from_args(args, all_snps):
            logging.log(9, "Processing chromosome %s", str(chromosome))
            progress_message = "%d %% of genes processed so far in chromosome " + str(chromosome)

            context = GenotypeAnalysis.GenotypeAnalysisContext(metadata, dosage, model_manager)
            genes = context.get_genes()
            reporter = Utilities.PercentReporter(9, len(genes))
            reporter.update(0, progress_message)

            for i, gene in enumerate(genes):
                logging.log(6, "%d/%d:%s", i + 1, len(genes), gene)
                cov_data = GenotypeAnalysis.get_prediction_covariance(context, gene)
                cov_data = MatrixManager._flatten_matrix_data([cov_data])
                for e in cov_data:
                    # The first four elements of each entry fill the four columns.
                    line = "{}\t{}\t{}\t{}\n".format(e[0], e[1], e[2], e[3])
                    o.write(line.encode("utf-8"))
                reporter.update(i, progress_message)
            reporter.update(len(genes), progress_message)

    end = timer()
    logging.info("Ran covariance builder in %s seconds", str(end - start))