def test_load(self):
        """Check ``ModelDB.load_extra`` and ``load_weights`` on test_1.db.

        Each loaded result is compared, field by field, against the matching
        ``SampleData`` fixture after transposing the fixture with ``zip``.
        """
        t = PredictionModel.ModelDB("tests/_td/dbs/test_1.db")
        extra = t.load_extra()
        self.assertEqual(len(extra), 6)

        # zip() returns a one-shot iterator on Python 3 and cannot be indexed;
        # materialise it so the lookups below work on Python 2 and 3 alike.
        e_e = list(zip(*(SampleData.sample_extra_2())))
        extra_fields = (
            PredictionModel.WDBEQF.GENE,
            PredictionModel.WDBEQF.GENE_NAME,
            PredictionModel.WDBEQF.N_SNP_IN_MODEL,
            PredictionModel.WDBEQF.PRED_PERF_R2,
            PredictionModel.WDBEQF.PRED_PERF_PVAL,
            PredictionModel.WDBEQF.PRED_PERF_QVAL,
        )
        for field in extra_fields:
            self.assertEqual(extra[field], e_e[field])

        weights = t.load_weights()
        self.assertEqual(len(weights), 5)

        # Same Python 3 concern as above: index a list, not a zip iterator.
        e_w = list(zip(*(SampleData.sample_weights_2())))
        weight_fields = (
            PredictionModel.WDBQF.RSID,
            PredictionModel.WDBQF.GENE,
            PredictionModel.WDBQF.WEIGHT,
            PredictionModel.WDBQF.REF_ALLELE,
            PredictionModel.WDBQF.EFF_ALLELE,
        )
        for field in weight_fields:
            self.assertEqual(weights[field], e_w[field])
    def test_model_manager(self):
        """Compare the model manager built from ``tests/_td/dbs_2`` to its weights.

        The genes, rsids, model labels, ``snp_keys`` and per-gene models
        reported by the manager are checked against the frame returned by
        ``get_weights_in_models`` for the same folder.
        """
        manager = PredictionModel.load_model_manager("tests/_td/dbs_2")
        weights = get_weights_in_models("tests/_td/dbs_2")

        # One comparison per distinct (model, gene) pair found in the weights.
        pairs = weights[["model", "gene"]].drop_duplicates()
        for pair in pairs.itertuples():
            compare_model_manager_models_to_weights(
                self, manager, weights, pair.model, pair.gene)

        # Global views exposed by the manager.
        expected_genes = {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K'}
        self.assertEqual(manager.get_genes(), expected_genes)
        self.assertEqual(manager.get_rsids(), set(weights.rsid))
        expected_labels = {"model_sim_1", "model_sim_2"}
        self.assertEqual(manager.get_model_labels(), expected_labels)

        # snp_keys must list, for every rsid, the genes that use it.
        for rsid in set(weights.rsid):
            rows_for_rsid = weights[weights.rsid == rsid]
            self.assertEqual(manager.snp_keys[rsid], set(rows_for_rsid.gene))

        # Per-gene models, with the manager's column names mapped onto the
        # names used by the weights frame.
        column_map = {
            "effect_allele": "eff_allele",
            "non_effect_allele": "ref_allele",
        }
        value_columns = ["weight", "eff_allele", "ref_allele"]
        for gene in set(weights.gene):
            rows_for_gene = weights[weights.gene == gene]
            w = rows_for_gene.set_index(["model", "rsid"])[value_columns]
            m = manager.get_models(gene).rename(columns=column_map)
            labels = set(m.index.get_level_values(0))
            for label in labels:
                _compare(self, m.loc[label], w.loc[label])

        # Model labels available for each individual gene.
        for gene in set(weights.gene):
            rows_for_gene = weights[weights.gene == gene]
            self.assertEqual(manager.get_model_labels(gene),
                             set(rows_for_gene.model))
Beispiel #3
0
def run(args):
    """Build GWAS betas for a single file or for every file in a folder.

    Input names come from ``args.gwas_folder`` (optionally filtered by the
    regular expression in ``args.gwas_file_pattern``) or, when no folder is
    given, from the single ``args.gwas_file``. The prediction model is loaded
    only when ``args.model_db_path`` is set.

    Returns:
        ``None`` when ``args.output_folder`` is set: each result is written
        there as a tab-separated file, and inputs whose output file already
        exists are skipped. Otherwise a ``pandas.DataFrame`` holding the
        concatenated results of all inputs.

    Raises:
        Exceptions.ReportableException: if ``args.gwas_folder`` yields no
            file names.
    """
    start = timer()
    validate(args)

    if args.gwas_folder:
        regexp = re.compile(
            args.gwas_file_pattern) if args.gwas_file_pattern else None
        names = Utilities.contentsWithRegexpFromFolder(args.gwas_folder,
                                                       regexp)
        # Cosmetic: different filesystems/OS yield folders in different order.
        names.sort()

        if len(names) == 0:
            msg = "No GWAS files found on %s with pattern %s" % (
                args.gwas_folder,
                args.gwas_file_pattern,
            )
            raise Exceptions.ReportableException(msg)
    else:
        names = [args.gwas_file]

    gwas_format = GWASUtilities.gwas_format_from_args(args)
    GWAS.validate_format_basic(gwas_format)
    GWAS.validate_format_for_strict(gwas_format)
    model = PredictionModel.load_model(
        args.model_db_path,
        args.model_db_snp_key) if args.model_db_path else None

    if args.output_folder:
        if not os.path.exists(args.output_folder):
            os.makedirs(args.output_folder)

        for name in names:
            output_path = os.path.join(args.output_folder, name)
            if ".gz" not in output_path:
                output_path += ".gz"
            if os.path.exists(output_path):
                logging.info(
                    "%s already exists, delete it if you want it to be done again",
                    output_path)
                continue

            b = build_betas(args, model, gwas_format, name)
            # Compression must follow the file actually written: deriving it
            # from the input name left plain text behind a ".gz" extension
            # whenever the extension had just been appended above.
            c = "gzip" if output_path.endswith(".gz") else None
            b.to_csv(output_path, sep="\t", index=False, compression=c)
        end = timer()
        logging.info("Successfully ran GWAS input processing in %s seconds",
                     str(end - start))
    else:
        # Concatenate once instead of growing a frame seeded with an empty
        # DataFrame, which copies all accumulated rows on every iteration.
        frames = [build_betas(args, model, gwas_format, name)
                  for name in names]
        r = pandas.concat(frames) if frames else pandas.DataFrame()
        end = timer()
        logging.info("Successfully parsed input gwas in %s seconds",
                     str(end - start))

        return r
    def test_optimized_model_manager_gtex(self):
        """Check the ``_ModelManager`` built from ``tests/_td/dbs_3``.

        Model names in the weights are reduced to the group captured by the
        ``TW_(.*)_0.5`` pattern before the comparison.
        """
        folder = "tests/_td/dbs_3"
        manager = PredictionModel.load_model_manager(
            folder, Klass=PredictionModel._ModelManager)

        weights = get_weights_in_models(folder)
        model_names = weights.model.str.extract("TW_(.*)_0.5", expand=False)
        weights.model = model_names

        expected_genes = {
            "ENSG00000107937.14",
            "ENSG00000107959.11",
            "ENSG00000234745.5",
        }
        _assert_optimized_manager(self, manager, weights, expected_genes)
 def test_snps_in_db(self):
     """``snps_in_db`` on test_2.db must equal the expected set of rsids."""
     wanted = {
         "rs245915", "rs245913", "rs245909", "rs245906",
         "rs10486599", "rs144012121", "rs117887801", "rs542000",
         "rs544632", "rs498475", "rs849327", "rs849336",
         "rs849335", "rs1513272", "rs849135", "rs849134",
         "rs860262", "rs849133", "rs1635852", "rs864745",
         "rs112751321", "rs144273091", "rs117462481", "rs149305679",
         "rs643036", "rs1937888", "rs17155745", "rs62626328",
     }
     found = PredictionModel.snps_in_db("tests/_td/dbs/test_2.db")
     self.assertEqual(found, wanted)
    def test_optimized_model_manager_gtex_trimmed(self):
        """Check the ``_ModelManager`` loaded with ``trim_ensemble_version=True``.

        The weights are adjusted to match: model names keep only the group
        captured by ``TW_(.*)_0.5`` and gene ids keep only the text before
        the first ``.``.
        """
        folder = "tests/_td/dbs_3"
        manager = PredictionModel.load_model_manager(
            folder,
            trim_ensemble_version=True,
            Klass=PredictionModel._ModelManager)

        weights = get_weights_in_models(folder)
        model_names = weights.model.str.extract("TW_(.*)_0.5", expand=False)
        weights.model = model_names
        gene_parts = weights.gene.str.split(".")
        weights.gene = gene_parts.str.get(0)

        expected_genes = {
            "ENSG00000107937",
            "ENSG00000107959",
            "ENSG00000234745",
        }
        _assert_optimized_manager(self, manager, weights, expected_genes)
    def test_load_model(self):
        """Compare ``load_model`` on test_1.db with the SampleData frames.

        Every extra and weights column is checked with
        ``numpy.testing.assert_array_equal``.
        """
        snp_model = PredictionModel.load_model("tests/_td/dbs/test_1.db")
        same = numpy.testing.assert_array_equal

        extra_keys = PredictionModel.WDBEQF
        e_e = SampleData.dataframe_from_extra(SampleData.sample_extra_2())
        same(snp_model.extra[extra_keys.K_GENE], e_e[extra_keys.K_GENE])
        same(snp_model.extra[extra_keys.K_GENE_NAME], e_e[extra_keys.K_GENE_NAME])
        same(snp_model.extra[extra_keys.K_N_SNP_IN_MODEL], e_e[extra_keys.K_N_SNP_IN_MODEL])
        same(snp_model.extra[extra_keys.K_PRED_PERF_R2], e_e[extra_keys.K_PRED_PERF_R2])
        same(snp_model.extra[extra_keys.K_PRED_PERF_PVAL], e_e[extra_keys.K_PRED_PERF_PVAL])
        same(snp_model.extra[extra_keys.K_PRED_PERF_QVAL], e_e[extra_keys.K_PRED_PERF_QVAL])

        weight_keys = PredictionModel.WDBQF
        e_w = SampleData.dataframe_from_weights(SampleData.sample_weights_2())
        same(snp_model.weights[weight_keys.K_RSID], e_w[weight_keys.K_RSID])
        same(snp_model.weights[weight_keys.K_GENE], e_w[weight_keys.K_GENE])
        same(snp_model.weights[weight_keys.K_WEIGHT], e_w[weight_keys.K_WEIGHT])
        same(snp_model.weights[weight_keys.K_NON_EFFECT_ALLELE], e_w[weight_keys.K_NON_EFFECT_ALLELE])
        same(snp_model.weights[weight_keys.K_EFFECT_ALLELE], e_w[weight_keys.K_EFFECT_ALLELE])
Beispiel #8
0
def run(args):
    """Build GWAS betas for a single file or for every file in a folder.

    Input names come from ``args.gwas_folder`` (optionally filtered by the
    regular expression in ``args.gwas_file_pattern``) or, when no folder is
    given, from the single ``args.gwas_file``. The prediction model is loaded
    only when ``args.model_db_path`` is set.

    Returns:
        ``None`` when ``args.output_folder`` is set: each result is written
        there as a tab-separated file, and inputs whose output file already
        exists are skipped. Otherwise a ``pandas.DataFrame`` holding the
        concatenated results of all inputs.

    Raises:
        Exceptions.ReportableException: if ``args.gwas_folder`` yields no
            file names.
    """
    start = timer()
    validate(args)

    if args.gwas_folder:
        regexp = re.compile(args.gwas_file_pattern) if args.gwas_file_pattern else None
        names = Utilities.contentsWithRegexpFromFolder(args.gwas_folder, regexp)
        names.sort()  # cosmetic, because different filesystems/OS yield folders in different order

        if len(names) == 0:
            msg = "No GWAS files found on %s with pattern %s" % (args.gwas_folder, args.gwas_file_pattern,)
            raise Exceptions.ReportableException(msg)
    else:
        names = [args.gwas_file]

    gwas_format = GWASUtilities.gwas_format_from_args(args)
    GWAS.validate_format_basic(gwas_format)
    GWAS.validate_format_for_strict(gwas_format)
    model = PredictionModel.load_model(args.model_db_path, args.model_db_snp_key) if args.model_db_path else None

    if args.output_folder:
        if not os.path.exists(args.output_folder):
            os.makedirs(args.output_folder)

        for name in names:
            output_path = os.path.join(args.output_folder, name)
            if ".gz" not in output_path:
                output_path += ".gz"
            if os.path.exists(output_path):
                logging.info("%s already exists, delete it if you want it to be done again", output_path)
                continue

            b = build_betas(args, model, gwas_format, name)
            # Compression must follow the file actually written: deriving it
            # from the input name left plain text behind a ".gz" extension
            # whenever the extension had just been appended above.
            c = "gzip" if output_path.endswith(".gz") else None
            b.to_csv(output_path, sep="\t", index=False, compression=c)
        end = timer()
        logging.info("Successfully ran GWAS input processing in %s seconds", str(end - start))
    else:
        # Concatenate once instead of growing a frame seeded with an empty
        # DataFrame, which copies all accumulated rows on every iteration.
        frames = [build_betas(args, model, gwas_format, name) for name in names]
        r = pandas.concat(frames) if frames else pandas.DataFrame()
        end = timer()
        logging.info("Successfully parsed input gwas in %s seconds", str(end - start))

        return r
Beispiel #9
0
def model_structure(args):
    """Load the prediction model and index its weights by rsid.

    When both ``args.sub_batches`` and ``args.sub_batch`` are given, ``extra``
    is reduced with ``Utilities.sub_batch`` and the weights are restricted to
    the remaining genes. When ``args.only_entries`` is non-empty, both frames
    are further restricted to those genes.

    Returns:
        A tuple ``(m, weights, extra)`` where ``m`` maps each rsid to
        ``(non_effect_allele, effect_allele, {gene: weight})``.
    """
    model = PredictionModel.load_model(args.model_db_path,
                                       args.model_db_snp_key)
    weights = model.weights
    extra = model.extra

    slicing_requested = (args.sub_batches is not None
                         and args.sub_batch is not None)
    if slicing_requested:
        logging.info("slicing models")
        extra = Utilities.sub_batch(extra, args.sub_batches, args.sub_batch)
        in_batch = weights.gene.isin(extra.gene)
        weights = weights[in_batch].reset_index(drop=True)

    if args.only_entries:
        extra_selected = extra.gene.isin(set(args.only_entries))
        extra = extra[extra_selected]
        weights_selected = weights.gene.isin(set(args.only_entries))
        weights = weights[weights_selected]

    by_rsid = {}
    for row in weights.itertuples():
        rsid = row.rsid
        if rsid in by_rsid:
            entry = by_rsid[rsid]
        else:
            # The alleles are taken from the first row seen for this rsid.
            entry = (row.non_effect_allele, row.effect_allele, {})
            by_rsid[rsid] = entry
        entry[2][row.gene] = row.weight
    return by_rsid, weights, extra
    def test_optimized_model_manager(self):
        """Check the ``_ModelManager`` built from ``tests/_td/dbs_2``.

        Genes, rsids and model labels are compared with the weights returned
        by ``get_weights_in_models``; per-gene models go through
        ``_compare_o``.
        """
        folder = "tests/_td/dbs_2"
        manager = PredictionModel.load_model_manager(
            folder, Klass=PredictionModel._ModelManager)
        weights = get_weights_in_models(folder)

        # Global views exposed by the manager.
        expected_genes = {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K'}
        self.assertEqual(manager.get_genes(), expected_genes)
        self.assertEqual(manager.get_rsids(), set(weights.rsid))
        expected_labels = {"model_sim_1", "model_sim_2"}
        self.assertEqual(manager.get_model_labels(), expected_labels)

        # Models returned for each gene versus that gene's weight rows.
        for gene in set(weights.gene):
            gene_rows = weights[weights.gene == gene]
            gene_models = manager.get_models(gene)
            _compare_o(self, gene_models, gene_rows)

        # Model labels available for each individual gene.
        for gene in set(weights.gene):
            gene_rows = weights[weights.gene == gene]
            self.assertEqual(manager.get_model_labels(gene),
                             set(gene_rows.model))
    def test_load_model(self):
        """Compare ``load_model`` on test_1.db with the SampleData frames.

        Every extra and weights column is checked, in a fixed order, with
        ``numpy.testing.assert_array_equal``.
        """
        snp_model = PredictionModel.load_model("tests/_td/dbs/test_1.db")

        # Extra columns.
        e_e = SampleData.dataframe_from_extra(SampleData.sample_extra_2())
        extra_columns = (
            PredictionModel.WDBEQF.K_GENE,
            PredictionModel.WDBEQF.K_GENE_NAME,
            PredictionModel.WDBEQF.K_N_SNP_IN_MODEL,
            PredictionModel.WDBEQF.K_PRED_PERF_R2,
            PredictionModel.WDBEQF.K_PRED_PERF_PVAL,
            PredictionModel.WDBEQF.K_PRED_PERF_QVAL,
        )
        for column in extra_columns:
            numpy.testing.assert_array_equal(snp_model.extra[column],
                                             e_e[column])

        # Weights columns.
        e_w = SampleData.dataframe_from_weights(SampleData.sample_weights_2())
        weight_columns = (
            PredictionModel.WDBQF.K_RSID,
            PredictionModel.WDBQF.K_GENE,
            PredictionModel.WDBQF.K_WEIGHT,
            PredictionModel.WDBQF.K_NON_EFFECT_ALLELE,
            PredictionModel.WDBQF.K_EFFECT_ALLELE,
        )
        for column in weight_columns:
            numpy.testing.assert_array_equal(snp_model.weights[column],
                                             e_w[column])
Beispiel #12
0
def run(args):
    """Compute per-gene prediction covariances and save them per chromosome.

    Does nothing, apart from logging, when ``args.snp_covariance_output``
    already exists. Otherwise the models in ``args.models_folder`` are
    loaded, and for every chromosome yielded by
    ``GenotypeUtilities.genotype_by_chromosome_from_args`` the covariance
    data of each gene is flattened into a data frame; the frames of one
    chromosome are saved together to ``args.snp_covariance_output``.

    Returns:
        None.
    """
    if os.path.exists(args.snp_covariance_output):
        logging.info("%s already exists, you have to move it or delete it if you want it done again", args.snp_covariance_output)
        return

    start = timer()

    logging.info("Loading models...")
    model_manager = PredictionModel.load_model_manager(args.models_folder, name_pattern=args.models_pattern)
    all_snps = model_manager.get_rsids()

    logging.info("processing genotype")
    # Create the file (with header) on the first chromosome actually seen and
    # append afterwards. Keying this on ``chromosome == 1`` dropped the header
    # when chromosome 1 was absent and truncated earlier results when it was
    # not the first one yielded.
    first_write = True
    for chromosome, metadata, dosage in GenotypeUtilities.genotype_by_chromosome_from_args(args, all_snps):
        logging.log(9, "Processing chromosome %s", str(chromosome))
        progress_message = "%d %% of genes processed so far in chromosome " + str(chromosome)

        context = GenotypeAnalysis.GenotypeAnalysisContext(metadata, dosage, model_manager)
        genes = context.get_genes()
        reporter = Utilities.PercentReporter(9, len(genes))
        reporter.update(0, progress_message)

        # Collect the per-gene frames and concatenate once per chromosome
        # rather than re-copying the accumulated frame for every gene.
        gene_frames = []
        for i, gene in enumerate(genes):
            logging.log(6, "%d/%d:%s", i+1, len(genes), gene)
            cov_data = GenotypeAnalysis.get_prediction_covariance(context, gene)
            cov_data = MatrixManager._flatten_matrix_data([cov_data])
            cov_data = Utilities.to_dataframe(cov_data, GenotypeAnalysis.COVARIANCE_COLUMNS, to_numeric="ignore", fill_na="NA")
            gene_frames.append(cov_data)

            reporter.update(i, progress_message)

        reporter.update(len(genes), progress_message)

        # pandas.concat rejects an empty list, so keep the empty-frame result
        # for a chromosome without genes.
        covariance_results = pandas.concat(gene_frames) if gene_frames else pandas.DataFrame()

        logging.log(9, "writing chromosome results")
        Utilities.save_dataframe(covariance_results, args.snp_covariance_output,
                                 mode="w" if first_write else "a",
                                 header=first_write)
        first_write = False

    end = timer()
    logging.info("Ran covariance builder in %s seconds", str(end - start))
def run(args):
    """Compute per-gene prediction covariances and stream them to a gzip file.

    Does nothing, apart from logging, when ``args.snp_covariance_output``
    already exists. Otherwise the models in ``args.models_folder`` are
    loaded, a ``GENE/RSID1/RSID2/VALUE`` header is written, and one
    tab-separated line is written per flattened covariance entry of every
    gene in every chromosome yielded by
    ``GenotypeUtilities.genotype_by_chromosome_from_args``.

    Returns:
        None.
    """
    if os.path.exists(args.snp_covariance_output):
        logging.info("%s already exists, you have to move it or delete it if you want it done again", args.snp_covariance_output)
        return

    start = timer()

    logging.info("Loading models...")
    model_manager = PredictionModel.load_model_manager(args.models_folder, name_pattern=args.models_pattern, name_filter=args.models_filter)
    all_snps = model_manager.get_rsids()
    Utilities.ensure_requisite_folders(args.snp_covariance_output)
    # gzip.open(..., "w") is a binary stream: on Python 3 it only accepts
    # bytes, so every line is encoded before being written.
    with gzip.open(args.snp_covariance_output, "w") as o:
        o.write("GENE\tRSID1\tRSID2\tVALUE\n".encode())
        logging.info("processing genotype")

        for chromosome, metadata, dosage in GenotypeUtilities.genotype_by_chromosome_from_args(args, all_snps):
            logging.log(9, "Processing chromosome %s", str(chromosome))
            progress_message = "%d %% of genes processed so far in chromosome " + str(chromosome)

            context = GenotypeAnalysis.GenotypeAnalysisContext(metadata, dosage, model_manager)
            genes = context.get_genes()
            reporter = Utilities.PercentReporter(9, len(genes))
            reporter.update(0, progress_message)
            for i, gene in enumerate(genes):
                logging.log(6, "%d/%d:%s", i+1, len(genes), gene)
                cov_data = GenotypeAnalysis.get_prediction_covariance(context, gene)
                cov_data = MatrixManager._flatten_matrix_data([cov_data])
                for e in cov_data:
                    line = "{}\t{}\t{}\t{}\n".format(e[0], e[1], e[2], e[3])
                    o.write(line.encode())

                reporter.update(i, progress_message)

            reporter.update(len(genes), progress_message)

    end = timer()
    logging.info("Ran covariance builder in %s seconds", str(end - start))
def _prediction_model():
    """Build a ``PredictionModel.Model`` from the ``*_1`` SampleData fixtures."""
    extra_frame = SampleData.dataframe_from_extra(SampleData.sample_extra_1())
    weights_frame = SampleData.dataframe_from_weights(
        SampleData.sample_weights_1())
    return PredictionModel.Model(weights_frame, extra_frame)
 def test_snps_in_db(self):
     """``snps_in_db`` on test_2.db must equal the expected set of rsids."""
     wanted = {
         "rs245915", "rs245913", "rs245909", "rs245906", "rs10486599",
         "rs144012121", "rs117887801", "rs542000", "rs544632", "rs498475",
         "rs849327", "rs849336", "rs849335", "rs1513272", "rs849135",
         "rs849134", "rs860262", "rs849133", "rs1635852", "rs864745",
         "rs112751321", "rs144273091", "rs117462481", "rs149305679",
         "rs643036", "rs1937888", "rs17155745", "rs62626328",
     }
     found = PredictionModel.snps_in_db("tests/_td/dbs/test_2.db")
     self.assertEqual(found, wanted)