def testWeightDBEntry(self):
     # WeightDBEntry is expected to expose its five positional constructor
     # arguments under these attribute names, in this order.
     attributes = ("rsid", "gene", "weight", "ref_allele", "eff_allele")
     values = ("a", "b", "c", "d", "e")
     entry = WeightDBUtilities.WeightDBEntry(*values)
     for attribute, value in zip(attributes, values):
         self.assertEqual(getattr(entry, attribute), value)
    def testWeightDBEntryLogic(self):
        # Loads the fixture database through WeightDBEntryLogic and checks its
        # three lookup tables (weights_by_gene, gene_data_for_gene and
        # genes_for_an_rsid) against the expected fixture rows.
        weight_db_entry_logic = WeightDBUtilities.WeightDBEntryLogic(
            "tests/_td/test.db")

        expected_weights = expected_weights_results()
        expected_extra = expected_extra_results()

        # One entry per gene is expected in both per-gene tables.
        self.assertEqual(len(weight_db_entry_logic.weights_by_gene),
                         len(expected_extra))
        self.assertEqual(len(weight_db_entry_logic.gene_data_for_gene),
                         len(expected_extra))

        for e in expected_extra:
            # assertIn reports the missing key and the container on failure.
            self.assertIn(e.gene, weight_db_entry_logic.weights_by_gene)
            self.assertIn(e.gene, weight_db_entry_logic.gene_data_for_gene)

            actual_gene_data = weight_db_entry_logic.gene_data_for_gene[e.gene]
            self.assertExtra([actual_gene_data], [e])

            # Only the weight entries matter here, not the keys they are
            # stored under.
            actual_weights = list(
                weight_db_entry_logic.weights_by_gene[e.gene].values())
            e_w = [w for w in expected_weights if w.gene == e.gene]
            self.assertWeights(actual_weights, e_w)

        self.assertEqual(len(weight_db_entry_logic.genes_for_an_rsid), 6)
        for rsid, genes in weight_db_entry_logic.genes_for_an_rsid.items():
            expected = [w.gene for w in expected_weights if w.rsid == rsid]
            self.assertEqual(expected, genes)
Example #3
0
    def run(self):
        """Compute results from the covariance file and save them.

        Creates the parent folder of ``self.output_file`` when needed. If the
        output file already exists, only a log message is emitted and nothing
        is recomputed.
        """
        output_dir = os.path.dirname(self.output_file)
        if output_dir and not os.path.exists(output_dir):
            os.makedirs(output_dir)

        if os.path.exists(self.output_file):
            logging.info(
                "Results path %s already exists, delete it if you want it to be calculated again",
                self.output_file)
            return

        # People are loaded when a samples file is available; the resulting
        # mapping is not used further down in this method.
        people_by_id = None
        if os.path.exists(self.selected_dosage_folder):
            logging.info("Loading people")
            samples_path = Utilities.samplesInputPath(
                self.selected_dosage_folder)
            if samples_path is not None:
                loaded = Person.Person.loadPeople(samples_path)
                people_by_id = {person.id: person for person in loaded}

        logging.info("Loading weights from database: %s", self.weight_db_path)
        weight_db_logic = WeightDBUtilities.WeightDBEntryLogic(
            self.weight_db_path)

        #Normalization is ignored at the moment. Not sure if it will return.
        results, _normalization = self.resultsFromCovarianceFile(
            weight_db_logic)

        self.saveEntries(self.output_file, results)

        logging.info("Successfully ran MetaXcan analysis")
Example #4
0
    def run(self):
        """Build betas for every GWAS file found in ``self.gwas_folder``.

        The weight model is loaded only when ``self.args.weight_db_path`` is
        set; otherwise ``None`` is handed to ``buildBetas``. Files rejected
        with ``Exceptions.BadFilename`` are logged and skipped.

        Raises:
            Exceptions.ReportableException: no file in the folder matched
                ``self.gwas_regexp``.
        """
        weight_db_logic = None
        if self.args.weight_db_path:
            logging.info("Loading weight model")
            # NOTE(review): the guard reads self.args.weight_db_path while the
            # path itself comes from self.weight_db_path -- confirm they agree.
            weight_db_logic = WeightDBUtilities.WeightDBEntryLogic(
                self.weight_db_path)

        gwas_names = Utilities.contentsWithRegexpFromFolder(
            self.gwas_folder, self.gwas_regexp)

        if not os.path.exists(self.output_folder):
            os.makedirs(self.output_folder)

        if len(gwas_names) == 0:
            message = "No GWAS files found on %s with pattern %s" % (
                self.gwas_folder,
                self.gwas_regexp.pattern,
            )
            raise Exceptions.ReportableException(message)

        for gwas_name in gwas_names:
            try:
                self.buildBetas(weight_db_logic, gwas_name)
            except Exceptions.BadFilename as e:
                # This just means that there is some extra stuff inside that
                # directory, so it is ignored.
                logging.info("Wrong file name: %s, skipping", e.msg)
 def testGeneEntry(self):
     # GeneEntry is expected to expose its six positional constructor
     # arguments under these attribute names, in this order.
     expected = [
         ("gene", "a"),
         ("gene_name", "b"),
         ("n_snps", "c"),
         ("pred_perf_R2", "d"),
         ("pred_perf_pval", "e"),
         ("pred_perf_qval", "f"),
     ]
     entry = WeightDBUtilities.GeneEntry(*[value for _, value in expected])
     for attribute, value in expected:
         self.assertEqual(getattr(entry, attribute), value)
Example #6
0
    def run(self):
        """Load the weight database and build the requested output files.

        When neither ``self.correlation_output`` nor ``self.covariance_output``
        is set, a hint is logged and nothing else happens.
        """
        requested_output = self.correlation_output or self.covariance_output
        if not requested_output:
            logging.info("Provide --correlation_output or --covariance_output or both")
            return

        logging.info("Loading Weights")
        weights = WeightDBUtilities.WeightDBEntryLogic(self.db_path)
        self.buildFiles(weights)
Example #7
0
    def run(self):
        """Load the weight database, build the output files and log the elapsed time.

        When neither ``self.correlation_output`` nor ``self.covariance_output``
        is set, a hint is logged and nothing else happens.
        """
        started_at = timer()

        requested_output = self.correlation_output or self.covariance_output
        if not requested_output:
            logging.info(
                "Provide --correlation_output or --covariance_output or both")
            return

        logging.info("Loading Weights")
        weights = WeightDBUtilities.WeightDBEntryLogic(self.db_path)

        logging.info("Building files")
        self.buildFiles(weights)

        elapsed = timer() - started_at
        logging.info("Ran successfully in %s seconds", str(elapsed))
Example #8
0
    def run(self):
        """Build the variance database from the files in ``self.data_folder_phase``.

        The weight model is loaded only when ``self.weight_db`` is set;
        otherwise ``None`` is passed to ``buildVarianceDB``. If
        ``self.output_file`` already exists, a message is logged and nothing
        is rebuilt.
        """
        weight_db_logic = WeightDBUtilities.WeightDBEntryLogic(
            self.weight_db) if self.weight_db else None
        contents = Utilities.contentsWithPatternsFromFolder(
            self.data_folder_phase, ["gz"])

        if os.path.exists(self.output_file):
            logging.info(
                "Variance output already exists, delete it if you want stuff to be figured out again"
            )
            return

        # dirname() is "" when output_file is a bare file name; os.makedirs("")
        # would raise, so only create a folder when there is one to create.
        output_dir = os.path.dirname(self.output_file)
        if output_dir and not os.path.exists(output_dir):
            os.makedirs(output_dir)

        for content in contents:
            self.buildVarianceDB(weight_db_logic, content)
Example #9
0
    def run(self):
        """Build betas for every GWAS file found in ``self.gwas_folder``.

        Creates ``self.output_folder`` when it is missing. Files rejected with
        ``Exceptions.BadFilename`` are logged and skipped.
        """
        logging.info("Loading weight model")
        weight_model = WeightDBUtilities.WeightDBEntryLogic(
            self.weight_db_path)

        gwas_names = Utilities.contentsWithRegexpFromFolder(
            self.gwas_folder, self.gwas_regexp)

        if not os.path.exists(self.output_folder):
            os.makedirs(self.output_folder)

        for gwas_name in gwas_names:
            try:
                self.buildBetas(weight_model, gwas_name)
            except Exceptions.BadFilename as e:
                # This just means that there is some extra stuff inside that
                # directory, so it is ignored.
                logging.info("Wrong file name: %s, skipping", e.msg)
Example #10
0
    def testPrediXcanLoader(self):
        # Loads the chr1 dosage fixture through the PrediXcan-format loader and
        # checks alleles, position and dosage data of every returned SNP.
        weight_db = WeightDBUtilities.WeightDBEntryLogic("tests/_td/test.db")
        loader = PrediXcanFormatUtilities.PrediXcanFormatDosageLoader(
            "tests/_td/filtered_dosage/chr1.dosage.gz", weight_db)
        snps, snps_by_rsid = loader.load()

        self.assertEqual(sorted(snps_by_rsid.keys()),
                         ["rs1", "rs2", "rs3", "rs4", "rs5", "rs6"])

        # (ref_allele, eff_allele, position, data) for snps[0] .. snps[5]
        expected_snps = [
            ("T", "C", 1, [0, 0, 0, 2]),
            ("A", "G", 2, [1, 1, 1, 1]),
            ("G", "A", 3, [0, 1, 0, 1]),
            ("T", "C", 4, [0, 0, 0, 0]),
            ("C", "T", 5, [0, 0, 1, 1]),
            ("C", "T", 6, [0, 0, 0, 2]),
        ]
        for index, row in enumerate(expected_snps):
            ref_allele, eff_allele, position, data = row
            self.assertEqual(snps[index].ref_allele, ref_allele)
            self.assertEqual(snps[index].eff_allele, eff_allele)
            self.assertEqual(snps[index].position, position)
            self.assertEqual(snps[index].data, data)
    def testWeightDB(self):
        # Exercises WeightDB against the fixture database: per-gene and
        # unfiltered loading of the extra (gene) data and of the weights,
        # followed by the gene-name listing.

        #test setup
        class RecordingCallback():
            def __init__(self):
                self.entries = []

            def __call__(self, weight, extra):
                self.entries.append((weight, extra))

        expected_weights = expected_weights_results()
        expected_extra = expected_extra_results()

        weight_db = WeightDBUtilities.WeightDB("tests/_td/test.db")
        genes = ["A", "B", "C", "D"]

        #load gene data
        for position, gene in enumerate(genes):
            extra = weight_db.loadExtraColumnData(gene)
            self.assertExtra(extra, [expected_extra[position]])

        extra = weight_db.loadExtraColumnData()
        self.assertExtra(extra, expected_extra)

        #load db
        def check_load(load_args, indices, expected_count):
            # Load with a fresh callback, then check the returned weights, the
            # number of callback invocations, and that the callback saw the
            # same weights that were returned. indices=None means "all rows".
            callback = RecordingCallback()
            weights = weight_db.loadFromDB(callback, *load_args)
            if indices is None:
                expected = expected_weights
            else:
                expected = [expected_weights[i] for i in indices]
            self.assertWeights(weights, expected)
            self.assertEqual(len(callback.entries), expected_count)
            callback_weights = [entry[0] for entry in callback.entries]
            self.assertEqual(callback_weights, weights)

        check_load(("A",), (0, 1, 2), 3)
        check_load(("B",), (3, 4), 2)
        check_load(("C",), (5,), 1)
        check_load(("D",), (6,), 1)
        check_load((), None, 7)

        #gene names
        gene_names = weight_db.loadGeneNamesFromDB()
        self.assertEqual(gene_names, ["A", "B", "C", "D"])
    def testWeightDBInvalidPath(self):
        # Constructing the WeightDB must not raise by itself; the RuntimeError
        # is expected only once the database is actually opened.
        unopened_db = WeightDBUtilities.WeightDB("tests/kk.db")

        self.assertRaises(RuntimeError, unopened_db.openDBIfNecessary)
 def testGeneEntry(self):
     # GeneEntry is expected to expose its four positional constructor
     # arguments under these attribute names, in this order.
     expected = [
         ("gene", "a"),
         ("gene_name", "b"),
         ("R2", "c"),
         ("n_snp", "d"),
     ]
     entry = WeightDBUtilities.GeneEntry(*[value for _, value in expected])
     for attribute, value in expected:
         self.assertEqual(getattr(entry, attribute), value)
Example #14
0
    def buildBetas(self, db_filename):
        """Build betas and then z-scores for a single weight database.

        Derives the covariance path, the beta output folder and the z-score
        output file from ``db_filename`` and ``self.args``, stores them on
        ``self.args``, runs ``M03_betas.GetBetas.buildBetas`` for every GWAS
        file found in ``self.args.gwas_folder`` and finally runs
        ``M04_zscores.CalculateZScores``.

        Args:
            db_filename: path of the weight database (``.db``) to process.
        """
        filebase = os.path.basename(db_filename).replace(".db", "")
        output_folder = os.path.abspath(self.args.output_directory)

        logging.info("Processing betas for %s", db_filename)
        self.args.weight_db_path = os.path.abspath(db_filename)
        cov_directory = self.args.covariance_directory
        if cov_directory.upper() == "SAME":
            # "SAME" means: use the folder that holds the weight database.
            cov_directory = "/".join(self.args.weight_db_path.split("/")[0:-1])

        extComponents = self.args.covariance_suffix.split("..")

        if len(extComponents) > 1:
            # Suffix given as "<covariance ext>..<db ext>": the db extension is
            # removed from the file name and the covariance extension appended.
            covext = "..".join(extComponents[0:-1])
            dbext = extComponents[-1]
            filebase = db_filename.replace(dbext, "")
            self.args.covariance = "%s/%s%s" % (cov_directory, filebase.split("/")[-1], covext)
        else:
            # Fixed: this branch used strip("/")[-1], which picks the last
            # *character* of the name; split("/")[-1] matches the branch above.
            self.args.covariance = "%s/%s%s" % (
                cov_directory, filebase.split("/")[-1], self.args.covariance_suffix)
        file_prefix = filebase.split("/")[-1].split(".")[0]
        beta_output = os.path.join(output_folder, file_prefix)
        logging.info("Writing betas to %s", beta_output)

        self.args.output_folder = beta_output

        logging.info("Loading weight model")
        weight_db_logic = WeightDBUtilities.WeightDBEntryLogic(self.args.weight_db_path)

        betaScript = M03_betas.GetBetas(self.args)
        names = Utilities.contentsWithRegexpFromFolder(self.args.gwas_folder, betaScript.gwas_regexp)

        # The beta script writes into the same folder, so one check suffices.
        if not os.path.exists(beta_output):
            os.makedirs(beta_output)
        betaScript.output_folder = beta_output

        # The report is named after the first GWAS file encountered.
        # NOTE(review): report_prefix stays None when no GWAS file is found,
        # and building the output file name below then fails with a TypeError.
        report_prefix = None
        for name in names:
            name = name + ".gz"
            if report_prefix is None:
                report_prefix = name.split("/")[-1].split(".")[0]
            try:
                betaScript.buildBetas(weight_db_logic, name)
            except Exceptions.BadFilename as e:
                # This just means that there is some extra stuff inside that
                # directory, so it is ignored.
                logging.info("Wrong file name: %s, skipping", e.msg)

        suffix = ".csv"
        self.args.output_file = os.path.join(
            output_folder, report_prefix + "-" + file_prefix + suffix)

        # ZScores
        logging.info("Calculating ZScores for %s", filebase)
        zscoreScript = M04_zscores.CalculateZScores(self.args)
        zscoreScript.folder_beta = betaScript.output_folder
        zscoreScript.run()
Example #15
0
    def run(self):
        """Merge the weights of every database under ``self.db_path`` and store them.

        Every ``*.db`` file in ``self.db_path`` (except names ending in
        ``sqtl.db``) is loaded. The weights of all databases are merged, per
        gene and rsid, into the first loaded database object; a weight already
        present for a gene/rsid pair is kept. The merged object is pickled to
        ``db_weight_logic.pickle`` in the current directory, the gene info is
        saved, and ``buildFiles`` runs unless ``self.store_pickle_only`` is set.

        Returns early, after logging, when ``self.covariance_output`` is not
        set or when no database file is found.
        """
        if not self.covariance_output:
            logging.info("Provide --covariance_output or both")
            return

        # list all the databases in the path
        for db_file in sorted(os.listdir(self.db_path)):
            if db_file.endswith(".db") and not db_file.endswith("sqtl.db"):
                self.db_file_list.append(db_file)

        # Without this guard, picking the merge target below fails with an
        # IndexError on an empty folder.
        if not self.db_file_list:
            logging.info("No weight databases found in %s", self.db_path)
            return

        # load the database and build the separate db entry logic
        logging.info("Loading Weights")
        for count, db_file in enumerate(self.db_file_list, 1):
            # os.path.join also works when db_path has no trailing separator.
            filename = os.path.join(self.db_path, db_file)
            self.db_logic_dict[db_file] = WeightDBUtilities.WeightDBEntryLogic(
                filename)
            logging.info("Building file" + str(count))

        # merge the info from different databases into the first one
        tmp_logic_object = next(iter(self.db_logic_dict.values()))
        merged_weights = tmp_logic_object.weights_by_gene
        for count, db_logic in enumerate(self.db_logic_dict.values(), 1):
            logging.info("Scanning file" + str(count))
            # update the weights_by_gene
            num_gene = len(db_logic.weights_by_gene)
            for count_gene, gene in enumerate(db_logic.weights_by_gene, 1):
                if count_gene % 150 == 0:
                    logging.info("Percentage of genes processed  " + str(
                        round(float(count_gene) / float(num_gene), 2) * 100))
                if gene in merged_weights:
                    # Known gene: only add rsids that are not present yet.
                    for rsid, weight in db_logic.weights_by_gene[gene].items():
                        if rsid not in merged_weights[gene]:
                            merged_weights[gene][rsid] = weight
                else:
                    merged_weights[gene] = db_logic.weights_by_gene[gene]

        # summary of gene count and snp count
        logging.info("Total Genes:" + str(len(merged_weights)))
        rsid_count = sum(len(weights) for weights in merged_weights.values())
        logging.info("Total SNPs:" + str(rsid_count))

        # store the pickle file; the with-block closes it even if dump fails
        with open("db_weight_logic.pickle", "wb") as pickle_out:
            pickle.dump(tmp_logic_object, pickle_out)

        # store the gene info
        self.saveGeneInfo(tmp_logic_object)

        # whether calculate the covariance directly
        # store the database entry logic as pickle file
        if not self.store_pickle_only:
            self.buildFiles(tmp_logic_object)

        logging.info("Ran successfully")
    def run(self):
        """Preprocess the weight databases per tissue and per gene.

        Every ``*.db`` file in ``self.db_path`` is loaded, restricted to
        ``self.gene_list``, with the tissue name taken from the file name.
        Then:

        * per tissue, the weights of all listed genes are pooled under the
          pseudo-gene ``'merged'`` and ``buildFilesTissue`` is called when the
          pool is not empty;
        * per gene, the weights of that gene are pooled across all tissues
          and ``buildFilesGene`` is called.

        Returns early, after logging, when ``self.covariance_output`` is not
        set or no database file is found, and also as soon as a gene has no
        weights in any tissue.
        """
        if not self.covariance_output:
            logging.info("Provide --covariance_output or both")
            return

        # list all the databases in the path
        for db_file in sorted(os.listdir(self.db_path)):
            if db_file.endswith(".db"):
                self.db_file_list.append(db_file)

        # Nothing to preprocess; the per-gene step below would otherwise fail
        # when it reads the first tissue.
        if not self.db_file_list:
            logging.info("No weight databases found in %s", self.db_path)
            return

        # load the database and build the separate db entry logic
        logging.info("Loading Weights")
        # gene level snplist
        for gene in self.gene_list:
            self.gene_rsid_dict[gene] = []
        # reading weight database
        tissue_vec = []
        for count, db_file in enumerate(self.db_file_list, 1):
            # os.path.join also works when db_path has no trailing separator.
            filename = os.path.join(self.db_path, db_file)
            tissue = db_file.split(".db")[0]
            tissue_vec.append(tissue)
            self.db_logic_dict[tissue] = WeightDBUtilities.WeightDBEntryLogicGene(filename, self.gene_list)
            # Independent copy: the per-tissue step below mutates it.
            self.db_logic_tissue[tissue] = copy.deepcopy(self.db_logic_dict[tissue])
            logging.info("Building file" + str(count))
            # Parenthesised single-argument print is valid on Python 2 and 3.
            print("INFO: Building file" + str(count))

        # same tissue, different genes
        for tissue in tissue_vec:
            tmp_db_logic = self.db_logic_tissue[tissue]
            tmp_db_logic.weights_by_gene['merged'] = {}
            for gene in self.gene_list:
                if gene in tmp_db_logic.weights_by_gene:
                    tmp_db_logic.weights_by_gene['merged'].update(tmp_db_logic.weights_by_gene[gene])
                    del tmp_db_logic.weights_by_gene[gene]
            if tmp_db_logic.weights_by_gene['merged']:
                for rsid in tmp_db_logic.weights_by_gene['merged']:
                    tmp_db_logic.genes_for_an_rsid[rsid] = 'merged'
                # build related data
                self.buildFilesTissue(tmp_db_logic, tissue, self.chr_idx)

        # same gene, different tissues
        for gene in self.gene_list:
            # Start from a copy of the first tissue and pool the others in.
            self.db_logic_gene[gene] = copy.deepcopy(self.db_logic_dict[tissue_vec[0]])
            tmp_db_logic = self.db_logic_gene[gene]
            if gene not in tmp_db_logic.weights_by_gene:
                tmp_db_logic.weights_by_gene[gene] = {}
            for tissue in tissue_vec:
                tissue_weights = self.db_logic_dict[tissue].weights_by_gene
                if gene in tissue_weights:
                    tmp_db_logic.weights_by_gene[gene].update(tissue_weights[gene])
            # Drop every other listed gene from this gene's copy.
            for tmp_gene in self.gene_list:
                if tmp_gene != gene and tmp_gene in tmp_db_logic.weights_by_gene:
                    del tmp_db_logic.weights_by_gene[tmp_gene]
            if tmp_db_logic.weights_by_gene[gene]:
                for rsid in tmp_db_logic.weights_by_gene[gene]:
                    tmp_db_logic.genes_for_an_rsid[rsid] = gene
                # build related data
                self.buildFilesGene(tmp_db_logic, gene, self.chr_idx)
            else:
                print("Not coresponding SNPs for gene " + gene)
                # NOTE(review): this aborts the whole run, skipping all
                # remaining genes -- confirm that `continue` was not intended.
                return

        # summary of gene count and snp count
        # (reported for the last tissue, which is what the leftover loop
        # variable referred to)
        logging.info("Total Genes:" + str(len(self.db_logic_dict[tissue_vec[-1]].weights_by_gene)))

        logging.info("Preprocess successfully")
        print("INFO: Preprocess complete")