def testWeightDBEntry(self):
    """Check that WeightDBEntry exposes its five constructor arguments by name."""
    attribute_names = ("rsid", "gene", "weight", "ref_allele", "eff_allele")
    values = ("a", "b", "c", "d", "e")

    entry = WeightDBUtilities.WeightDBEntry(*values)

    # Positional argument i is expected to land on attribute_names[i].
    for attribute, value in zip(attribute_names, values):
        self.assertEqual(getattr(entry, attribute), value)
def testWeightDBEntryLogic(self):
    """Check the lookup tables WeightDBEntryLogic builds from tests/_td/test.db.

    Compares weights_by_gene, gene_data_for_gene and genes_for_an_rsid
    against the fixtures returned by expected_weights_results() and
    expected_extra_results().
    """
    logic = WeightDBUtilities.WeightDBEntryLogic("tests/_td/test.db")
    expected_weights = expected_weights_results()
    expected_extra = expected_extra_results()

    # Both per-gene tables must hold exactly one entry per expected gene.
    gene_count = len(expected_extra)
    self.assertEqual(len(logic.weights_by_gene), gene_count)
    self.assertEqual(len(logic.gene_data_for_gene), gene_count)

    for extra in expected_extra:
        gene = extra.gene
        self.assertTrue(gene in logic.weights_by_gene)
        self.assertTrue(gene in logic.gene_data_for_gene)

        self.assertExtra([logic.gene_data_for_gene[gene]], [extra])

        loaded = list(logic.weights_by_gene[gene].values())
        wanted = [w for w in expected_weights if w.gene == gene]
        self.assertWeights(loaded, wanted)

    # Reverse index: every rsid maps to the genes that carry a weight for it.
    self.assertEqual(len(logic.genes_for_an_rsid), 6)
    for rsid, genes in logic.genes_for_an_rsid.items():
        wanted_genes = [w.gene for w in expected_weights if w.rsid == rsid]
        self.assertEqual(wanted_genes, genes)
def run(self):
    """Compute results from the covariance file and save them to output_file.

    Creates the output folder when needed and returns early (after logging)
    if output_file already exists. When selected_dosage_folder exists, the
    people listed in its samples file are loaded as well.
    """
    output_dir, _ = os.path.split(self.output_file)
    if output_dir and not os.path.exists(output_dir):
        os.makedirs(output_dir)

    already_done = os.path.exists(self.output_file)
    if already_done:
        logging.info(
            "Results path %s already exists, delete it if you want it to be calculated again",
            self.output_file)
        return

    # People are only looked up when the dosage folder is actually there.
    people_by_id = None
    dosage_folder = self.selected_dosage_folder
    if os.path.exists(dosage_folder):
        logging.info("Loading people")
        samples_path = Utilities.samplesInputPath(dosage_folder)
        if samples_path is not None:
            people_by_id = dict(
                (person.id, person)
                for person in Person.Person.loadPeople(samples_path))

    db_path = self.weight_db_path
    logging.info("Loading weights from database: %s" % (db_path))
    weight_db_logic = WeightDBUtilities.WeightDBEntryLogic(db_path)

    # Normalization is ignored at the moment. Not sure if it will return.
    results, _normalization = self.resultsFromCovarianceFile(weight_db_logic)
    self.saveEntries(self.output_file, results)
    logging.info("Successfully ran MetaXcan analysis")
def run(self):
    """Build betas for every GWAS file in gwas_folder that matches gwas_regexp.

    A weight model is loaded only when args.weight_db_path is set; otherwise
    None is passed to buildBetas. Creates output_folder when missing.

    Raises:
        Exceptions.ReportableException: when no GWAS file matches.
    """
    weight_db_logic = None
    if self.args.weight_db_path:
        logging.info("Loading weight model")
        weight_db_logic = WeightDBUtilities.WeightDBEntryLogic(
            self.weight_db_path)

    gwas_folder, gwas_regexp = self.gwas_folder, self.gwas_regexp
    names = Utilities.contentsWithRegexpFromFolder(gwas_folder, gwas_regexp)

    if not os.path.exists(self.output_folder):
        os.makedirs(self.output_folder)

    if len(names) == 0:
        message = "No GWAS files found on %s with pattern %s" % (
            gwas_folder, gwas_regexp.pattern)
        raise Exceptions.ReportableException(message)

    for gwas_name in names:
        try:
            self.buildBetas(weight_db_logic, gwas_name)
        except Exceptions.BadFilename as bad_name:
            # This just means that there is some extra stuff inside that
            # directory, so it is skipped rather than treated as an error.
            logging.info("Wrong file name: %s, skipping", bad_name.msg)
def testGeneEntry(self):
    """Check that GeneEntry exposes its six constructor arguments by name."""
    attribute_names = ("gene", "gene_name", "n_snps", "pred_perf_R2",
                       "pred_perf_pval", "pred_perf_qval")
    values = ("a", "b", "c", "d", "e", "f")

    entry = WeightDBUtilities.GeneEntry(*values)

    # Positional argument i is expected to land on attribute_names[i].
    for attribute, value in zip(attribute_names, values):
        self.assertEqual(getattr(entry, attribute), value)
def run(self):
    """Load the weight database at db_path and pass it to buildFiles.

    Logs a hint and returns without doing anything when neither
    correlation_output nor covariance_output is set.
    """
    wants_output = self.correlation_output or self.covariance_output
    if not wants_output:
        logging.info("Provide --correlation_output or --covariance_output or both")
        return

    logging.info("Loading Weights")
    weights = WeightDBUtilities.WeightDBEntryLogic(self.db_path)
    self.buildFiles(weights)
def run(self):
    """Load the weight database at db_path, build the files and log the run time.

    Logs a hint and returns without doing anything when neither
    correlation_output nor covariance_output is set.
    """
    started_at = timer()

    if not (self.correlation_output or self.covariance_output):
        logging.info(
            "Provide --correlation_output or --covariance_output or both")
        return

    logging.info("Loading Weights")
    weights = WeightDBUtilities.WeightDBEntryLogic(self.db_path)

    logging.info("Building files")
    self.buildFiles(weights)

    elapsed = timer() - started_at
    logging.info("Ran successfully in %s seconds", str(elapsed))
def run(self):
    """Build the variance database from every "gz" entry of data_folder_phase.

    The weight model is loaded only when weight_db is set; otherwise None is
    handed to buildVarianceDB. Returns early (after logging) when output_file
    already exists, and creates the output directory when it is missing.
    """
    weight_db_logic = WeightDBUtilities.WeightDBEntryLogic(
        self.weight_db) if self.weight_db else None

    contents = Utilities.contentsWithPatternsFromFolder(
        self.data_folder_phase, ["gz"])

    if os.path.exists(self.output_file):
        logging.info(
            "Variance output already exists, delete it if you want stuff to be figured out again"
        )
        return

    # dirname is "" when output_file has no directory component;
    # os.makedirs("") would raise, so only create a real directory.
    output_dir = os.path.dirname(self.output_file)
    if output_dir and not os.path.exists(output_dir):
        os.makedirs(output_dir)

    for content in contents:
        self.buildVarianceDB(weight_db_logic, content)
def run(self):
    """Build betas for every GWAS file in gwas_folder that matches gwas_regexp.

    Loads the weight model from weight_db_path first and creates
    output_folder when it is missing. Files rejected with BadFilename are
    logged and skipped.
    """
    logging.info("Loading weight model")
    model = WeightDBUtilities.WeightDBEntryLogic(self.weight_db_path)

    names = Utilities.contentsWithRegexpFromFolder(self.gwas_folder,
                                                   self.gwas_regexp)

    target = self.output_folder
    if not os.path.exists(target):
        os.makedirs(target)

    for gwas_name in names:
        try:
            self.buildBetas(model, gwas_name)
        except Exceptions.BadFilename as bad_name:
            # This just means that there is some extra stuff inside that
            # directory, so it is skipped rather than treated as an error.
            logging.info("Wrong file name: %s, skipping", bad_name.msg)
def testPrediXcanLoader(self):
    """Check the SNPs PrediXcanFormatDosageLoader reads from the chr1 test dosage."""
    weight_db = WeightDBUtilities.WeightDBEntryLogic("tests/_td/test.db")
    loader = PrediXcanFormatUtilities.PrediXcanFormatDosageLoader(
        "tests/_td/filtered_dosage/chr1.dosage.gz", weight_db)
    snps, snps_by_rsid = loader.load()

    self.assertEqual(sorted(snps_by_rsid.keys()),
                     ["rs1", "rs2", "rs3", "rs4", "rs5", "rs6"])

    # One row per SNP, in load order: (ref_allele, eff_allele, position, data)
    expected_rows = [
        ("T", "C", 1, [0, 0, 0, 2]),
        ("A", "G", 2, [1, 1, 1, 1]),
        ("G", "A", 3, [0, 1, 0, 1]),
        ("T", "C", 4, [0, 0, 0, 0]),
        ("C", "T", 5, [0, 0, 1, 1]),
        ("C", "T", 6, [0, 0, 0, 2]),
    ]
    for index, (ref_allele, eff_allele, position, data) in enumerate(expected_rows):
        self.assertEqual(snps[index].ref_allele, ref_allele)
        self.assertEqual(snps[index].eff_allele, eff_allele)
        self.assertEqual(snps[index].position, position)
        self.assertEqual(snps[index].data, data)
def testWeightDB(self):
    """Check WeightDB gene data, weight loading (with callback) and gene names.

    Uses tests/_td/test.db and the fixtures from expected_weights_results()
    and expected_extra_results().
    """
    # test setup
    class RecordingCallback:
        """Remembers every (weight, extra) pair it is called with."""

        def __init__(self):
            self.entries = []

        def __call__(self, weight, extra):
            self.entries.append((weight, extra))

    expected_weights = expected_weights_results()
    expected_extra = expected_extra_results()
    weight_db = WeightDBUtilities.WeightDB("tests/_td/test.db")

    # load gene data: one gene at a time, then all genes at once
    for index, gene in enumerate(["A", "B", "C", "D"]):
        self.assertExtra(weight_db.loadExtraColumnData(gene),
                         [expected_extra[index]])
    self.assertExtra(weight_db.loadExtraColumnData(), expected_extra)

    # load db
    def check_load(wanted, wanted_count, *gene):
        """Load weights (optionally for one gene) and compare with `wanted`."""
        recorder = RecordingCallback()
        weights = weight_db.loadFromDB(recorder, *gene)
        self.assertWeights(weights, wanted)
        self.assertEqual(len(recorder.entries), wanted_count)
        # The callback must have seen exactly the weights that were returned.
        seen = [entry[0] for entry in recorder.entries]
        self.assertEqual(seen, weights)

    indices_by_gene = [("A", (0, 1, 2)), ("B", (3, 4)), ("C", (5,)), ("D", (6,))]
    for gene, indices in indices_by_gene:
        check_load([expected_weights[i] for i in indices], len(indices), gene)
    check_load(expected_weights, 7)

    # gene names
    self.assertEqual(weight_db.loadGeneNamesFromDB(), ["A", "B", "C", "D"])
def testWeightDBInvalidPath(self):
    """Check that opening the database at tests/kk.db raises RuntimeError."""
    missing_db = WeightDBUtilities.WeightDB("tests/kk.db")
    self.assertRaises(RuntimeError, missing_db.openDBIfNecessary)
def testGeneEntry(self):
    """Check that GeneEntry exposes its four constructor arguments by name."""
    attribute_names = ("gene", "gene_name", "R2", "n_snp")
    values = ("a", "b", "c", "d")

    entry = WeightDBUtilities.GeneEntry(*values)

    # Positional argument i is expected to land on attribute_names[i].
    for attribute, value in zip(attribute_names, values):
        self.assertEqual(getattr(entry, attribute), value)
def buildBetas(self, db_filename):
    """Run the beta and z-score steps for one weight database.

    Derives the covariance path, the beta output folder and the final
    report path from ``db_filename`` and ``self.args``, stores them back on
    ``self.args``, builds betas for every GWAS file found in
    ``args.gwas_folder`` and finally runs ``M04_zscores.CalculateZScores``.

    Args:
        db_filename: path of the weight database (``*.db``) to process.
    """
    filebase = os.path.basename(db_filename).replace(".db", "")
    output_folder = os.path.abspath(self.args.output_directory)
    logging.info("Processing betas for %s", db_filename)

    self.args.weight_db_path = os.path.abspath(db_filename)

    # "SAME" means: look for the covariance next to the weight database.
    cov_directory = self.args.covariance_directory
    if cov_directory.upper() == "SAME":
        cov_directory = "/".join(self.args.weight_db_path.split("/")[0:-1])

    # A suffix of the form "<covext>..<dbext>" says: strip <dbext> from the
    # database file name and append <covext> to get the covariance name.
    ext_components = self.args.covariance_suffix.split("..")
    if len(ext_components) > 1:
        covext = "..".join(ext_components[0:-1])
        dbext = ext_components[-1]
        filebase = db_filename.replace(dbext, "")
        self.args.covariance = "%s/%s%s" % (
            cov_directory, filebase.split("/")[-1], covext)
    else:
        # Fix: this used strip("/")[-1], which picks the LAST CHARACTER of
        # the name instead of the last path component.
        self.args.covariance = "%s/%s%s" % (
            cov_directory, filebase.split("/")[-1],
            self.args.covariance_suffix)

    file_prefix = filebase.split("/")[-1].split(".")[0]
    beta_output = os.path.join(output_folder, file_prefix)
    logging.info("Writing betas to %s", beta_output)
    self.args.output_folder = beta_output

    logging.info("Loading weight model")
    weight_db_logic = WeightDBUtilities.WeightDBEntryLogic(
        self.args.weight_db_path)

    betaScript = M03_betas.GetBetas(self.args)
    names = Utilities.contentsWithRegexpFromFolder(self.args.gwas_folder,
                                                   betaScript.gwas_regexp)

    betaScript.output_folder = beta_output
    if not os.path.exists(beta_output):
        os.makedirs(beta_output)

    # The report is named after the first GWAS file that was seen.
    # NOTE(review): when `names` is empty report_prefix stays None and the
    # concatenation below raises TypeError - confirm whether an explicit
    # "no GWAS files" error is wanted here.
    report_prefix = None
    for name in names:
        name = name + ".gz"
        if report_prefix is None:
            report_prefix = name.split("/")[-1].split(".")[0]
        try:
            betaScript.buildBetas(weight_db_logic, name)
        except Exceptions.BadFilename as e:
            # This just means that there is some extra stuff inside that
            # directory, so it is skipped rather than treated as an error.
            logging.info("Wrong file name: %s, skipping", e.msg)

    suffix = ".csv"
    self.args.output_file = os.path.join(
        output_folder, report_prefix + "-" + file_prefix + suffix)

    # ZScores
    logging.info("Calculating ZScores for %s", filebase)
    zscoreScript = M04_zscores.CalculateZScores(self.args)
    zscoreScript.folder_beta = betaScript.output_folder
    zscoreScript.run()
def run(self): # run the main function if not self.covariance_output: logging.info("Provide --covariance_output or both") return # list all the databases in the path for file in sorted(os.listdir(self.db_path)): if file.endswith(".db") and not file.endswith("sqtl.db"): self.db_file_list.append(file) # load the database and build the separate db entry logic logging.info("Loading Weights") count = 0 for file in self.db_file_list: count += 1 filename = self.db_path + file self.db_logic_dict[file] = WeightDBUtilities.WeightDBEntryLogic( filename) logging.info("Building file" + str(count)) # merge the info from different databases tmp_logic_object = self.db_logic_dict[list( self.db_logic_dict.keys())[0]] count = 0 for db_logic in self.db_logic_dict.values(): count += 1 logging.info("Scanning file" + str(count)) # update the weights_by_gene count_gene = 0 num_gene = len(db_logic.weights_by_gene.keys()) for gene in db_logic.weights_by_gene.keys(): count_gene += 1 if count_gene % 150 == 0: logging.info("Percentage of genes processed " + str( round(float(count_gene) / float(num_gene), 2) * 100)) if gene in tmp_logic_object.weights_by_gene.keys(): for rsid in db_logic.weights_by_gene[gene].keys(): if rsid not in tmp_logic_object.weights_by_gene[ gene].keys(): tmp_logic_object.weights_by_gene[gene][ rsid] = db_logic.weights_by_gene[gene][rsid] else: tmp_logic_object.weights_by_gene[ gene] = db_logic.weights_by_gene[gene] # summary of gene count and snp count logging.info("Total Genes:" + str(len(tmp_logic_object.weights_by_gene.keys()))) rsid_count = 0 for gene in tmp_logic_object.weights_by_gene.keys(): rsid_count += len(tmp_logic_object.weights_by_gene[gene].keys()) logging.info("Total SNPs:" + str(rsid_count)) # store the pickle file pickle_out = open("db_weight_logic.pickle", "wb") pickle.dump(tmp_logic_object, pickle_out) pickle_out.close() # store the gene info self.saveGeneInfo(tmp_logic_object) # whether calculate the covariance directly # store the database entry 
logic as pickle file if not self.store_pickle_only: self.buildFiles(tmp_logic_object) logging.info("Ran successfully")
def run(self):
    """Build per-tissue and per-gene merged weight models for gene_list.

    Loads every ``*.db`` file in ``db_path`` (one per tissue), then
    (1) for each tissue merges the weights of all genes in ``gene_list``
    under the key ``'merged'`` and calls ``buildFilesTissue``, and
    (2) for each gene merges that gene's weights across all tissues and
    calls ``buildFilesGene``.

    Returns early when ``covariance_output`` is not set. Uses Python 2
    ``print`` statements.
    """
    # run the main function
    if not self.covariance_output:
        logging.info("Provide --covariance_output or both")
        return
    # list all the databases in the path
    for file in sorted(os.listdir(self.db_path)):
        if file.endswith(".db"):
            self.db_file_list.append(file)
    # load the database and build the separate db entry logic
    logging.info("Loading Weights")
    count = 0
    # gene level snplist: start every requested gene with an empty list
    for gene in self.gene_list:
        self.gene_rsid_dict[gene] = []
    # reading weight database
    tissue_vec = []
    #self.db_file_list = self.db_file_list[0:1]
    for file in self.db_file_list:
        count += 1
        # plain concatenation: presumably db_path ends with a path separator - verify against caller
        filename = self.db_path + file
        # the tissue name is the file name up to ".db"
        tissue = file.split(".db")[0]
        tissue_vec.append(tissue)
        self.db_logic_dict[tissue] = WeightDBUtilities.WeightDBEntryLogicGene(filename, self.gene_list)
        # independent copy: the per-tissue merge below deletes entries in place
        self.db_logic_tissue[tissue] = copy.deepcopy(self.db_logic_dict[tissue])
        logging.info("Building file" + str(count))
        print "INFO: Building file" + str(count)
        #print self.db_logic_dict[tissue].weights_by_gene[self.gene_list[0]]
    # same tissue, different genes
    for tissue in tissue_vec:
        tmp_db_logic = self.db_logic_tissue[tissue]
        tmp_db_logic.weights_by_gene['merged'] = {}
        for gene in self.gene_list:
            if gene in tmp_db_logic.weights_by_gene.keys():
                # move this gene's weights into the shared 'merged' entry
                tmp_db_logic.weights_by_gene['merged'].update(tmp_db_logic.weights_by_gene[gene])
                del tmp_db_logic.weights_by_gene[gene]
        if len(tmp_db_logic.weights_by_gene['merged'].keys()) > 0:
            rsid_merged = tmp_db_logic.weights_by_gene['merged'].keys()
            for rsid in rsid_merged:
                # NOTE(review): stores the string 'merged' itself, not a collection of genes - confirm consumers expect that
                tmp_db_logic.genes_for_an_rsid[rsid] = 'merged'
            # build related data
            self.buildFilesTissue(tmp_db_logic, tissue, self.chr_idx)
    # same gene, different tissues
    for gene in self.gene_list:
        # start from a copy of the first tissue's model and fold the others in
        self.db_logic_gene[gene] = copy.deepcopy(self.db_logic_dict[tissue_vec[0]])
        tmp_db_logic = self.db_logic_gene[gene]
        if not gene in tmp_db_logic.weights_by_gene.keys():
            tmp_db_logic.weights_by_gene[gene] = {}
        for tissue in tissue_vec:
            if gene in self.db_logic_dict[tissue].weights_by_gene.keys():
                tmp_db_logic.weights_by_gene[gene].update(self.db_logic_dict[tissue].weights_by_gene[gene])
        # drop the other requested genes so only `gene` remains from gene_list
        for tmp_gene in self.gene_list:
            if tmp_gene != gene and tmp_gene in tmp_db_logic.weights_by_gene.keys():
                del tmp_db_logic.weights_by_gene[tmp_gene]
        if len(tmp_db_logic.weights_by_gene[gene].keys()) > 0:
            rsid_merged = tmp_db_logic.weights_by_gene[gene].keys()
            for rsid in rsid_merged:
                tmp_db_logic.genes_for_an_rsid[rsid] = gene
            # build related data
            self.buildFilesGene(tmp_db_logic, gene, self.chr_idx)
        else:
            print "Not coresponding SNPs for gene " + gene
            # NOTE(review): this leaves run() entirely, so later genes are never processed - confirm that is intended
            return
    # summary of gene count and snp count
    # NOTE(review): `tissue` is whatever value the loops above left behind - confirm this is the database meant to be summarised
    logging.info("Total Genes:" + str(len(self.db_logic_dict[tissue].weights_by_gene.keys())))
    logging.info("Preprocess successfully")
    print "INFO: Preprocess complete"