def run_metaxcan(args, context):
    """Run the per-gene MetaXcan association scan.

    args: parsed arguments; uses remove_ens_version and output_file.
    context: supplies the model snps and the gwas/model data intersection.
    Returns the formatted results dataframe (also written to CSV when
    args.output_file is set).
    """
    logging.info("Started metaxcan association")
    model_snps = context.get_model_snps()
    total_snps = len(model_snps)
    snps_found=set()
    # Progress is reported as a percentage of the model's snps seen in the gwas.
    reporter = Utilities.PercentReporter(logging.INFO, total_snps)
    i_genes, i_snps = context.get_data_intersection()
    results = []
    for gene in i_genes:
        logging.log(7, "Processing gene %s", gene)
        r, snps = AssociationCalculation.association(gene, context, return_snps=True)
        results.append(r)
        snps_found.update(snps)
        reporter.update(len(snps_found), "%d %% of model's snps found so far in the gwas study")
    # force=True emits the final percentage even if it was already reported.
    reporter.update(len(snps_found), "%d %% of model's snps used", force=True)
    results = AssociationCalculation.dataframe_from_results(results)
    results = MetaxcanUtilities.format_output(results, context, args.remove_ens_version)
    if args.output_file:
        Utilities.ensure_requisite_folders(args.output_file)
        results.to_csv(args.output_file, index=False)
    return results
def run(args):
    """Convert a plain-text predicted-expression table into an HDF5 file.

    Expects args.input to be a tab-separated table whose first two columns
    are identifiers (the second presumably "IID" family/individual ids —
    only "IID" is read back explicitly) and the remaining columns one gene
    each; writes datasets "pred_expr", "genes" and "samples" to args.output.
    """
    if os.path.exists(args.output):
        logging.info("Output exists, delete it or move it if you want it generated again")
        return
    Utilities.ensure_requisite_folders(args.output)
    logging.info("Reading input")
    data = pandas.read_table(args.input)
    logging.info("Opening output")
    # 50 MB chunk cache for the HDF5 writer.
    f = h5py_cache.File(args.output, 'w', chunk_cache_mem_size=int(50 * (1024 ** 2)))
    n_genes = data.shape[1]-2
    n_samples = data.shape[0]
    n_genes_chunk = np.min((n_genes, 10))
    logging.info("Processing expression")
    # scaleoffset=4 stores values with 4 decimal digits of precision.
    p = f.create_dataset("pred_expr", shape=(n_genes, n_samples), chunks=(n_genes_chunk, n_samples), dtype=np.dtype('float32'), scaleoffset=4, compression='gzip')
    g = f.create_dataset("genes", (n_genes,), dtype="S30")
    for i, gene in enumerate(data.columns.values[2:]):
        p[i, :] = data[gene].to_numpy()
        g[i] = np.string_(gene)
    logging.info("saving samples")
    s = f.create_dataset("samples", (n_samples,), dtype="S25")
    for i in xrange(0, n_samples):
        s[i] = np.string_(data["IID"][i])
    f.close()
    logging.info("Done")
def run(args):
    """Run the joint multi-tissue analysis over every gene in the context.

    Writes a tab-separated results file to args.output; skips the run
    entirely if that file already exists.
    """
    start = timer()
    if os.path.exists(args.output):
        logging.info("%s already exists, you have to move it or delete it if you want it done again", args.output)
        return
    logging.info("Creating context")
    context = CrossModelUtilities.context_from_args(args)
    results = []
    n_genes = context.get_n_genes()
    reporter = Utilities.PercentReporter(logging.INFO, n_genes)
    logging.info("Processing")
    reporter.update(0, "%d %% of model's genes processed so far")
    for i,gene in enumerate(context.get_genes()):
        logging.log(7, "Gene %d/%d: %s", i+1, n_genes, gene)
        result = JointAnalysis.joint_analysis(context, gene)
        results.append(result)
        reporter.update(i, "%d %% of model's genes processed so far")
    results = JointAnalysis.format_results(results)
    Utilities.ensure_requisite_folders(args.output)
    results.to_csv(args.output, index=False, sep="\t")
    end = timer()
    logging.info("Ran multi tissue in %s seconds" % (str(end - start)))
def run(self):
    """Collect allele statistics per chromosome and write them as CSV.

    For each of chr1..chr22, processes the gwas dosage file and the
    matching legend file, accumulating per-variant findings, then writes
    them sorted by chromosome.
    """
    if os.path.exists(self.output_file):
        logging.info("File %s already exists, delete it if you want it calculated again", self.output_file)
        return
    # NOTE(review): logs self.weight_db but opens self.db_path — presumably
    # the same path under two names; confirm against __init__.
    logging.info("Opening %s", self.weight_db)
    weight_db_logic = WeightDBUtilities.WeightDBEntryLogic(self.db_path)
    CHROMOSOMES = ["chr"+str(x) for x in xrange(1, 23)]
    dosage_names = Utilities.dosageNamesFromFolder(self.data_folder_gwas_dosage)
    legend_names = Utilities.legendNamesFromFolder(self.data_folder_phase)
    findings={}
    for chromosome in CHROMOSOMES:
        logging.info("Processing chromosome %s", chromosome)
        dosage_name = Utilities.removeNameWithPatterns(dosage_names, [chromosome+"."])
        dosage = self.loadDosageFile(self.data_folder_gwas_dosage, dosage_name)
        self.processDosage(chromosome, weight_db_logic, dosage, findings)
        legend_name = Utilities.removeNameEndingWith(legend_names, chromosome)
        self.processLegendName(chromosome, weight_db_logic, dosage, findings, legend_name)
    with open(self.output_file, "w") as file:
        file.write(AlleleStats.CSVHeader())
        def sortByChromosome(finding):
            return finding.chromosome
        entries = sorted(findings.values(), key=sortByChromosome)
        for finding in entries:
            line = finding.toCSVLine()
            file.write(line)
def run(args, _gwas=None):
    """Run the MetaXcan association scan and write results to args.output_file.

    args: parsed arguments (overwrite, output_file, keep_ens_version, plus
        whatever MetaxcanUtilities.build_context consumes).
    _gwas: optional pre-loaded gwas data passed straight through to
        build_context — presumably for callers/tests that already parsed it.
    """
    start = timer()
    # Refuse to clobber an existing result unless --overwrite was given.
    if not args.overwrite and os.path.exists(args.output_file):
        logging.info("%s already exists, move it or delete it if you want it done again", args.output_file)
        return
    logging.info("Started metaxcan association")
    context = MetaxcanUtilities.build_context(args, _gwas)
    model_snps = context.get_model_snps()
    total_snps = len(model_snps)
    snps_found=set()
    reporter = Utilities.PercentReporter(logging.INFO, total_snps)
    i_genes, i_snps = context.get_data_intersection()
    results = []
    for gene in i_genes:
        r, snps = AssociationCalculation.association(gene, context, return_snps=True)
        results.append(r)
        snps_found.update(snps)
        reporter.update(len(snps_found), "%d %% of model's snps found so far in the gwas study")
    Utilities.ensure_requisite_folders(args.output_file)
    reporter.update(len(snps_found), "%d %% of model's snps used", force=True)
    # NOTE(review): this variant transposes results before building the frame,
    # unlike the sibling run_metaxcan; assumed intentional for this API version.
    results = AssociationCalculation.dataframe_from_results(zip(*results))
    results = MetaxcanUtilities.format_output(results, context, args.keep_ens_version)
    results.to_csv(args.output_file, index=False)
    end = timer()
    # Fixed typo in completion message ("Sucessfully" -> "Successfully").
    logging.info("Successfully processed metaxcan association in %s seconds"%(str(end - start)))
def run(args):
    """Joint multi-tissue analysis across all genes; writes a TSV to args.output."""
    start = timer()
    if os.path.exists(args.output):
        logging.info("%s already exists, you have to move it or delete it if you want it done again", args.output)
        return
    logging.info("Creating context")
    context = CrossModelUtilities.context_from_args(args)
    n_genes = context.get_n_genes()
    progress = Utilities.PercentReporter(logging.INFO, n_genes)
    logging.info("Processing")
    progress.update(0, "%d %% of model's genes processed so far")
    gene_results = []
    for index, gene_name in enumerate(context.get_genes()):
        logging.log(7, "Gene %d/%d: %s", index + 1, n_genes, gene_name)
        gene_results.append(JointAnalysis.joint_analysis(context, gene_name))
        progress.update(index, "%d %% of model's genes processed so far")
    frame = JointAnalysis.format_results(gene_results)
    Utilities.ensure_requisite_folders(args.output)
    frame.to_csv(args.output, index=False, sep="\t")
    end = timer()
    logging.info("Ran multi tissue in %s seconds" % (str(end - start)))
def run(args):
    """PrediXcan association driver: one expression source, one output file.

    Requires exactly one of args.hdf5_expression_file / args.expression_file.
    Results are sorted by p-value and written via Utilities.save_dataframe.
    """
    start = timer()
    if os.path.exists(args.output):
        logging.info("%s already exists, you have to move it or delete it if you want it done again", args.output)
        return
    # Exactly one expression input must be supplied.
    if (args.hdf5_expression_file and args.expression_file) or \
        (not args.hdf5_expression_file and not args.expression_file):
        logging.info("Provide either hdf5 expression file or plain text expression file")
        return
    with PrediXcanUtilities.p_context_from_args(args) as context:
        genes = context.get_genes()
        n_genes = len(genes)
        reporter = Utilities.PercentReporter(logging.INFO, n_genes)
        reporter.update(0, "%d %% of model's genes processed so far", force=True)
        results = []
        for i,gene in enumerate(genes):
            logging.log(7, "Processing gene %s", gene)
            r = PrediXcanAssociation.predixcan_association(gene, context)
            results.append(r)
            reporter.update(i, "%d %% of model's genes processed so far")
        # Force the final progress line: the original duplicated call without
        # force=True was a no-op since that percentage was already reported.
        reporter.update(i, "%d %% of model's genes processed so far", force=True)
        results = PrediXcanAssociation.dataframe_from_results(results)
        results = results.fillna("NA")
        results = results.sort_values(by="pvalue")
        Utilities.save_dataframe(results, args.output)
    end = timer()
    # NOTE(review): message says "multi tissue predixcan" but this looks like
    # the single-tissue run; kept as-is pending confirmation.
    logging.info("Ran multi tissue predixcan in %s seconds" % (str(end - start)))
def testContentsWithPatternsFromFolders(self):
    """Only names containing every pattern should be returned."""
    found = set(Utilities.contentsWithPatternsFromFolder("tests/_td/dosage_set_1", ["sample", "Fail"]))
    self.assertEqual(found, set([]))
    found = set(Utilities.contentsWithPatternsFromFolder("tests/_td/dosage_set_1", ["set", "sample"]))
    self.assertEqual(found, {"set.sample"})
def testCheckSubdirectorySanity(self):
    """A folder is not a sane target relative to itself or to its own child."""
    self.assertFalse(Utilities.checkSubdirectorySanity("tests", "tests"))
    self.assertTrue(Utilities.checkSubdirectorySanity("tests", "tests/_td"))
    self.assertFalse(Utilities.checkSubdirectorySanity("tests/_td", "tests"))
def _run(args, subset=None, append=None):
    """Build a standardized expression feature manager and save its covariances."""
    logging.info("Loading expressions")
    manager = FeatureMatrix.build_manager(
        args.expression_folder,
        filters=args.expression_filters,
        standardize=True,
        subset=subset)
    logging.info("Saving")
    Utilities.ensure_requisite_folders(args.output)
    manager.save_covariances(args.output, append=append)
    logging.info("Ran.")
def _run(args, subset=None, append=None):
    """Compute expression covariances for one (optional) subset and persist them."""
    logging.info("Loading expressions")
    expression_manager = FeatureMatrix.build_manager(args.expression_folder, filters=args.expression_filters, standardize=True, subset=subset)
    logging.info("Saving")
    Utilities.ensure_requisite_folders(args.output)
    expression_manager.save_covariances(args.output, append=append)
    logging.info("Ran.")
def run(self):
    """MetaXcan analysis entry point: load inputs, compute zscores, save CSV.

    Skips the run if the output file already exists. People information is
    optional and only loaded when the selected dosage folder is present.
    """
    folder = os.path.split(self.output_file)[0]
    if len(folder) and not os.path.exists(folder):
        os.makedirs(folder)
    if os.path.exists(self.output_file):
        logging.info("Results path %s already exists, delete it if you want it to be calculated again", self.output_file)
        return
    people_by_id = None
    if os.path.exists(self.selected_dosage_folder):
        logging.info("Loading people")
        samples_path = Utilities.samplesInputPath(self.selected_dosage_folder)
        if samples_path is not None:
            people = Person.Person.loadPeople(samples_path)
            people_by_id = {p.id: p for p in people}
    logging.info("Loading weights from database: %s" % (self.weight_db_path))
    weight_db_logic = WeightDBUtilities.WeightDBEntryLogic(self.weight_db_path)
    #Normalization is ignored at the moment. Not sure if it will return.
    results = None
    normalization = None
    results, normalization = self.resultsFromCovarianceFile(weight_db_logic)
    # Note: normalization is computed but intentionally not passed to saveEntries here.
    self.saveEntries(self.output_file, results)
    logging.info("Successfully ran MetaXcan analysis")
def get_name_prefix(args):
    """Derive the report prefix from the first GWAS file found in the folder."""
    pattern = re.compile(args.gwas_file_pattern) if args.gwas_file_pattern else None
    candidates = Utilities.contentsWithRegexpFromFolder(args.gwas_folder, pattern)
    first = candidates[0]
    # Prefix is the basename with everything after the first dot stripped.
    return first.split("/")[-1].split(".")[0]
def readGWAS(args): start = timer() validate(args) regexp = re.compile(args.gwas_file_pattern) if args.gwas_file_pattern else None names = Utilities.contentsWithRegexpFromFolder(args.gwas_folder, regexp) names.sort() #cosmetic, because different filesystems/OS yield folders in different order if len(names) == 0: msg = "No GWAS files found on %s with pattern %s" % (args.gwas_folder, args.gwas_file_pattern,) raise Exceptions.ReportableException(msg) print "INFO: Reading GWAS data" gwas_format = GWASUtilities.gwas_format_from_args(args) GWAS.validate_format_basic(gwas_format) GWAS.validate_format_for_strict(gwas_format) #model = PredictionModel.load_model(args.model_db_path) if args.model_db_path else None model = None # dataframe r = pandas.DataFrame() for name in names: b = build_betas(args, model, gwas_format, name) r = pandas.concat([r,b]) end = timer() logging.info("Successfully parsed input gwas in %s seconds"%(str(end-start))) print("Successfully parsed input gwas in %s seconds"%(str(end-start))) return r
def run(self):
    """Compute zscore entries from the covariance file and save them (with normalization)."""
    folder = os.path.split(self.output_file)[0]
    if len(folder) and not os.path.exists(folder):
        os.makedirs(folder)
    if os.path.exists(self.output_file):
        logging.info("Results path %s already exists, delete it if you want it to be calculated again", self.output_file)
        return
    logging.info("Loading people")
    people_by_id = None
    if os.path.exists(self.selected_dosage_folder):
        samples_path = Utilities.samplesInputPath(self.selected_dosage_folder)
        if samples_path is not None:
            people = Person.Person.loadPeople(samples_path)
            people_by_id = {p.id:p for p in people}
    logging.info("Loading weights from database: %s" % (self.weight_db_path))
    weight_db_logic = WeightDBUtilities.WeightDBEntryLogic(self.weight_db_path)
    results = None
    normalization = None
    results, normalization = self.resultsFromCovarianceFile(weight_db_logic)
    # Unlike the sibling run(), this variant passes normalization through to saveEntries.
    self.saveEntries(self.output_file, results, normalization)
def processIMPUTEFiles(self):
    """Filter IMPUTE-format dosage files down to the selected people and snps.

    Output is written per input file in either IMPUTE or PrediXcan format,
    depending on self.output_format.
    """
    logging.info("Loading people")
    names = Utilities.hapNamesFromFolder(self.dosage_folder)
    all_people = Person.Person.loadPeople(self.samples_input)
    selected_people = Person.Person.loadPeople(self.samples_output, delim=" ")
    selected_people_by_id = {p.id:p for p in selected_people}
    logging.info("Loading snps")
    snp_data_set = DataSet.DataSetFileUtilities.loadFromCompressedFile(self.snp_list)
    # Membership dict used as a whitelist of rsids to keep.
    snp_dict = {rsid:True for rsid in snp_data_set.data}
    for name in names:
        output = os.path.join(self.output_folder, name)
        # 'filter' shadows the builtin; kept for byte-compatibility.
        filter = ThousandGenomesUtilities.IMPUTEFilteredDosageFileBuilder()
        filter.base_path = self.dosage_folder
        filter.name = name
        filter.output_pattern = output
        filter.snp_dict = snp_dict
        filter.all_people = all_people
        filter.selected_people_by_id = selected_people_by_id
        if self.output_format == Formats.IMPUTE:
            filter.buildIMPUTE()
        elif self.output_format == Formats.PrediXcan:
            # PrediXcan output needs the chromosome name, parsed from the file name.
            search = self.chromosome_in_name_regex.search(name)
            exitIf(search is None, Exceptions.InvalidInputFormat, \
                "No files found in '%s' that match the pattern, '%s'" \
                % (self.dosage_folder, self.chromosome_in_name_regex.pattern))
            chr = search.group(1)
            filter.chromosome_name = chr
            filter.buildPrediXcan()
        else:
            raise Exceptions.InvalidOutputFormat(self.output_format)
def test_gtex_geno_lines_generator(self):
    """gtex_geno_lines output must match a direct pandas merge of the same files."""
    data = []
    for i, line in enumerate(GTExGenotype.gtex_geno_lines("tests/_td/genotype/gtex_like.txt.gz", "tests/_td/genotype/gtex_snp.txt.gz")):
        data.append(line)
    header = GTExGenotype.gtex_geno_header("tests/_td/genotype/gtex_like.txt.gz")
    # First header column is the variant id; the rest are sample ids.
    gtex_ids = header[1:]
    header = ["rsid", "chromosome", "position", "ref_allele", "alt_allele", "frequency"] + gtex_ids
    dataframe = Utilities.to_dataframe(data, header, to_numeric="ignore")
    # Expected frame, built independently with pandas.
    gtex_snp = pandas.read_table("tests/_td/genotype/gtex_snp.txt.gz")
    dataframe_2 = pandas.read_table("tests/_td/genotype/gtex_like.txt.gz")
    dataframe_2 = pandas.merge(dataframe_2, gtex_snp, left_on="Id", right_on="VariantID")
    compare_data_frames(dataframe, dataframe_2, gtex_ids)
def buildFiles(self, weight_db_logic):
    """Build correlation and/or covariance files for every dosage in the data folder.

    Each output is skipped (not overwritten) if it already exists; when both
    are skipped the method returns without loading any snps.
    """
    do_correlations = self.correlation_output is not None
    if do_correlations:
        if os.path.exists(self.correlation_output):
            logging.info("%s already exists, delete it if you want it figured out again", self.correlation_output)
            do_correlations = False
        else:
            correlation_dir = os.path.dirname(self.correlation_output)
            if not os.path.exists(correlation_dir):
                os.makedirs(correlation_dir)
            self.writeFileHeader(self.correlation_output)
    # Same existence/header logic, duplicated for the covariance output.
    do_covariances = self.covariance_output is not None
    if do_covariances:
        if os.path.exists(self.covariance_output):
            logging.info("%s already exists, delete it if you want it figured out again", self.covariance_output)
            do_covariances = False
        else:
            covariance_dir = os.path.dirname(self.covariance_output)
            if not os.path.exists(covariance_dir):
                os.makedirs(covariance_dir)
            self.writeFileHeader(self.covariance_output)
    if not do_covariances and not do_correlations:
        return
    names = Utilities.dosageNamesFromFolder(self.data_folder)
    for name in names:
        snps, snps_by_rsid = self.getSNPS(name, weight_db_logic)
        if do_correlations:
            self.addToCorrelationFile(weight_db_logic, name, snps, snps_by_rsid)
        if do_covariances:
            self.addToCovarianceFile(weight_db_logic, name, snps, snps_by_rsid)
def processPrediXcanFiles(self): logging.info("Loading people") all_people = Person.Person.loadPeople(self.samples_input, '\t', False) selected_people = Person.Person.loadPeople(self.samples_output) selected_people_by_id = {p.id: p for p in selected_people} logging.info("%d total people, %d selected", len(all_people), len(selected_people_by_id)) logging.info("Loading snps") snp_data_set = DataSet.DataSetFileUtilities.loadFromCompressedFile( self.snp_list) snp_dict = {k: True for k in snp_data_set.data} print len(snp_dict.keys()) contents = Utilities.contentsWithPatternsFromFolder( self.dosage_folder, ["dosage.txt.gz"]) for content_name in contents: input_path = os.path.join(self.dosage_folder, content_name) fileBuilder = PrediXcanFormatUtilities.PrediXcanFormatFilteredFilesProcess( input_path, self.output_folder, content_name, all_people, selected_people_by_id, snp_dict) if self.output_format == Formats.IMPUTE: fileBuilder.buildIMPUTE() if self.output_format == Formats.PrediXcan: fileBuilder.buildPrediXcan() else: raise Exceptions.InvalidOutputFormat(self.output_format)
def run_additional(args, context):
    """Compute per-gene additional association statistics; optionally save them."""
    logging.info("Started metaxcan additional stats")
    i_genes, i_snps = context.get_data_intersection()
    stats = [AssociationCalculation.additional_stats(g, context) for g in i_genes]
    frame = AssociationCalculation.dataframe_from_aditional_stats(stats)
    frame = MetaxcanUtilities.format_additional_output(frame, context, args.remove_ens_version)
    if args.additional_output:
        Utilities.ensure_requisite_folders(args.additional_output)
        frame.to_csv(args.additional_output, index=False)
    return frame
def run(args):
    """Interactive debugging helper: load betas and inspect one gene's data.

    Loads the weight db, covariance matrix and all beta files, then, if
    args.gene_name is given, pre-processes that gene's zscore inputs and
    drops into an interactive shell (embed — presumably IPython) either on
    request (args.interactive) or on failure.
    """
    logging.info("Loading weight db")
    weight_db_logic = WeightDBUtilities.WeightDBEntryLogic(args.weight_db_path)
    logging.info("Loading covariance file")
    covariance_contents = MatrixUtilities.loadMatrixFromFile(args.covariance)
    logging.info("Choosing method")
    beta_contents = Utilities.contentsWithPatternsFromFolder(args.beta_folder, [])
    zscore_calculation, normalization = MethodGuessing.chooseZscoreSchemeFromFiles(args.beta_folder, beta_contents, covariance_contents, weight_db_logic)
    logging.info("Processing")
    betas = {}
    for content in beta_contents:
        logging.info("Loading betas")
        beta_path = os.path.join(args.beta_folder, content)
        beta_sets = KeyedDataSet.KeyedDataSetFileUtilities.loadDataSetsFromCompressedFile(beta_path, header="")
        # 'set' here shadows the builtin; each entry is keyed by its set name.
        beta_sets = {set.name: set for set in beta_sets}
        betas[content] = beta_sets
    if args.gene_name:
        try:
            gene_data, weights, covariance_matrix, valid_rsids, beta_sets = get_gene_data(args.gene_name, weight_db_logic, covariance_contents, betas)
            weight_values, variances = ZScoreCalculation.preProcess(covariance_matrix, valid_rsids, weights, beta_sets)
            if args.interactive:
                embed()
            logging.info("Processed gene data")
        except Exception as e:
            # Deliberate broad catch: this is a debug tool — drop to a shell instead of crashing.
            logging.info("Couldn't get gene data")
            embed()
def run(self):
    """Build beta files from every GWAS file found in the gwas folder.

    Files whose names don't parse (Exceptions.BadFilename) are skipped.
    """
    # NOTE(review): the guard reads self.args.weight_db_path but the load
    # uses self.weight_db_path — presumably mirrored in __init__; confirm.
    if self.args.weight_db_path:
        logging.info("Loading weight model")
        weight_db_logic = WeightDBUtilities.WeightDBEntryLogic(self.weight_db_path)
    else:
        weight_db_logic = None
    names = Utilities.contentsWithRegexpFromFolder(self.gwas_folder, self.gwas_regexp)
    if not os.path.exists(self.output_folder):
        os.makedirs(self.output_folder)
    if len(names) == 0:
        raise Exceptions.ReportableException("No GWAS files found on %s with pattern %s" % (self.gwas_folder, self.gwas_regexp.pattern,))
    for name in names:
        try:
            self.buildBetas(weight_db_logic, name)
        # This just means that there is some extra stuff inside that directory,
        # so I'm thinking we want to ignore it.
        except Exceptions.BadFilename as e:
            logging.info("Wrong file name: %s, skipping", e.msg)
            pass
def torture_dosage(metadata, dosage, gtex_ids):
    """Assemble a dosage frame aligned to metadata, indexed by row number."""
    rows = [dosage[rsid] for rsid in metadata.rsid]
    frame = Utilities.to_dataframe(rows, gtex_ids, to_numeric="ignore")
    frame["rsid"] = list(metadata.rsid)
    frame = pandas.merge(metadata, frame, on="rsid")
    frame["number"] = list(range(0, len(frame)))
    return frame.set_index("number")
def torture_dosage(metadata, dosage, gtex_ids):
    """Build a dosage dataframe in metadata order, indexed by row number.

    metadata: frame with an `rsid` column; dosage: mapping rsid -> row values;
    gtex_ids: column names for the dosage values.
    """
    d = [dosage[x] for x in metadata.rsid]
    d = Utilities.to_dataframe(d, gtex_ids, to_numeric="ignore")
    d["rsid"] = list(metadata.rsid)
    d = pandas.merge(metadata, d, on="rsid")
    # list(...) matches the sibling implementation and keeps this safe on
    # Python 3, where range() is lazy.
    d["number"] = list(range(0, len(d)))
    d = d.set_index("number")
    return d
def run(args):
    """GWAS input processing: build betas per file.

    With args.output_folder set, each file's betas are written as a gzipped
    TSV (existing outputs are skipped). Otherwise the betas are concatenated
    and returned as a single dataframe.
    """
    start = timer()
    validate(args)
    if args.gwas_folder:
        regexp = re.compile(args.gwas_file_pattern) if args.gwas_file_pattern else None
        names = Utilities.contentsWithRegexpFromFolder(args.gwas_folder, regexp)
        names.sort() #cosmetic, because different filesystems/OS yield folders in different order
        if len(names) == 0:
            msg = "No GWAS files found on %s with pattern %s" % (args.gwas_folder, args.gwas_file_pattern,)
            raise Exceptions.ReportableException(msg)
    else:
        names = [args.gwas_file]
    gwas_format = GWASUtilities.gwas_format_from_args(args)
    GWAS.validate_format_basic(gwas_format)
    GWAS.validate_format_for_strict(gwas_format)
    model = PredictionModel.load_model(args.model_db_path, args.model_db_snp_key) if args.model_db_path else None
    if args.output_folder:
        if not os.path.exists(args.output_folder):
            os.makedirs(args.output_folder)
        for name in names:
            output_path = os.path.join(args.output_folder, name)
            if not ".gz" in output_path:
                output_path += ".gz"
            if os.path.exists(output_path):
                logging.info("%s already exists, delete it if you want it to be done again", output_path)
                continue
            b = build_betas(args, model, gwas_format, name)
            c = "gzip" if ".gz" in name else None
            b.to_csv(output_path, sep="\t", index=False, compression=c)
        end = timer()
        logging.info("Successfully ran GWAS input processing in %s seconds" % (str(end - start)))
    else:
        # Collect then concatenate once: concat inside the loop recopies the
        # accumulated frame every iteration (quadratic in total rows).
        frames = [build_betas(args, model, gwas_format, name) for name in names]
        r = pandas.concat(frames) if frames else pandas.DataFrame()
        end = timer()
        logging.info("Successfully parsed input gwas in %s seconds" % (str(end - start)))
        return r
def run_metaxcan(args, context):
    """Per-gene MetaXcan association over the model/gwas intersection.

    Supports an early-exit cap (args.MAX_R) and optional additional
    statistics merged into the results (args.additional_output). Returns
    the formatted results dataframe, optionally written to args.output_file.
    """
    logging.info("Started metaxcan association")
    model_snps = context.get_model_snps()
    total_snps = len(model_snps)
    snps_found = set()
    reporter = Utilities.PercentReporter(logging.INFO, total_snps)
    i_genes, i_snps = context.get_data_intersection()
    results = []
    additional = []
    for i, gene in enumerate(i_genes):
        if args.MAX_R and i + 1 > args.MAX_R:
            # BUG FIX: this was logging.log("Early exit condition met") —
            # logging.log requires an int level first, so it raised TypeError.
            logging.info("Early exit condition met")
            break
        logging.log(9, "Processing gene %i:%s", i, gene)
        r, snps = AssociationCalculation.association(gene, context, return_snps=True)
        results.append(r)
        snps_found.update(snps)
        reporter.update(len(snps_found), "%d %% of model's snps found so far in the gwas study")
        if args.additional_output:
            stats_ = AssociationCalculation.additional_stats(gene, context)
            additional.append(stats_)
    reporter.update(len(snps_found), "%d %% of model's snps used", force=True)
    results = AssociationCalculation.dataframe_from_results(results)
    results = MetaxcanUtilities.format_output(results, context, args.remove_ens_version)
    if args.additional_output:
        additional = AssociationCalculation.dataframe_from_aditional_stats(additional)
        results = MetaxcanUtilities.merge_additional_output(results, additional, context, args.remove_ens_version)
    if args.output_file:
        Utilities.ensure_requisite_folders(args.output_file)
        results.to_csv(args.output_file, index=False)
    return results
def get_name_prefix(args):
    """Choose the report prefix from the gwas folder's first file, or the single gwas file."""
    if args.gwas_folder:
        pattern = re.compile(args.gwas_file_pattern) if args.gwas_file_pattern else None
        found = Utilities.contentsWithRegexpFromFolder(args.gwas_folder, pattern)
        return get_result_prefix(args, found[0])
    return get_result_prefix(args, args.gwas_file)
def run(args):
    """Run multi-PrediXcan simulations and write three result files.

    Outputs "<prefix>__mt_results.txt", "<prefix>__p_results.txt" and
    "<prefix>__additional.txt". Seeds numpy's RNG for reproducibility and
    optionally caps the number of simulated genes (args.max_n_results).
    """
    start = timer()
    folder, prefix = os.path.split(args.output_prefix)
    results_name = args.output_prefix + "__mt_results.txt"
    predixcan_results_name = args.output_prefix + "__p_results.txt"
    additional_name = args.output_prefix + "__additional.txt"
    if os.path.exists(results_name):
        logging.info("%s already exists, you have to move it or delete it if you want it done again", results_name)
        return
    #for reproducibility
    numpy.random.seed(100)
    results = []
    additional = []
    predixcan_results = []
    n_max = args.max_n_results
    logging.info("Acquiring context")
    with MultiPredixcanSimulations.context_from_args(args) as context:
        logging.info("processing")
        _c, _cp, _e = context.get_mp_simulation(None)
        for i, gene in enumerate(context.get_genes()):
            if n_max and i + 1 > n_max:
                logging.info("Max runs met")
                break
            logging.log(9, "%d Gene %s", i, gene)
            r, add, p = MultiPredixcanSimulations.simulate(gene, context)
            if r is None:
                logging.log(9, "%s could not be simulated", gene)
                continue
            results.append(r)
            additional.append(add)
            if p is not None:
                predixcan_results.append(p)
        results = MultiPrediXcanAssociation.dataframe_from_results(results, _c).sort_values(by="pvalue")
        additional = pandas.concat(additional)
        Utilities.ensure_requisite_folders(results_name)
        Utilities.save_dataframe(results, results_name)
        Utilities.save_dataframe(additional, additional_name)
        if len(predixcan_results):
            predixcan_results = pandas.concat(predixcan_results)
            Utilities.save_dataframe(predixcan_results, predixcan_results_name)
    logging.info("Finished")
def resultsFromCovarianceFile(self, weight_db_logic):
    """Compute per-gene zscore entries from beta files and the covariance file.

    Returns (results dict keyed by gene, normalization constant). Genes with
    no rsid present in a beta file are skipped on that file; a second pass
    (fillBlanks) covers genes absent from every beta file.
    """
    results = {}
    logging.info("Loading covariance file from %s", self.covariance)
    covariance_contents = MatrixUtilities.loadMatrixFromFile(self.covariance)
    #Keep only covariances present in gene models
    covariance_contents = {k:v for k,v in covariance_contents.iteritems() if k in weight_db_logic.weights_by_gene}
    beta_contents = Utilities.contentsWithPatternsFromFolder(self.folder_beta, [])
    zscore_calculation, normalization = self.selectMethod(self.folder_beta, beta_contents, covariance_contents, weight_db_logic)
    total_entries = len(weight_db_logic.genes_for_an_rsid)
    snps_found = set()
    reporter = Utilities.PercentReporter(logging.INFO, total_entries)
    for beta_name in beta_contents:
        logging.info("Processing %s", beta_name)
        beta_path = os.path.join(self.folder_beta, beta_name)
        beta_sets = KeyedDataSet.KeyedDataSetFileUtilities.loadDataSetsFromCompressedFile(beta_path, header="")
        beta_sets = {set.name:set for set in beta_sets }
        # Any one set serves as a membership check for rsids in this beta file.
        key, check = beta_sets.iteritems().next()
        normalization.update(beta_sets)
        for gene, entry in covariance_contents.iteritems():
            #So, new covariance files might actually have more genes than those in the database
            if not gene in weight_db_logic.weights_by_gene:
                logging.log(8, "Gene %s not in weights", gene)
                continue
            weights = weight_db_logic.weights_by_gene[gene]
            present = [rsid for rsid,weight in weights.iteritems() if rsid in check.values_by_key]
            if len(present) == 0:
                logging.log(5, "No rsid in beta file for %s", gene)
                continue
            if gene in results:
                logging.info("Gene %s already processed", gene)
                continue
            covariance_matrix = entry[0]
            valid_rsids = entry[1]
            logging.log(7, "Calculating z score for %s", gene)
            pre_zscore, n, VAR_g, effect_size = zscore_calculation(gene, weights, beta_sets, covariance_matrix, valid_rsids)
            results[gene] = self.buildEntry(gene, weight_db_logic, weights, pre_zscore, n, VAR_g, effect_size)
            snps_found.update(present)
            # FIX: this message literal was split across two physical lines
            # (a syntax error as extracted); rejoined into one literal.
            reporter.update(len(snps_found), "%d %% of model's snps found so far in the gwas study")
    #second pass, for genes not in any beta file
    self.fillBlanks(results, covariance_contents, weight_db_logic, zscore_calculation)
    normalization_constant = normalization.calculateNormalization()
    return results, normalization_constant
def resultsFromCovarianceFile(self, weight_db_logic):
    """Compute per-gene zscore entries from beta files and the covariance file.

    Older variant: progress is proxied by a running gene counter rather than
    by snps found. Returns (results dict, normalization constant).
    """
    results = {}
    logging.info("Loading covariance file")
    covariance_contents = MatrixUtilities.loadMatrixFromFile(self.covariance)
    beta_contents = Utilities.contentsWithPatternsFromFolder(self.folder_beta, [])
    zscore_calculation, normalization = self.selectMethod(self.folder_beta, beta_contents, covariance_contents, weight_db_logic)
    total_entries = len(covariance_contents)
    reporter = Utilities.PercentReporter(logging.INFO, total_entries)
    i=0
    for beta_name in beta_contents:
        logging.info("Processing %s", beta_name)
        beta_path = os.path.join(self.folder_beta, beta_name)
        beta_sets = KeyedDataSet.KeyedDataSetFileUtilities.loadDataSetsFromCompressedFile(beta_path, header="")
        beta_sets = {set.name:set for set in beta_sets }
        # Any one set serves as a membership check for rsids in this beta file.
        key, check = beta_sets.iteritems().next()
        normalization.update(beta_sets)
        for gene, entry in covariance_contents.iteritems():
            weights = weight_db_logic.weights_by_gene[gene]
            process = False
            for rsid, weight in weights.iteritems():
                if rsid in check.values_by_key:
                    process = True
                    break
            if not process:
                logging.log(5, "No rsid in beta file for %s", gene)
                continue
            if gene in results:
                logging.info("Gene %s already processed", gene)
                continue
            reporter.update(i, "%d %% of model's snp information found so far in the gwas study") #proxied by percenteage of genes
            covariance_matrix = entry[0]
            valid_rsids = entry[1]
            logging.log(7, "Calculating z score for %s", gene)
            pre_zscore, n, VAR_g = zscore_calculation(gene, weights, beta_sets, covariance_matrix, valid_rsids)
            results[gene] = self.buildEntry(gene, weight_db_logic, weights, pre_zscore, n, VAR_g)
            i+=1
    #second pass, for genes not in any beta file
    self.fillBlanks(results, covariance_contents, weight_db_logic, zscore_calculation)
    normalization_constant = normalization.calculateNormalization()
    return results, normalization_constant
def run(args):
    """Run multi-PrediXcan simulations; near-duplicate of the other simulation run().

    Writes "<prefix>__mt_results.txt", "<prefix>__p_results.txt" and
    "<prefix>__additional.txt"; seeds numpy for reproducibility.
    """
    start = timer()
    folder, prefix = os.path.split(args.output_prefix)
    results_name = args.output_prefix + "__mt_results.txt"
    predixcan_results_name = args.output_prefix + "__p_results.txt"
    additional_name = args.output_prefix + "__additional.txt"
    if os.path.exists(results_name):
        logging.info("%s already exists, you have to move it or delete it if you want it done again", results_name)
        return
    #for reproducibility
    numpy.random.seed(100)
    results = []
    additional = []
    predixcan_results = []
    n_max = args.max_n_results
    logging.info("Acquiring context")
    with MultiPredixcanSimulations.context_from_args(args) as context:
        logging.info("processing")
        _c, _cp, _e = context.get_mp_simulation(None)
        for i, gene in enumerate(context.get_genes()):
            if n_max and i+1>n_max:
                logging.info("Max runs met")
                break
            logging.log(9, "%d Gene %s", i, gene)
            r, add, p = MultiPredixcanSimulations.simulate(gene, context)
            if r is None:
                logging.log(9, "%s could not be simulated", gene)
                continue
            results.append(r)
            additional.append(add)
            if p is not None:
                predixcan_results.append(p)
        results = MultiPrediXcanAssociation.dataframe_from_results(results, _c).sort_values(by="pvalue")
        additional = pandas.concat(additional)
        Utilities.ensure_requisite_folders(results_name)
        Utilities.save_dataframe(results, results_name)
        Utilities.save_dataframe(additional, additional_name)
        if len(predixcan_results):
            predixcan_results = pandas.concat(predixcan_results)
            Utilities.save_dataframe(predixcan_results, predixcan_results_name)
    logging.info("Finished")
def getSNPS(self, name, weight_db_logic):
    """Load (snps, snps_by_rsid) for one dosage file per the configured input format.

    Returns None (after logging) on an unrecognized format.
    """
    if self.input_format == Formats.IMPUTE:
        loader = ThousandGenomesUtilities.IMPUTEDosageLoader(self.data_folder, name) #outdated code
    elif self.input_format == Formats.PrediXcan:
        path = os.path.join(self.data_folder, Utilities.dosageName(name))
        loader = PrediXcanFormatUtilities.PrediXcanFormatDosageLoader(path, weight_db_logic)
    else:
        logging.info("Invalid input format: %s", self.input_format)
        return
    return loader.load()
def run(args):
    """Build per-chromosome snp covariance data for all prediction models.

    Results are appended chromosome by chromosome to
    args.snp_covariance_output (header written only for chromosome 1).
    """
    if os.path.exists(args.snp_covariance_output):
        logging.info("%s already exists, you have to move it or delete it if you want it done again", args.snp_covariance_output)
        return
    start = timer()
    logging.info("Loading models...")
    model_manager = PredictionModel.load_model_manager(args.models_folder, name_pattern=args.models_pattern)
    all_snps = model_manager.get_rsids()
    logging.info("processing genotype")
    for chromosome, metadata, dosage in GenotypeUtilities.genotype_by_chromosome_from_args(args, all_snps):
        logging.log(9, "Processing chromosome %s", str(chromosome))
        covariance_results = pandas.DataFrame()
        context = GenotypeAnalysis.GenotypeAnalysisContext(metadata, dosage, model_manager)
        genes = context.get_genes()
        reporter = Utilities.PercentReporter(9, len(genes))
        reporter.update(0, "%d %% of genes processed so far in chromosome " + str(chromosome))
        for i,gene in enumerate(genes):
            logging.log(6, "%d/%d:%s", i+1, len(genes), gene)
            cov_data = GenotypeAnalysis.get_prediction_covariance(context, gene)
            cov_data = MatrixManager._flatten_matrix_data([cov_data])
            cov_data = Utilities.to_dataframe(cov_data, GenotypeAnalysis.COVARIANCE_COLUMNS, to_numeric="ignore", fill_na="NA")
            covariance_results = pandas.concat([covariance_results, cov_data])
            reporter.update(i, "%d %% of genes processed so far in chromosome "+str(chromosome))
        reporter.update(len(genes), "%d %% of genes processed so far in chromosome " + str(chromosome))
        logging.log(9, "writing chromosome results")
        # First chromosome truncates and writes the header; later ones append.
        Utilities.save_dataframe(covariance_results, args.snp_covariance_output, mode="w" if chromosome ==1 else "a", header=chromosome==1)
    end = timer()
    logging.info("Ran covariance builder in %s seconds" % (str(end - start)))
def run(args):
    """Run multi-tissue PrediXcan association over every gene in the context.

    Requires exactly one of hdf5_expression_folder / expression_folder.
    Writes association results, plus optional coefficient/loading outputs.
    """
    start = timer()
    if os.path.exists(args.output):
        logging.info("%s already exists, you have to move it or delete it if you want it done again", args.output)
        return

    # Exactly one expression source must be provided (neither or both is an error).
    if bool(args.hdf5_expression_folder) == bool(args.expression_folder):
        logging.info("Provide either hdf5 expression folder or plain text expression folder")
        return

    with MultiPrediXcanUtilities.mp_context_from_args(args) as context:
        genes = context.get_genes()
        n_genes = len(genes)
        reporter = Utilities.PercentReporter(logging.INFO, n_genes)
        reporter.update(0, "%d %% of model's genes processed so far", force=True)

        # Optional callbacks capturing intermediate model artifacts.
        callbacks = {}
        if args.coefficient_output:
            callbacks["coefficient"] = MultiPrediXcanAssociation.SaveCoefs()
        if args.loadings_output:
            callbacks["loadings"] = MultiPrediXcanAssociation.SaveLoadings()

        results = []
        for i, gene in enumerate(genes):
            logging.log(7, "Processing gene %i/%i: %s", i + 1, n_genes, gene)
            r = MultiPrediXcanAssociation.multi_predixcan_association(gene, context, callbacks.values())
            results.append(r)
            # i+1 genes are complete after processing index i
            reporter.update(i + 1, "%d %% of model's genes processed so far")
        # Previously this re-reported the stale loop index; force a final 100% report.
        reporter.update(n_genes, "%d %% of model's genes processed so far", force=True)

        results = MultiPrediXcanAssociation.dataframe_from_results(results, context)
        results = results.fillna("NA")
        results = results.sort_values(by="pvalue")

        Utilities.save_dataframe(results, args.output)
        if args.coefficient_output:
            Utilities.save_dataframe(callbacks["coefficient"].get(), args.coefficient_output)
        if args.loadings_output:
            Utilities.save_dataframe(callbacks["loadings"].get(), args.loadings_output)

    end = timer()
    logging.info("Ran multi tissue predixcan in %s seconds" % (str(end - start)))
def __init__(self, args):
    """Capture filtering/conversion settings from the parsed command line arguments."""
    self.dosage_folder = args.dosage_folder
    self.snp_list = args.snp_list
    self.output_folder = args.output_folder
    self.input_format = args.input_format
    self.output_format = args.output_format
    self.population_group_filters = args.population_group_filters
    # Individual filters are regular expressions; compile them once up front.
    self.individual_filters = [re.compile(pattern) for pattern in args.individual_filters]
    self.chromosome_in_name_regex = re.compile(args.file_pattern)
    # The samples file is mirrored into the output folder under its original name.
    self.samples_input = Utilities.samplesInputPath(self.dosage_folder)
    self.samples_output = os.path.join(self.output_folder, os.path.basename(self.samples_input))
def run(self):
    """Build the variance DB from phased data files, unless the output already exists."""
    # Check the output first so an existing result does not cost a weight-db load
    # (previously the db and folder contents were loaded before this check).
    if os.path.exists(self.output_file):
        logging.info("Correlations output already exists, delete it if you want stuff to be figured out again")
        return

    weight_db_logic = WeightDBUtilities.WeightDBEntryLogic(self.weight_db) if self.weight_db else None
    contents = Utilities.contentsWithPatternsFromFolder(self.data_folder_phase, ["gz"])

    output_dir = os.path.dirname(self.output_file)  # renamed: 'dir' shadows the builtin
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    for content in contents:
        self.buildVarianceDB(weight_db_logic, content)
def run(args):
    """Stream per-gene SNP covariances for every chromosome into a gzipped tsv file."""
    if os.path.exists(args.snp_covariance_output):
        logging.info("%s already exists, you have to move it or delete it if you want it done again", args.snp_covariance_output)
        return

    start = timer()

    logging.info("Loading models...")
    model_manager = PredictionModel.load_model_manager(args.models_folder, name_pattern=args.models_pattern, name_filter=args.models_filter)
    all_snps = model_manager.get_rsids()

    Utilities.ensure_requisite_folders(args.snp_covariance_output)
    with gzip.open(args.snp_covariance_output, "w") as o:
        o.write("GENE\tRSID1\tRSID2\tVALUE\n")

        logging.info("processing genotype")
        for chromosome, metadata, dosage in GenotypeUtilities.genotype_by_chromosome_from_args(args, all_snps):
            logging.log(9, "Processing chromosome %s", str(chromosome))

            context = GenotypeAnalysis.GenotypeAnalysisContext(metadata, dosage, model_manager)
            genes = context.get_genes()
            reporter = Utilities.PercentReporter(9, len(genes))
            reporter.update(0, "%d %% of genes processed so far in chromosome " + str(chromosome))
            for i, gene in enumerate(genes):
                logging.log(6, "%d/%d:%s", i+1, len(genes), gene)
                cov_data = GenotypeAnalysis.get_prediction_covariance(context, gene)
                # Flatten the covariance matrix into (gene, rsid1, rsid2, value) rows.
                cov_data = MatrixManager._flatten_matrix_data([cov_data])
                for e in cov_data:
                    o.write("{}\t{}\t{}\t{}\n".format(e[0], e[1], e[2], e[3]))
                # i+1 genes are complete after processing the gene at index i
                reporter.update(i+1, "%d %% of genes processed so far in chromosome "+str(chromosome))
            reporter.update(len(genes), "%d %% of genes processed so far in chromosome " + str(chromosome))

    end = timer()
    logging.info("Ran covariance builder in %s seconds" % (str(end - start)))
def __init__(self, args):
    """Store conversion settings taken from the parsed command line arguments."""
    self.dosage_folder = args.dosage_folder
    self.snp_list = args.snp_list
    self.output_folder = args.output_folder
    self.input_format = args.input_format
    self.output_format = args.output_format
    self.population_group_filters = args.population_group_filters
    # Compile each individual filter expression once, for repeated matching later.
    compiled_filters = []
    for expression in args.individual_filters:
        compiled_filters.append(re.compile(expression))
    self.individual_filters = compiled_filters
    self.chromosome_in_name_regex = re.compile(args.file_pattern)
    self.samples_input = Utilities.samplesInputPath(self.dosage_folder)
    samples_name = os.path.split(self.samples_input)[1]
    self.samples_output = os.path.join(self.output_folder, samples_name)
def test_gtex_geno_lines_generator(self):
    """The generator output must match a plain pandas merge of the same two files."""
    lines = GTExGenotype.gtex_geno_lines("tests/_td/genotype/gtex_like.txt.gz", "tests/_td/genotype/gtex_snp.txt.gz")
    data = [line for line in lines]

    header = GTExGenotype.gtex_geno_header("tests/_td/genotype/gtex_like.txt.gz")
    gtex_ids = header[1:]
    columns = ["rsid", "chromosome", "position", "ref_allele", "alt_allele", "frequency"] + gtex_ids
    dataframe = Utilities.to_dataframe(data, columns, to_numeric="ignore")

    # Build the expected frame by merging the raw genotype file with the snp annotation.
    gtex_snp = pandas.read_table("tests/_td/genotype/gtex_snp.txt.gz")
    expected = pandas.read_table("tests/_td/genotype/gtex_like.txt.gz")
    expected = pandas.merge(expected, gtex_snp, left_on="Id", right_on="VariantID")

    compare_data_frames(dataframe, expected, gtex_ids)
def run(self):
    """Build betas for every GWAS file found in the configured folder."""
    logging.info("Loading weight model")
    weight_db_logic = WeightDBUtilities.WeightDBEntryLogic(self.weight_db_path)

    names = Utilities.contentsWithRegexpFromFolder(self.gwas_folder, self.gwas_regexp)

    if not os.path.exists(self.output_folder):
        os.makedirs(self.output_folder)

    for name in names:
        try:
            self.buildBetas(weight_db_logic, name)
        except Exceptions.BadFilename as e:
            # A bad file name just means there is some extra stuff inside
            # that directory, so we skip it. (Removed the redundant 'pass'
            # after the logging call.)
            logging.info("Wrong file name: %s, skipping", e.msg)
def run(args):
    """Parse GWAS input files into beta dataframes.

    When args.output_folder is set, one gzipped file per GWAS input is written
    (pre-existing outputs are skipped) and nothing is returned. Otherwise the
    betas for all inputs are concatenated and returned as a single DataFrame.
    """
    start = timer()
    validate(args)

    if args.gwas_folder:
        regexp = re.compile(args.gwas_file_pattern) if args.gwas_file_pattern else None
        names = Utilities.contentsWithRegexpFromFolder(args.gwas_folder, regexp)
        names.sort()  # cosmetic, because different filesystems/OS yield folders in different order
        if len(names) == 0:
            msg = "No GWAS files found on %s with pattern %s" % (args.gwas_folder, args.gwas_file_pattern,)
            raise Exceptions.ReportableException(msg)
    else:
        names = [args.gwas_file]

    gwas_format = GWASUtilities.gwas_format_from_args(args)
    GWAS.validate_format_basic(gwas_format)
    GWAS.validate_format_for_strict(gwas_format)
    model = PredictionModel.load_model(args.model_db_path, args.model_db_snp_key) if args.model_db_path else None

    if args.output_folder:
        if not os.path.exists(args.output_folder):
            os.makedirs(args.output_folder)

        for name in names:
            output_path = os.path.join(args.output_folder, name)
            if ".gz" not in output_path:
                output_path += ".gz"
            if os.path.exists(output_path):
                logging.info("%s already exists, delete it if you want it to be done again", output_path)
                continue

            b = build_betas(args, model, gwas_format, name)
            # BUGFIX: compression must follow the actual output path, not the
            # input name. output_path always ends in ".gz", but the old code
            # checked the input name and could write plain text into a
            # ".gz"-named file.
            c = "gzip" if ".gz" in output_path else None
            b.to_csv(output_path, sep="\t", index=False, compression=c)
        end = timer()
        logging.info("Successfully ran GWAS input processing in %s seconds" % (str(end - start)))
    else:
        r = pandas.DataFrame()
        for name in names:
            b = build_betas(args, model, gwas_format, name)
            r = pandas.concat([r, b])
        end = timer()
        logging.info("Successfully parsed input gwas in %s seconds" % (str(end - start)))
        return r
def run(self):
    """Build the variance DB, skipping all work when the output already exists."""
    # Test for the output before loading anything: previously the weight db and
    # folder contents were loaded even when this early-out fired.
    if os.path.exists(self.output_file):
        logging.info("Variance output already exists, delete it if you want stuff to be figured out again")
        return

    weight_db_logic = WeightDBUtilities.WeightDBEntryLogic(self.weight_db) if self.weight_db else None
    contents = Utilities.contentsWithPatternsFromFolder(self.data_folder_phase, ["gz"])

    output_dir = os.path.dirname(self.output_file)  # renamed: 'dir' shadows the builtin
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    for content in contents:
        self.buildVarianceDB(weight_db_logic, content)
def run(self):
    """Compute per-gene covariance diagnostics and save them as a csv file.

    For each gene: snp count, w'Gamma w, determinant, w'w, mean variance,
    extreme eigenvalues and the count of near-zero eigenvalues.
    """
    logging.info("Loading weight db")
    weight_db_logic = WeightDBUtilities.WeightDBEntryLogic(self.weight_db_path)

    logging.info("Loading covariance file")
    # Only the first ".gz" file found in the covariance folder is used.
    covariance_name = Utilities.contentsWithPatternsFromFolder(self.folder_covariance, [".gz"])[0]
    path = os.path.join(self.folder_covariance, covariance_name)
    covariance_contents = MatrixUtilities.loadMatrixFromFile(path)

    logging.info("Getting stats")
    results = []
    for gene, entry in covariance_contents.iteritems():
        covariance_matrix = entry[0]
        valid_rsids = entry[1]
        weights = weight_db_logic.weights_by_gene_name[gene]
        weight_values, variances = ZScoreCalculation.preProcess(covariance_matrix, valid_rsids, weights)

        w_w = numpy.dot(numpy.transpose(weight_values), weight_values)
        # w' Gamma w: predicted-expression variance given the snp covariance
        dot_product = numpy.dot(numpy.dot(numpy.transpose(weight_values), covariance_matrix), weight_values)

        det = numpy.linalg.det(covariance_matrix)
        eigenvalues, eigenvectors = numpy.linalg.eigh(covariance_matrix)
        eigenmax = numpy.amax(eigenvalues)
        eigenmin = numpy.amin(eigenvalues)
        # Near-zero eigenvalues signal a degenerate (near-singular) covariance.
        n_small = int(numpy.sum(eigenvalues < 1e-7))

        diag = covariance_matrix.diagonal()
        mean_var = numpy.mean(diag)

        line = (gene, str(len(weight_values)), str(float(dot_product)), str(float(det)),
                str(float(w_w)), str(float(mean_var)), str(float(eigenmin)), str(float(eigenmax)), str(n_small))
        results.append(line)

    logging.info("saving results")
    # Renamed the handle from 'file': it shadowed the builtin (twice in this method).
    with open(self.output_file, "w") as output:
        header = ",".join(["gene", "m_snp_count", "w_gamma_w", "det", "w_w", "mean_var", "eigenmin", "eigenmax", "n_eigen_e-7"]) + "\n"
        output.write(header)
        for line in results:
            output.write(",".join(line) + "\n")
def model_structure(args):
    """Build an rsid-centric view of a prediction model.

    Returns (m, weights, extra) where m maps
    rsid -> (non_effect_allele, effect_allele, {gene: weight}).
    Optionally slices the model to a sub-batch and/or a gene whitelist.
    """
    model = PredictionModel.load_model(args.model_db_path, args.model_db_snp_key)
    weights, extra = model.weights, model.extra

    if args.sub_batches is not None and args.sub_batch is not None:
        logging.info("slicing models")
        extra = Utilities.sub_batch(extra, args.sub_batches, args.sub_batch)
        weights = weights[weights.gene.isin(extra.gene)].reset_index(drop=True)

    if args.only_entries:
        # Build the whitelist set once instead of once per filter.
        only = set(args.only_entries)
        extra = extra[extra.gene.isin(only)]
        weights = weights[weights.gene.isin(only)]

    m = {}
    for i in weights.itertuples():
        if i.rsid not in m:
            m[i.rsid] = (i.non_effect_allele, i.effect_allele, {})
        m[i.rsid][2][i.gene] = i.weight

    return m, weights, extra
def run(self):
    """Compute betas for each GWAS file in the configured folder."""
    logging.info("Loading weight model")
    weight_db_logic = WeightDBUtilities.WeightDBEntryLogic(self.weight_db_path)

    names = Utilities.contentsWithRegexpFromFolder(self.gwas_folder, self.gwas_regexp)

    if not os.path.exists(self.output_folder):
        os.makedirs(self.output_folder)

    for name in names:
        try:
            self.buildBetas(weight_db_logic, name)
        except Exceptions.BadFilename as e:
            # Extra files inside the directory are expected; skip them.
            # (Removed the redundant 'pass' after the logging call.)
            logging.info("Wrong file name: %s, skipping", e.msg)
def processPrediXcanFiles(self):
    """Filter PrediXcan dosage files to the selected people/snps and write them in the requested output format.

    Raises Exceptions.InvalidOutputFormat when self.output_format is unknown.
    """
    logging.info("Loading people")
    all_people = Person.Person.loadPeople(self.samples_input, '\t', False)
    selected_people = Person.Person.loadPeople(self.samples_output)
    selected_people_by_id = {p.id: p for p in selected_people}
    logging.info("%d total people, %d selected", len(all_people), len(selected_people_by_id))

    logging.info("Loading snps")
    snp_data_set = DataSet.DataSetFileUtilities.loadFromCompressedFile(self.snp_list)
    snp_dict = {k: True for k in snp_data_set.data}
    # Replaced a stray debug 'print len(snp_dict.keys())' with proper logging.
    logging.info("%d snps in whitelist", len(snp_dict))

    contents = Utilities.contentsWithPatternsFromFolder(self.dosage_folder, ["dosage.txt.gz"])
    for content_name in contents:
        input_path = os.path.join(self.dosage_folder, content_name)
        fileBuilder = PrediXcanFormatUtilities.PrediXcanFormatFilteredFilesProcess(input_path, self.output_folder, content_name, all_people, selected_people_by_id, snp_dict)
        # BUGFIX: this was "if / if / else" — a successful IMPUTE build fell
        # through to the second if's else branch and raised InvalidOutputFormat.
        if self.output_format == Formats.IMPUTE:
            fileBuilder.buildIMPUTE()
        elif self.output_format == Formats.PrediXcan:
            fileBuilder.buildPrediXcan()
        else:
            raise Exceptions.InvalidOutputFormat(self.output_format)
def load(self):
    """Load dosage snps from self.path, optionally restricted to snps present in self.weight_db_logic.

    Returns (snps, snps_by_rsid).
    """
    class PrediXcanCollector(object):
        # BUGFIX: the defaults used to be snps=[] / snps_by_rsid={}. Mutable
        # defaults are shared across instances, so a second load() call would
        # have accumulated snps from the previous run.
        def __init__(self, snps=None, snps_by_rsid=None, weight_db_logic=None):
            self.snps = snps if snps is not None else []
            self.snps_by_rsid = snps_by_rsid if snps_by_rsid is not None else {}
            self.weight_db_logic = weight_db_logic

        def __call__(self, i, components):
            rsid = components[PDTF.RSID]
            # Skip snps the weight db does not know about (when a db is given).
            if self.weight_db_logic and rsid not in self.weight_db_logic.genes_for_an_rsid:
                logging.log(5, "rsid %s not in weight db, skip it", rsid)
                return

            position = components[PDTF.POSITION]

            ref_allele = components[PDTF.ALLELE_0]
            if ref_allele not in Utilities.VALID_ALLELES:
                logging.log(9, "wrong ref allele, rsid %s is not an SNP", rsid)
                return

            eff_allele = components[PDTF.ALLELE_1]
            if eff_allele not in Utilities.VALID_ALLELES:
                logging.log(9, "wrong eff allele, rsid %s is not an SNP", rsid)
                return

            dosages = map(float, components[PDTF.FIRST_DATA_COLUMN:])  # dosages may be imputed
            # Should we flip based on weight_db at this point?
            snp = DataSetSNP.DataSetSNP(name=rsid, index=i, data=dosages, position=int(position), ref_allele=ref_allele, eff_allele=eff_allele)
            if snp.name in self.snps_by_rsid:
                # Duplicates are logged but the latest occurrence still wins.
                old = self.snps_by_rsid[snp.name]
                logging.info("Duplicated rsid: (%s,%s) %s", old.name, old.position, " ".join(components))
            self.snps.append(snp)
            self.snps_by_rsid[snp.name] = snp

    loader = Utilities.CSVFileIterator(self.path, compressed=True)
    collector = PrediXcanCollector(weight_db_logic=self.weight_db_logic)
    loader.iterate(collector)
    return collector.snps, collector.snps_by_rsid
def run(self):
    """Build betas for every GWAS file matching the configured pattern.

    Raises Exceptions.ReportableException when no GWAS files are found.
    """
    if self.args.weight_db_path:
        logging.info("Loading weight model")
        # NOTE(review): the guard tests self.args.weight_db_path but loads
        # self.weight_db_path — presumably mirrored in __init__; confirm.
        weight_db_logic = WeightDBUtilities.WeightDBEntryLogic(self.weight_db_path)
    else:
        weight_db_logic = None

    names = Utilities.contentsWithRegexpFromFolder(self.gwas_folder, self.gwas_regexp)
    # Fail before creating the output folder; previously the folder was
    # created even when the run was about to abort here.
    if len(names) == 0:
        raise Exceptions.ReportableException("No GWAS files found on %s with pattern %s" % (self.gwas_folder, self.gwas_regexp.pattern,))

    if not os.path.exists(self.output_folder):
        os.makedirs(self.output_folder)

    for name in names:
        try:
            self.buildBetas(weight_db_logic, name)
        except Exceptions.BadFilename as e:
            # Extra files inside the directory are expected; skip them.
            logging.info("Wrong file name: %s, skipping", e.msg)
def buildBetas(self, db_filename):
    """Run the beta-building and zscore pipeline for a single model db file.

    WARNING: mutates self.args in place (weight_db_path, covariance,
    output_file) before handing it to the sub-scripts, so results depend on
    call order — confirm before reusing self.args elsewhere.
    """
    # e.g. "foo.db" -> "foo"; used to derive the covariance/output paths.
    filebase = os.path.basename(db_filename).replace(".db", "")
    output_folder = self.args.output_folder
    logging.info("Processing betas for %s" % (db_filename))
    # Point the shared args at this model's db / covariance / output paths.
    self.args.weight_db_path = os.path.abspath(db_filename)
    self.args.covariance = os.path.join(self.args.covariance_directory, filebase) + ".cov.txt.gz"
    self.args.output_file = os.path.join(self.args.output_directory, filebase) + ".csv"
    logging.info("Loading weight model")
    weight_db_logic = WeightDBUtilities.WeightDBEntryLogic(self.args.weight_db_path)
    betaScript = M03_betas.GetBetas(self.args)
    names = Utilities.contentsWithRegexpFromFolder(self.args.gwas_folder, betaScript.gwas_regexp)
    if not os.path.exists(self.args.output_folder):
        os.makedirs(self.args.output_folder)
    # Betas for this particular model go into a per-model subfolder.
    betaScript.output_folder = os.path.join(output_folder, filebase)
    if not os.path.exists(betaScript.output_folder):
        os.makedirs(betaScript.output_folder)
    for name in names:
        try:
            betaScript.buildBetas(weight_db_logic,name)
        # This just means that there is some extra stuff inside that directory,
        # so I'm thinking we want to ignore it.
        except Exceptions.BadFilename as e:
            logging.info("Wrong file name: %s, skipping", e.msg)
            pass
    # ZScores: runs on the betas written above (folder_beta is threaded through).
    logging.info("Calculating ZScores for %s" % (filebase))
    zscoreScript = M04_zscores.CalculateZScores(self.args)
    zscoreScript.folder_beta = betaScript.output_folder
    zscoreScript.run()
def testSamplesInputPath(self):
    """samplesInputPath should locate the single .sample file in the folder."""
    expected = "tests/_td/dosage_set_1/set.sample"
    self.assertEqual(Utilities.samplesInputPath("tests/_td/dosage_set_1"), expected)
def testContentsWithRegexpFromFolder(self):
    """Only folder entries matching the regexp should be returned."""
    pattern = re.compile(".*sample")
    found = Utilities.contentsWithRegexpFromFolder("tests/_td/dosage_set_1", pattern)
    self.assertEqual(found, ["set.sample"])
def testNamesWithPatternFromFolders(self):
    """Names are returned with the matched suffix stripped."""
    found = Utilities.namesWithPatternFromFolder("tests/_td/dosage_set_1/", ".sample")
    self.assertEqual(found, ["set"])