def run(self):
    """Process every GWAS file found in the input folder into beta files.

    Loads an optional weight model, ensures the output folder exists, and
    calls self.buildBetas once per discovered GWAS file. Files whose names
    don't parse are skipped with a log message.

    Raises Exceptions.ReportableException when no GWAS file matches.
    """
    if self.args.weight_db_path:
        logging.info("Loading weight model")
        # NOTE(review): the guard reads self.args.weight_db_path but the load
        # uses self.weight_db_path — confirm both refer to the same value.
        weight_db_logic = WeightDBUtilities.WeightDBEntryLogic(self.weight_db_path)
    else:
        weight_db_logic = None

    names = Utilities.contentsWithRegexpFromFolder(self.gwas_folder, self.gwas_regexp)

    if not os.path.exists(self.output_folder):
        os.makedirs(self.output_folder)

    if len(names) == 0:
        raise Exceptions.ReportableException(
            "No GWAS files found on %s with pattern %s" % (self.gwas_folder, self.gwas_regexp.pattern,))

    for name in names:
        try:
            self.buildBetas(weight_db_logic, name)
        # This just means that there is some extra stuff inside that directory,
        # so we want to ignore it. (Removed a redundant `pass` after the log call.)
        except Exceptions.BadFilename as e:
            logging.info("Wrong file name: %s, skipping", e.msg)
def processPrediXcanFiles(self):
    """Filter PrediXcan-format dosage files down to selected people and snps.

    Loads the full and selected sample lists and the snp whitelist, then
    writes one filtered file per "dosage.txt.gz" input in the requested
    output format.

    Raises Exceptions.InvalidOutputFormat for an unknown output format.
    """
    logging.info("Loading people")
    all_people = Person.Person.loadPeople(self.samples_input, '\t', False)
    selected_people = Person.Person.loadPeople(self.samples_output)
    selected_people_by_id = {p.id: p for p in selected_people}
    logging.info("%d total people, %d selected", len(all_people), len(selected_people_by_id))

    logging.info("Loading snps")
    snp_data_set = DataSet.DataSetFileUtilities.loadFromCompressedFile(self.snp_list)
    snp_dict = {k: True for k in snp_data_set.data}
    # Was a bare Python-2 `print len(snp_dict.keys())` debug statement;
    # use logging for consistency with the rest of the file.
    logging.info("%d snps in whitelist", len(snp_dict))

    contents = Utilities.contentsWithPatternsFromFolder(self.dosage_folder, ["dosage.txt.gz"])
    for content_name in contents:
        input_path = os.path.join(self.dosage_folder, content_name)
        fileBuilder = PrediXcanFormatUtilities.PrediXcanFormatFilteredFilesProcess(
            input_path, self.output_folder, content_name,
            all_people, selected_people_by_id, snp_dict)
        # BUGFIX: these were two independent `if` statements, so an IMPUTE
        # request built its output and then fell into the second branch's
        # `else`, raising InvalidOutputFormat anyway. `elif` fixes the chain.
        if self.output_format == Formats.IMPUTE:
            fileBuilder.buildIMPUTE()
        elif self.output_format == Formats.PrediXcan:
            fileBuilder.buildPrediXcan()
        else:
            raise Exceptions.InvalidOutputFormat(self.output_format)
def load_samples(args):
    """Build a (FID, IID) sample table from whichever samples argument was given.

    Supports a plain two-column id file, a UK Biobank sample file, VCF/BGEN
    genotype headers, or synthesized ids. Raises Exceptions.InvalidArguments
    when no supported argument is present.
    """
    if args.text_sample_ids:
        if len(args.text_sample_ids) == 1:
            # Plain file: two unlabeled columns, family id then individual id.
            return pandas.read_table(args.text_sample_ids[0], header=None, names=["FID", "IID"])
        if args.text_sample_ids[1] == "UKB":
            # UK Biobank sample file: space-separated; drop rows flagged "D" in sex.
            ukb = pandas.read_table(args.text_sample_ids[0], sep=" ")
            ukb = ukb[ukb.sex != "D"].reset_index(drop=True)
            return ukb[["ID_1", "ID_2"]].rename(columns={"ID_1": "FID", "ID_2": "IID"})
    elif args.vcf_genotypes:
        from metax.genotype import CYVCF2Genotype
        return CYVCF2Genotype.get_samples(args.vcf_genotypes[0])
    elif args.bgen_genotypes:
        from metax.genotype import BGENGenotype
        return BGENGenotype.get_samples(args.bgen_genotypes[0])
    elif args.generate_sample_ids:
        # Synthesize ID_0..ID_{n-1}, used as both family and individual id.
        generated = ["ID_{}".format(i) for i in range(0, args.generate_sample_ids)]
        return pandas.DataFrame(data=[(g, g) for g in generated], columns=["FID", "IID"])
    raise Exceptions.InvalidArguments("Unsupported samples argument")
def readGWAS(args):
    """Read and concatenate every GWAS file matching the configured folder/pattern.

    Validates the arguments and the GWAS column format, then builds a betas
    frame per file and returns them as one concatenated pandas DataFrame.

    Raises Exceptions.ReportableException when no GWAS file matches.
    """
    start = timer()
    validate(args)
    regexp = re.compile(args.gwas_file_pattern) if args.gwas_file_pattern else None
    names = Utilities.contentsWithRegexpFromFolder(args.gwas_folder, regexp)
    names.sort()  # cosmetic, because different filesystems/OS yield folders in different order
    if len(names) == 0:
        msg = "No GWAS files found on %s with pattern %s" % (args.gwas_folder, args.gwas_file_pattern,)
        raise Exceptions.ReportableException(msg)

    # Was a Python-2 `print "INFO: ..."` statement (a syntax error under
    # Python 3) duplicating the log line; logging alone is kept.
    logging.info("Reading GWAS data")
    gwas_format = GWASUtilities.gwas_format_from_args(args)
    GWAS.validate_format_basic(gwas_format)
    GWAS.validate_format_for_strict(gwas_format)
    model = None  # model loading intentionally disabled here

    # Collect per-file frames and concatenate once; concatenating inside the
    # loop recopies the accumulated frame on every iteration (quadratic).
    frames = [build_betas(args, model, gwas_format, name) for name in names]
    r = pandas.concat(frames) if frames else pandas.DataFrame()

    end = timer()
    logging.info("Successfully parsed input gwas in %s seconds" % (str(end - start)))
    return r
def buildDosages(self):
    """Dispatch dosage processing according to the configured input format.

    Raises Exceptions.InvalidInputFormat for an unknown format.
    """
    if self.input_format == Formats.IMPUTE:
        self.processIMPUTEFiles()
        return
    if self.input_format == Formats.PrediXcan:
        self.processPrediXcanFiles()
        return
    raise Exceptions.InvalidInputFormat(self.input_format)
def run(args):
    """Process GWAS input into model-ready betas.

    With --output_folder set, writes one gzipped result file per input GWAS
    (skipping ones that already exist) and returns None; otherwise returns a
    single concatenated pandas DataFrame.

    Raises Exceptions.ReportableException when a folder was given but no
    file matches the pattern.
    """
    start = timer()
    validate(args)
    if args.gwas_folder:
        regexp = re.compile(args.gwas_file_pattern) if args.gwas_file_pattern else None
        names = Utilities.contentsWithRegexpFromFolder(args.gwas_folder, regexp)
        names.sort()  # cosmetic, because different filesystems/OS yield folders in different order
        if len(names) == 0:
            msg = "No GWAS files found on %s with pattern %s" % (args.gwas_folder, args.gwas_file_pattern,)
            raise Exceptions.ReportableException(msg)
    else:
        names = [args.gwas_file]

    gwas_format = GWASUtilities.gwas_format_from_args(args)
    GWAS.validate_format_basic(gwas_format)
    GWAS.validate_format_for_strict(gwas_format)
    model = PredictionModel.load_model(args.model_db_path, args.model_db_snp_key) if args.model_db_path else None

    if args.output_folder:
        if not os.path.exists(args.output_folder):
            os.makedirs(args.output_folder)

        for name in names:
            output_path = os.path.join(args.output_folder, name)
            if ".gz" not in output_path:  # idiomatic `not in` (was `not ... in ...`)
                output_path += ".gz"
            if os.path.exists(output_path):
                logging.info("%s already exists, delete it if you want it to be done again", output_path)
                continue
            b = build_betas(args, model, gwas_format, name)
            # NOTE(review): compression keys on the *input* name while ".gz"
            # was possibly appended to output_path above — a non-gz input
            # yields a ".gz"-named but uncompressed file. Confirm intent.
            c = "gzip" if ".gz" in name else None
            b.to_csv(output_path, sep="\t", index=False, compression=c)
        end = timer()
        logging.info("Successfully ran GWAS input processing in %s seconds" % (str(end - start)))
    else:
        # Concatenate once instead of growing the frame inside the loop.
        frames = [build_betas(args, model, gwas_format, name) for name in names]
        r = pandas.concat(frames) if frames else pandas.DataFrame()
        end = timer()
        logging.info("Successfully parsed input gwas in %s seconds" % (str(end - start)))
        return r
def _prepare_phenotype(context):
    """Load the phenotype column and, if configured, adjust for covariates.

    For logistic mode the phenotype must be binomial (0/1/NaN). When both a
    covariates file and covariate names are given, covariates are regressed
    out and the run continues in linear mode on the residuals.

    Raises Exceptions.InvalidArguments on a non-binomial or unparseable
    phenotype in logistic mode.
    """
    logging.info("Acquiring phenotype")
    context.pheno = _pheno_from_file_and_column(context.args.input_phenos_file, context.args.input_phenos_column)
    if context.args.mode == MTPMode.K_LOGISTIC:
        # BUGFIX: the original bare `except:` also caught the "not binomial"
        # InvalidArguments raised inside the try and replaced it with the
        # misleading "could not parse phenotype" message. Only a failed float
        # conversion should map to the parse error.
        try:
            v = set([str(float(x)) for x in context.pheno])
        except (TypeError, ValueError):
            raise Exceptions.InvalidArguments("Logistic regression: could not parse phenotype")
        if not v.issubset({'0.0', '1.0', 'nan'}):
            raise Exceptions.InvalidArguments("Logistic regression was asked but phenotype is not binomial")

    context.mode = context.args.mode
    if context.args.covariates_file and context.args.covariates:
        # Covariate adjustment forces linear mode on residuals.
        context.mode = MTPMode.K_LINEAR
        logging.info("Acquiring covariates")
        context.covariates = _get_covariates(context.args)
        logging.info("Replacing phenotype with residuals")
        context.pheno = _get_residual(context.pheno, context.covariates)
def dosage_generator(args, variant_mapping=None, weights=None):
    """Build a genotype-dosage line generator for the configured input kind.

    Supports text dosage, BGEN and VCF inputs; applies an optional liftover
    conversion and a variant whitelist derived from the mapping keys (when
    the mapping is a dict) or from the model weights' rsids.

    Raises Exceptions.InvalidArguments when no genotype input was given.
    """
    if args.liftover:
        logging.info("Acquiring liftover conversion")
        liftover_chain = pyliftover.LiftOver(args.liftover)
        liftover_conversion = lambda chr, pos: Genomics.lift(liftover_chain, chr, pos, args.zero_based_positions)
    else:
        liftover_chain = None
        liftover_conversion = None

    whitelist = None
    # isinstance (was `type(...) == dict`): the idiomatic check, and also
    # accepts dict subclasses.
    if variant_mapping and isinstance(variant_mapping, dict):
        logging.info("Setting whitelist from mapping keys")
        whitelist = set(variant_mapping.keys())
    else:
        logging.info("Setting whitelist from available models")
        whitelist = set(weights.rsid)

    d = None
    if args.text_genotypes:
        from metax.genotype import DosageGenotype
        d = DosageGenotype.dosage_files_geno_lines(
            args.text_genotypes, variant_mapping=variant_mapping, whitelist=whitelist,
            skip_palindromic=args.skip_palindromic, liftover_conversion=liftover_conversion)
    elif args.bgen_genotypes:
        from metax.genotype import BGENGenotype
        d = BGENGenotype.bgen_files_geno_lines(
            args.bgen_genotypes, variant_mapping=variant_mapping, force_colon=args.force_colon,
            use_rsid=args.bgen_use_rsid, whitelist=whitelist, skip_palindromic=args.skip_palindromic)
    elif args.vcf_genotypes:
        from metax.genotype import CYVCF2Genotype
        d = CYVCF2Genotype.vcf_files_geno_lines(
            args.vcf_genotypes, mode=args.vcf_mode, variant_mapping=variant_mapping,
            whitelist=whitelist, skip_palindromic=args.skip_palindromic,
            liftover_conversion=liftover_conversion)

    if d is None:
        raise Exceptions.InvalidArguments("unsupported genotype input")
    if args.force_mapped_metadata:
        d = Genotype.force_mapped_metadata(d, args.force_mapped_metadata)
    return d
def prepare_prediction(args, extra, samples):
    """Select and construct the prediction results repository.

    A second element "HDF5" in args.prediction_output selects the HDF5
    backend; a single-element spec selects the basic repository. Raises
    Exceptions.InvalidArguments for any other specification.
    """
    logging.info("Preparing prediction")
    output_spec = args.prediction_output
    if len(output_spec) < 2:
        from metax.predixcan.Utilities import BasicPredictionRepository
        return BasicPredictionRepository(samples, extra, output_spec[0])
    if output_spec[1] == "HDF5":
        from metax.predixcan.Utilities import HDF5PredictionRepository
        return HDF5PredictionRepository(samples, extra, output_spec[0])
    raise Exceptions.InvalidArguments("Unsupported output specification")
def expression_from_args(args, prediction_results = None):
    """Construct an Expression source from in-memory results or a file argument.

    Prefers in-memory prediction results, then an HDF5 expression file, then
    a plain-text expression file. Raises when nothing usable was provided.
    """
    if prediction_results:
        if not isinstance(prediction_results, BasicPredictionRepository):
            raise Exceptions.ReportableException("Invalid prediction results")
        logging.info("Preparing PrediXcan context from data")
        return Expression.ExpressionFromData(prediction_results.genes)
    if args.hdf5_expression_file:
        logging.info("Preparing PrediXcan HDF5 context")
        return HDF5Expression.Expression(args.hdf5_expression_file)
    if args.expression_file:
        logging.info("Preparing PrediXcan context")
        return PlainTextExpression.Expression(args.expression_file)
    raise RuntimeError("Could not build context from arguments")
def buildPeople(self):
    """Write the filtered samples file, unless it already exists.

    Dispatches on the input format; raises Exceptions.InvalidInputFormat for
    an unknown format.
    """
    if os.path.exists(self.samples_output):
        logging.info("%s already exists, delete it if you want it figured out again", self.samples_output)
        return

    if self.input_format == Formats.IMPUTE:
        Person.Person.buildFilteredSamples(self.samples_input, self.samples_output,
                                           self.population_group_filters, self.individual_filters)
    elif self.input_format == Formats.PrediXcan:
        Person.Person.buildFilteredSamples(self.samples_input, self.samples_output,
                                           group_filters=self.population_group_filters,
                                           individual_filters=self.individual_filters,
                                           row_delimiter="\t", skip_header=False)
    else:
        raise Exceptions.InvalidInputFormat(self.input_format)
def get_variant_mapping(args, weights):
    """Build a variant-id mapping from a mapping file or on-the-fly rules.

    Returns None when no mapping arguments were given, a keyed data source
    for a 3-element --variant_mapping, or a callable
    (chromosome, position, ref_allele, alt_allele) -> id for on-the-fly
    mapping. Raises on unsupported argument shapes.
    """
    mapping = None
    if len(args.variant_mapping):
        if len(args.variant_mapping) == 3:
            logging.info("Acquiring variant mapping")
            mapping = KeyedDataSource.load_data(args.variant_mapping[0], args.variant_mapping[1],
                                               args.variant_mapping[2],
                                               value_white_list=set(weights.rsid))
        else:
            raise Exceptions.InvalidArguments("Unsupported variant mapping argument")
    elif len(args.on_the_fly_mapping):
        # (Removed a redundant `len(...) > 0` re-check — the elif guarantees it —
        # and a stale block of commented-out loader variants.)
        checklist = set(weights.rsid)
        logging.info("Acquiring on-the-fly mapping")
        if args.on_the_fly_mapping[0] == "METADATA":
            # NOTE(review): `mapping` is always None on this branch (it is set
            # only in the mutually exclusive branch above), so the first form
            # is currently unreachable; kept defensively pending confirmation.
            if mapping:
                _mapping = mapping  # bind now: lambdas close over variables, not values
                mapping = lambda chromosome, position, ref_allele, alt_allele: Genomics.map_on_the_fly(
                    _mapping, args.on_the_fly_mapping[1], chromosome, position, ref_allele, alt_allele)
            else:
                mapping = lambda chromosome, position, ref_allele, alt_allele: Genomics.coordinate_format(
                    checklist, args.on_the_fly_mapping[1], chromosome, position, ref_allele, alt_allele)
        else:
            raise RuntimeError("Unsupported on_the_fly argument")
    return mapping
def processIMPUTEFiles(self):
    """Filter IMPUTE dosage files down to selected people and snps.

    For PrediXcan output, the chromosome is extracted from each file name via
    self.chromosome_in_name_regex. Raises Exceptions.InvalidOutputFormat for
    an unknown output format.
    """
    logging.info("Loading people")
    names = Utilities.hapNamesFromFolder(self.dosage_folder)
    all_people = Person.Person.loadPeople(self.samples_input)
    selected_people = Person.Person.loadPeople(self.samples_output, delim=" ")
    selected_people_by_id = {p.id: p for p in selected_people}

    logging.info("Loading snps")
    snp_data_set = DataSet.DataSetFileUtilities.loadFromCompressedFile(self.snp_list)
    snp_dict = {rsid: True for rsid in snp_data_set.data}

    for name in names:
        output = os.path.join(self.output_folder, name)
        # Renamed from `filter` and `chr`, which shadowed Python builtins.
        builder = ThousandGenomesUtilities.IMPUTEFilteredDosageFileBuilder()
        builder.base_path = self.dosage_folder
        builder.name = name
        builder.output_pattern = output
        builder.snp_dict = snp_dict
        builder.all_people = all_people
        builder.selected_people_by_id = selected_people_by_id

        if self.output_format == Formats.IMPUTE:
            builder.buildIMPUTE()
        elif self.output_format == Formats.PrediXcan:
            search = self.chromosome_in_name_regex.search(name)
            exitIf(search is None, Exceptions.InvalidInputFormat,
                   "No files found in '%s' that match the pattern, '%s'"
                   % (self.dosage_folder, self.chromosome_in_name_regex.pattern))
            builder.chromosome_name = search.group(1)
            builder.buildPrediXcan()
        else:
            raise Exceptions.InvalidOutputFormat(self.output_format)
def validate(args):
    """Require exactly one of --gwas_file and --gwas_folder."""
    file_given = bool(args.gwas_file)
    folder_given = bool(args.gwas_folder)
    # Equal truthiness means both were given, or neither — both are invalid.
    if file_given == folder_given:
        raise Exceptions.InvalidArguments(
            "Provide either (--gwas_file) or (--gwas_folder [--gwas_file_pattern])"
        )
def update(self, gene, dosage, weight):
    """Abstract hook: fold one gene's dosage/weight contribution into the repository.

    NOTE(review): raises the project's Exceptions.NotImplemented (not the
    builtin NotImplementedError) — presumably a deliberate project
    convention; confirm before changing.
    """
    raise Exceptions.NotImplemented("gene_repository is not implemented")
def store_prediction(self):
    """Abstract hook: persist the accumulated predictions.

    NOTE(review): raises the project's Exceptions.NotImplemented (not the
    builtin NotImplementedError) — presumably a deliberate project
    convention; confirm before changing.
    """
    raise Exceptions.NotImplemented("gene_repository is not implemented")
def summary(self):
    """Abstract hook: produce a summary of the repository's contents.

    NOTE(review): raises the project's Exceptions.NotImplemented (not the
    builtin NotImplementedError) — presumably a deliberate project
    convention; confirm before changing.
    """
    raise Exceptions.NotImplemented("gene_repository is not implemented")
def _check_args(args):
    """Reject any mode not declared in MTPMode.K_MODES."""
    if args.mode not in MTPMode.K_MODES:  # idiomatic `not in` (was `not x in y`)
        raise Exceptions.InvalidArguments("Invalid mode")
def validate(args):
    """Ensure a GWAS input folder was supplied."""
    if args.gwas_folder:
        return
    raise Exceptions.InvalidArguments(
        "You need to provide an input folder containing GWAS files")