def load_samples(args):
    """Return the sample identifiers for this run.

    The first populated option on ``args`` decides where samples come from:

    * ``args.text_sample_ids`` -- with a single entry, the file is read as a
      header-less table whose two columns are named ``FID`` and ``IID``.
      With a second entry equal to ``"UKB"``, the file is read as
      space-separated, rows whose ``sex`` column equals ``"D"`` are dropped,
      and columns ``ID_1``/``ID_2`` are returned renamed to ``FID``/``IID``.
      Any other second entry yields no samples (and therefore the error
      below); later options are NOT consulted in that case.
    * ``args.vcf_genotypes`` -- samples are taken from the first VCF via
      ``CYVCF2Genotype.get_samples``.
    * ``args.bgen_genotypes`` -- samples are taken from the first BGEN file
      via ``BGENGenotype.get_samples``.
    * ``args.generate_sample_ids`` -- that many synthetic ids
      (``ID_0``, ``ID_1``, ...) are generated and used for both columns.

    Raises:
        Exceptions.InvalidArguments: if no option produced samples.
    """
    samples = None
    text_ids = args.text_sample_ids

    if text_ids:
        path = text_ids[0]
        if len(text_ids) == 1:
            samples = pandas.read_table(path, header=None, names=["FID", "IID"])
        elif text_ids[1] == "UKB":
            raw = pandas.read_table(path, sep=" ")
            # Drop rows flagged "D" in the sex column, then renumber the index.
            kept = raw[raw.sex != "D"].reset_index(drop=True)
            id_columns = kept[["ID_1", "ID_2"]]
            samples = id_columns.rename(columns={"ID_1": "FID", "ID_2": "IID"})
    elif args.vcf_genotypes:
        # Imported lazily so the reader is only required when actually used.
        from metax.genotype import CYVCF2Genotype
        samples = CYVCF2Genotype.get_samples(args.vcf_genotypes[0])
    elif args.bgen_genotypes:
        from metax.genotype import BGENGenotype
        samples = BGENGenotype.get_samples(args.bgen_genotypes[0])
    elif args.generate_sample_ids:
        generated = ["ID_{}".format(i) for i in range(0, args.generate_sample_ids)]
        rows = [(name, name) for name in generated]
        samples = pandas.DataFrame(data=rows, columns=["FID", "IID"])

    if samples is None:
        raise Exceptions.InvalidArguments("Unsupported samples argument")
    return samples
def _prepare_phenotype(context):
    """Load the phenotype into ``context`` and settle the regression mode.

    Steps, in order:

    1. ``context.pheno`` is loaded from ``args.input_phenos_file`` /
       ``args.input_phenos_column``.
    2. If logistic mode was requested, every phenotype value must convert
       to a float and be one of 0, 1 or NaN.
    3. ``context.mode`` is set to the requested mode.
    4. If both ``args.covariates_file`` and ``args.covariates`` are set, the
       mode is forced to ``MTPMode.K_LINEAR``, ``context.covariates`` is
       loaded, and ``context.pheno`` is replaced by the output of
       ``_get_residual(pheno, covariates)``.

    Raises:
        Exceptions.InvalidArguments: in logistic mode, when a phenotype value
            cannot be converted to a float, or when the values are not
            restricted to 0/1/NaN.
    """
    args = context.args

    logging.info("Accquiring phenotype")
    context.pheno = _pheno_from_file_and_column(args.input_phenos_file, args.input_phenos_column)

    if args.mode == MTPMode.K_LOGISTIC:
        # Only the conversion is guarded. Previously a bare ``except`` also
        # wrapped the "not binomial" raise below, so that specific message
        # was always replaced by the generic parse error (and
        # KeyboardInterrupt/SystemExit were swallowed as well).
        try:
            observed = {str(float(x)) for x in context.pheno}
        except (TypeError, ValueError) as error:
            raise Exceptions.InvalidArguments("Logistic regression: could not parse phenotype") from error
        if not observed.issubset({'0.0', '1.0', 'nan'}):
            raise Exceptions.InvalidArguments("Logistic regression was asked but phenotype is not binomial")

    context.mode = args.mode
    if args.covariates_file and args.covariates:
        # With covariates the phenotype becomes a residual, so the analysis
        # is run in linear mode regardless of what was requested.
        context.mode = MTPMode.K_LINEAR
        logging.info("Acquiring covariates")
        context.covariates = _get_covariates(args)
        logging.info("Replacing phenotype with residuals")
        context.pheno = _get_residual(context.pheno, context.covariates)
def dosage_generator(args, variant_mapping=None, weights=None):
    """Create the genotype/dosage source selected by the command-line options.

    Args:
        args: parsed options. Read here: ``liftover``,
            ``zero_based_positions``, ``text_genotypes``, ``bgen_genotypes``,
            ``vcf_genotypes``, ``vcf_mode``, ``skip_palindromic``,
            ``force_colon``, ``bgen_use_rsid``, ``force_mapped_metadata``.
        variant_mapping: optional mapping handed through to the reader. When
            it is a non-empty plain ``dict`` its keys become the whitelist.
        weights: object exposing ``rsid``; used for the whitelist whenever
            ``variant_mapping`` is not a non-empty plain ``dict``.

    Returns:
        Whatever the selected reader's ``*_geno_lines`` function returns,
        passed through ``Genotype.force_mapped_metadata`` when
        ``args.force_mapped_metadata`` is set.

    Raises:
        Exceptions.InvalidArguments: if no supported genotype input is given.
    """
    liftover_conversion = None
    if args.liftover:
        logging.info("Acquiring liftover conversion")
        chain = pyliftover.LiftOver(args.liftover)

        # Parameter names are kept as in the original lambda in case a
        # reader invokes the converter by keyword.
        def liftover_conversion(chr, pos):
            return Genomics.lift(chain, chr, pos, args.zero_based_positions)

    # Exact-type check: dict subclasses and callables take the model branch.
    mapping_is_plain_dict = bool(variant_mapping) and type(variant_mapping) is dict
    if mapping_is_plain_dict:
        logging.info("Setting whitelist from mapping keys")
        whitelist = set(variant_mapping.keys())
    else:
        logging.info("Setting whitelist from available models")
        whitelist = set(weights.rsid)

    dosages = None
    if args.text_genotypes:
        from metax.genotype import DosageGenotype
        dosages = DosageGenotype.dosage_files_geno_lines(
            args.text_genotypes,
            variant_mapping=variant_mapping,
            whitelist=whitelist,
            skip_palindromic=args.skip_palindromic,
            liftover_conversion=liftover_conversion,
        )
    elif args.bgen_genotypes:
        from metax.genotype import BGENGenotype
        dosages = BGENGenotype.bgen_files_geno_lines(
            args.bgen_genotypes,
            variant_mapping=variant_mapping,
            force_colon=args.force_colon,
            use_rsid=args.bgen_use_rsid,
            whitelist=whitelist,
            skip_palindromic=args.skip_palindromic,
        )
    elif args.vcf_genotypes:
        from metax.genotype import CYVCF2Genotype
        dosages = CYVCF2Genotype.vcf_files_geno_lines(
            args.vcf_genotypes,
            mode=args.vcf_mode,
            variant_mapping=variant_mapping,
            whitelist=whitelist,
            skip_palindromic=args.skip_palindromic,
            liftover_conversion=liftover_conversion,
        )

    if dosages is None:
        raise Exceptions.InvalidArguments("unsupported genotype input")

    if not args.force_mapped_metadata:
        return dosages
    return Genotype.force_mapped_metadata(dosages, args.force_mapped_metadata)
def prepare_prediction(args, extra, samples):
    """Instantiate the repository that will receive prediction results.

    ``args.prediction_output`` is a sequence: its first item is passed to the
    repository constructor as the third argument, and an optional second item
    selects the repository type. With fewer than two items a
    ``BasicPredictionRepository`` is created; with ``"HDF5"`` as second item
    an ``HDF5PredictionRepository`` is created.

    Raises:
        Exceptions.InvalidArguments: if a second item is given and it is not
            ``"HDF5"``.
    """
    logging.info("Preparing prediction")
    output_spec = args.prediction_output

    if len(output_spec) < 2:
        # Imported lazily, next to the only place that needs it.
        from metax.predixcan.Utilities import BasicPredictionRepository
        return BasicPredictionRepository(samples, extra, output_spec[0])

    if output_spec[1] != "HDF5":
        raise Exceptions.InvalidArguments(
            "Unsupported output specification")

    from metax.predixcan.Utilities import HDF5PredictionRepository
    return HDF5PredictionRepository(samples, extra, output_spec[0])
def get_variant_mapping(args, weights):
    """Build the variant mapping requested on the command line.

    Args:
        args: parsed options; ``variant_mapping`` and ``on_the_fly_mapping``
            are read (each a possibly empty sequence, or ``None``).
        weights: object exposing ``rsid``; those ids restrict the loaded
            mapping and serve as the checklist for coordinate formatting.

    Returns:
        * ``None`` when neither option is given;
        * the object returned by ``KeyedDataSource.load_data`` when only
          ``variant_mapping`` is given;
        * a callable ``(chromosome, position, ref_allele, alt_allele)`` when
          ``on_the_fly_mapping`` is given. It delegates to
          ``Genomics.map_on_the_fly`` if a non-empty mapping was loaded,
          otherwise to ``Genomics.coordinate_format``.

    Raises:
        Exceptions.InvalidArguments: ``variant_mapping`` is given but does not
            have exactly three items.
        RuntimeError: ``on_the_fly_mapping`` is given and its first item is
            not ``"METADATA"``.
    """
    mapping = None

    variant_mapping = args.variant_mapping
    if variant_mapping:
        if len(variant_mapping) != 3:
            raise Exceptions.InvalidArguments(
                "Unsupported variant mapping argument")
        logging.info("Acquiring variant mapping")
        # NOTE: an earlier, since commented-out, version of this branch picked
        # the two column names from a shorthand in the second item
        # ("UKB", "RSID", "ID_TO_RSID"); they are now passed through verbatim.
        mapping = KeyedDataSource.load_data(
            variant_mapping[0], variant_mapping[1], variant_mapping[2],
            value_white_list=set(weights.rsid))

    if args.on_the_fly_mapping:
        logging.info("Acquiring on-the-fly mapping")
        if args.on_the_fly_mapping[0] != "METADATA":
            raise RuntimeError("Unsupported on_the_fly argument")

        if mapping:
            # Bind the loaded mapping to its own name: ``mapping`` itself is
            # about to be rebound to the wrapper below.
            loaded_mapping = mapping

            def mapping(chromosome, position, ref_allele, alt_allele):
                return Genomics.map_on_the_fly(
                    loaded_mapping, args.on_the_fly_mapping[1],
                    chromosome, position, ref_allele, alt_allele)
        else:
            # Built here, where it is used. It used to be assigned only when
            # no variant mapping was supplied, so a supplied-but-empty mapping
            # left the closure referring to an unbound name (NameError on the
            # first call).
            checklist = set(weights.rsid)

            def mapping(chromosome, position, ref_allele, alt_allele):
                return Genomics.coordinate_format(
                    checklist, args.on_the_fly_mapping[1],
                    chromosome, position, ref_allele, alt_allele)

    return mapping
def validate(args):
    """Require exactly one of ``args.gwas_file`` and ``args.gwas_folder``.

    Raises:
        Exceptions.InvalidArguments: if both options are set, or neither is.
    """
    file_given = bool(args.gwas_file)
    folder_given = bool(args.gwas_folder)
    # Equal flags mean "both" or "neither"; only a mismatch is acceptable.
    if file_given == folder_given:
        raise Exceptions.InvalidArguments(
            "Provide either (--gwas_file) or (--gwas_folder [--gwas_file_pattern])"
        )
def validate(args):
    """Require ``args.gwas_folder`` to be set.

    Raises:
        Exceptions.InvalidArguments: if ``args.gwas_folder`` is empty or unset.
    """
    if args.gwas_folder:
        return
    raise Exceptions.InvalidArguments(
        "You need to provide an input folder containing GWAS files")
def _check_args(args):
    """Reject a mode that is not listed in ``MTPMode.K_MODES``.

    Raises:
        Exceptions.InvalidArguments: if ``args.mode`` is not a known mode.
    """
    if args.mode in MTPMode.K_MODES:
        return
    raise Exceptions.InvalidArguments("Invalid mode")