def dosage_generator(args, variant_mapping=None, weights=None):
    """Build the genotype dosage generator selected by the command-line arguments.

    Exactly one genotype source is used, checked in this order:
    ``args.text_genotypes``, ``args.bgen_genotypes``, ``args.vcf_genotypes``.
    The matching reader module is imported lazily so that only the backend
    actually requested needs to be importable.

    Args:
        args: Parsed arguments. Attributes read here: ``liftover``,
            ``zero_based_positions``, ``text_genotypes``, ``bgen_genotypes``,
            ``vcf_genotypes``, ``skip_palindromic``, ``force_colon``,
            ``bgen_use_rsid``, ``vcf_mode`` and ``force_mapped_metadata``.
        variant_mapping: Optional mapping forwarded to the reader. When it is
            a non-empty dict its keys become the variant whitelist.
        weights: Optional object exposing an ``rsid`` iterable. Used as the
            whitelist when ``variant_mapping`` is not a non-empty dict.

    Returns:
        The object returned by the selected reader, passed through
        ``Genotype.force_mapped_metadata`` when ``args.force_mapped_metadata``
        is set.

    Raises:
        Exceptions.InvalidArguments: If no genotype source was specified (or
            the selected reader returned ``None``).
    """
    if args.liftover:
        logging.info("Acquiring liftover conversion")
        liftover_chain = pyliftover.LiftOver(args.liftover)

        # Parameter names are kept as in the original callable so that any
        # reader invoking it by keyword keeps working.
        def liftover_conversion(chr, pos):
            return Genomics.lift(
                liftover_chain, chr, pos, args.zero_based_positions)
    else:
        liftover_conversion = None

    whitelist = None
    if variant_mapping and isinstance(variant_mapping, dict):
        logging.info("Setting whitelist from mapping keys")
        whitelist = set(variant_mapping)
    elif weights is not None:
        logging.info("Setting whitelist from available models")
        whitelist = set(weights.rsid)
    else:
        # `weights` is optional in the signature; without it there is nothing
        # to restrict to, so the readers get whitelist=None (no filtering)
        # instead of failing on `None.rsid`.
        logging.info(
            "No mapping keys or models available, not restricting variants")

    d = None
    if args.text_genotypes:
        from metax.genotype import DosageGenotype
        d = DosageGenotype.dosage_files_geno_lines(
            args.text_genotypes,
            variant_mapping=variant_mapping,
            whitelist=whitelist,
            skip_palindromic=args.skip_palindromic,
            liftover_conversion=liftover_conversion)
    elif args.bgen_genotypes:
        from metax.genotype import BGENGenotype
        # NOTE(review): no liftover_conversion is passed to the BGEN reader,
        # so --liftover has no effect on this path; confirm this is intended.
        d = BGENGenotype.bgen_files_geno_lines(
            args.bgen_genotypes,
            variant_mapping=variant_mapping,
            force_colon=args.force_colon,
            use_rsid=args.bgen_use_rsid,
            whitelist=whitelist,
            skip_palindromic=args.skip_palindromic)
    elif args.vcf_genotypes:
        from metax.genotype import CYVCF2Genotype
        d = CYVCF2Genotype.vcf_files_geno_lines(
            args.vcf_genotypes,
            mode=args.vcf_mode,
            variant_mapping=variant_mapping,
            whitelist=whitelist,
            skip_palindromic=args.skip_palindromic,
            liftover_conversion=liftover_conversion)

    if d is None:
        raise Exceptions.InvalidArguments("unsupported genotype input")

    if args.force_mapped_metadata:
        d = Genotype.force_mapped_metadata(d, args.force_mapped_metadata)
    return d
def get_variant_mapping(args, weights):
    """Build the variant-id mapping requested on the command line.

    Two optional, combinable sources are supported:

    * ``args.variant_mapping``: exactly three items, passed in order to
      ``KeyedDataSource.load_data`` as its first three positional arguments,
      with values restricted to the ids in ``weights.rsid``. (Older,
      now-removed code selected the columns from a preset name such as
      ``UKB``, ``RSID`` or ``ID_TO_RSID``; the columns are now explicit.)
    * ``args.on_the_fly_mapping``: ``["METADATA", <format>]``. The result is
      a callable ``(chromosome, position, ref_allele, alt_allele)``. It
      delegates to ``Genomics.map_on_the_fly`` when a non-empty mapping was
      loaded, otherwise to ``Genomics.coordinate_format`` checked against the
      ids in ``weights.rsid``.

    Args:
        args: Parsed arguments exposing ``variant_mapping`` and
            ``on_the_fly_mapping`` sequences (empty when unused).
        weights: Object exposing an ``rsid`` iterable of model variant ids.

    Returns:
        ``None`` when no mapping was requested, the loaded mapping when only
        ``variant_mapping`` was given, or a callable when on-the-fly mapping
        is enabled.

    Raises:
        Exceptions.InvalidArguments: ``variant_mapping`` is non-empty but does
            not have exactly three items.
        RuntimeError: ``on_the_fly_mapping`` is non-empty and its first item
            is not ``"METADATA"``.
    """
    mapping = None

    if args.variant_mapping:
        if len(args.variant_mapping) != 3:
            raise Exceptions.InvalidArguments(
                "Unsupported variant mapping argument")
        logging.info("Acquiring variant mapping")
        mapping = KeyedDataSource.load_data(
            args.variant_mapping[0],
            args.variant_mapping[1],
            args.variant_mapping[2],
            value_white_list=set(weights.rsid))

    if not args.on_the_fly_mapping:
        return mapping

    logging.info("Acquiring on-the-fly mapping")
    if args.on_the_fly_mapping[0] != "METADATA":
        raise RuntimeError("Unsupported on_the_fly argument")

    if mapping:
        # `mapping` is never rebound below, so the closure can capture it
        # directly without an alias.
        def map_with_loaded_mapping(chromosome, position, ref_allele, alt_allele):
            return Genomics.map_on_the_fly(
                mapping, args.on_the_fly_mapping[1],
                chromosome, position, ref_allele, alt_allele)
        return map_with_loaded_mapping

    # Build the checklist here, in the branch that uses it. Previously it was
    # only bound when no variant_mapping was given, so a requested-but-empty
    # mapping produced a callable that raised NameError on first use.
    checklist = set(weights.rsid)

    def map_with_coordinate_format(chromosome, position, ref_allele, alt_allele):
        return Genomics.coordinate_format(
            checklist, args.on_the_fly_mapping[1],
            chromosome, position, ref_allele, alt_allele)
    return map_with_coordinate_format
def vcf_file_geno_lines(path, mode="genotyped", variant_mapping=None, whitelist=None,
                        skip_palindromic=False, liftover_conversion=None):
    """Yield one dosage record per usable variant allele of a VCF file.

    Each yielded tuple is
    ``(variant_id, chromosome, position, ref, alt, f, d_0, ..., d_n)`` where
    ``d_i`` are the per-sample dosages of ``alt`` and ``f`` is their mean
    divided by 2.

    Args:
        path: Path handed to ``VCF`` (cyvcf2 reader).
        mode: ``"genotyped"`` counts ALT alleles from the first two entries
            of each sample's genotype call, emitting one record per ALT
            allele. ``"imputed"`` reads the first value of the ``DS`` FORMAT
            field and only supports variants with exactly one ALT allele.
        variant_mapping: Optional dict or callable forwarded to
            ``Genomics.maybe_map_variant`` to translate variant ids.
        whitelist: Optional container of ids; when non-empty, mapped ids not
            in it are skipped.
        skip_palindromic: Skip alleles for which
            ``Genomics.is_palindromic(ref, alt)`` is true.
        liftover_conversion: Optional callable
            ``(chromosome, position) -> (chromosome, position)``; variants
            for which it returns ``"NA"`` in either slot are skipped.

    Raises:
        RuntimeError: ``mode`` is neither ``"genotyped"`` nor ``"imputed"``
            (raised when the first variant is processed).
    """
    logging.log(9, "Processing vcf %s", path)
    vcf_reader = VCF(path)
    is_dict_mapping = isinstance(variant_mapping, dict)

    for variant in vcf_reader:
        chrom = variant.CHROM
        pos = variant.POS
        # Keep the record's own id separate from the mapped id: a
        # multi-allelic record must map every ALT from the same source id.
        source_id = variant.ID
        ref = variant.REF
        alts = variant.ALT

        if liftover_conversion:
            chrom, pos = liftover_conversion(chrom, pos)
            # presumably "NA" marks a failed liftover; verify against Genomics.lift
            if chrom == "NA" or pos == "NA":
                continue

        if mode == "genotyped":
            for a, alt in enumerate(alts):
                if skip_palindromic and Genomics.is_palindromic(ref, alt):
                    continue

                _, variant_id = Genomics.maybe_map_variant(
                    source_id, chrom, pos, ref, alt,
                    variant_mapping, is_dict_mapping)
                if variant_id is None:
                    continue
                if whitelist and variant_id not in whitelist:
                    continue

                # Genotype calls index ALT alleles from 1 (0 is REF).
                allele = a + 1
                d = [(sample[0] == allele) + (sample[1] == allele)
                     for sample in variant.genotypes]
                f = numpy.mean(numpy.array(d, dtype=numpy.int32)) / 2
                yield (variant_id, chrom, pos, ref, alt, f) + tuple(d)

        elif mode == "imputed":
            if not alts:
                # No ALT allele: nothing to emit (genotyped mode likewise
                # yields nothing for such records).
                continue
            if len(alts) > 1:
                # Level 9 matches the other logging call in this function.
                logging.log(
                    9,
                    "VCF imputed mode doesn't support multiple ALTs, skipping %s",
                    source_id)
                continue

            alt = alts[0]
            if skip_palindromic and Genomics.is_palindromic(ref, alt):
                continue

            _, variant_id = Genomics.maybe_map_variant(
                source_id, chrom, pos, ref, alt,
                variant_mapping, is_dict_mapping)
            if variant_id is None:
                continue
            if whitelist and variant_id not in whitelist:
                continue

            # First DS value of every sample (column 0 of the 2-D array).
            d = variant.format("DS")[:, 0]
            f = numpy.mean(d) / 2
            yield (variant_id, chrom, pos, ref, alt, f) + tuple(d)

        else:
            raise RuntimeError("Unsupported vcf mode")