Esempio n. 1
0
def dosage_generator(args, variant_mapping=None, weights=None):
    """Build the genotype dosage generator selected by the parsed arguments.

    Args:
        args: Parsed command-line namespace. The attributes read here are
            ``liftover``, ``zero_based_positions``, ``text_genotypes``,
            ``bgen_genotypes``, ``vcf_genotypes``, ``skip_palindromic``,
            ``force_mapped_metadata`` and, depending on the input kind,
            ``force_colon``, ``bgen_use_rsid`` and ``vcf_mode``.
        variant_mapping: Optional mapping handed to the genotype reader.
            When it is a non-empty dict its keys become the whitelist.
        weights: Optional object exposing an ``rsid`` iterable; used as the
            whitelist source when ``variant_mapping`` is not a non-empty dict.

    Returns:
        Whatever the selected genotype reader returns, optionally wrapped by
        ``Genotype.force_mapped_metadata``.

    Raises:
        Exceptions.InvalidArguments: If none of the text, BGEN or VCF
            genotype arguments is set.
    """
    if args.liftover:
        logging.info("Acquiring liftover conversion")
        liftover_chain = pyliftover.LiftOver(args.liftover)
        liftover_conversion = lambda chr, pos: Genomics.lift(
            liftover_chain, chr, pos, args.zero_based_positions)
    else:
        liftover_conversion = None

    if variant_mapping and isinstance(variant_mapping, dict):
        logging.info("Setting whitelist from mapping keys")
        whitelist = set(variant_mapping.keys())
    elif weights is not None:
        logging.info("Setting whitelist from available models")
        whitelist = set(weights.rsid)
    else:
        # `weights` defaults to None; without models there is nothing to
        # build a whitelist from, so the readers run unfiltered.
        logging.info("No mapping keys or models available, whitelist disabled")
        whitelist = None

    d = None
    if args.text_genotypes:
        from metax.genotype import DosageGenotype
        d = DosageGenotype.dosage_files_geno_lines(
            args.text_genotypes,
            variant_mapping=variant_mapping,
            whitelist=whitelist,
            skip_palindromic=args.skip_palindromic,
            liftover_conversion=liftover_conversion)
    elif args.bgen_genotypes:
        from metax.genotype import BGENGenotype
        # NOTE: no liftover_conversion is passed to the BGEN reader here.
        d = BGENGenotype.bgen_files_geno_lines(
            args.bgen_genotypes,
            variant_mapping=variant_mapping,
            force_colon=args.force_colon,
            use_rsid=args.bgen_use_rsid,
            whitelist=whitelist,
            skip_palindromic=args.skip_palindromic)
    elif args.vcf_genotypes:
        from metax.genotype import CYVCF2Genotype
        d = CYVCF2Genotype.vcf_files_geno_lines(
            args.vcf_genotypes,
            mode=args.vcf_mode,
            variant_mapping=variant_mapping,
            whitelist=whitelist,
            skip_palindromic=args.skip_palindromic,
            liftover_conversion=liftover_conversion)

    if d is None:
        raise Exceptions.InvalidArguments("unsupported genotype input")
    if args.force_mapped_metadata:
        d = Genotype.force_mapped_metadata(d, args.force_mapped_metadata)
    return d
Esempio n. 2
0
def get_variant_mapping(args, weights):
    """Return the variant mapping requested by the parsed arguments.

    Args:
        args: Parsed command-line namespace. ``args.variant_mapping`` is
            either empty/None or a three-item sequence forwarded positionally
            to ``KeyedDataSource.load_data`` (presumably file path, key
            column and value column -- confirm against that loader).
            ``args.on_the_fly_mapping`` is either empty/None or a sequence
            whose first item must be ``"METADATA"`` and whose second item is
            forwarded to the ``Genomics`` mapping helpers.
        weights: Object exposing an ``rsid`` iterable; it restricts the
            loaded mapping and serves as the checklist for coordinate-based
            on-the-fly mapping.

    Returns:
        ``None`` when no mapping was requested, the loaded mapping when only
        ``variant_mapping`` was given, or a callable
        ``(chromosome, position, ref_allele, alt_allele)`` when on-the-fly
        mapping was requested.

    Raises:
        Exceptions.InvalidArguments: If ``args.variant_mapping`` is non-empty
            but does not have exactly three items.
        RuntimeError: If the on-the-fly mapping kind is not ``"METADATA"``.
    """
    mapping = None

    if args.variant_mapping:
        if len(args.variant_mapping) != 3:
            raise Exceptions.InvalidArguments(
                "Unsupported variant mapping argument")
        logging.info("Acquiring variant mapping")
        source, key_field, value_field = args.variant_mapping
        mapping = KeyedDataSource.load_data(source,
                                            key_field,
                                            value_field,
                                            value_white_list=set(
                                                weights.rsid))

    if not args.on_the_fly_mapping:
        return mapping

    logging.info("Acquiring on-the-fly mapping")
    if args.on_the_fly_mapping[0] != "METADATA":
        raise RuntimeError("Unsupported on_the_fly argument")

    if mapping:
        # Keep the loaded data under its own name: `mapping` is rebound to
        # the closure below, which must still see the original object.
        loaded_mapping = mapping

        def mapping(chromosome, position, ref_allele, alt_allele):
            return Genomics.map_on_the_fly(
                loaded_mapping, args.on_the_fly_mapping[1], chromosome,
                position, ref_allele, alt_allele)
    else:
        # Built here so it exists whenever this branch runs, including when
        # a mapping file was given but loaded as empty.
        checklist = set(weights.rsid)

        def mapping(chromosome, position, ref_allele, alt_allele):
            return Genomics.coordinate_format(
                checklist, args.on_the_fly_mapping[1], chromosome,
                position, ref_allele, alt_allele)

    return mapping
Esempio n. 3
0
def vcf_file_geno_lines(path,
                        mode="genotyped",
                        variant_mapping=None,
                        whitelist=None,
                        skip_palindromic=False,
                        liftover_conversion=None):
    """Yield one dosage row per accepted (record, ALT allele) of a VCF file.

    Each yielded tuple is
    ``(variant_id, chromosome, position, ref, alt, frequency, *dosages)``
    where ``frequency`` is the mean dosage divided by two.

    Args:
        path: Path of the VCF file, handed to ``VCF``.
        mode: ``"genotyped"`` derives dosages from the genotype calls, one
            row per ALT allele. ``"imputed"`` reads the first value of the
            ``DS`` FORMAT field per sample and skips multi-ALT records.
        variant_mapping: Optional mapping forwarded to
            ``Genomics.maybe_map_variant``; variants mapped to ``None`` are
            skipped.
        whitelist: Optional container of accepted variant ids. When it is
            non-empty, variants whose mapped id is not in it are skipped.
        skip_palindromic: Skip REF/ALT pairs for which
            ``Genomics.is_palindromic`` is true.
        liftover_conversion: Optional callable ``(chr, pos) -> (chr, pos)``;
            records for which it returns ``"NA"`` for either value are
            skipped.

    Raises:
        RuntimeError: If ``mode`` is neither ``"genotyped"`` nor
            ``"imputed"``.
    """
    # Validate before touching the file so a bad mode is reported even when
    # the VCF turns out to contain no records.
    if mode not in ("genotyped", "imputed"):
        raise RuntimeError("Unsupported vcf mode")

    logging.log(9, "Processing vcf %s", path)
    vcf_reader = VCF(path)

    is_dict_mapping = isinstance(variant_mapping, dict)

    for variant in vcf_reader:
        chrom = variant.CHROM
        pos = variant.POS
        raw_id = variant.ID
        ref = variant.REF
        alts = variant.ALT

        if liftover_conversion:
            chrom, pos = liftover_conversion(chrom, pos)
            if chrom == "NA" or pos == "NA":
                continue

        if mode == "genotyped":
            # Genotype calls reference ALT alleles by 1-based index.
            for allele_index, alt in enumerate(alts, start=1):
                if skip_palindromic and Genomics.is_palindromic(ref, alt):
                    continue

                # Always map from the record's own id: reusing the previous
                # ALT's mapped id would corrupt multi-allelic records.
                _, variant_id = Genomics.maybe_map_variant(
                    raw_id, chrom, pos, ref, alt, variant_mapping,
                    is_dict_mapping)
                if variant_id is None:
                    continue

                if whitelist and variant_id not in whitelist:
                    continue

                # Counts how many of the first two entries of each call match
                # this ALT (presumably diploid calls -- TODO confirm for
                # haploid records).
                dosages = [(call[0] == allele_index) +
                           (call[1] == allele_index)
                           for call in variant.genotypes]
                frequency = numpy.mean(
                    numpy.array(dosages, dtype=numpy.int32)) / 2
                yield (variant_id, chrom, pos, ref, alt,
                       frequency) + tuple(dosages)

        else:  # mode == "imputed"
            if not alts:
                # No ALT allele: nothing to report, same as genotyped mode.
                continue
            if len(alts) > 1:
                logging.warning(
                    "VCF imputed mode doesn't support multiple ALTs, skipping %s",
                    raw_id)
                continue

            alt = alts[0]
            if skip_palindromic and Genomics.is_palindromic(ref, alt):
                continue

            _, variant_id = Genomics.maybe_map_variant(
                raw_id, chrom, pos, ref, alt, variant_mapping,
                is_dict_mapping)
            if variant_id is None:
                continue

            if whitelist and variant_id not in whitelist:
                continue

            # First DS value of every sample (column 0 of the FORMAT array).
            dosages = variant.format("DS")[:, 0]
            frequency = numpy.mean(numpy.array(dosages)) / 2
            yield (variant_id, chrom, pos, ref, alt,
                   frequency) + tuple(dosages)