Beispiel #1
0
def generate_matches(refs, sample, db, args):
    """Yield ``(superlocus, matches)`` pairs for every chromosome.

    Loci from ``sample`` and ``db`` are read chromosome by chromosome via
    ``records_by_chromosome`` and grouped with ``union`` using each locus'
    ``min_start``/``max_stop`` attributes as its interval.  For every group the
    sample loci (``superlocus``) and database loci (``alleles``) are sorted in
    place by ``NormalizedLocus.natural_order_key``; the sorted superlocus is
    then yielded together with the result of ``generate_superlocus_matches``.
    """
    extreme_key = NormalizedLocus.extreme_order_key
    natural_key = NormalizedLocus.natural_order_key
    locus_bounds = attrgetter('min_start', 'max_stop')

    by_chromosome = records_by_chromosome(
        refs, [sample, db], [args.name, None], args)

    for chrom, ref, streams in by_chromosome:
        # Each stream is only approximately ordered; fix that up before merging.
        ordered_streams = [
            sort_almost_sorted(stream, key=extreme_key) for stream in streams
        ]

        # Merge overlapping loci from both streams into superloci.
        for _, _, (superlocus, alleles) in union(ordered_streams,
                                                 interval_func=locus_bounds):
            alleles.sort(key=natural_key)
            superlocus.sort(key=natural_key)

            matches = generate_superlocus_matches(
                chrom, superlocus, ref, alleles, args.mode, args.debug)
            yield superlocus, matches
Beispiel #2
0
def region_filter_exclude(records, exclude):
    """Remove records that overlap those provided.

    Yields the records of every group produced by ``union([records, exclude])``
    that contains no entries from ``exclude``.
    """
    for _, _, (rec, exc) in union([records, exclude]):
        if not exc:
            yield from rec
Beispiel #3
0
def region_filter_include(records, include):
    """Remove records that do not overlap those provided.

    Yields the records of every group produced by ``union([records, include])``
    that contains at least one entry from ``include``.
    """
    for _, _, (rec, inc) in union([records, include]):
        if inc:
            yield from rec
Beispiel #4
0
def region_filter_exclude(records, exclude):
    """Yield records whose ``union`` group contains nothing from ``exclude``."""
    grouped = union([records, exclude])
    for _start, _stop, (kept, excluded) in grouped:
        if excluded:
            # At least one exclusion entry landed in this group: drop it.
            continue
        yield from kept
Beispiel #5
0
def _register_match_headers(source_meta, target_meta):
    """Add ``_FOUND``/``_NOTFOUND``/``_NOCALL`` headers for new database fields.

    For every entry of ``source_meta`` whose key is not yet present in
    ``target_meta``, three derived entries (one per suffix) are added to
    ``target_meta`` with ``number='.'`` and the source entry's type.

    Returns the list of source field names for which headers were added.
    """
    variants = (
        ('_FOUND', 'Allele(s) found: '),
        ('_NOTFOUND', 'Allele(s) not found: '),
        # The misspelling "presense" is kept on purpose so that the emitted
        # header text stays unchanged.
        ('_NOCALL', 'Allele(s) with uncertain presense: '),
    )

    registered = []
    for key, meta in source_meta.items():
        if key in target_meta:
            continue
        registered.append(meta.name)
        for suffix, prefix in variants:
            target_meta.add(meta.name + suffix,
                            number='.',
                            type=meta.type,
                            description=prefix + meta.description)
    return registered


def _describe_genotype(locus):
    """Return the debug text for a locus genotype (``nocall``/``refcall``/alleles)."""
    indices = locus.allele_indices
    if indices.count(None) == len(indices):
        return 'nocall'
    if indices.count(0) == len(indices):
        return 'refcall'
    sep = '|' if locus.phased else '/'
    return sep.join(locus.alleles[a] or '-' if a is not None else '.'
                    for a in indices)


def _print_allele_debug(chrom, allele, super_non_ref):
    """Write a description of the allele and its candidate loci to stderr."""
    super_start, super_stop = get_superlocus_bounds([[allele], super_non_ref])
    print('-' * 80, file=sys.stderr)
    print('{}:[{:d}-{:d}):'.format(chrom, super_start, super_stop),
          file=sys.stderr)
    print(file=sys.stderr)

    print('  ALLELE: {} {}:[{}-{}) ref={} alt={}'.format(
        allele.record.id, allele.contig, allele.start, allele.stop,
        allele.alleles[0] or '-', allele.alleles[1] or '-'),
          file=sys.stderr)
    print(file=sys.stderr)

    for i, locus in enumerate(super_non_ref, 1):
        lref = locus.alleles[0] or '-'
        geno = _describe_genotype(locus)
        print('  VAR{:d}: {}[{:5d}-{:5d}) ref={} geno={}'.format(
            i, locus.contig, locus.start, locus.stop, lref, geno),
              file=sys.stderr)


def _annotate_allele_match(chrom, ref, allele, superlocus, info_meta,
                           format_meta, debug):
    """Search ``superlocus`` for ``allele`` and annotate the intersecting loci."""
    super_allele = [
        locus for locus in superlocus if locus.extremes_intersect(allele)
    ]

    # Remove all reference calls from the superlocus.
    # This is primarily done to remove long leading and trailing reference regions.
    # Interstitial reference regions will be added back, based on how gaps are handled.
    super_non_ref = [locus for locus in super_allele if not locus.is_ref()]

    if debug:
        _print_allele_debug(chrom, allele, super_non_ref)

    # Search superlocus for allele
    match_zygosity = find_allele(ref, allele, super_non_ref, debug=debug)

    if debug:
        print(file=sys.stderr)
        print('    MATCH={}'.format(match_zygosity), file=sys.stderr)
        print(file=sys.stderr)

    # Annotate results of search
    if match_zygosity is None:
        suffix = '_NOCALL'
    elif match_zygosity == 0:
        suffix = '_NOTFOUND'
    else:
        suffix = '_FOUND'

    # Number of times to repeat the copied metadata
    times = match_zygosity if suffix == '_FOUND' else 1

    for locus in super_allele:
        annotate_info(locus, allele, info_meta, suffix, times)
        annotate_format(locus, allele, format_meta, suffix, times)


def match_database(args):
    """Annotate a sample VCF with the match status of database alleles.

    Reads the reference (``args.reference``), the database (``args.database``)
    and the sample (``args.sample``).  Database FORMAT/INFO fields missing from
    the sample header are registered with ``_FOUND``/``_NOTFOUND``/``_NOCALL``
    suffixes, every database allele is searched for in the overlapping sample
    loci, and the annotated sample records are written to ``args.output``.

    All input handles are closed when the function returns or raises.
    """
    from contextlib import closing

    with closing(Fastafile(expanduser(args.reference))) as refs, \
            closing(VariantFile(args.database)) as db, \
            closing(VariantFile(args.sample)) as sample:
        format_meta = _register_match_headers(db.header.formats,
                                              sample.header.formats)
        info_meta = _register_match_headers(db.header.info,
                                            sample.header.info)

        with VariantFile(args.output, 'w', header=sample.header) as out:
            # Create parallel locus iterator by chromosome
            for chrom, ref, loci in records_by_chromosome(
                    refs, [sample, db], [args.name, None], args):
                # Create superloci by taking the union of overlapping loci
                # across all of the locus streams
                loci = [
                    sort_almost_sorted(l,
                                       key=NormalizedLocus.extreme_order_key)
                    for l in loci
                ]
                superloci = union(loci,
                                  interval_func=attrgetter('min_start',
                                                           'max_stop'))

                # Proceed by superlocus
                for _, _, (superlocus, alleles) in superloci:
                    alleles.sort(key=NormalizedLocus.natural_order_key)
                    superlocus.sort(key=NormalizedLocus.natural_order_key)

                    for allele in alleles:
                        _annotate_allele_match(chrom, ref, allele, superlocus,
                                               info_meta, format_meta,
                                               args.debug)

                    # Emit the sample records in record order once every
                    # database allele of this superlocus has been processed.
                    for locus in sorted(superlocus,
                                        key=NormalizedLocus.record_order_key):
                        out.write(locus.record)
Beispiel #6
0
def region_filter_include(records, include):
    """Yield records whose ``union`` group contains something from ``include``."""
    grouped = union([records, include])
    for _start, _stop, (kept, included) in grouped:
        if not included:
            # No inclusion entry landed in this group: drop it.
            continue
        yield from kept
Beispiel #7
0
def match_database(args):
    """Annotate a sample VCF with the match status of database alleles.

    Database FORMAT/INFO fields that the sample header lacks are registered
    with ``_FOUND``/``_NOTFOUND``/``_NOCALL`` suffixes.  Each database allele
    is then searched for among the sample loci it intersects, the result is
    copied onto those loci, and the sample records are written to
    ``args.output``.  With ``args.debug`` set, a trace is printed to stderr.

    The reference and both input variant files are closed on exit, including
    when an error is raised.
    """
    from contextlib import closing

    # Suffix -> description prefix for the derived header entries.  The
    # misspelling "presense" is kept so the emitted header text is unchanged.
    suffix_descriptions = (
        ('_FOUND', 'Allele(s) found: '),
        ('_NOTFOUND', 'Allele(s) not found: '),
        ('_NOCALL', 'Allele(s) with uncertain presense: '),
    )

    # Load FASTA reference and open input variant files
    with closing(Fastafile(expanduser(args.reference))) as refs, \
            closing(VariantFile(args.database)) as db, \
            closing(VariantFile(args.sample)) as sample:

        format_meta = []
        for fmt, meta in db.header.formats.items():
            if fmt not in sample.header.formats:
                format_meta.append(meta.name)
                for suffix, text in suffix_descriptions:
                    sample.header.formats.add(meta.name + suffix, number='.', type=meta.type,
                                              description=text + meta.description)

        info_meta = []
        for info, meta in db.header.info.items():
            if info not in sample.header.info:
                info_meta.append(meta.name)
                for suffix, text in suffix_descriptions:
                    sample.header.info.add(meta.name + suffix, number='.', type=meta.type,
                                           description=text + meta.description)

        with VariantFile(args.output, 'w', header=sample.header) as out:
            # Create parallel locus iterator by chromosome
            for chrom, ref, loci in records_by_chromosome(refs, [sample, db], [args.name, None], args):
                # Create superloci by taking the union of overlapping loci across all of the locus streams
                loci = [sort_almost_sorted(l, key=NormalizedLocus.extreme_order_key) for l in loci]
                superloci = union(loci, interval_func=attrgetter('min_start', 'max_stop'))

                # Proceed by superlocus
                for _, _, (superlocus, alleles) in superloci:
                    alleles.sort(key=NormalizedLocus.natural_order_key)
                    superlocus.sort(key=NormalizedLocus.natural_order_key)

                    for allele in alleles:
                        super_allele = [locus for locus in superlocus if locus.extremes_intersect(allele)]

                        # Remove all reference calls from the superlocus.
                        # This is primarily done to remove long leading and trailing reference regions.
                        # Interstitial reference regions will be added back, based on how gaps are handled.
                        super_non_ref = [locus for locus in super_allele if not locus.is_ref()]

                        if args.debug:
                            super_start, super_stop = get_superlocus_bounds([[allele], super_non_ref])
                            print('-'*80, file=sys.stderr)
                            print('{}:[{:d}-{:d}):'.format(chrom, super_start, super_stop), file=sys.stderr)
                            print(file=sys.stderr)

                            print('  ALLELE: {} {}:[{}-{}) ref={} alt={}'.format(
                                allele.record.id, allele.contig, allele.start, allele.stop,
                                allele.alleles[0] or '-', allele.alleles[1] or '-'), file=sys.stderr)
                            print(file=sys.stderr)

                            for i, locus in enumerate(super_non_ref, 1):
                                lref = locus.alleles[0] or '-'
                                indices = locus.allele_indices
                                if indices.count(None) == len(indices):
                                    geno = 'nocall'
                                elif indices.count(0) == len(indices):
                                    geno = 'refcall'
                                else:
                                    sep = '|' if locus.phased else '/'
                                    geno = sep.join(locus.alleles[a] or '-' if a is not None else '.'
                                                    for a in indices)
                                print('  VAR{:d}: {}[{:5d}-{:5d}) ref={} geno={}'.format(
                                    i, locus.contig, locus.start, locus.stop, lref, geno), file=sys.stderr)

                        # Search superlocus for allele
                        match_zygosity = find_allele(ref, allele, super_non_ref, debug=args.debug)

                        if args.debug:
                            print(file=sys.stderr)
                            print('    MATCH={}'.format(match_zygosity), file=sys.stderr)
                            print(file=sys.stderr)

                        # Annotate results of search
                        if match_zygosity is None:
                            suffix = '_NOCALL'
                        elif match_zygosity == 0:
                            suffix = '_NOTFOUND'
                        else:
                            suffix = '_FOUND'

                        # Number of times to repeat the copied metadata
                        times = match_zygosity if suffix == '_FOUND' else 1

                        for locus in super_allele:
                            annotate_info(locus, allele, info_meta, suffix, times)
                            annotate_format(locus, allele, format_meta, suffix, times)

                    # Write the sample records of this superlocus in record order.
                    for locus in sorted(superlocus, key=NormalizedLocus.record_order_key):
                        out.write(locus.record)
Beispiel #8
0
def match_replicates(args):
    """Match a genome against another presumably identical genome (i.e. replicates).

    Loci from ``args.vcf1`` and ``args.vcf2`` are grouped into superloci per
    chromosome, each pair of superloci is compared with ``superlocus_equal``
    and the decision is printed to stdout and passed to ``write_match`` for
    each output returned by ``make_outputs``.

    Every opened handle (reference, inputs and outputs) is closed on exit,
    including when an error is raised part-way through.
    """
    match_status_map = {True: '=', False: 'X', None: '.'}

    refs = Fastafile(expanduser(args.reference))
    in_vars = []
    out_vars = []
    try:
        # Open inputs one at a time so an already-open handle is still
        # tracked (and closed) if a later open fails.
        for var in (args.vcf1, args.vcf2):
            in_vars.append(VariantFile(var))
        out_vars = make_outputs(in_vars, args.out1, args.out2)

        # Create parallel locus iterator by chromosome
        for chrom, ref, loci in records_by_chromosome(refs, in_vars,
                                                      [args.name1, args.name2],
                                                      args):
            # Create superloci by taking the union of overlapping loci across all of the locus streams
            loci = [
                sort_almost_sorted(l, key=NormalizedLocus.extreme_order_key)
                for l in loci
            ]
            superloci = union(loci,
                              interval_func=attrgetter('min_start', 'max_stop'))

            # Proceed by superlocus
            for _, _, (super1, super2) in superloci:
                super1.sort(key=NormalizedLocus.natural_order_key)
                super2.sort(key=NormalizedLocus.natural_order_key)

                super_start, super_stop = get_superlocus_bounds([super1, super2])

                print('-' * 80)
                print(f'{chrom}:[{super_start:d}-{super_stop:d}):')
                print()

                for i, superlocus in enumerate([super1, super2], 1):
                    for locus in superlocus:
                        lstart = locus.start
                        lstop = locus.stop
                        lref = locus.ref or '-'
                        indices = locus.allele_indices
                        sep = '|' if locus.phased else '/'
                        geno = sep.join(
                            locus.alleles[a] or '-' if a is not None else '.'
                            for a in indices)
                        print(
                            f'  NORM{i:d}: [{lstart:5d}-{lstop:5d}) ref={lref} geno={geno}'
                        )
                print()

                match, match_type = superlocus_equal(ref,
                                                     super_start,
                                                     super_stop,
                                                     super1,
                                                     super2,
                                                     debug=args.debug)
                match_status = match_status_map[match]

                print(f'    MATCH={match_status} TYPE={match_type}')
                print()

                write_match(out_vars[0], super1, args.name1, match_status,
                            match_type)
                write_match(out_vars[1], super2, args.name2, match_status,
                            match_type)

                for i, superlocus in enumerate([super1, super2], 1):
                    for locus in superlocus:
                        print(f'  VCF{i:d}: {locus.record}', end='')
                print()
    finally:
        # Close outputs first so written records are flushed, then the inputs.
        for handle in [*out_vars, *in_vars, refs]:
            if handle is not None:
                handle.close()
Beispiel #9
0
def match_replicates(args):
    """Compare two replicate call sets and report a match decision per superlocus.

    Loci from ``args.vcf1`` and ``args.vcf2`` are grouped into superloci per
    chromosome and compared with ``superlocus_equal``.  The decision is printed
    to stdout and, for each of ``args.out1``/``args.out2`` that is set, stored
    in the ``BD``/``BK`` FORMAT fields of the corresponding records, which are
    written to that output file.

    Every opened handle (reference, inputs and outputs) is closed on exit,
    including when an error is raised part-way through.
    """
    match_status_map = {True : '=', False : 'X', None : '.'}

    # Load FASTA reference
    refs = Fastafile(expanduser(args.reference))
    in_vars = []
    out_vars = [None, None]
    try:
        # Open input variant files one at a time so an already-open handle is
        # still tracked (and closed) if a later open fails.
        for var in (args.vcf1, args.vcf2):
            in_vars.append(VariantFile(var))

        # An output is only created, and its input header only extended, when
        # the corresponding output path was given.
        for i, out_name in enumerate((args.out1, args.out2)):
            if out_name:
                header = in_vars[i].header
                header.formats.add('BD', '1', 'String', 'Match decision for call (match: =, mismatch: X, error: N)')
                header.formats.add('BK', '1', 'String', 'Sub-type for match decision (trivial: T, haplotype: H, error: N)')
                out_vars[i] = VariantFile(out_name, 'w', header=header)

        sample_names = (args.name1, args.name2)

        # Create parallel locus iterator by chromosome
        for chrom, ref, loci in records_by_chromosome(refs, in_vars, [args.name1, args.name2], args):
            # Create superloci by taking the union of overlapping loci across all of the locus streams
            loci = [sort_almost_sorted(l, key=NormalizedLocus.extreme_order_key) for l in loci]
            superloci = union(loci, interval_func=attrgetter('min_start', 'max_stop'))

            # Proceed by superlocus
            for _, _, (super1, super2) in superloci:
                super1.sort(key=NormalizedLocus.natural_order_key)
                super2.sort(key=NormalizedLocus.natural_order_key)

                super_start, super_stop = get_superlocus_bounds([super1, super2])

                print('-'*80)
                print('{}:[{:d}-{:d}):'.format(chrom, super_start, super_stop))
                print()

                for i, superlocus in enumerate([super1, super2], 1):
                    for locus in superlocus:
                        lstart = locus.start
                        lstop = locus.stop
                        lref = locus.alleles[0] or '-'
                        indices = locus.allele_indices
                        sep = '|' if locus.phased else '/'
                        geno = sep.join(locus.alleles[a] or '-' if a is not None else '.' for a in indices)
                        print('  NORM{:d}: [{:5d}-{:5d}) ref={} geno={}'.format(i, lstart, lstop, lref, geno))
                print()

                match, match_type = superlocus_equal(ref, super_start, super_stop, super1, super2, debug=args.debug)
                match_status = match_status_map[match]

                print('    MATCH={} TYPE={}'.format(match_status, match_type))
                print()

                # The hard work is done.  The rest is just output and formatting...

                for out_var, superlocus, name in zip(out_vars, (super1, super2), sample_names):
                    if out_var is None:
                        continue
                    for locus in sorted(superlocus, key=NormalizedLocus.record_order_key):
                        locus.record.samples[name]['BD'] = match_status
                        locus.record.samples[name]['BK'] = match_type
                        out_var.write(locus.record)

                for i, superlocus in enumerate([super1, super2], 1):
                    for locus in superlocus:
                        print('  VCF{:d}: {}'.format(i, locus.record), end='')
                print()
    finally:
        # Close outputs first so written records are flushed, then the inputs.
        for handle in [*out_vars, *in_vars, refs]:
            if handle is not None:
                handle.close()
Beispiel #10
0
def region_filter_include(records, include):
    """Keep only the records that overlap those provided.

    A group produced by ``union([records, include])`` is passed through when
    it holds at least one entry from ``include``; all other groups are dropped.
    """
    merged = union([records, include])
    for _start, _stop, (overlapping, included) in merged:
        if not included:
            continue
        yield from overlapping
Beispiel #11
0
def region_filter_exclude(records, exclude):
    """Keep only the records that do not overlap those provided.

    A group produced by ``union([records, exclude])`` is passed through when
    it holds no entry from ``exclude``; all other groups are dropped.
    """
    merged = union([records, exclude])
    for _start, _stop, (candidates, excluded) in merged:
        if excluded:
            continue
        yield from candidates