def generate_matches(refs, sample, db, args):
    """Yield one ``(superlocus, matches)`` pair per superlocus, chromosome by chromosome.

    The sample and database locus streams delivered by
    ``records_by_chromosome`` are each re-sorted by
    ``NormalizedLocus.extreme_order_key`` and merged with ``union`` on their
    ``(min_start, max_stop)`` extents.  Both lists of every merged group are
    sorted in place by ``NormalizedLocus.natural_order_key`` before the group
    is handed to ``generate_superlocus_matches``.

    Args:
        refs: reference handle, forwarded to ``records_by_chromosome``.
        sample: sample variant stream (first stream of the union).
        db: database variant stream (second stream of the union).
        args: options object; ``name``, ``mode`` and ``debug`` are read here
            and the whole object is forwarded to ``records_by_chromosome``.

    Yields:
        ``(superlocus, matches)`` where ``matches`` is the value returned by
        ``generate_superlocus_matches`` for that group.
    """
    by_extremes = NormalizedLocus.extreme_order_key
    by_position = NormalizedLocus.natural_order_key
    extent = attrgetter('min_start', 'max_stop')

    chromosomes = records_by_chromosome(refs, [sample, db], [args.name, None], args)
    for chrom, ref, streams in chromosomes:
        # Superloci are the union of overlapping loci across all locus streams.
        ordered = [sort_almost_sorted(stream, key=by_extremes) for stream in streams]

        for _, _, (superlocus, alleles) in union(ordered, interval_func=extent):
            alleles.sort(key=by_position)
            superlocus.sort(key=by_position)
            matches = generate_superlocus_matches(
                chrom, superlocus, ref, alleles, args.mode, args.debug)
            yield superlocus, matches
def region_filter_exclude(records, exclude):
    """Remove records that overlap those provided.

    ``records`` and ``exclude`` are merged with ``union``; every merged group
    carries the records and the exclusion entries that fell into it.  The
    records of a group are yielded only when the group has no exclusion
    entries.

    Args:
        records: stream of records to filter.
        exclude: stream of entries marking what should be dropped.

    Yields:
        Records from groups whose exclusion list is empty.
    """
    # The group's start/stop coordinates are not needed, only its members.
    for _, _, (rec, exc) in union([records, exclude]):
        if not exc:
            yield from rec
def region_filter_include(records, include):
    """Remove records that do not overlap those provided.

    ``records`` and ``include`` are merged with ``union``; every merged group
    carries the records and the inclusion entries that fell into it.  The
    records of a group are yielded only when the group has at least one
    inclusion entry.

    Args:
        records: stream of records to filter.
        include: stream of entries marking what should be kept.

    Yields:
        Records from groups whose inclusion list is non-empty.
    """
    # The group's start/stop coordinates are not needed, only its members.
    for _, _, (rec, inc) in union([records, include]):
        if inc:
            yield from rec
def region_filter_exclude(records, exclude):
    """Yield the records of every union group that holds no ``exclude`` entry."""
    merged = union([records, exclude])
    for _start, _stop, (kept, blocked) in merged:
        if blocked:
            # At least one exclusion entry shares this group: drop its records.
            continue
        yield from kept
def match_database(args):
    """Annotate a sample VCF with the outcome of looking up each database allele.

    For every FORMAT and INFO field that the database header defines and the
    sample header lacks, three fields are registered on the sample header:
    ``<name>_FOUND``, ``<name>_NOTFOUND`` and ``<name>_NOCALL``.  Each
    database allele is then searched for with ``find_allele`` among the
    non-reference sample loci whose extremes intersect it, and every
    intersecting sample locus is annotated through ``annotate_info`` and
    ``annotate_format`` with the suffix that matches the search result.  All
    sample records are written to ``args.output``.

    Args:
        args: options object.  ``reference``, ``database``, ``sample``,
            ``output``, ``name`` and ``debug`` are read here; the object is
            also passed whole to ``records_by_chromosome``.
    """
    # (field-name suffix, description prefix) for the three possible outcomes.
    # The spelling "presense" is kept as-is because these strings are written
    # into the output header.
    outcomes = (
        ('_FOUND', 'Allele(s) found: '),
        ('_NOTFOUND', 'Allele(s) not found: '),
        ('_NOCALL', 'Allele(s) with uncertain presense: '),
    )

    # Load FASTA reference
    # NOTE(review): refs is left open exactly as before; whether Fastafile
    # offers close()/context-manager support is not visible from here.
    refs = Fastafile(expanduser(args.reference))

    # Open input variant files.  The with-statement closes both inputs on
    # every exit path; previously they were never closed.
    with VariantFile(args.database) as db, VariantFile(args.sample) as sample:
        format_meta = []
        for fmt, meta in db.header.formats.items():
            if fmt not in sample.header.formats:
                format_meta.append(meta.name)
                for field_suffix, prefix in outcomes:
                    sample.header.formats.add(meta.name + field_suffix,
                                              number='.',
                                              type=meta.type,
                                              description=prefix + meta.description)

        info_meta = []
        for info, meta in db.header.info.items():
            if info not in sample.header.info:
                info_meta.append(meta.name)
                for field_suffix, prefix in outcomes:
                    sample.header.info.add(meta.name + field_suffix,
                                           number='.',
                                           type=meta.type,
                                           description=prefix + meta.description)

        with VariantFile(args.output, 'w', header=sample.header) as out:
            # Create parallel locus iterator by chromosome
            for chrom, ref, loci in records_by_chromosome(refs, [sample, db], [args.name, None], args):
                # Create superloci by taking the union of overlapping loci across all of the locus streams
                loci = [
                    sort_almost_sorted(stream, key=NormalizedLocus.extreme_order_key)
                    for stream in loci
                ]
                superloci = union(loci, interval_func=attrgetter('min_start', 'max_stop'))

                # Proceed by superlocus
                for _, _, (superlocus, alleles) in superloci:
                    alleles.sort(key=NormalizedLocus.natural_order_key)
                    superlocus.sort(key=NormalizedLocus.natural_order_key)

                    for allele in alleles:
                        super_allele = [
                            locus for locus in superlocus
                            if locus.extremes_intersect(allele)
                        ]

                        # Remove all reference calls from the superlocus.
                        # This is primarily done to remove long leading and trailing reference regions.
                        # Interstitial reference regions will be added back, based on how gaps are handled.
                        super_non_ref = [
                            locus for locus in super_allele if not locus.is_ref()
                        ]

                        if args.debug:
                            super_start, super_stop = get_superlocus_bounds(
                                [[allele], super_non_ref])
                            print('-' * 80, file=sys.stderr)
                            print('{}:[{:d}-{:d}):'.format(chrom, super_start, super_stop),
                                  file=sys.stderr)
                            print(file=sys.stderr)
                            print(' ALLELE: {} {}:[{}-{}) ref={} alt={}'.format(
                                allele.record.id, allele.contig, allele.start,
                                allele.stop, allele.alleles[0] or '-',
                                allele.alleles[1] or '-'),
                                  file=sys.stderr)
                            print(file=sys.stderr)

                            for i, locus in enumerate(super_non_ref, 1):
                                lref = locus.alleles[0] or '-'
                                indices = locus.allele_indices
                                if indices.count(None) == len(indices):
                                    geno = 'nocall'
                                elif indices.count(0) == len(indices):
                                    geno = 'refcall'
                                else:
                                    sep = '|' if locus.phased else '/'
                                    geno = sep.join(
                                        locus.alleles[a] or '-' if a is not None else '.'
                                        for a in indices)
                                print(' VAR{:d}: {}[{:5d}-{:5d}) ref={} geno={}'.format(
                                    i, locus.contig, locus.start, locus.stop, lref, geno),
                                      file=sys.stderr)

                        # Search superlocus for allele
                        match_zygosity = find_allele(ref, allele, super_non_ref,
                                                     debug=args.debug)

                        if args.debug:
                            print(file=sys.stderr)
                            print(' MATCH={}'.format(match_zygosity), file=sys.stderr)
                            print(file=sys.stderr)

                        # Annotate results of search
                        if match_zygosity is None:
                            suffix = '_NOCALL'
                        elif match_zygosity == 0:
                            suffix = '_NOTFOUND'
                        else:
                            suffix = '_FOUND'

                        # Number of times to repeat the copied metadata
                        times = match_zygosity if suffix == '_FOUND' else 1

                        for locus in super_allele:
                            annotate_info(locus, allele, info_meta, suffix, times)
                            annotate_format(locus, allele, format_meta, suffix, times)

                    # Emit the sample records in record order once every
                    # allele of this superlocus has been processed.
                    for locus in sorted(superlocus, key=NormalizedLocus.record_order_key):
                        out.write(locus.record)
def region_filter_include(records, include):
    """Yield the records of every union group that holds an ``include`` entry."""
    merged = union([records, include])
    for _start, _stop, (kept, wanted) in merged:
        if not wanted:
            # No inclusion entry shares this group: drop its records.
            continue
        yield from kept
def _register_match_fields(source_fields, target_fields):
    """Add _FOUND/_NOTFOUND/_NOCALL variants of fields missing from the target; return their base names."""
    # The spelling "presense" is kept as-is because these descriptions are
    # written into the output header.
    outcomes = (
        ('_FOUND', 'Allele(s) found: '),
        ('_NOTFOUND', 'Allele(s) not found: '),
        ('_NOCALL', 'Allele(s) with uncertain presense: '),
    )
    copied = []
    for key, meta in source_fields.items():
        if key in target_fields:
            continue
        copied.append(meta.name)
        for field_suffix, prefix in outcomes:
            target_fields.add(meta.name + field_suffix, number='.', type=meta.type,
                              description=prefix + meta.description)
    return copied


def _dump_allele_debug(chrom, allele, super_non_ref):
    """Print the database allele and its candidate sample loci to stderr."""
    super_start, super_stop = get_superlocus_bounds([[allele], super_non_ref])

    print('-'*80, file=sys.stderr)
    print('{}:[{:d}-{:d}):'.format(chrom, super_start, super_stop), file=sys.stderr)
    print(file=sys.stderr)
    print(' ALLELE: {} {}:[{}-{}) ref={} alt={}'.format(allele.record.id, allele.contig,
                                                       allele.start, allele.stop,
                                                       allele.alleles[0] or '-',
                                                       allele.alleles[1] or '-'),
          file=sys.stderr)
    print(file=sys.stderr)

    for i, locus in enumerate(super_non_ref, 1):
        lref = locus.alleles[0] or '-'
        indices = locus.allele_indices
        if indices.count(None) == len(indices):
            geno = 'nocall'
        elif indices.count(0) == len(indices):
            geno = 'refcall'
        else:
            sep = '|' if locus.phased else '/'
            geno = sep.join(locus.alleles[a] or '-' if a is not None else '.' for a in indices)
        print(' VAR{:d}: {}[{:5d}-{:5d}) ref={} geno={}'.format(i, locus.contig, locus.start,
                                                              locus.stop, lref, geno),
              file=sys.stderr)


def _zygosity_suffix(match_zygosity):
    """Map a ``find_allele`` result to the annotation field suffix."""
    if match_zygosity is None:
        return '_NOCALL'
    if match_zygosity == 0:
        return '_NOTFOUND'
    return '_FOUND'


def match_database(args):
    """Annotate a sample VCF with the outcome of looking up each database allele.

    FORMAT and INFO fields that exist in the database header but not in the
    sample header are registered on the sample header in three flavours
    (``_FOUND``, ``_NOTFOUND``, ``_NOCALL``).  For each database allele the
    non-reference sample loci whose extremes intersect it are searched with
    ``find_allele``; every intersecting sample locus is then annotated through
    ``annotate_info`` and ``annotate_format``.  All sample records are written
    to ``args.output``.

    Args:
        args: options object.  ``reference``, ``database``, ``sample``,
            ``output``, ``name`` and ``debug`` are read here; the object is
            also passed whole to ``records_by_chromosome``.
    """
    # Load FASTA reference
    # NOTE(review): refs is left open exactly as before; whether Fastafile
    # offers close()/context-manager support is not visible from here.
    refs = Fastafile(expanduser(args.reference))

    # Open input variant files.  The with-statement closes both inputs on
    # every exit path; previously they were never closed.
    with VariantFile(args.database) as db, VariantFile(args.sample) as sample:
        format_meta = _register_match_fields(db.header.formats, sample.header.formats)
        info_meta = _register_match_fields(db.header.info, sample.header.info)

        with VariantFile(args.output, 'w', header=sample.header) as out:
            # Create parallel locus iterator by chromosome
            for chrom, ref, loci in records_by_chromosome(refs, [sample, db], [args.name, None], args):
                # Create superloci by taking the union of overlapping loci across all of the locus streams
                loci = [sort_almost_sorted(stream, key=NormalizedLocus.extreme_order_key)
                        for stream in loci]
                superloci = union(loci, interval_func=attrgetter('min_start', 'max_stop'))

                # Proceed by superlocus
                for _, _, (superlocus, alleles) in superloci:
                    alleles.sort(key=NormalizedLocus.natural_order_key)
                    superlocus.sort(key=NormalizedLocus.natural_order_key)

                    for allele in alleles:
                        super_allele = [locus for locus in superlocus
                                        if locus.extremes_intersect(allele)]

                        # Remove all reference calls from the superlocus.
                        # This is primarily done to remove long leading and trailing reference regions.
                        # Interstitial reference regions will be added back, based on how gaps are handled.
                        super_non_ref = [locus for locus in super_allele if not locus.is_ref()]

                        if args.debug:
                            _dump_allele_debug(chrom, allele, super_non_ref)

                        # Search superlocus for allele
                        match_zygosity = find_allele(ref, allele, super_non_ref, debug=args.debug)

                        if args.debug:
                            print(file=sys.stderr)
                            print(' MATCH={}'.format(match_zygosity), file=sys.stderr)
                            print(file=sys.stderr)

                        # Annotate results of search
                        suffix = _zygosity_suffix(match_zygosity)

                        # Number of times to repeat the copied metadata
                        times = match_zygosity if suffix == '_FOUND' else 1

                        for locus in super_allele:
                            annotate_info(locus, allele, info_meta, suffix, times)
                            annotate_format(locus, allele, format_meta, suffix, times)

                    # Emit the sample records in record order once every
                    # allele of this superlocus has been processed.
                    for locus in sorted(superlocus, key=NormalizedLocus.record_order_key):
                        out.write(locus.record)
def match_replicates(args):
    """Match a genome against another presumably identical genome (i.e. replicates).

    The two input VCFs are walked chromosome by chromosome, their loci merged
    into superloci with ``union``, and each pair of superloci compared with
    ``superlocus_equal``.  A human-readable report is printed to stdout and,
    through ``write_match``, the match status and type are passed on for each
    output returned by ``make_outputs``.

    Args:
        args: options object.  ``reference``, ``vcf1``, ``vcf2``, ``out1``,
            ``out2``, ``name1``, ``name2`` and ``debug`` are read here; the
            object is also passed whole to ``records_by_chromosome``.
    """
    refs = Fastafile(expanduser(args.reference))
    in_vars = [VariantFile(var) for var in [args.vcf1, args.vcf2]]
    out_vars = make_outputs(in_vars, args.out1, args.out2)

    match_status_map = {True: '=', False: 'X', None: '.'}

    try:
        # Create parallel locus iterator by chromosome
        for chrom, ref, loci in records_by_chromosome(refs, in_vars, [args.name1, args.name2], args):
            # Create superloci by taking the union of overlapping loci across all of the locus streams
            loci = [
                sort_almost_sorted(stream, key=NormalizedLocus.extreme_order_key)
                for stream in loci
            ]
            superloci = union(loci, interval_func=attrgetter('min_start', 'max_stop'))

            # Proceed by superlocus
            for _, _, (super1, super2) in superloci:
                super1.sort(key=NormalizedLocus.natural_order_key)
                super2.sort(key=NormalizedLocus.natural_order_key)

                super_start, super_stop = get_superlocus_bounds([super1, super2])

                print('-' * 80)
                print(f'{chrom}:[{super_start:d}-{super_stop:d}):')
                print()

                for i, superlocus in enumerate([super1, super2], 1):
                    for locus in superlocus:
                        lstart = locus.start
                        lstop = locus.stop
                        lref = locus.ref or '-'
                        indices = locus.allele_indices
                        sep = '|' if locus.phased else '/'
                        geno = sep.join(
                            locus.alleles[a] or '-' if a is not None else '.'
                            for a in indices)
                        print(
                            f' NORM{i:d}: [{lstart:5d}-{lstop:5d}) ref={lref} geno={geno}'
                        )
                print()

                match, match_type = superlocus_equal(ref, super_start, super_stop,
                                                     super1, super2, debug=args.debug)
                match_status = match_status_map[match]

                print(f' MATCH={match_status} TYPE={match_type}')
                print()

                write_match(out_vars[0], super1, args.name1, match_status, match_type)
                write_match(out_vars[1], super2, args.name2, match_status, match_type)

                for i, superlocus in enumerate([super1, super2], 1):
                    for locus in superlocus:
                        print(f' VCF{i:d}: {locus.record}', end='')
                print()
    finally:
        # Close the outputs on every exit path, including an exception raised
        # part-way through, so no output handle is left open.
        for out_var in out_vars:
            if out_var is not None:
                out_var.close()
def match_replicates(args):
    """Compare two VCFs that are expected to describe the same genome.

    The two inputs are walked chromosome by chromosome, their loci merged
    into superloci with ``union``, and each pair of superloci compared with
    ``superlocus_equal``.  A human-readable report is printed to stdout.  For
    each of ``args.out1`` / ``args.out2`` that is set, the corresponding input
    header gains ``BD`` and ``BK`` FORMAT fields and the input's records are
    written out with those fields set to the match status and match type.

    Args:
        args: options object.  ``reference``, ``vcf1``, ``vcf2``, ``out1``,
            ``out2``, ``name1``, ``name2`` and ``debug`` are read here; the
            object is also passed whole to ``records_by_chromosome``.
    """
    # Load FASTA reference
    refs = Fastafile(expanduser(args.reference))

    # Open input variant files
    in_vars = [VariantFile(var) for var in [args.vcf1, args.vcf2]]
    sample_names = [args.name1, args.name2]
    out_vars = [None, None]

    match_status_map = {True: '=', False: 'X', None: '.'}

    try:
        # An output is opened only when its path was given; its header is the
        # matching input header extended with the two match fields.
        for idx, (in_var, out_path) in enumerate(zip(in_vars, [args.out1, args.out2])):
            if out_path:
                in_var.header.formats.add('BD', '1', 'String', 'Match decision for call (match: =, mismatch: X, error: N)')
                in_var.header.formats.add('BK', '1', 'String', 'Sub-type for match decision (trivial: T, haplotype: H, error: N)')
                out_vars[idx] = VariantFile(out_path, 'w', header=in_var.header)

        # Create parallel locus iterator by chromosome
        for chrom, ref, loci in records_by_chromosome(refs, in_vars, sample_names, args):
            # Create superloci by taking the union of overlapping loci across all of the locus streams
            loci = [sort_almost_sorted(stream, key=NormalizedLocus.extreme_order_key)
                    for stream in loci]
            superloci = union(loci, interval_func=attrgetter('min_start', 'max_stop'))

            # Proceed by superlocus
            for _, _, (super1, super2) in superloci:
                super1.sort(key=NormalizedLocus.natural_order_key)
                super2.sort(key=NormalizedLocus.natural_order_key)

                super_start, super_stop = get_superlocus_bounds([super1, super2])

                print('-'*80)
                print('{}:[{:d}-{:d}):'.format(chrom, super_start, super_stop))
                print()

                for i, superlocus in enumerate([super1, super2], 1):
                    for locus in superlocus:
                        lstart = locus.start
                        lstop = locus.stop
                        lref = locus.alleles[0] or '-'
                        indices = locus.allele_indices
                        sep = '|' if locus.phased else '/'
                        geno = sep.join(locus.alleles[a] or '-' if a is not None else '.'
                                        for a in indices)
                        print(' NORM{:d}: [{:5d}-{:5d}) ref={} geno={}'.format(i, lstart, lstop, lref, geno))
                print()

                match, match_type = superlocus_equal(ref, super_start, super_stop,
                                                     super1, super2, debug=args.debug)
                match_status = match_status_map[match]

                print(' MATCH={} TYPE={}'.format(match_status, match_type))
                print()

                # The hard work is done.  The rest is just output and formatting...
                for out_var, superlocus, name in zip(out_vars, (super1, super2), sample_names):
                    if out_var is not None:
                        for locus in sorted(superlocus, key=NormalizedLocus.record_order_key):
                            locus.record.samples[name]['BD'] = match_status
                            locus.record.samples[name]['BK'] = match_type
                            out_var.write(locus.record)

                for i, superlocus in enumerate([super1, super2], 1):
                    for locus in superlocus:
                        print(' VCF{:d}: {}'.format(i, locus.record), end='')
                print()
    finally:
        # Close the outputs on every exit path, including an exception raised
        # part-way through, so no output handle is left open.
        for out_var in out_vars:
            if out_var is not None:
                out_var.close()
def region_filter_include(records, include):
    """Keep only records that share a union group with an ``include`` entry.

    Groups produced by ``union([records, include])`` whose inclusion list is
    empty are skipped; the records of all other groups are yielded in order.
    """
    for _start, _stop, (group, hits) in union([records, include]):
        if not hits:
            continue
        yield from group
def region_filter_exclude(records, exclude):
    """Drop records that share a union group with an ``exclude`` entry.

    Groups produced by ``union([records, exclude])`` whose exclusion list is
    non-empty are skipped; the records of all other groups are yielded in
    order.
    """
    for _start, _stop, (group, hits) in union([records, exclude]):
        if hits:
            continue
        yield from group