def get_phased_counts_variant(record, LR_bam, reference_pyfasta):
    """Return per-haplotype allele support for a variant in ``LR_bam``.

    The record's chromosome, position, reference allele and alternate
    alleles are handed to ``tk_bam.get_phased_allele_read_info`` (which
    performs the realignment), and its first return value ``counts`` is
    summarised.

    Args:
        record: variant record readable by the ``tk_io`` accessors.
        LR_bam: BAM handle exposing ``references``; passed through to
            ``tk_bam.get_phased_allele_read_info``.
        reference_pyfasta: reference sequence object passed through to
            ``tk_bam.get_phased_allele_read_info``.

    Returns:
        ``(unphased, hap_1, hap_2)`` where each element is
        ``(counts[i][1], sum(counts[i]))`` for ``i`` in 0, 1, 2.
        NOTE(review): index 1 is presumably the first ALT allele -- confirm
        against ``tk_bam.get_phased_allele_read_info``.
    """
    chrom = tk_io.get_record_chrom(record)
    pos = tk_io.get_record_pos(record)
    ref = tk_io.get_record_ref(record)
    alt_alleles = tk_io.get_record_alt_alleles(record)

    # The BAM does not use "chr"-prefixed contig names, so drop the prefix
    # from the record's chromosome -- but only when it actually has one.
    # Blindly slicing off three characters would turn "1" into "".
    if LR_bam.references[0][0:3] != "chr" and chrom.startswith("chr"):
        chrom = chrom[3:]

    # this function does the realignment
    counts, _, _, _, _, _ = tk_bam.get_phased_allele_read_info(
        chrom, pos, ref, alt_alleles, 30, 0, 0, 0, LR_bam, reference_pyfasta,
        match=1, mismatch=-3, gap_open=-1, gap_extend=-4)

    unphased = (counts[0][1], sum(counts[0]))
    hap_1 = (counts[1][1], sum(counts[1]))
    hap_2 = (counts[2][1], sum(counts[2]))
    return (unphased, hap_1, hap_2)
def get_phase_set(record, bam):
    """Return the 'PS' tag of the first fetched read that carries one.

    Reads are fetched from ``bam`` over ``(pos - 1, pos + 1)`` on the
    record's chromosome. Returns None when no fetched read has a non-None
    'PS' tag.
    """
    contig = tk_io.get_record_chrom(record)
    position = tk_io.get_record_pos(record)
    for aligned_read in bam.fetch(contig, position - 1, position + 1):
        phase_set = dict(aligned_read.tags).get('PS')
        if phase_set is not None:
            return phase_set
    return None
def validate_variant(record, validation_bam, reference_pyfasta):
    """Count supporting reads for a variant in a validation BAM.

    The record's chromosome, position, reference allele and alternate
    alleles are handed to ``tk_bam.get_allele_read_info`` (which performs
    the realignment), and its first return value ``counts`` is summarised.

    Args:
        record: variant record readable by the ``tk_io`` accessors.
        validation_bam: BAM handle exposing ``references``; passed through
            to ``tk_bam.get_allele_read_info``.
        reference_pyfasta: reference sequence object passed through to
            ``tk_bam.get_allele_read_info``.

    Returns:
        ``(validation_ao, validation_cov)``: ``counts[1]`` and
        ``sum(counts)`` respectively.
    """
    chrom = tk_io.get_record_chrom(record)
    pos = tk_io.get_record_pos(record)
    ref = tk_io.get_record_ref(record)
    alt_alleles = tk_io.get_record_alt_alleles(record)

    # The BAM does not use "chr"-prefixed contig names, so drop the prefix
    # from the record's chromosome -- but only when it actually has one.
    # Blindly slicing off three characters would turn "1" into "".
    if validation_bam.references[0][0:3] != "chr" and chrom.startswith("chr"):
        chrom = chrom[3:]

    # this function does the realignment
    counts, _, _, _, _, _ = tk_bam.get_allele_read_info(
        chrom, pos, ref, alt_alleles, 30, 0, 0, 0, validation_bam,
        reference_pyfasta, match=1, mismatch=-3, gap_open=-1, gap_extend=-4)

    validation_cov = sum(counts)
    validation_ao = counts[1]
    return (validation_ao, validation_cov)
def filter_variant(var, bam, reference_pyfasta):
    """Set a filter on ``var`` in place when it fails quality or allele-fraction checks.

    A record whose quality is below 50 gets '10X_QUAL_FILTER' and nothing
    else is checked. Otherwise allele counts come from
    ``tk_bam.get_allele_read_info``; the record gets
    '10X_ALLELE_FRACTION_FILTER' when ``counts[1]`` is below 2 or makes up
    less than 15% of ``counts[0] + counts[1]``. Returns None.
    """
    if tk_io.get_record_qual(var) < 50:
        tk_io.set_record_filters(var, ['10X_QUAL_FILTER'])
        return

    contig = tk_io.get_record_chrom(var)
    position = tk_io.get_record_pos(var)
    ref_allele = tk_io.get_record_ref(var)
    alt_alleles = tk_io.get_record_alt_alleles(var)
    allele_counts, _, _, _, _, _ = tk_bam.get_allele_read_info(
        contig, position, ref_allele, alt_alleles, 30, 30, 30, 45, bam,
        reference_pyfasta)

    alt_support = float(allele_counts[1])
    # The fraction is only evaluated once alt_support >= 2 (short circuit).
    if alt_support < 2 or alt_support / float(allele_counts[0] + allele_counts[1]) < 0.15:
        tk_io.set_record_filters(var, ['10X_ALLELE_FRACTION_FILTER'])
def populate_fields(record, bam, reference_pyfasta, args):
    """Annotate ``record`` in place with repeat-context and read-support fields.

    For every alternate allele whose ``tk_io.get_allele_length`` is non-zero,
    the repeat count/unit reported by ``populate_repeat_info`` for unit
    lengths 1, 2 and 3 is collected into the POSTHPC/POSTHPB,
    POSTDNC/POSTDNB and POSTTNC/POSTTNB INFO fields (written only when at
    least one such allele exists).

    The record then receives barcodes (via ``tk_io.set_record_barcodes``)
    and the INFO fields MMD, MUMAP_REF, MUMAP_ALT, RO, AO, RESCUED and
    NOT_RESCUED, all derived from ``tk_bam.get_allele_read_info``.

    Args:
        record: variant record; mutated in place.
        bam: BAM handle passed through to the helpers.
        reference_pyfasta: reference sequence object passed through to the
            helpers.
        args: object providing ``min_mapq_attach_bc`` and
            ``default_indel_qual``, which are forwarded to
            ``tk_bam.get_allele_read_info``.

    Returns:
        None.
    """
    alleles = tk_io.get_record_alt_alleles(record)
    chrom = tk_io.get_record_chrom(record)
    pos = tk_io.get_record_pos(record)
    ref = tk_io.get_record_ref(record)

    post_homopolymer_counts = []
    post_homopolymer_bases = []
    post_dinucleotide_counts = []
    post_dinucleotide_bases = []
    post_trinucleotide_counts = []
    post_trinucleotide_bases = []

    for allele in alleles:
        variant_length = tk_io.get_allele_length(ref, allele)
        if variant_length != 0:
            post_hp_c, post_hp_b = populate_repeat_info(
                record, bam, variant_length, reference_pyfasta, 1)
            post_dn_c, post_dn_b = populate_repeat_info(
                record, bam, variant_length, reference_pyfasta, 2)
            post_tn_c, post_tn_b = populate_repeat_info(
                record, bam, variant_length, reference_pyfasta, 3)
            post_homopolymer_counts.append(post_hp_c)
            post_homopolymer_bases.append(post_hp_b)
            post_dinucleotide_counts.append(post_dn_c)
            post_dinucleotide_bases.append(post_dn_b)
            post_trinucleotide_counts.append(post_tn_c)
            post_trinucleotide_bases.append(post_tn_b)

    # All six lists grow in lock-step, so checking one of them is enough.
    if post_homopolymer_counts:
        record.INFO['POSTHPC'] = post_homopolymer_counts
        record.INFO['POSTHPB'] = post_homopolymer_bases
        record.INFO['POSTDNC'] = post_dinucleotide_counts
        record.INFO['POSTDNB'] = post_dinucleotide_bases
        record.INFO['POSTTNC'] = post_trinucleotide_counts
        record.INFO['POSTTNB'] = post_trinucleotide_bases

    (counts, mean_mapqs, bc_qual_string, molecule_differences, _, rescue) = \
        tk_bam.get_allele_read_info(
            chrom, pos, ref, alleles, 30, -1, args.min_mapq_attach_bc,
            args.default_indel_qual, bam, reference_pyfasta)

    tk_io.set_record_barcodes(record, bc_qual_string)

    # numpy.mean of an empty sequence is NaN; report -1 in that case.
    mean_molecule_difference = numpy.mean(molecule_differences[1])
    if math.isnan(mean_molecule_difference):
        mean_molecule_difference = -1
    record.INFO['MMD'] = mean_molecule_difference

    record.INFO['MUMAP_REF'] = mean_mapqs[0]
    record.INFO['MUMAP_ALT'] = mean_mapqs[1:]
    record.INFO['RO'] = counts[0]
    record.INFO['AO'] = counts[1:]

    # numpy.sum() on a generator is deprecated; NumPy simply falls back to
    # the builtin sum() for generators, so call that directly.
    record.INFO['RESCUED'] = sum(numpy.sum(x) for x in rescue)
    record.INFO['NOT_RESCUED'] = numpy.sum(
        [numpy.sum([1 - z for z in x]) for x in rescue])
def load_variant_barcode_phasing_info(record, fragment_barcode_info):
    """Collect per-barcode values from the fragment rows fetched for a variant.

    Rows are fetched with ``tk_tabix.tabix_safe_fetch`` over
    ``[pos, pos + max_length + 1)`` on the record's chromosome. Each row is
    tab-split; column 6 is used as the barcode, column 3 as the row's phase
    set, and columns 7-9 are parsed as floats. When the record's sample has
    a 'PS' value other than -1, rows whose phase set differs are skipped.

    Returns:
        dict mapping barcode -> 3-tuple of floats.

    Raises:
        AssertionError: if the same barcode occurs on more than one kept row.
    """
    contig = tk_io.get_record_chrom(record)
    start = tk_io.get_record_pos(record)
    stop = start + tk_io.get_record_max_length(record)

    variant_phase_set = int(get_data(record.samples[0].data, "PS", -1))

    per_barcode = {}
    rows = tk_tabix.tabix_safe_fetch(fragment_barcode_info, contig, start, stop + 1)
    for row in rows:
        fields = row.strip("\n").split("\t")
        barcode = fields[6]
        fragment_phase_set = int(fields[3])
        # -1 means the variant has no phase set, so every row is kept.
        if variant_phase_set != -1 and fragment_phase_set != variant_phase_set:
            continue
        assert barcode not in per_barcode
        per_barcode[barcode] = tuple(float(fields[column]) for column in (7, 8, 9))
    return per_barcode
def test_call_haps(self):
    """Run ``self.p.call_haps`` and check the phased genotypes it writes.

    The output VCF is read back; every record must be phased, and the
    genotypes at ('chr1', 2) and ('chr1', 3) must be either
    ([1, 2], [1, 0]) or the mirrored ([2, 1], [0, 1]).
    """
    # Context managers guarantee all three handles are closed, including
    # the template VCF (previously never closed) and on failure paths.
    with open(OUTPUT_VCF, 'w') as out_vcf, \
            open(SNP_INPUT_VCF, 'r') as template_vcf:
        vfw = VariantFileWriter(out_vcf, template_file=template_vcf)
        with open(OUTPUT_TSV, 'w') as out_bc_haps:
            self.p.call_haps(vfw, out_bc_haps)

    vfr = VariantFileReader(OUTPUT_VCF)
    hap_calls = {}
    for record in vfr.record_getter():
        chrom = tk_io.get_record_chrom(record)
        pos = tk_io.get_record_pos(record) - 1
        genotype, phased = tk_io.get_record_genotype_phased(record)
        hap_calls[(chrom, pos)] = genotype
        self.assertTrue(phased)

    print(hap_calls)

    # Haplotype labels are arbitrary, so either orientation is acceptable.
    observed = (hap_calls[('chr1', 2)], hap_calls[('chr1', 3)])
    self.assertIn(observed, [([1, 2], [1, 0]), ([2, 1], [0, 1])])
def __init__(self, current_phase_set, record):
    """Capture the fields of ``record`` needed for phasing.

    Stores chromosome, position, their ``key`` tuple, reference allele,
    filter status, phased flag, the supplied phase set, the pair of
    genotype alleles (``hap``) and the record itself.
    """
    self.chrom = tk_io.get_record_chrom(record)
    self.pos = tk_io.get_record_pos(record)
    self.key = (self.chrom, self.pos)
    self.ref = tk_io.get_record_ref(record)
    self.filters = tk_io.get_record_passes_filters(record)

    # Index 0 is the reference allele, followed by the ALT alleles.
    allele_table = [self.ref] + tk_io.get_record_alt_alleles(record)

    genotype, phased = tk_io.get_record_genotype_phased(record)
    first_index, second_index = genotype[0], genotype[1]
    # always set homozygotes as phased
    self.phased = True if first_index == second_index else phased

    # note -- if there are two alts, this will just pick one.
    self.phase_set = current_phase_set
    self.hap = (allele_table[first_index], allele_table[second_index])
    self.record = record
def populate_repeat_info(record, bam, variant_length, reference_pyfasta, length):
    """Count how often the first ``length``-base unit after the variant repeats.

    The reference is read from ``pos + 1`` for at most 30 bases (fewer near
    the end of the chromosome) and upper-cased. The first ``length`` bases
    form the repeat unit; consecutive identical units are counted until the
    first mismatch or the end of the window.

    ``bam`` and ``variant_length`` are accepted but not used here.

    Returns:
        ``(count, unit)``; ``(0, None)`` when the window is empty.
    """
    chrom = tk_io.get_record_chrom(record)
    pos = tk_io.get_record_pos(record)

    window = min(30, len(reference_pyfasta[chrom]) - pos - 1)
    # from the base after the indel to the end of the window
    downstream = reference_pyfasta[chrom][(pos + 1):(pos + window + 1)].upper()

    repeat_unit = None
    repeat_count = 0
    for offset in range(0, window, length):
        candidate = downstream[offset:offset + length]
        if repeat_unit is None:
            # The first unit defines what is being counted.
            repeat_unit = candidate
            repeat_count = 1
        elif candidate == repeat_unit:
            repeat_count += 1
        else:
            break
    return repeat_count, repeat_unit
def check_vcf(filename, args):
    """Sanity-check a supplied VCF, calling ``martian.exit`` on the first problem.

    Three checks are performed:

    1. The first line must start with ``##fileformat=VCFv4.``; header lines
       are counted so later messages can give approximate line numbers.
    2. Up to 1000 records are parsed with PyVCF. Each must have exactly one
       sample, a chromosome present in the reference (and 'chr'-prefixed
       for the 10X_hg19_ucsc reference), an upper-case non-empty REF, and
       upper-case non-empty ALT alleles.
    3. The first 3000 lines are run through ``vcfallelicprimitives`` and
       then ``bcftools filter`` to confirm both tools can parse the file.

    Args:
        filename: path of the VCF to check.
        args: object providing ``reference_path``.

    Returns:
        None.
    """
    fasta = tenkit.reference.open_reference(args.reference_path)
    record_cap = 1000
    record_cursor = 0
    lines = 0

    # --- 1. header -------------------------------------------------------
    with open(filename, 'r') as vcf_file:
        for line in vcf_file:
            if lines == 0 and (not line.startswith("##fileformat=VCFv4.")):
                martian.exit(filename + " does not have a proper header. First line should begin with ##fileformat=VCFv4.")
            if not line.startswith("#"):
                break
            lines += 1

    # --- 2. per-record checks --------------------------------------------
    with open(filename, 'r') as f:
        # ``except Exception`` (not a bare ``except``) so KeyboardInterrupt
        # and SystemExit are never misreported as parse failures.
        try:
            vcf_iter = vcf.Reader(f)
        except Exception:
            trace = traceback.format_exc()
            martian.exit(filename+" failed on parsing with PyVCF. Traceback:\n"+trace)

        while True:
            try:
                record = next(vcf_iter)
            except StopIteration:
                break
            except Exception:
                trace = traceback.format_exc()
                martian.exit(filename+" failed on parsing with PyVCF. Approximate line number of failure occured at "+str(lines+record_cursor+1)+". Traceback:\n"+trace)

            try:
                record_str = "\nErrored on record " + str(record) + " in file " + filename + " Approximate line number "+str(lines+record_cursor+1)
            except Exception:
                martian.exit(filename+" failed on parsing with pyvcf at approximate line number "+str(lines+record_cursor+1)+". Traceback:\n"+traceback.format_exc())

            # Check for multiple sample columns.
            if len(record.samples) != 1:
                martian.exit("The supplied VCF file contains multiple samples, which is not currently supported: " + str(record.samples))

            try:
                chrom = tk_io.get_record_chrom(record)
                ref = tk_io.get_record_ref(record)
                alt_alleles = tk_io.get_record_alt_alleles(record)
            except Exception:
                martian.exit(filename+" failed on parsing with pyvcf at approximate line number "+str(lines+record_cursor+1)+". Traceback:\n"+traceback.format_exc())

            # Check for chromosome name that doesn't start with 'chr'.
            if tenkit.reference.is_tenx(args.reference_path) and tenkit.reference.get_genome(args.reference_path) == "10X_hg19_ucsc":
                if not chrom.startswith('chr'):
                    martian.exit("The supplied VCF file does not use UCSC-style 'chrX' chromosome names, and this is not currently supported."+record_str)

            # Check that chromosome exists in reference
            if chrom not in fasta:
                martian.exit("The supplied VCF file contains chromosomes not found in reference genome."+record_str)

            # Check that ref allele exists
            if ref is not None:
                if ref == "." or ref == "":
                    martian.exit("The supplied VCF file contains entries with . or missing reference alleles."+record_str)
            else:
                martian.exit("The supplied VCF file contains entries with missing reference alleles."+record_str)

            # Check ref allele is upper case
            if ref != ref.upper():
                martian.exit("The supplied VCF file contains entries with lower case or mixed case reference alleles."+record_str)

            # Check that alt allele isnt empty or '.'
            if alt_alleles is not None:
                for allele in alt_alleles:
                    if allele is None or allele == '.':
                        martian.exit("The supplied VCF file contains entries where ALT allele is either empty or '.'"+record_str)
                    elif allele != allele.upper():
                        martian.exit("The supplied VCF file contains entries with lower case or mixed case alleles."+record_str)
            else:
                martian.exit("The supplied VCF file contains entries with no ALT alleles." + record_str)

            record_cursor += 1
            if record_cursor >= record_cap:
                break

    # --- 3. external tools -----------------------------------------------
    temp_paths = ("temp.vcf", "temp2.vcf")
    try:
        with open("temp.vcf", 'w') as temp:
            subprocess.check_call(['head', '-n', '3000', filename], stdout=temp)

        with open("temp2.vcf", 'w') as temp2:
            try:
                subprocess.check_call(['vcfallelicprimitives', '--keep-info', '-t', 'VCFALLELICPRIMITIVE', 'temp.vcf'], stdout=temp2)
            except Exception:
                trace = traceback.format_exc()
                martian.exit(filename+" failed on parsing with vcfallelicprimitives. Traceback:\n"+trace)

        with open(os.devnull, "w") as fnull:
            try:
                subprocess.check_call(['bcftools', 'filter', 'temp2.vcf'], stdout=fnull)
            except Exception:
                trace = traceback.format_exc()
                martian.exit(filename+" failed on parsing with vcfallelicprimitives or bcftools. Traceback:\n"+trace)
    finally:
        # Remove the scratch files on every path, including failures, and
        # without spawning an ``rm`` subprocess.
        for temp_path in temp_paths:
            if os.path.exists(temp_path):
                os.remove(temp_path)
def get_record_data(record):
    """Return ``(phase_set, chrom, pos - 1)`` for ``record`` via the ``tk_io`` accessors."""
    phase_set = tk_io.get_record_phase_set(record)
    contig = tk_io.get_record_chrom(record)
    shifted_pos = tk_io.get_record_pos(record) - 1
    return (phase_set, contig, shifted_pos)
def main(args, outs):
    """Apply the barcode-aware haplotype filter to the variants in one locus.

    Nothing is written when the locus is on a mitochondrial contig
    ('chrM', 'MT', 'M'), or on 'chrY'/'Y' while ``args.sex`` is female.

    Otherwise every record from ``args.variants`` within ``args.locus`` is
    copied to the output VCF (``outs.default`` without its '.gz' suffix).
    Records that already fail filters, have fewer than two genotype
    alleles, or are homozygous are copied unchanged. For the remaining
    records not on 'chrM', 'BARCODE_AWARE_FILTER' is appended when
    ``barcode_aware_filter`` rejects the record, the first ALT is of type
    "S", and the variant-calling mode is 'call' (or 'precalled_plus' with a
    'TENX' INFO key).

    Args:
        args: stage arguments; reads ``vc_precalled``, ``vc_mode``,
            ``locus``, ``sex``, ``fragment_phasing`` and ``variants``.
        outs: stage outputs; reads ``default``.

    Returns:
        None.
    """
    vc_mode, _, _, _ = tk_io.get_vc_mode(args.vc_precalled, args.vc_mode)

    (chrom, _, _) = tk_io.get_locus_info(args.locus)
    chrom = str(chrom)
    if chrom in ['chrM', 'MT', 'M'] or (args.sex.lower() in ["f", "female"] and chrom in ["chrY", "Y"]):
        return

    fragment_barcode_info = pysam.Tabixfile(args.fragment_phasing)

    AH_0_BH_0 = (
        'AH_0_BH_0', '1', 'Integer',
        'Number of barcodes that have been called as supporting haplotype 0 which are on reads that have support for the allele which has been phased as haplotype 0'
    )
    AH_1_BH_1 = (
        'AH_1_BH_1', '1', 'Integer',
        'Number of barcodes that have been called as supporting haplotype 1 which are on reads that have support for the allele which has been phased as haplotype 1'
    )
    AH_0_BH_1 = (
        'AH_0_BH_1', '1', 'Integer',
        'Number of barcodes that have been called as supporting haplotype 0 which are on reads that have support for the allele which has been phased as haplotype 1'
    )
    AH_1_BH_0 = (
        'AH_1_BH_0', '1', 'Integer',
        'Number of barcodes that have been called as supporting haplotype 1 which are on reads that have support for the allele which has been phased as haplotype 0'
    )
    BX_HAP_OR = (
        'BX_HAP_OR', '1', 'Float',
        "Barcode aware haplotype filtering score (log odds ratio currently)")
    BARCODE_AWARE_FILTER = [(
        "BARCODE_AWARE_FILTER",
        "Uses haplotype information from the fragments and the alleles to filter some variants that are not consistent with haplotype (ie variants should have most of their allele haplotype 0 alleles coming from barcodes whose fragments are haplotype 0 etc)"
    )]
    extra_fields = [AH_0_BH_0, AH_1_BH_1, AH_0_BH_1, AH_1_BH_0, BX_HAP_OR]

    input_variants = tk_io.VariantFileReader(args.variants)

    # Remove a trailing ".gz" suffix. str.strip(".gz") would instead strip
    # any of the characters '.', 'g', 'z' from BOTH ends of the path.
    output_path = outs.default
    if output_path.endswith(".gz"):
        output_path = output_path[:-len(".gz")]

    # The template handle is managed here so it is closed once writing ends.
    with open(output_path, 'w') as output_file, \
            open(args.variants, 'r') as template_file:
        output_variants = tk_io.VariantFileWriter(
            output_file,
            template_file=template_file,
            new_info_fields=extra_fields,
            new_filters=BARCODE_AWARE_FILTER)
        variant_iterator = tk_io.get_variant_iterator_pos(
            input_variants, None, args.locus)

        for record in variant_iterator:
            sample = record.samples[0]
            ref = tk_io.get_record_ref(record)
            alt_alleles = tk_io.get_record_alt_alleles(record)

            # Already-filtered records pass through untouched.
            if not tk_io.get_record_passes_filters(record):
                output_variants.write_record(record)
                continue

            # Fewer than two genotype alleles, or a homozygous call:
            # can't filter this way.
            gt_alleles = sample.gt_alleles
            if len(gt_alleles) <= 1 or int(gt_alleles[0]) == int(gt_alleles[1]):
                output_variants.write_record(record)
                continue

            chrom = tk_io.get_record_chrom(record)
            if not chrom == "chrM":
                variant_barcode_info = load_variant_barcode_phasing_info(
                    record, fragment_barcode_info)
                if not barcode_aware_filter(record, variant_barcode_info):
                    if record.FILTER is None:
                        record.FILTER = []
                    if tk_io.get_var_type(ref, alt_alleles[0]) == "S" and (
                            (vc_mode == 'call') or
                            (vc_mode == "precalled_plus" and "TENX" in record.INFO)):
                        record.FILTER.append("BARCODE_AWARE_FILTER")

            output_variants.write_record(record)