Ejemplo n.º 1
0
def get_phased_counts_variant(record, LR_bam, reference_pyfasta):
    """Return per-haplotype read support for a variant record.

    Allele counts come from ``tk_bam.get_phased_allele_read_info`` (which,
    per the original author's note, performs the realignment).

    Args:
        record: variant record read through the ``tk_io`` accessors.
        LR_bam: BAM handle exposing ``references``; passed through to the
            read-info call.
        reference_pyfasta: reference sequence handle passed through to the
            read-info call.

    Returns:
        A 3-tuple ``(unphased, hap_1, hap_2)``. Each element is
        ``(count_at_index_1, total_count)`` taken from ``counts[0]``,
        ``counts[1]`` and ``counts[2]`` respectively. Index 1 is presumably
        the first ALT allele -- TODO confirm against tk_bam.
    """
    chrom = tk_io.get_record_chrom(record)
    pos = tk_io.get_record_pos(record)
    ref = tk_io.get_record_ref(record)
    alt_alleles = tk_io.get_record_alt_alleles(record)

    # The BAM's contig naming style is judged from its first reference only.
    # Strip the prefix only when the record actually carries one; slicing
    # unconditionally would turn an unprefixed contig such as "1" into "".
    bam_uses_chr_prefix = LR_bam.references[0][0:3] == "chr"
    if not bam_uses_chr_prefix and chrom.startswith("chr"):
        chrom = chrom[3:]

    # this function does the realignment
    # NOTE(review): the four positional numbers (30, 0, 0, 0) are thresholds
    # whose meaning is defined by tk_bam -- confirm there before changing.
    counts, _, _, _, _, _ = tk_bam.get_phased_allele_read_info(
        chrom,
        pos,
        ref,
        alt_alleles,
        30,
        0,
        0,
        0,
        LR_bam,
        reference_pyfasta,
        match=1,
        mismatch=-3,
        gap_open=-1,
        gap_extend=-4)

    unphased = (counts[0][1], sum(counts[0]))
    hap_1 = (counts[1][1], sum(counts[1]))
    hap_2 = (counts[2][1], sum(counts[2]))
    return (unphased, hap_1, hap_2)
Ejemplo n.º 2
0
def get_phase_set(record, bam):
    """Return the first non-None 'PS' tag among reads near the record.

    Reads are fetched from ``bam`` over ``[pos - 1, pos + 1)`` on the
    record's chromosome and scanned in fetch order.

    Returns:
        The 'PS' tag value of the first read that has one, or ``None`` when
        no fetched read carries the tag.
    """
    chrom = tk_io.get_record_chrom(record)
    pos = tk_io.get_record_pos(record)
    nearby_reads = bam.fetch(chrom, pos - 1, pos + 1)
    # Lazy pipeline: stops consuming reads as soon as a tagged one is found.
    ps_values = (dict(read.tags).get('PS') for read in nearby_reads)
    return next((ps for ps in ps_values if ps is not None), None)
Ejemplo n.º 3
0
def validate_variant(record, validation_bam, reference_pyfasta):
    """Return allele support for a variant record in a validation BAM.

    Allele counts come from ``tk_bam.get_allele_read_info`` (which, per the
    original author's note, performs the realignment).

    Args:
        record: variant record read through the ``tk_io`` accessors.
        validation_bam: BAM handle exposing ``references``; passed through
            to the read-info call.
        reference_pyfasta: reference sequence handle passed through to the
            read-info call.

    Returns:
        ``(validation_ao, validation_cov)`` where ``validation_ao`` is
        ``counts[1]`` and ``validation_cov`` is ``sum(counts)``.
    """
    chrom = tk_io.get_record_chrom(record)
    pos = tk_io.get_record_pos(record)
    ref = tk_io.get_record_ref(record)
    alt_alleles = tk_io.get_record_alt_alleles(record)

    # The BAM's contig naming style is judged from its first reference only.
    # Strip the prefix only when the record actually carries one; slicing
    # unconditionally would turn an unprefixed contig such as "1" into "".
    bam_uses_chr_prefix = validation_bam.references[0][0:3] == "chr"
    if not bam_uses_chr_prefix and chrom.startswith("chr"):
        chrom = chrom[3:]

    # this function does the realignment
    # NOTE(review): the four positional numbers (30, 0, 0, 0) are thresholds
    # whose meaning is defined by tk_bam -- confirm there before changing.
    counts, _, _, _, _, _ = tk_bam.get_allele_read_info(
        chrom,
        pos,
        ref,
        alt_alleles,
        30,
        0,
        0,
        0,
        validation_bam,
        reference_pyfasta,
        match=1,
        mismatch=-3,
        gap_open=-1,
        gap_extend=-4)

    validation_cov = sum(counts)
    validation_ao = counts[1]
    return (validation_ao, validation_cov)
Ejemplo n.º 4
0
def filter_variant(var, bam, reference_pyfasta):
    """Tag ``var`` with a filter when it fails quality or allele-fraction checks.

    A record whose quality is below 50 gets '10X_QUAL_FILTER' and nothing
    else is evaluated. Otherwise allele counts are obtained from
    ``tk_bam.get_allele_read_info`` and the record gets
    '10X_ALLELE_FRACTION_FILTER' when ``counts[1]`` is below 2 or makes up
    less than 15% of ``counts[0] + counts[1]``.

    The record is modified in place; nothing is returned.
    """
    low_quality = tk_io.get_record_qual(var) < 50
    if low_quality:
        tk_io.set_record_filters(var, ['10X_QUAL_FILTER'])
        return

    chrom = tk_io.get_record_chrom(var)
    pos = tk_io.get_record_pos(var)
    ref = tk_io.get_record_ref(var)
    alts = tk_io.get_record_alt_alleles(var)
    read_info = tk_bam.get_allele_read_info(chrom, pos, ref, alts,
                                            30, 30, 30, 45,
                                            bam, reference_pyfasta)
    # The call yields six values; only the allele counts are used here.
    (counts, _, _, _, _, _) = read_info

    alt_support = float(counts[1])
    # The fraction is only evaluated when alt_support >= 2, so the
    # denominator cannot be zero at that point.
    if alt_support < 2 or alt_support / float(counts[0] + counts[1]) < 0.15:
        tk_io.set_record_filters(var, ['10X_ALLELE_FRACTION_FILTER'])
Ejemplo n.º 5
0
def populate_fields(record, bam, reference_pyfasta, args):
    """Annotate ``record`` with repeat-context and read-support INFO fields.

    For every ALT allele whose ``tk_io.get_allele_length`` is non-zero, the
    1-, 2- and 3-base repeat context reported by ``populate_repeat_info`` is
    collected into the POSTHPC/POSTHPB, POSTDNC/POSTDNB and POSTTNC/POSTTNB
    INFO fields (written only when at least one such allele exists).

    Read-level information from ``tk_bam.get_allele_read_info`` is then
    stored: barcodes (via ``tk_io.set_record_barcodes``), MMD, MUMAP_REF,
    MUMAP_ALT, RO, AO, RESCUED and NOT_RESCUED.

    Args:
        record: variant record; modified in place.
        bam: BAM handle passed through to the helper calls.
        reference_pyfasta: reference sequence handle passed through to the
            helper calls.
        args: object providing ``min_mapq_attach_bc`` and
            ``default_indel_qual``.
    """
    alleles = tk_io.get_record_alt_alleles(record)
    ref = tk_io.get_record_ref(record)
    chrom = tk_io.get_record_chrom(record)
    pos = tk_io.get_record_pos(record)

    post_homopolymer_counts = []
    post_homopolymer_bases = []
    post_dinucleotide_counts = []
    post_dinucleotide_bases = []
    post_trinucleotide_counts = []
    post_trinucleotide_bases = []
    for allele in alleles:
        variant_length = tk_io.get_allele_length(ref, allele)
        if variant_length == 0:
            # Zero-length alleles get no repeat annotation.
            continue
        post_hp_c, post_hp_b = populate_repeat_info(record, bam, variant_length, reference_pyfasta, 1)
        post_dn_c, post_dn_b = populate_repeat_info(record, bam, variant_length, reference_pyfasta, 2)
        post_tn_c, post_tn_b = populate_repeat_info(record, bam, variant_length, reference_pyfasta, 3)
        post_homopolymer_counts.append(post_hp_c)
        post_homopolymer_bases.append(post_hp_b)
        post_dinucleotide_counts.append(post_dn_c)
        post_dinucleotide_bases.append(post_dn_b)
        post_trinucleotide_counts.append(post_tn_c)
        post_trinucleotide_bases.append(post_tn_b)

    # All six lists grow together, so checking one of them is sufficient.
    if len(post_homopolymer_counts) != 0:
        record.INFO['POSTHPC'] = post_homopolymer_counts
        record.INFO['POSTHPB'] = post_homopolymer_bases
        record.INFO['POSTDNC'] = post_dinucleotide_counts
        record.INFO['POSTDNB'] = post_dinucleotide_bases
        record.INFO['POSTTNC'] = post_trinucleotide_counts
        record.INFO['POSTTNB'] = post_trinucleotide_bases

    (counts, mean_mapqs, bc_qual_string, molecule_differences, _, rescue) = tk_bam.get_allele_read_info(
        chrom, pos, ref, alleles, 30, -1, args.min_mapq_attach_bc,
        args.default_indel_qual, bam, reference_pyfasta)

    tk_io.set_record_barcodes(record, bc_qual_string)

    # Only index 1 of molecule_differences is summarised. The mean of an
    # empty sequence is NaN, which is replaced by the sentinel -1.
    record.INFO['MMD'] = numpy.mean(molecule_differences[1])
    if math.isnan(record.INFO['MMD']):
        record.INFO['MMD'] = -1

    record.INFO['MUMAP_REF'] = mean_mapqs[0]
    record.INFO['MUMAP_ALT'] = mean_mapqs[1:]
    record.INFO['RO'] = counts[0]
    record.INFO['AO'] = counts[1:]

    # numpy.sum() over a generator is deprecated and simply defers to the
    # builtin sum(), so call the builtin directly.
    record.INFO['RESCUED'] = sum(numpy.sum(x) for x in rescue)
    # NOTE(review): assumes entries of `rescue` are 0/1 flags so that
    # 1 - z counts the non-rescued ones -- confirm against tk_bam.
    record.INFO['NOT_RESCUED'] = numpy.sum([numpy.sum([1 - z for z in x]) for x in rescue])
Ejemplo n.º 6
0
def load_variant_barcode_phasing_info(record, fragment_barcode_info):
    """Collect per-barcode phasing values for fragments overlapping a record.

    Rows are fetched from the tabix-indexed ``fragment_barcode_info`` over
    the record's span. When the record's first sample has a phase set
    ("PS", defaulting to -1 when absent), rows from a different phase set
    are skipped.

    Returns:
        dict mapping barcode (column 6 of each row) to a tuple of three
        floats taken from columns 7, 8 and 9.

    Raises:
        AssertionError: if the same barcode appears in more than one kept row.
    """
    chrom = tk_io.get_record_chrom(record)
    start = tk_io.get_record_pos(record)
    stop = start + tk_io.get_record_max_length(record)

    phasing_by_barcode = {}
    phase_set = int(get_data(record.samples[0].data, "PS", -1))

    rows = tk_tabix.tabix_safe_fetch(fragment_barcode_info, chrom, start,
                                     stop + 1)
    for row in rows:
        fields = row.strip("\n").split("\t")
        barcode = fields[6]
        frag_phase_set = int(fields[3])

        # A phase set of -1 means "unknown": keep every fragment then.
        if phase_set != -1 and frag_phase_set != phase_set:
            continue

        assert barcode not in phasing_by_barcode
        phasing_by_barcode[barcode] = tuple(
            float(fields[column]) for column in (7, 8, 9))
    return phasing_by_barcode
Ejemplo n.º 7
0
    def test_call_haps(self):
        """Run ``call_haps`` and check the phased genotypes written to the VCF.

        Every record read back must be phased, and the genotypes at
        ('chr1', 2) and ('chr1', 3) must be one of the two mirror-image
        haplotype assignments.
        """
        # Context managers guarantee all three handles are closed (and the
        # outputs flushed) even if call_haps raises.
        with open(OUTPUT_VCF, 'w') as out_vcf, \
                open(SNP_INPUT_VCF, 'r') as template, \
                open(OUTPUT_TSV, 'w') as out_bc_haps:
            vfw = VariantFileWriter(out_vcf, template_file=template)
            self.p.call_haps(vfw, out_bc_haps)

        vfr = VariantFileReader(OUTPUT_VCF)
        hap_calls = {}
        for record in vfr.record_getter():
            chrom = tk_io.get_record_chrom(record)
            # Keys use the record position shifted down by one.
            pos = tk_io.get_record_pos(record) - 1
            genotype, phased = tk_io.get_record_genotype_phased(record)
            hap_calls[(chrom, pos)] = genotype
            self.assertTrue(phased)

        # Parenthesised form prints identically under Python 2 and 3.
        print(hap_calls)

        first_orientation = (hap_calls[('chr1', 2)] == [1, 2]
                             and hap_calls[('chr1', 3)] == [1, 0])
        # Evaluated lazily, exactly like the original `A or B` expression.
        self.assertTrue(first_orientation
                        or (hap_calls[('chr1', 2)] == [2, 1]
                            and hap_calls[('chr1', 3)] == [0, 1]))
Ejemplo n.º 8
0
    def __init__(self, current_phase_set, record):
        """Capture the locus, alleles and phasing state of a variant record.

        Args:
            current_phase_set: value stored unchanged as ``self.phase_set``.
            record: variant record read through the ``tk_io`` accessors; also
                kept as ``self.record``.
        """
        self.chrom = tk_io.get_record_chrom(record)
        self.pos = tk_io.get_record_pos(record)
        self.key = (self.chrom, self.pos)

        self.ref = tk_io.get_record_ref(record)
        # Result of the pass/fail filter check for this record.
        self.filters = tk_io.get_record_passes_filters(record)

        # Index 0 is the reference allele, followed by the ALT alleles, so a
        # genotype index can be used directly as a lookup.
        allele_table = [self.ref] + tk_io.get_record_alt_alleles(record)

        (genotype, self.phased) = tk_io.get_record_genotype_phased(record)
        first_idx = genotype[0]
        second_idx = genotype[1]

        # Homozygous calls are always treated as phased.
        if first_idx == second_idx:
            self.phased = True

        # note -- if there are two alts, this will just pick one.
        self.phase_set = current_phase_set
        self.hap = (allele_table[first_idx], allele_table[second_idx])

        self.record = record
Ejemplo n.º 9
0
def populate_repeat_info(record, bam, variant_length, reference_pyfasta, length):
    """Count consecutive copies of the repeat unit that follows a record.

    The reference is read from index ``pos + 1`` for at most 30 bases
    (fewer near the end of the chromosome). The first ``length`` bases of
    that window form the repeat unit; the window is then walked in steps of
    ``length`` until a chunk differs from the unit.

    Args:
        record: variant record supplying chromosome and position.
        bam: unused; kept for interface compatibility.
        variant_length: unused; kept for interface compatibility.
        reference_pyfasta: mapping from chromosome name to a sliceable
            sequence.
        length: size of the repeat unit in bases (1 for homopolymers, 2 for
            dinucleotides, ...).

    Returns:
        ``(count, unit)``: the number of consecutive copies of ``unit`` at
        the start of the window, or ``(0, None)`` when the window is empty.
    """
    chrom = tk_io.get_record_chrom(record)
    pos = tk_io.get_record_pos(record)

    # Never look past the end of the chromosome.
    gap = min(30, len(reference_pyfasta[chrom]) - pos - 1)
    # From the base after the record position to the end of the window.
    sequence = reference_pyfasta[chrom][(pos + 1):(pos + gap + 1)].upper()

    post_poly_count = 0
    post_poly_base = None
    for start in range(0, gap, length):
        unit = sequence[start:start + length]
        if post_poly_base is None:
            # The first chunk defines the repeat unit.
            post_poly_base = unit
            post_poly_count = 1
        elif unit == post_poly_base:
            post_poly_count += 1
        else:
            # A trailing partial chunk is shorter than the unit and so also
            # ends the run here.
            break
    return post_poly_count, post_poly_base
Ejemplo n.º 10
0
def check_vcf(filename, args):
    """Sanity-check a user-supplied VCF, reporting the first problem found.

    Checks performed, in order:
      1. The first line starts with ``##fileformat=VCFv4.``.
      2. PyVCF can open the file and parse up to the first 1000 records.
      3. Each of those records has exactly one sample, a chromosome present
         in the reference (with a 'chr' prefix for the 10X hg19 UCSC
         reference), an upper-case non-empty REF allele and upper-case,
         non-empty ALT alleles.
      4. The first 3000 lines survive ``vcfallelicprimitives`` followed by
         ``bcftools filter``.

    Every failure is reported through ``martian.exit`` (presumably that call
    terminates the stage -- the code after each call relies on it).

    NOTE(review): the scratch files ``temp.vcf`` and ``temp2.vcf`` are
    created in the current working directory under fixed names and are only
    removed on the success path.

    Args:
        filename: path of the VCF file to check.
        args: object providing ``reference_path``.
    """
    fasta = tenkit.reference.open_reference(args.reference_path)
    record_cap = 1000
    record_cursor = 0
    lines = 0

    # Count header lines and validate the very first one.
    with open(filename, 'r') as vcf_file:
        for line in vcf_file:
            if lines == 0 and (not line.startswith("##fileformat=VCFv4.")):
                martian.exit(filename + " does not have a proper header. First line should begin with ##fileformat=VCFv4.")
            if not line.startswith("#"):
                break
            lines += 1

    with open(filename, 'r') as f:
        # `except Exception` (not a bare except) so that KeyboardInterrupt
        # and SystemExit are never misreported as parse failures.
        try:
            vcf_iter = vcf.Reader(f)
        except Exception:
            trace = traceback.format_exc()
            martian.exit(filename+" failed on parsing with PyVCF. Traceback:\n"+trace)

        while True:
            # Header lines plus records consumed so far, 1-based.
            approx_line = str(lines+record_cursor+1)
            try:
                record = next(vcf_iter)
            except StopIteration:
                break
            except Exception:
                trace = traceback.format_exc()
                martian.exit(filename+" failed on parsing with PyVCF. Approximate line number of failure occured at "+approx_line+". Traceback:\n"+trace)
            try:
                record_str = "\nErrored on record " + str(record) + " in file " + filename + " Approximate line number "+approx_line
            except Exception:
                martian.exit(filename+" failed on parsing with pyvcf at approximate line number "+approx_line+". Traceback:\n"+traceback.format_exc())

            # Check for multiple sample columns.
            if len(record.samples) != 1:
                martian.exit("The supplied VCF file contains multiple samples, which is not currently supported: " + str(record.samples))
            try:
                chrom = tk_io.get_record_chrom(record)
                ref = tk_io.get_record_ref(record)
                alt_alleles = tk_io.get_record_alt_alleles(record)
            except Exception:
                martian.exit(filename+" failed on parsing with pyvcf at approximate line number "+approx_line+". Traceback:\n"+traceback.format_exc())

            # Check for chromosome name that doesn't start with 'chr'.
            if tenkit.reference.is_tenx(args.reference_path) and tenkit.reference.get_genome(args.reference_path) == "10X_hg19_ucsc":
                if not chrom.startswith('chr'):
                    martian.exit("The supplied VCF file does not use UCSC-style 'chrX' chromosome names, and this is not currently supported."+record_str)

            # Check that chromosome exists in reference
            if chrom not in fasta:
                martian.exit("The supplied VCF file contains chromosomes not found in reference genome."+record_str)

            # Check that ref allele exists
            if ref is not None:
                if ref == "." or ref == "":
                    martian.exit("The supplied VCF file contains entries with . or missing reference alleles."+record_str)
            else:
                martian.exit("The supplied VCF file contains entries with missing reference alleles."+record_str)

            # Check ref allele is upper case
            if ref != ref.upper():
                martian.exit("The supplied VCF file contains entries with lower case or mixed case reference alleles."+record_str)

            # Check that alt allele isnt empty or '.'
            if alt_alleles is not None:
                for allele in alt_alleles:
                    if allele is None or allele == '.':
                        martian.exit("The supplied VCF file contains entries where ALT allele is either empty or '.'"+record_str)
                    elif allele != allele.upper():
                        martian.exit("The supplied VCF file contains entries with lower case or mixed case alleles."+record_str)
            else:
                martian.exit("The supplied VCF file contains entries with no ALT alleles." + record_str)

            record_cursor += 1
            if record_cursor >= record_cap:
                break

    # Run the head of the file through the external tools used downstream.
    with open("temp.vcf",'w') as temp:
        subprocess.check_call(['head','-n','3000',filename],stdout=temp)
    with open("temp2.vcf",'w') as temp2:
        try:
            subprocess.check_call(['vcfallelicprimitives','--keep-info','-t','VCFALLELICPRIMITIVE','temp.vcf'], stdout = temp2)
        except Exception:
            trace = traceback.format_exc()
            martian.exit(filename+" failed on parsing with vcfallelicprimitives. Traceback:\n"+trace)
    with open(os.devnull, "w") as fnull:
        try:
            subprocess.check_call(['bcftools', 'filter', 'temp2.vcf'],stdout=fnull)
        except Exception:
            trace = traceback.format_exc()
            martian.exit(filename+" failed on parsing with vcfallelicprimitives or bcftools. Traceback:\n"+trace)
    subprocess.check_call(['rm', 'temp.vcf', 'temp2.vcf'])
Ejemplo n.º 11
0
 def get_record_data(record):
     """Return ``(phase_set, chrom, pos - 1)`` for ``record``.

     The position is the ``tk_io`` record position shifted down by one.
     """
     # Tuple elements are evaluated left to right, matching the original
     # accessor call order.
     return (
         tk_io.get_record_phase_set(record),
         tk_io.get_record_chrom(record),
         tk_io.get_record_pos(record) - 1,
     )
Ejemplo n.º 12
0
def main(args, outs):
    """Apply the barcode-aware haplotype filter to the variants of one locus.

    Every record yielded for ``args.locus`` is written to the output VCF.
    Records that already fail a filter, that are homozygous, or that have
    fewer than two genotype alleles are written unchanged. The remaining
    records are scored with ``barcode_aware_filter``; a failing record gets
    "BARCODE_AWARE_FILTER" appended to its FILTER list when its first ALT is
    of variant type "S" and the variant-calling mode allows it.

    Nothing is written when the locus is mitochondrial, or is on chrY for a
    female sample.

    Args:
        args: stage arguments; reads ``vc_precalled``, ``vc_mode``,
            ``locus``, ``sex``, ``fragment_phasing`` and ``variants``.
        outs: stage outputs; ``outs.default`` is the output path, written
            without a trailing ".gz".
    """
    vc_mode, _, _, _ = tk_io.get_vc_mode(args.vc_precalled, args.vc_mode)

    (chrom, _, _) = tk_io.get_locus_info(args.locus)
    chrom = str(chrom)

    if chrom in ['chrM', 'MT', 'M'] or (args.sex.lower() in ["f", "female"]
                                        and chrom in ["chrY", "Y"]):
        return

    fragment_barcode_info = pysam.Tabixfile(args.fragment_phasing)
    AH_0_BH_0 = (
        'AH_0_BH_0', '1', 'Integer',
        'Number of barcodes that have been called as supporting haplotype 0 which are on reads that have support for the allele which has been phased as haplotype 0'
    )
    AH_1_BH_1 = (
        'AH_1_BH_1', '1', 'Integer',
        'Number of barcodes that have been called as supporting haplotype 1 which are on reads that have support for the allele which has been phased as haplotype 1'
    )
    AH_0_BH_1 = (
        'AH_0_BH_1', '1', 'Integer',
        'Number of barcodes that have been called as supporting haplotype 0 which are on reads that have support for the allele which has been phased as haplotype 1'
    )
    AH_1_BH_0 = (
        'AH_1_BH_0', '1', 'Integer',
        'Number of barcodes that have been called as supporting haplotype 1 which are on reads that have support for the allele which has been phased as haplotype 0'
    )
    BX_HAP_OR = (
        'BX_HAP_OR', '1', 'Float',
        "Barcode aware haplotype filtering score (log odds ratio currently)")
    BARCODE_AWARE_FILTER = [(
        "BARCODE_AWARE_FILTER",
        "Uses haplotype information from the fragments and the alleles to filter some variants that are not consistent with haplotype (ie variants should have most of their allele haplotype 0 alleles coming from barcodes whose fragments are haplotype 0 etc)"
    )]
    extra_fields = [AH_0_BH_0, AH_1_BH_1, AH_0_BH_1, AH_1_BH_0, BX_HAP_OR]
    input_variants = tk_io.VariantFileReader(args.variants)

    # Drop only a trailing ".gz". str.strip(".gz") would instead remove any
    # run of the characters '.', 'g' and 'z' from BOTH ends of the path.
    output_path = outs.default
    if output_path.endswith(".gz"):
        output_path = output_path[:-len(".gz")]

    # The template handle is managed here so it is closed once writing ends.
    with open(output_path, 'w') as output_file, \
            open(args.variants, 'r') as template_file:
        output_variants = tk_io.VariantFileWriter(
            output_file,
            template_file=template_file,
            new_info_fields=extra_fields,
            new_filters=BARCODE_AWARE_FILTER)
        variant_iterator = tk_io.get_variant_iterator_pos(
            input_variants, None, args.locus)
        for record in variant_iterator:
            sample = record.samples[0]
            ref = tk_io.get_record_ref(record)
            alt_alleles = tk_io.get_record_alt_alleles(record)

            if not tk_io.get_record_passes_filters(record):
                output_variants.write_record(record)
                continue
            if len(sample.gt_alleles) > 1:
                genotype_1 = int(sample.gt_alleles[0])
                genotype_2 = int(sample.gt_alleles[1])
                if genotype_1 == genotype_2:
                    output_variants.write_record(record)
                    continue  # homozygous, can't filter this way
            else:
                output_variants.write_record(record)
                continue  # fewer than two alleles, can't filter this way

            chrom = tk_io.get_record_chrom(record)
            if not chrom == "chrM":
                variant_barcode_info = load_variant_barcode_phasing_info(
                    record, fragment_barcode_info)
                if not barcode_aware_filter(record, variant_barcode_info):
                    if record.FILTER is None:
                        record.FILTER = []
                    # Only type-"S" variants are filtered, and in
                    # "precalled_plus" mode only records carrying a TENX
                    # INFO entry.
                    if tk_io.get_var_type(ref, alt_alleles[0]) == "S" and (
                        (vc_mode == 'call') or (vc_mode == "precalled_plus"
                                                and "TENX" in record.INFO)):
                        record.FILTER.append("BARCODE_AWARE_FILTER")
            output_variants.write_record(record)