Example #1
0
def get_phased_counts_variant(record, LR_bam, reference_pyfasta):
    """Tally read support for a variant, split by phasing bucket.

    Args:
        record: variant record, read through the ``tk_io`` accessors.
        LR_bam: BAM handle exposing ``references``; it is also handed to
            ``tk_bam.get_phased_allele_read_info``.
        reference_pyfasta: reference lookup, passed through unchanged.

    Returns:
        ``(unphased, hap_1, hap_2)``. Each element is
        ``(counts[i][1], sum(counts[i]))`` for ``i`` = 0, 1, 2 --
        presumably (reads supporting the first alt allele, all reads
        counted) for that bucket; confirm the index meaning against
        ``tk_bam``.
    """
    chrom = tk_io.get_record_chrom(record)
    pos = tk_io.get_record_pos(record)
    ref = tk_io.get_record_ref(record)
    alt_alleles = tk_io.get_record_alt_alleles(record)

    # Contig naming is inferred from the first BAM reference only. Strip
    # the "chr" prefix solely when the record's name actually carries it;
    # an unconditional chrom[3:] would turn e.g. "1" into "".
    if LR_bam.references[0][0:3] != "chr" and chrom.startswith("chr"):
        chrom = chrom[3:]

    # this function does the realignment
    counts, _, _, _, _, _ = tk_bam.get_phased_allele_read_info(
        chrom,
        pos,
        ref,
        alt_alleles,
        30,
        0,
        0,
        0,
        LR_bam,
        reference_pyfasta,
        match=1,
        mismatch=-3,
        gap_open=-1,
        gap_extend=-4)

    unphased, hap_1, hap_2 = [
        (counts[bucket][1], sum(counts[bucket])) for bucket in (0, 1, 2)
    ]
    return (unphased, hap_1, hap_2)
Example #2
0
def get_phase_set(record, bam):
    """Return the first non-None ``PS`` tag among reads near the record.

    Reads are taken from ``bam.fetch(chrom, pos - 1, pos + 1)`` in the
    order the BAM yields them. Returns None when no fetched read carries
    a ``PS`` tag.
    """
    chrom = tk_io.get_record_chrom(record)
    pos = tk_io.get_record_pos(record)
    nearby_reads = bam.fetch(chrom, pos - 1, pos + 1)
    # Lazy pipeline: stop at the first read whose tags include PS.
    ps_tags = (dict(read.tags).get('PS') for read in nearby_reads)
    return next((ps for ps in ps_tags if ps is not None), None)
Example #3
0
def validate_variant(record, validation_bam, reference_pyfasta):
    """Count reads in a validation BAM that support a variant.

    Args:
        record: variant record, read through the ``tk_io`` accessors.
        validation_bam: BAM handle exposing ``references``; it is also
            handed to ``tk_bam.get_allele_read_info``.
        reference_pyfasta: reference lookup, passed through unchanged.

    Returns:
        ``(validation_ao, validation_cov)`` where ``validation_ao`` is
        ``counts[1]`` and ``validation_cov`` is ``sum(counts)`` --
        presumably first-alt support and total counted reads; confirm
        the index meaning against ``tk_bam``.
    """
    chrom = tk_io.get_record_chrom(record)
    pos = tk_io.get_record_pos(record)
    ref = tk_io.get_record_ref(record)
    alt_alleles = tk_io.get_record_alt_alleles(record)

    # Contig naming is inferred from the first BAM reference only. Strip
    # the "chr" prefix solely when the record's name actually carries it;
    # an unconditional chrom[3:] would turn e.g. "1" into "".
    if validation_bam.references[0][0:3] != "chr" and chrom.startswith("chr"):
        chrom = chrom[3:]

    # this function does the realignment
    counts, _, _, _, _, _ = tk_bam.get_allele_read_info(
        chrom,
        pos,
        ref,
        alt_alleles,
        30,
        0,
        0,
        0,
        validation_bam,
        reference_pyfasta,
        match=1,
        mismatch=-3,
        gap_open=-1,
        gap_extend=-4)

    validation_cov = sum(counts)
    validation_ao = counts[1]
    return (validation_ao, validation_cov)
Example #4
0
def lockstep_variant_iterator(vfr_left, vfr_right, shared_locus):
    """
    Walk two readers over the same locus side by side, yielding
    (left_record, right_record) pairs.

    Raises Exception as soon as a pair of records reports different
    positions.
    """
    left_records = get_variant_iterator(vfr_left, shared_locus)
    right_records = get_variant_iterator(vfr_right, shared_locus)
    out_of_sync = "Variant positions are out of sync: {0}:{1}, {0}:{2}"

    for pair in zip(left_records, right_records):
        # positions stay 1-indexed: they only feed the error message
        left_pos, right_pos = [tk_io.get_record_pos(rec) for rec in pair]
        if left_pos == right_pos:
            yield pair
            continue
        raise Exception(
            out_of_sync.format(shared_locus.chrom, left_pos, right_pos))
Example #5
0
def get_closest_variant_pos(variants, target_pos, direction, het_only=False):
    """
    Return the position (record position minus one) of the first variant
    found when scanning away from the target: direction -1 looks for
    positions <= target_pos, direction 1 for positions >= target_pos.

    With het_only set, variants for which gt_is_het() is false are
    skipped. Returns None if nothing qualifies.
    """
    # Plain linear scan; stepping by `direction` walks the list backwards
    # when looking before the target.
    for candidate in variants[::direction]:
        candidate_pos = tk_io.get_record_pos(candidate) - 1
        if direction < 0:
            on_correct_side = candidate_pos <= target_pos
        else:
            on_correct_side = candidate_pos >= target_pos
        genotype_ok = gt_is_het(candidate) or not het_only
        if genotype_ok and on_correct_side:
            return candidate_pos
    # no variant lies in the requested direction
    return None
Example #6
0
def filter_variant(var, bam, reference_pyfasta):
    """Attach a filter label to ``var`` when it fails quality checks.

    A record quality below 50 sets ``10X_QUAL_FILTER`` and stops there.
    Otherwise allele counts are fetched from ``bam``; fewer than 2 reads
    in ``counts[1]``, or ``counts[1] / (counts[0] + counts[1])`` below
    0.15, sets ``10X_ALLELE_FRACTION_FILTER``. Returns None.
    """
    if tk_io.get_record_qual(var) < 50:
        tk_io.set_record_filters(var, ['10X_QUAL_FILTER'])
        return

    counts, _, _, _, _, _ = tk_bam.get_allele_read_info(
        tk_io.get_record_chrom(var),
        tk_io.get_record_pos(var),
        tk_io.get_record_ref(var),
        tk_io.get_record_alt_alleles(var),
        30, 30, 30, 45,
        bam,
        reference_pyfasta)

    alt_support = float(counts[1])
    weak_support = alt_support < 2
    if not weak_support:
        # Only reached with alt_support >= 2, so the denominator is non-zero.
        weak_support = alt_support / float(counts[0] + counts[1]) < 0.15
    if weak_support:
        tk_io.set_record_filters(var, ['10X_ALLELE_FRACTION_FILTER'])
Example #7
0
def pair_iter(i1, i2):
    """Merge two record streams into position-aligned pairs.

    Records are compared by ``tk_io.get_record_pos``. Equal positions
    yield ``(r1, r2)``; otherwise the record with the smaller position is
    yielded alone, as ``(r1, None)`` or ``(None, r2)``, and only that
    stream advances. When one stream runs out, the remainder of the other
    is yielded unpaired.

    Args:
        i1, i2: iterables of records. Presumably each is sorted by
            position -- the merge only makes sense in that case.

    Yields:
        2-tuples as described above.
    """
    # iter() accepts any iterable and guarantees that the drain loops
    # below resume where next() stopped instead of restarting.
    i1 = iter(i1)
    i2 = iter(i2)

    # A slot holding None means "fetch the next record from that stream".
    v1 = None
    v2 = None

    while True:
        if v1 is None:
            try:
                # builtin next() works on Python 2.6+ and Python 3
                v1 = next(i1)
            except StopIteration:
                # Left side exhausted: flush the pending right record,
                # then everything still left on the right.
                if v2 is not None:
                    yield (None, v2)
                for x2 in i2:
                    yield (None, x2)
                break
        if v2 is None:
            try:
                v2 = next(i2)
            except StopIteration:
                # Right side exhausted: mirror image of the case above.
                if v1 is not None:
                    yield (v1, None)
                for x1 in i1:
                    yield (x1, None)
                break

        k1 = tk_io.get_record_pos(v1)
        k2 = tk_io.get_record_pos(v2)
        if k1 == k2:
            yield (v1, v2)
            v1 = None
            v2 = None
        elif k1 < k2:
            yield (v1, None)
            v1 = None
        else:
            yield (None, v2)
            v2 = None
Example #8
0
def split_variant_iterator(vfr_left, vfr_right, new_locus_left, new_locus_right):
    """Yield the records of the left locus, then those of the right locus.

    On the right side, the first record's position minus one is taken as
    the new phase-set id; any record whose phase set is positive but
    smaller than that id is re-labelled via ``adjust_phasing`` (without
    flipping) before being yielded.
    """
    # the two loci must not overlap
    assert new_locus_left.end <= new_locus_right.start

    for left_record in get_variant_iterator(vfr_left, new_locus_left):
        yield left_record

    right_start_ps = None
    for right_record in get_variant_iterator(vfr_right, new_locus_right):
        if right_start_ps is None:
            right_start_ps = tk_io.get_record_pos(right_record) - 1
        record_ps = tk_io.get_record_phase_set(right_record)
        # A real phase set below the new one means the block was
        # truncated by the split and has to be updated.
        truncated = record_ps > 0 and record_ps < right_start_ps
        if truncated:
            adjust_phasing(right_record, right_start_ps, flip=False)
        yield right_record
Example #9
0
def populate_fields(record, bam, reference_pyfasta, args):
    """Annotate ``record`` in place with repeat context and read support.

    Writes these ``record.INFO`` keys:
      * POSTHPC/POSTHPB, POSTDNC/POSTDNB, POSTTNC/POSTTNB -- repeat count
        and repeat unit from ``populate_repeat_info`` for unit sizes 1, 2
        and 3, one entry per alt allele whose ``get_allele_length`` is
        non-zero (written only if at least one such allele exists).
      * MMD -- mean of ``molecule_differences[1]``, or -1 when it is NaN.
      * MUMAP_REF / MUMAP_ALT, RO / AO -- first element and remainder of
        the mean-mapq and count lists from
        ``tk_bam.get_allele_read_info``.
      * RESCUED / NOT_RESCUED -- sum of the ``rescue`` entries and sum of
        ``1 - entry`` (presumably 0/1 flags per read; confirm against
        ``tk_bam``).
    Barcodes are stored through ``tk_io.set_record_barcodes``.

    Args:
        record: variant record to annotate.
        bam: BAM handle passed to the helpers.
        reference_pyfasta: reference lookup passed to the helpers.
        args: object providing ``min_mapq_attach_bc`` and
            ``default_indel_qual``.
    """
    alleles = tk_io.get_record_alt_alleles(record)
    ref = tk_io.get_record_ref(record)
    chrom = tk_io.get_record_chrom(record)
    pos = tk_io.get_record_pos(record)

    post_homopolymer_counts = []
    post_homopolymer_bases = []
    post_dinucleotide_counts = []
    post_dinucleotide_bases = []
    post_trinucleotide_counts = []
    post_trinucleotide_bases = []
    for allele in alleles:
        variant_length = tk_io.get_allele_length(ref, allele)
        if variant_length != 0:
            post_hp_c, post_hp_b = populate_repeat_info(
                record, bam, variant_length, reference_pyfasta, 1)
            post_dn_c, post_dn_b = populate_repeat_info(
                record, bam, variant_length, reference_pyfasta, 2)
            post_tn_c, post_tn_b = populate_repeat_info(
                record, bam, variant_length, reference_pyfasta, 3)
            post_homopolymer_counts.append(post_hp_c)
            post_homopolymer_bases.append(post_hp_b)
            post_dinucleotide_counts.append(post_dn_c)
            post_dinucleotide_bases.append(post_dn_b)
            post_trinucleotide_counts.append(post_tn_c)
            post_trinucleotide_bases.append(post_tn_b)

    # All six lists grow together, so checking one is enough.
    if post_homopolymer_counts:
        record.INFO['POSTHPC'] = post_homopolymer_counts
        record.INFO['POSTHPB'] = post_homopolymer_bases
        record.INFO['POSTDNC'] = post_dinucleotide_counts
        record.INFO['POSTDNB'] = post_dinucleotide_bases
        record.INFO['POSTTNC'] = post_trinucleotide_counts
        record.INFO['POSTTNB'] = post_trinucleotide_bases

    (counts, mean_mapqs, bc_qual_string, molecule_differences, _,
     rescue) = tk_bam.get_allele_read_info(
         chrom, pos, ref, alleles, 30, -1, args.min_mapq_attach_bc,
         args.default_indel_qual, bam, reference_pyfasta)

    tk_io.set_record_barcodes(record, bc_qual_string)

    # numpy.mean of an empty sequence is NaN; store -1 in that case.
    mean_molecule_difference = numpy.mean(molecule_differences[1])
    if math.isnan(mean_molecule_difference):
        mean_molecule_difference = -1
    record.INFO['MMD'] = mean_molecule_difference

    record.INFO['MUMAP_REF'] = mean_mapqs[0]
    record.INFO['MUMAP_ALT'] = mean_mapqs[1:]
    record.INFO['RO'] = counts[0]
    record.INFO['AO'] = counts[1:]
    # numpy.sum() over a generator is deprecated and merely falls back to
    # the builtin sum, so call the builtin directly.
    record.INFO['RESCUED'] = sum(numpy.sum(x) for x in rescue)
    record.INFO['NOT_RESCUED'] = numpy.sum(
        [numpy.sum([1 - z for z in x]) for x in rescue])
Example #10
0
def load_variant_barcode_phasing_info(record, fragment_barcode_info):
    """Collect per-barcode values for fragments overlapping a variant.

    Rows are fetched from ``fragment_barcode_info`` over
    ``[pos, pos + max_length + 1)`` and split on tabs. Column 6 is used
    as the barcode, column 3 as the fragment's phase set, and columns
    7-9 are stored as a tuple of three floats. Rows whose phase set
    differs from the record's ``PS`` are skipped, unless the record has
    no ``PS`` (default -1).

    Returns:
        dict mapping barcode -> (float, float, float). A barcode that
        appears twice trips an assertion.
    """
    chrom = tk_io.get_record_chrom(record)
    start = tk_io.get_record_pos(record)
    stop = start + tk_io.get_record_max_length(record)
    barcode_info = {}
    variant_ps = int(get_data(record.samples[0].data, "PS", -1))

    rows = tk_tabix.tabix_safe_fetch(fragment_barcode_info, chrom, start,
                                     stop + 1)
    for row in rows:
        fields = row.strip("\n").split("\t")
        barcode = fields[6]
        fragment_ps = int(fields[3])
        if variant_ps != -1 and fragment_ps != variant_ps:
            continue
        assert barcode not in barcode_info
        barcode_info[barcode] = tuple(float(fields[col]) for col in (7, 8, 9))
    return barcode_info
    def test_basic(self):
        """Run the stage and spot-check its VCF output at known positions.

        Checks barcodes, alt/ref allele counts and the POSTHPC annotation
        on records at hard-coded positions in ``default.vcf``.
        """
        self.run_stage(self.args)

        # Load the output file
        vfr = tk_io.VariantFileReader(os.path.join(job_dir, "default.vcf"))
        for r in vfr.record_getter():
            pos = tk_io.get_record_pos(r)
            barcodes = tk_io.get_record_barcodes(r)
            if pos == 26357747 or pos == 26357748:
                # Single-argument print(...) prints the same on Python 2
                # and also parses on Python 3.
                print(barcodes)
                assert barcodes[1][0] == '1-ATAGGAGTTCAGGG_63'
                print(tk_io.get_record_alt_allele_counts(r))
                assert int(tk_io.get_record_alt_allele_counts(r)[0]) in [31, 32]
                assert int(tk_io.get_record_ref_allele_count(r)) == 0
            if pos == 26501280:
                print(barcodes)
                assert barcodes[0][0] == '1-TGAAGACATAACCC_61_61'
                assert int(r.INFO['POSTHPC'][0]) == 10
Example #12
0
    def test_call_haps(self):
        """Call haplotypes, then check phasing of the two test variants.

        Writes ``OUTPUT_VCF`` and ``OUTPUT_TSV`` via ``self.p.call_haps``,
        re-reads the VCF, requires every record to be phased, and accepts
        either of the two mirror-image haplotype assignments at
        ('chr1', 2) and ('chr1', 3).
        """
        # Context managers close all three handles even if call_haps
        # raises; the template handle was previously never closed.
        with open(OUTPUT_VCF, 'w') as out_vcf:
            with open(SNP_INPUT_VCF, 'r') as template_vcf:
                vfw = VariantFileWriter(out_vcf, template_file=template_vcf)
                with open(OUTPUT_TSV, 'w') as out_bc_haps:
                    self.p.call_haps(vfw, out_bc_haps)

        vfr = VariantFileReader(OUTPUT_VCF)
        hap_calls = {}
        for record in vfr.record_getter():
            chrom = tk_io.get_record_chrom(record)
            pos = tk_io.get_record_pos(record) - 1
            genotype, phased = tk_io.get_record_genotype_phased(record)
            hap_calls[(chrom, pos)] = genotype
            self.assertTrue(phased)

        print(hap_calls)
        # Haplotype labels are arbitrary, so either orientation passes.
        self.assertTrue((hap_calls[('chr1', 2)] == [1, 2]
                         and hap_calls[('chr1', 3)] == [1, 0])
                        or (hap_calls[('chr1', 2)] == [2, 1]
                            and hap_calls[('chr1', 3)] == [0, 1]))
Example #13
0
    def __init__(self, current_phase_set, record):
        """Capture locus, alleles, phasing state and haplotype of a record.

        Sets ``chrom``, ``pos``, ``key`` (the ``(chrom, pos)`` tuple),
        ``ref``, ``filters``, ``phased``, ``phase_set``, ``hap`` (the two
        allele strings selected by the genotype) and ``record``.
        """
        chrom = tk_io.get_record_chrom(record)
        pos = tk_io.get_record_pos(record)
        self.chrom = chrom
        self.pos = pos
        self.key = (chrom, pos)

        self.ref = tk_io.get_record_ref(record)
        self.filters = tk_io.get_record_passes_filters(record)

        # Index 0 is the reference allele, followed by the alt alleles.
        allele_table = [self.ref] + tk_io.get_record_alt_alleles(record)

        gt, self.phased = tk_io.get_record_genotype_phased(record)
        first_idx, second_idx = gt[0], gt[1]

        # homozygous calls are always treated as phased
        if first_idx == second_idx:
            self.phased = True

        # note -- if there are two alts, this will just pick one.
        self.phase_set = current_phase_set
        self.hap = (allele_table[first_idx], allele_table[second_idx])

        self.record = record
Example #14
0
def populate_repeat_info(record, bam, variant_length, reference_pyfasta, length):
    """Measure the tandem repeat that follows a variant in the reference.

    Reads up to 30 reference bases starting at index ``pos + 1`` (fewer
    near the end of the contig), upper-cases them, and counts how many
    consecutive ``length``-sized chunks equal the first chunk.

    Args:
        record: variant record; supplies chromosome and position.
        bam: unused here; kept for interface compatibility.
        variant_length: unused here; kept for interface compatibility.
        reference_pyfasta: mapping from chromosome name to a sliceable
            sequence.
        length: repeat unit size in bases (1 = homopolymer, 2 =
            dinucleotide, ...).

    Returns:
        ``(count, unit)``: the number of consecutive copies of the first
        chunk and that chunk itself, or ``(0, None)`` when no bases are
        available after the variant.
    """
    chrom = tk_io.get_record_chrom(record)
    pos = tk_io.get_record_pos(record)

    # Never read past the end of the contig.
    gap = min(30, len(reference_pyfasta[chrom]) - pos - 1)
    sequence = reference_pyfasta[chrom][(pos + 1):(pos + gap + 1)].upper()

    post_poly_count = 0
    post_poly_base = None
    # from the base after the indel to the end of the gap
    for offset in range(0, gap, length):
        unit = sequence[offset:offset + length]
        if post_poly_base is None:
            # The first chunk defines the repeat unit.
            post_poly_base = unit
            post_poly_count = 1
        elif unit == post_poly_base:
            post_poly_count += 1
        else:
            break
    return post_poly_count, post_poly_base
Example #15
0
 def get_record_data(record):
     """Return (phase set, chromosome, position minus one) for a record."""
     return (
         tk_io.get_record_phase_set(record),
         tk_io.get_record_chrom(record),
         tk_io.get_record_pos(record) - 1,
     )