Beispiel #1
0
 def test_match_to_gt_haps(self):
     """Check match_to_gt_haps on the bundled phasing test VCFs.

     The ground-truth VCF is first matched against itself, then against a
     separately phased VCF. For windows that produce a result, the first
     element of the result is compared with the expected value; the other
     windows are expected to produce ``None``.
     """
     gt_vcf_in = tk_io.VariantFileReader(
         os.path.join(TEST_FILE_DIR, 'phasing_gt.vcf.gz'))

     # Ground truth matched against itself.
     res = match_to_gt_haps(gt_vcf_in, gt_vcf_in, 'chr1', 0, 31)
     self.assertEqual(res[0], 0)
     res = match_to_gt_haps(gt_vcf_in, gt_vcf_in, 'chr1', 500, 531)
     # unittest assertions are used throughout: a bare ``assert`` is removed
     # under ``python -O`` and reports no detail on failure.
     self.assertIsNone(res)

     # Ground truth matched against the phased call set.
     vcf_in = tk_io.VariantFileReader(
         os.path.join(TEST_FILE_DIR, 'phasing.vcf.gz'))
     res = match_to_gt_haps(gt_vcf_in, vcf_in, 'chr1', 0, 31)
     self.assertEqual(res[0], 0)
     res = match_to_gt_haps(gt_vcf_in, vcf_in, 'chr1', 31, 61)
     self.assertEqual(res[0], 1)
     res = match_to_gt_haps(gt_vcf_in, vcf_in, 'chr1', 61, 120)
     self.assertIsNone(res)
Beispiel #2
0
def main(args, outs):
    """Phase, or pass through, the variants of one locus and write a sorted VCF.

    Variants are read from ``args.input`` for the locus ``args.locus``. When
    ``args.do_phasing`` is true a ``Phaser`` is built from the stage
    parameters and ``call_haps`` writes to the VCF writer and the fragment
    output file; otherwise ``pass_variants`` copies the records with
    ``strip_phasing_info=True``. The uncompressed VCF is finally handed to
    ``tk_tabix.sort_unique_tabix_vcf``.

    Args:
        args: stage arguments (input, fragments, locus, phasing parameters,
            vc_precalled, vc_mode, do_phasing).
        outs: stage outputs; ``outs.default`` and ``outs.fragment_phasing``
            are paths whose trailing ``.gz`` (if any) is dropped to obtain
            the names of the uncompressed files written here.
    """
    def without_gz_suffix(path):
        # str.strip('.gz') treats its argument as a SET of characters and
        # removes them from both ends, so './x.vcf.gz' became '/x.vcf' and
        # 'phasing.gz' became 'phasin'. Only a literal '.gz' suffix is meant.
        return path[:-len('.gz')] if path.endswith('.gz') else path

    args.coerce_strings()
    outs.coerce_strings()
    input_vfr = tk_io.VariantFileReader(args.input)
    chrom, start, stop = tk_io.get_locus_info(args.locus)

    vcf_path = without_gz_suffix(outs.default)
    fragment_path = without_gz_suffix(outs.fragment_phasing)
    vc_mode, _, _, _ = tk_io.get_vc_mode(args.vc_precalled, args.vc_mode)

    # Add the component name and the version of the phasing code
    new_source = "10X/pipelines/stages/snpindels/phase_snpindels %s" % martian.get_pipelines_version(
    )
    new_filters = [
        ("10X_PHASING_INCONSISTENT",
         "Uses haplotype information from the fragments and the alleles to filter some variants that are not consistent with phasing."
         ),
        ("10X_HOMOPOLYMER_UNPHASED_INSERTION",
         "Unphased insertions in homopolymer regions tend to be false positives"
         )
    ]
    new_formats = [
        ("PS", 1, "Integer", "ID of Phase Set for Variant"),
        ("PQ", 1, "Integer",
         "Phred QV indicating probability at this variant is incorrectly phased"
         ),
        ("JQ", 1, "Integer",
         "Phred QV indicating probability of a phasing switch error in gap prior to this variant"
         ),
    ]

    # All three handles are closed even if phasing raises; the template handle
    # stays open for the whole write because it is not visible here when the
    # writer reads it.
    with open(vcf_path, 'w') as output_file, \
            open(fragment_path, 'w') as fragment_output_file, \
            open(args.input) as template_file:
        vfw = tk_io.VariantFileWriter(output_file,
                                      template_file=template_file,
                                      new_source=new_source,
                                      new_format_fields=new_formats,
                                      new_filters=new_filters)
        if args.do_phasing:
            phaser = Phaser(input_vfr, args.fragments, chrom, start, stop,
                            args.bc_mix_prob, args.min_junction_hap_conf,
                            args.min_var_hap_conf, args.hap_block_buffer_size,
                            args.hap_block_size, args.max_reassign_rounds,
                            vc_mode)
            phaser.call_haps(vfw, fragment_output_file)
        else:
            pass_variants(input_vfr,
                          vfw,
                          chrom,
                          start,
                          stop,
                          strip_phasing_info=True)

    tk_tabix.sort_unique_tabix_vcf(vcf_path)
Beispiel #3
0
def join_simple(vcf_left, vcf_right, vcf_out, ff_left, ff_right, ff_out,
                loc_left, loc_right):
    """Stitch two neighbouring VCF/fragment chunks into one sorted VCF.

    The fragment tables ``ff_left`` and ``ff_right`` are loaded with the
    ``frag_col_names`` columns, the ``.gz`` versions of both VCFs are opened
    for reading, and ``stitch`` writes the combined records to ``vcf_out``
    using the left VCF as the header template. ``vcf_out`` is then passed to
    ``tk_tabix.sort_unique_tabix_vcf``.
    """
    def load_fragments(path):
        # Headerless table; chrom is forced to a string categorical.
        table = pandas.read_table(path, header=None, names=frag_col_names)
        table.chrom = table.chrom.astype('str').astype('category')
        return table

    left_gz = vcf_left + ".gz"
    right_gz = vcf_right + ".gz"

    left_fragments = load_fragments(ff_left)
    right_fragments = load_fragments(ff_right)

    left_reader = tk_io.VariantFileReader(left_gz)
    right_reader = tk_io.VariantFileReader(right_gz)

    with open(left_gz, 'r') as header_source:
        with open(vcf_out, 'w') as merged_out:
            writer = tk_io.VariantFileWriter(merged_out,
                                             template_file=header_source)
            stitch(left_reader, right_reader, writer, left_fragments,
                   right_fragments, ff_out, loc_left, loc_right)

    tk_tabix.sort_unique_tabix_vcf(vcf_out)
Beispiel #4
0
def get_phase_blocks_from_vcf(vcf, filter_trivial=True, locus=None):
    """Summarise the phase blocks of a VCF as a DataFrame.

    Records are scanned in file order. Consecutive records that share a phase
    set and chromosome form one block; records whose phase set is ``None``
    are ignored and do not end the current block.

    Args:
        vcf: path passed to ``tk_io.VariantFileReader``.
        filter_trivial: when true, drop blocks of length 0 and blocks whose
            phase set is -1.
        locus: optional ``(chrom, start, end)`` tuple restricting the records
            fetched; ``None`` fetches without restriction.

    Returns:
        DataFrame with columns ``chrom``, ``phase_set``, ``start``, ``end``
        and ``length`` (``end - start``), sorted by chrom and phase_set with
        a fresh index. ``start`` and ``end`` are the record positions minus
        one of the first and last record in the block.
    """
    if locus is not None:
        (loc_chrom, loc_start, loc_end) = locus
    else:
        loc_chrom = loc_start = loc_end = None

    records = tk_io.VariantFileReader(vcf).record_getter(fetch_chrom=loc_chrom,
                                                         fetch_start=loc_start,
                                                         fetch_end=loc_end)

    columns = ['chrom', 'phase_set', 'start', 'end', 'length']
    blocks = []
    # Block being extended: [phase_set, chrom, start, end], or None before the
    # first phased record has been seen.
    current = None

    def close_block(block):
        block_set, block_chrom, block_start, block_end = block
        blocks.append((block_chrom, block_set, block_start, block_end,
                       block_end - block_start))

    for record in records:
        next_set = tk_io.get_record_phase_set(record)
        next_chrom = tk_io.get_record_chrom(record)
        next_pos = tk_io.get_record_pos(record) - 1
        if next_set is None:
            continue
        if current is None:
            current = [next_set, next_chrom, next_pos, next_pos]
        elif current[0] != next_set or current[1] != next_chrom:
            # A new set begins: store the finished one, then start over.
            close_block(current)
            current = [next_set, next_chrom, next_pos, next_pos]
        else:
            # Same set continues; only the end moves.
            current[3] = next_pos

    if current is not None:
        close_block(current)

    pb = pandas.DataFrame(blocks, columns=columns)
    if filter_trivial:
        pb = pb[(pb.length > 0) & (pb.phase_set != -1)]
    pb = pb[columns]

    # DataFrame.sort was removed from pandas in favour of sort_values; keep
    # the old call as a fallback for the old pandas releases this code
    # originally ran on.
    sort_keys = ['chrom', 'phase_set']
    if hasattr(pb, 'sort_values'):
        pb = pb.sort_values(sort_keys)
    else:
        pb = pb.sort(sort_keys)
    return pb.reset_index(drop=True)
Beispiel #5
0
def vcf_record_iter(in_filename, min_snp_qual):
    """Yield the single-ALT SNP records of a VCF that pass a quality cutoff.

    Args:
        in_filename: path passed to ``tk_io.VariantFileReader``.
        min_snp_qual: minimum ``QUAL`` a record must have to be yielded.
            ``None`` disables the quality filter.

    Yields:
        Records from ``record_getter(restrict_type='snp')`` that have exactly
        one ALT allele and a ``QUAL`` of at least ``min_snp_qual``. Records
        with no ``QUAL`` are skipped whenever a cutoff is given.
    """
    in_vcf = tk_io.VariantFileReader(in_filename)
    for record in in_vcf.record_getter(restrict_type='snp'):
        # Only support 1 ALT
        if len(record.ALT) > 1:
            continue
        # Invariant check kept from the original: a record is expected to
        # carry at least one ALT allele.
        assert len(record.ALT) == 1

        # Filter SNP based on call quality. The None cases are spelled out
        # because `None < number` is only defined on Python 2 (where it is
        # always true); this keeps that outcome without relying on it.
        if min_snp_qual is not None and (record.QUAL is None
                                         or record.QUAL < min_snp_qual):
            continue

        yield record
Beispiel #6
0
def sample_by_locus(vcf, locus):
    """Count records, heterozygous SNPs and their barcodes within a locus.

    ``locus`` is a ``(chrom, start, end)`` tuple used to restrict the records
    fetched from ``vcf``.

    Returns a tuple ``(total records, heterozygous SNP records, number of
    barcodes summed over every haplotype of those SNP records)``.
    """
    chrom, start, end = locus
    reader = tk_io.VariantFileReader(vcf)

    n_records = 0
    n_het_snps = 0
    n_het_snp_barcodes = 0
    for rec in reader.record_getter(fetch_chrom=chrom,
                                    fetch_start=start,
                                    fetch_end=end):
        n_records += 1
        if rec.var_type != 'snp' or tk_io.get_record_homozygous(rec):
            continue
        n_het_snps += 1
        for hap_barcodes in tk_io.get_record_barcodes(rec):
            n_het_snp_barcodes += len(hap_barcodes)

    return (n_records, n_het_snps, n_het_snp_barcodes)
Beispiel #7
0
def main(args, outs):
    """Call variants, and/or extract pre-called variants, for one locus.

    A BED file ``outs.default + ".bed"`` is written with the regions to
    process: the target regions from ``args.targets_file`` inside the locus
    if given, otherwise the whole locus. Unless the mode is "precalled",
    those regions are intersected with the result of
    ``get_coverage_regions(args)`` when ``args.high_coverage_excluded_bed``
    is set.

    In "precalled" and "precalled_plus" modes the records of
    ``args.split_input`` within those regions are copied to
    ``outs.precalled``. In every mode except "precalled" the variant caller
    is run when the BED file is non-empty and the chromosome is a primary
    contig.

    Args:
        args: stage arguments (vc_precalled, variant_mode, locus,
            reference_path, targets_file, high_coverage_excluded_bed,
            split_input, input, __mem_gb).
        outs: stage outputs (default, precalled).
    """
    vc_mode, variant_caller, _, gatk_path = tk_io.get_vc_mode(
        args.vc_precalled, args.variant_mode)
    (chrom, start, stop) = tk_io.get_locus_info(args.locus)
    fasta_path = tk_reference.get_fasta(args.reference_path)

    bedfile = outs.default + ".bed"
    regions = Regions()
    if args.targets_file is not None:
        # chrom is rebound by the loop; if the iterator yields nothing it
        # keeps the value obtained from the locus above.
        for (chrom, start,
             end) in tk_io.get_bed_iterator(args.targets_file, args.locus):
            regions.add_region((start, end))
    else:
        regions.add_region((start, stop))

    if (vc_mode !=
            "precalled") and args.high_coverage_excluded_bed is not None:
        regions = regions.intersect(get_coverage_regions(args))

    bed_length = 0
    with open(bedfile, 'w') as bed_writer:
        for (start, end) in regions.get_region_list():
            bed_writer.write("\t".join([chrom, str(start), str(end)]) + "\n")
            bed_length += 1

    if vc_mode == "precalled" or vc_mode == "precalled_plus":
        outs.default = None
        precalled_vars_path = args.split_input
        vcf = tk_io.VariantFileReader(precalled_vars_path)
        # The header template handle is now closed instead of being leaked.
        with open(outs.precalled, "w") as file_write, \
                open(precalled_vars_path) as template_in:
            output = tk_io.VariantFileWriter(file_write,
                                             template_file=template_in)
            for record in tk_io.get_variant_iterator_pos(
                    vcf, bedfile, args.locus):
                output.write_record(record)

    if vc_mode != "precalled":
        # NOTE(review): in "precalled_plus" mode outs.default was set to None
        # just above and is still passed as the caller's output path here,
        # and outs.precalled is cleared after having been written. Behaviour
        # is kept as-is -- confirm this is intended.
        outs.precalled = None
        primary_contigs = tk_reference.load_primary_contigs(
            args.reference_path)
        if bed_length > 0 and chrom in primary_contigs:
            vc.run_variant_caller(variant_caller, gatk_path, args.__mem_gb,
                                  fasta_path, args.input, outs.default,
                                  bedfile)
Beispiel #8
0
def _genotype_is_unsupported(sample):
    """Return True when the sample's genotype is missing or contains '.'.

    Only the first two alleles are inspected. Any error while reading the
    genotype is treated as "unsupported" (best effort).
    """
    try:
        alleles = sample.gt_alleles
        if len(alleles) == 0:
            return True
        # Compare by value: `allele is '.'` tested object identity, which is
        # not guaranteed to hold for equal strings.
        return any(allele == '.' for allele in alleles[:2])
    except Exception:
        return True


def canonicalize(filename, output_name):
    """Normalise a VCF and flag records whose genotype cannot be used.

    Steps, each writing an intermediate file next to ``output_name``:
      1. ``vcfallelicprimitives`` -> ``output_name + "tmp.vcf"``
      2. ``bcftools filter`` (no filter expression; used only to repair the
         INFO formatting) -> ``output_name + "tmp2.vcf"``
      3. every record is copied to ``output_name + "tmp3.vcf"``; records with
         a missing or '.' genotype get FILTER set to UNSUPPORTED_GENOTYPE
      4. ``tk_tabix.sort_vcf`` sorts step 3 into ``output_name``.

    Args:
        filename: path of the input VCF.
        output_name: path of the final sorted VCF; also the prefix of the
            intermediate files.
    """
    primitives_path = output_name + "tmp.vcf"
    repaired_path = output_name + "tmp2.vcf"
    flagged_path = output_name + "tmp3.vcf"

    with open(primitives_path, 'w') as canon_file:
        # Left-align indels and output variants as constituent indels
        tenkit.log_subprocess.check_call([
            'vcfallelicprimitives', '--keep-info', '-t', 'VCFALLELICPRIMITIVE',
            filename
        ],
                                         stdout=canon_file)

    with open(repaired_path, 'w') as fixed_vcf:
        # vcfallelicprimitives corrupts some INFO tags, turning KEY=.; into
        # KEY=; which is invalid. bcftools repairs this; nothing is actually
        # meant to be filtered here.
        tenkit.log_subprocess.check_call(
            ['bcftools', 'filter', primitives_path], stdout=fixed_vcf)

    unsupported_genotype_filter = [(
        "UNSUPPORTED_GENOTYPE",
        "If genotype field contains '.' we assume that this is due to making a single sample vcf from a multiple sample vcf in which this sample does not contain the variant."
    )]
    tenx = ('TENX', '0', 'Flag', "called by 10X", None, None)

    # The header template handle is closed with the output instead of leaking.
    with open(flagged_path, 'w') as unphased_file, \
            open(repaired_path) as template_in:
        vcf_in = tk_io.VariantFileReader(repaired_path)
        vcf_out = tk_io.VariantFileWriter(
            unphased_file,
            template_file=template_in,
            new_info_fields=[tenx],
            new_filters=unsupported_genotype_filter)
        for record in vcf_in.record_getter():
            if _genotype_is_unsupported(record.samples[0]):
                record.FILTER = ["UNSUPPORTED_GENOTYPE"]
            vcf_out.write_record(record)

    tk_tabix.sort_vcf(flagged_path, output_name)
    def test_basic(self):
        """Run the stage and spot-check barcodes and counts at known positions."""
        self.run_stage(self.args)

        # Load the output file
        vfr = tk_io.VariantFileReader(os.path.join(job_dir, "default.vcf"))
        for r in vfr.record_getter():
            pos = tk_io.get_record_pos(r)
            barcodes = tk_io.get_record_barcodes(r)
            if pos in (26357747, 26357748):
                # print(x) with a single argument prints the same thing under
                # Python 2 and also parses under Python 3.
                print(barcodes)
                assert barcodes[1][0] == '1-ATAGGAGTTCAGGG_63'
                print(tk_io.get_record_alt_allele_counts(r))
                assert int(tk_io.get_record_alt_allele_counts(r)[0]) in [31, 32]
                assert int(tk_io.get_record_ref_allele_count(r)) == 0
            if pos == 26501280:
                print(barcodes)
                assert barcodes[0][0] == '1-TGAAGACATAACCC_61_61'
                assert int(r.INFO['POSTHPC'][0]) == 10
Beispiel #10
0
def main(args, outs):
    """Attach barcode and annotation fields to one chunk of variants.

    Input selection:
      * ``args.precalled_chunk`` is a non-empty file: pre-called mode. If
        ``args.precalled_no_chunk`` is set the function returns immediately.
        Otherwise the chunk is canonicalized and rewritten so each record
        keeps only the whitelisted INFO keys and a GT-only sample; a missing
        QUAL becomes 0 and a bad record aborts through ``martian.exit``.
      * else ``args.chunk_input`` is a non-empty file: called mode; the chunk
        is canonicalized.
      * otherwise ``outs.default`` is set to ``None`` and the function
        returns.

    Every record is then written with the additional header fields;
    ``populate_fields`` is applied to records that pass their filters. In
    called mode the result is optionally merged with ``args.haploid_merge``
    and each entry of ``VARIANT_CALL_FILTER`` is applied as a bcftools soft
    filter before the file is moved to ``outs.default``.

    Args:
        args: stage arguments (precalled_chunk, precalled_no_chunk,
            chunk_input, reference_path, bam, haploid_merge, locus).
        outs: stage outputs; ``outs.default`` is the final VCF path and the
            prefix of the temporary files.
    """
    def real_file(f):
        return f is not None and os.path.isfile(f) and os.path.getsize(f) > 0

    precalled = False
    called = False
    if real_file(args.precalled_chunk):
        precalled = True
        if args.precalled_no_chunk:
            return
    elif real_file(args.chunk_input):
        called = True

    if called:
        input_vcf = "canonicalized.vcf"
        canonicalize(args.chunk_input, input_vcf)
    elif precalled:
        canonicalize(args.precalled_chunk, "canonicalized.vcf")
        input_vcf = outs.default + "tmp.vcf"
        vcf_reader = tk_io.VariantFileReader("canonicalized.vcf")
        with open(input_vcf, "w") as file_write, \
                open("canonicalized.vcf") as template_in:
            output = tk_io.VariantFileWriter(file_write,
                                             template_file=template_in)
            for record in vcf_reader.record_getter():
                (bad, message) = tk_io.is_record_bad_variant(record)
                if bad:
                    try:
                        record_strified = str(record)
                    except Exception:
                        record_strified = "<error displaying record>"
                    martian.exit("error on vcf record: " + record_strified +
                                 ", " + message)
                if record.QUAL is None:
                    record.QUAL = 0
                record.INFO = {
                    key: record.INFO.get(key)
                    for key in VCF_WHITE_LIST_INFO_FIELDS
                    if key in record.INFO
                }
                # Reduce the sample to a GT-only call; "./." stands in when
                # the record carries no GT value.
                sample_call = tk_io.get_record_sample_call(record)
                genotype = sample_call.data._asdict().get("GT", "./.")
                data_instantiator = vcf.model.make_calldata_tuple(["GT"])
                sample_call.data = data_instantiator(genotype)
                record.samples[0] = sample_call
                record.FORMAT = "GT"
                output.write_record(record)
    else:
        outs.default = None
        return

    reference_pyfasta = tenkit.reference.open_reference(args.reference_path)

    chunk_record_size = 10000

    vcf_reader = tk_io.VariantFileReader(input_vcf)

    # Check whether to write GL field. The handle is closed right away.
    with open(input_vcf) as header_in:
        write_gl = 'GL' in vcf.Reader(header_in).formats

    bam = tk_bam.create_bam_infile(args.bam)
    outfile = outs.default
    if not precalled:
        outfile = outs.default + "tmp2.vcf"

    new_source = "10X/pipelines/stages/snpindels/attach_bcs_snpindels %s" % martian.get_pipelines_version()
    new_formats = [("BX", ".", "String", "Barcodes and Associated Qual-Scores Supporting Alleles")]
    AO = ('AO', '.', 'Integer', 'Alternate allele observed count', None, None)
    RO = ('RO', '1', 'Integer', 'Reference allele observed count', None, None)
    post_homopolymer_counts_field = ('POSTHPC', '.', 'Integer', 'Postvariant homopolymer count', None, None)
    post_homopolymer_base_field = ('POSTHPB', '.', 'Character', 'Postvariant homopolymer base', None, None)
    post_dinucleotide_base_field = ('POSTDNB', '.', 'String', 'Post variant dinucleotide repeat sequence', None, None)
    post_dinucleotide_counts_field = ('POSTDNC', '.', 'Integer', 'Post variant dinucleotide repeat count', None, None)
    post_trinucleotide_base_field = ('POSTTNB', '.', 'String', 'Post variant trinucleotide repeat sequence', None, None)
    post_trinucleotide_counts_field = ('POSTTNC', '.', 'Integer', 'Post variant trinucleotide repeat count', None, None)
    mean_map_alt = ('MUMAP_ALT', '.', 'Float', 'Mean mapping scores of alt alleles', None, None)
    mean_map_ref = ('MUMAP_REF', '1', 'Float', 'Mean mapping score of ref allele', None, None)
    rescued = ('RESCUED', '.', 'Integer', 'How many reads were rescued via cross barcode mapq correction', None, None)
    not_rescued = ('NOT_RESCUED', '.', 'Integer', 'How many reads were not rescued via cross barcode mapq correction', None, None)
    mean_molecule_difference = ("MMD", '.', 'Float', 'Mean molecule divergence from reference per read', None, None)
    haplocalled = ("HAPLOCALLED", "1", "Integer", "1 for variants that were called after phasing via splitting the bam into its component haplotypes and calling variants in haploid mode", None, None)
    extra_fields = [post_homopolymer_counts_field, post_homopolymer_base_field, mean_map_ref, mean_map_alt, AO, RO, mean_molecule_difference, rescued, not_rescued,
                    post_dinucleotide_base_field, post_dinucleotide_counts_field, post_trinucleotide_base_field, post_trinucleotide_counts_field, haplocalled]

    with open(outfile, "w") as file_write, open(input_vcf) as template_in:
        output = tk_io.VariantFileWriter(file_write, template_file=template_in, new_source=new_source, new_info_fields=extra_fields,
                                         new_format_fields=new_formats)

        def flush(records):
            # Populate every passing record of the batch first, then write
            # the whole batch in its original order.
            for record_out in records:
                if tk_io.get_record_passes_filters(record_out):
                    populate_fields(record_out, bam, reference_pyfasta, args)
            for record_out in records:
                output.write_record(record_out)

        record_list = []
        for record in vcf_reader.record_getter():
            if len(record_list) == chunk_record_size:
                flush(record_list)
                record_list = []
            record_list.append(record)
        flush(record_list)

    if not precalled and args.haploid_merge is not None:
        output_filename = outs.default + "tmp3.vcf"
        merge_haploid(outfile, args.haploid_merge, args.locus, output_filename, bam, reference_pyfasta, args, write_gl)
        outfile = output_filename

    # Filter variants
    if not precalled:
        last_file = outfile
        scratch_file = outs.default + "tmp.vcf"
        filtered_file = outs.default + "tmp2.vcf"
        for filter_name in VARIANT_CALL_FILTER:
            filter_expression = VARIANT_CALL_FILTER[filter_name]
            with open(scratch_file, 'w') as output_file:
                tenkit.log_subprocess.check_call(['bcftools', 'filter', '-O', 'v', '--soft-filter', filter_name, '-e', filter_expression, '-m', '\'+\'', last_file], stdout=output_file)
            # The result is written to a scratch file first because
            # last_file may be the very file it is renamed to.
            tenkit.log_subprocess.check_call(['mv', scratch_file, filtered_file])
            last_file = filtered_file

        tenkit.log_subprocess.check_call(['mv', last_file, outs.default])
Beispiel #11
0
def check_concordance(pb_left,
                      pb_right,
                      vcf,
                      max_allowable_overhang_length=1000,
                      max_allowable_overhang_variants=2,
                      filter_trivial=True):
    '''
    Compare two sets of phase blocks - what's shared and what's different.
    The max_allowable_overhang parameters allow one to ignore minor differences.

    pb_left and pb_right are read through their chrom, phase_set, start and
    end columns. The two sets are outer-joined on (chrom, phase_set).

    vcf is only opened when at least one shared block exceeds
    max_allowable_overhang_length; such blocks stay discordant only if
    too_many_overhang_variants(row, reader, max_allowable_overhang_variants)
    is true for them.

    With filter_trivial, merged rows are kept only when end - start > 1 on
    at least one side.

    Returns a 4-tuple of DataFrames:
    (shared concordant, shared discordant, left only, right only).
    '''
    # hack: change chrom names to numbers to get around pandas 0.15.2 bug when merging categoricals
    # (see https://github.com/pydata/pandas/issues/9426)
    all_chroms = pandas.Categorical(
        pb_left.chrom.append(pb_right.chrom).drop_duplicates())
    chrom_left_codes = pandas.Categorical(
        pb_left.chrom).set_categories(all_chroms).codes
    chrom_right_codes = pandas.Categorical(
        pb_right.chrom).set_categories(all_chroms).codes

    # Per-side frames keyed by the numeric chrom code; start/end are renamed
    # per side so both survive the merge.
    pb_left_tmp = pandas.DataFrame({
        'chrom': chrom_left_codes,
        'phase_set': pb_left.phase_set,
        'start_left': pb_left.start,
        'end_left': pb_left.end
    })
    pb_right_tmp = pandas.DataFrame({
        'chrom': chrom_right_codes,
        'phase_set': pb_right.phase_set,
        'start_right': pb_right.start,
        'end_right': pb_right.end
    })

    # Outer join: a phase set present on one side only gets nulls in the
    # other side's start/end columns.
    pb_merged = pandas.merge(pb_left_tmp,
                             pb_right_tmp,
                             on=['chrom', 'phase_set'],
                             how='outer')

    if filter_trivial:
        # NOTE(review): reset_index() is called without drop=True, so the
        # previous index is presumably kept as an extra column in the
        # returned frames -- confirm callers expect that.
        pb_merged = pb_merged[(pb_merged.end_right - pb_merged.start_right > 1)
                              | (pb_merged.end_left -
                                 pb_merged.start_left > 1)].reset_index()

    # map codes back to chrom names
    pb_chroms = pandas.Categorical.from_codes(pb_merged.chrom,
                                              all_chroms.categories)
    pb_merged.chrom = pb_chroms

    # split into shared/non-shared phase sets
    pb_merged_shared = pb_merged[pb_merged.start_left.notnull()
                                 & pb_merged.start_right.notnull()]
    pb_merged_left_only = pb_merged[pb_merged.start_right.isnull()]
    pb_merged_right_only = pb_merged[pb_merged.start_left.isnull()]

    # look at overlaps in the shared sets
    start_max = pb_merged_shared.start_left.combine(
        pb_merged_shared.start_right, max)
    end_min = pb_merged_shared.end_left.combine(pb_merged_shared.end_right,
                                                min)
    # Overlap length, floored at 0 for blocks that do not overlap.
    # NOTE(review): the arithmetic below subtracts the result of map() from a
    # Series, which needs map() to return a list (Python 2) -- confirm before
    # running this on Python 3.
    overlap = map(lambda x: max(x, 0), end_min - start_max)
    # Combined length of both blocks minus twice the overlap.
    non_overlap = (pb_merged_shared.end_left - pb_merged_shared.start_left) + (
        pb_merged_shared.end_right - pb_merged_shared.start_right) - map(
            lambda x: 2 * x, overlap)
    discord = (non_overlap > max_allowable_overhang_length)

    pb_merged_shared_concordant = pb_merged_shared[~discord]
    pb_merged_shared_discordant = pb_merged_shared[discord]

    # ignore cases with only a few overhang variants (probably due to off-by-one errors)
    # Rows dropped here are not added to the concordant set.
    if len(pb_merged_shared_discordant) > 0:
        vfr = tk_io.VariantFileReader(vcf)
        pb_merged_shared_discordant = pb_merged_shared_discordant[
            pb_merged_shared_discordant.apply(
                lambda x: too_many_overhang_variants(
                    x, vfr, max_allowable_overhang_variants),
                axis=1)]

    return (pb_merged_shared_concordant, pb_merged_shared_discordant,
            pb_merged_left_only, pb_merged_right_only)
Beispiel #12
0
def mk_var_tuples(fn, targets, locus):
    """Return ``variant_tuples`` of the records of VCF ``fn``.

    The records come from ``tk_io.get_variant_iterator_pos`` called with a
    reader on ``fn`` and the given ``targets`` and ``locus``.
    """
    return variant_tuples(
        tk_io.get_variant_iterator_pos(tk_io.VariantFileReader(fn), targets,
                                       locus))
Beispiel #13
0
def main(args, outs):
    """Annotate variants with the barcode-aware haplotype filter.

    Every record of ``args.variants`` within ``args.locus`` is written to the
    uncompressed output VCF. Records that pass their existing filters, have
    two differing genotype alleles and are not on "chrM" are checked with
    ``barcode_aware_filter``; when that check fails and the variant type is
    "S", BARCODE_AWARE_FILTER is appended to the record's FILTER list,
    provided the mode is "call", or "precalled_plus" with a TENX INFO entry.

    Nothing is written when the locus is mitochondrial, or when it is on
    chromosome Y and ``args.sex`` is female.

    Args:
        args: stage arguments (vc_precalled, vc_mode, locus, sex,
            fragment_phasing, variants).
        outs: stage outputs; ``outs.default`` is the output path, whose
            trailing ``.gz`` (if any) is dropped for the file written here.
    """
    def without_gz_suffix(path):
        # str.strip(".gz") treats its argument as a SET of characters and
        # removes them from both ends, so "./x.vcf.gz" became "/x.vcf". Only
        # a literal ".gz" suffix is meant.
        return path[:-len(".gz")] if path.endswith(".gz") else path

    vc_mode, _, _, _ = tk_io.get_vc_mode(args.vc_precalled, args.vc_mode)

    (chrom, _, _) = tk_io.get_locus_info(args.locus)
    chrom = str(chrom)

    if chrom in ['chrM', 'MT', 'M'] or (args.sex.lower() in ["f", "female"]
                                        and chrom in ["chrY", "Y"]):
        return

    fragment_barcode_info = pysam.Tabixfile(args.fragment_phasing)
    AH_0_BH_0 = (
        'AH_0_BH_0', '1', 'Integer',
        'Number of barcodes that have been called as supporting haplotype 0 which are on reads that have support for the allele which has been phased as haplotype 0'
    )
    AH_1_BH_1 = (
        'AH_1_BH_1', '1', 'Integer',
        'Number of barcodes that have been called as supporting haplotype 1 which are on reads that have support for the allele which has been phased as haplotype 1'
    )
    AH_0_BH_1 = (
        'AH_0_BH_1', '1', 'Integer',
        'Number of barcodes that have been called as supporting haplotype 0 which are on reads that have support for the allele which has been phased as haplotype 1'
    )
    AH_1_BH_0 = (
        'AH_1_BH_0', '1', 'Integer',
        'Number of barcodes that have been called as supporting haplotype 1 which are on reads that have support for the allele which has been phased as haplotype 0'
    )
    BX_HAP_OR = (
        'BX_HAP_OR', '1', 'Float',
        "Barcode aware haplotype filtering score (log odds ratio currently)")
    BARCODE_AWARE_FILTER = [(
        "BARCODE_AWARE_FILTER",
        "Uses haplotype information from the fragments and the alleles to filter some variants that are not consistent with haplotype (ie variants should have most of their allele haplotype 0 alleles coming from barcodes whose fragments are haplotype 0 etc)"
    )]
    extra_fields = [AH_0_BH_0, AH_1_BH_1, AH_0_BH_1, AH_1_BH_0, BX_HAP_OR]

    input_variants = tk_io.VariantFileReader(args.variants)
    # The header template handle is closed with the output instead of leaking.
    with open(without_gz_suffix(outs.default), 'w') as output_file, \
            open(args.variants, 'r') as template_in:
        output_variants = tk_io.VariantFileWriter(
            output_file,
            template_file=template_in,
            new_info_fields=extra_fields,
            new_filters=BARCODE_AWARE_FILTER)
        variant_iterator = tk_io.get_variant_iterator_pos(
            input_variants, None, args.locus)
        for record in variant_iterator:
            sample = record.samples[0]
            ref = tk_io.get_record_ref(record)
            alt_alleles = tk_io.get_record_alt_alleles(record)

            # Already-filtered records are copied through untouched.
            if not tk_io.get_record_passes_filters(record):
                output_variants.write_record(record)
                continue

            # Only a genotype with two differing alleles can be checked
            # against haplotypes; a single-allele genotype or a homozygous
            # one is copied through untouched.
            if len(sample.gt_alleles) <= 1 or (int(sample.gt_alleles[0]) ==
                                               int(sample.gt_alleles[1])):
                output_variants.write_record(record)
                continue

            chrom = tk_io.get_record_chrom(record)
            if chrom != "chrM":
                variant_barcode_info = load_variant_barcode_phasing_info(
                    record, fragment_barcode_info)
                if not barcode_aware_filter(record, variant_barcode_info):
                    if record.FILTER is None:
                        record.FILTER = []
                    if tk_io.get_var_type(ref, alt_alleles[0]) == "S" and (
                        (vc_mode == 'call') or (vc_mode == "precalled_plus"
                                                and "TENX" in record.INFO)):
                        record.FILTER.append("BARCODE_AWARE_FILTER")
            output_variants.write_record(record)