Example #1
def _af_filter(data, in_file, out_file):
    """Soft-filter variants with AF below min_allele_fraction (appends "MinAF" to FILTER)
    """
    min_freq = float(utils.get_in(data["config"], ("algorithm", "min_allele_fraction"), 10)) / 100.0
    logger.debug("Filtering MuTect2 calls with allele fraction threshold of %s" % min_freq)
    ungz_out_file = "%s.vcf" % utils.splitext_plus(out_file)[0]
    if not utils.file_exists(ungz_out_file) and not utils.file_exists(ungz_out_file + ".gz"):
        with file_transaction(data, ungz_out_file) as tx_out_file:
            vcf = VCF(in_file)
            vcf.add_filter_to_header({
                'ID': 'MinAF',
                'Description': 'Allele frequency is lower than %s%% ' % (min_freq*100) + (
                    '(configured in bcbio as min_allele_fraction)'
                    if utils.get_in(data["config"], ("algorithm", "min_allele_fraction"))
                    else '(default threshold in bcbio; override with min_allele_fraction in the algorithm section)')})
            w = Writer(tx_out_file, vcf)
            # GATK 3.x can produce VCFs without sample names for empty VCFs
            try:
                tumor_index = vcf.samples.index(dd.get_sample_name(data))
            except ValueError:
                tumor_index = None
            for rec in vcf:
                if tumor_index is not None and np.all(rec.format('AF')[tumor_index] < min_freq):
                    vcfutils.cyvcf_add_filter(rec, 'MinAF')
                w.write_record(rec)
            w.close()
    return vcfutils.bgzip_and_index(ungz_out_file, data["config"])
Example #2
def _af_filter(data, in_file, out_file):
    """Soft-filter variants with AF below min_allele_fraction (appends "MinAF" to FILTER)
    """
    min_freq = float(
        utils.get_in(data["config"],
                     ("algorithm", "min_allele_fraction"), 10)) / 100.0
    logger.info(
        "Filtering MuTect2 calls with allele fraction threshold of %s" %
        min_freq)
    ungz_out_file = "%s.vcf" % utils.splitext_plus(out_file)[0]
    if not utils.file_exists(ungz_out_file) and not utils.file_exists(
            ungz_out_file + ".gz"):
        with file_transaction(data, ungz_out_file) as tx_out_file:
            vcf = VCF(in_file)
            vcf.add_filter_to_header({
                'ID': 'MinAF',
                'Description': 'Allele frequency is lower than %s%% ' % (min_freq * 100) + (
                    '(configured in bcbio as min_allele_fraction)'
                    if utils.get_in(data["config"], ("algorithm", "min_allele_fraction"))
                    else '(default threshold in bcbio; override with min_allele_fraction in the algorithm section)')})
            w = Writer(tx_out_file, vcf)
            tumor_index = vcf.samples.index(data['description'])
            for rec in vcf:
                if np.all(rec.format('AF')[tumor_index] < min_freq):
                    vcfutils.cyvcf_add_filter(rec, 'MinAF')
                w.write_record(rec)
            w.close()
    return vcfutils.bgzip_and_index(ungz_out_file, data["config"])
Example #3
def mark_missing_sites(vcffile, region, missing_threshold, soft_filter):
    vcf = VCF(vcffile)
    header_param_id = {
        'ID': 'MISSING',
        'Description': 'failed variant site missingness threshold ({} %)'.format(missing_threshold)
    }
    header_param_info = {
        'ID': 'MISSINGPCT',
        'Description': 'site missingness percentage',
        'Type': 'Float',
        'Number': '1'
    }
    vcf.add_filter_to_header(header_param_id)
    vcf.add_info_to_header(header_param_info)
    out = Writer('-', vcf)
    (total_sites, noted_sites) = (0, 0)

    for variant in vcf(region):
        total_sites += 1
        (missing_pct, missing, total) = compute_missingness(variant)
        verdict = variant_missing_criteria(missing_threshold, missing_pct)
        variant = update_variant(variant, verdict, missing_pct)
        if verdict == "pass":
            noted_sites += 1
            out.write_record(variant)
        elif verdict == "fail" and soft_filter:
            out.write_record(variant)

    out.close()
    msg = "After filtering, passed {} out of a possible {} Sites ({})"
    msg = msg.format(noted_sites, total_sites, 'pass')
    print(msg, file=sys.stderr)
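The helpers used above (compute_missingness, variant_missing_criteria, update_variant) are defined elsewhere in that module and are not shown. As a rough, hypothetical sketch only, update_variant could look like the following, assuming it simply writes the INFO and FILTER fields registered in the header above:

def update_variant(variant, verdict, missing_pct):
    # Hypothetical helper, not part of the original snippet.
    variant.INFO["MISSINGPCT"] = float(missing_pct)  # INFO line added via add_info_to_header above
    if verdict == "fail":
        variant.FILTER = ["MISSING"]  # FILTER line added via add_filter_to_header above
    return variant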
Example #4
    def setUp(self):
        # load test data
        # store each variant object into specific variables for tests
        test_directory = os.path.dirname(os.path.abspath(__file__))
        reader = VCF(os.path.join(test_directory, "test.vcf"))
        self.test_filter = refilter.Filter(0.3, 0.7, 'AB', 'VAR_DP', 5, ['MISSING'], ['DB'])
        reader.add_filter_to_header(self.test_filter.filtered_header())
        reader.add_info_to_header(self.test_filter.rescued_header())

        self.variants = [ variant for variant in reader ]
Example #5
def _af_annotate_and_filter(paired, items, in_file, out_file):
    """Populating FORMAT/AF, and dropping variants with AF<min_allele_fraction

    Strelka2 doesn't report exact AF for a variant, however it can be calculated as alt_counts/dp from existing fields:
    somatic
      snps:    GT:DP:FDP:SDP:SUBDP:AU:CU:GU:TU                 dp=DP                {ALT}U[0] = alt_counts(tier1,tier2)
      indels:  GT:DP:DP2:TAR:TIR:TOR:DP50:FDP50:SUBDP50:BCN50  dp=DP                TIR = alt_counts(tier1,tier2)
    germline
      snps:    GT:GQ:GQX:DP:DPF:AD:ADF:ADR:SB:FT:PL(:PS)       dp=sum(alt_counts)   AD = ref_count,alt_counts
      indels:  GT:GQ:GQX:DPI:AD:ADF:ADR:FT:PL(:PS)             dp=sum(alt_counts)   AD = ref_count,alt_counts
    """
    data = paired.tumor_data if paired else items[0]
    min_freq = float(utils.get_in(data["config"], ("algorithm", "min_allele_fraction"), 10)) / 100.0
    logger.debug("Filtering Strelka2 calls with allele fraction threshold of %s" % min_freq)
    ungz_out_file = "%s.vcf" % utils.splitext_plus(out_file)[0]
    if not utils.file_exists(ungz_out_file) and not utils.file_exists(ungz_out_file + ".gz"):
        with file_transaction(data, ungz_out_file) as tx_out_file:
            vcf = VCF(in_file)
            vcf.add_format_to_header({
                'ID': 'AF',
                'Description': 'Allele frequency, as calculated in bcbio: AD/DP (germline), <ALT>U/DP (somatic snps), '
                               'TIR/DPI (somatic indels)',
                'Type': 'Float',
                'Number': '.'})
            vcf.add_filter_to_header({
                'ID': 'MinAF',
                'Description': 'Allele frequency is lower than %s%% ' % (min_freq*100) + (
                    '(configured in bcbio as min_allele_fraction)'
                    if utils.get_in(data["config"], ("algorithm", "min_allele_fraction"))
                    else '(default threshold in bcbio; override with min_allele_fraction in the algorithm section)')})
            w = Writer(tx_out_file, vcf)
            tumor_index = vcf.samples.index(data['description'])
            for rec in vcf:
                if paired:  # somatic?
                    if rec.is_snp:  # snps?
                        alt_counts = rec.format(rec.ALT[0] + 'U')[:,0]  # {ALT}U=tier1_depth,tier2_depth
                    else:  # indels
                        alt_counts = rec.format('TIR')[:,0]  # TIR=tier1_depth,tier2_depth
                    dp = rec.format('DP')[:,0]
                elif rec.format("AD") is not None:  # germline?
                    alt_counts = rec.format('AD')[:,1:]  # AD=REF,ALT1,ALT2,...
                    dp = np.sum(rec.format('AD')[:,0:], axis=1)
                else: # germline gVCF record
                    alt_counts, dp = (None, None)
                if dp is not None:
                    with np.errstate(divide='ignore', invalid='ignore'):  # ignore division by zero and put AF=.0
                        af = np.true_divide(alt_counts, dp)
                        af[~np.isfinite(af)] = .0  # -inf inf NaN -> .0
                    rec.set_format('AF', af)
                    if paired and np.all(af[tumor_index] < min_freq):
                        vcfutils.cyvcf_add_filter(rec, 'MinAF')
                w.write_record(rec)
            w.close()
    return vcfutils.bgzip_and_index(ungz_out_file, data["config"])
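The docstring's AF formula is easy to sanity-check by hand: a somatic SNP with a tumor tier1 {ALT}U count of 8 and DP of 100 gives AF = 8/100 = 0.08, which falls below the default 10% threshold and would receive the MinAF soft filter. A minimal sketch of that arithmetic with the same numpy handling as above (the counts are made up for illustration):

import numpy as np

alt_counts = np.array([8])    # hypothetical tier1 {ALT}U count for the tumor sample
dp = np.array([100])          # hypothetical DP for the tumor sample
with np.errstate(divide='ignore', invalid='ignore'):  # zero-depth samples become AF=0.0
    af = np.true_divide(alt_counts, dp)
    af[~np.isfinite(af)] = .0
print(af)  # [0.08] -> below the default 0.1 threshold, so MinAF would be applied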
Example #6
def _af_annotate_and_filter(paired, items, in_file, out_file):
    """Populating FORMAT/AF, and dropping variants with AF<min_allele_fraction

    Strelka2 doesn't report exact AF for a variant, however it can be calculated as alt_counts/dp from existing fields:
    somatic
      snps:    GT:DP:FDP:SDP:SUBDP:AU:CU:GU:TU                 dp=DP                {ALT}U[0] = alt_counts(tier1,tier2)
      indels:  GT:DP:DP2:TAR:TIR:TOR:DP50:FDP50:SUBDP50:BCN50  dp=DP                TIR = alt_counts(tier1,tier2)
    germline
      snps:    GT:GQ:GQX:DP:DPF:AD:ADF:ADR:SB:FT:PL(:PS)       dp=sum(alt_counts)   AD = ref_count,alt_counts
      indels:  GT:GQ:GQX:DPI:AD:ADF:ADR:FT:PL(:PS)             dp=sum(alt_counts)   AD = ref_count,alt_counts
    """
    data = paired.tumor_data if paired else items[0]
    min_freq = float(utils.get_in(data["config"], ("algorithm", "min_allele_fraction"), 10)) / 100.0
    logger.info("Filtering Strelka2 calls with allele fraction threshold of %s" % min_freq)
    ungz_out_file = "%s.vcf" % utils.splitext_plus(out_file)[0]
    if not utils.file_exists(ungz_out_file) and not utils.file_exists(ungz_out_file + ".gz"):
        with file_transaction(data, ungz_out_file) as tx_out_file:
            vcf = VCF(in_file)
            vcf.add_format_to_header({
                'ID': 'AF',
                'Description': 'Allele frequency, as calculated in bcbio: AD/DP (germline), <ALT>U/DP (somatic snps), '
                               'TIR/DPI (somatic indels)',
                'Type': 'Float',
                'Number': '.'})
            vcf.add_filter_to_header({
                'ID': 'MinAF',
                'Description': 'Allele frequency is lower than %s%% ' % (min_freq*100) + (
                    '(configured in bcbio as min_allele_fraction)'
                    if utils.get_in(data["config"], ("algorithm", "min_allele_fraction"))
                    else '(default threshold in bcbio; override with min_allele_fraction in the algorithm section)')})
            w = Writer(tx_out_file, vcf)
            tumor_index = vcf.samples.index(data['description'])
            for rec in vcf:
                if paired:  # somatic?
                    if rec.is_snp:  # snps?
                        alt_counts = rec.format(rec.ALT[0] + 'U')[:,0]  # {ALT}U=tier1_depth,tier2_depth
                    else:  # indels
                        alt_counts = rec.format('TIR')[:,0]  # TIR=tier1_depth,tier2_depth
                    dp = rec.format('DP')[:,0]
                elif rec.format("AD") is not None:  # germline?
                    alt_counts = rec.format('AD')[:,1:]  # AD=REF,ALT1,ALT2,...
                    dp = np.sum(rec.format('AD')[:,0:], axis=1)
                else: # germline gVCF record
                    alt_counts, dp = (None, None)
                if dp is not None:
                    with np.errstate(divide='ignore', invalid='ignore'):  # ignore division by zero and put AF=.0
                        af = np.true_divide(alt_counts, dp)
                        af[~np.isfinite(af)] = .0  # -inf inf NaN -> .0
                    rec.set_format('AF', af)
                    if paired and np.all(af[tumor_index] < min_freq):
                        vcfutils.cyvcf_add_filter(rec, 'MinAF')
                w.write_record(rec)
            w.close()
    return vcfutils.bgzip_and_index(ungz_out_file, data["config"])
Example #7
def main(min_allele_balance, max_allele_balance, allele_balance_tag,
         variant_sample_depth_tag, min_depth, exclude_filters, exclude_fields,
         vcf):
    reader = VCF(vcf)
    refilter = Filter(min_allele_balance, max_allele_balance,
                      allele_balance_tag, variant_sample_depth_tag, min_depth,
                      exclude_filters, exclude_fields)
    reader.add_filter_to_header(refilter.filtered_header())
    reader.add_info_to_header(refilter.rescued_header())
    writer = Writer('-', reader)

    for variant in reader:
        refilter(variant)  # Modifies variant filter status in place
        writer.write_record(variant)
Example #8
def main():
    parser = argparse.ArgumentParser(__doc__)
    parser.add_argument("--vcf", help="VCF file", type=str, required=True)
    parser.add_argument("--statsfile", help="File with chrom, start, locus stats", type=str, required=True)
    parser.add_argument("--out", help="Prefix for output files", type=str, required=True)
    parser.add_argument("--min-hwep", help="Minimum HWE p-value", type=float, default=0)
    parser.add_argument("--min-callrate", help="Minimum call rate", type=float, default=0)
    parser.add_argument("--min-het", help="Minimum heterozygosity", type=float, default=0)
    parser.add_argument("--max-hrun-offset", help="For periods 5+, discard if the ref has " \
                            "homopolymer run > period+offset", type=int, default=100000)
    parser.add_argument("--filter-segdup", help="Filter loci overlapping a segdup", action="store_true")
    args = parser.parse_args()

    # Get VCF reader
    reader = VCF(args.vcf)

    # Load locus filters
    sys.stderr.write("Getting filters...\n")
    locstats = pd.read_csv(args.statsfile, sep="\t")
    locstats["FILTER"] = locstats.apply(lambda x: GetFilters(x, args, len(reader.samples)), 1)
    locstats.to_csv(args.out + ".tab", sep="\t", index=False)

    # Get filter dictionary
    sys.stderr.write("Getting filter dictionary...\n")
    filter_dict = dict(zip(list(locstats["start"]), list(locstats["FILTER"])))

    # Set filter field
    sys.stderr.write("Setting filter field in VCFs...\n")
    adict = {
        "HWE": "HWE less than %s"%args.min_hwep,
        "Callrate": "Callrate less than %s"%args.min_callrate,
        "Het": "Het less than %s"%args.min_het,
        "Hrun": "Hrun greater than %s"%args.max_hrun_offset,
        "Segdup": "Locus in a segmental duplication",
        "MissingInfo": "No stats provided for the locus",
        }
    for f in adict:
        reader.add_filter_to_header({"ID": f, "Description": adict[f]})
    writer = Writer("/dev/stdout", reader)
    for record in reader:
        filters = filter_dict.get(record.INFO["START"], "MissingInfo")
        if filters != ".":
            record.FILTER = filters.split(";")
        else: record.FILTER = "PASS"
        writer.write_record(record)
    writer.close()
    reader.close()
Example #9
def test_add_filter_to_header():
    v = VCF(VCF_PATH)
    # NOTE that we have to add the filter to the header of the reader,
    # not the writer because the record will be associated with the reader
    v.add_filter_to_header({'ID': 'abcdefg', 'Description': 'abcdefg'})

    f = tempfile.mktemp(suffix=".vcf")
    atexit.register(os.unlink, f)
    w = Writer(f, v)
    rec = next(v)  # use the next() builtin; the Python 2-style .next() method is not available in Python 3

    rec.FILTER = ["abcdefg"]
    w.write_record(rec)
    w.close()

    v = next(VCF(f))
    assert v.FILTER == "abcdefg", v.FILTER
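The NOTE in this test is the central usage rule for cyvcf2 filters: the FILTER must be registered on the reader's header before a Writer is built from it, because written records carry the reader's header. A minimal sketch of that pattern (the paths and filter ID below are placeholders, not taken from any of the examples):

from cyvcf2 import VCF, Writer

vcf = VCF("input.vcf.gz")                    # reader; its header is shared by the records
vcf.add_filter_to_header({"ID": "MY_FILTER",
                          "Description": "example soft filter"})  # add before creating the Writer
w = Writer("output.vcf", vcf)                # writer copies the updated header
for rec in vcf:
    rec.FILTER = ["MY_FILTER"]               # valid because the header now defines MY_FILTER
    w.write_record(rec)
w.close()
vcf.close()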
Example #11
def filter_cli(input, output, trash, params_file, index_sample,
               immediate_return):
    vcf = VCF(input, gts012=True)

    idx = vcf.samples.index(index_sample)
    for filter_item in list(FilterClass):
        vcf.add_filter_to_header(filter_item.value)

    out = Writer(output, vcf)
    tr = Writer(trash, vcf)

    filter_params = FilterParams(params_file)

    filter_it = Filterer(vcf, filter_params, idx, immediate_return)

    for record, fi in filter_it:
        if fi is None or len(fi) == 0:
            out.write_record(record)
        else:
            record.FILTER = [x.name for x in fi]
            tr.write_record(record)

    out.close()
    tr.close()
Example #12
pass_threshold = 0

if args.conditions is None:
    wantFilters = False
else:
    wantFilters = True  # flag checked below; without this the later check raises NameError
    conditions = formatString(args.INFO_conditions)
    pass_threshold += 1

if args.GTconditions is None:
    wantGTFilters = False
else:
    wantGTFilters = True  # same fix for the per-sample filter flag
    pass_threshold += 1

vcf = VCF(args.inputvcf)
vcf.add_filter_to_header({
    'ID': args.filtername,
    'Description': 'SV filter ' + args.filtername
})

for v in vcf:

    #Apply variant filters
    if wantFilters == True:
        evaluateINFO(args.conditions_str, v)

        #result = vt.evaluateINFO(line,cond,args.logic)
        #if result == True:
        #	is_filtered += 1

    #Apply per-sample filters
    if wantGTFilters == True:
        cond = args.GTconditions.split(",")
Example #13
    # API and example
    # https://brentp.github.io/cyvcf2/
    # https://brentp.github.io/cyvcf2/docstrings.html#api
    invcf = VCF(
        '/dev/stdin', lazy=True, gts012=True
    )  # if gts012=True, then gt_types will be 0=HOM_REF, 1=HET, 2=HOM_ALT, 3=UNKNOWN.
    # invcf = VCF('test.vcf.gz', lazy=True)

    # adjust the header to contain the new FILTER field
    # for a FILTER line, the keys 'ID' and 'Description' are required.

    invcf.add_filter_to_header({
        'ID': 'VCFSiteMissingFilter.py',
        'Description': 'Exclude sites with missing rate higher than ' + str(MISS_THRESHOLD)
    })

    # create a new vcf Writer using the input vcf as a template.
    # Only need to write out updated VCF header.
    # Other parts output as string.
    outvcf = Writer('/dev/stdout', invcf)
    outvcf.close()

    for variant in invcf:
        missing_rate = 1 - variant.call_rate
        if missing_rate <= MISS_THRESHOLD:
            ss = str(variant).split()
            sys.stdout.write(str(variant))
Example #14
def fix_vcf(vcf, output, fai, jasmine=False):
    chromsizes = {line.split()[0]: int(line.split()[1]) for line in open(fai)}

    vcf_in = VCF(vcf)
    if not output:
        output = vcf.replace(".vcf", "_{}.vcf".format("fixed"))
    vcf_in.add_info_to_header({
        'ID': 'TRUNCATED',
        'Description': "SVLEN truncated",
        'Type': 'Flag',
        'Number': '0'
    })
    vcf_in.add_info_to_header({
        'ID': 'STRANDS2',
        'Description':
        "alt reads first +,alt reads first -,alt reads second +,alt reads second -.",
        'Type': 'Integer',
        'Number': '4'
    })
    vcf_in.add_info_to_header({
        'ID': 'Strandbias_pval',
        'Description': "P-value for fisher exact test for strand bias.",
        'Type': 'Float',
        'Number': 'A'
    })
    vcf_in.add_filter_to_header({
        'ID': 'STRANDBIAS',
        'Description': "Strand is biased if Strandbias_pval < 0.01."
    })
    if jasmine:
        vcf_in.add_info_to_header({
            'ID': 'STRANDS',
            'Description': "foo",
            'Type': 'String',
            'Number': '1'
        })
        vcf_in.add_info_to_header({
            'ID': 'AF',
            'Description': "foo",
            'Type': 'Float',
            'Number': '1'
        })
    handle, interm_output = tempfile.mkstemp(suffix=".vcf")
    vcf_out = Writer(interm_output, vcf_in)
    records_fixed = 0
    records_truncated = 0
    mito_variants = 0
    interchromosomal_bnds = 0
    for v in vcf_in:
        if v.CHROM == 'chrM':
            mito_variants += 1
            continue
        if v.start == -1:
            v.set_pos(0)
            records_fixed += 1
        try:
            if (v.INFO.get('SVTYPE')
                    == 'BND') and (v.CHROM != v.INFO.get('CHR2')):
                del v.INFO['END']
                interchromosomal_bnds += 1
        except KeyError:
            pass
        try:
            if chromsizes[v.INFO.get('CHR2')] < v.INFO.get('END'):
                v.INFO['SVLEN'] = 1
                v.INFO['END'] = v.start + 1
                v.INFO['TRUNCATED'] = True
                records_truncated += 1
        except KeyError:
            pass
        if v.INFO.get('SVLEN') == 999999999:
            v.INFO['SVLEN'] = 1
            v.INFO['TRUNCATED'] = True
        vcf_out.write_record(v)
    vcf_out.close()
    vcf_sort(interm_output, output)
    if mito_variants != 0:
        sys.stderr.write(f"Removed {mito_variants} records on chrM.\n")
    if records_fixed != 0:
        sys.stderr.write(f"Fixed {records_fixed} records.\n")
    if records_truncated != 0:
        sys.stderr.write(
            f"Truncated {records_truncated} records where END > chromosome size\n"
        )
    if interchromosomal_bnds != 0:
        sys.stderr.write(
            f"Dropped END for {interchromosomal_bnds} interchromosomal BNDs\n")
Example #15
        ShowFormat()
        sys.exit(-1)

    mingq = int(args['--mingq'])

    # API and example
    # https://brentp.github.io/cyvcf2/
    # https://brentp.github.io/cyvcf2/docstrings.html#api
    invcf = VCF('/dev/stdin', lazy=True, gts012=True)
    # invcf = VCF('test.vcf.gz', lazy=True)

    # adjust the header to contain the new FILTER field
    # for a FILTER line, the keys 'ID' and 'Description' are required.
    invcf.add_filter_to_header({
        'ID': 'VCFGQFilter.py',
        'Description': 'Mask genotype as missing if GQ value < ' + str(mingq)
    })

    # create a new vcf Writer using the input vcf as a template.
    # Only need to write out updated VCF header.
    # Other parts output as string.
    sys.stdout.write('%s' % (invcf.raw_header))
    # outvcf = Writer('/dev/stdout', invcf)
    # outvcf.close()

    # Cache data for faster process.
    DATA_COL = 9
    FMT_COL = 8
    FMT_STRING_CACHE = ''
    DP_COL = -1
Example #16
for name, filt in filters.items():
    if name in BUILTIN_FILTERS:
        if not isinstance(filt, tuple):
            filt = (filt, )
        # bind name and filt now; a plain closure would late-bind to the last loop values
        filters[name] = lambda variant, name=name, filt=filt: BUILTIN_FILTERS[name](variant, *filt)
        filters[name].__doc__ = BUILTIN_FILTERS[name].__doc__
    else:
        filters[name] = eval(filt)
        filters[name].__doc__ = filter_descs.get(name, filt)


invcf = VCF(infile)
for name, filt in filters.items():
    invcf.add_filter_to_header({
        'ID': name,
        'Description': filt.__doc__,
    })

if outfile.endswith(".gz"):
    outvcf = Writer(outfile, invcf, "wz")
else:
    outvcf = Writer(outfile, invcf)

for variant in invcf:
    for name, filt in filters.items():
        if not filt(variant):
            if not variant.FILTER:
                variant.FILTER = name
            else:
                variant.FILTER = f"{variant.FILTER};{name}"
    if variant.FILTER and not keep:
Example #17
    def add_filters_to_header(self, vcf: VCF):
        if self.min_depth > 0:
            header = {
                "ID": str(Tags.LowDepth),
                "Description": (
                    f"Depth ({Tags.Depth}) less than {self.min_depth} - i.e., {Tags.Depth}<{self.min_depth:.1f}"
                ),
            }
            vcf.add_filter_to_header(header)
            logging.debug(f"Header for min. depth: {header}")

        if self.min_fed > 0:
            header = {
                "ID": str(Tags.LowFed),
                "Description": (
                    f"High-quality depth of the called allele as a fraction of the expected (median; {self.expected_depth}) is "
                    f"less than {self.min_fed}"
                ),
            }
            vcf.add_filter_to_header(header)
            logging.debug(f"Header for min. FED: {header}")

        if self.min_mq > 0:
            header = {
                "ID": str(Tags.LowMapQual),
                "Description": (
                    f"Mapping quality ({Tags.MapQual.value}) less than {self.min_mq} - i.e., {Tags.MapQual.value}<{self.min_mq:.0f}"
                ),
            }
            vcf.add_filter_to_header(header)
            logging.debug(f"Header for min. mapping quality: {header}")

        if self.max_depth > 0:
            header = {
                "ID": str(Tags.HighDepth),
                "Description": (
                    f"Depth ({Tags.Depth}) more than {self.max_depth} - i.e., {Tags.Depth}>{self.max_depth:.1f}"
                ),
            }
            vcf.add_filter_to_header(header)
            logging.debug(f"Header for max. depth: {header}")

        if self.min_qual > 0:
            header = {
                "ID": str(Tags.LowQual),
                "Description": f"QUAL less than {self.min_qual}",
            }
            vcf.add_filter_to_header(header)
            logging.debug(f"Header for min. QUAL: {header}")

        if self.min_strand_bias > 0:
            header = {
                "ID": str(Tags.StrandBias),
                "Description": (
                    f"A strand on the called allele has less than  "
                    f"{self.min_strand_bias:.2%} of the high-quality depth for that "
                    f"allele. This is judged on the {Tags.StrandDepth} tag."
                ),
            }
            vcf.add_filter_to_header(header)
            logging.debug(f"Header for strand bias: {header}")

        if self.min_frs > 0:
            header = {
                "ID": str(Tags.LowSupport),
                "Description": f"Fraction of read support on called allele is less than {self.min_frs}",
            }
            vcf.add_filter_to_header(header)
            logging.debug(f"Header for min. FRS: {header}")

        if self.min_bqb > 0:
            header = {
                "ID": str(Tags.LowBaseQualBias),
                "Description": (
                    f"Base Quality Bias ({Tags.BaseQualBias}) is less than "
                    f"{self.min_bqb}."
                ),
            }
            vcf.add_filter_to_header(header)
            logging.debug(f"Header for min. base quality bias: {header}")

        if self.min_mqb > 0:
            header = {
                "ID": str(Tags.LowMapQualBias),
                "Description": (
                    f"Mapping Quality Bias ({Tags.MapQualBias}) is less than "
                    f"{self.min_mqb}."
                ),
            }
            vcf.add_filter_to_header(header)
            logging.debug(f"Header for min. mapping quality bias: {header}")

        if self.min_rpb > 0:
            header = {
                "ID": str(Tags.LowReadPosBias),
                "Description": (
                    f"Read Position Bias ({Tags.ReadPosBias}) is less than "
                    f"{self.min_rpb}."
                ),
            }
            vcf.add_filter_to_header(header)
            logging.debug(f"Header for min. read position bias: {header}")

        if self.min_rpbz is not None:
            header = {
                "ID": str(Tags.LowReadPosBiasZ),
                "Description": (
                    f"Read Position Bias z-test score ({Tags.ReadPosBiasZ}) is less than "
                    f"{self.min_rpbz}."
                ),
            }
            vcf.add_filter_to_header(header)
            logging.debug(f"Header for min. read position bias z-test: {header}")

        if self.max_rpbz is not None:
            header = {
                "ID": str(Tags.HighReadPosBiasZ),
                "Description": (
                    f"Read Position Bias z-test score ({Tags.ReadPosBiasZ}) is more than "
                    f"{self.max_rpbz}."
                ),
            }
            vcf.add_filter_to_header(header)
            logging.debug(f"Header for max. read position bias z-test: {header}")

        if self.max_scbz is not None:
            header = {
                "ID": str(Tags.HighSoftClipBiasZ),
                "Description": (
                    f"Soft-Clip Length Bias z-test score ({Tags.SoftClipBiasZ}) is more than "
                    f"{self.max_scbz}."
                ),
            }
            vcf.add_filter_to_header(header)
            logging.debug(f"Header for max. soft-clip length bias z-test: {header}")

        if self.max_sgb != 0:
            header = {
                "ID": str(Tags.HighSegBias),
                "Description": (
                    f"Segregation-based metric ({Tags.SegregationBias}) is greater "
                    f"than {self.max_sgb}."
                ),
            }
            vcf.add_filter_to_header(header)
            logging.debug(f"Header for max. segregation bias: {header}")

        if self.min_vdb > 0:
            header = {
                "ID": str(Tags.LowVarDistBias),
                "Description": (
                    f"Variant distance bias ({Tags.VariantDistanceBias}) is less "
                    f"than {self.min_vdb}."
                ),
            }
            vcf.add_filter_to_header(header)
            logging.debug(f"Header for min. variant distance bias: {header}")
Example #18
        ShowFormat()
        sys.exit(-1)

    MRR_THRESHOLD = float(args['-c'])  # the threshold minor reads ratio.

    # API and example
    # https://brentp.github.io/cyvcf2/
    # https://brentp.github.io/cyvcf2/docstrings.html#api
    invcf = VCF('/dev/stdin', lazy=True, gts012=True)
    # invcf = VCF('test.vcf.gz', lazy=True)

    # adjust the header to contain the new FILTER field
    # for a FILTER line, the keys 'ID' and 'Description' are required.
    invcf.add_filter_to_header({
        'ID': 'VCFHOMOMinorReadsRatioFilter.py',
        'Description': 'Mask the genotype as missing if the MRR >= ' + args['-c']
    })

    # create a new vcf Writer using the input vcf as a template.
    # Only need to write out updated VCF header.
    # Other parts output as string.
    sys.stdout.write('%s' % (invcf.raw_header))
    # outvcf = Writer('/dev/stdout', invcf)
    # outvcf.close()

    # Cache data for faster process.
    DATA_COL = 9
    FMT_COL = 8
    FMT_STRING_CACHE = ''
    DP_COL = -1
Example #19
    def add_filters_to_header(self, vcf: VCF):
        if self.min_depth > 0:
            header = {
                "ID": str(Tags.LowDepth),
                "Description": (
                    f"Depth ({Tags.Depth}) less than {self.min_depth_frac:.1%} the "
                    f"expected depth of {self.expected_depth:.1f}. "
                    f"{Tags.Depth}<{self.min_depth:.1f}"
                ),
            }
            vcf.add_filter_to_header(header)
            logging.debug(f"Header for min. depth: {header}")

        if self.max_depth > 0:
            header = {
                "ID": str(Tags.HighDepth),
                "Description": (
                    f"Depth ({Tags.Depth}) more than {self.max_depth_frac:.1%} the "
                    f"expected depth of {self.expected_depth:.1f}. "
                    f"{Tags.Depth}>{self.max_depth:.1f}"
                ),
            }
            vcf.add_filter_to_header(header)
            logging.debug(f"Header for max. depth: {header}")

        if self.min_qual > 0:
            header = {
                "ID": str(Tags.LowQual),
                "Description": f"QUAL less than {self.min_qual}",
            }
            vcf.add_filter_to_header(header)
            logging.debug(f"Header for min. QUAL: {header}")

        if self.min_strand_bias > 0:
            header = {
                "ID": str(Tags.StrandBias),
                "Description": (
                    f"A strand on the called allele has less than  "
                    f"{self.min_strand_bias:.2%} of the high-quality depth for that "
                    f"allele. This is judged on the {Tags.StrandDepth} tag."
                ),
            }
            vcf.add_filter_to_header(header)
            logging.debug(f"Header for strand bias: {header}")

        if self.min_bqb > 0:
            header = {
                "ID": str(Tags.LowBaseQualBias),
                "Description": (
                    f"Base Quality Bias ({Tags.BaseQualBias}) is less than "
                    f"{self.min_bqb}."
                ),
            }
            vcf.add_filter_to_header(header)
            logging.debug(f"Header for min. base quality bias: {header}")

        if self.min_mqb > 0:
            header = {
                "ID": str(Tags.LowMapQualBias),
                "Description": (
                    f"Mapping Quality Bias ({Tags.MapQualBias}) is less than "
                    f"{self.min_mqb}."
                ),
            }
            vcf.add_filter_to_header(header)
            logging.debug(f"Header for min. mapping quality bias: {header}")

        if self.min_rpb > 0:
            header = {
                "ID": str(Tags.LowReadPosBias),
                "Description": (
                    f"Read Position Bias ({Tags.ReadPosBias}) is less than "
                    f"{self.min_rpb}."
                ),
            }
            vcf.add_filter_to_header(header)
            logging.debug(f"Header for min. read position bias: {header}")

        if self.max_sgb != 0:
            header = {
                "ID": str(Tags.HighSegBias),
                "Description": (
                    f"Segregation-based metric ({Tags.SegregationBias}) is greater "
                    f"than {self.max_sgb}."
                ),
            }
            vcf.add_filter_to_header(header)
            logging.debug(f"Header for max. segregation bias: {header}")

        if self.min_vdb > 0:
            header = {
                "ID": str(Tags.LowVarDistBias),
                "Description": (
                    f"Variant distance bias ({Tags.VariantDistanceBias}) is less "
                    f"than {self.min_vdb}."
                ),
            }
            vcf.add_filter_to_header(header)
            logging.debug(f"Header for min. variant distance bias: {header}")