Example #1
0
def main(args):
    """Annotate every variant of a VCF with hg19 liftover coordinates.

    CHROM/POS of each variant (and END, when the INFO field defines it) are
    lifted over with the chain file. Lifted coordinates are stored in the
    INFO tags hg19_chr, hg19_pos and hg19_end. All variants are written to
    the output, whether or not they could be lifted. The output file is then
    passed to ``bgzip`` and the compressed file to ``tabix``.

    Args:
        args: mapping providing the keys 'inputfile', 'chainfile' and
            'outputfile'.
    """
    # open input vcf
    vcf = vcf_parser.Vcf(args['inputfile'])
    # add 3 new tag definitions - for hg19 liftover: chr, pos, and end
    hg19CHROM_definition = '##INFO=<ID=hg19_chr,Number=1,Type=String,Description="CHROM in hg19 using LiftOver from pyliftover">'
    hg19POS_definition = '##INFO=<ID=hg19_pos,Number=1,Type=Integer,Description="POS in hg19 using LiftOver from pyliftover (converted back to 1-based)">'
    hg19END_definition = '##INFO=<ID=hg19_end,Number=1,Type=Integer,Description="END in hg19 using LiftOver from pyliftover (converted back to 1-based)">'
    vcf.header.add_tag_definition(hg19END_definition)
    vcf.header.add_tag_definition(hg19POS_definition)
    vcf.header.add_tag_definition(hg19CHROM_definition)

    # get chain file for liftover
    lo = LiftOver(args['chainfile'])

    def hg19_chrom(hit):
        # Drop a leading 'chr' from the lifted chromosome name; names without
        # the prefix are kept as they are instead of raising IndexError.
        name = hit[0]
        return name[3:] if name.startswith('chr') else name

    # write header and then loop variants, adding liftover coordinates to
    # INFO fields when appropriate. write all variants.
    with open(args['outputfile'], 'w') as fo:
        vcf.write_header(fo)
        for vnt_obj in vcf.parse_variants():

            # generate hg19 LO coordinates based on CHROM and POS
            # (POS - 1: the liftover is queried with 0-based positions)
            # convert_coordinate gives None for a chromosome that is missing
            # from the chain file and an empty list when nothing maps, so a
            # truthiness check covers both.
            hits = lo.convert_coordinate(vnt_obj.CHROM, vnt_obj.POS - 1)
            if hits:
                vnt_obj.add_tag_info('hg19_chr=' + hg19_chrom(hits[0]))
                # + 1 converts the lifted position back to 1-based
                vnt_obj.add_tag_info('hg19_pos=' + str(hits[0][1] + 1))

            # also want to incorporate END position for SV and CNV
            # Look for the exact INFO key "END": a plain substring search
            # for "END=" would also match keys such as "CIEND=".
            END = None
            for field in vnt_obj.INFO.split(';'):
                if field.startswith('END='):
                    try:
                        END = int(field[len('END='):])
                    except ValueError:
                        # malformed END value, no liftover is attempted
                        END = None
                    break

            if END is not None:
                hits_end = lo.convert_coordinate(vnt_obj.CHROM, END - 1)
                if hits_end:
                    try:
                        # if hg19_chr is already defined, don't add it
                        vnt_obj.get_tag_value("hg19_chr")
                    except Exception:
                        # presumably raised when the tag is missing from INFO
                        # (exception type not visible here), add hg19_chr
                        vnt_obj.add_tag_info(
                            'hg19_chr=' + hg19_chrom(hits_end[0]))
                    # add hg19_end
                    vnt_obj.add_tag_info(
                        'hg19_end=' + str(hits_end[0][1] + 1))
            vcf.write_variant(fo, vnt_obj)

    subprocess.run(["bgzip", args['outputfile']])
    subprocess.run(["tabix", args['outputfile'] + ".gz"])
Example #2
0
def main(args):
    ''' Extend each SAMPLEGENO entry with a per-gene alternate allele count.

    The input VCF is read twice. The first pass counts, for every gene
    returned by get_most_severe and every sample, the alleles whose GT code
    is neither '0' nor '.'. The second pass appends that count as an extra
    '|'-separated field to each sample entry of the SAMPLEGENO tag and
    writes the variant.

    args is a mapping providing the keys 'inputfile' and 'outputfile'.
    '''
    # Variables
    VEPtag = 'CSQ'
    samplegeno_def = '##INFO=<ID=SAMPLEGENO,Number=.,Type=String,Description="Sample genotype information. Subembedded:\'samplegeno\':Format:\'NUMGT|GT|AD|SAMPLEID|AC\'">'
    counts_dict = {}  # {ENSG: {sample: alt_count, ...}, ...}

    # Creating Vcf object
    vcf_obj = vcf_parser.Vcf(args['inputfile'])

    # Indexes
    ENSG_idx = vcf_obj.header.get_tag_field_idx(VEPtag, 'Gene')
    most_severe_idx = vcf_obj.header.get_tag_field_idx(VEPtag, 'most_severe')

    # Counting alleles per most severe gene
    for vnt_obj in vcf_obj.parse_variants():
        ENSG = get_most_severe(vnt_obj, VEPtag, ENSG_idx, most_severe_idx)
        if not ENSG:
            continue
        #end if
        gene_counts = counts_dict.setdefault(ENSG, {})
        for ID_genotype in vnt_obj.IDs_genotypes:
            # Splitting without unpacking into exactly two values lets
            # genotypes of any ploidy be counted instead of raising
            GT_codes = vnt_obj.get_genotype_value(
                ID_genotype, 'GT').replace('|', '/').split('/')
            n_alt = sum(1 for code in GT_codes if code not in ('0', '.'))
            gene_counts[ID_genotype] = gene_counts.get(ID_genotype, 0) + n_alt
        #end for
    #end for

    # Update header
    vcf_obj.header.remove_tag_definition('SAMPLEGENO')
    vcf_obj.header.add_tag_definition(samplegeno_def, 'INFO')

    # Buffers, the with statement closes the output also on errors
    with codecs.open(args['outputfile'], 'w', 'utf-8') as fo:
        # Write header
        vcf_obj.write_header(fo)

        # Reading variants and adding samplegeno
        for vnt_obj in vcf_obj.parse_variants():
            ENSG = get_most_severe(vnt_obj, VEPtag, ENSG_idx, most_severe_idx)
            # Update samplegeno
            samplegeno = []
            samplegeno_ = vnt_obj.get_tag_value('SAMPLEGENO').split(',')
            for sample_ in samplegeno_:
                # each entry must hold exactly NUMGT|GT|AD|SAMPLEID
                _, _, _, SAMPLEID = sample_.split('|')
                # NOTE(review): raises KeyError for a variant without a
                # most severe gene, confirm that cannot occur in the input
                sample = sample_ + '|' + str(counts_dict[ENSG][SAMPLEID])
                samplegeno.append(sample)
            #end for

            # Add samplegeno to variant INFO
            vnt_obj.remove_tag_info('SAMPLEGENO')
            vnt_obj.add_tag_info(
                'SAMPLEGENO={0}'.format(','.join(samplegeno)))

            # Write variant
            vcf_obj.write_variant(fo, vnt_obj)
        #end for
    #end with
Example #3
0
def main(args):
    ''' Add a SAMPLEGENO tag to the INFO field of every variant.

    For each sample the tag holds 'NUMGT|GT|AD|SAMPLEID', where NUMGT is the
    GT value with '|' replaced by '/', GT is the same genotype with allele
    indexes replaced by the REF / ALT sequences, and AD is the AD value with
    ',' replaced by '/'. Entries for different samples are joined with ','.

    args is a mapping providing the keys 'inputfile' and 'outputfile'.
    '''
    # Variables
    granite_def = '##GRANITE=<ID=SAMPLEGENO>'
    samplegeno_def = '##INFO=<ID=SAMPLEGENO,Number=.,Type=String,Description="Sample genotype information. Subembedded:\'samplegeno\':Format:\'NUMGT|GT|AD|SAMPLEID\'">'

    # Buffers, the with statement closes the output also on errors
    with open(args['outputfile'], 'w') as fo:
        # Creating Vcf object
        vcf_obj = vcf_parser.Vcf(args['inputfile'])

        # Update and write header
        vcf_obj.header.add_tag_definition(
            granite_def + '\n' + samplegeno_def, 'INFO')
        vcf_obj.write_header(fo)

        # Reading variants and adding samplegeno
        for vnt_obj in vcf_obj.parse_variants():
            # Init empty samplegeno
            samplegeno = []

            # Get possible alleles
            # REF is at idx 0, ALT idxs are maintained
            alleles = [vnt_obj.REF] + vnt_obj.ALT.split(',')

            # Calculate samplegeno
            for ID_genotype in vnt_obj.IDs_genotypes:
                # Keeping the codes in a list instead of unpacking exactly
                # two values supports genotypes of any ploidy
                GT_codes = vnt_obj.get_genotype_value(
                    ID_genotype, 'GT').replace('|', '/').split('/')
                # '.' (missing allele) is carried over unchanged
                GT_alleles = [
                    code if code == '.' else alleles[int(code)]
                    for code in GT_codes
                ]
                AD = vnt_obj.get_genotype_value(
                    ID_genotype, 'AD').replace(',', '/')

                samplegeno.append('|'.join([
                    '/'.join(GT_codes),
                    '/'.join(GT_alleles),
                    AD,
                    ID_genotype
                ]))
            #end for

            # Add samplegeno to variant INFO
            vnt_obj.add_tag_info(
                'SAMPLEGENO={0}'.format(','.join(samplegeno)))

            # Write variant
            vcf_obj.write_variant(fo, vnt_obj)
        #end for
    #end with
Example #4
0
def main(args):
    """Keep the variants where at least one sample reaches a minimum depth.

    A variant is written once to the output as soon as one of its samples
    has a DP value that is not "." and is greater than or equal to the
    threshold. The output file is then passed to ``bgzip`` and the
    compressed file to ``tabix``.

    Args:
        args: mapping providing the keys 'inputSampleVCF', 'min_depth' and
            'outputfile'.
    """
    in_vcf = vcf_parser.Vcf(args['inputSampleVCF'])
    depth_cutoff = int(args['min_depth'])

    def passes_depth(variant, sample_id):
        # "." marks a missing DP and never passes the filter
        depth = variant.get_genotype_value(sample_id, "DP")
        return depth != "." and int(depth) >= depth_cutoff

    out_path = args['outputfile']
    with open(out_path, 'w') as fo:
        in_vcf.write_header(fo)
        for vnt_obj in in_vcf.parse_variants():
            # any() stops at the first sample that passes
            if any(passes_depth(vnt_obj, sample)
                   for sample in vnt_obj.IDs_genotypes):
                in_vcf.write_variant(fo, vnt_obj)

    subprocess.run(["bgzip", out_path])
    subprocess.run(["tabix", out_path + ".gz"])
Example #5
0
def main(args):
    ''' Collect statistics over the variants of a VCF for one or more
    anchor samples and write them as json to the output file.

    Keys read from args: 'verbose', 'novo', 'het', 'anchor', 'pedigree',
    'outputfile', 'inputfile'.

    Each element of args['pedigree'] is either a path to a json file or a
    string holding a json. One pedigree can be shared by all the anchors,
    otherwise pedigrees and anchors are matched by position.

    The program exits through sys.exit if the number of pedigrees and
    anchors is inconsistent, if a pedigree cannot be loaded, or if 'novo' is
    set and the novoPP tag is not defined in the VCF header.
    '''
    # Variables
    NA_chroms = real_NA_chroms
    is_verbose = True if args['verbose'] else False
    sample_novo = args['novo'] if args['novo'] else ''
    sample_het_list = args['het'] if args['het'] else []
    anchor_list = args['anchor']
    novotag = ''  # set below, and only used, when sample_novo is given

    # Check pedigree and anchor args
    if len(anchor_list) != len(args['pedigree']) and \
       len(args['pedigree']) != 1:
        sys.exit(
            '\nERROR in parsing arguments: number of pedigrees and anchors is different\n'
        )
    #end if

    # Buffers
    fo = open(args['outputfile'], 'w')

    # Creating Vcf object
    vcf_obj = vcf_parser.Vcf(args['inputfile'])

    # Get pedigree / pedigrees information
    pedigree_list = []
    for pedigree in args['pedigree']:
        # Loading pedigree
        if os.path.isfile(pedigree):
            with open(pedigree) as fi:
                pedigree_list.append(json.load(fi))
            #end with
        else:
            try:
                pedigree_list.append(json.loads(pedigree))
            except Exception:
                sys.exit(
                    '\nERROR in parsing arguments: {0} must be either a json file or a string representing a json\n'
                    .format(pedigree))
            #end try
        #end if
    #end for

    # Check novoPP
    if sample_novo:
        try:
            novotag, _ = vcf_obj.header.check_tag_definition('novoPP')
        except Exception:
            sys.exit(
                '\nERROR in parsing arguments: novoCaller information missing in VCF file\n'
            )
        #end try
    #end if

    # Creating Pedigree object / objects
    pedigree_obj_list = [
        pedigree_parser.Pedigree(pedigree) for pedigree in pedigree_list
    ]

    # Initializing stat_dict for each anchor
    stat_dict_list = [{
        'error_het_family': {},
        'error_het': {},
        'error_novo_family': {}
    } for _ in anchor_list]

    # Building family / families
    family_list = []
    if len(pedigree_obj_list) == 1:
        # single pedigree shared by all the anchors
        for anchor in anchor_list:
            family_list.append(pedigree_obj_list[0].get_family(anchor))
        #end for
    else:
        # one pedigree per anchor, matched by position
        for i, pedigree_obj in enumerate(pedigree_obj_list):
            family_list.append(pedigree_obj.get_family(anchor_list[i]))
        #end for
    #end if

    # Reading variants
    analyzed = 0
    vnt_obj_ = None
    # i stays -1 if the VCF holds no variants, so that the total reported
    # below is 0 instead of failing on an unbound name
    i = -1
    for i, vnt_obj in enumerate(vcf_obj.parse_variants()):
        if is_verbose:
            sys.stderr.write('\rAnalyzing variant... ' + str(i + 1))
            sys.stderr.flush()
        #end if

        # # Check if chromosome is canonical and in valid format
        # if not check_chrom(vnt_obj.CHROM):
        #     continue
        # #end if
        analyzed += 1

        # Skip MAV that are redundant
        # (same CHROM and POS as the last variant that was processed)
        if vnt_obj_:
            if vnt_obj_.CHROM == vnt_obj.CHROM and vnt_obj_.POS == vnt_obj.POS:
                continue
            #end if
        #end if
        vnt_obj_ = vnt_obj

        # Getting and updating stats for each stat_dict / family
        for l, family in enumerate(family_list):
            if sample_novo:
                if anchor_list[l] == sample_novo:
                    get_stats_novo(vnt_obj, stat_dict_list[l], family,
                                   NA_chroms, sample_novo, novotag)
                #end if
            #end if
            if sample_het_list:
                if anchor_list[l] in sample_het_list:
                    get_stats_het(vnt_obj, stat_dict_list[l], family,
                                  NA_chroms)
                #end if
            #end if
        #end for
    #end for

    # Writing output
    sys.stderr.write('\n\n...Writing results for ' + str(analyzed) +
                     ' analyzed variants out of ' + str(i + 1) +
                     ' total variants\n')
    sys.stderr.flush()

    # Create json
    stat_json = to_json(stat_dict_list, sample_het_list, sample_novo)

    # Write variants
    if sample_novo:
        novo_variants(stat_dict_list, args['outputfile'])
        for l, family in enumerate(family_list):
            if anchor_list[l] == sample_novo:
                try:
                    plot_AD_DP_ratio(stat_dict_list[l], sample_novo)
                except Exception:  # no variants with novoPP >= 0.9, skip
                    pass
                #end try
            #end if
        #end for
    #end if

    # Plots
    if len(sample_het_list) == 2:
        plot_error_het_family(stat_dict_list, sample_het_list)
        plot_distr_het_family(stat_dict_list, sample_het_list)
        plot_distr_het(stat_dict_list, sample_het_list)
    #end if

    # Write json to file
    json.dump(stat_json, fo, indent=2, sort_keys=False)

    # Closing buffers
    fo.close()
Example #6
0
def main(args, test=False):
    ''' Call putative compound heterozygous pairs for a proband and add
    them to the variants as comHet tag.

    Variants where the first sample of args['trio'] is heterozygous
    (0/1 or 1/0) are grouped by the genes listed in the VEP annotation.
    Variants that share a gene are then tested against each other with
    is_comHet, and pairs that pass are registered on the variant.

    Keys read from args: 'impact', 'SpliceAItag', 'VEPtag', 'sep',
    'allow_undef', 'filter_cmpHet', 'verbose', 'outputfile', 'inputfile',
    'trio'.

    Three files are written: args['outputfile'] with the variants, plus
    args['outputfile'] + '.summary' and args['outputfile'] + '.json' with
    the statistics. If 'filter_cmpHet' is set only variants that have at
    least one pair are written, otherwise all the variants are.

    test is forwarded unchanged to VariantHet.add_pair.
    '''
    # Definitions
    CLNSIG_encode = [
        # high impact
        ('Pathogenic', 'C'),
        ('Likely_pathogenic', 'C'),
        # moderate/low impact
        ('Conflicting_interpretations', 'c'),
        ('Uncertain_significance', 'c'),
        ('risk_factor', 'c')
    ]
    # VEP_encode = {...} -> import from shared_vars
    # DStags = {...} -> import from shared_vars
    IMPCT_encode = {'HIGH': 1, 'MODERATE': 2, 'LOW': 3, 'MODIFIER': 4}
    IMPCT_decode = {1: 'H', 2: 'M', 3: 'L', 4: 'm'}

    # Variables
    is_impct = True if args['impact'] else False
    is_IMPACT = False  # True if IMPACT field is in VEP
    CLNSIGtag, CLNSIG_idx, is_CLNSIG = '', 0, False
    SpAItag_list, SpAI_idx_list, is_SpAI = [], [], False
    ENSG_idx, ENST_idx, IMPCT_idx = 0, 0, 0
    SpliceAItag = args['SpliceAItag']  # default None
    VEPtag = args['VEPtag'] if args['VEPtag'] else 'CSQ'
    sep = args['sep'] if args['sep'] else '&'
    allow_undef = True if args['allow_undef'] else False
    filter_cmpHet = True if args['filter_cmpHet'] else False
    granite_def = '##GRANITE=<ID=comHet>'
    comHet_def = '##INFO=<ID=comHet,Number=.,Type=String,Description="Putative compound heterozygous pairs. Subembedded:\'cmpHet\':Format:\'phase|gene|transcript|mate_variant\'">'
    comHet_impct_def = '##INFO=<ID=comHet,Number=.,Type=String,Description="Putative compound heterozygous pairs. Subembedded:\'cmpHet\':Format:\'phase|gene|transcript|impact_gene|impact_transcript|mate_variant\'">'
    is_verbose = True if args['verbose'] else False

    # Buffers
    fo = open(args['outputfile'], 'w')

    # Creating Vcf object
    vcf_obj = vcf_parser.Vcf(args['inputfile'])

    # Add definition to header
    if is_impct:
        vcf_obj.header.add_tag_definition(
            granite_def + '\n' + comHet_impct_def, 'INFO')
    else:
        vcf_obj.header.add_tag_definition(granite_def + '\n' + comHet_def,
                                          'INFO')
    #end if

    # Writing header
    vcf_obj.write_header(fo)

    # Data structures
    stat_dict = {
        'genes': {},
        'trscrpts': {},
        'pairs': {
            'pairs_set': set(),
            'Phased': 0,
            'Unphased': 0
        },
        'vnts': {
            'Phased': 0,
            'Unphased': 0
        },
        'impact': {}
    }
    ENSG_dict = {}  # {ENSG: [vntHet_obj1, vntHet_obj2], ...}
    ENST_dict_tmp = {}  # {ENSG: ENST_set, ...}
    vntHet_set = set()  # variants to write in output -> {(i, vntHet_obj), ...}
    # i is to track and keep variants order as they are read from input

    # Get idx for ENST and ENSG
    ENSG_idx = vcf_obj.header.get_tag_field_idx(VEPtag, 'Gene')
    ENST_idx = vcf_obj.header.get_tag_field_idx(VEPtag, 'Feature')

    # Get idx for SpliceAI, CLINVAR and VEP IMPACT
    if is_impct:
        try:
            IMPCT_idx = vcf_obj.header.get_tag_field_idx(VEPtag, 'IMPACT')
            is_IMPACT = True
        except Exception:  # missing IMPACT, IMPCT_idx will point to Consequence
            try:
                IMPCT_idx = vcf_obj.header.get_tag_field_idx(
                    VEPtag, 'Consequence')
            except Exception:
                sys.exit(
                    '\nERROR in VCF structure: either IMPACT or Consequence field in VEP is necessary to assign "--impact"\n'
                )
            #end try
        #end try
        # SpliceAI is optional, a missing definition only disables its use
        try:
            if SpliceAItag:  # single tag has been specified
                tag, idx = vcf_obj.header.check_tag_definition(SpliceAItag)
                SpAItag_list.append(tag)
                SpAI_idx_list.append(idx)
            else:  # search for delta scores as default
                for DStag in DStags:
                    tag, idx = vcf_obj.header.check_tag_definition(DStag)
                    SpAItag_list.append(tag)
                    SpAI_idx_list.append(idx)
                #end for
            #end if
            is_SpAI = True
        except Exception:
            is_SpAI = False
        #end try
        # CLNSIG is optional as well
        try:
            CLNSIGtag, CLNSIG_idx = vcf_obj.header.check_tag_definition(
                'CLNSIG')
            is_CLNSIG = True
        except Exception:
            is_CLNSIG = False
        #end try
    #end if

    # Get trio IDs
    if len(args['trio']) > 3:
        sys.exit(
            '\nERROR in parsing arguments: too many sample IDs provided for trio\n'
        )
    #end if
    ID_list = args['trio']  # [proband_ID, parent_ID, parent_ID]

    # Reading variants
    analyzed = 0
    # c stays -1 if the VCF holds no variants, so that the total reported
    # below is 0 instead of failing on an unbound name
    c = -1
    for c, vnt_obj in enumerate(vcf_obj.parse_variants()):
        if is_verbose:
            sys.stderr.write('\rAnalyzing variant... ' + str(c + 1))
            sys.stderr.flush()
        #end if

        # # Check if chromosome is canonical and in valid format
        # if not check_chrom(vnt_obj.CHROM):
        #     continue
        # #end if
        analyzed += 1

        # Reset data structures
        ENST_dict_tmp = {}
        IMPCT_dict_tmp = {}

        # Creating VariantHet object
        vntHet_obj = VariantHet(vnt_obj, c)
        if not filter_cmpHet:  # if not filter, all variants are added to vntHet_set here
            # if filter, no variant is added here to vntHet_set,
            # compound heterozygous variants will be added after pairing
            vntHet_set.add((vntHet_obj.i, vntHet_obj))
        #end if

        # Check proband_ID genotype
        if vnt_obj.get_genotype_value(ID_list[0],
                                      'GT').replace('|',
                                                    '/') not in ['0/1', '1/0']:
            continue  # go next if is not 0/1
        #end if

        # Get transcripts and genes information from VEP
        ENSG_list = VEP_field(vnt_obj, ENSG_idx, VEPtag)
        ENST_list = VEP_field(vnt_obj, ENST_idx, VEPtag)

        # Assign transcripts to genes
        for ENSG, ENST in zip(ENSG_list, ENST_list):
            if ENSG and ENST:
                ENST_dict_tmp.setdefault(ENSG, set())
                ENST_dict_tmp[ENSG].add(ENST)
            #end if
        #end for

        # Assign variant to genes if VEP
        if ENST_dict_tmp:
            # Assign variant to genes and update transcripts for variant
            for ENSG, ENST_set in ENST_dict_tmp.items():
                ENSG_dict.setdefault(ENSG, [])
                ENSG_dict[ENSG].append(vntHet_obj)
                vntHet_obj.add_ENST(ENSG, ENST_set)
            #end for
        #end if

        # Add impact information if is_impct
        if is_impct:
            # VEP
            IMPCT_list = VEP_field(vnt_obj, IMPCT_idx, VEPtag)
            IMPCT_encoded = encode_IMPACT(IMPCT_list, VEP_encode, IMPCT_encode,
                                          is_IMPACT, sep)
            for i, (ENSG, IMPCT) in enumerate(zip(ENSG_list, IMPCT_encoded)):
                if ENSG and IMPCT:
                    IMPCT_dict_tmp.setdefault(ENSG, set())
                    IMPCT_dict_tmp[ENSG].add(IMPCT)
                    vntHet_obj.add_ENST_IMPCT(ENST_list[i], IMPCT)
                #end if
            #end for
            if IMPCT_dict_tmp:
                for ENSG, IMPCT_set in IMPCT_dict_tmp.items():
                    vntHet_obj.add_ENSG_IMPCT(ENSG, IMPCT_set)
                #end for
            #end if
            # SpliceAI
            if is_SpAI:
                # if SpliceAI is within VEP
                # fetching only the first transcript
                # expected the same scores for all transcripts
                SpAI_vals = []
                for i, SpAItag in enumerate(SpAItag_list):
                    SpAI_val = get_tag_idx(vnt_obj, SpAItag, SpAI_idx_list[i])
                    # if SpliceAI is with VEP and is at the end of Format
                    # need to remove , that separate next transcript
                    try:
                        SpAI_vals.append(float(SpAI_val.split(',')[0]))
                    except Exception:
                        # value missing or not a number,
                        # stop reading the remaining tags
                        break
                    #end try
                #end for
                if SpAI_vals:
                    max_SpAI_vals = max(SpAI_vals)
                    if max_SpAI_vals >= 0.2:
                        vntHet_obj.add_SpAI(max_SpAI_vals)
                    #end if
                #end if
            #end if
            # CLINVAR
            if is_CLNSIG:
                # if CLNSIG is within VEP
                # fetching only the first transcript
                # expected the same CLNSIG for all transcripts
                CLNSIG_val = get_tag_idx(vnt_obj, CLNSIGtag, CLNSIG_idx)
                if CLNSIG_val:
                    vntHet_obj.add_CLINVAR(CLNSIG_val, CLNSIG_encode)
                #end if
            #end if
        #end if
    #end for

    # Pairing variants
    sys.stderr.write('\n')
    n = len(ENSG_dict)
    for n_i, (ENSG, vntHet_list) in enumerate(ENSG_dict.items()):
        if is_verbose:
            sys.stderr.write('\rPairing variants... {:.0f}%'.format(
                float(n_i) / n * 100))
            sys.stderr.flush()
        #end if
        # Every variant of the gene is tested against all the others
        p, l = 0, len(vntHet_list)
        while p < l:
            vntHet_obj = vntHet_list[p]
            for i, vntHet_obj_i in enumerate(vntHet_list):
                if i != p:
                    # if parents information,
                    # check genotypes to confirm is compound het or not
                    if is_comHet(vntHet_obj, vntHet_obj_i, ID_list,
                                 allow_undef):
                        vntHet_obj.add_pair(
                            vntHet_obj_i, ENSG,
                            phase(vntHet_obj, vntHet_obj_i, ID_list), sep,
                            is_impct, IMPCT_decode, test)
                        # Add vntHet to set to write since there is at least one pair
                        vntHet_set.add((vntHet_obj.i, vntHet_obj))
                    #end if
                #end if
            #end for
            p += 1
        #end while
    #end for
    if is_verbose:
        sys.stderr.write('\rPairing variants... {0}%'.format(100))
        sys.stderr.flush()
    #end if

    # Writing output
    sys.stderr.write('\n\n...Writing results for ' + str(analyzed) +
                     ' analyzed variants out of ' + str(c + 1) +
                     ' total variants\n')
    sys.stderr.flush()

    # Order and write variants to output file
    for _, vntHet_obj in sorted(vntHet_set, key=lambda x: x[0]):
        fo.write(vntHet_obj.to_string())
        update_stats(vntHet_obj, stat_dict, sep, is_impct)
    #end for

    # Get stats as json
    stat_json = to_json(stat_dict, is_impct)

    # Print summary and write json,
    # the with statement closes both files also on errors
    with open(args['outputfile'] + '.summary', 'w') as fs, \
         open(args['outputfile'] + '.json', 'w') as fj:
        print_stats(stat_json, fs, is_impct)
        json.dump(stat_json, fj, indent=2, sort_keys=True)
    #end with

    # Close buffers
    fo.close()
Example #7
0
def runner(args):
    ''' Clean the VEP annotation of each variant and add summary tags.

    For every variant with a non-empty cleaned VEP annotation:
      - the VEP tag is replaced by the cleaned annotation extended with a
        most_severe field,
      - spliceaiMaxds is added when get_maxds returns a truthy value,
      - variantClass is added,
      - GENES is added with the fields of the most severe transcript.
    Variants for which clean_dbnsfp returns an empty result are not written.

    Keys read from args: 'verbose', 'outputfile', 'inputfile'.
    '''
    # Variables
    is_verbose = args['verbose']
    VEPtag = 'CSQ'
    VEP_order = {
        # HIGH
        'transcript_ablation': 1,
        'splice_acceptor_variant': 2,
        'splice_donor_variant': 3,
        'stop_gained': 4,
        'frameshift_variant': 5,
        'stop_lost': 6,
        'start_lost': 7,
        'transcript_amplification': 8,
        # MODERATE
        'inframe_insertion': 9,
        'inframe_deletion': 10,
        'missense_variant': 11,
        'protein_altering_variant': 12,
        # LOW
        'splice_region_variant': 13,
        'incomplete_terminal_codon_variant': 14,
        'start_retained_variant': 15,
        'stop_retained_variant': 16,
        'synonymous_variant': 17,
        # MODIFIER
        'coding_sequence_variant': 18,
        'mature_miRNA_variant': 19,
        '5_prime_UTR_variant': 20,
        '3_prime_UTR_variant': 21,
        'intron_variant': 22,
        'MODIFIER': 23
    }
    dbNSFP_fields = {
        # dbNSFP fields that may be a list
        # and need to be assigned to transcripts
        'Polyphen2_HVAR_pred': 0,
        'Polyphen2_HVAR_score': 0,
        'SIFT_pred': 0,
        'SIFT_score': 0
    }

    # Definitions
    vep_init = '##VEP=<ID={0}>'.format(VEPtag)
    genes_init = '##CGAP=<ID=GENES>'
    spliceai_def = '##INFO=<ID=spliceaiMaxds,Number=1,Type=Float,Description="SpliceAI max delta score">'
    genes_def = '##INFO=<ID=GENES,Number=.,Type=String,Description=". Subembedded:\'genes\':Format:\'most_severe_gene|most_severe_transcript|most_severe_feature_ncbi|most_severe_hgvsc|most_severe_hgvsp|most_severe_amino_acids|most_severe_sift_score|most_severe_polyphen_score|most_severe_maxentscan_diff|most_severe_consequence\'">'
    variant_def = '##INFO=<ID=variantClass,Number=1,Type=String,Description="Variant type">'

    # Buffers, the with statement closes the output also on errors
    with io.open(args['outputfile'], 'w', encoding='utf-8') as fo:

        # Creating Vcf object
        vcf_obj = vcf_parser.Vcf(args['inputfile'])

        # Modify VEP definition
        vep_def = '##INFO=<ID={0},Number=.,Type=String,Description="Consequence annotations from Ensembl VEP.  Subembedded:\'transcript\':Format:\'{1}\'">'
        for line in vcf_obj.header.definitions.split('\n')[:-1]:
            if line.startswith('##INFO=<ID=' + VEPtag +
                               ','):  ##<tag_type>=<ID=<tag>,...
                # named vep_format to avoid shadowing the builtin format
                vep_format = line.split('Format:')[1]
                # Cleaning format
                vep_format = vep_format.replace(' ', '')
                vep_format = vep_format.replace('\'', '')
                vep_format = vep_format.replace('\"', '')
                vep_format = vep_format.replace('>', '')
                # Update definition
                vep_field_list = vep_format.split('|')
                vep_field_list.append('most_severe')
                vep_def = vep_def.format(VEPtag, '|'.join(vep_field_list))
                break

        # Remove older VEP definition
        vcf_obj.header.remove_tag_definition(VEPtag)

        # Update and write custom definitions
        vcf_obj.header.add_tag_definition(vep_init + '\n' + genes_init, 'INFO')
        vcf_obj.header.add_tag_definition(spliceai_def, 'INFO')
        vcf_obj.header.add_tag_definition(genes_def, 'INFO')
        vcf_obj.header.add_tag_definition(variant_def, 'INFO')
        vcf_obj.header.add_tag_definition(vep_def, 'INFO')

        # Write header
        vcf_obj.write_header(fo)

        # Get SpliceAI ds indexes
        # DStags import from granite.shared_vars
        SpAItag_list, SpAI_idx_list = [], []
        for DStag in DStags:
            tag, idx = vcf_obj.header.check_tag_definition(DStag)
            SpAItag_list.append(tag)
            SpAI_idx_list.append(idx)

        # Get VEP indexes
        # Indexes to resolve dbNSFP values by transcript
        dbnsfp_ENST_idx = vcf_obj.header.get_tag_field_idx(
            VEPtag, 'Ensembl_transcriptid')
        # the placeholder 0 of each field is replaced by its real index
        for field in dbNSFP_fields:
            dbNSFP_fields[field] = vcf_obj.header.get_tag_field_idx(
                VEPtag, field)

        # Indexes for worst transcript (GENES)
        CNONICL_idx = vcf_obj.header.get_tag_field_idx(VEPtag, 'CANONICAL')
        ENSG_idx = vcf_obj.header.get_tag_field_idx(VEPtag, 'Gene')
        ENST_idx = vcf_obj.header.get_tag_field_idx(VEPtag, 'Feature')
        MANE_idx = vcf_obj.header.get_tag_field_idx(VEPtag,
                                                    'MANE')  #feature_ncbi
        HGVSC_idx = vcf_obj.header.get_tag_field_idx(VEPtag, 'HGVSc')
        HGVSP_idx = vcf_obj.header.get_tag_field_idx(VEPtag, 'HGVSp')
        AACIDS_idx = vcf_obj.header.get_tag_field_idx(VEPtag, 'Amino_acids')
        SIFT_idx = dbNSFP_fields['SIFT_score']
        PPHEN_idx = dbNSFP_fields['Polyphen2_HVAR_score']
        MAXENTDIFF_idx = vcf_obj.header.get_tag_field_idx(
            VEPtag, 'MaxEntScan_diff')
        CONSEQUENCE_idx = vcf_obj.header.get_tag_field_idx(
            VEPtag, 'Consequence')

        # Reading variants and adding new tags
        for i, vnt_obj in enumerate(vcf_obj.parse_variants()):
            if is_verbose:
                sys.stderr.write('\r' + str(i + 1))
                sys.stderr.flush()

            # Clean dbNSFP by resolving values by transcript
            VEP_clean = clean_dbnsfp(vnt_obj, VEPtag, dbNSFP_fields,
                                     dbnsfp_ENST_idx, ENST_idx)

            # Variants without a cleaned annotation are dropped from output
            if not VEP_clean:
                continue

            # Get max SpliceAI max_ds
            maxds = get_maxds(vnt_obj, SpAItag_list, SpAI_idx_list)

            # Get most severe transcript
            worst_trscrpt = get_worst_trscrpt(VEP_clean, VEP_order,
                                              CNONICL_idx, CONSEQUENCE_idx)

            # Get variant class
            # import from granite.shared_functions
            clss = variant_type_ext(vnt_obj.REF, vnt_obj.ALT)

            # Add MAXDS to variant INFO
            if maxds:
                vnt_obj.add_tag_info('spliceaiMaxds={0}'.format(maxds))

            # Add CLASS to variant INFO
            vnt_obj.add_tag_info('variantClass={0}'.format(clss.upper()))

            # Update and replace VEP tag in variant INFO
            # Adding field most_severe (0|1) to transcripts
            VEP_update = update_worst(VEP_clean, worst_trscrpt)
            # Replace VEP
            vnt_obj.remove_tag_info(VEPtag)
            vnt_obj.add_tag_info('{0}={1}'.format(VEPtag, VEP_update))

            # Add GENES to variant INFO
            worst_trscrpt_ = worst_trscrpt.split('|')
            worst_consequence_ = get_worst_consequence(
                worst_trscrpt_[CONSEQUENCE_idx], VEP_order)
            genes = '{0}|{1}|{2}|{3}|{4}|{5}|{6}|{7}|{8}|{9}'.format(
                worst_trscrpt_[ENSG_idx], worst_trscrpt_[ENST_idx],
                worst_trscrpt_[MANE_idx], worst_trscrpt_[HGVSC_idx],
                worst_trscrpt_[HGVSP_idx], worst_trscrpt_[AACIDS_idx],
                worst_trscrpt_[SIFT_idx], worst_trscrpt_[PPHEN_idx],
                worst_trscrpt_[MAXENTDIFF_idx], worst_consequence_)
            vnt_obj.add_tag_info('GENES={0}'.format(genes))

            # Write variant
            vcf_obj.write_variant(fo, vnt_obj)

        # Terminate the progress line
        sys.stderr.write('\n')
Example #8
0
def main(args):
    ''' Restrict the VEP annotation of each variant to a list of genes.

    The genes are read from args['geneslist'], one per line, empty lines
    are ignored. For every variant the VEP tag is removed and, if
    clean_VEP_byfield returns a non-empty annotation for the listed genes,
    added back with that value. All the variants are written to the output.

    Keys read from args: 'VEPtag' (defaults to 'CSQ'), 'verbose',
    'geneslist', 'outputfile', 'inputfile'.
    '''
    # Variables
    ENSG_set, ENSG_idx = set(), 0
    VEPtag = args['VEPtag'] if args['VEPtag'] else 'CSQ'
    is_verbose = True if args['verbose'] else False

    # Read genes list
    with open(args['geneslist']) as fi:
        for line in fi:
            line = line.rstrip()
            if line: ENSG_set.add(line)
            #end if
        #end for
    #end with

    # Buffers, the with statement closes the output also on errors
    with open(args['outputfile'], 'w') as fo:
        # Creating Vcf object
        vcf_obj = vcf_parser.Vcf(args['inputfile'])

        # Writing header
        vcf_obj.write_header(fo)

        # Get ENSG (Gene) index in VEP
        ENSG_idx = vcf_obj.header.get_tag_field_idx(VEPtag, 'Gene')

        # Reading variants and writing passed
        analyzed = 0
        # i stays -1 if the VCF holds no variants, so that the total
        # reported below is 0 instead of failing on an unbound name
        i = -1
        for i, vnt_obj in enumerate(vcf_obj.parse_variants()):
            if is_verbose:
                sys.stderr.write('\rAnalyzing variant... ' + str(i + 1))
                sys.stderr.flush()
            #end if

            # # Check if chromosome is canonical and in valid format
            # if not check_chrom(vnt_obj.CHROM):
            #     continue
            # #end if
            analyzed += 1

            # Apply genes list and clean VEP
            VEP_clean = clean_VEP_byfield(vnt_obj, ENSG_idx, ENSG_set, VEPtag)

            # Remove old VEP
            vnt_obj.remove_tag_info(VEPtag)

            # Add cleaned VEP if any
            if VEP_clean:
                vnt_obj.add_tag_info('{0}={1}'.format(VEPtag, VEP_clean))
            #end if

            # Write variant
            vcf_obj.write_variant(fo, vnt_obj)
        #end for
        sys.stderr.write('\n\n...Wrote results for ' + str(analyzed) +
                         ' analyzed variants out of ' + str(i + 1) +
                         ' total variants\n')
        sys.stderr.flush()
    #end with
Example #9
0
def main(args):
    ''' Blacklist variants by population allele frequency and/or BIG file.

    A variant is dropped when its allele frequency, as returned by
    allele_frequency for the requested tag, is above the threshold,
    or when the BIG structure flags its position for the key
    CHROM + '_' + variant type. All the other variants are written
    to the output file.

    args is a dict-like object providing the keys:
        inputfile, outputfile, afthr, aftag, bigfile, verbose

    Exits through sys.exit when afthr is given without aftag, when
    neither afthr nor bigfile is given, or when a variant position
    cannot be looked up in the BIG structure.
    '''
    # Variables
    afthr, aftag, aftag_idx = 0., '', 0
    big_dict = {}
    is_afthr = True if args['afthr'] else False
    is_bigfile = True if args['bigfile'] else False
    is_verbose = True if args['verbose'] else False

    # Counters, initialized before the loop so that the final report
    #   works also when the input has no variants
    analyzed, total = 0, 0

    # Creating Vcf object
    vcf_obj = vcf_parser.Vcf(args['inputfile'])

    # Check arguments
    if is_afthr:
        afthr = float(args['afthr'])
        if args['aftag']:
            aftag, aftag_idx = vcf_obj.header.check_tag_definition(
                args['aftag'])
        else:
            sys.exit(
                '\nERROR in parsing arguments: to filter by population allele frequency please specify the TAG to use\n'
            )
        #end if
    else:
        if not is_bigfile:
            sys.exit(
                '\nERROR in parsing arguments: to blacklist specify a BIG file and/or a threshold for population allele frequency and the TAG to use\n'
            )
        #end if
    #end if

    # Buffers, the context manager closes the output also on errors
    #   and when exiting through sys.exit
    with open(args['outputfile'], 'w') as fo:
        # Loading big if specified
        if is_bigfile: big_dict = load_big(args['bigfile'])
        #end if

        # Writing header
        vcf_obj.write_header(fo)

        # Reading variants and writing passed
        for total, vnt_obj in enumerate(vcf_obj.parse_variants(), 1):
            if is_verbose:
                sys.stderr.write('\rAnalyzing variant... ' + str(total))
                sys.stderr.flush()
            #end if

            # NOTE: the check on canonical chromosomes is currently
            #   disabled, every variant counts as analyzed
            analyzed += 1

            # Get allele frequency from aftag tag if requested
            if is_afthr:
                af = allele_frequency(vnt_obj, aftag, aftag_idx)
                # Check allele frequency
                if af > afthr:
                    continue
                #end if
            #end if

            if is_bigfile:
                vtype = variant_type(vnt_obj.REF, vnt_obj.ALT)
                # key is built outside the try so that it is always
                #   defined when formatting the error message
                key = vnt_obj.CHROM + '_' + vtype
                try:
                    is_blacklist = big_dict[key][vnt_obj.POS]
                except Exception:
                    sys.exit(
                        '\nERROR in blacklist check: {0}:{1} missing in BIG file'.
                        format(key, vnt_obj.POS))
                #end try
                if is_blacklist:
                    continue
                #end if
            #end if

            # All good, pass and write variant
            vcf_obj.write_variant(fo, vnt_obj)
        #end for
    #end with

    sys.stderr.write('\n\n...Wrote results for ' + str(analyzed) +
                     ' analyzed variants out of ' + str(total) +
                     ' total variants\n')
    sys.stderr.flush()
Example #10
0
def main(args, test=False):
    ''' Score the variants of a trio with PP_calc and write the passed ones.

    For every variant with allele frequency not above afthr, PP_calc is
    run on the trio and unrelated files. PP is set to 0 for chromosomes
    in the NA set (flagged as NA) and when ALT_count_check_parents is
    true for the ADthr threshold. Variants passing the filters on
    AF_unrel and PP are collected and written sorted by decreasing PP,
    with the novoPP INFO tag (skipped for NA variants) and the RSTR
    FORMAT values for trio and unrelated samples. Unrelated samples
    with an ID missing in the VCF are appended as extra columns.

    args is a dict-like object providing the keys:
        inputfile, outputfile, unrelatedfiles, triofiles, bam,
        afthr, aftag, ppthr, afthr_unrelated, MQthr, BQthr, ADthr, verbose
    test selects test_NA_chroms instead of real_NA_chroms.

    Exits through sys.exit when afthr is given without aftag, when the
    trio does not have exactly 3 files, or when a trio ID is missing
    in the VCF.
    '''
    # Variables
    is_bam = True if args['bam'] else False
    is_afthr = True if args['afthr'] else False
    afthr, aftag, aftag_idx = 1., 'novoAF', 0 # novoAF as aftag placeholder if not is_afthr
    ppthr = float(args['ppthr']) if args['ppthr'] else 0.
    afthr_unrelated = float(args['afthr_unrelated']) if args['afthr_unrelated'] else 1.
    MQthr = int(args['MQthr']) if args['MQthr'] else 0
    BQthr = int(args['BQthr']) if args['BQthr'] else 0
    ADthr = int(args['ADthr']) if args['ADthr'] else 0
    RSTR_def = '##FORMAT=<ID=RSTR,Number=4,Type=Integer,Description="Read counts by strand for ref and alt alleles (Rf,Af,Rr,Ar)">'
    novoCaller_def = '##INFO=<ID=novoPP,Number=1,Type=Float,Description="Posterior probability from novoCaller">'
    # NA chromosomes set -> import from shared_vars
    NA_chroms = test_NA_chroms if test else real_NA_chroms
    is_NA = False
    is_verbose = True if args['verbose'] else False

    # Data structures
    variants_passed = []
    # Defined before the try so that the cleanup in finally
    #   can always iterate over them
    unrelated_files, trio_files = [], []

    # Counters, initialized before the loop so that the final report
    #   works also when the input has no variants
    analyzed, total = 0, 0

    # Buffers
    fo = open(args['outputfile'], 'w')

    try:
        # Creating Vcf object
        vcf_obj = vcf_parser.Vcf(args['inputfile'])

        # Check arguments
        if is_afthr:
            afthr = float(args['afthr'])
            if args['aftag']:
                aftag, aftag_idx = vcf_obj.header.check_tag_definition(args['aftag'])
            else:
                sys.exit('\nERROR in parsing arguments: to filter by population allele frequency please specify the TAG to use\n')
            #end if
        #end if

        # Getting files and associated IDs
        sys.stderr.write('Getting unrelated and trio files...\n')
        sys.stderr.flush()

        if is_bam: # if bam files
            unrelated_files, IDs_unrelated = buffering_bams(args['unrelatedfiles'])
            trio_files, IDs_trio = buffering_bams(args['triofiles']) # [parent, parent, child]
        else:
            unrelated_files, IDs_unrelated = buffering_rcks(args['unrelatedfiles'])
            trio_files, IDs_trio = buffering_rcks(args['triofiles']) # [parent, parent, child]
        #end if

        # Checking info files for trio is complete
        if len(trio_files) != 3:
            sys.exit('\nERROR in BAMs info file for trio: missing information for some family member\n')
        #end if

        # Checking information for trio is complete in the vcf
        for ID in IDs_trio:
            if ID not in vcf_obj.header.IDs_genotypes:
                sys.exit('\nERROR in VCF file: missing information for some family member\n')
            #end if
        #end for

        # Reading variants
        for total, vnt_obj in enumerate(vcf_obj.parse_variants(), 1):
            if is_verbose:
                sys.stderr.write('\rAnalyzing variant... ' + str(total))
                sys.stderr.flush()
            #end if

            # NOTE: the check on canonical chromosomes is currently disabled

            # Getting allele frequency from novoAF tag
            af = allele_frequency(vnt_obj, aftag, aftag_idx)

            # hard filter on allele frequency
            if af > afthr:
                continue
            #end if

            # is_NA reset
            is_NA = False

            # Calculate statistics
            analyzed += 1
            PP, ADfs, ADrs, ADfs_U, ADrs_U, _, _, _, AF_unrel = \
                PP_calc(trio_files, unrelated_files, vnt_obj.CHROM, vnt_obj.POS, vnt_obj.REF, vnt_obj.ALT, af, MQthr, BQthr, is_bam)
            if vnt_obj.CHROM.replace('chr', '') in NA_chroms:
            # model assumptions does not apply to sex and mitochondrial chromosomes, PP -> NA
                PP = 0.
                is_NA = True
            #end if
            if ADthr and ALT_count_check_parents(ADfs, ADrs, ADthr):
            # AD in parents over ADthr, PP -> 0
                PP = 0.
                is_NA = False
            #end if
            if AF_unrel <= afthr_unrelated and PP >= ppthr: # hard filter on AF_unrel, PP
                variants_passed.append([PP, ADfs, ADrs, ADfs_U, ADrs_U, AF_unrel, is_NA, vnt_obj])
            #end if
        #end for

        # Writing output
        sys.stderr.write('\n\n...Writing results for ' + str(analyzed) + ' analyzed variants out of ' + str(total) + ' total variants\n')
        sys.stderr.flush()

        # Header definitions
        is_RSTR = 'RSTR' in vcf_obj.header.definitions
        is_novoCaller = 'novoPP' in vcf_obj.header.definitions

        if not is_RSTR:
            vcf_obj.header.add_tag_definition(RSTR_def, 'FORMAT')
        #end if
        if not is_novoCaller:
            vcf_obj.header.add_tag_definition(novoCaller_def, 'INFO')
        #end if
        vcf_obj.write_definitions(fo)

        # Adding to header columns unrelated samples missing IDs
        fo.write(vcf_obj.header.columns.rstrip())
        for ID in IDs_unrelated:
            if ID not in vcf_obj.header.IDs_genotypes:
                fo.write('\t' + ID)
            #end if
        #end for
        fo.write('\n')

        # Variants passed, higher PP first
        for variant in sorted(variants_passed, key=lambda x: x[0], reverse=True):
            PP, ADfs, ADrs, ADfs_U, ADrs_U, AF_unrel, is_NA, vnt_obj = variant

            # Removing older tags fields if present
            if is_RSTR:
                vnt_obj.remove_tag_genotype('RSTR')
            #end if
            if is_novoCaller:
                vnt_obj.remove_tag_info('novoPP')
            #end if

            # Adding new tag
            if not is_NA:
                vnt_obj.add_tag_info('novoPP={0}'.format(PP))
            #end if

            # Fill the trailing fields dropped in genotypes
            vnt_obj.complete_genotype()

            # Updating genotypes trio
            for idx, ID in enumerate(IDs_trio):
                values = '{0},{1},{2},{3}'.format(int(ADfs[idx][0]), int(ADfs[idx][1]), int(ADrs[idx][0]), int(ADrs[idx][1]))
                vnt_obj.add_values_genotype(ID, values)
            #end for

            # Updating genotypes unrelated
            unrelated_genotypes = []
            for idx, ID in enumerate(IDs_unrelated):
                values = '{0},{1},{2},{3}'.format(int(ADfs_U[idx][0]), int(ADfs_U[idx][1]), int(ADrs_U[idx][0]), int(ADrs_U[idx][1]))
                if ID in vnt_obj.GENOTYPES:
                    vnt_obj.add_values_genotype(ID, values)
                else:
                    unrelated_genotypes.append(vnt_obj.empty_genotype() + ':' + values)
                #end if
            #end for

            # Updating FORMAT
            vnt_obj.add_tag_format('RSTR')

            # Writing output
            if unrelated_genotypes:
                fo.write(vnt_obj.to_string().rstrip() + '\t' + '\t'.join(unrelated_genotypes) + '\n')
            else:
                vcf_obj.write_variant(fo, vnt_obj)
            #end if
        #end for
    finally:
        # Closing files buffers, also on errors or sys.exit
        fo.close()
        if is_bam:
            for handle in unrelated_files:
                handle.close()
            #end for
            for handle in trio_files:
                handle.close()
            #end for
        #end if
    #end try
Example #11
0
def main(args):
    ''' Clean INFO tags and VEP annotations of the variants in a VCF.

    The tags listed in args['tag'] are removed from the INFO definitions
    in the header and from every variant. With the VEP flag the VEP tag
    of each variant is replaced by the value returned by clean_VEP; the
    VEPSpliceAI terms are added to the rescue terms for the variants
    that pass check_spliceAI. With the filter_VEP flag the variants
    left without VEP annotations are not written.

    args is a dict-like object providing the keys:
        inputfile, outputfile, tag, VEP, filter_VEP, VEPtag, VEPsep,
        VEPrescue, VEPremove, SpliceAI, SpliceAItag, verbose
    VEPtag falls back to 'CSQ' and VEPsep to '&' when empty.

    Exits through sys.exit when VEPrescue or VEPremove are given
    without the VEP flag.
    '''
    # Variables
    VEPrescue, consequence_idx = set(), 0
    # VEPremove = {...} -> import from shared_vars
    # VEPSpliceAI = {...} -> import from shared_vars
    # DStags = {...} -> import from shared_vars
    # Local terms to remove, VEPremove from shared_vars is copied
    #   and not updated in place to avoid terms from this call
    #   leaking into the shared set
    VEPremove_ = set()
    # Rescue terms extended with VEPSpliceAI, calculated only once
    VEPrescue_SpAI = set()
    SpAItag_list, SpAI_idx_list = [], []
    is_VEP = True if args['VEP'] else False
    is_filter_VEP = True if args['filter_VEP'] else False
    VEPtag = args['VEPtag'] if args['VEPtag'] else 'CSQ'
    VEPsep = args['VEPsep'] if args['VEPsep'] else '&'
    SpliceAI_thr = float(args['SpliceAI']) if args['SpliceAI'] else 0.
    SpliceAItag = args['SpliceAItag']  # default None
    is_SpAI = False
    is_verbose = True if args['verbose'] else False

    # Counters, initialized before the loop so that the final report
    #   works also when the input has no variants
    analyzed, total = 0, 0

    # Buffers, the context manager closes the output also on errors
    #   and when exiting through sys.exit
    with open(args['outputfile'], 'w') as fo:
        # Creating Vcf object
        vcf_obj = vcf_parser.Vcf(args['inputfile'])

        # Clean header definitions in INFO block for specified tags
        if args['tag']:
            for tag in args['tag']:
                vcf_obj.header.remove_tag_definition(tag, 'INFO')
            #end for
        #end if

        # Writing header
        vcf_obj.write_header(fo)

        # VEP
        if is_VEP:
            consequence_idx = vcf_obj.header.get_tag_field_idx(
                VEPtag, 'Consequence')
            if args['VEPrescue']: VEPrescue = set(args['VEPrescue'])
            #end if
            VEPremove_ = set(VEPremove)
            if args['VEPremove']:
                VEPremove_.update(args['VEPremove'])
            #end if
            VEPrescue_SpAI = VEPrescue.union(VEPSpliceAI)
        elif args['VEPrescue'] or args['VEPremove']:
            sys.exit(
                '\nERROR in parsing arguments: specify the flag "--VEP" to filter by VEP annotations to apply rescue terms or remove additional terms\n'
            )
        #end if

        # SpliceAI
        if SpliceAI_thr:
            if SpliceAItag:  # single tag has been specified
                tag, idx = vcf_obj.header.check_tag_definition(SpliceAItag)
                SpAItag_list.append(tag)
                SpAI_idx_list.append(idx)
            else:  # search for delta scores as default
                for DStag in DStags:
                    tag, idx = vcf_obj.header.check_tag_definition(DStag)
                    SpAItag_list.append(tag)
                    SpAI_idx_list.append(idx)
                #end for
            #end if
        #end if

        # Reading variants and writing passed
        for total, vnt_obj in enumerate(vcf_obj.parse_variants(), 1):
            if is_verbose:
                sys.stderr.write('\rAnalyzing variant... ' + str(total))
                sys.stderr.flush()
            #end if

            # NOTE: the check on canonical chromosomes is currently
            #   disabled, every variant counts as analyzed
            analyzed += 1

            # Clean tags if specified
            if args['tag']:
                for tag in args['tag']:
                    vnt_obj.remove_tag_info(tag)
                #end for
            #end if

            # is_SpAI reset
            is_SpAI = False

            # Check SpliceAI
            if SpliceAI_thr:
                if check_spliceAI(vnt_obj, SpAI_idx_list, SpAItag_list,
                                  SpliceAI_thr):
                    is_SpAI = True
                #end if
            #end if

            # Clean VEP
            if is_VEP:
                # Get cleaned VEP, rescue also VEPSpliceAI terms if is_SpAI
                rescue_terms = VEPrescue_SpAI if is_SpAI else VEPrescue
                VEP_clean = clean_VEP(vnt_obj, consequence_idx, VEPremove_,
                                      rescue_terms, VEPtag, VEPsep)
                # Remove old VEP
                vnt_obj.remove_tag_info(VEPtag)
                # Add cleaned VEP if any
                if VEP_clean:
                    vnt_obj.add_tag_info('{0}={1}'.format(VEPtag, VEP_clean))
                else:  # check if is filter_VEP
                    if is_filter_VEP:
                        continue
                    #end if
                #end if
            #end if

            # Write variant
            vcf_obj.write_variant(fo, vnt_obj)
        #end for
    #end with

    sys.stderr.write('\n\n...Wrote results for ' + str(analyzed) +
                     ' analyzed variants out of ' + str(total) +
                     ' total variants\n')
    sys.stderr.flush()
Example #12
0
def main(args):
    ''' Whitelist the variants of a VCF by BED, VEP, SpliceAI or CLINVAR.

    The enabled checks are applied in order (BED positions, check_VEP,
    check_spliceAI, check_CLINVAR) and a variant is written to the
    output file as soon as one of them passes. Variants passing none
    of the enabled checks are not written.

    args is a dict-like object providing the keys:
        inputfile, outputfile, VEP, VEPtag, VEPsep, VEPrescue, VEPremove,
        CLINVAR, CLINVARtag, CLINVARonly, SpliceAI, SpliceAItag,
        BEDfile, verbose
    VEPtag falls back to 'CSQ', CLINVARtag to 'ALLELEID' and VEPsep
    to '&' when empty.

    Exits through sys.exit when VEPrescue or VEPremove are given
    without the VEP flag, or CLINVARonly without the CLINVAR flag.
    '''
    # Variables
    VEPrescue, consequence_idx = set(), 0
    # VEPremove = {...} -> import from shared_vars
    # DStags = {...} -> import from shared_vars
    # Local terms to remove, VEPremove from shared_vars is copied
    #   and not updated in place to avoid terms from this call
    #   leaking into the shared set
    VEPremove_ = set()
    CLINVARonly = set()
    CLNtag, CLN_idx = '', 0
    CLNSIGtag, CLNSIG_idx = '', 0
    SpAItag_list, SpAI_idx_list = [], []
    BED_bitarrays = {}
    is_VEP = True if args['VEP'] else False
    is_CLINVAR = True if args['CLINVAR'] else False
    SpliceAI_thr = float(args['SpliceAI']) if args['SpliceAI'] else 0.
    is_BEDfile = True if args['BEDfile'] else False
    VEPtag = args['VEPtag'] if args['VEPtag'] else 'CSQ'
    CLINVARtag = args['CLINVARtag'] if args['CLINVARtag'] else 'ALLELEID'
    SpliceAItag = args['SpliceAItag']  # default None
    VEPsep = args['VEPsep'] if args['VEPsep'] else '&'
    is_verbose = True if args['verbose'] else False

    # Counters, initialized before the loop so that the final report
    #   works also when the input has no variants
    analyzed, total = 0, 0

    # Buffers, the context manager closes the output also on errors
    #   and when exiting through sys.exit
    with open(args['outputfile'], 'w') as fo:
        # Creating Vcf object
        vcf_obj = vcf_parser.Vcf(args['inputfile'])

        # Writing header
        vcf_obj.write_header(fo)

        # VEP
        if is_VEP:
            consequence_idx = vcf_obj.header.get_tag_field_idx(
                VEPtag, 'Consequence')
            if args['VEPrescue']: VEPrescue = set(args['VEPrescue'])
            #end if
            VEPremove_ = set(VEPremove)
            if args['VEPremove']:
                VEPremove_.update(args['VEPremove'])
            #end if
        elif args['VEPrescue'] or args['VEPremove']:
            sys.exit(
                '\nERROR in parsing arguments: specify the flag "--VEP" to filter by VEP annotations to apply rescue terms or remove additional terms\n'
            )
        #end if

        #CLINVAR
        if is_CLINVAR:
            CLNtag, CLN_idx = vcf_obj.header.check_tag_definition(CLINVARtag)
            if args['CLINVARonly']:
                CLINVARonly = set(args['CLINVARonly'])
                CLNSIGtag, CLNSIG_idx = vcf_obj.header.check_tag_definition(
                    'CLNSIG')
            #end if
        elif args['CLINVARonly']:
            sys.exit(
                '\nERROR in parsing arguments: specify the flag "--CLINVAR" to filter by CLINVAR annotations to specify tags or keywords to whitelist\n'
            )
        #end if

        # SpliceAI
        if SpliceAI_thr:
            if SpliceAItag:  # single tag has been specified
                tag, idx = vcf_obj.header.check_tag_definition(SpliceAItag)
                SpAItag_list.append(tag)
                SpAI_idx_list.append(idx)
            else:  # search for delta scores as default
                for DStag in DStags:
                    tag, idx = vcf_obj.header.check_tag_definition(DStag)
                    SpAItag_list.append(tag)
                    SpAI_idx_list.append(idx)
                #end for
            #end if
        #end if

        # BED
        if is_BEDfile:
            BED_bitarrays = bed_to_bitarray(args['BEDfile'])
        #end if

        # Reading variants and writing passed
        for total, vnt_obj in enumerate(vcf_obj.parse_variants(), 1):
            if is_verbose:
                sys.stderr.write('\rAnalyzing variant... ' + str(total))
                sys.stderr.flush()
            #end if

            # NOTE: the check on canonical chromosomes is currently
            #   disabled, every variant counts as analyzed
            analyzed += 1

            # Check BED
            if is_BEDfile:
                # CHROM and POS can miss in the BED file, if that just pass to next checks.
                #   Only the lookup is guarded, errors while writing
                #   the variant must not be silenced
                try:
                    is_in_BED = BED_bitarrays[vnt_obj.CHROM][vnt_obj.POS]
                except Exception:
                    is_in_BED = False
                #end try
                if is_in_BED:
                    vcf_obj.write_variant(fo, vnt_obj)
                    continue
                #end if
            #end if

            # Check VEP
            if is_VEP:
                if check_VEP(vnt_obj, consequence_idx, VEPremove_, VEPrescue,
                             VEPtag, VEPsep):
                    vcf_obj.write_variant(fo, vnt_obj)
                    continue
                #end if
            #end if

            # Check SpliceAI
            if SpliceAI_thr:
                if check_spliceAI(vnt_obj, SpAI_idx_list, SpAItag_list,
                                  SpliceAI_thr):
                    vcf_obj.write_variant(fo, vnt_obj)
                    continue
                #end if
            #end if

            # Check CLINVAR
            if is_CLINVAR:
                if check_CLINVAR(vnt_obj, CLN_idx, CLNtag, CLNSIG_idx, CLNSIGtag,
                                 CLINVARonly):
                    vcf_obj.write_variant(fo, vnt_obj)
                    continue
                #end if
            #end if
        #end for
    #end with

    sys.stderr.write('\n\n...Wrote results for ' + str(analyzed) +
                     ' analyzed variants out of ' + str(total) +
                     ' total variants\n')
    sys.stderr.flush()