Example #1
0
def test_add_flag():
    """Check that a Flag INFO field round-trips through ``Writer``.

    A flag set to ``True`` must be readable from the written file; a flag
    set to ``False`` must be absent, so looking it up raises ``KeyError``.
    """
    vcf = VCF(VCF_PATH)
    vcf.add_info_to_header({
        'ID': 'myflag',
        'Description': 'myflag',
        'Type': 'Flag',
        'Number': '0'
    })
    # NOTE that we have to add the info to the header of the reader,
    # not the writer because the record will be associated with the reader
    f = tempfile.mktemp(suffix=".vcf")
    atexit.register(os.unlink, f)
    w = Writer(f, vcf)
    rec = next(vcf)

    rec.INFO["myflag"] = True
    w.write_record(rec)
    w.close()

    v = next(VCF(f))
    assert v.INFO["myflag"] is True, dict(v.INFO)

    f = tempfile.mktemp(suffix=".vcf")
    atexit.register(os.unlink, f)
    w = Writer(f, vcf)
    rec.INFO["myflag"] = False
    w.write_record(rec)
    # Close the second writer too (as done for the first one above) so the
    # record is flushed to disk before the file is opened for reading.
    w.close()
    v = next(VCF(f))
    assert_raises(KeyError, v.INFO.__getitem__, "myflag")
Example #2
0
def filter_vcf(vcf,
               output,
               minlength=0,
               truncate_svlen=float("inf"),
               suffix=""):
    """Filter a VCF on SV length and truncate overly long records.

    Records whose length, as returned by ``get_svlen``, is below
    ``minlength`` are dropped.  Records longer than ``truncate_svlen`` are
    rewritten with ``SVLEN=1``, ``END=start + 1`` and the ``TRUNCATED`` flag
    before being written.  Counts of truncated and filtered records are
    reported on stderr when non-zero.

    Args:
        vcf: Path of the input VCF.
        output: Path of the output VCF.  When falsy, it is derived from
            ``vcf`` by replacing ``".vcf"`` with ``"_<suffix>.vcf"``.
        minlength: Minimum SV length for a record to be kept.
        truncate_svlen: SV length above which a record is truncated.
        suffix: Suffix used when deriving the output path.
    """
    vcf_in = VCF(vcf)
    if not output:
        output = vcf.replace(".vcf", "_{}.vcf".format(suffix))
    vcf_in.add_info_to_header({
        'ID': 'TRUNCATED',
        'Description': "SVLEN truncated",
        'Type': 'Flag',
        'Number': '0'
    })
    vcf_out = Writer(output, vcf_in)
    records_truncated = 0
    records_filtered = 0
    try:
        for v in vcf_in:
            svlen = get_svlen(v)
            if svlen < minlength:
                records_filtered += 1
                continue
            if svlen > truncate_svlen:
                v.INFO['SVLEN'] = 1
                v.INFO['END'] = v.start + 1
                v.INFO['TRUNCATED'] = True
                records_truncated += 1
            vcf_out.write_record(v)
    finally:
        # Always release both handles so buffered output reaches the file
        # even when processing stops early.
        vcf_out.close()
        vcf_in.close()
    if records_truncated != 0:
        sys.stderr.write("Truncated {} records where SVLEN > {}\n".format(
            records_truncated, int(truncate_svlen)))
    if records_filtered != 0:
        sys.stderr.write("Filtered {} records where SVLEN < {}\n".format(
            records_filtered, int(minlength)))
def filter_annotate_calls():
    """Print the retained, confidence-annotated calls for a set of regions.

    Command-line options (each is a path prefix; the extension is appended
    here): ``--alignments`` (``.bam``), ``--regions`` (``.bed.gz``),
    ``--calls`` (``.vcf.gz``) and ``--parameters`` (``.json``).

    The VCF header, extended with a ``Confidence`` INFO field, is printed
    first.  Then, for every line of the regions file, each variant the VCF
    yields for that region is passed to ``retainCall_reportConfidence``;
    variants it reports as retained are printed through ``annotate``.
    """
    parser = argparse.ArgumentParser(description='')
    parser.add_argument('--alignments', type=str, help='')
    parser.add_argument('--regions', type=str, help='')
    parser.add_argument('--calls', type=str, help='')
    parser.add_argument('--parameters', type=str, help='')
    args = parser.parse_args()

    import json
    # Use a context manager so the parameters file is closed right after it
    # has been parsed instead of being left to the garbage collector.
    with open('{}.json'.format(args.parameters)) as parameters_file:
        parameters = json.load(parameters_file)

    vcf = VCF(args.calls + '.vcf.gz')
    vcf.add_info_to_header({
        'ID': 'Confidence',
        'Description':
        'Measure of confidence in call based upon unitig structure',
        'Type': 'String',
        'Number': '1'
    })
    with pysam.AlignmentFile(args.alignments + '.bam',
                             'rb') as unitigs, gzip.open(
                                 args.regions + '.bed.gz', 'rt') as regions:
        print(vcf.raw_header, end='')
        for region in regions:
            chromosome, start, end = region.strip().split('\t')
            # From here on ``region`` is the "chrom:start-end" query string.
            region = '{}:{}-{}'.format(chromosome, start, end)
            for variant in vcf(region):
                retain_call, call_confidence = retainCall_reportConfidence(
                    unitigs, variant, region, parameters)
                if retain_call:
                    print(annotate(variant, call_confidence), end='')
    vcf.close()
Example #4
0
File: cli.py Project: dnil/stranger
def cli(context, vcf, repeats_file, loglevel):
    """Annotate str variants with str status"""
    coloredlogs.install(level=loglevel)

    # Parse the repeat definitions; without them there is nothing to
    # annotate, so the command aborts.
    with open(repeats_file, 'r') as repeats_handle:
        repeat_information = parse_repeat_file(repeats_handle)

    if not repeat_information:
        LOG.warning("Could not find any repeat info")
        context.abort()

    info_key = 'STR_STATUS'
    status_header = {
        "ID": info_key,
        "Number": 'A',
        "Type": "String",
        "Description":
        "Repeat expansion status. Alternatives in ['normal', 'pre_mutation', 'full_mutation']",
    }

    vcf_obj = VCF(vcf)
    vcf_obj.add_info_to_header(status_header)

    print_headers(vcf_obj)

    # Echo every variant, adding the status only when one was found for it.
    for var in vcf_obj:
        status = get_repeat_info(var, repeat_information)
        if status:
            var.INFO[info_key] = status
        click.echo(str(var).rstrip())
Example #5
0
def annotate_allelic_balance(vcffile, region):
    """Write the variants of ``region`` with allele-balance annotations.

    Two Float INFO fields, ``HetAB`` and ``HetHomAltAB``, are declared in
    the header.  Every variant of ``region`` is written to the ``'-'``
    target (presumably stdout; confirm against the ``Writer`` docs).  Those
    for which ``is_biallelic`` is true are first passed through
    ``update_variant`` with the values from ``compute_allelic_balances``.
    A summary of annotated versus total sites is logged at the end.
    """
    vcf = VCF(vcffile)

    # (ID, Description) of the INFO fields to declare, in header order.
    new_info_fields = (
        ('HetAB', 'heterozygous genotype allele balance'),
        ('HetHomAltAB',
         'heterozygous + homozygous ALT genotype allele balance'),
    )
    for field_id, description in new_info_fields:
        vcf.add_info_to_header({
            'ID': field_id,
            'Description': description,
            'Type': 'Float',
            'Number': '1'
        })

    out = Writer('-', vcf)
    total_sites = 0
    noted_sites = 0

    for variant in vcf(region):
        total_sites += 1
        if is_biallelic(variant):
            noted_sites += 1
            hetab, het_hom_alt_ab = compute_allelic_balances(variant)
            variant = update_variant(variant, hetab, het_hom_alt_ab)
        out.write_record(variant)

    out.close()
    log("Annotated {} out of a possible {} sites".format(
        noted_sites, total_sites))
Example #6
0
def main():
    """Write the records of ``args.vcf`` that are indels longer than 49 bp.

    For every record, the longest ALT allele is compared with REF.  When
    the length difference exceeds 49, ``SVLEN`` (longest ALT minus REF) and
    ``SVTYPE`` (``INS`` for a positive length, ``DEL`` otherwise) are set
    and the record is written to ``args.output``.  Other records are
    dropped.
    """
    args = get_args()
    vcf_in = VCF(args.vcf)
    vcf_in.add_info_to_header({
        'ID': 'SVLEN',
        'Description': 'length of sv',
        'Type': 'Integer',
        'Number': '1'
    })
    vcf_in.add_info_to_header({
        'ID': 'SVTYPE',
        'Description': 'type of sv - just DEL or INS based on SVLEN',
        'Type': 'String',
        'Number': '1'
    })
    vcf_out = Writer(args.output, vcf_in)
    for v in vcf_in:
        if not v.ALT:
            # A record without ALT alleles cannot be an indel, and calling
            # max() on the empty list would raise ValueError.
            continue
        # Compute the signed length once; it is used for the size test,
        # for SVLEN and for SVTYPE.
        svlen = max(len(alt) for alt in v.ALT) - len(v.REF)
        if abs(svlen) > 49:
            v.INFO["SVLEN"] = svlen
            v.INFO["SVTYPE"] = "INS" if svlen > 0 else "DEL"
            vcf_out.write_record(v)
    vcf_in.close()
    vcf_out.close()
Example #7
0
def merge(in_vcf, cadd_tsv):
    """Write ``in_vcf`` with CADD annotations merged in from ``cadd_tsv``.

    The INFO headers from ``annotation_info_headers`` are declared, every
    variant is passed through ``update_variant`` and written to the ``'-'``
    target.  The keys returned by ``update_variant`` are then handed,
    together with the CADD annotation keys, to
    ``ensure_cadd_completed_successfully``.
    """
    new_headers = annotation_info_headers()

    log("Collecting the CADD annotation information")
    cadd_annotations = create_CADD_annotation_dictionary(cadd_tsv)

    log("Processing the build37 vcf")
    vcf = VCF(in_vcf)
    for header in new_headers:
        vcf.add_info_to_header(header)

    out = Writer('-', vcf)

    # Keys of all variants seen in the input VCF, for the completeness check.
    seen_keys = set()
    for record in vcf:
        record, record_key = update_variant(record, cadd_annotations)
        seen_keys.add(record_key)
        out.write_record(record)

    out.close()

    log("Checking whether CADD completed correctly")
    annotated_keys = frozenset(cadd_annotations.keys())
    ensure_cadd_completed_successfully(in_vcf, cadd_tsv, seen_keys,
                                       annotated_keys)

    log("All Done!")
Example #8
0
def test_add_info_to_header():
    """Check that an INFO field added to the header round-trips on write.

    The field ``abcdefg`` is declared on the reader, set on the first
    record, written out, and read back from the new file.
    """
    reader = VCF(VCF_PATH)
    reader.add_info_to_header({
        'ID': 'abcdefg',
        'Description': 'abcdefg',
        'Type': 'Character',
        'Number': '1'
    })
    # NOTE that we have to add the info to the header of the reader,
    # not the writer because the record will be associated with the reader
    f = tempfile.mktemp(suffix=".vcf")
    atexit.register(os.unlink, f)
    w = Writer(f, reader)
    rec = next(reader)

    rec.INFO["abcdefg"] = "XXX"
    w.write_record(rec)
    w.close()

    v = next(VCF(f))
    ret = v.INFO["abcdefg"]
    # The value may come back as bytes; normalise before comparing.
    if isinstance(ret, bytes):
        ret = ret.decode()
    assert ret == "XXX", (dict(v.INFO), v.INFO["abcdefg"])
Example #9
0
def mark_missing_sites(vcffile, region, missing_threshold, soft_filter):
    """Filter or flag the variants of ``region`` by site missingness.

    A ``MISSING`` FILTER and a ``MISSINGPCT`` INFO field are declared in
    the header.  Every variant is scored with ``compute_missingness``,
    judged by ``variant_missing_criteria`` and updated by
    ``update_variant``.  Variants judged ``"pass"`` are always written to
    the ``'-'`` target; variants judged ``"fail"`` are written only when
    ``soft_filter`` is truthy.  A pass/total summary is printed to stderr.
    """
    vcf = VCF(vcffile)

    filter_description = (
        'failed variant site missingness threshold ({} %)'.format(
            missing_threshold))
    vcf.add_filter_to_header({
        'ID': 'MISSING',
        'Description': filter_description
    })
    vcf.add_info_to_header({
        'ID': 'MISSINGPCT',
        'Description': 'site missingness percentage',
        'Type': 'Float',
        'Number': '1'
    })

    out = Writer('-', vcf)
    total_sites = 0
    noted_sites = 0

    for variant in vcf(region):
        total_sites += 1
        # Only the percentage is needed; the raw counts are ignored.
        missing_pct, _missing, _total = compute_missingness(variant)
        verdict = variant_missing_criteria(missing_threshold, missing_pct)
        variant = update_variant(variant, verdict, missing_pct)

        passed = verdict == "pass"
        if passed:
            noted_sites += 1
        if passed or (verdict == "fail" and soft_filter):
            out.write_record(variant)

    out.close()
    summary = "After filtering, passed {} out of a possible {} Sites ({})"
    print(summary.format(noted_sites, total_sites, 'pass'), file=sys.stderr)
Example #10
0
def test_add_flag():
    """Check how a Flag INFO field round-trips through ``Writer``.

    The flag is declared on the reader, set to ``True`` on the first
    record and written out.  It is then set to ``False`` and written
    again; in that second file the key must be absent (``KeyError``).
    """
    vcf = VCF(VCF_PATH)
    vcf.add_info_to_header({'ID': 'myflag', 'Description': 'myflag',
        'Type':'Flag', 'Number': '0'})
    # NOTE that we have to add the info to the header of the reader,
    # not the writer because the record will be associated with the reader
    f = tempfile.mktemp(suffix=".vcf")
    atexit.register(os.unlink, f)
    w = Writer(f, vcf)
    # Use the built-in next(): the ``.next()`` method only exists on
    # Python 2 iterators, and next() is already used below in this test.
    rec = next(vcf)

    rec.INFO["myflag"] = True
    w.write_record(rec)
    w.close()

    v = next(VCF(f))
    # NOTE(review): a set flag is expected to read back as None here;
    # confirm this matches the cyvcf2 version under test.
    assert v.INFO["myflag"] is None, dict(v.INFO)

    f = tempfile.mktemp(suffix=".vcf")
    atexit.register(os.unlink, f)
    w = Writer(f, vcf)
    rec.INFO["myflag"] = False
    w.write_record(rec)
    # Close the second writer too so the record is flushed to disk before
    # the file is opened for reading.
    w.close()
    v = next(VCF(f))
    assert_raises(KeyError, v.INFO.__getitem__, "myflag")
Example #11
0
def processVCF(invcf, remm, dann, out):
    """Annotate every record of ``invcf`` with ReMM and DANN values.

    Args:
        invcf: Path of the input VCF.
        remm: Path of the tabix-indexed ReMM file.  Column 2 is compared
            with the record position and column 3 is used as the value.
        dann: Path of the tabix-indexed DANN file.  Columns 2-4 are
            compared with POS, REF and the first ALT; column 5 is the score.
        out: Path of the annotated output VCF.

    Each record gets ``ReMM`` and ``DANN`` INFO values, with ``"."`` used
    when no matching row exists or a row cannot be parsed.
    """
    vcf_data = VCF(invcf, gts012=True)
    tbx_remm = pysam.TabixFile(remm)
    tbx_dann = pysam.TabixFile(dann)
    # ReMM is written below, so it must be declared in the header as well
    # (unless the input already declares it).
    if not vcf_data.contains('ReMM'):
        vcf_data.add_info_to_header({
            'ID': 'ReMM',
            'Description': 'ReMM score taken from the ReMM tabix file.',
            'Type': 'String',
            'Number': '.'
        })
    vcf_data.add_info_to_header({
        'ID': 'DANN',
        'Description':
        'A deep neural network aimed to recognize pathogenic variants by annotating genetic variants, especially in noncoding regions.',
        'Type': 'String',
        'Number': '.'
    })
    w = Writer(out, vcf_data)
    try:
        for record in vcf_data:
            try:
                for row in tbx_remm.fetch(record.CHROM, record.start,
                                          record.end):
                    fields = str(row).split()
                    if int(fields[1]) == record.POS:
                        record.INFO["ReMM"] = fields[2]
                # .get() instead of [] so that a record without any
                # matching row falls back to "." rather than raising
                # KeyError.
                if not record.INFO.get("ReMM"):
                    record.INFO["ReMM"] = "."
            except ValueError:
                record.INFO["ReMM"] = "."

            try:
                # Default to "." so DANN is always set, including when the
                # tabix query returns no rows at all.
                dann_score = "."
                for row in tbx_dann.fetch(record.CHROM, record.start,
                                          record.end):
                    fields = row.split()
                    if (int(fields[1]) == record.POS
                            and fields[2] == record.REF
                            and fields[3] == record.ALT[0]):
                        dann_score = round(float(fields[4]), 3)
                        break
                record.INFO["DANN"] = dann_score
            except ValueError:
                record.INFO["DANN"] = "."

            w.write_record(record)
    finally:
        # Release every handle so the output is flushed and no file
        # descriptor is leaked, even if annotation stops early.
        w.close()
        vcf_data.close()
        tbx_remm.close()
        tbx_dann.close()
Example #12
0
    def setUp(self):
        """Load the variants of ``test.vcf`` (located next to this file).

        A ``refilter.Filter`` is built and its FILTER and INFO headers are
        registered on the reader before the variants are read into
        ``self.variants``.
        """
        here = os.path.dirname(os.path.abspath(__file__))
        vcf_path = os.path.join(here, "test.vcf")
        reader = VCF(vcf_path)

        self.test_filter = refilter.Filter(
            0.3, 0.7, 'AB', 'VAR_DP', 5, ['MISSING'], ['DB'])
        reader.add_filter_to_header(self.test_filter.filtered_header())
        reader.add_info_to_header(self.test_filter.rescued_header())

        # Keep every variant so individual tests can pick the ones they need.
        self.variants = list(reader)
Example #13
0
def main():
    """Aggregate variants from several VCFs and print each one.

    For every VCF in ``opt.vcfs`` two INFO fields (``blaha`` and
    ``variant_callers``) are declared and the variant caller is determined
    with ``which_variantcaller``.  The first variant seen for a given
    CHROM/POS/REF/ALT key is stored; its ``variant_callers`` INFO value is
    set if it has none yet, otherwise ``blaha`` is set.  Every variant read
    is printed.

    Raises:
        NameError: If a multi-allelic record is encountered.
    """
    opt = parse_arguments()

    aggregated_variants = {}

    for vcf_fn in opt.vcfs:

        vcf_reader = VCF(vcf_fn)
        vcf_reader.add_info_to_header({
            'ID': 'blaha',
            'Description':
            'aList of variant callers which detected the variant',
            'Type': 'Character',
            'Number': '1'
        })
        vcf_reader.add_info_to_header({
            'ID': 'variant_callers',
            'Description':
            'List of variant callers which detected the variant',
            'Type': 'Character',
            'Number': '1'
        })

        variant_caller = which_variantcaller(vcf_reader)

        for var in vcf_reader:

            # Check if multi-allelic site
            if len(var.ALT) > 1:
                raise NameError('Split and normalize you variants!')

            # Save variant in aggregated_variants if it hasn't been found before
            var_id = "_".join(
                str(part)
                for part in (var.CHROM, var.POS, var.REF, var.ALT[0]))

            if not aggregated_variants.get(var_id):
                aggregated_variants[var_id] = var

            # Add variant caller information to an INFO field
            stored = aggregated_variants[var_id]
            if not stored.INFO.get("variant_callers"):
                stored.INFO["variant_callers"] = variant_caller
            else:
                print("INTHERE")
                stored.INFO["blaha"] = "TEST"

            # print() call rather than the Python 2 print statement, which
            # is a SyntaxError on Python 3.
            print(var)
Example #14
0
def main(min_allele_balance, max_allele_balance, allele_balance_tag,
         variant_sample_depth_tag, min_depth, exclude_filters, exclude_fields,
         vcf):
    """Re-filter every variant of ``vcf`` and write it to the ``'-'`` target.

    All arguments except ``vcf`` are forwarded, in order, to ``Filter``.
    The filter's FILTER and INFO headers are registered on the reader, the
    filter is applied to each variant, and every variant is written out.
    """
    reader = VCF(vcf)
    refilter = Filter(min_allele_balance, max_allele_balance,
                      allele_balance_tag, variant_sample_depth_tag, min_depth,
                      exclude_filters, exclude_fields)
    reader.add_filter_to_header(refilter.filtered_header())
    reader.add_info_to_header(refilter.rescued_header())
    writer = Writer('-', reader)

    for variant in reader:
        refilter(variant)  # Modifies variant filter status in place
        writer.write_record(variant)

    # Close explicitly so all output is flushed before returning.
    writer.close()
def add_info_headers(vcf: VCF):
    """Declare the ``LOCI_ID`` and ``LOCI_POS_ID`` INFO fields on ``vcf``."""
    # (ID, Type, Description) for each field, in the order they are added.
    header_specs = (
        (LOCI_ID, "String", "CHROM (loci) in the original VCF"),
        (LOCI_POS_ID, "Integer",
         f"{LOCI_ID} (see other INFO header) POS in the original VCF"),
    )
    for field_id, field_type, description in header_specs:
        vcf.add_info_to_header({
            "ID": field_id,
            "Number": "1",
            "Type": field_type,
            "Description": description,
        })
Example #16
0
def test_add_info_to_header():
    """Check that an INFO field added to the header round-trips on write.

    The field ``abcdefg`` is declared on the reader, set on the first
    record, written out, and read back from the new file.
    """
    reader = VCF(VCF_PATH)
    reader.add_info_to_header({'ID': 'abcdefg', 'Description': 'abcdefg',
        'Type':'Character', 'Number': '1'})
    # NOTE that we have to add the info to the header of the reader,
    # not the writer because the record will be associated with the reader
    f = tempfile.mktemp(suffix=".vcf")
    atexit.register(os.unlink, f)
    w = Writer(f, reader)
    # Use the built-in next(): the ``.next()`` method only exists on
    # Python 2 iterators, and next() is already used below in this test.
    rec = next(reader)

    rec.INFO["abcdefg"] = "XXX"
    w.write_record(rec)
    w.close()

    v = next(VCF(f))
    assert v.INFO["abcdefg"] == "XXX", dict(v.INFO)
Example #17
0
def process_vcf(vcf):
    """Print ``vcf`` with per-gene SpliceAI fields added to each record.

    Five String INFO fields are declared in the header, which is then
    printed.  For each record the ``SpliceAI`` INFO value is used if
    present and truthy, otherwise ``SpliceAI_ind``.  That value is passed
    with the record to ``set_new_fields``, and the record is printed.
    """
    vcf_data = VCF(vcf, gts012=True)

    # (ID, Description) of the INFO fields to declare, in header order.
    spliceai_fields = (
        ('Gene_SpliceAI', 'Gene for which spliceAI gave the prediction.'),
        ('DS_AG', 'SpliceAI score for an acceptor gain.'),
        ('DS_AL', 'SpliceAI score for an acceptor lost.'),
        ('DS_DG', 'SpliceAI score for a donor gain.'),
        ('DS_DL', 'SpliceAI score for a donor lost.'),
    )
    for field_id, description in spliceai_fields:
        vcf_data.add_info_to_header({
            'ID': field_id,
            'Description': description,
            'Type': 'String',
            'Number': '.'
        })

    print(vcf_data.raw_header.rstrip())
    for record in vcf_data:
        snvs = record.INFO.get('SpliceAI')
        indels = record.INFO.get('SpliceAI_ind')
        # SNV predictions take precedence over indel predictions.
        prediction = snvs if snvs else indels
        if prediction:
            record = set_new_fields(record, prediction)
        print(str(record).rstrip())
    vcf_data.close()
Example #18
0
 def processVariants(self):
     """Write the header and the classified variants to ``self.outputVCF``.

     Two String INFO fields, ``Evidence_Codes`` and
     ``Posterior_Pathogenic_Probability``, are declared before the header
     is written.  For each record the ClinVar entries matching any of its
     ALT alleles are collected and a ``variant.Variant`` is built.
     Records whose ``printVariant`` is falsy are skipped; the rest are
     annotated with both fields and written out.
     """
     cyVCF = VCF(self.vcfFilePath)
     self.families.setSampleIdxs(cyVCF.samples)
     getCSQList = self.getCSQList(cyVCF.raw_header)
     cyVCF.add_info_to_header({
         "ID":
         "Evidence_Codes",
         "Number":
         "1",
         "Type":
         "String",
         "Description":
         "All ACMG evidence codes that apply to this variant"
     })
     cyVCF.add_info_to_header({
         "ID":
         "Posterior_Pathogenic_Probability",
         "Number":
         "1",
         "Type":
         "String",
         "Description":
         "Posterior Pathogenic Probability"
     })
     self.outputVCF.write(cyVCF.raw_header)
     for v in cyVCF:
         # ClinVar entries are keyed by "CHROM:POS:REF:ALT".
         matchingClinVarVariants = []
         for alt in v.ALT:
             key = "%s:%s:%s:%s" % (v.CHROM, v.POS, v.REF, alt)
             if key in self.clinVarData:
                 matchingClinVarVariants.append(self.clinVarData[key])
         var = variant.Variant(v, self.families, self.gnomAD_AF_Threshold,
                               self.REVEL_Threshold, getCSQList,
                               matchingClinVarVariants)
         if not var.printVariant:
             continue
         # Compute the posterior once and reuse it below.
         posterior = self.getPosterior(var)
         v.INFO["Evidence_Codes"] = var.getEvidenceCodesString()
         v.INFO["Posterior_Pathogenic_Probability"] = format(
             posterior, '.3f')
         self.outputVCF.write(str(v))
def parse_header_vcf(vcf_file, vep_field=None, vep_separator=None):
    """Open ``vcf_file``, declare extra INFO fields and index a VEP field.

    The INFO fields ``True_Label``, ``Source`` and ``SF`` are added to the
    header.  When ``vep_field`` is given, the ``Description`` of the header
    record whose ``ID`` equals ``vep_field`` is split on ``vep_separator``
    and each resulting element is mapped to its first position.

    Args:
        vcf_file: Path of the VCF to open.
        vep_field: ID of the header record to index, or ``None`` to skip.
        vep_separator: Separator passed to ``str.split``.

    Returns:
        A ``(vcf, index_dict)`` tuple.  ``index_dict`` is empty when
        ``vep_field`` is falsy or no header record matches.
    """
    vcf = VCF(vcf_file)
    # (ID, Description, Type) of the INFO fields to declare, in order.
    extra_fields = (
        ('True_Label', 'Pathogenic/Benign labelled variant', 'Integer'),
        ('Source', 'File source', 'String'),
        ('SF', '', 'String'),
    )
    for field_id, description, field_type in extra_fields:
        vcf.add_info_to_header({
            'ID': field_id,
            'Description': description,
            'Type': field_type,
            'Number': '1'
        })

    index_dict = dict()
    if vep_field:
        for h in vcf.header_iter():
            # Header records that lack the expected keys are skipped on
            # purpose.  ``Exception`` (not a bare except) keeps that
            # best-effort behaviour without swallowing KeyboardInterrupt
            # or SystemExit.
            try:
                info = h.info()
                if info['ID'] != vep_field:
                    continue
                csq_header = info['Description'].split(vep_separator)
            except Exception:
                continue
            # Record the first position of each element in one pass; this
            # gives the same result as list.index() without the repeated
            # scans.
            first_positions = {}
            for position, elem in enumerate(csq_header):
                first_positions.setdefault(elem, position)
            index_dict.update(first_positions)
    return vcf, index_dict
def write_new_info_fields(vcf: VCF):
    """Declare the ``LOCI_ID``, ``START_ID`` and ``END_ID`` INFO fields."""
    # (ID, Description, Type) for each field, in the order they are added.
    field_specs = (
        (LOCI_ID, "name of overlapping loci", "String"),
        (START_ID, "Loci start position; 1-based inclusive", "Integer"),
        (END_ID, "Loci end position; 1-based inclusive", "Integer"),
    )
    for field_id, description, field_type in field_specs:
        vcf.add_info_to_header(
            {
                "ID": field_id,
                "Description": description,
                "Type": field_type,
                "Number": "1",
            }
        )
def unliftover_vcf(b38_vcf, b37_vcf, annotation_type, auto_fill, update_id):
    """Write ``b38_vcf`` with annotations carried over from ``b37_vcf``.

    The INFO headers and annotation fields for ``annotation_type`` are
    looked up and the build 37 annotations are collected.  Every variant
    of the build 38 VCF is then passed through ``update_variant`` and
    written to the ``'-'`` target.
    """
    new_info_headers = annotation_type_headers(annotation_type)
    new_annotation_fields = annotation_type_info_fields(annotation_type)

    log("Collecting the build 37 vcf annotation information")
    b37_annotations = create_b37_annotation_dictionary(
        b37_vcf, annotation_type, update_id)

    log("Processing the build38 vcf")
    vcf = VCF(b38_vcf)
    for header in new_info_headers:
        vcf.add_info_to_header(header)

    out = Writer('-', vcf)
    for record in vcf:
        updated = update_variant(record, b37_annotations,
                                 new_annotation_fields, auto_fill, update_id)
        out.write_record(updated)
    out.close()

    log("All Done!")
Example #22
0
                        key: line[key]
                        for key in args.fields.split(',')
                    }
                else:
                    tsv_fields = line.copy()

                new_tags = tsv_fields.keys()

                annotations[var_id] = tsv_fields

            for tag in new_tags:
                vcf.add_info_to_header({
                    'ID':
                    tag,
                    'Description':
                    'Annotation from' + args.annotate,
                    'Type':
                    'String',
                    'Number':
                    '1'
                })

        w = Writer(args.output, vcf)
        for v in vcf:
            var_id = "_".join([v.CHROM, v.end, v.REF, ','.join(v.ALT)])
            if var_id in annotations.keys():
                for tag, value in annotations[var_id].items():
                    v.INFO[tag] = value

            if args.fields:
                out_info_fields = args.fields.split(',')
                for key, value in v.INFO:
    parser.add_argument('--output_vcf',
                        type=str,
                        default=None,
                        help='Output VCF file')
    return parser.parse_args()


if __name__ == '__main__':

    args = parse_args()

    # Reading the VCF file and adding 2 more attributes into INFO header
    data_vcf = VCF(args.vcffile)
    data_vcf.add_info_to_header({
        'ID': 'ps_filter',
        'Description': 'Mask/Caution',
        'Type': 'String',
        'Number': '1'
    })
    data_vcf.add_info_to_header({
        'ID': 'ps_exc',
        'Description': 'Reasons for mask/caution',
        'Type': 'String',
        'Number': '1'
    })

    # create a new vcf Writer using the input vcf as a template.
    fname = args.output_vcf

    w = Writer(fname, data_vcf)
    prob_vcf_columns = [
        'CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO'
Example #24
0
def read_vcf(vcf):
    """Open ``vcf`` and declare the ``VARIANTICS_HIST_VCF_HEADER`` INFO field.

    Returns the opened ``VCF`` reader.
    """
    reader = VCF(vcf)
    reader.add_info_to_header(VARIANTICS_HIST_VCF_HEADER)
    return reader
Example #25
0
                    help='Server address(default http://127.0.0.1:8080)')

# Required option: the VCF whose records are to be annotated.
parser.add_argument('--vcf',
                    dest='vcf_file_name',
                    required=True,
                    help='Name of VCF to annotate')

args = parser.parse_args()

vcf = VCF(args.vcf_file_name)

# Connect to the cmd2web server given by --host.
s = cmd2web.Client.connect(args.host)

# Declare the INFO field before the header is printed.
vcf.add_info_to_header({
    'ID': 'STIX_NONZERO',
    'Description': 'The number of samples in cohort with non-zero evidence',
    'Type': 'Integer',
    'Number': '1'
})

# Emit the (extended) header first; flush so it precedes any later output.
print(str(vcf.raw_header), end='', flush=True)

for v in vcf:
    chrom = v.CHROM
    start = v.POS
    end = v.INFO.get('END')
    svtype = v.INFO.get('SVTYPE')
    cipos = v.INFO.get('CIPOS')
    ciend = v.INFO.get('CIEND')

    # Skip records that lack any of the fields required below.
    if None in [chrom, start, end, svtype]:
        continue
Example #26
0
# Write the template VCF text into the job output directory, then open it
# so header lines can be added to it.
TEMPLATE_VCF_FILE = joboutdir / "template.vcf"
TEMPLATE_VCF_FILE.write_text(TEMPLATE_VCF)

vcf = VCF(TEMPLATE_VCF_FILE)

# Add source
vcf.add_to_header(f"##source=biopipen.ns.bed.Bed2Vcf")

# Add genome assembly
if genome:
    vcf.add_to_header(f"##reference={genome}")

# Declare the END INFO field.
vcf.add_info_to_header(
    {
        "ID": "END",
        "Number": "1",
        "Type": "Integer",
        "Description": "End position of the variant described in this record"
    }
)

# Declare the GT FORMAT field.
vcf.add_format_to_header(
    {
        "ID": "GT",
        "Number": "1",
        "Type": "String",
        "Description": "Genotype",
    }
)

# Add contigs
contigs = set()
Example #27
0
def extend_vcf_annotations(query_vcf, pcgr_db_directory, pcgr_predispose):
    """
   Function that reads VEP/vcfanno-annotated VCF and extends the VCF INFO column with tags from
   1. CSQ elements within the primary transcript consequence picked by VEP, e.g. SYMBOL, Feature, Gene, Consequence etc.
   2. Cancer-relevant gene annotations, e.g. known oncogenes/tumor suppressors, known antineoplastic drugs interacting with a given protein etc.
   3. Protein-relevant annotations, e.g. cancer hotspot mutations, functional protein features etc.
   4. Variant effect predictions
   """

    ## read VEP and PCGR tags to be appended to VCF file
    pcgr_vcf_infotags_meta = pcgrutils.read_infotag_file(
        os.path.join(pcgr_db_directory, 'pcgr_infotags.tsv'))
    if pcgr_predispose is True:
        pcgr_vcf_infotags_meta = pcgrutils.read_infotag_file(
            os.path.join(pcgr_db_directory,
                         'pcgr_infotags_predisposition.tsv'))

    out_vcf = re.sub(r'\.vcf(\.gz){0,}$', '.annotated.vcf', query_vcf)

    vep_to_pcgr_af = {
        'gnomAD_AMR_AF': 'AMR_AF_GNOMAD',
        'gnomAD_AFR_AF': 'AFR_AF_GNOMAD',
        'gnomAD_EAS_AF': 'EAS_AF_GNOMAD',
        'gnomAD_NFE_AF': 'NFE_AF_GNOMAD',
        'gnomAD_AF': 'GLOBAL_AF_GNOMAD',
        'gnomAD_SAS_AF': 'SAS_AF_GNOMAD',
        'gnomAD_OTH_AF': 'OTH_AF_GNOMAD',
        'gnomAD_ASJ_AF': 'ASJ_AF_GNOMAD',
        'gnomAD_FIN_AF': 'FIN_AF_GNOMAD',
        'AFR_AF': 'AFR_AF_1KG',
        'AMR_AF': 'AMR_AF_1KG',
        'SAS_AF': 'SAS_AF_1KG',
        'EUR_AF': 'EUR_AF_1KG',
        'EAS_AF': 'EAS_AF_1KG',
        'AF': 'GLOBAL_AF_1KG'
    }

    vcf = VCF(query_vcf)
    vep_csq_index2fields = {}
    vep_csq_fields2index = {}
    dbnsfp_prediction_algorithms = []
    effect_predictions_description = ""
    for e in vcf.header_iter():
        header_element = e.info()
        if 'ID' in header_element.keys():
            identifier = str(header_element['ID'])
            if identifier == 'CSQ' or identifier == 'DBNSFP':
                description = str(header_element['Description'])
                if 'Format: ' in description:
                    subtags = description.split('Format: ')[1].split('|')
                    if identifier == 'CSQ':
                        i = 0
                        for t in subtags:
                            v = t
                            if t in vep_to_pcgr_af:
                                v = str(vep_to_pcgr_af[t])
                            if v in pcgr_vcf_infotags_meta:
                                vep_csq_index2fields[i] = v
                                vep_csq_fields2index[v] = i
                            i = i + 1
                    if identifier == 'DBNSFP':
                        if len(subtags) > 7:
                            effect_predictions_description = "Format: " + '|'.join(
                                subtags[7:])
                        i = 7
                        while (i < len(subtags)):
                            dbnsfp_prediction_algorithms.append(
                                str(
                                    re.sub(r'((_score)|(_pred))"*$', '',
                                           subtags[i])))
                            i = i + 1

    for tag in pcgr_vcf_infotags_meta:
        if not vcf.contains(tag):
            vcf.add_info_to_header({
                'ID':
                tag,
                'Description':
                str(pcgr_vcf_infotags_meta[tag]['description']),
                'Type':
                str(pcgr_vcf_infotags_meta[tag]['type']),
                'Number':
                str(pcgr_vcf_infotags_meta[tag]['number'])
            })

    w = Writer(out_vcf, vcf)
    current_chrom = None
    num_chromosome_records_processed = 0
    pcgr_onco_xref_map = {
        'SYMBOL': 1,
        'ENTREZ_ID': 2,
        'UNIPROT_ID': 3,
        'APPRIS': 4,
        'UNIPROT_ACC': 5,
        'CHORUM_ID': 6,
        'TUMOR_SUPPRESSOR': 7,
        'ONCOGENE': 8,
        'NETWORK_CG': 9,
        'DISGENET_CUI': 10,
        'CHEMBL_COMPOUND_ID': 11,
        'INTOGEN_DRIVER': 12,
        'ONCOSCORE': 13,
        'CANCER_PREDISPOSITION': 14
    }
    for rec in vcf:
        all_transcript_consequences = []
        if current_chrom is None:
            current_chrom = str(rec.CHROM)
            num_chromosome_records_processed = 0
        else:
            if str(rec.CHROM) != current_chrom:
                logger.info(
                    'Completed summary of functional annotations for ' +
                    str(num_chromosome_records_processed) +
                    ' variants on chromosome ' + str(current_chrom))
                current_chrom = str(rec.CHROM)
                num_chromosome_records_processed = 0
        if rec.INFO.get('CSQ') is None:
            alt_allele = ','.join(rec.ALT)
            pos = rec.start + 1
            variant_id = 'g.' + str(rec.CHROM) + ':' + str(pos) + str(
                rec.REF) + '>' + alt_allele
            logger.warning(
                'Variant record ' + str(variant_id) +
                ' does not have CSQ tag from Variant Effect Predictor (vep_skip_intergenic in config set to true?)  - variant will be skipped'
            )
            continue
        pcgr_onco_xref = {}
        num_chromosome_records_processed += 1
        if not rec.INFO.get('PCGR_ONCO_XREF') is None:
            for transcript_onco_xref in rec.INFO.get('PCGR_ONCO_XREF').split(
                    ','):
                xrefs = transcript_onco_xref.split('|')
                ensembl_transcript_id = str(xrefs[0])
                pcgr_onco_xref[ensembl_transcript_id] = {}
                for annotation in pcgr_onco_xref_map.keys():
                    annotation_index = pcgr_onco_xref_map[annotation]
                    if annotation_index > (len(xrefs) - 1):
                        continue
                    if xrefs[annotation_index] != '':
                        pcgr_onco_xref[ensembl_transcript_id][
                            annotation] = xrefs[annotation_index]
        for identifier in ['CSQ', 'DBNSFP']:
            if identifier == 'CSQ':
                num_picks = 0
                for csq in rec.INFO.get(identifier).split(','):
                    csq_fields = csq.split('|')
                    if csq_fields[vep_csq_fields2index[
                            'PICK']] == "1":  ## only consider the primary/picked consequence when expanding with annotation tags
                        num_picks += 1
                        j = 0
                        ## loop over all CSQ elements and set them in the vep_info_tags dictionary (for each alt_allele)
                        while (j < len(csq_fields)):
                            if j in vep_csq_index2fields:
                                if csq_fields[j] != '':
                                    rec.INFO[vep_csq_index2fields[j]] = str(
                                        csq_fields[j])
                                    if vep_csq_index2fields[j] == 'Feature':
                                        ensembl_transcript_id = str(
                                            csq_fields[j])
                                        if ensembl_transcript_id in pcgr_onco_xref:
                                            for annotation in pcgr_onco_xref_map.keys(
                                            ):
                                                if annotation == 'CHORUM_ID' or annotation == 'UNIPROT_ACC':
                                                    continue
                                                if annotation in pcgr_onco_xref[
                                                        ensembl_transcript_id]:
                                                    if annotation == 'TUMOR_SUPPRESSOR' or annotation == 'ONCOGENE' or annotation == 'NETWORK_CG' or annotation == 'CANCER_PREDISPOSITION':
                                                        rec.INFO[
                                                            annotation] = True
                                                    else:
                                                        rec.INFO[annotation] = pcgr_onco_xref[
                                                            ensembl_transcript_id][
                                                                annotation]
                                    if vep_csq_index2fields[j] == 'DOMAINS':
                                        domain_identifiers = str(
                                            csq_fields[j]).split('&')
                                        for v in domain_identifiers:
                                            if v.startswith('Pfam_domain'):
                                                rec.INFO['PFAM_DOMAIN'] = str(
                                                    re.sub(
                                                        r'\.[0-9]{1,}$', '',
                                                        re.sub(
                                                            r'Pfam_domain:',
                                                            '', v)))

                                    if vep_csq_index2fields[
                                            j] == 'Existing_variation':
                                        var_identifiers = str(
                                            csq_fields[j]).split('&')
                                        cosmic_identifiers = []
                                        dbsnp_identifiers = []
                                        for v in var_identifiers:
                                            if v.startswith('COSM'):
                                                cosmic_identifiers.append(v)
                                            if v.startswith('rs'):
                                                dbsnp_identifiers.append(
                                                    re.sub('^rs', '', v))
                                        if len(cosmic_identifiers) > 0:
                                            rec.INFO[
                                                'COSMIC_MUTATION_ID'] = '&'.join(
                                                    cosmic_identifiers)
                                        if len(dbsnp_identifiers) > 0:
                                            rec.INFO['DBSNPRSID'] = '&'.join(
                                                dbsnp_identifiers)
                            j = j + 1
                        set_coding_change(rec)
                    symbol = '.'
                    if csq_fields[vep_csq_fields2index['SYMBOL']] != "":
                        symbol = str(
                            csq_fields[vep_csq_fields2index['SYMBOL']])
                    consequence_entry = str(
                        csq_fields[vep_csq_fields2index['Consequence']]
                    ) + ':' + str(symbol) + ':' + str(csq_fields[
                        vep_csq_fields2index['Feature_type']]) + ':' + str(
                            csq_fields[vep_csq_fields2index['Feature']]
                        ) + ':' + str(
                            csq_fields[vep_csq_fields2index['BIOTYPE']])
                    all_transcript_consequences.append(consequence_entry)

            if identifier == 'DBNSFP':
                if not rec.INFO.get('DBNSFP') is None:
                    map_variant_effect_predictors(
                        rec, dbnsfp_prediction_algorithms)
        rec.INFO['VEP_ALL_CONSEQUENCE'] = ','.join(all_transcript_consequences)
        w.write_record(rec)
    w.close()
    logger.info('Completed summary of functional annotations for ' +
                str(num_chromosome_records_processed) +
                ' variants on chromosome ' + str(current_chrom))
    vcf.close()

    if os.path.exists(out_vcf):
        if os.path.getsize(out_vcf) > 0:
            os.system('bgzip -f ' + str(out_vcf))
            os.system('tabix -f -p vcf ' + str(out_vcf) + '.gz')
            annotated_vcf = out_vcf + '.gz'
            write_pass_vcf(annotated_vcf)
        else:
            pcgrutils.pcgr_error_message(
                'No remaining PASS variants found in query VCF - exiting and skipping STEP 4 (pcgr-writer)',
                logger)
    else:
        pcgrutils.pcgr_error_message(
            'No remaining PASS variants found in query VCF - exiting and skipping STEP 4 (pcgr-writer)',
            logger)
Example #28
0
        "ORF3a": "cds-YP_009724391.1",
        "E": "cds-YP_009724392.1",
        "M": "cds-YP_009724393.1",
        "ORF6": "cds-YP_009724394.1",
        "ORF7a": "cds-YP_009724395.1",
        "ORF7b": "cds-YP_009725318.1",
        "ORF8": "cds-YP_009724396.1",
        "N": "cds-YP_009724397.2",
        "ORF10": "cds-YP_009725255.1",
    }
    db = gffutils.create_db(
        args.annotation_file, 'ncov_annotation.db',
        force=True, merge_strategy="merge")
    data_vcf = VCF(args.vcf_file)
    data_vcf.add_info_to_header(
        {'ID': 'mat_pep_id', 'Description': 'Mature Peptide ID',
         'Type': 'String', 'Number': '.'})
    data_vcf.add_info_to_header(
        {'ID': 'mat_pep_desc',
         'Description': 'Mature Peptide Description',
         'Type': 'String', 'Number': '.'})
    data_vcf.add_info_to_header(
        {'ID': 'mat_pep_acc',
         'Description': 'Mature Peptide Accession Number',
         'Type': 'String', 'Number': '.'})
    output_file_name = args.output_vcf
    w = Writer(output_file_name, data_vcf)

    for record in data_vcf:
        gene = db[gene_protein[record.INFO.get('EFF').split("|")[5]]]
        record.INFO["mat_pep_id"] = "n/a"
def overlaps_match(q: str, qpos: int, p: str, ppos: int) -> bool:
    """Return True if `q` and `p` agree on every position they share.

    `q` occupies the half-open interval [qpos, qpos + len(q)) and `p`
    occupies [ppos, ppos + len(p)) on a common coordinate axis. Only the
    intersection of the two intervals is compared.

    When the intervals do not intersect (disjoint or merely abutting) there
    is nothing to disagree on and the result is True. Computing the shared
    window explicitly avoids a negative slice stop, which would otherwise be
    interpreted as an index from the end of the string and make the result
    depend on the string lengths.
    """
    start = max(qpos, ppos)
    end = min(qpos + len(q), ppos + len(p))
    if end <= start:
        # Empty intersection: vacuously matching.
        return True
    return q[start - qpos:end - qpos] == p[start - ppos:end - ppos]


# INFO tag under which each query record's classification is stored.
TAG = "CLF"
# Readers for the truth and query VCFs; paths come from the `snakemake`
# object (presumably injected by Snakemake's `script:` directive — confirm).
truth_rdr = VCF(snakemake.input.truth_vcf)
query_rdr = VCF(snakemake.input.query_vcf)
# Register the tag on the reader's header: the writer below is created from
# the reader as its template.
query_rdr.add_info_to_header(
    {
        "ID": TAG,
        "Description": "Classification of record",
        "Type": "String",
        "Number": ".",
    }
)
query_wtr = Writer(snakemake.output.annotated_query_vcf, tmpl=query_rdr)

# Accumulators for the classification pass; they are filled further down in
# the script (not visible in this excerpt).
classifications = []
classified_qrecords = set()
classified_trecords = set()

for query_record in query_rdr:
    # Skip records that carry a FILTER value.
    # NOTE(review): cyvcf2 reports FILTER as None for PASS/unset — confirm.
    if query_record.FILTER is not None:
        continue
    record_clfs = []
    # Only the first sample's genotype is considered.
    # `Genotype` is a helper defined elsewhere in the project.
    query_gt = Genotype.from_arr(query_record.genotypes[0])
    qalt_idx = query_gt.alt_index()
Example #30
0
def annotate(filepath, VCFDataFrame):
    """Write curated per-variant annotations from a dataframe into a new VCF.

    The input VCF is read record by record; for the i-th record, the value
    found at ``VCFDataFrame.at[i, <ID>]`` is stored (as a string) under each
    curated INFO ID listed below. The input file itself is not modified.

    Parameters:
    filepath: File path to the .vcf to annotate
    VCFDataFrame: Dataframe with one column per curated INFO ID, addressed
        with ``.at[i, ID]`` where ``i`` is the 0-based position of the record
        in the VCF

    Returns:
    None. Writes 'Annotated_<file name>' next to the input file.
    """
    # Local import so this function does not depend on the module header.
    import os

    vcf = VCF(filepath)

    #This is hardcoded as it is curated
    list_of_annotations = [{
        'ID': 'VAR',
        'Description': "Selected Variant based on prioritization of \
        (1) 'complex', (2) 'ins', (3) 'del', (4) 'mnp', (5) 'snp'",
        'Type': 'String',
        'Number': '1'
    }, {
        'ID': 'VAR_TYPE',
        'Description': "Annotated variant type based on prioritization of \
        (1) 'complex', (2) 'ins', (3) 'del', (4) 'mnp', (5) 'snp'",
        'Type': 'String',
        'Number': '1'
    }, {
        'ID': 'VAR_COUNT',
        'Description': "Count of times selected variant was observed",
        # Was 'Interger': not a valid VCF INFO type, and it ended up verbatim
        # in the output header.
        'Type': 'Integer',
        'Number': '1'
    }, {
        'ID': 'VAR_FRAC',
        'Description': "Fraction of total reads that the variant was observed",
        'Type': 'Float',
        'Number': '1'
    }, {
        'ID': 'FREQ_ExAC',
        'Description': "Allele frequency of prioritized alt according to ExAC",
        'Type': 'Float',
        'Number': '1'
    }, {
        'ID': 'TYPE_vep',
        'Description': "vep annotation of major consequence of variant",
        'Type': 'String',
        'Number': '1'
    }]

    # The header entries must be added to the reader before the writer is
    # created from it.
    for annotation in list_of_annotations:
        vcf.add_info_to_header(annotation)

    # Prefix only the file name: prefixing the whole path would point into a
    # non-existent 'Annotated_<dir>' directory whenever filepath has a
    # directory component. For a bare file name the result is unchanged.
    directory, filename = os.path.split(filepath)
    w = Writer(os.path.join(directory, 'Annotated_{}'.format(filename)), vcf)

    try:
        for i, variant in enumerate(vcf):
            for annotation in list_of_annotations:
                variant.INFO[annotation['ID']] = str(
                    VCFDataFrame.at[i, annotation['ID']])
            w.write_record(variant)
    finally:
        # Always release both handles, even if a lookup or write fails.
        w.close()
        vcf.close()
Example #31
0
def extend_vcf_annotations(query_vcf, gvanno_db_directory, lof_prediction=0):
    """
    Function that reads VEP/vcfanno-annotated VCF and extends the VCF INFO column with tags from
    1. CSQ elements within the primary transcript consequence picked by VEP, e.g. SYMBOL, Feature, Gene, Consequence etc.
    2. Gene annotations, e.g. known oncogenes/tumor suppressors, curated disease associations (DisGenet), MIM phenotype associations etc
    3. Protein-relevant annotations, e.g. functional protein features etc.
    4. Variant effect predictions

    The extended records are written to the query path with its '.vcf' / '.vcf.gz'
    suffix replaced by '.annotated.vcf'. That file is then compressed with bgzip,
    indexed with tabix and passed to annoutils.write_pass_vcf. Records without a
    CSQ tag are reported with a warning and are not written.

    Parameters:
    query_vcf: path to the VEP/vcfanno-annotated input VCF
    gvanno_db_directory: directory that contains 'gvanno_infotags.tsv'
    lof_prediction: when 0 (default), INFO tags whose name starts with 'LoF'
        are not added to the header

    Uses the module-level 'logger' for progress, warning and error messages.
    """
    # Local import: only needed to quote the path in the shell commands below.
    import shlex

    ## read VEP and PCGR tags to be appended to VCF file
    vcf_infotags_meta = annoutils.read_infotag_file(
        os.path.join(gvanno_db_directory, 'gvanno_infotags.tsv'))
    out_vcf = re.sub(r'\.vcf(\.gz){0,}$', '.annotated.vcf', query_vcf)

    meta_vep_dbnsfp_info = annoutils.vep_dbnsfp_meta_vcf(
        query_vcf, vcf_infotags_meta)
    dbnsfp_prediction_algorithms = meta_vep_dbnsfp_info[
        'dbnsfp_prediction_algorithms']
    vep_csq_fields_map = meta_vep_dbnsfp_info['vep_csq_fieldmap']
    vcf = VCF(query_vcf)
    for tag in vcf_infotags_meta:
        ## LoF tags are only declared when LoF prediction is switched on
        if lof_prediction == 0 and tag.startswith('LoF'):
            continue
        vcf.add_info_to_header({
            'ID': tag,
            'Description': str(vcf_infotags_meta[tag]['description']),
            'Type': str(vcf_infotags_meta[tag]['type']),
            'Number': str(vcf_infotags_meta[tag]['number'])
        })

    w = Writer(out_vcf, vcf)
    current_chrom = None
    num_chromosome_records_processed = 0
    ## name -> index mapping handed to annoutils.make_transcript_xref_map for the GVANNO_XREF tag
    gvanno_xref_map = {
        'ENSEMBL_TRANSCRIPT_ID': 0,
        'ENSEMBL_GENE_ID': 1,
        'ENSEMBL_PROTEIN_ID': 2,
        'SYMBOL': 3,
        'SYMBOL_ENTREZ': 4,
        'ENTREZ_ID': 5,
        'UNIPROT_ID': 6,
        'UNIPROT_ACC': 7,
        'REFSEQ_MRNA': 8,
        'CORUM_ID': 9,
        'TUMOR_SUPPRESSOR': 10,
        'TUMOR_SUPPRESSOR_EVIDENCE': 11,
        'ONCOGENE': 12,
        'ONCOGENE_EVIDENCE': 13,
        'MIM_PHENOTYPE_ID': 14,
        'OPENTARGETS_DISEASE_ASSOCS': 15,
        'OPENTARGETS_TRACTABILITY_COMPOUND': 16,
        'OPENTARGETS_TRACTABILITY_ANTIBODY': 17,
        'PROB_HAPLOINSUFFICIENCY': 18,
        'PROB_EXAC_LOF_INTOLERANT': 19,
        'PROB_EXAC_LOF_INTOLERANT_HOM': 20,
        'PROB_EXAC_LOF_TOLERANT_NULL': 21,
        'PROB_EXAC_NONTCGA_LOF_INTOLERANT': 22,
        'PROB_EXAC_NONTCGA_LOF_INTOLERANT_HOM': 23,
        'PROB_EXAC_NONTCGA_LOF_TOLERANT_NULL': 24,
        'PROB_GNOMAD_LOF_INTOLERANT': 25,
        'PROB_GNOMAD_LOF_INTOLERANT_HOM': 26,
        'PROB_GNOMAD_LOF_TOLERANT_NULL': 27,
        'ESSENTIAL_GENE_CRISPR': 28,
        'ESSENTIAL_GENE_CRISPR2': 29
    }

    ## header ID -> declared Type, used below to tell Flag tags from valued tags
    vcf_info_element_types = {}
    for e in vcf.header_iter():
        header_element = e.info()
        if 'ID' in header_element and 'HeaderType' in header_element and 'Type' in header_element:
            identifier = str(header_element['ID'])
            fieldtype = str(header_element['Type'])
            vcf_info_element_types[identifier] = fieldtype

    for rec in vcf:
        chrom = str(rec.CHROM)
        if chrom != current_chrom:
            ## chromosome switch: report the finished chromosome (nothing to report before the first record)
            if current_chrom is not None:
                logger.info(
                    'Completed summary of functional annotations for ' +
                    str(num_chromosome_records_processed) +
                    ' variants on chromosome ' + str(current_chrom))
            current_chrom = chrom
            num_chromosome_records_processed = 0
        if rec.INFO.get('CSQ') is None:
            alt_allele = ','.join(rec.ALT)
            pos = rec.start + 1
            variant_id = 'g.' + str(rec.CHROM) + ':' + str(pos) + str(
                rec.REF) + '>' + alt_allele
            logger.warning(
                'Variant record ' + str(variant_id) +
                ' does not have CSQ tag from Variant Effect Predictor (vep_skip_intergenic in config set to true?)  - variant will be skipped'
            )
            continue
        num_chromosome_records_processed += 1
        gvanno_xref = annoutils.make_transcript_xref_map(
            rec, gvanno_xref_map, xref_tag="GVANNO_XREF")

        csq_record_results = annoutils.parse_vep_csq(rec,
                                                     gvanno_xref,
                                                     vep_csq_fields_map,
                                                     logger,
                                                     pick_only=True,
                                                     csq_identifier='CSQ')
        if 'vep_all_csq' in csq_record_results:
            rec.INFO['VEP_ALL_CSQ'] = ','.join(
                csq_record_results['vep_all_csq'])
        if 'vep_block' in csq_record_results:
            ## only the first consequence block is copied into INFO
            record = csq_record_results['vep_block'][0]
            for k in record:
                if k not in vcf_info_element_types:
                    continue
                if vcf_info_element_types[k] == "Flag" and record[k] == "1":
                    rec.INFO[k] = True
                elif record[k] is not None:
                    rec.INFO[k] = record[k]

        if rec.INFO.get('DBNSFP') is not None:
            annoutils.map_variant_effect_predictors(
                rec, dbnsfp_prediction_algorithms)

        w.write_record(rec)
    w.close()
    ## with an empty input there is no chromosome to summarise
    if current_chrom is not None:
        logger.info('Completed summary of functional annotations for ' +
                    str(num_chromosome_records_processed) +
                    ' variants on chromosome ' + str(current_chrom))
    vcf.close()

    if os.path.exists(out_vcf) and os.path.getsize(out_vcf) > 0:
        ## quote the path so that spaces or shell metacharacters cannot break (or inject into) the command
        os.system('bgzip -f ' + shlex.quote(str(out_vcf)))
        os.system('tabix -f -p vcf ' + shlex.quote(str(out_vcf) + '.gz'))
        annotated_vcf = out_vcf + '.gz'
        annoutils.write_pass_vcf(annotated_vcf, logger)
    else:
        annoutils.error_message(
            'No remaining PASS variants found in query VCF - exiting and skipping STEP 4 (gvanno-writer)',
            logger)
Example #32
0
def extend_vcf_annotations(query_vcf, pcgr_db_directory, logger, pon_annotation, cpsr):
   """
   Function that reads VEP/vcfanno-annotated VCF and extends the VCF INFO column with tags from
   1. CSQ elements within the primary transcript consequence picked by VEP, e.g. SYMBOL, Feature, Gene, Consequence etc.
   2. Cancer-relevant gene annotations (PCGR_ONCO_XREF), e.g. known oncogenes/tumor suppressors, known antineoplastic drugs interacting with a given protein etc.
   3. Protein-relevant annotations, e.g. cancer hotspot mutations, functional protein features etc.
   4. Variant effect predictions
   5. Panel-of-normal (blacklisted variants) annotation

   List of INFO tags to be produced is provided by the 'infotags' files in the pcgr_db_directory

   The extended records are written to the query path with its '.vcf' / '.vcf.gz'
   suffix replaced by '.annotated.vcf'; that file is then bgzipped, tabix-indexed and
   passed to annoutils.write_pass_vcf. Records without a CSQ tag are reported with a
   warning and are not written.

   Parameters:
   query_vcf: path to the VEP/vcfanno-annotated input VCF
   pcgr_db_directory: directory that contains 'pcgr_infotags.tsv' and 'cpsr_infotags.tsv'
   logger: logger used for progress, warning and error messages
   pon_annotation: when 0, INFO tags whose name starts with 'PANEL_OF_NORMALS' are not added to the header
   cpsr: when True, the CPSR tag file is used and the consequence block is chosen by
         annoutils.get_correct_cpg_transcript instead of taking the first one
   """

   ## read VEP and PCGR tags to be appended to VCF file (only the tag file that is actually used gets read)
   infotags_fname = 'cpsr_infotags.tsv' if cpsr is True else 'pcgr_infotags.tsv'
   vcf_infotags_meta = annoutils.read_infotag_file(os.path.join(pcgr_db_directory, infotags_fname))

   out_vcf = re.sub(r'\.vcf(\.gz){0,}$','.annotated.vcf',query_vcf)

   meta_vep_dbnsfp_info = annoutils.vep_dbnsfp_meta_vcf(query_vcf, vcf_infotags_meta)
   dbnsfp_prediction_algorithms = meta_vep_dbnsfp_info['dbnsfp_prediction_algorithms']
   vep_csq_fields_map = meta_vep_dbnsfp_info['vep_csq_fieldmap']
   vcf = VCF(query_vcf)
   for tag in sorted(vcf_infotags_meta):
      ## panel-of-normals tags are only declared when PoN annotation is switched on
      if pon_annotation == 0 and tag.startswith('PANEL_OF_NORMALS'):
         continue
      vcf.add_info_to_header({'ID': tag, 'Description': str(vcf_infotags_meta[tag]['description']),'Type':str(vcf_infotags_meta[tag]['type']), 'Number': str(vcf_infotags_meta[tag]['number'])})

   w = Writer(out_vcf, vcf)
   current_chrom = None
   num_chromosome_records_processed = 0
   ## name -> index mapping handed to annoutils.make_transcript_xref_map for the PCGR_ONCO_XREF tag
   pcgr_onco_xref_map = {'ENSEMBL_TRANSCRIPT_ID': 0, 'ENSEMBL_GENE_ID':1, 'ENSEMBL_PROTEIN_ID':2, 'SYMBOL':3, 'SYMBOL_ENTREZ':4,
                        'ENTREZ_ID':5, 'UNIPROT_ID':6, 'APPRIS':7,'UNIPROT_ACC':8,'REFSEQ_MRNA':9,'CORUM_ID':10,'TUMOR_SUPPRESSOR':11,
                        'TUMOR_SUPPRESSOR_EVIDENCE':12, 'ONCOGENE':13, 'ONCOGENE_EVIDENCE':14,
                        'NETWORK_CG':15,'DISGENET_CUI':16,'CHEMBL_COMPOUND_ID':17,'CHEMBL_COMPOUND_ID_EARLY_PHASE':18, 'INTOGEN_DRIVER':19,
                        'TCGA_DRIVER':20,'ONCOSCORE':21, 'MIM_PHENOTYPE_ID':22, 'CANCER_PREDISPOSITION_SOURCE':23,
                        'CANCER_SUSCEPTIBILITY_CUI':24, 'CANCER_SYNDROME_CUI':25, 'CANCER_PREDISPOSITION_MOI':26,
                        'CANCER_PREDISPOSITION_MOD':27, 'SIGNALING_PATHWAY':28, 'OPENTARGETS_DISEASE_ASSOCS':29,
                        'OPENTARGETS_TRACTABILITY_COMPOUND':30, 'OPENTARGETS_TRACTABILITY_ANTIBODY':31, 'GE_PANEL_ID':32,
                        'ACTIONABLE_TARGET':33,'GENCODE_GENE_STATUS':34,
                        'PROB_HAPLOINSUFFICIENCY':35,'PROB_EXAC_LOF_INTOLERANT':36,'PROB_EXAC_LOF_INTOLERANT_HOM':37,
                        'PROB_EXAC_LOF_TOLERANT_NULL':38,'PROB_EXAC_NONTCGA_LOF_INTOLERANT':39,
                        'PROB_EXAC_NONTCGA_LOF_INTOLERANT_HOM':40, 'PROB_EXAC_NONTCGA_LOF_TOLERANT_NULL':41,
                        'PROB_GNOMAD_LOF_INTOLERANT':42, 'PROB_GNOMAD_LOF_INTOLERANT_HOM':43, 'PROB_GNOMAD_LOF_TOLERANT_NULL':44,
                        'ESSENTIAL_GENE_CRISPR':45, 'ESSENTIAL_GENE_CRISPR2':46}

   ## header ID -> declared Type, used below to tell Flag tags from valued tags
   vcf_info_element_types = {}
   for e in vcf.header_iter():
      header_element = e.info()
      if 'ID' in header_element and 'HeaderType' in header_element and 'Type' in header_element:
         identifier = str(header_element['ID'])
         fieldtype = str(header_element['Type'])
         vcf_info_element_types[identifier] = fieldtype

   for rec in vcf:
      chrom = str(rec.CHROM)
      if chrom != current_chrom:
         ## chromosome switch: report the finished chromosome (nothing to report before the first record)
         if current_chrom is not None:
            logger.info('Completed summary of functional annotations for ' + str(num_chromosome_records_processed) + ' variants on chromosome ' + str(current_chrom))
         current_chrom = chrom
         num_chromosome_records_processed = 0
      if rec.INFO.get('CSQ') is None:
         alt_allele = ','.join(rec.ALT)
         pos = rec.start + 1
         variant_id = 'g.' + str(rec.CHROM) + ':' + str(pos) + str(rec.REF) + '>' + alt_allele
         logger.warning('Variant record ' + str(variant_id) + ' does not have CSQ tag from Variant Effect Predictor (vep_skip_intergenic in config set to true?)  - variant will be skipped')
         continue
      num_chromosome_records_processed += 1
      pcgr_onco_xref = annoutils.make_transcript_xref_map(rec, pcgr_onco_xref_map, xref_tag = "PCGR_ONCO_XREF")
      csq_record_results = annoutils.parse_vep_csq(rec, pcgr_onco_xref, vep_csq_fields_map, logger, pick_only = True, csq_identifier = 'CSQ')

      if 'vep_all_csq' in csq_record_results:
         rec.INFO['VEP_ALL_CSQ'] = ','.join(csq_record_results['vep_all_csq'])
      if 'vep_block' in csq_record_results:
         vep_csq_records = csq_record_results['vep_block']

         ## first block by default; CPSR delegates the choice to annoutils
         block_idx = 0
         if cpsr is True:
            block_idx = annoutils.get_correct_cpg_transcript(vep_csq_records)
         record = vep_csq_records[block_idx]
         for k in record:
            if k not in vcf_info_element_types:
               continue
            if vcf_info_element_types[k] == "Flag" and record[k] == "1":
               rec.INFO[k] = True
            elif record[k] is not None:
               rec.INFO[k] = record[k]

      if rec.INFO.get('DBNSFP') is not None:
         annoutils.map_variant_effect_predictors(rec, dbnsfp_prediction_algorithms)

      w.write_record(rec)
   w.close()
   ## with an empty input there is no chromosome to summarise
   if current_chrom is not None:
      logger.info('Completed summary of functional annotations for ' + str(num_chromosome_records_processed) + ' variants on chromosome ' + str(current_chrom))
   vcf.close()

   if os.path.exists(out_vcf) and os.path.getsize(out_vcf) > 0:
      check_subprocess(logger, 'bgzip -f ' + str(out_vcf))
      check_subprocess(logger, 'tabix -f -p vcf ' + str(out_vcf) + '.gz')
      annotated_vcf = out_vcf + '.gz'
      annoutils.write_pass_vcf(annotated_vcf, logger)
   else:
      annoutils.error_message('No remaining PASS variants found in query VCF - exiting and skipping STEP 4 (pcgr-writer)', logger)