Example #1
0
def test_hrec():
    """Every header record that is not of type GENERIC must expose an 'ID' key."""
    reader = VCF(VCF_PATH)
    for header_record in reader.header_iter():
        fields = header_record.info()
        if fields['HeaderType'] == 'GENERIC':
            # GENERIC header lines are exempt from the 'ID' requirement.
            continue
        assert 'ID' in fields
Example #2
0
def test_hrec():
    """Assert that 'ID' is present in the info dict of each non-GENERIC header record."""
    header_dicts = (hrec.info() for hrec in VCF(VCF_PATH).header_iter())
    for described in header_dicts:
        is_generic = described['HeaderType'] == 'GENERIC'
        if not is_generic:
            assert 'ID' in described
Example #3
0
def extractFields(vcf, output_file, negInd, fields):
    """Gather per-sample annotations from a VCF and pass them to ``write_output``.

    :param vcf: path handed to ``VCF(..., gts012=True)``
    :param output_file: forwarded unchanged to ``write_output``
    :param negInd: optional path of a text file with one sample name per
        line; when falsy an empty list is used instead
    :param fields: when not ``None`` it is forwarded to ``get_annotations``
        as ``specific_fields``; otherwise ``get_annotations`` is called
        without it and a ``KeyError`` is reported on stdout
    """
    negative_samples_provided = []
    if negInd:
        with open(negInd, 'r') as handle:
            negative_samples_provided = [row.strip() for row in handle]

    vcf_data = VCF(vcf, gts012=True)
    samples = vcf_data.samples
    sample_based = defaultdict(list)
    hp = hgvs.parser.Parser()

    # Sub-field names of the ANN annotation, taken from the text after
    # "Format:" in the header description (last character dropped).
    tools = []
    for header_rec in vcf_data.header_iter():
        if header_rec["HeaderType"] != "INFO" or header_rec["ID"] != "ANN":
            continue
        ann_format = header_rec["Description"].split("Format:")[1]
        tools = ann_format[:-1].strip().split("|")
        print(tools)

    for record in vcf_data:
        # Sample indexes with genotype code 1 (heterozygous) first, then
        # code 2 (homozygous alternate).
        carriers = [
            idx
            for code in (1, 2)
            for idx in (record.gt_types == code).nonzero()[0].tolist()
        ]
        for ind in carriers:
            if fields is not None:
                sample_based = get_annotations(record, tools, samples,
                                               sample_based, ind, hp,
                                               specific_fields=fields)
                continue
            try:
                sample_based = get_annotations(record, tools, samples,
                                               sample_based, ind, hp)
            except KeyError:
                print("Record {},{},{},{} does not have ANN field.".format(record.CHROM, str(record.POS),
                                                                           record.REF, record.ALT[0]))

        # NOTE(review): this runs once per record with the accumulated
        # data, not once after the loop - confirm that is intended.
        write_output(sample_based, output_file, negative_samples_provided, fields)
Example #4
0
def vep_dbnsfp_meta_vcf(query_vcf, info_tags_wanted):
    """Read the CSQ and DBNSFP header descriptions of a VCF.

    The sub-field list of each tag is taken from the text following
    ``'Format: '`` in the header ``Description``, split on ``'|'``.

    :param query_vcf: path handed to ``VCF``
    :param info_tags_wanted: container of (renamed) CSQ sub-field names to
        keep in the field maps
    :return: dict with two entries:
        ``'vep_csq_fieldmap'`` holding ``'field2index'`` and
        ``'index2field'`` (CSQ sub-field name <-> position, restricted to
        ``info_tags_wanted``), and ``'dbnsfp_prediction_algorithms'``, the
        DBNSFP sub-fields from position 7 onwards with a trailing
        ``_score`` / ``_pred`` (and any closing quotes) removed
    """
    # VEP allele-frequency sub-field name -> name used in the output maps.
    vep_to_pcgr_af = {
        'gnomAD_AMR_AF': 'AMR_AF_GNOMAD',
        'gnomAD_AFR_AF': 'AFR_AF_GNOMAD',
        'gnomAD_EAS_AF': 'EAS_AF_GNOMAD',
        'gnomAD_NFE_AF': 'NFE_AF_GNOMAD',
        'gnomAD_AF': 'GLOBAL_AF_GNOMAD',
        'gnomAD_SAS_AF': 'SAS_AF_GNOMAD',
        'gnomAD_OTH_AF': 'OTH_AF_GNOMAD',
        'gnomAD_ASJ_AF': 'ASJ_AF_GNOMAD',
        'gnomAD_FIN_AF': 'FIN_AF_GNOMAD',
        'AFR_AF': 'AFR_AF_1KG',
        'AMR_AF': 'AMR_AF_1KG',
        'SAS_AF': 'SAS_AF_1KG',
        'EUR_AF': 'EUR_AF_1KG',
        'EAS_AF': 'EAS_AF_1KG',
        'AF': 'GLOBAL_AF_1KG',
    }

    vcf = VCF(query_vcf)
    vep_csq_index2fields = {}
    vep_csq_fields2index = {}
    dbnsfp_prediction_algorithms = []
    for e in vcf.header_iter():
        header_element = e.info()
        if 'ID' not in header_element:
            continue
        identifier = str(header_element['ID'])
        if identifier not in ('CSQ', 'DBNSFP'):
            continue
        description = str(header_element['Description'])
        if 'Format: ' not in description:
            continue
        subtags = description.split('Format: ')[1].split('|')

        if identifier == 'CSQ':
            for i, t in enumerate(subtags):
                v = t.replace('"', '')
                # Look up the quote-stripped name: the last sub-field still
                # carries the closing quote of the description, so looking
                # up the raw text would never rename it.
                v = str(vep_to_pcgr_af.get(v, v))
                if v in info_tags_wanted:
                    vep_csq_index2fields[i] = v
                    vep_csq_fields2index[v] = i
        else:
            # Only the sub-fields from position 7 onwards are collected.
            for t in subtags[7:]:
                dbnsfp_prediction_algorithms.append(
                    re.sub(r'((_score)|(_pred))"*$', '', t))

    return {
        'vep_csq_fieldmap': {
            'field2index': vep_csq_fields2index,
            'index2field': vep_csq_index2fields,
        },
        'dbnsfp_prediction_algorithms': dbnsfp_prediction_algorithms,
    }
Example #5
0
def check_preserved_vcf_info_tags(input_vcf, preserved_info_tags, logger):
    """
    Verify that every INFO tag the user asked to preserve is declared in the
    header of the query VCF.

    ``preserved_info_tags`` is converted to a string and split on commas.
    Each tag found is logged; the first tag that is missing ends the check
    and the result of ``error_message`` is returned. Returns 1 when all tags
    are present.
    """
    requested_tags = str(preserved_info_tags).split(',')

    vcf = VCF(input_vcf)
    logger.info('Checking if existing INFO tags of query VCF file matches preserved INFO tags set by the user')

    declared_info_ids = []
    for hrec in vcf.header_iter():
        described = hrec.info()
        if 'ID' not in described or 'HeaderType' not in described:
            continue
        if described['HeaderType'] == 'INFO':
            declared_info_ids.append(described['ID'])

    for tag in requested_tags:
        if tag in declared_info_ids:
            logger.info("Preserved INFO tag '" + str(tag) + "' detected among INFO tags in query VCF")
            continue
        err_msg = "Preserved INFO tag '" + str(tag) + "' not found among INFO tags in query VCF - make sure preserved VCF INFO tags are set correctly"
        return error_message(err_msg, logger)

    return 1
Example #6
0
def check_existing_vcf_info_tags(input_vcf, pcgr_directory, genome_assembly,
                                 logger):
    """
    Reject a query VCF whose header already declares an INFO tag listed in
    ``<pcgr_directory>/data/<genome_assembly>/cpsr_infotags.tsv``.

    On the first clash the result of ``annoutils.error_message`` is returned;
    when no INFO tag clashes, 1 is returned.
    """
    infotag_path = os.path.join(pcgr_directory, 'data', genome_assembly,
                                'cpsr_infotags.tsv')
    pcgr_infotags_desc = annoutils.read_infotag_file(infotag_path)

    vcf = VCF(input_vcf)
    logger.info(
        'Checking if existing INFO tags of query VCF file coincide with CPSR INFO tags'
    )
    for hrec in vcf.header_iter():
        described = hrec.info()
        if 'ID' not in described.keys() or 'HeaderType' not in described.keys():
            continue
        if described['HeaderType'] != 'INFO':
            continue
        if described['ID'] not in pcgr_infotags_desc.keys():
            continue
        err_msg = 'INFO tag ' + str(
            described['ID']
        ) + ' in the query VCF coincides with a VCF annotation tag produced by CPSR - please remove or rename this tag in your query VCF'
        return annoutils.error_message(err_msg, logger)

    logger.info('No query VCF INFO tags coincide with CPSR INFO tags')
    return 1
Example #7
0
def validate_panel_normal_vcf(vcf, logger):
    """
    Check that the header of the panel-of-normals VCF declares an INFO tag
    named 'PANEL_OF_NORMALS' of type 'Flag'.

    Returns 1 when the flag is declared; otherwise returns the result of
    ``pcgr_error_message``.
    """
    vcf = VCF(vcf)
    flag_declared = False
    for hrec in vcf.header_iter():
        described = hrec.info()
        if not ('ID' in described.keys() and 'HeaderType' in described.keys()):
            continue
        is_info_flag = (described['HeaderType'] == 'INFO'
                        and described['Type'] == 'Flag')
        if is_info_flag and described['ID'] == 'PANEL_OF_NORMALS':
            logger.info(
                'Found \'PANEL_OF_NORMALS\' INFO flag in the VCF header section of the of panel of normals VCF file'
            )
            flag_declared = True

    if not flag_declared:
        err_msg = 'INFO flag \'PANEL_OF_NORMALS\' is missing from the panel of normal VCF header'
        return pcgr_error_message(err_msg, logger)

    return 1
Example #8
0
def _get_vcf_field_defs(vcf: VCF, category: str) -> Dict[str, Any]:
    """Map the ID of each header record whose ``HeaderType`` equals
    ``category`` (e.g. INFO or FORMAT) to its ``info(extra=True)`` dict."""
    field_defs = {}
    for record in vcf.header_iter():
        if record["HeaderType"] != category:
            continue
        field_id = record["ID"]
        field_defs[field_id] = record.info(extra=True)
    return field_defs
def parse_header_vcf(vcf_file, vep_field=None, vep_separator=None):
    """Open a VCF, register three extra INFO fields and index the VEP sub-fields.

    The INFO fields ``True_Label``, ``Source`` and ``SF`` are added to the
    in-memory header. When ``vep_field`` is given, the ``Description`` of
    the header record with that ID is split on ``vep_separator`` and each
    piece is mapped to its first position in the split.

    :param vcf_file: path handed to ``VCF``
    :param vep_field: ID of the header record describing the VEP annotation;
        when falsy no index is built
    :param vep_separator: separator passed to ``str.split``
    :return: tuple ``(vcf, index_dict)``; ``index_dict`` is empty when
        ``vep_field`` is falsy or its description could not be split
    """
    vcf = VCF(vcf_file)
    extra_info_fields = (
        ('True_Label', 'Pathogenic/Benign labelled variant', 'Integer'),
        ('Source', 'File source', 'String'),
        ('SF', '', 'String'),
    )
    for field_id, description, field_type in extra_info_fields:
        vcf.add_info_to_header({
            'ID': field_id,
            'Description': description,
            'Type': field_type,
            'Number': '1'
        })

    index_dict = dict()
    if vep_field:
        for h in vcf.header_iter():
            try:
                info = h.info()
                if info['ID'] != vep_field:
                    continue
                csq_header = info['Description'].split(vep_separator)
            except (KeyError, ValueError, TypeError):
                # Best effort, as before: records without ID/Description or
                # an unusable separator are skipped. Unlike a bare except
                # this no longer hides KeyboardInterrupt or SystemExit.
                continue
            for position, elem in enumerate(csq_header):
                # setdefault keeps the first position of a repeated name,
                # matching what list.index() returned.
                index_dict.setdefault(elem, position)
    return vcf, index_dict
Example #10
0
def check_existing_vcf_info_tags(input_vcf, pcgr_directory, logger):
    """
    Reject a query VCF whose header already declares an INFO tag that PCGR
    writes itself.

    A tag clashes when it is listed in ``data/pcgr_infotags.tsv``, is
    declared by a ``##INFO`` line in one of the per-database
    ``*.vcfanno.vcf_info_tags.txt`` files, or is one of a fixed set of
    reserved names.

    Returns 1 when nothing clashes, -1 (after logging an error) when a
    vcfanno tag file cannot be read, and the result of
    ``pcgr_error_message`` on the first clashing tag.
    """
    pcgr_infotags_desc = pcgrutils.read_infotag_file(
        os.path.join(pcgr_directory, 'data', 'pcgr_infotags.tsv'))

    vcfanno_tags = set()
    for db in (
            'intogen_driver_mut', 'dbsnp', 'docm', 'civic', 'cbmdb', 'dbnsfp',
            'clinvar', 'tcga', 'uniprot', 'cancer_hotspots', 'pcgr_onco_xref'
    ):
        vcfanno_tag_file = os.path.join(pcgr_directory, 'data', str(db),
                                        str(db) + '.vcfanno.vcf_info_tags.txt')
        try:
            # The context manager closes each tag file; previously the
            # handles were never closed.
            with open(vcfanno_tag_file, 'r') as f:
                for line in f:
                    if line.startswith('##INFO'):
                        # '##INFO=<ID=TAG,Number=...' -> 'TAG'
                        tag = re.sub(r'##INFO=<ID=', '',
                                     str(line.rstrip().split(',')[0]))
                        vcfanno_tags.add(tag)
        except IOError:
            logger.error('File ' + str(vcfanno_tag_file) + ' does not exist')
            return -1

    # Names that were checked one by one in two identical error branches.
    reserved_tags = {
        'EFFECT_PREDICTIONS', 'DP_TUMOR', 'AF_TUMOR', 'AF_NORMAL', 'DP_NORMAL'
    }

    vcf = VCF(input_vcf)
    logger.info(
        'Checking if existing INFO tags of query VCF file coincide with PCGR INFO tags'
    )
    for e in vcf.header_iter():
        header_element = e.info()
        if 'ID' not in header_element or 'HeaderType' not in header_element:
            continue
        if header_element['HeaderType'] != 'INFO':
            continue
        tag_id = header_element['ID']
        if (tag_id in pcgr_infotags_desc.keys() or tag_id in vcfanno_tags
                or tag_id in reserved_tags):
            err_msg = 'INFO tag ' + str(
                tag_id
            ) + ' in the query VCF coincides with a VCF annotation tag produced by PCGR - please remove or rename this tag in your query VCF'
            return pcgr_error_message(err_msg, logger)

    logger.info('No query VCF INFO tags coincide with PCGR INFO tags')
    return 1
Example #11
0
def get_vcf_info_tags(vcffile):
    """Return a dict mapping the ID of every INFO header record of the VCF to 1."""
    described_records = (hrec.info() for hrec in VCF(vcffile).header_iter())
    return {
        str(described['ID']): 1
        for described in described_records
        if 'ID' in described.keys() and 'HeaderType' in described.keys()
        and described['HeaderType'] == 'INFO'
    }
Example #12
0
def test_header_stuff():
    """Count FORMAT and INFO header records of the bundled test VCF.

    Each record's ``info(extra=True)`` must be a dict; the fixture is
    expected to declare 9 FORMAT and 73 INFO fields.
    """
    vcf = VCF('{}/test.vcf.gz'.format(HERE))
    seen_formats, seen_infos = 0, 0
    for h in vcf.header_iter():
        i = h.info(extra=True)
        assert isinstance(i, dict)
        # A bool adds 1 when the comparison holds and 0 otherwise.
        seen_formats += i['HeaderType'] == 'FORMAT'
        seen_infos += i['HeaderType'] == 'INFO'
    assert seen_formats == 9, seen_formats
    assert seen_infos == 73, seen_infos
Example #13
0
def test_header_stuff():
    """Check the number of FORMAT (9) and INFO (73) header records in the
    bundled test VCF, and that ``info(extra=True)`` yields a dict for each
    header record."""
    vcf = VCF('{}/test.vcf.gz'.format(HERE))
    seen_formats, seen_infos = 0, 0
    for h in vcf.header_iter():
        i = h.info(extra=True)
        assert isinstance(i, dict)
        # Adding the comparison result counts the matching records.
        seen_formats += i['HeaderType'] == 'FORMAT'
        seen_infos += i['HeaderType'] == 'INFO'
    assert seen_formats == 9, seen_formats
    assert seen_infos == 73, seen_infos
def add_absent_records(vcf_absent_gnomad, outfile, nind):
    """Append the records of a VCF to ``outfile`` with synthetic genotypes.

    Every record is written as one tab-separated line holding the fixed
    columns, the INFO entries present on the record, the FORMAT string
    ``GT:AD:DP:GQ:PL`` and ``nind`` identical sample columns built from the
    constants below.

    :param vcf_absent_gnomad: path handed to ``VCF(..., gts012=True)``
    :param outfile: path of the file opened in append mode
    :param nind: number of sample columns to emit per record
    """
    logging.info("Processing variants absent in gnomAD")
    gt, gt_dp, gt_ref_depth, gt_alt_depth, gt_qual = "0/0", 100, 100, 0, 50
    gt_phred_ll_homref, gt_phred_ll_het, gt_phred_ll_homalt = 0, 1500, 1500
    fmt = [
        "{}:{},{}:{}:{}:{},{},{}".format(gt, gt_ref_depth, gt_alt_depth, gt_dp,
                                         gt_qual, gt_phred_ll_homref,
                                         gt_phred_ll_het, gt_phred_ll_homalt)
    ] * nind

    vcf_data = VCF(vcf_absent_gnomad, gts012=True)
    try:
        info_fields = [
            field["ID"] for field in vcf_data.header_iter()
            if field["HeaderType"] == "INFO"
        ]

        with open(outfile, 'a') as out:
            for record in vcf_data:
                str_info = []
                for i in info_fields:
                    try:
                        str_info.append(i + "=" + str(record.INFO[i]))
                    except KeyError:
                        # Tag declared in the header but absent here.
                        continue

                qual = record.QUAL
                columns = [
                    record.CHROM,
                    str(record.POS),
                    record.ID,
                    record.REF,
                    record.ALT[0],
                    # Keep a missing QUAL as None so it becomes '.' below;
                    # str(None) used to write the text "None".
                    None if qual is None else str(qual),
                    record.FILTER,
                    # A record without INFO entries gets '.' instead of an
                    # empty column.
                    ';'.join(str_info) or None,
                    "GT:AD:DP:GQ:PL",
                ]
                write_record = ['.' if v is None else v for v in columns]
                out.write('\t'.join(write_record + fmt) + "\n")
    finally:
        # Release the reader even if reading or writing fails.
        vcf_data.close()
Example #15
0
def vcf2tsv(query_vcf, out_tsv, skip_info_data, skip_genotype_data,
            keep_rejected_calls):
    """Flatten a VCF file into a tab-separated table.

    The header row holds the seven fixed VCF columns, then (unless skipped)
    the INFO tags in sorted order, then - when the VCF has samples and
    genotype data is not skipped - ``VCF_SAMPLE_ID``, the FORMAT tags other
    than GT in sorted order, and ``GT``. In that last case one row is
    written per variant/sample pair, otherwise one row per variant.

    Changes compared with the previous implementation:
    ``dict.has_key`` (absent in Python 3) is replaced by ``in``; String INFO
    values are no longer passed through ``str(bytes)``; multi-valued INFO
    entries (list or tuple) are comma-joined; the output file is closed even
    on error; and a row is now written for every variant when INFO is
    skipped and the VCF has no samples (previously only the header was
    written in that case).

    :param query_vcf: path handed to ``VCF(..., gts012=True)``
    :param out_tsv: path of the TSV file, opened for writing
    :param skip_info_data: INFO columns are written only when this is
        exactly ``False``
    :param skip_genotype_data: per-sample columns are written only when this
        is exactly ``False``
    :param keep_rejected_calls: when exactly ``False``, rows whose genotype
        is ``./.`` are dropped; this only applies while INFO columns are
        being written
    """
    include_info = skip_info_data is False
    include_genotypes = skip_genotype_data is False

    vcf = VCF(query_vcf, gts012=True)
    samples = vcf.samples
    per_sample_rows = len(samples) > 0 and include_genotypes

    fixed_columns_header = [
        'CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER'
    ]
    info_columns_header = []
    format_columns_header = []
    column_types = {}
    gt_present_header = False

    for e in vcf.header_iter():
        header_element = e.info()
        if 'ID' not in header_element or 'HeaderType' not in header_element:
            continue
        header_type = header_element['HeaderType']
        if header_type not in ('INFO', 'FORMAT'):
            continue
        tag = header_element['ID']
        column_types[tag] = header_element['Type']
        if header_type == 'INFO':
            if include_info:
                info_columns_header.append(tag)
        elif per_sample_rows:
            # GT always goes last, so it is tracked separately.
            if tag == 'GT':
                gt_present_header = True
            else:
                format_columns_header.append(tag)

    info_columns = sorted(info_columns_header)
    format_columns = sorted(format_columns_header)

    header_line = '\t'.join(fixed_columns_header)
    if include_info:
        header_line += '\t' + '\t'.join(info_columns)
    if per_sample_rows:
        header_line += ('\tVCF_SAMPLE_ID\t' + '\t'.join(format_columns) +
                        '\tGT')

    with open(out_tsv, 'w') as out:
        out.write(str(header_line) + '\n')
        for rec in vcf:
            row_prefix = [_vcf2tsv_fixed_fields(rec)]

            if include_info:
                variant_info = rec.INFO
                info_cells = []
                for info_field in info_columns:
                    cell = _vcf2tsv_info_cell(column_types[info_field],
                                              variant_info.get(info_field))
                    # NOTE(review): tags of any other type (e.g. Character)
                    # yield no cell, as before, which shifts the columns of
                    # the row - confirm whether a placeholder is wanted.
                    if cell is not None:
                        info_cells.append(cell)
                row_prefix.append('\t'.join(info_cells))

            if not per_sample_rows:
                out.write('\t'.join(row_prefix) + '\n')
                continue

            sample_data = _vcf2tsv_sample_data(rec, samples, format_columns,
                                               gt_present_header)
            for sample in sorted(sample_data):
                tag_values = sample_data[sample]
                gt_tag = tag_values.get('GT', '.')
                # NOTE(review): as before, the rejected-call filter is only
                # applied while INFO columns are written - confirm.
                if (include_info and gt_tag == './.'
                        and keep_rejected_calls is False):
                    continue
                line_elements = list(row_prefix)
                line_elements.append(sample)
                line_elements.extend(tag_values[tag]
                                     for tag in sorted(tag_values)
                                     if tag != 'GT')
                line_elements.append(gt_tag)
                out.write('\t'.join(line_elements) + '\n')


def _vcf2tsv_fixed_fields(rec):
    """Render CHROM, POS, ID, REF, ALT, QUAL and FILTER as one tab-joined string."""
    rec_id = '.' if rec.ID is None else str(rec.ID)
    rec_qual = '.' if rec.QUAL is None else "{0:.2f}".format(rec.QUAL)
    raw_filter = rec.FILTER
    if raw_filter is None:
        rec_filter = 'PASS'
    elif isinstance(raw_filter, list):
        # An empty list also counts as PASS.
        rec_filter = ';'.join(str(n) for n in raw_filter) or 'PASS'
    else:
        rec_filter = str(raw_filter)
    alt = ",".join(str(n) for n in rec.ALT)
    pos = int(rec.start) + 1  # the original converts start to POS this way
    return '\t'.join([
        str(rec.CHROM), str(pos), rec_id, str(rec.REF), alt, rec_qual,
        rec_filter
    ])


def _vcf2tsv_info_cell(col_type, value):
    """Render one INFO value as a TSV cell; None when ``col_type`` is not handled."""
    if col_type == 'Flag':
        return 'False' if value is None else 'True'
    if col_type not in ('Float', 'Integer', 'String'):
        return None
    if isinstance(value, (list, tuple)):
        return ','.join(str(n) for n in value)
    if value is None:
        return '.'
    if col_type == 'Float':
        return "{0:.7f}".format(value)
    return str(value)


def _vcf2tsv_sample_data(rec, samples, format_columns, gt_present):
    """Collect, per sample name, the GT string and the FORMAT values of ``rec``."""
    gt_labels = {0: '0/0', 1: '0/1', 2: '1/1'}
    gt_codes = rec.gt_types
    data = {}
    for idx, sample in enumerate(samples):
        gt = './.'
        if gt_present:
            # Any code other than 0/1/2 stays './.'.
            gt = gt_labels.get(int(gt_codes[idx]), './.')
        data[sample] = {'GT': gt}

    for format_tag in format_columns:
        sample_dat = rec.format(format_tag)
        if sample_dat is None:
            # NOTE(review): a tag missing on this record adds no cell, as
            # before, so the row ends up shorter than the header - confirm.
            continue
        for j in range(sample_dat.shape[0]):
            values = sample_dat[j]
            if values.size > 1:
                cell = ','.join(str(v) for v in values.tolist())
            else:
                cell = str(values[0])
            if samples[j] in data:
                data[samples[j]][format_tag] = cell
    return data
Example #16
0
def process_vcf(vcf: str,
                tools_config: dict,
                thresholds: list,
                af_col: str,
                is_clinvar: bool = False):
    """
    Scans a VEP annotated VCF file and extracts all the
    tools scores

    :param str vcf: VCF file
    :param dict tools_config: Dict with the
        map between available tools and VCF
        annotations (already parsed, only
        existing fields should be passed here).
    :param list thresholds: List of tools
        to process based on the tools config file
        as well as the selected scope for the
        analysis
    :param str af_col: VCF field that measures
            population frequency of variants.
    :param bool is_clinvar: Whether `vcf` is
        from clinvar database.

    :return dict: Dict with scores keyed by variant
    :return list: List with the tools provided in the
    config that were not found in the VCF. (Will be
    used to update the config_tools variable). Each
    tool is listed once.

    :raises ValueError: if neither an ANN nor a CSQ field
        is found, or if `is_clinvar` is set and none of
        the Clinvar INFO fields is declared in the header.
    """
    logging.info("Extracting predictions from VCF.")

    # List of required INFO fields when
    # dataset is from Clinvar
    CLINVAR_FIELDS = ['CLNREVSTAT', 'CLNSIG']
    clinvar_confirmed = False

    # List of tools belonging to the
    # provided scope that will be analysed
    # Requires to be present in the updated config
    # (some tools might be removed from config after processing first file)
    TOOLS_TO_ANALYSE = [
        t[0] for t in thresholds if t[0] in tools_config.keys()
    ]

    # MAP with the list of VCF fields per tool
    TOOLS_CONFIG_MAP = {
        k: v[0]
        for k, v in tools_config.items() if k in TOOLS_TO_ANALYSE
    }

    # List with the VCF fields to check
    # Present fields in the header will be
    # popped out sequentially
    MISSING_VCF_FIELDS_FLAT = [
        i for sublist in TOOLS_CONFIG_MAP.values() for i in sublist
    ]

    # Missing tools from VCF to return
    # and exclude from analysis
    MISSING_VCF_TOOLS = []

    # Indexes is an ordered dict with the index(s)
    # where information about a given tool is stored
    # in the VEP annotation field
    vep_indexes = OrderedDict()

    # List of VCF fields (not tool names) that are not
    # found in the VEP annotation field
    absent_from_VEP_annot = []

    # Dict with the list of tools and corresponding
    # VCF fields found in the INFO field as a single
    # annotation
    present_in_INFO = defaultdict(list)

    # List of tool names with all annotation fields
    # found in the VEP annotation field
    all_vep_annotations = []
    present_in_VEP_annot = []

    # Parse header and count number of variants
    vcf_data = VCF(vcf)
    if vcf_data.contains("ANN"):
        VEP_TAG = "ANN"
    elif vcf_data.contains("CSQ"):
        VEP_TAG = "CSQ"
    else:
        raise ValueError(
            "VETA requires VEP annotations. ANN or CSQ field was not found in "
            "the INFO field of the input VCF file. Exiting.\n")

    for field in vcf_data.header_iter():

        # If field is in VEP annotations
        if field["HeaderType"] == "INFO" and field["ID"] == VEP_TAG:
            all_vep_annotations = field["Description"].split(
                "Format:")[1][:-1].strip().split("|")
            if af_col in all_vep_annotations:
                vep_indexes[af_col] = [all_vep_annotations.index(af_col)]

            # Looks only for scores belonging
            # to the specific scope set
            # Also look for the AF column set
            for _tool in TOOLS_TO_ANALYSE:

                _tool_field = TOOLS_CONFIG_MAP[_tool]
                _present, _absent = _check_if_field_exists(
                    _tool_field, all_vep_annotations)

                if _present:
                    _fields_present = [v[0] for v in _present]
                    present_in_VEP_annot.extend(_fields_present)
                    vep_indexes[_tool] = [v[1] for v in _present]
                    for _found in _fields_present:
                        MISSING_VCF_FIELDS_FLAT.remove(_found)

                if _absent:
                    absent_from_VEP_annot.extend(_absent)

        elif field["HeaderType"] == "INFO":
            if field['ID'] in MISSING_VCF_FIELDS_FLAT:
                _tool_name = [
                    _t for _t, v in TOOLS_CONFIG_MAP.items()
                    if field['ID'] in v
                ][0]
                present_in_INFO[_tool_name].append(field['ID'])
                MISSING_VCF_FIELDS_FLAT.remove(field['ID'])

            if field['ID'] in CLINVAR_FIELDS:
                clinvar_confirmed = True

            if field['ID'] == af_col and af_col not in vep_indexes.keys(
            ) and af_col not in present_in_INFO.keys():
                present_in_INFO[af_col].append(field['ID'])

    if is_clinvar and not clinvar_confirmed:
        raise ValueError(
            'Input VCF is not from Clinvar ({}) fields were '
            'not found in the VCF INFO annotations. They are '
            'required for clinical significance extraction.\n'
            'Additionally, be aware that when running in '
            'benchmark mode, non-Clinvar files must be passed '
            'via an input directory where target files '
            'referring to each class are located.'.format(CLINVAR_FIELDS))

    # if there are VCF fields from
    # the config absent in VCF header
    # LOG absent fields
    if MISSING_VCF_FIELDS_FLAT:

        for tool, fields in TOOLS_CONFIG_MAP.items():

            for _f in fields:
                if _f in MISSING_VCF_FIELDS_FLAT:
                    logging.info("\'{}\' field does not exist "
                                 "in VCF. Corresponding tool "
                                 "(\'{}\') will be discarded from "
                                 "analysis.".format(_f, tool))

                    # A tool with several missing fields is
                    # reported only once
                    if tool not in MISSING_VCF_TOOLS:
                        MISSING_VCF_TOOLS.append(tool)
                    # Delete tool from analysis
                    # if at least one subfield
                    # is not present
                    vep_indexes.pop(tool, None)
                    present_in_INFO.pop(tool, None)

    n_variants = sum(1 for _ in vcf_data)
    vcf_data.close()

    # Pack kwargs
    args = {
        'VEP_TAG': VEP_TAG,
        'all_vep_annotations': all_vep_annotations,
        'present_in_INFO': present_in_INFO,
        'vep_indexes': vep_indexes,
        'is_clinvar': is_clinvar
    }

    # Variant processing
    if n_variants > 5000:
        # The context manager closes the pipe and waits for
        # bcftools, so no child process is left unreaped
        with subprocess.Popen(["bcftools", "view", "-h", vcf],
                              stdout=subprocess.PIPE) as header_proc:
            header = header_proc.stdout.readlines()

        # NOTE(review): assumes the chunk files written by
        # split_file_in_chunks are independent of this
        # temporary file once the call returns - confirm
        with tempfile.NamedTemporaryFile() as body_variants:
            subprocess.call(["bcftools", "view", "-H", vcf],
                            stdout=body_variants)
            list_files = osutils.split_file_in_chunks(
                body_variants.name, header, n_variants)

        # Leaving the with-block shuts the pool down
        with multiprocessing.Pool() as p:
            dict_list = list(
                tqdm(p.imap(partial(_iter_variants, **args), list_files)))

        logging.info("Merging data from parallel VCF processing.")
        scores = defaultdict(list)
        for d in dict_list:
            # if bool(d.keys() & scores.keys()):
            #    raise ValueError("Repeated variants in the input file.")
            scores.update(d)
        for chunk_file in list_files:
            os.remove(chunk_file)
        logging.info("Done.")

    else:
        scores = _iter_variants(vcf, **args)

    return scores, MISSING_VCF_TOOLS
Example #17
0
# VEP allele-frequency sub-tags and the INFO tag names they are stored under.
_VEP_TO_PCGR_AF = {
    'gnomAD_AMR_AF': 'AMR_AF_GNOMAD',
    'gnomAD_AFR_AF': 'AFR_AF_GNOMAD',
    'gnomAD_EAS_AF': 'EAS_AF_GNOMAD',
    'gnomAD_NFE_AF': 'NFE_AF_GNOMAD',
    'gnomAD_AF': 'GLOBAL_AF_GNOMAD',
    'gnomAD_SAS_AF': 'SAS_AF_GNOMAD',
    'gnomAD_OTH_AF': 'OTH_AF_GNOMAD',
    'gnomAD_ASJ_AF': 'ASJ_AF_GNOMAD',
    'gnomAD_FIN_AF': 'FIN_AF_GNOMAD',
    'AFR_AF': 'AFR_AF_1KG',
    'AMR_AF': 'AMR_AF_1KG',
    'SAS_AF': 'SAS_AF_1KG',
    'EUR_AF': 'EUR_AF_1KG',
    'EAS_AF': 'EAS_AF_1KG',
    'AF': 'GLOBAL_AF_1KG',
}

# Position of each annotation inside one '|'-separated PCGR_ONCO_XREF entry
# (position 0 holds the Ensembl transcript identifier).
_PCGR_ONCO_XREF_MAP = {
    'SYMBOL': 1,
    'ENTREZ_ID': 2,
    'UNIPROT_ID': 3,
    'APPRIS': 4,
    'UNIPROT_ACC': 5,
    'CHORUM_ID': 6,
    'TUMOR_SUPPRESSOR': 7,
    'ONCOGENE': 8,
    'NETWORK_CG': 9,
    'DISGENET_CUI': 10,
    'CHEMBL_COMPOUND_ID': 11,
    'INTOGEN_DRIVER': 12,
    'ONCOSCORE': 13,
    'CANCER_PREDISPOSITION': 14,
}

# Cross-reference annotations that are parsed but never copied to INFO.
_ONCO_XREF_NOT_EXPORTED = ('CHORUM_ID', 'UNIPROT_ACC')

# Cross-reference annotations written as boolean flags rather than values.
_ONCO_XREF_FLAG_TAGS = ('TUMOR_SUPPRESSOR', 'ONCOGENE', 'NETWORK_CG',
                        'CANCER_PREDISPOSITION')


def _parse_csq_dbnsfp_headers(vcf, infotags_meta):
    """Read the CSQ and DBNSFP 'Format: ' descriptions from the VCF header.

    Returns a tuple ``(index2fields, fields2index, algorithms)``: the two
    dictionaries map CSQ sub-field positions to tag names (and back) for the
    tags present in *infotags_meta*; *algorithms* lists the DBNSFP sub-tags
    from position 7 onwards with a trailing ``_score``/``_pred`` removed.
    """
    index2fields = {}
    fields2index = {}
    algorithms = []
    for element in vcf.header_iter():
        header = element.info()
        if 'ID' not in header:
            continue
        identifier = str(header['ID'])
        if identifier not in ('CSQ', 'DBNSFP'):
            continue
        description = str(header['Description'])
        if 'Format: ' not in description:
            continue
        subtags = description.split('Format: ')[1].split('|')
        if identifier == 'CSQ':
            for position, subtag in enumerate(subtags):
                tag = str(_VEP_TO_PCGR_AF.get(subtag, subtag))
                if tag in infotags_meta:
                    index2fields[position] = tag
                    fields2index[tag] = position
        else:
            for subtag in subtags[7:]:
                algorithms.append(
                    str(re.sub(r'((_score)|(_pred))"*$', '', subtag)))
    return index2fields, fields2index, algorithms


def _parse_onco_xref(rec):
    """Return {transcript_id: {annotation: value}} from PCGR_ONCO_XREF."""
    xref_by_transcript = {}
    raw = rec.INFO.get('PCGR_ONCO_XREF')
    if raw is None:
        return xref_by_transcript
    for transcript_entry in raw.split(','):
        xrefs = transcript_entry.split('|')
        annotations = {}
        for annotation, position in _PCGR_ONCO_XREF_MAP.items():
            # Entries may be shorter than the full layout; skip absent/empty.
            if position < len(xrefs) and xrefs[position] != '':
                annotations[annotation] = xrefs[position]
        xref_by_transcript[str(xrefs[0])] = annotations
    return xref_by_transcript


def _expand_picked_consequence(rec, csq_fields, index2fields, onco_xref):
    """Copy the sub-fields of one picked CSQ entry into separate INFO tags."""
    for position, value in enumerate(csq_fields):
        tag = index2fields.get(position)
        if tag is None or value == '':
            continue
        rec.INFO[tag] = str(value)
        if tag == 'Feature':
            transcript_xref = onco_xref.get(str(value))
            if transcript_xref is None:
                continue
            for annotation in _PCGR_ONCO_XREF_MAP:
                if annotation in _ONCO_XREF_NOT_EXPORTED:
                    continue
                if annotation not in transcript_xref:
                    continue
                if annotation in _ONCO_XREF_FLAG_TAGS:
                    rec.INFO[annotation] = True
                else:
                    rec.INFO[annotation] = transcript_xref[annotation]
        elif tag == 'DOMAINS':
            for domain in str(value).split('&'):
                if domain.startswith('Pfam_domain'):
                    # Strip the 'Pfam_domain:' prefix and a trailing version.
                    rec.INFO['PFAM_DOMAIN'] = str(
                        re.sub(r'\.[0-9]{1,}$', '',
                               re.sub(r'Pfam_domain:', '', domain)))
        elif tag == 'Existing_variation':
            cosmic_identifiers = []
            dbsnp_identifiers = []
            for var_id in str(value).split('&'):
                if var_id.startswith('COSM'):
                    cosmic_identifiers.append(var_id)
                if var_id.startswith('rs'):
                    dbsnp_identifiers.append(re.sub('^rs', '', var_id))
            if cosmic_identifiers:
                rec.INFO['COSMIC_MUTATION_ID'] = '&'.join(cosmic_identifiers)
            if dbsnp_identifiers:
                rec.INFO['DBSNPRSID'] = '&'.join(dbsnp_identifiers)


def _log_chromosome_summary(num_records, chrom):
    """Log how many records were annotated on one chromosome."""
    logger.info(
        'Completed summary of functional annotations for {} variants on '
        'chromosome {}'.format(num_records, chrom))


def extend_vcf_annotations(query_vcf, pcgr_db_directory, pcgr_predispose):
    """
    Function that reads VEP/vcfanno-annotated VCF and extends the VCF INFO column with tags from
    1. CSQ elements within the primary transcript consequence picked by VEP, e.g. SYMBOL, Feature, Gene, Consequence etc.
    2. Cancer-relevant gene annotations, e.g. known oncogenes/tumor suppressors, known antineoplastic drugs interacting with a given protein etc.
    3. Protein-relevant annotations, e.g. cancer hotspot mutations, functional protein features etc.
    4. Variant effect predictions

    Args:
        query_vcf: path of the input VCF; the output path is derived from it
            by replacing the ``.vcf``/``.vcf.gz`` suffix with
            ``.annotated.vcf``.
        pcgr_db_directory: directory holding the infotag definition files.
        pcgr_predispose: when ``True`` the predisposition infotag file is
            used instead of the default one.

    Records without a CSQ tag are skipped with a warning. When the written
    VCF is non-empty it is compressed with bgzip, indexed with tabix and
    handed to ``write_pass_vcf``; otherwise an error is reported through
    ``pcgrutils.pcgr_error_message``.

    Raises:
        subprocess.CalledProcessError: if bgzip or tabix exits non-zero.
    """
    import subprocess

    ## read VEP and PCGR tags to be appended to VCF file
    infotag_file = 'pcgr_infotags.tsv'
    if pcgr_predispose is True:
        infotag_file = 'pcgr_infotags_predisposition.tsv'
    pcgr_vcf_infotags_meta = pcgrutils.read_infotag_file(
        os.path.join(pcgr_db_directory, infotag_file))

    out_vcf = re.sub(r'\.vcf(\.gz){0,}$', '.annotated.vcf', query_vcf)

    vcf = VCF(query_vcf)
    (vep_csq_index2fields, vep_csq_fields2index,
     dbnsfp_prediction_algorithms) = _parse_csq_dbnsfp_headers(
         vcf, pcgr_vcf_infotags_meta)

    for tag in pcgr_vcf_infotags_meta:
        if not vcf.contains(tag):
            vcf.add_info_to_header({
                'ID': tag,
                'Description': str(pcgr_vcf_infotags_meta[tag]['description']),
                'Type': str(pcgr_vcf_infotags_meta[tag]['type']),
                'Number': str(pcgr_vcf_infotags_meta[tag]['number']),
            })

    w = Writer(out_vcf, vcf)
    current_chrom = None
    num_chromosome_records_processed = 0
    try:
        for rec in vcf:
            chrom = str(rec.CHROM)
            if chrom != current_chrom:
                # Report the finished chromosome before starting a new one.
                if current_chrom is not None:
                    _log_chromosome_summary(num_chromosome_records_processed,
                                            current_chrom)
                current_chrom = chrom
                num_chromosome_records_processed = 0

            csq_value = rec.INFO.get('CSQ')
            if csq_value is None:
                variant_id = 'g.' + chrom + ':' + str(rec.start + 1) + str(
                    rec.REF) + '>' + ','.join(rec.ALT)
                logger.warning(
                    'Variant record ' + str(variant_id) +
                    ' does not have CSQ tag from Variant Effect Predictor (vep_skip_intergenic in config set to true?)  - variant will be skipped'
                )
                continue

            num_chromosome_records_processed += 1
            pcgr_onco_xref = _parse_onco_xref(rec)

            all_transcript_consequences = []
            for csq in csq_value.split(','):
                csq_fields = csq.split('|')
                ## only consider the primary/picked consequence when expanding with annotation tags
                if csq_fields[vep_csq_fields2index['PICK']] == "1":
                    _expand_picked_consequence(rec, csq_fields,
                                               vep_csq_index2fields,
                                               pcgr_onco_xref)
                    set_coding_change(rec)
                # Every transcript consequence (picked or not) is summarised.
                symbol = csq_fields[vep_csq_fields2index['SYMBOL']] or '.'
                all_transcript_consequences.append(':'.join([
                    str(csq_fields[vep_csq_fields2index['Consequence']]),
                    str(symbol),
                    str(csq_fields[vep_csq_fields2index['Feature_type']]),
                    str(csq_fields[vep_csq_fields2index['Feature']]),
                    str(csq_fields[vep_csq_fields2index['BIOTYPE']]),
                ]))

            if rec.INFO.get('DBNSFP') is not None:
                map_variant_effect_predictors(rec,
                                              dbnsfp_prediction_algorithms)

            rec.INFO['VEP_ALL_CONSEQUENCE'] = ','.join(
                all_transcript_consequences)
            w.write_record(rec)
    finally:
        # Release both handles even when a record fails to process.
        w.close()
        vcf.close()

    _log_chromosome_summary(num_chromosome_records_processed, current_chrom)

    if os.path.exists(out_vcf) and os.path.getsize(out_vcf) > 0:
        # Argument lists (no shell) keep paths with spaces or shell
        # metacharacters intact; check=True surfaces a failed step here
        # instead of in the code that consumes the compressed file.
        subprocess.run(['bgzip', '-f', str(out_vcf)], check=True)
        annotated_vcf = out_vcf + '.gz'
        subprocess.run(['tabix', '-f', '-p', 'vcf', annotated_vcf],
                       check=True)
        write_pass_vcf(annotated_vcf)
    else:
        pcgrutils.pcgr_error_message(
            'No remaining PASS variants found in query VCF - exiting and skipping STEP 4 (pcgr-writer)',
            logger)
Example #18
0
    # Each tab-separated line of f starts with a contig name and its length;
    # remember the name and declare the contig in the VCF header.
    for line in f:
        contig, length, *_ = line.strip().split("\t")
        contigs.add(contig)
        vcf.add_to_header(f"##contig=<ID={contig},length={length}>")

# Add the extra raw header lines, INFO definitions and FORMAT definitions.
for header in headers:
    vcf.add_to_header(header)

for info in infos:
    vcf.add_info_to_header(info)

for fmt in formats:
    vcf.add_format_to_header(fmt)

# Map every header ID to its HeaderType; header entries that lack either key
# are skipped.
header_types = {}
for header in vcf.header_iter():
    try:
        header_types[header["ID"]] = header["HeaderType"]
    except KeyError:
        pass

refseq = FastaFile(ref)
# Fetch the first record of vcf before the writer is created.
variant = next(vcf)
writer = Writer(tmpoutvcf, vcf)
try:
    with open(inbed) as f:
        for line in f:
            # chr,start,end,name,...
            items = line.rstrip("\n\r").split("\t")
            # In "drop" mode, lines whose first column is not a known contig
            # are ignored.
            if nonexisting_contigs == "drop" and items[0] not in contigs:
                continue
	def __init__(self,
			fname,
			output,
			vcfanno=None,
			true_label=0,
			vep=None,
			vep_field='CSQ',
			vep_separator='|'):
		"""Read a VCF and collect one dict of annotations per single-base variant.

		Args:
			fname: path of the VCF to read.
			output: not used in the visible part of this method.
			vcfanno: INFO tags to copy for every record (default: none).
			true_label: if truthy, stored as 'True_Label' for every record;
				otherwise the record's own 'True_Label' INFO value is used.
			vep: not used in the visible part of this method.
			vep_field: INFO tag holding the VEP annotation; a falsy value
				disables VEP parsing.
			vep_separator: separator between VEP sub-fields.
		"""
		# None defaults avoid sharing one list object between calls.
		vcfanno = [] if vcfanno is None else vcfanno
		vep = [] if vep is None else vep

		nan_dict = {
			'': np.nan,
			'.': np.nan,
			'NaN': np.nan,
			'nan': np.nan,
			'None': np.nan,
			'NA': np.nan,
		}

		unwanted = ['', '.', 'NaN', 'nan', 'None', 'NA']

		print('\nStarting VCF to Pandas ...')

		v = VCF(fname)

		l_dict = list()
		index_dict = dict()
		# Initialised up front so the pprint below cannot hit an unbound name
		# when VEP parsing is disabled or the header lacks vep_field.
		csq_header = []
		if vep_field:
			for h in v.header_iter():
				print(h)
				try:
					header_info = h.info()
					if header_info['ID'] == vep_field:
						csq_header = header_info['Description'].split(vep_separator)
						for position, elem in enumerate(csq_header):
							# Keep the first position of a repeated name.
							index_dict.setdefault(elem, position)
				except KeyError:
					# Header lines without ID/Description are not of interest.
					pass

		pprint(csq_header)
		pprint(index_dict)

		for record in tqdm(v):
			# Only single-base substitutions are collected.
			if len(record.REF) == 1 and len(record.ALT[0]) == 1:
				tmp_dict = dict()
				variant_id = str(record.CHROM) + '_' + str(record.POS) + '_' + str(record.REF) + '_' + str(record.ALT[0])
				tmp_dict['ID'] = variant_id
				if vep_field:
					csq = record.INFO.get(vep_field)
					# Records without a VEP annotation are kept, just without
					# the 'Amino_acids' entry.
					if csq is not None:
						for case in csq.split(','):
							case = case.split(vep_separator)
							if 'missense_variant' in case[1]:
								tmp_dict['Amino_acids'] = case[index_dict['Amino_acids']]
				if true_label:
					tmp_dict['True_Label'] = true_label
				else:
					tmp_dict['True_Label'] = record.INFO.get('True_Label')
				for col in vcfanno:
					field = str(record.INFO.get(col))
					if field:
						if ',' in field:
							field = field.split(',')
							if len(set(field)) <= 1:
								field = field[0]
							else:
								try:
									field = [float(e) for e in field if e not in unwanted]
									genename = False
								except ValueError:
									# Non-numeric values: keep one of the
									# distinct entries (set order is arbitrary).
									field = list(set(field))[0]
									genename = True
								if genename is False:
									if not field:
										# Every value was a missing-value marker.
										field = np.nan
									elif 'SIFT' in col:
										field = min(field)
									else:
										field = max(field)

						tmp_dict[col] = field
				l_dict.append(tmp_dict)
Example #20
0
def _vcf2tsv_ascii(text):
    """Drop every non-ASCII character from *text*."""
    return text.encode('ascii', 'ignore').decode('ascii')


def _vcf2tsv_warn_type(info_field, declared_type, value):
    """Print the warning for an INFO value that does not match its header type."""
    print('vcf2tsv.py WARNING:\tINFO tag ' + str(info_field) +
          ' is defined in the VCF header as type \'' + declared_type +
          '\', yet parsed as other type:' + str(type(value)))


def _vcf2tsv_info_value(info_field, value, declared_type, alt,
                        fixed_fields_string):
    """Return the TSV cell for one INFO value, given its declared header type."""
    if declared_type == 'Flag':
        return 'False' if value is None else 'True'
    if declared_type not in ('Float', 'Integer', 'String', 'Character'):
        # Unknown type: still emit a cell so columns stay aligned with the header.
        return '.'
    if isinstance(value, (list, tuple)):
        return ",".join(str(n) for n in value)
    if value is None:
        return '.'

    if declared_type == 'Float':
        if isinstance(value, float):
            return str("{0:.7f}".format(value))
        _vcf2tsv_warn_type(info_field, 'Float', value)
        if ',' not in str(alt):
            print(
                'Warning: Multiple values in INFO tag for single ALT allele (VCF multiallelic sites not decomposed properly?):'
                + str(fixed_fields_string) + '\t' + str(info_field) + '=' +
                str(value))
        return '.'

    if declared_type in ('String', 'Character'):
        if isinstance(value, str):
            return _vcf2tsv_ascii(value)
        _vcf2tsv_warn_type(info_field, declared_type, value)
        return '.'

    # Integer
    if isinstance(value, int):
        return str(value)
    _vcf2tsv_warn_type(info_field, 'Integer', value)
    # str() first, so a value that is neither int nor str cannot crash here.
    return re.sub(r'\(|\)', '', _vcf2tsv_ascii(str(value)))


def _vcf2tsv_sample_data(rec, samples, format_tags, column_types, gt_present):
    """Return {sample: {format_tag: text}} for one record, 'GT' included."""
    gt_labels = {0: '0/0', 1: '0/1', 2: '1/1'}
    gt_types = rec.gt_types if gt_present else None
    sample_data = {}
    for i, sample in enumerate(samples):
        gt = './.'
        if gt_present:
            gt = gt_labels.get(int(gt_types[i]), './.')
        sample_data[sample] = {'GT': gt}

    for format_tag in sorted(format_tags):
        values = rec.format(format_tag)
        if values is None:
            for sample in samples:
                sample_data[sample][format_tag] = '.'
            continue
        ## sample-wise
        for j in range(values.shape[0]):
            cell = values[j]
            if cell.size > 1:
                text = ','.join(str(e) for e in np.ndarray.tolist(cell))
            else:
                text = '.'
                declared_type = column_types[format_tag]
                if declared_type == 'String':
                    text = str(cell)
                elif declared_type == 'Integer':
                    text = str(cell[0])
                elif declared_type == 'Float':
                    # Single Float values used to be dropped; NaN stays '.'.
                    if not np.isnan(cell[0]):
                        text = str(cell[0])
            if samples[j] in sample_data:
                sample_data[samples[j]][format_tag] = text
    return sample_data


def vcf2tsv(query_vcf, out_tsv, skip_info_data, skip_genotype_data,
            keep_rejected_calls, compress, print_data_type_header):
    """Convert a VCF file to a tab-separated file.

    Args:
        query_vcf: path of the input VCF.
        out_tsv: path of the TSV file to write.
        skip_info_data: INFO columns are written only when this is ``False``.
        skip_genotype_data: per-sample columns (one output line per sample
            and variant) are written only when this is ``False`` and the VCF
            has samples.
        keep_rejected_calls: if falsy, records whose FILTER does not contain
            'PASS' and sample lines with genotype './.' or '.' are omitted.
        compress: when ``True`` the TSV is gzipped via ``check_subprocess``.
        print_data_type_header: when ``True`` a '#'-prefixed line with the
            column data types precedes the column header.
    """
    vcf = VCF(query_vcf, gts012=True)

    fixed_columns_header = [
        'CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER'
    ]
    fixed_columns_header_type = [
        'String', 'Integer', 'String', 'String', 'String', 'Float', 'String'
    ]
    samples = vcf.samples
    include_info = skip_info_data is False
    include_genotypes = skip_genotype_data is False and len(samples) > 0
    sample_columns_header = ['VCF_SAMPLE_ID'] if len(samples) > 0 else []

    info_columns_header = []
    format_columns_header = []
    column_types = {}
    gt_present_header = False

    for e in vcf.header_iter():
        header_element = e.info()
        if 'ID' not in header_element or 'HeaderType' not in header_element:
            continue
        header_type = header_element['HeaderType']
        if header_type in ('INFO', 'FORMAT'):
            column_types[header_element['ID']] = header_element['Type']
        if header_type == 'INFO' and include_info:
            info_columns_header.append(header_element['ID'])
        if header_type == 'FORMAT' and include_genotypes:
            if header_element['ID'] != 'GT':
                format_columns_header.append(header_element['ID'])
            else:
                gt_present_header = True

    # info_columns_header is empty whenever INFO data is skipped.
    header_tags = fixed_columns_header + sorted(info_columns_header)
    if include_genotypes:
        header_tags = header_tags + sample_columns_header + sorted(
            format_columns_header) + ['GT']
    header_line = '\t'.join(header_tags)

    try:
        with open(out_tsv, 'w') as out:
            out.write('#https://github.com/sigven/vcf2tsv version=' +
                      str(version) + '\n')
            if print_data_type_header is True:
                header_types = [
                    str(column_types[h]) for h in header_tags
                    if h in column_types
                ]
                header_line_type = '\t'.join(fixed_columns_header_type +
                                             header_types)
                out.write('#' + str(header_line_type) + '\n')
            out.write(str(header_line) + '\n')

            for rec in vcf:
                rec_filter = 'PASS' if rec.FILTER is None else str(rec.FILTER)
                if 'PASS' not in rec_filter and not keep_rejected_calls:
                    continue

                rec_id = '.' if rec.ID is None else str(rec.ID)
                rec_qual = '.'
                if rec.QUAL is not None:
                    rec_qual = str("{0:.2f}".format(rec.QUAL))
                alt = ",".join(str(n) for n in rec.ALT)
                pos = int(rec.start) + 1
                fixed_fields_string = '\t'.join([
                    str(rec.CHROM), str(pos), str(rec_id), str(rec.REF),
                    str(alt), str(rec_qual), str(rec_filter)
                ])

                # Building the row as a list (not a pre-joined INFO string)
                # avoids a stray empty column when there are no INFO columns.
                row_prefix = [fixed_fields_string]
                if include_info:
                    variant_info = rec.INFO
                    for info_field in sorted(info_columns_header):
                        row_prefix.append(
                            _vcf2tsv_info_value(info_field,
                                                variant_info.get(info_field),
                                                column_types[info_field], alt,
                                                fixed_fields_string))

                if not include_genotypes:
                    # Also reached when INFO data is skipped and the VCF has
                    # no samples; that combination used to write no rows.
                    out.write('\t'.join(row_prefix) + '\n')
                    continue

                #dictionary, with sample names as keys, values being genotype data (dictionary with format tags as keys)
                vcf_sample_genotype_data = _vcf2tsv_sample_data(
                    rec, samples, format_columns_header, column_types,
                    gt_present_header)

                ## one line per sample variant
                for sample in sorted(vcf_sample_genotype_data):
                    sample_tags = vcf_sample_genotype_data[sample]
                    line_elements = list(row_prefix)
                    line_elements.append(sample)
                    for tag in sorted(sample_tags):
                        if tag != 'GT':
                            line_elements.append(
                                _vcf2tsv_ascii(sample_tags[tag]))
                    gt_tag = _vcf2tsv_ascii(sample_tags.get('GT', '.'))
                    line_elements.append(gt_tag)
                    if gt_tag in ('./.', '.') and not keep_rejected_calls:
                        continue
                    out.write('\t'.join(line_elements) + '\n')
    finally:
        vcf.close()

    if compress is True:
        command = 'gzip -f ' + str(out_tsv)
        check_subprocess(command)
def generate_vcf(gnomad_vcf, outfile, pop, format_fields):
    """Write a VCF with simulated per-individual genotypes for a gnomAD file.

    For every record of ``gnomad_vcf`` the homozygous-alt and heterozygous
    counts are derived from INFO (``nhomalt`` and ``AC``, or their
    ``_<pop>`` variants), handed to ``simulate_genotypes`` and written to
    ``outfile`` together with synthetic AD, DP, GQ and PL values. Only the
    first ALT allele of each record is written.

    :param gnomad_vcf: input VCF, opened with ``VCF``
    :param outfile: path of the plain-text VCF to write
    :param pop: ``"All"`` to use the un-suffixed INFO keys, otherwise the
        suffix appended to ``nhomalt_`` / ``AC_``
    :param format_fields: forwarded to ``generate_putative_GQ_DP``
    :return: the value returned by ``get_number_individuals``
    """
    logging.info("Processing gnomAD file")
    nind = get_number_individuals(gnomad_vcf, pop)
    gt_dp, gt_qual = generate_putative_GQ_DP(format_fields, nind)
    vcf_data = VCF(gnomad_vcf, gts012=True)

    # PL assigned to every genotype that was NOT called for an individual.
    unlikely_pl = 1500

    format_headers = (
        {
            'ID': 'GT',
            'Description': 'Genotype',
            'Type': 'String',
            'Number': 1
        },
        {
            'ID': 'AD',
            'Description':
            'Allelic depths for the ref and alt alleles in the order listed',
            'Type': 'Integer',
            # One value per allele (ref + alt) is written below, so the
            # field is declared per-allele rather than as a single value.
            'Number': 'R'
        },
        {
            'ID': 'DP',
            'Description': 'Approximate read depth',
            'Type': 'Integer',
            'Number': 1
        },
        {
            'ID': 'GQ',
            'Description': 'Genotyp Quality',
            'Type': 'Integer',
            'Number': 1
        },
        {
            'ID': 'PL',
            'Description':
            'normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification',
            'Type': 'Integer',
            'Number': "G"
        },
    )

    with open(outfile, 'w') as out:
        for format_header in format_headers:
            vcf_data.add_format_to_header(format_header)

        # One sample column per simulated individual. All per-sample arrays
        # below are sized ``nind``, so ``nind`` names are needed.
        # NOTE(review): assumes simulate_genotypes returns ``nind``
        # genotypes per record - confirm against its implementation.
        individuals = ["ind_" + str(i) for i in range(1, nind + 1)]
        header = filter(None, vcf_data.raw_header.split("\n"))
        final_header = [
            line + '\t' +
            '\t'.join(individuals) if line.startswith("#CHROM") else line
            for line in header
        ]
        out.write('\n'.join(final_header) + "\n")

        info_fields = [
            field["ID"] for field in vcf_data.header_iter()
            if field["HeaderType"] == "INFO"
        ]

        for record in vcf_data:
            info = record.INFO
            suffix = "" if pop == "All" else "_" + pop
            nhomalt = info["nhomalt" + suffix]
            nheterozygous = info["AC" + suffix] - (nhomalt * 2)

            record_combined_gt = simulate_genotypes(nind, nhomalt,
                                                    nheterozygous)
            # Defaults describe a hom-ref call: PL = 0,1500,1500 and no
            # alt-supporting reads. ``int`` replaces the removed ``np.int``.
            gt_phred_ll_homref = np.zeros((nind, ), dtype=int)
            gt_phred_ll_het = np.full(shape=nind,
                                      fill_value=unlikely_pl,
                                      dtype=int)
            gt_phred_ll_homalt = np.full(shape=nind,
                                         fill_value=unlikely_pl,
                                         dtype=int)
            gt_alt_depth = np.zeros((nind, ), dtype=int)
            gt_ref_depth = copy.deepcopy(gt_dp)

            fmt = []
            for i, gt in enumerate(record_combined_gt):
                if gt == "0/1":  # heterozygous
                    gt_phred_ll_het[i] = 0
                    gt_phred_ll_homref[i] = unlikely_pl
                    gt_alt_depth[i] = gt_ref_depth[i] = 50
                elif gt == "1/1":  # homozygous alt
                    # The called genotype carries PL 0, like the het case.
                    gt_phred_ll_homalt[i] = 0
                    gt_phred_ll_homref[i] = unlikely_pl
                    gt_alt_depth[i] = 100
                    gt_ref_depth[i] = 0

                fmt.append("{}:{},{}:{}:{}:{},{},{}".format(
                    gt, gt_ref_depth[i], gt_alt_depth[i], gt_dp[i], gt_qual[i],
                    gt_phred_ll_homref[i], gt_phred_ll_het[i],
                    gt_phred_ll_homalt[i]))

            # Re-serialise the INFO column; keys declared in the header but
            # absent from this record are skipped.
            str_info = []
            for key in info_fields:
                try:
                    str_info.append(key + "=" + str(record.INFO[key]))
                except KeyError:
                    continue

            # Missing QUAL must be tested before stringifying, otherwise it
            # would be written as the literal text "None".
            qual = None if record.QUAL is None else str(record.QUAL)
            write_record = [
                '.' if v is None else v for v in [
                    record.CHROM,
                    str(record.POS), record.ID, record.REF, record.ALT[0],
                    qual, record.FILTER, ';'.join(str_info),
                    "GT:AD:DP:GQ:PL"
                ]
            ]

            out.write('\t'.join(write_record + fmt) + "\n")

        vcf_data.close()
    return nind
Example #22
0
def vcf2pd(vcf_in):
    '''
    Read a VCF file and return its contents as a pandas DataFrame.

    Each variant becomes one row holding the fixed fields, every INFO ID
    declared in the header, and one ``<sample>_<FORMAT ID>`` column per
    sample and FORMAT field declared in the header.

    :param vcf_in: VCF file (.vcf/.vcf.gz/.bcf)
    :return: pandas DataFrame; when the file has no variants only the fixed
        columns are present
    '''

    vcf = VCF(vcf_in, gts012=True)
    lRows = []  # build a list of rows and convert to a DataFrame once at the end (fastest)
    lInfo = []  # INFO IDs
    lFormat = []  # FORMAT IDs

    samples = vcf.samples

    # Collect the INFO and FORMAT IDs declared in the header.
    for h in vcf.header_iter():
        if (h['HeaderType'] == 'INFO'):
            lInfo.append(h['ID'])

        if (h['HeaderType'] == 'FORMAT'):
            lFormat.append(h['ID'])

    # Look for sample-role header lines (Mutect2 output includes them) and
    # rename the matching samples to TUMOR / NORMAL.
    for pattern, label in (('##tumor_sample=.*', 'TUMOR'),
                           ('##normal_sample=.*', 'NORMAL')):
        match = re.search(pattern, vcf.raw_header)
        if match is not None:
            samples[samples.index(match.group().split('=')[1])] = label

    # With no samples there is nothing to read per FORMAT field.
    per_sample_fields = lFormat if samples else []

    for v in vcf:

        # Store the fixed fields.
        dVariant = {
            'CHROM': v.CHROM,
            'POS': v.POS,
            'ID': v.ID,
            'REF': v.REF,
            'ALT': ','.join(v.ALT),
            'QUAL': v.QUAL,
            'FILTER': v.FILTER
        }

        # cyvcf2 reports FILTER=None when the filter is PASS, so restore it.
        if not dVariant['FILTER']:
            dVariant['FILTER'] = 'PASS'

        # INFO fields
        for i in lInfo:
            dVariant[i] = v.INFO.get(i)

        # FORMAT fields
        for f in per_sample_fields:

            if f == 'GT':
                # v.format('GT') uses an awkward encoding, so use gt_types:
                # 0 = hom_ref, 1 = het, 2 = hom_alt, 3 = unknown
                gt_types = v.gt_types
                for i, sample in enumerate(samples):
                    dVariant[sample + '_' + f] = gt_types[i]
                continue

            # Decode the field once per variant instead of once per sample.
            values = v.format(f)
            if values is None:
                continue

            for i, sample in enumerate(samples):
                value = values[i]
                column = sample + '_' + f
                if isinstance(value, str):
                    # keep strings whole instead of comma-joining letters
                    dVariant[column] = str(value)
                elif np.isnan(value).any():  # missing value
                    dVariant[column] = None
                else:
                    dVariant[column] = ','.join(list(map(str, value)))

        lRows.append(dVariant)

    vcf.close()

    cols = ['CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER']
    cols.extend(lInfo)
    cols.extend([s + '_' + f for s in samples for f in lFormat])

    if lRows:
        df = pd.DataFrame(lRows, columns=cols)
    else:
        df = pd.DataFrame(
            columns=['CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER'])

    return df
Example #23
0
def process_vep_scores(vcf_file, is_clinvar=False):
    """Collect per-variant annotation scores from a VEP-annotated VCF.

    The sub-field names of the ``ANN`` INFO field are parsed from its header
    description. For each record the first ``ANN`` block is split on ``|``
    and the values of the configured ``vep_plugins`` are stored as
    ``(name, value)`` tuples after the basic variant description.

    :param vcf_file: VCF to read; its header must declare ``ANN`` or ``CSQ``,
        otherwise a message is printed and the process exits with status 1
    :param is_clinvar: when true, ``CLNREVSTAT`` and ``CLNSIG`` are appended
        for every record
    :return: ``defaultdict(list)`` keyed by ``"<ID>_<POS>"``
    """
    import sys

    # (label stored in the output, INFO key read from the record)
    info_scores = (
        ("GERP", "GERP"),
        ("phyloP", "phyloP"),
        ("phastCons", "phastCons"),
        ("SiPhy", "SiPhy"),
        ("fitcons", "fitcons"),
        ("LINSIGHT", "linsight_g"),
        ("ReMM", "ReMM"),
        ("DANN", "DANN"),
        ("GWAVA", "GWAVA"),
        ("FATHMM-MKL", "fathmmMKL"),
        ("Eigen", "Eigen"),
        ("Eigen-PC", "Eigen-PC"),
        ("funseq2", "funseq2"),
        ("dpsi_zscore", "dpsi_zscore"),
        ("traP", "traP"),
    )

    indexes, scores, absent = OrderedDict(), defaultdict(list), []
    vcf_data = VCF(vcf_file)
    if not (vcf_data.contains("ANN") or vcf_data.contains("CSQ")):
        print("Program requires ANN field to be present in the INFO field of the input VCF file.\n")
        sys.exit(1)

    # Stays empty when the header only declares CSQ, so lookups below fall
    # into the ValueError handler instead of hitting an unbound name.
    tools = []
    spliceai_scores_fromtool = False
    for field in vcf_data.header_iter():
        if field["HeaderType"] != "INFO":
            continue
        if field["ID"] == "ANN":
            tools = field["Description"].split("Format:")[1][:-1].strip().split("|")
            for plugin in vep_plugins:
                if plugin in tools:
                    indexes[plugin] = tools.index(plugin)
                else:
                    absent.append(plugin)
        elif field["ID"] == "SpliceAI":
            spliceai_scores_fromtool = True

    for record in vcf_data:
        # NOTE(review): record.ID is concatenated directly, so a record
        # without an ID raises TypeError here - confirm inputs always have one.
        key = record.ID + "_" + str(record.POS)
        scores[key].extend([record.CHROM, record.POS, record.REF, record.ALT,
                            record.ID, record.var_type, record.var_subtype])
        annotation = record.INFO.get("ANN")
        if annotation:
            # Only the first annotation block is used.
            info = annotation.split(",")[0].split("|")

            try:
                scores[key].append(info[tools.index('Existing_variation')])
                scores[key].append(info[tools.index('HGVSc')])
                scores[key].append(info[tools.index('Consequence')])
            except ValueError:
                # Best effort: a sub-field missing from the ANN format stops
                # the remaining descriptive values from being appended.
                pass

            if absent:
                # At least one plugin is not part of ANN: read the scores
                # from their own INFO keys instead.
                for label, info_key in info_scores:
                    scores[key].append((label, record.INFO.get(info_key)))
                if spliceai_scores_fromtool:
                    scores[key].append(("SpliceAI", record.INFO.get("SpliceAI")))
                else:
                    scores[key].append(("SpliceAI", format_spliceai_fields(record)))

            for pl, i in indexes.items():
                scores[key].append((pl, info[i]))

        if is_clinvar:
            scores[key].append(('CLNREVSTAT', record.INFO.get("CLNREVSTAT")))
            scores[key].append(('CLNSIG', record.INFO.get("CLNSIG")))

    vcf_data.close()
    return scores
Example #24
0
def extend_vcf_annotations(query_vcf, pcgr_db_directory, logger, pon_annotation, cpsr):
    """
    Read a VEP/vcfanno-annotated VCF and write a copy with an extended INFO column.

    The added tags come from
    1. CSQ elements within the primary transcript consequence picked by VEP, e.g. SYMBOL, Feature, Gene, Consequence etc.
    2. Cancer-relevant gene annotations (PCGR_ONCO_XREF), e.g. known oncogenes/tumor suppressors, known antineoplastic drugs interacting with a given protein etc.
    3. Protein-relevant annotations, e.g. cancer hotspot mutations, functional protein features etc.
    4. Variant effect predictions
    5. Panel-of-normal (blacklisted variants) annotation

    The INFO tags to produce are listed in the 'infotags' files found in pcgr_db_directory.
    Records without a CSQ tag are logged and left out of the output.
    """

    def _summary_message(n_records, chrom):
        return 'Completed summary of functional annotations for ' + str(n_records) + ' variants on chromosome ' + str(chrom)

    ## tag definitions: the PCGR file is always read, the CPSR file replaces it in CPSR mode
    vcf_infotags_meta = annoutils.read_infotag_file(os.path.join(pcgr_db_directory, 'pcgr_infotags.tsv'))
    if cpsr is True:
        vcf_infotags_meta = annoutils.read_infotag_file(os.path.join(pcgr_db_directory, 'cpsr_infotags.tsv'))

    out_vcf = re.sub(r'\.vcf(\.gz){0,}$', '.annotated.vcf', query_vcf)

    meta_vep_dbnsfp_info = annoutils.vep_dbnsfp_meta_vcf(query_vcf, vcf_infotags_meta)
    dbnsfp_prediction_algorithms = meta_vep_dbnsfp_info['dbnsfp_prediction_algorithms']
    vep_csq_fields_map = meta_vep_dbnsfp_info['vep_csq_fieldmap']

    ## declare the new INFO tags; panel-of-normals tags are left out when pon_annotation is 0
    vcf = VCF(query_vcf)
    for tag in sorted(vcf_infotags_meta):
        if pon_annotation == 0 and tag.startswith('PANEL_OF_NORMALS'):
            continue
        tag_meta = vcf_infotags_meta[tag]
        vcf.add_info_to_header({
            'ID': tag,
            'Description': str(tag_meta['description']),
            'Type': str(tag_meta['type']),
            'Number': str(tag_meta['number']),
        })

    w = Writer(out_vcf, vcf)

    ## element name -> index, in the order listed here
    xref_elements = (
        'ENSEMBL_TRANSCRIPT_ID', 'ENSEMBL_GENE_ID', 'ENSEMBL_PROTEIN_ID', 'SYMBOL', 'SYMBOL_ENTREZ',
        'ENTREZ_ID', 'UNIPROT_ID', 'APPRIS', 'UNIPROT_ACC', 'REFSEQ_MRNA', 'CORUM_ID', 'TUMOR_SUPPRESSOR',
        'TUMOR_SUPPRESSOR_EVIDENCE', 'ONCOGENE', 'ONCOGENE_EVIDENCE',
        'NETWORK_CG', 'DISGENET_CUI', 'CHEMBL_COMPOUND_ID', 'CHEMBL_COMPOUND_ID_EARLY_PHASE', 'INTOGEN_DRIVER',
        'TCGA_DRIVER', 'ONCOSCORE', 'MIM_PHENOTYPE_ID', 'CANCER_PREDISPOSITION_SOURCE',
        'CANCER_SUSCEPTIBILITY_CUI', 'CANCER_SYNDROME_CUI', 'CANCER_PREDISPOSITION_MOI',
        'CANCER_PREDISPOSITION_MOD', 'SIGNALING_PATHWAY', 'OPENTARGETS_DISEASE_ASSOCS',
        'OPENTARGETS_TRACTABILITY_COMPOUND', 'OPENTARGETS_TRACTABILITY_ANTIBODY', 'GE_PANEL_ID',
        'ACTIONABLE_TARGET', 'GENCODE_GENE_STATUS',
        'PROB_HAPLOINSUFFICIENCY', 'PROB_EXAC_LOF_INTOLERANT', 'PROB_EXAC_LOF_INTOLERANT_HOM',
        'PROB_EXAC_LOF_TOLERANT_NULL', 'PROB_EXAC_NONTCGA_LOF_INTOLERANT',
        'PROB_EXAC_NONTCGA_LOF_INTOLERANT_HOM', 'PROB_EXAC_NONTCGA_LOF_TOLERANT_NULL',
        'PROB_GNOMAD_LOF_INTOLERANT', 'PROB_GNOMAD_LOF_INTOLERANT_HOM', 'PROB_GNOMAD_LOF_TOLERANT_NULL',
        'ESSENTIAL_GENE_CRISPR', 'ESSENTIAL_GENE_CRISPR2',
    )
    pcgr_onco_xref_map = {element: position for position, element in enumerate(xref_elements)}

    ## declared type of every header element that has an ID, a HeaderType and a Type
    vcf_info_element_types = {}
    for e in vcf.header_iter():
        header_element = e.info()
        if all(attr in header_element for attr in ('ID', 'HeaderType', 'Type')):
            vcf_info_element_types[str(header_element['ID'])] = str(header_element['Type'])

    current_chrom = None
    num_chromosome_records_processed = 0
    for rec in vcf:
        ## per-chromosome bookkeeping: report and reset whenever the chromosome changes
        rec_chrom = str(rec.CHROM)
        if current_chrom is None:
            current_chrom = rec_chrom
            num_chromosome_records_processed = 0
        elif rec_chrom != current_chrom:
            logger.info(_summary_message(num_chromosome_records_processed, current_chrom))
            current_chrom = rec_chrom
            num_chromosome_records_processed = 0

        if rec.INFO.get('CSQ') is None:
            variant_id = 'g.' + str(rec.CHROM) + ':' + str(rec.start + 1) + str(rec.REF) + '>' + ','.join(rec.ALT)
            logger.warning('Variant record ' + str(variant_id) + ' does not have CSQ tag from Variant Effect Predictor (vep_skip_intergenic in config set to true?)  - variant will be skipped')
            continue

        num_chromosome_records_processed += 1
        pcgr_onco_xref = annoutils.make_transcript_xref_map(rec, pcgr_onco_xref_map, xref_tag = "PCGR_ONCO_XREF")
        csq_record_results = annoutils.parse_vep_csq(rec, pcgr_onco_xref, vep_csq_fields_map, logger, pick_only = True, csq_identifier = 'CSQ')

        if 'vep_all_csq' in csq_record_results:
            rec.INFO['VEP_ALL_CSQ'] = ','.join(csq_record_results['vep_all_csq'])

        if 'vep_block' in csq_record_results:
            vep_csq_records = csq_record_results['vep_block']
            ## first block by default; in CPSR mode the helper chooses the block
            if cpsr is True:
                block_idx = annoutils.get_correct_cpg_transcript(vep_csq_records)
            else:
                block_idx = 0
            record = vep_csq_records[block_idx]
            for k in record:
                if k not in vcf_info_element_types:
                    continue
                if vcf_info_element_types[k] == "Flag" and record[k] == "1":
                    rec.INFO[k] = True
                elif record[k] is not None:
                    rec.INFO[k] = record[k]

        if rec.INFO.get('DBNSFP') is not None:
            annoutils.map_variant_effect_predictors(rec, dbnsfp_prediction_algorithms)

        w.write_record(rec)

    w.close()
    if current_chrom is not None:
        logger.info(_summary_message(num_chromosome_records_processed, current_chrom))
    vcf.close()

    ## compress and index the output, or report that nothing was written
    if os.path.exists(out_vcf) and os.path.getsize(out_vcf) > 0:
        check_subprocess(logger, 'bgzip -f ' + str(out_vcf))
        check_subprocess(logger, 'tabix -f -p vcf ' + str(out_vcf) + '.gz')
        annotated_vcf = out_vcf + '.gz'
        annoutils.write_pass_vcf(annotated_vcf, logger)
    else:
        annoutils.error_message('No remaining PASS variants found in query VCF - exiting and skipping STEP 4 (pcgr-writer)', logger)
Example #25
0
def extend_vcf_annotations(query_vcf, gvanno_db_directory, lof_prediction=0):
    """
    Read a VEP/vcfanno-annotated VCF and write a copy with an extended INFO column.

    The added tags come from
    1. CSQ elements within the primary transcript consequence picked by VEP, e.g. SYMBOL, Feature, Gene, Consequence etc.
    2. Gene annotations, e.g. known oncogenes/tumor suppressors, curated disease associations (DisGenet), MIM phenotype associations etc
    3. Protein-relevant annotations, e.g. functional protein features etc.
    4. Variant effect predictions

    Records without a CSQ tag are logged and left out of the output. The
    output is written next to ``query_vcf`` with the suffix
    ``.annotated.vcf`` and then compressed with bgzip and indexed with tabix.

    :param query_vcf: path of the annotated input VCF
    :param gvanno_db_directory: directory containing 'gvanno_infotags.tsv'
    :param lof_prediction: when 0, INFO tags starting with 'LoF' are not
        added to the header
    :raises subprocess.CalledProcessError: if bgzip or tabix exits non-zero
    """
    import subprocess

    # NOTE(review): ``logger`` is not a parameter of this function; it is
    # resolved from the enclosing module scope - confirm it is defined there.

    ## read VEP and PCGR tags to be appended to VCF file
    vcf_infotags_meta = annoutils.read_infotag_file(
        os.path.join(gvanno_db_directory, 'gvanno_infotags.tsv'))
    out_vcf = re.sub(r'\.vcf(\.gz){0,}$', '.annotated.vcf', query_vcf)

    meta_vep_dbnsfp_info = annoutils.vep_dbnsfp_meta_vcf(
        query_vcf, vcf_infotags_meta)
    dbnsfp_prediction_algorithms = meta_vep_dbnsfp_info[
        'dbnsfp_prediction_algorithms']
    vep_csq_fields_map = meta_vep_dbnsfp_info['vep_csq_fieldmap']

    ## declare the new INFO tags; LoF tags are left out when lof_prediction is 0
    vcf = VCF(query_vcf)
    for tag in vcf_infotags_meta:
        if lof_prediction == 0 and tag.startswith('LoF'):
            continue
        tag_meta = vcf_infotags_meta[tag]
        vcf.add_info_to_header({
            'ID': tag,
            'Description': str(tag_meta['description']),
            'Type': str(tag_meta['type']),
            'Number': str(tag_meta['number'])
        })

    w = Writer(out_vcf, vcf)
    current_chrom = None
    num_chromosome_records_processed = 0
    gvanno_xref_map = {
        'ENSEMBL_TRANSCRIPT_ID': 0,
        'ENSEMBL_GENE_ID': 1,
        'ENSEMBL_PROTEIN_ID': 2,
        'SYMBOL': 3,
        'SYMBOL_ENTREZ': 4,
        'ENTREZ_ID': 5,
        'UNIPROT_ID': 6,
        'UNIPROT_ACC': 7,
        'REFSEQ_MRNA': 8,
        'CORUM_ID': 9,
        'TUMOR_SUPPRESSOR': 10,
        'TUMOR_SUPPRESSOR_EVIDENCE': 11,
        'ONCOGENE': 12,
        'ONCOGENE_EVIDENCE': 13,
        'MIM_PHENOTYPE_ID': 14,
        'OPENTARGETS_DISEASE_ASSOCS': 15,
        'OPENTARGETS_TRACTABILITY_COMPOUND': 16,
        'OPENTARGETS_TRACTABILITY_ANTIBODY': 17,
        'PROB_HAPLOINSUFFICIENCY': 18,
        'PROB_EXAC_LOF_INTOLERANT': 19,
        'PROB_EXAC_LOF_INTOLERANT_HOM': 20,
        'PROB_EXAC_LOF_TOLERANT_NULL': 21,
        'PROB_EXAC_NONTCGA_LOF_INTOLERANT': 22,
        'PROB_EXAC_NONTCGA_LOF_INTOLERANT_HOM': 23,
        'PROB_EXAC_NONTCGA_LOF_TOLERANT_NULL': 24,
        'PROB_GNOMAD_LOF_INTOLERANT': 25,
        'PROB_GNOMAD_LOF_INTOLERANT_HOM': 26,
        'PROB_GNOMAD_LOF_TOLERANT_NULL': 27,
        'ESSENTIAL_GENE_CRISPR': 28,
        'ESSENTIAL_GENE_CRISPR2': 29
    }

    ## declared type of every header element that has an ID, a HeaderType and a Type
    vcf_info_element_types = {}
    for e in vcf.header_iter():
        header_element = e.info()
        if 'ID' in header_element and 'HeaderType' in header_element and 'Type' in header_element:
            identifier = str(header_element['ID'])
            fieldtype = str(header_element['Type'])
            vcf_info_element_types[identifier] = fieldtype

    for rec in vcf:
        ## per-chromosome bookkeeping: report and reset whenever the chromosome changes
        if current_chrom is None:
            current_chrom = str(rec.CHROM)
            num_chromosome_records_processed = 0
        elif str(rec.CHROM) != current_chrom:
            logger.info(
                'Completed summary of functional annotations for ' +
                str(num_chromosome_records_processed) +
                ' variants on chromosome ' + str(current_chrom))
            current_chrom = str(rec.CHROM)
            num_chromosome_records_processed = 0

        if rec.INFO.get('CSQ') is None:
            alt_allele = ','.join(rec.ALT)
            pos = rec.start + 1
            variant_id = 'g.' + str(rec.CHROM) + ':' + str(pos) + str(
                rec.REF) + '>' + alt_allele
            logger.warning(
                'Variant record ' + str(variant_id) +
                ' does not have CSQ tag from Variant Effect Predictor (vep_skip_intergenic in config set to true?)  - variant will be skipped'
            )
            continue

        num_chromosome_records_processed += 1
        gvanno_xref = annoutils.make_transcript_xref_map(
            rec, gvanno_xref_map, xref_tag="GVANNO_XREF")

        csq_record_results = annoutils.parse_vep_csq(rec,
                                                     gvanno_xref,
                                                     vep_csq_fields_map,
                                                     logger,
                                                     pick_only=True,
                                                     csq_identifier='CSQ')
        if 'vep_all_csq' in csq_record_results:
            rec.INFO['VEP_ALL_CSQ'] = ','.join(
                csq_record_results['vep_all_csq'])
        if 'vep_block' in csq_record_results:
            ## only the first consequence block is copied into INFO
            record = csq_record_results['vep_block'][0]
            for k in record:
                if k not in vcf_info_element_types:
                    continue
                if vcf_info_element_types[k] == "Flag" and record[k] == "1":
                    rec.INFO[k] = True
                elif record[k] is not None:
                    rec.INFO[k] = record[k]

        if rec.INFO.get('DBNSFP') is not None:
            annoutils.map_variant_effect_predictors(
                rec, dbnsfp_prediction_algorithms)

        w.write_record(rec)
    w.close()
    ## nothing to summarise when the input held no records
    if current_chrom is not None:
        logger.info('Completed summary of functional annotations for ' +
                    str(num_chromosome_records_processed) +
                    ' variants on chromosome ' + str(current_chrom))
    vcf.close()

    if os.path.exists(out_vcf) and os.path.getsize(out_vcf) > 0:
        ## argument lists avoid shell interpretation of the path, and a
        ## failing tool now stops processing instead of being ignored
        subprocess.check_call(['bgzip', '-f', str(out_vcf)])
        subprocess.check_call(['tabix', '-f', '-p', 'vcf', str(out_vcf) + '.gz'])
        annotated_vcf = out_vcf + '.gz'
        annoutils.write_pass_vcf(annotated_vcf, logger)
    else:
        annoutils.error_message(
            'No remaining PASS variants found in query VCF - exiting and skipping STEP 4 (gvanno-writer)',
            logger)
Example #26
0
def printFields(vcf, fields, printall):
    """Print the requested fields of every VCF record as tab-separated text.

    A field is resolved through ``gets_standard_field`` when it is one of
    the fixed VCF columns or an INFO ID that is not also an ANN sub-field;
    otherwise it is looked up by position inside the record's ANN value,
    using the sub-field order parsed from the ANN header description.

    :param vcf: VCF file to read
    :param fields: names of the fields to print, in output order
    :param printall: when true, print one line per ANN block of a record
        instead of only the first block
    """
    import sys

    standard_fields = [
        "CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "FORMAT"
    ]
    existing_info = []
    indexes = defaultdict(list)
    vcf_data = VCF(vcf, gts012=True)
    for field in vcf_data.header_iter():
        d = field.info()
        if d['HeaderType'] == "INFO":
            existing_info.append(d['ID'])
    print("#List of available INFO fields extracted from the header:\n#{}".
          format(existing_info))

    # Default when the header has no ANN line: every INFO ID is read directly.
    # This is set once, before the loop, so that header lines following the
    # ANN line cannot undo the subtraction of the ANN sub-fields below.
    final_existing_info = existing_info
    for field in vcf_data.header_iter():
        if not (field["HeaderType"] == "INFO" and field["ID"] == "ANN"):
            continue
        tools = field["Description"].split(
            "Format:")[1][:-1].strip().split("|")
        print("#List of available fields within ANN field:\n#{}".format(
            tools))
        final_existing_info = list(set(existing_info) - set(tools))
        for f in fields:
            if f in standard_fields or f in final_existing_info:
                continue
            try:
                indexes[f] = tools.index(f)
            except ValueError:
                print("{} field not recognized".format(f))
                sys.exit(1)

    print("#{}".format("\t".join(fields)))
    for record in vcf_data:
        d = defaultdict(list)  # ANN block index -> output values (printall)
        outline = []
        try:
            info_obj = record.INFO
        except AttributeError:
            # No INFO available: only the fixed columns can be printed.
            standard_where_no_info = [
                gets_standard_field(record, f) for f in fields
                if f in standard_fields
            ]
            if len(standard_where_no_info) > 0:
                print('\t'.join(standard_where_no_info))
            continue

        try:
            if printall:
                info_vep = info_obj.get("ANN").split(",")
            else:
                info_vep = info_obj.get("ANN").split(",")[0].split("|")
        except AttributeError:
            # INFO.get returned None: the record carries no ANN value.
            info_vep = []

        for f in fields:
            direct = f in standard_fields or f in final_existing_info
            if printall and not info_vep:
                print(
                    "No ANN field found in VCF, therefore --printAll argument is useless.\n"
                )
                sys.exit(1)
            elif printall:
                for i, block in enumerate(info_vep):
                    # Touch the row first so a block keeps an (empty) row
                    # even when the lookup below fails.
                    row = d[i]
                    try:
                        if direct:
                            row.append(gets_standard_field(record, f))
                        else:
                            row.append(block.split("|")[indexes[f]])
                    except IndexError:
                        continue
            else:
                if direct:
                    outline.append(gets_standard_field(record, f))
                elif info_vep:
                    value = info_vep[indexes[f]]
                    outline.append(value if value else 'None')
                else:
                    print("{} field not in VCF".format(f))
                    sys.exit(1)
        if len(d) > 0:
            for row in d.values():
                print('\t'.join(row))
            print('\n')
        else:
            print('\t'.join(outline))