コード例 #1
0
ファイル: cpsr_validate_input.py プロジェクト: flywind2/pcgr
def is_valid_vcf(input_vcf, output_dir, logger):
    """
    Run EBIvariation/vcf-validator on a VCF file and report the outcome.

    The validator's standard output is redirected to a temporary report file
    inside ``output_dir``. The report is parsed line by line and removed
    afterwards.

    :param input_vcf: path to the query VCF (``.vcf`` or bgzipped ``.vcf.gz``)
    :param output_dir: directory in which the temporary report is written
    :param logger: logger used for progress and status messages
    :return: 0 when the validator reports a valid file; otherwise whatever
        ``annoutils.error_message`` returns (defined outside this block)
    """
    # Local import so the block does not depend on the file's import header.
    import shlex

    logger.info('Validating VCF file with EBIvariation/vcf-validator')
    report_name, n_replaced = re.subn(r'(\.vcf$|\.vcf\.gz$)',
                                      '.vcf_validator_output',
                                      os.path.basename(input_vcf))
    if n_replaced == 0:
        # Without a recognised suffix the report name would equal the input
        # name; if the input lives in output_dir it would be overwritten by
        # the shell redirect and deleted during cleanup.
        report_name += '.vcf_validator_output'
    vcf_validation_output_file = os.path.join(output_dir, report_name)

    # Quote both paths: the command goes through a shell, so unquoted
    # whitespace or metacharacters in a path would break (or alter) it.
    quoted_vcf = shlex.quote(str(input_vcf))
    quoted_report = shlex.quote(str(vcf_validation_output_file))
    if input_vcf.endswith('.gz'):
        command_v42 = ('bgzip -dc ' + quoted_vcf + ' | vcf_validator  > ' +
                       quoted_report)
    else:
        command_v42 = ('vcf_validator --input ' + quoted_vcf + ' > ' +
                       quoted_report)
    os.system(command_v42)

    if not os.path.exists(vcf_validation_output_file):
        err_msg = str(vcf_validation_output_file) + ' does not exist'
        return annoutils.error_message(err_msg, logger)

    validation_results = {'validation_status': 0, 'error_messages': []}
    with open(vcf_validation_output_file, 'r') as report:
        for raw_line in report:
            line = raw_line.rstrip()
            ## ignore warnings and the validator's "Reading from" banner
            if re.search(r' \(warning\)$|^Reading from ', line):
                continue
            if line.startswith('Line '):
                validation_results['error_messages'].append('ERROR: ' + line)
            if 'the input file is valid' in line:  ## valid VCF
                validation_results['validation_status'] = 1
            if 'the input file is not valid' in line:  ## non-valid VCF
                validation_results['validation_status'] = 0

    # Best-effort cleanup, mirroring the tolerance of the former `rm -f`.
    try:
        os.remove(vcf_validation_output_file)
    except OSError:
        pass

    if validation_results['validation_status'] == 0:
        error_string_42 = '\n'.join(validation_results['error_messages'])
        validation_status = 'According to the VCF specification, the VCF file (' + str(
            input_vcf) + ') is NOT valid'
        err_msg = validation_status + ':\n' + str(error_string_42)
        return annoutils.error_message(err_msg, logger)

    validation_status = 'According to the VCF specification, the VCF file ' + str(
        input_vcf) + ' is valid'
    logger.info(validation_status)
    return 0
コード例 #2
0
ファイル: cpsr_validate_input.py プロジェクト: flywind2/pcgr
def is_valid_custom_bed(bed_file, logger):
    """
    Check that a custom panel BED file adheres to the expected format.

    Three checks are performed, in this order:
      1. every parsed row has four tab-separated columns,
      2. the 'Start' and 'End' columns are of integer type,
      3. for each segment 'End' >= 'Start', 'Start' >= 0 and 'End' >= 1.

    :param bed_file: path to a tab-separated BED file without a header line
        (columns: Chromosome, Start, End, Symbol)
    :param logger: logger used for status and error messages
    :return: 0 when all checks pass; otherwise whatever
        ``annoutils.error_message`` returns (defined outside this block)
    """
    # read_csv is a pandas function; import it explicitly here instead of
    # relying on whichever module the file's `np` alias happens to point to.
    import pandas as pd

    # NOTE: DictReader without fieldnames consumes the first line as header,
    # so the column count is evaluated for the remaining lines.
    with open(bed_file, 'r') as bed_handle:
        for row in csv.DictReader(bed_handle, delimiter='\t'):
            if len(row) != 4:
                # str() is required: concatenating the int directly raised a
                # TypeError instead of reporting the malformed BED file.
                err_msg = 'BED file with custom screening regions must contain four columns: \'Chromosome\', \'Start\',\'End\',\'GeneSymbol\' - found entry containing ' + str(
                    len(row)) + ' columns'
                return annoutils.error_message(err_msg, logger)

    bed_dataframe = pd.read_csv(bed_file,
                                usecols=[0, 1, 2, 3],
                                sep="\t",
                                names=["Chromosome", "Start", "End", "Symbol"])
    ## check that 'Start' is of type integer
    if bed_dataframe['Start'].dtype.kind not in 'i':
        err_msg = '\'Start\' column of BED file (custom panel) contains non-integer values'
        return annoutils.error_message(err_msg, logger)
    ## check that 'End' is of type integer
    if bed_dataframe['End'].dtype.kind not in 'i':
        err_msg = '\'End\' column of BED file (custom panel) contains non-integer values'
        return annoutils.error_message(err_msg, logger)

    with open(bed_file, 'r') as bed_handle:
        bed_reader = csv.DictReader(
            bed_handle,
            delimiter='\t',
            fieldnames=['Chromosome', 'Start', 'End', 'Symbol'])
        for rec in bed_reader:
            start = int(rec['Start'])
            end = int(rec['End'])
            segment = str(rec['Chromosome']) + ':' + str(
                rec['Start']) + '-' + str(rec['End'])
            ## check that 'End' is never smaller than 'Start'
            if end < start:
                err_msg = 'Detected wrongly formatted BED segment - \'Start\' is greater than \'End\' (' + segment + ')'
                return annoutils.error_message(err_msg, logger)
            ## check that 'Start' is non-negative and 'End' is positive
            if end < 1 or start < 0:
                err_msg = 'Detected wrongly formatted BED segment - \'Start\' or \'End\' is less than or equal to zero (' + segment + ')'
                return annoutils.error_message(err_msg, logger)

    logger.info('Custom panel BED file (' + str(bed_file) +
                ') adheres to the correct format (gene symbols not checked)')

    return 0
コード例 #3
0
ファイル: cpsr_validate_input.py プロジェクト: flywind2/pcgr
def check_existing_vcf_info_tags(input_vcf, pcgr_directory, genome_assembly,
                                 logger):
    """
    Compare the INFO tags declared in the query VCF header with the INFO
    tags listed in ``cpsr_infotags.tsv``.

    :param input_vcf: path to the query VCF
    :param pcgr_directory: base directory; the tag file is read from
        ``<pcgr_directory>/data/<genome_assembly>/cpsr_infotags.tsv``
    :param genome_assembly: assembly name used as sub-directory
    :param logger: logger used for status and error messages
    :return: 1 when no tag coincides; otherwise whatever
        ``annoutils.error_message`` returns for the first coinciding tag
    """
    infotag_fname = os.path.join(pcgr_directory, 'data', genome_assembly,
                                 'cpsr_infotags.tsv')
    pcgr_infotags_desc = annoutils.read_infotag_file(infotag_fname)

    vcf = VCF(input_vcf)
    logger.info(
        'Checking if existing INFO tags of query VCF file coincide with CPSR INFO tags'
    )
    for header_line in vcf.header_iter():
        header_element = header_line.info()
        # Only header lines that carry both an ID and a HeaderType matter.
        if 'ID' not in header_element.keys():
            continue
        if 'HeaderType' not in header_element.keys():
            continue
        if header_element['HeaderType'] != 'INFO':
            continue
        if header_element['ID'] not in pcgr_infotags_desc.keys():
            continue
        err_msg = 'INFO tag ' + str(
            header_element['ID']
        ) + ' in the query VCF coincides with a VCF annotation tag produced by CPSR - please remove or rename this tag in your query VCF'
        return annoutils.error_message(err_msg, logger)

    logger.info('No query VCF INFO tags coincide with CPSR INFO tags')
    return 1
コード例 #4
0
def validate_cpsr_input(pcgr_directory, input_vcf, custom_list_fname,
                        configuration_file, vcf_validation, genome_assembly,
                        sample_id, virtual_panel_id, diagnostic_grade_only,
                        output_dir, debug=False):
    """
    Validate the input files to CPSR and prepare the query VCF.

    Steps visible in this function:
    1. If a custom gene list is given, derive a custom-list BED file name in
       ``output_dir`` and hand it to ``get_valid_custom_genelist``
    2. Optionally validate the VCF via ``annoutils.is_valid_vcf``
       (when ``vcf_validation == 1``)
    3. Check that no INFO tags in the query VCF coincide with CPSR tags
    4. Check that the VCF contains at most a single sample column
    5. Hand the VCF to ``simplify_vcf`` (presumably decomposition, sorting
       and indexing - that function is not visible here)

    The string 'None' (not the ``None`` object) marks an absent
    ``input_vcf`` / ``custom_list_fname``.

    :param configuration_file: accepted for interface compatibility; not
        used in this function
    :param debug: forwarded to ``annoutils.is_valid_vcf``. It was previously
        referenced without being defined in this scope; the default keeps
        existing positional callers working.
    :return: 0 on success, -1 when a validation step reports -1, or the
        result of ``annoutils.error_message`` for multi-sample VCFs
    """
    logger = annoutils.getlogger('cpsr-validate-input')

    custom_list_bed_fname = 'None'
    if custom_list_fname != 'None':
        custom_list_bed_fname = os.path.join(
            output_dir,
            sample_id + '.cpsr.' + genome_assembly + '.custom_list.bed')
        get_valid_custom_genelist(custom_list_fname, custom_list_bed_fname,
                                  sample_id, pcgr_directory, output_dir,
                                  genome_assembly, logger)

    if input_vcf == 'None':
        return 0

    if vcf_validation == 1:
        valid_vcf = annoutils.is_valid_vcf(input_vcf, output_dir, logger,
                                           debug)
        if valid_vcf == -1:
            return -1
    else:
        logger.info(
            'Skipping validation of VCF file - as provided by option --no_vcf_validate'
        )

    tag_check = check_existing_vcf_info_tags(input_vcf, pcgr_directory,
                                             genome_assembly, logger)
    if tag_check == -1:
        return -1

    vcf = VCF(input_vcf)
    samples = vcf.samples
    if len(samples) > 1:
        err_msg = "Query VCF contains more than one sample column (" + ', '.join(
            samples
        ) + ") - CPSR expects a germline VCF with a single sample column - exiting"
        return annoutils.error_message(err_msg, logger)
    simplify_vcf(input_vcf, vcf, custom_list_bed_fname, pcgr_directory,
                 genome_assembly, virtual_panel_id, diagnostic_grade_only,
                 output_dir, logger)

    return 0
コード例 #5
0
def extend_vcf_annotations(query_vcf, pcgr_db_directory, logger, pon_annotation, cpsr):
    """
    Read a VEP/vcfanno-annotated VCF and extend its INFO column with tags from
    1. CSQ elements within the primary transcript consequence picked by VEP
    2. Gene cross-reference annotations carried in the PCGR_ONCO_XREF tag
    3. Variant effect predictions carried in the DBNSFP tag

    The INFO tags to declare are read from 'pcgr_infotags.tsv' (or
    'cpsr_infotags.tsv' when ``cpsr`` is True) in ``pcgr_db_directory``.
    Tags starting with 'PANEL_OF_NORMALS' are only declared when
    ``pon_annotation`` is non-zero. The result is written next to the input
    as '<name>.annotated.vcf', then bgzipped and tabix-indexed.
    """

    ## read VEP and PCGR tags to be appended to VCF file
    vcf_infotags_meta = annoutils.read_infotag_file(
        os.path.join(pcgr_db_directory, 'pcgr_infotags.tsv'))
    if cpsr is True:
        vcf_infotags_meta = annoutils.read_infotag_file(
            os.path.join(pcgr_db_directory, 'cpsr_infotags.tsv'))

    out_vcf = re.sub(r'\.vcf(\.gz){0,}$', '.annotated.vcf', query_vcf)

    meta_vep_dbnsfp_info = annoutils.vep_dbnsfp_meta_vcf(query_vcf, vcf_infotags_meta)
    dbnsfp_prediction_algorithms = meta_vep_dbnsfp_info['dbnsfp_prediction_algorithms']
    vep_csq_fields_map = meta_vep_dbnsfp_info['vep_csq_fieldmap']

    vcf = VCF(query_vcf)
    for tag in sorted(vcf_infotags_meta):
        # Panel-of-normals tags are left out when that annotation is off.
        if pon_annotation == 0 and tag.startswith('PANEL_OF_NORMALS'):
            continue
        tag_meta = vcf_infotags_meta[tag]
        vcf.add_info_to_header({
            'ID': tag,
            'Description': str(tag_meta['description']),
            'Type': str(tag_meta['type']),
            'Number': str(tag_meta['number']),
        })

    w = Writer(out_vcf, vcf)

    # Field name -> position within one '|'-separated PCGR_ONCO_XREF entry;
    # the position is simply the index in this ordered listing (0..46).
    pcgr_onco_xref_fields = (
        'ENSEMBL_TRANSCRIPT_ID', 'ENSEMBL_GENE_ID', 'ENSEMBL_PROTEIN_ID',
        'SYMBOL', 'SYMBOL_ENTREZ', 'ENTREZ_ID', 'UNIPROT_ID', 'APPRIS',
        'UNIPROT_ACC', 'REFSEQ_MRNA', 'CORUM_ID', 'TUMOR_SUPPRESSOR',
        'TUMOR_SUPPRESSOR_EVIDENCE', 'ONCOGENE', 'ONCOGENE_EVIDENCE',
        'NETWORK_CG', 'DISGENET_CUI', 'CHEMBL_COMPOUND_ID',
        'CHEMBL_COMPOUND_ID_EARLY_PHASE', 'INTOGEN_DRIVER', 'TCGA_DRIVER',
        'ONCOSCORE', 'MIM_PHENOTYPE_ID', 'CANCER_PREDISPOSITION_SOURCE',
        'CANCER_SUSCEPTIBILITY_CUI', 'CANCER_SYNDROME_CUI',
        'CANCER_PREDISPOSITION_MOI', 'CANCER_PREDISPOSITION_MOD',
        'SIGNALING_PATHWAY', 'OPENTARGETS_DISEASE_ASSOCS',
        'OPENTARGETS_TRACTABILITY_COMPOUND',
        'OPENTARGETS_TRACTABILITY_ANTIBODY', 'GE_PANEL_ID',
        'ACTIONABLE_TARGET', 'GENCODE_GENE_STATUS',
        'PROB_HAPLOINSUFFICIENCY', 'PROB_EXAC_LOF_INTOLERANT',
        'PROB_EXAC_LOF_INTOLERANT_HOM', 'PROB_EXAC_LOF_TOLERANT_NULL',
        'PROB_EXAC_NONTCGA_LOF_INTOLERANT',
        'PROB_EXAC_NONTCGA_LOF_INTOLERANT_HOM',
        'PROB_EXAC_NONTCGA_LOF_TOLERANT_NULL', 'PROB_GNOMAD_LOF_INTOLERANT',
        'PROB_GNOMAD_LOF_INTOLERANT_HOM', 'PROB_GNOMAD_LOF_TOLERANT_NULL',
        'ESSENTIAL_GENE_CRISPR', 'ESSENTIAL_GENE_CRISPR2',
    )
    pcgr_onco_xref_map = {
        name: position for position, name in enumerate(pcgr_onco_xref_fields)
    }

    # Declared type ('Flag', 'String', ...) of every typed header element.
    vcf_info_element_types = {}
    for header_line in vcf.header_iter():
        header_element = header_line.info()
        if 'ID' in header_element and 'HeaderType' in header_element and 'Type' in header_element:
            vcf_info_element_types[str(header_element['ID'])] = str(header_element['Type'])

    current_chrom = None
    num_chromosome_records_processed = 0
    for rec in vcf:
        rec_chrom = str(rec.CHROM)
        if current_chrom is None:
            current_chrom = rec_chrom
            num_chromosome_records_processed = 0
        elif rec_chrom != current_chrom:
            # Chromosome changed: report the finished one, reset the counter.
            logger.info('Completed summary of functional annotations for ' + str(num_chromosome_records_processed) + ' variants on chromosome ' + str(current_chrom))
            current_chrom = rec_chrom
            num_chromosome_records_processed = 0

        if rec.INFO.get('CSQ') is None:
            alt_allele = ','.join(rec.ALT)
            pos = rec.start + 1
            variant_id = 'g.' + str(rec.CHROM) + ':' + str(pos) + str(rec.REF) + '>' + alt_allele
            logger.warning('Variant record ' + str(variant_id) + ' does not have CSQ tag from Variant Effect Predictor (vep_skip_intergenic in config set to true?)  - variant will be skipped')
            continue

        num_chromosome_records_processed += 1
        pcgr_onco_xref = annoutils.make_transcript_xref_map(rec, pcgr_onco_xref_map, xref_tag="PCGR_ONCO_XREF")
        csq_record_results = annoutils.parse_vep_csq(rec, pcgr_onco_xref, vep_csq_fields_map, logger, pick_only=True, csq_identifier='CSQ')

        if 'vep_all_csq' in csq_record_results:
            rec.INFO['VEP_ALL_CSQ'] = ','.join(csq_record_results['vep_all_csq'])
        if 'vep_block' in csq_record_results:
            vep_csq_records = csq_record_results['vep_block']
            # CPSR selects the consequence block through a helper; otherwise
            # the first block is used.
            if cpsr is True:
                block_idx = annoutils.get_correct_cpg_transcript(vep_csq_records)
            else:
                block_idx = 0
            record = vep_csq_records[block_idx]
            for key in record:
                if key not in vcf_info_element_types:
                    continue
                if vcf_info_element_types[key] == "Flag" and record[key] == "1":
                    rec.INFO[key] = True
                elif record[key] is not None:
                    rec.INFO[key] = record[key]

        if rec.INFO.get('DBNSFP') is not None:
            annoutils.map_variant_effect_predictors(rec, dbnsfp_prediction_algorithms)

        w.write_record(rec)

    w.close()
    if current_chrom is not None:
        logger.info('Completed summary of functional annotations for ' + str(num_chromosome_records_processed) + ' variants on chromosome ' + str(current_chrom))
    vcf.close()

    if os.path.exists(out_vcf) and os.path.getsize(out_vcf) > 0:
        check_subprocess(logger, 'bgzip -f ' + str(out_vcf))
        check_subprocess(logger, 'tabix -f -p vcf ' + str(out_vcf) + '.gz')
        annotated_vcf = out_vcf + '.gz'
        annoutils.write_pass_vcf(annotated_vcf, logger)
    else:
        annoutils.error_message('No remaining PASS variants found in query VCF - exiting and skipping STEP 4 (pcgr-writer)', logger)
コード例 #6
0
def extend_vcf_annotations(query_vcf, gvanno_db_directory, lof_prediction=0):
    """
    Read a VEP/vcfanno-annotated VCF and extend its INFO column with tags from
    1. CSQ elements within the primary transcript consequence picked by VEP
    2. Gene cross-reference annotations carried in the GVANNO_XREF tag
    3. Variant effect predictions carried in the DBNSFP tag

    The INFO tags to declare are read from 'gvanno_infotags.tsv' in
    ``gvanno_db_directory``. Tags starting with 'LoF' are only declared when
    ``lof_prediction`` is non-zero. The result is written next to the input
    as '<name>.annotated.vcf', then bgzipped and tabix-indexed.

    NOTE(review): ``logger`` is not a parameter of this function; it is
    expected to exist as a module-level name - verify against the file.

    :param query_vcf: path to the annotated query VCF (.vcf or .vcf.gz)
    :param gvanno_db_directory: directory holding 'gvanno_infotags.tsv'
    :param lof_prediction: 0 to leave out the 'LoF*' INFO tags
    :return: None
    """

    ## read VEP and PCGR tags to be appended to VCF file
    vcf_infotags_meta = annoutils.read_infotag_file(
        os.path.join(gvanno_db_directory, 'gvanno_infotags.tsv'))
    out_vcf = re.sub(r'\.vcf(\.gz){0,}$', '.annotated.vcf', query_vcf)

    meta_vep_dbnsfp_info = annoutils.vep_dbnsfp_meta_vcf(
        query_vcf, vcf_infotags_meta)
    dbnsfp_prediction_algorithms = meta_vep_dbnsfp_info[
        'dbnsfp_prediction_algorithms']
    vep_csq_fields_map = meta_vep_dbnsfp_info['vep_csq_fieldmap']

    vcf = VCF(query_vcf)
    for tag in vcf_infotags_meta:
        # LoF tags are left out when loss-of-function prediction is off.
        if lof_prediction == 0 and tag.startswith('LoF'):
            continue
        tag_meta = vcf_infotags_meta[tag]
        vcf.add_info_to_header({
            'ID': tag,
            'Description': str(tag_meta['description']),
            'Type': str(tag_meta['type']),
            'Number': str(tag_meta['number']),
        })

    w = Writer(out_vcf, vcf)

    # Field name -> position within one '|'-separated GVANNO_XREF entry;
    # the position is simply the index in this ordered listing (0..29).
    gvanno_xref_fields = (
        'ENSEMBL_TRANSCRIPT_ID',
        'ENSEMBL_GENE_ID',
        'ENSEMBL_PROTEIN_ID',
        'SYMBOL',
        'SYMBOL_ENTREZ',
        'ENTREZ_ID',
        'UNIPROT_ID',
        'UNIPROT_ACC',
        'REFSEQ_MRNA',
        'CORUM_ID',
        'TUMOR_SUPPRESSOR',
        'TUMOR_SUPPRESSOR_EVIDENCE',
        'ONCOGENE',
        'ONCOGENE_EVIDENCE',
        'MIM_PHENOTYPE_ID',
        'OPENTARGETS_DISEASE_ASSOCS',
        'OPENTARGETS_TRACTABILITY_COMPOUND',
        'OPENTARGETS_TRACTABILITY_ANTIBODY',
        'PROB_HAPLOINSUFFICIENCY',
        'PROB_EXAC_LOF_INTOLERANT',
        'PROB_EXAC_LOF_INTOLERANT_HOM',
        'PROB_EXAC_LOF_TOLERANT_NULL',
        'PROB_EXAC_NONTCGA_LOF_INTOLERANT',
        'PROB_EXAC_NONTCGA_LOF_INTOLERANT_HOM',
        'PROB_EXAC_NONTCGA_LOF_TOLERANT_NULL',
        'PROB_GNOMAD_LOF_INTOLERANT',
        'PROB_GNOMAD_LOF_INTOLERANT_HOM',
        'PROB_GNOMAD_LOF_TOLERANT_NULL',
        'ESSENTIAL_GENE_CRISPR',
        'ESSENTIAL_GENE_CRISPR2',
    )
    gvanno_xref_map = {
        name: position for position, name in enumerate(gvanno_xref_fields)
    }

    # Declared type ('Flag', 'String', ...) of every typed header element.
    vcf_info_element_types = {}
    for header_line in vcf.header_iter():
        header_element = header_line.info()
        if 'ID' in header_element and 'HeaderType' in header_element and 'Type' in header_element:
            vcf_info_element_types[str(header_element['ID'])] = str(
                header_element['Type'])

    current_chrom = None
    num_chromosome_records_processed = 0
    for rec in vcf:
        rec_chrom = str(rec.CHROM)
        if current_chrom is None:
            current_chrom = rec_chrom
            num_chromosome_records_processed = 0
        elif rec_chrom != current_chrom:
            # Chromosome changed: report the finished one, reset the counter.
            logger.info(
                'Completed summary of functional annotations for ' +
                str(num_chromosome_records_processed) +
                ' variants on chromosome ' + str(current_chrom))
            current_chrom = rec_chrom
            num_chromosome_records_processed = 0

        if rec.INFO.get('CSQ') is None:
            alt_allele = ','.join(rec.ALT)
            pos = rec.start + 1
            variant_id = 'g.' + str(rec.CHROM) + ':' + str(pos) + str(
                rec.REF) + '>' + alt_allele
            logger.warning(
                'Variant record ' + str(variant_id) +
                ' does not have CSQ tag from Variant Effect Predictor (vep_skip_intergenic in config set to true?)  - variant will be skipped'
            )
            continue

        num_chromosome_records_processed += 1
        gvanno_xref = annoutils.make_transcript_xref_map(
            rec, gvanno_xref_map, xref_tag="GVANNO_XREF")

        csq_record_results = annoutils.parse_vep_csq(rec,
                                                     gvanno_xref,
                                                     vep_csq_fields_map,
                                                     logger,
                                                     pick_only=True,
                                                     csq_identifier='CSQ')
        if 'vep_all_csq' in csq_record_results:
            rec.INFO['VEP_ALL_CSQ'] = ','.join(
                csq_record_results['vep_all_csq'])
        if 'vep_block' in csq_record_results:
            # Only the first consequence block is propagated to INFO.
            record = csq_record_results['vep_block'][0]
            for key in record:
                if key not in vcf_info_element_types:
                    continue
                if vcf_info_element_types[key] == "Flag" and record[key] == "1":
                    rec.INFO[key] = True
                elif record[key] is not None:
                    rec.INFO[key] = record[key]

        if rec.INFO.get('DBNSFP') is not None:
            annoutils.map_variant_effect_predictors(
                rec, dbnsfp_prediction_algorithms)

        w.write_record(rec)

    w.close()
    # Skip the summary when the VCF held no records at all; otherwise the
    # log would read "0 variants on chromosome None".
    if current_chrom is not None:
        logger.info('Completed summary of functional annotations for ' +
                    str(num_chromosome_records_processed) +
                    ' variants on chromosome ' + str(current_chrom))
    vcf.close()

    if os.path.exists(out_vcf) and os.path.getsize(out_vcf) > 0:
        os.system('bgzip -f ' + str(out_vcf))
        os.system('tabix -f -p vcf ' + str(out_vcf) + '.gz')
        annotated_vcf = out_vcf + '.gz'
        annoutils.write_pass_vcf(annotated_vcf, logger)
    else:
        annoutils.error_message(
            'No remaining PASS variants found in query VCF - exiting and skipping STEP 4 (gvanno-writer)',
            logger)
コード例 #7
0
def extend_vcf_annotations(query_vcf, gvanno_db_directory, lof_prediction=0):
    """
    Read a VEP/vcfanno-annotated VCF and extend its INFO column with tags from
    1. CSQ elements of the consequence block(s) flagged PICK == "1" by VEP
    2. Gene cross-reference annotations carried in the GVANNO_XREF tag,
       looked up by the picked block's 'Feature' (transcript identifier)
    3. PFAM_DOMAIN, COSMIC_MUTATION_ID and DBSNPRSID derived from the
       'DOMAINS' and 'Existing_variation' CSQ fields
    4. Variant effect predictions carried in the DBNSFP tag
    5. VEP_ALL_CONSEQUENCE: one 'Consequence:SYMBOL:Feature_type:Feature:BIOTYPE'
       entry for every CSQ block (picked or not)

    The INFO tags to declare are read from 'gvanno_infotags.tsv' in
    ``gvanno_db_directory``. Tags starting with 'LoF' are only declared when
    ``lof_prediction`` is non-zero. The result is written next to the input
    as '<name>.annotated.vcf', then bgzipped and tabix-indexed.

    NOTE(review): ``logger`` is not a parameter of this function; it is
    expected to exist as a module-level name - verify against the file.

    :param query_vcf: path to the annotated query VCF (.vcf or .vcf.gz)
    :param gvanno_db_directory: directory holding 'gvanno_infotags.tsv'
    :param lof_prediction: 0 to leave out the 'LoF*' INFO tags
    :return: None
    """

    ## read VEP and PCGR tags to be appended to VCF file
    vcf_infotags_meta = annoutils.read_infotag_file(
        os.path.join(gvanno_db_directory, 'gvanno_infotags.tsv'))
    out_vcf = re.sub(r'\.vcf(\.gz){0,}$', '.annotated.vcf', query_vcf)

    meta_vep_dbnsfp_info = annoutils.vep_dbnsfp_meta_vcf(
        query_vcf, vcf_infotags_meta)
    vep_csq_index2fields = meta_vep_dbnsfp_info['vep_csq_index2fields']
    vep_csq_fields2index = meta_vep_dbnsfp_info['vep_csq_fields2index']
    dbnsfp_prediction_algorithms = meta_vep_dbnsfp_info[
        'dbnsfp_prediction_algorithms']

    vcf = VCF(query_vcf)
    for tag in vcf_infotags_meta:
        # LoF tags are left out when loss-of-function prediction is off.
        if lof_prediction == 0 and tag.startswith('LoF'):
            continue
        tag_meta = vcf_infotags_meta[tag]
        vcf.add_info_to_header({
            'ID': tag,
            'Description': str(tag_meta['description']),
            'Type': str(tag_meta['type']),
            'Number': str(tag_meta['number']),
        })

    w = Writer(out_vcf, vcf)

    # Field name -> position within one '|'-separated GVANNO_XREF entry;
    # the position is simply the index in this ordered listing (0..12).
    gvanno_xref_fields = (
        'ENSEMBL_TRANSCRIPT_ID',
        'ENSEMBL_GENE_ID',
        'SYMBOL',
        'ENTREZ_ID',
        'UNIPROT_ID',
        'APPRIS',
        'UNIPROT_ACC',
        'REFSEQ_MRNA',
        'CORUM_ID',
        'TUMOR_SUPPRESSOR',
        'ONCOGENE',
        'DISGENET_CUI',
        'MIM_PHENOTYPE_ID',
    )
    gvanno_xref_map = {
        name: position for position, name in enumerate(gvanno_xref_fields)
    }

    current_chrom = None
    num_chromosome_records_processed = 0
    for rec in vcf:
        all_transcript_consequences = []

        rec_chrom = str(rec.CHROM)
        if current_chrom is None:
            current_chrom = rec_chrom
            num_chromosome_records_processed = 0
        elif rec_chrom != current_chrom:
            # Chromosome changed: report the finished one, reset the counter.
            logger.info(
                'Completed summary of functional annotations for ' +
                str(num_chromosome_records_processed) +
                ' variants on chromosome ' + str(current_chrom))
            current_chrom = rec_chrom
            num_chromosome_records_processed = 0

        if rec.INFO.get('CSQ') is None:
            alt_allele = ','.join(rec.ALT)
            pos = rec.start + 1
            variant_id = 'g.' + str(rec.CHROM) + ':' + str(pos) + str(
                rec.REF) + '>' + alt_allele
            logger.warning(
                'Variant record ' + str(variant_id) +
                ' does not have CSQ tag from Variant Effect Predictor (vep_skip_intergenic in config set to true?)  - variant will be skipped'
            )
            continue

        num_chromosome_records_processed += 1

        # transcript id -> {annotation name: value} for the non-empty
        # GVANNO_XREF fields of this record
        gvanno_xref = {}
        xref_info = rec.INFO.get('GVANNO_XREF')
        if xref_info is not None:
            for transcript_xref in xref_info.split(','):
                xrefs = transcript_xref.split('|')
                transcript_annotations = {}
                gvanno_xref[str(xrefs[0])] = transcript_annotations
                for annotation, annotation_index in gvanno_xref_map.items():
                    # Entries may be shorter than the field listing.
                    if annotation_index >= len(xrefs):
                        continue
                    if xrefs[annotation_index] != '':
                        transcript_annotations[annotation] = xrefs[
                            annotation_index]

        for csq in rec.INFO.get('CSQ').split(','):
            csq_fields = csq.split('|')

            ## only consider the primary/picked consequence when expanding with annotation tags
            if csq_fields[vep_csq_fields2index['PICK']] == "1":
                ## copy every non-empty, known CSQ element into the INFO column
                for field_index, field_value in enumerate(csq_fields):
                    if field_index not in vep_csq_index2fields:
                        continue
                    if field_value == '':
                        continue
                    field_name = vep_csq_index2fields[field_index]
                    rec.INFO[field_name] = str(field_value)

                    if field_name == 'Feature':
                        # Attach gene-level cross references of the picked
                        # transcript; the two gene-role tags are set as
                        # boolean flags.
                        transcript_annotations = gvanno_xref.get(
                            str(field_value), {})
                        for annotation in transcript_annotations:
                            if annotation == 'TUMOR_SUPPRESSOR' or annotation == 'ONCOGENE':
                                rec.INFO[annotation] = True
                            else:
                                rec.INFO[annotation] = transcript_annotations[
                                    annotation]
                    elif field_name == 'DOMAINS':
                        for domain in str(field_value).split('&'):
                            if domain.startswith('Pfam_domain'):
                                # strip the prefix and a trailing '.<digits>'
                                rec.INFO['PFAM_DOMAIN'] = str(
                                    re.sub(
                                        r'\.[0-9]{1,}$', '',
                                        re.sub(r'Pfam_domain:', '', domain)))
                    elif field_name == 'Existing_variation':
                        var_identifiers = str(field_value).split('&')
                        cosmic_identifiers = [
                            v for v in var_identifiers if v.startswith('COSM')
                        ]
                        dbsnp_identifiers = [
                            v for v in var_identifiers if v.startswith('rs')
                        ]
                        if cosmic_identifiers:
                            rec.INFO['COSMIC_MUTATION_ID'] = '&'.join(
                                cosmic_identifiers)
                        if dbsnp_identifiers:
                            rec.INFO['DBSNPRSID'] = '&'.join(
                                dbsnp_identifiers)
                annoutils.set_coding_change(rec)

            # Every block, picked or not, contributes one summary entry.
            symbol = '.'
            if csq_fields[vep_csq_fields2index['SYMBOL']] != "":
                symbol = str(csq_fields[vep_csq_fields2index['SYMBOL']])
            consequence_entry = ':'.join([
                str(csq_fields[vep_csq_fields2index['Consequence']]),
                str(symbol),
                str(csq_fields[vep_csq_fields2index['Feature_type']]),
                str(csq_fields[vep_csq_fields2index['Feature']]),
                str(csq_fields[vep_csq_fields2index['BIOTYPE']]),
            ])
            all_transcript_consequences.append(consequence_entry)

        if rec.INFO.get('DBNSFP') is not None:
            annoutils.map_variant_effect_predictors(
                rec, dbnsfp_prediction_algorithms)

        rec.INFO['VEP_ALL_CONSEQUENCE'] = ','.join(all_transcript_consequences)
        w.write_record(rec)

    w.close()
    # Skip the summary when the VCF held no records at all; otherwise the
    # log would read "0 variants on chromosome None".
    if current_chrom is not None:
        logger.info('Completed summary of functional annotations for ' +
                    str(num_chromosome_records_processed) +
                    ' variants on chromosome ' + str(current_chrom))
    vcf.close()

    if os.path.exists(out_vcf) and os.path.getsize(out_vcf) > 0:
        os.system('bgzip -f ' + str(out_vcf))
        os.system('tabix -f -p vcf ' + str(out_vcf) + '.gz')
        annotated_vcf = out_vcf + '.gz'
        annoutils.write_pass_vcf(annotated_vcf, logger)
    else:
        annoutils.error_message(
            'No remaining PASS variants found in query VCF - exiting and skipping STEP 4 (gvanno-writer)',
            logger)
コード例 #8
0
ファイル: pcgr_summarise.py プロジェクト: Color4/pcgr
def _parse_csq_dbnsfp_header(vcf, vep_to_pcgr_af, infotags_meta):
    """
    Read the 'CSQ' and 'DBNSFP' header entries of an opened VCF.

    Returns a tuple (vep_csq_index2fields, vep_csq_fields2index,
    dbnsfp_prediction_algorithms):
    - the two dictionaries map the position of a CSQ sub-field (taken from
      the 'Format: ' part of the header description) to its tag name and
      back; only tags present in infotags_meta are kept, and names found in
      vep_to_pcgr_af are translated first
    - the list holds the DBNSFP sub-fields from position 7 onwards, with a
      trailing '_score' / '_pred' (and any trailing double quotes) removed
    """
    vep_csq_index2fields = {}
    vep_csq_fields2index = {}
    dbnsfp_prediction_algorithms = []
    for header_entry in vcf.header_iter():
        header_element = header_entry.info()
        if 'ID' not in header_element:
            continue
        identifier = str(header_element['ID'])
        if identifier not in ('CSQ', 'DBNSFP'):
            continue
        description = str(header_element['Description'])
        if 'Format: ' not in description:
            continue
        subtags = description.split('Format: ')[1].split('|')
        if identifier == 'CSQ':
            for index, subtag in enumerate(subtags):
                field = str(vep_to_pcgr_af.get(subtag, subtag))
                if field in infotags_meta:
                    vep_csq_index2fields[index] = field
                    vep_csq_fields2index[field] = index
        else:
            for subtag in subtags[7:]:
                dbnsfp_prediction_algorithms.append(
                    str(re.sub(r'((_score)|(_pred))"*$', '', subtag)))
    return (vep_csq_index2fields, vep_csq_fields2index,
            dbnsfp_prediction_algorithms)


def _parse_onco_xref(onco_xref_value, onco_xref_map):
    """
    Split a PCGR_ONCO_XREF INFO value (comma-separated entries, each a
    '|'-separated list whose first element is used as the transcript key)
    into a dictionary {transcript_id: {annotation: value}}.

    Annotations whose index lies beyond the end of an entry, or whose value
    is the empty string, are left out.
    """
    xref_by_transcript = {}
    for transcript_onco_xref in onco_xref_value.split(','):
        xrefs = transcript_onco_xref.split('|')
        annotations = {}
        for annotation in onco_xref_map:
            annotation_index = onco_xref_map[annotation]
            if annotation_index < len(xrefs) and xrefs[annotation_index] != '':
                annotations[annotation] = xrefs[annotation_index]
        xref_by_transcript[str(xrefs[0])] = annotations
    return xref_by_transcript


def _set_picked_consequence_tags(rec, csq_fields, vep_csq_index2fields,
                                 pcgr_onco_xref, onco_xref_map, cpsr):
    """
    Copy the non-empty elements of one CSQ entry into rec.INFO, using the
    tag names in vep_csq_index2fields, and derive the dependent tags:
    - 'Feature': gene cross-references of that transcript (pcgr_onco_xref)
    - 'DOMAINS': PFAM_DOMAIN
    - 'Existing_variation': COSMIC_MUTATION_ID and DBSNPRSID
    """
    flag_annotations = ('TUMOR_SUPPRESSOR', 'ONCOGENE', 'NETWORK_CG')
    skipped_annotations = ('CHORUM_ID', 'UNIPROT_ACC')
    for index, value in enumerate(csq_fields):
        if index not in vep_csq_index2fields or value == '':
            continue
        tag = vep_csq_index2fields[index]
        rec.INFO[tag] = str(value)

        if tag == 'Feature':
            transcript_xrefs = pcgr_onco_xref.get(str(value))
            if transcript_xrefs is None:
                continue
            ## cross-references are written after the CSQ value, so they take
            ## precedence over same-named tags set earlier in this loop
            for annotation in onco_xref_map:
                if annotation in skipped_annotations:
                    continue
                if annotation not in transcript_xrefs:
                    continue
                if annotation in flag_annotations:
                    rec.INFO[annotation] = True
                elif cpsr is True or not annotation.startswith('CANCER_'):
                    ## CANCER_* annotations are only emitted when cpsr is True
                    rec.INFO[annotation] = transcript_xrefs[annotation]

        elif tag == 'DOMAINS':
            for domain in str(value).split('&'):
                if domain.startswith('Pfam_domain'):
                    ## strip the 'Pfam_domain:' prefix and a trailing '.<digits>'
                    rec.INFO['PFAM_DOMAIN'] = str(
                        re.sub(r'\.[0-9]{1,}$', '',
                               re.sub(r'Pfam_domain:', '', domain)))

        elif tag == 'Existing_variation':
            var_identifiers = str(value).split('&')
            cosmic_identifiers = [
                v for v in var_identifiers if v.startswith('COSM')
            ]
            dbsnp_identifiers = [
                v for v in var_identifiers if v.startswith('rs')
            ]
            if cosmic_identifiers:
                rec.INFO['COSMIC_MUTATION_ID'] = '&'.join(cosmic_identifiers)
            if dbsnp_identifiers:
                rec.INFO['DBSNPRSID'] = '&'.join(dbsnp_identifiers)


def extend_vcf_annotations(query_vcf, pcgr_db_directory, logger, cpsr):
    """
    Function that reads VEP/vcfanno-annotated VCF and extends the VCF INFO column with tags from
    1. CSQ elements within the primary transcript consequence picked by VEP, e.g. SYMBOL, Feature, Gene, Consequence etc.
    2. Cancer-relevant gene annotations, e.g. known oncogenes/tumor suppressors, known antineoplastic drugs interacting with a given protein etc.
    3. Protein-relevant annotations, e.g. cancer hotspot mutations, functional protein features etc.
    4. Variant effect predictions

    Arguments:
    query_vcf         -- path of the input VCF ('.vcf' or '.vcf.gz'); the output is written
                         next to it with the suffix replaced by '.annotated.vcf'
    pcgr_db_directory -- directory holding 'pcgr_infotags.tsv' / 'cpsr_infotags.tsv'
    logger            -- logger used for progress, warning and error messages
    cpsr              -- when exactly True, the CPSR tag definitions are used and the
                         CANCER_* gene annotations are written

    Records without a CSQ tag are reported with a warning and are not written to the output.
    If the output file exists and is non-empty it is compressed with bgzip, indexed with tabix and
    passed to annoutils.write_pass_vcf; otherwise annoutils.error_message is called.
    Nothing is returned.
    """

    ## read VEP and PCGR tags to be appended to VCF file (a single definition file is needed)
    infotags_filename = 'pcgr_infotags.tsv'
    if cpsr is True:
        infotags_filename = 'cpsr_infotags.tsv'
    pcgr_vcf_infotags_meta = annoutils.read_infotag_file(
        os.path.join(pcgr_db_directory, infotags_filename))

    out_vcf = re.sub(r'\.vcf(\.gz){0,}$', '.annotated.vcf', query_vcf)

    ## VEP allele frequency sub-field -> INFO tag name used in the output
    vep_to_pcgr_af = {
        'gnomAD_AMR_AF': 'AMR_AF_GNOMAD',
        'gnomAD_AFR_AF': 'AFR_AF_GNOMAD',
        'gnomAD_EAS_AF': 'EAS_AF_GNOMAD',
        'gnomAD_NFE_AF': 'NFE_AF_GNOMAD',
        'gnomAD_AF': 'GLOBAL_AF_GNOMAD',
        'gnomAD_SAS_AF': 'SAS_AF_GNOMAD',
        'gnomAD_OTH_AF': 'OTH_AF_GNOMAD',
        'gnomAD_ASJ_AF': 'ASJ_AF_GNOMAD',
        'gnomAD_FIN_AF': 'FIN_AF_GNOMAD',
        'AFR_AF': 'AFR_AF_1KG',
        'AMR_AF': 'AMR_AF_1KG',
        'SAS_AF': 'SAS_AF_1KG',
        'EUR_AF': 'EUR_AF_1KG',
        'EAS_AF': 'EAS_AF_1KG',
        'AF': 'GLOBAL_AF_1KG'
    }

    ## annotation name -> position within a '|'-separated PCGR_ONCO_XREF entry
    pcgr_onco_xref_map = {
        'SYMBOL': 1,
        'ENTREZ_ID': 2,
        'UNIPROT_ID': 3,
        'APPRIS': 4,
        'UNIPROT_ACC': 5,
        'CHORUM_ID': 6,
        'TUMOR_SUPPRESSOR': 7,
        'ONCOGENE': 8,
        'NETWORK_CG': 9,
        'DISGENET_CUI': 10,
        'CHEMBL_COMPOUND_ID': 11,
        'INTOGEN_DRIVER': 12,
        'ONCOSCORE': 13,
        'CANCER_PREDISPOSITION_SOURCE': 15,
        'CANCER_SUSCEPTIBILITY_CUI': 16,
        'CANCER_SYNDROME_CUI': 17,
        'CANCER_PREDISPOSITION_MOI': 18
    }

    current_chrom = None
    num_chromosome_records_processed = 0
    vcf = VCF(query_vcf)
    w = None
    ## make sure reader and writer are closed even if a record fails half-way
    try:
        (vep_csq_index2fields, vep_csq_fields2index,
         dbnsfp_prediction_algorithms) = _parse_csq_dbnsfp_header(
             vcf, vep_to_pcgr_af, pcgr_vcf_infotags_meta)

        for tag in pcgr_vcf_infotags_meta:
            vcf.add_info_to_header({
                'ID': tag,
                'Description': str(pcgr_vcf_infotags_meta[tag]['description']),
                'Type': str(pcgr_vcf_infotags_meta[tag]['type']),
                'Number': str(pcgr_vcf_infotags_meta[tag]['number'])
            })

        w = Writer(out_vcf, vcf)
        for rec in vcf:
            if current_chrom is None:
                current_chrom = str(rec.CHROM)
                num_chromosome_records_processed = 0
            elif str(rec.CHROM) != current_chrom:
                logger.info(
                    'Completed summary of functional annotations for ' +
                    str(num_chromosome_records_processed) +
                    ' variants on chromosome ' + str(current_chrom))
                current_chrom = str(rec.CHROM)
                num_chromosome_records_processed = 0

            csq_value = rec.INFO.get('CSQ')
            if csq_value is None:
                alt_allele = ','.join(rec.ALT)
                pos = rec.start + 1
                variant_id = 'g.' + str(rec.CHROM) + ':' + str(pos) + str(
                    rec.REF) + '>' + alt_allele
                logger.warning(
                    'Variant record ' + str(variant_id) +
                    ' does not have CSQ tag from Variant Effect Predictor (vep_skip_intergenic in config set to true?)  - variant will be skipped'
                )
                continue

            num_chromosome_records_processed += 1
            pcgr_onco_xref = {}
            onco_xref_value = rec.INFO.get('PCGR_ONCO_XREF')
            if onco_xref_value is not None:
                pcgr_onco_xref = _parse_onco_xref(onco_xref_value,
                                                  pcgr_onco_xref_map)

            all_transcript_consequences = []
            for csq in csq_value.split(','):
                csq_fields = csq.split('|')
                ## only consider the primary/picked consequence when expanding with annotation tags
                if csq_fields[vep_csq_fields2index['PICK']] == "1":
                    _set_picked_consequence_tags(rec, csq_fields,
                                                 vep_csq_index2fields,
                                                 pcgr_onco_xref,
                                                 pcgr_onco_xref_map, cpsr)
                    annoutils.set_coding_change(rec)

                ## every transcript consequence (picked or not) is summarised as
                ## Consequence:SYMBOL:Feature_type:Feature:BIOTYPE
                symbol = '.'
                if csq_fields[vep_csq_fields2index['SYMBOL']] != "":
                    symbol = str(csq_fields[vep_csq_fields2index['SYMBOL']])
                all_transcript_consequences.append(':'.join([
                    str(csq_fields[vep_csq_fields2index['Consequence']]),
                    str(symbol),
                    str(csq_fields[vep_csq_fields2index['Feature_type']]),
                    str(csq_fields[vep_csq_fields2index['Feature']]),
                    str(csq_fields[vep_csq_fields2index['BIOTYPE']])
                ]))

            if rec.INFO.get('DBNSFP') is not None:
                annoutils.map_variant_effect_predictors(
                    rec, dbnsfp_prediction_algorithms)

            rec.INFO['VEP_ALL_CONSEQUENCE'] = ','.join(
                all_transcript_consequences)
            w.write_record(rec)
    finally:
        if w is not None:
            w.close()
        vcf.close()

    logger.info('Completed summary of functional annotations for ' +
                str(num_chromosome_records_processed) +
                ' variants on chromosome ' + str(current_chrom))

    if os.path.exists(out_vcf) and os.path.getsize(out_vcf) > 0:
        ## NOTE(review): the path is interpolated into the shell command unquoted;
        ## paths containing spaces or shell metacharacters will not work here
        os.system('bgzip -f ' + str(out_vcf))
        os.system('tabix -f -p vcf ' + str(out_vcf) + '.gz')
        annotated_vcf = out_vcf + '.gz'
        annoutils.write_pass_vcf(annotated_vcf, logger)
    else:
        annoutils.error_message(
            'No remaining PASS variants found in query VCF - exiting and skipping STEP 4 (pcgr-writer)',
            logger)