Ejemplo n.º 1
0
def __main__():
    """Command-line entry point for the cancer gene annotation step.

    Parses three positional arguments (``vcf_file``, ``pon_annotation`` as an
    int, ``pcgr_db_dir``) plus the optional ``--cpsr`` switch, selects the
    logger name accordingly and hands everything to
    ``extend_vcf_annotations``.
    """
    cli = argparse.ArgumentParser(
        description='Cancer gene annotations from PCGR pipeline (SNVs/InDels)')

    cli.add_argument(
        'vcf_file',
        help='VCF file with VEP-annotated query variants (SNVs/InDels)')
    # NOTE: argparse only falls back to `default` for positionals that may be
    # omitted (nargs '?' or '*'); this positional is required, so a value is
    # always supplied on the command line.
    cli.add_argument(
        'pon_annotation',
        type=int,
        default=0,
        help='Include Panel of Normals annotation')
    cli.add_argument(
        'pcgr_db_dir',
        help='PCGR data directory')
    cli.add_argument(
        '--cpsr',
        action="store_true",
        help="Aggregate cancer gene annotations for Cancer Predisposition Sequencing Reporter (CPSR)")

    opts = cli.parse_args()

    # The PCGR logger is requested first; in CPSR mode it is then replaced by
    # the CPSR-named logger.
    logger = annoutils.getlogger('pcgr-gene-annotate')
    if opts.cpsr is True:
        logger = annoutils.getlogger('cpsr-gene-annotate')

    extend_vcf_annotations(
        opts.vcf_file,
        opts.pcgr_db_dir,
        logger,
        opts.pon_annotation,
        opts.cpsr)
Ejemplo n.º 2
0
def validate_gvanno_input(gvanno_directory, input_vcf, configuration_file,
                          genome_assembly):
    """Validate the query VCF handed to gvanno and prepare it for annotation.

    Steps performed when ``input_vcf`` is not the string ``'None'``:

    1. Optional format validation through ``is_valid_vcf`` - only when the
       configuration option ``other.vcf_validation`` is truthy.
    2. ``check_existing_vcf_info_tags`` - guards against INFO tags in the
       query VCF colliding with tags produced by gvanno.
    3. The VCF is opened with ``VCF`` and passed to ``simplify_vcf``
       (presumably where multi-allelic decomposition, genotype stripping and
       sort/bgzip/tabix happen - that helper is not shown here).

    Returns:
        0 on success (also when no VCF was given), -1 as soon as one of the
        checks reports -1.
    """
    logger = annoutils.getlogger('gvanno-validate-input')
    config_options = annoutils.read_config_options(
        configuration_file,
        gvanno_directory,
        genome_assembly,
        logger,
        wflow='gvanno')

    # The literal string 'None' is the "no VCF supplied" sentinel.
    if input_vcf == 'None':
        return 0

    if config_options['other']['vcf_validation']:
        if is_valid_vcf(input_vcf, logger) == -1:
            return -1
    else:
        logger.info(
            'Skipping validation of VCF file - as defined in configuration file (vcf_validation = false)'
        )

    info_tag_status = check_existing_vcf_info_tags(
        input_vcf, gvanno_directory, genome_assembly, logger)
    if info_tag_status == -1:
        return -1

    simplify_vcf(input_vcf, VCF(input_vcf), logger)
    return 0
Ejemplo n.º 3
0
def validate_cpsr_input(pcgr_directory, input_vcf, custom_list_fname,
                        configuration_file, vcf_validation, genome_assembly,
                        sample_id, virtual_panel_id, diagnostic_grade_only,
                        output_dir, debug=None):
    """Validate the input files given to CPSR and prepare the query VCF.

    Steps, in order:

    1. If ``custom_list_fname`` is not ``'None'``, a BED file name
       ``<sample_id>.cpsr.<genome_assembly>.custom_list.bed`` is built inside
       ``output_dir`` and ``get_valid_custom_genelist`` is called with it.
    2. If ``input_vcf`` is not ``'None'``:
       a. optional format validation via ``annoutils.is_valid_vcf`` (only
          when ``vcf_validation == 1``);
       b. ``check_existing_vcf_info_tags`` - guards against INFO tags that
          collide with those generated by CPSR;
       c. the VCF must not contain more than one sample column;
       d. ``simplify_vcf`` is run on the VCF (presumably decomposition of
          multi-allelic sites and sort/bgzip/tabix - helper not shown here).

    Args:
        configuration_file: currently unused by this function (the call that
            read the configuration is disabled).
        debug: flag forwarded to ``annoutils.is_valid_vcf``. The original body
            referenced a bare ``debug`` name that is not a parameter; when
            left as ``None`` a module-level ``debug`` is used if one exists,
            otherwise ``False``.

    Returns:
        0 on success, -1 when a check reports -1, or the result of
        ``annoutils.error_message`` when the VCF has several sample columns.
    """
    logger = annoutils.getlogger('cpsr-validate-input')

    if debug is None:
        # Avoid a NameError when no module-level `debug` is defined, while
        # still honouring such a global if the module provides one.
        debug = globals().get('debug', False)

    custom_list_bed_fname = 'None'
    if custom_list_fname != 'None':
        custom_list_bed_fname = os.path.join(
            output_dir,
            sample_id + '.cpsr.' + genome_assembly + '.custom_list.bed')
        get_valid_custom_genelist(custom_list_fname, custom_list_bed_fname,
                                  sample_id, pcgr_directory, output_dir,
                                  genome_assembly, logger)

    # The literal string 'None' is the "no VCF supplied" sentinel.
    if input_vcf == 'None':
        return 0

    if vcf_validation == 1:
        valid_vcf = annoutils.is_valid_vcf(input_vcf, output_dir, logger,
                                           debug)
        if valid_vcf == -1:
            return -1
    else:
        logger.info(
            'Skipping validation of VCF file - as provided by option --no_vcf_validate'
        )

    tag_check = check_existing_vcf_info_tags(input_vcf, pcgr_directory,
                                             genome_assembly, logger)
    if tag_check == -1:
        return -1

    vcf = VCF(input_vcf)
    samples = vcf.samples
    if len(samples) > 1:
        err_msg = "Query VCF contains more than one sample column (" + ', '.join(
            samples
        ) + ") - CPSR expects a germline VCF with a single sample column - exiting"
        return annoutils.error_message(err_msg, logger)

    simplify_vcf(input_vcf, vcf, custom_list_bed_fname, pcgr_directory,
                 genome_assembly, virtual_panel_id, diagnostic_grade_only,
                 output_dir, logger)

    return 0
Ejemplo n.º 4
0
def validate_pcgr_input(pcgr_directory, input_vcf, input_cna, configuration_file, panel_normal_vcf, vcf_validation, tumor_only, genome_assembly, output_dir):
    """Validate the input files given to PCGR (query VCF, panel-of-normals VCF, copy number file).

    File arguments use the literal string ``'None'`` to mean "not provided".
    Steps, in order:

    1. Read the configuration through ``annoutils.read_config_options``
       (``wflow='pcgr'``).
    2. Warn when running tumor-only (``tumor_only == 1``) with
       ``tumor_only.exclude_pon`` enabled in the configuration but without a
       panel-of-normals VCF.
    3. If a query VCF is given: optional format validation
       (``vcf_validation == 1``), check for INFO tags colliding with those
       generated by PCGR, check the allelic depth/coverage tags via
       ``check_format_ad_dp_tags``, then run ``simplify_vcf`` (presumably
       decomposition, genotype stripping and sort/bgzip/tabix - helper not
       shown here).
    4. If a panel-of-normals VCF is given: ``validate_panel_normal_vcf``.
    5. If a copy number file is given: ``is_valid_cna``.

    Returns:
        0 when every applicable check passes, -1 as soon as one reports -1.
    """
    logger = annoutils.getlogger('pcgr-validate-input')
    config_options = annoutils.read_config_options(configuration_file, pcgr_directory, genome_assembly, logger, wflow='pcgr')

    if panel_normal_vcf == "None" and tumor_only == 1 and config_options['tumor_only']['exclude_pon'] is True:
        # `warning` replaces the deprecated `warn` alias of logging.Logger.
        logger.warning('Panel-of-normals VCF is not present - exclusion of calls found in panel-of-normals will be ignored')

    if input_vcf != 'None':
        if vcf_validation == 1:
            valid_vcf = is_valid_vcf(input_vcf, output_dir, logger)
            if valid_vcf == -1:
                return -1
        else:
            logger.info('Skipping validation of VCF file - as provided by option --no_vcf_validate')

        tag_check = check_existing_vcf_info_tags(input_vcf, pcgr_directory, genome_assembly, logger)
        if tag_check == -1:
            return -1

        vcf = VCF(input_vcf)
        allelic_support_check = check_format_ad_dp_tags(vcf, pcgr_directory, config_options, tumor_only, logger)
        if allelic_support_check == -1:
            return -1

        simplify_vcf(input_vcf, vcf, output_dir, logger)

    if panel_normal_vcf != "None":
        valid_panel_normals = validate_panel_normal_vcf(panel_normal_vcf, logger)
        if valid_panel_normals == -1:
            return -1

    if input_cna != 'None':
        valid_cna = is_valid_cna(input_cna, logger)
        if valid_cna == -1:
            return -1

    return 0
Ejemplo n.º 5
0
def validate_pcgr_input(pcgr_directory, input_vcf, input_cna,
                        configuration_file, genome_assembly, output_dir):
    """Validate the query VCF and copy number file handed to PCGR.

    File arguments use the literal string ``'None'`` to mean "not provided".

    For a query VCF the following happens, stopping with -1 at the first
    failing check:

    1. Optional format validation via ``is_valid_vcf`` - only when the
       configuration option ``other.vcf_validation`` is truthy.
    2. ``check_existing_vcf_info_tags`` - guards against INFO tags colliding
       with those generated by PCGR.
    3. ``check_format_ad_dp_tags`` - checks the configured allelic
       depth/coverage tags against the VCF.
    4. ``simplify_vcf`` (presumably decomposition of multi-allelic sites,
       genotype stripping and sort/bgzip/tabix - helper not shown here).

    For a copy number file, the return value of ``is_valid_cna`` is handed
    back to the caller unchanged.

    Returns:
        -1 on a failed VCF check, the result of ``is_valid_cna`` when a copy
        number file is given, otherwise 0.
    """
    logger = annoutils.getlogger('pcgr-validate-input')
    config_options = annoutils.read_config_options(
        configuration_file,
        pcgr_directory,
        genome_assembly,
        logger,
        wflow='pcgr')

    if input_vcf != 'None':
        if config_options['other']['vcf_validation']:
            if is_valid_vcf(input_vcf, output_dir, logger) == -1:
                return -1
        else:
            logger.info(
                'Skipping validation of VCF file - as defined in configuration file (vcf_validation = false)'
            )

        info_tag_status = check_existing_vcf_info_tags(
            input_vcf, pcgr_directory, genome_assembly, logger)
        if info_tag_status == -1:
            return -1

        query_vcf = VCF(input_vcf)
        depth_tag_status = check_format_ad_dp_tags(
            query_vcf, pcgr_directory, config_options, logger)
        if depth_tag_status == -1:
            return -1

        simplify_vcf(input_vcf, query_vcf, output_dir, logger)

    if input_cna == 'None':
        return 0

    # Whatever the copy number validator reports becomes the overall result.
    return is_valid_cna(input_cna, logger)
Ejemplo n.º 6
0
#!/usr/bin/env python

import argparse
from cyvcf2 import VCF
import random
import annoutils
import os
import re
import sys

# Module-level logger, obtained from annoutils under the name 'pcgr-vcfanno'.
logger = annoutils.getlogger('pcgr-vcfanno')


def __main__():
    """Build the command-line parser for the vcfanno annotation step.

    Registers the positionals ``query_vcf``, ``out_vcf`` and ``pcgr_db_dir``,
    the ``--num_processes`` option (parsed as an int, default 4) and one
    boolean ``store_true`` switch per annotation source.

    Only the parser construction is visible in this part of the file; the
    arguments are not parsed here.
    """
    parser = argparse.ArgumentParser(
        description='Run brentp/vcfanno - annotate a VCF file against multiple VCF files in parallel',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('query_vcf', help='Bgzipped input VCF file with query variants (SNVs/InDels)')
    parser.add_argument('out_vcf', help='Output VCF file with appended annotations from multiple VCF files')
    parser.add_argument('pcgr_db_dir', help='PCGR assembly-specific data directory')
    # type=int keeps a user-supplied value the same type as the int default
    # (without it the value would arrive as a string) and rejects non-numeric
    # input at parse time.
    parser.add_argument('--num_processes', type=int, default=4,
                        help="Number of processes vcfanno can use during annotation")

    # One boolean switch per annotation source: (flag, help text).
    annotation_flags = (
        ("--docm", "Annotate VCF with annotations from Database of Curated Mutations"),
        ("--intogen_driver_mut", "Annotate VCF with predicted cancer driver mutations from IntoGen's Catalog of Driver Mutations"),
        ("--clinvar", "Annotate VCF with annotations from ClinVar"),
        ("--dbnsfp", "Annotate VCF with annotations from database of non-synonymous functional predictions"),
        ("--tcga", "Annotate VCF with variant frequencies from the The Cancer Genome Atlas"),
        ("--tcga_pcdm", "Annotate VCF with putative cancer driver mutations from The Cancer Genome Atlas"),
        ("--civic", "Annotate VCF with annotations from the Clinical Interpretation of Variants in Cancer database"),
        ("--cbmdb", "Annotate VCF with annotations from the Cancer bioMarkers database"),
        ("--icgc", "Annotate VCF with known variants found in the ICGC-PCAWG sequencing project"),
        ("--cancer_hotspots", "Annotate VCF with mutation hotspots from cancerhotspots.org"),
        ("--uniprot", "Annotate VCF with protein functional features from the UniProt Knowledgebase"),
        ("--pcgr_onco_xref", "Annotate VCF with transcript annotations from PCGR (targeted drugs, protein complexes, cancer gene associations, etc)"),
    )
    for flag, description in annotation_flags:
        parser.add_argument(flag, action="store_true", help=description)
Ejemplo n.º 7
0
#!/usr/bin/env python

import csv
import re
import argparse
from cyvcf2 import VCF, Writer
import gzip
import os
import annoutils

# Module-level logger, obtained from annoutils under the name
# 'gvanno-gene-annotate'.
logger = annoutils.getlogger('gvanno-gene-annotate')
# Raise the csv module's maximum allowed field size to 500 * 1024 * 1024.
csv.field_size_limit(500 * 1024 * 1024)


def __main__():
    """Command-line entry point for the gvanno gene annotation step.

    Parses the positionals ``vcf_file``, ``gvanno_db_dir`` and
    ``lof_prediction`` (as an int) and forwards them, in that order, to
    ``extend_vcf_annotations``.
    """
    cli = argparse.ArgumentParser(
        description='Gene annotations from gvanno pipeline (SNVs/InDels)')

    cli.add_argument(
        'vcf_file',
        help='VCF file with VEP-annotated query variants (SNVs/InDels)')
    cli.add_argument(
        'gvanno_db_dir',
        help='gvanno data directory')
    # NOTE: argparse only falls back to `default` for positionals that may be
    # omitted (nargs '?' or '*'); this positional is required, so a value is
    # always supplied on the command line.
    cli.add_argument(
        'lof_prediction',
        type=int,
        default=0,
        help='VEP LoF prediction setting (0/1)')

    opts = cli.parse_args()

    extend_vcf_annotations(
        opts.vcf_file,
        opts.gvanno_db_dir,
        opts.lof_prediction)
Ejemplo n.º 8
0
#!/usr/bin/env python

import argparse
from cyvcf2 import VCF
import random
import annoutils
import os
import re
import sys

# Module-level logger, obtained from annoutils under the name 'gvanno-vcfanno'.
logger = annoutils.getlogger('gvanno-vcfanno')


def __main__():
    parser = argparse.ArgumentParser(
        description=
        'Run brentp/vcfanno - annotate a VCF file against multiple VCF files in parallel',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        'query_vcf',
        help='Bgzipped input VCF file with query variants (SNVs/InDels)')
    parser.add_argument(
        'out_vcf',
        help='Output VCF file with appended annotations from multiple VCF files'
    )
    parser.add_argument('gvanno_db_dir', help='gvanno data directory')
    parser.add_argument(
        '--num_processes',
        help="Number of processes vcfanno can use during annotation",
        default=4)
    parser.add_argument("--clinvar",