Ejemplo n.º 1
0
def validate_pcgr_input(pcgr_directory, input_vcf, input_cna,
                        configuration_file):
    """
    Validate the input files given to PCGR: a query VCF and a tab-separated
    file with copy number segments. An input given as the string 'None' is
    treated as absent and skipped.

    For the VCF, the following helpers are run in order; the first one that
    reports -1 aborts the validation:
    1. is_valid_vcf - VCF formatting (EBIvariation/vcf-validator, VCF v4.2)
    2. check_existing_vcf_info_tags - INFO tags in the query VCF must not
       coincide with those generated by PCGR
    3. check_format_ad_dp_tags - tumor/normal coverage and allelic depth
       tags named in the configuration file must be found in the VCF
    Afterwards simplify_vcf is called on the VCF (decomposition of
    multi-allelic sites, stripping of genotype data, sorting and
    bgzip + tabix indexing); its return value is not inspected here.

    For the copy number segment file, is_valid_cna checks required columns
    and data types/ranges.

    Returns -1 if a VCF check fails; otherwise the result of is_valid_cna
    when a copy number file was given, or 0 when it was not.
    """
    logger = pcgrutils.getlogger('pcgr-validate-input')

    if input_vcf != 'None':
        if is_valid_vcf(input_vcf, logger) == -1:
            return -1

        if check_existing_vcf_info_tags(input_vcf, pcgr_directory,
                                        logger) == -1:
            return -1

        vcf = VCF(input_vcf)
        if check_format_ad_dp_tags(vcf, configuration_file, logger) == -1:
            return -1

        simplify_vcf(input_vcf, vcf, logger)

    if input_cna != 'None':
        # The outcome of the copy number check is the overall result
        return is_valid_cna(input_cna, logger)

    return 0
Ejemplo n.º 2
0
def validate_pcgr_input(pcgr_directory, input_vcf, configuration_file,
                        genome_assembly):
    """
    Validate the query VCF given to PCGR_predispose. An input given as the
    string 'None' is treated as absent, in which case only the configuration
    file is read.

    Checks, in order:
    1. is_valid_vcf - VCF formatting (EBIvariation/vcf-validator, VCF v4.2);
       only run when 'vcf_validation' is enabled in the 'other' section of
       the configuration, otherwise the skip is logged
    2. check_existing_vcf_info_tags - INFO tags in the query VCF must not
       coincide with those generated by PCGR_predispose
    Afterwards simplify_vcf is called on the VCF (decomposition of
    multi-allelic sites, stripping of genotype data, sorting and
    bgzip + tabix indexing); its return value is not inspected here.

    Returns -1 if a check fails, 0 otherwise.
    """
    logger = pcgrutils.getlogger('pcgr-predispose-validate-input')
    # The configuration is read even when no VCF is supplied
    config_options = pcgrutils.read_config_options(configuration_file,
                                                   pcgr_directory,
                                                   genome_assembly, logger)
    if input_vcf == 'None':
        return 0

    if not config_options['other']['vcf_validation']:
        logger.info(
            'Skipping validation of VCF file - as defined in configuration file (vcf_validation = false)'
        )
    elif is_valid_vcf(input_vcf, logger) == -1:
        return -1

    if check_existing_vcf_info_tags(input_vcf, pcgr_directory,
                                    genome_assembly, logger) == -1:
        return -1

    simplify_vcf(input_vcf, VCF(input_vcf), logger)
    return 0
Ejemplo n.º 3
0
#!/usr/bin/env python

import csv
import re
import argparse
#from itertools import izip, imap
from cyvcf2 import VCF, Writer
import gzip
import dbnsfp
import os
import pcgrutils

# Module-level logger named 'pcgr-gene-annotate', obtained from the project's
# pcgrutils helper.
logger = pcgrutils.getlogger('pcgr-gene-annotate')
# Raise the csv module's maximum field size to 500 MiB (500 * 1024 * 1024);
# presumably needed for very long annotation fields - TODO confirm.
csv.field_size_limit(500 * 1024 * 1024)
threeLettertoOneLetterAA = {
    'Ala': 'A',
    'Arg': 'R',
    'Asn': 'N',
    'Asp': 'D',
    'Cys': 'C',
    'Glu': 'E',
    'Gln': 'Q',
    'Gly': 'G',
    'His': 'H',
    'Ile': 'I',
    'Leu': 'L',
    'Lys': 'K',
    'Met': 'M',
    'Phe': 'F',
    'Pro': 'P',
    'Ser': 'S',
Ejemplo n.º 4
0
#!/usr/bin/env python

import argparse
from cyvcf2 import VCF
import random
import pcgrutils
import os
import re
import sys

# Module-level logger named 'pcgr-vcfanno', obtained from the project's
# pcgrutils helper.
logger = pcgrutils.getlogger('pcgr-vcfanno')


def __main__():
   parser = argparse.ArgumentParser(description='Run brentp/vcfanno - annotate a VCF file against multiple VCF files in parallel', formatter_class=argparse.ArgumentDefaultsHelpFormatter)
   parser.add_argument('query_vcf', help='Bgzipped input VCF file with query variants (SNVs/InDels)')
   parser.add_argument('out_vcf', help='Output VCF file with appended annotations from multiple VCF files')
   parser.add_argument('pcgr_db_dir', help='PCGR data directory')
   parser.add_argument('--num_processes', help="Number of processes vcfanno can use during annotation", default=4)
   parser.add_argument("--docm",action = "store_true", help="Annotate VCF with annotations from Database of Curated Mutations")
   parser.add_argument("--intogen_driver_mut",action = "store_true", help="Annotate VCF with predicted cancer driver mutations from IntoGen's Catalog of Driver Mutations")
   parser.add_argument("--clinvar",action = "store_true", help="Annotate VCF with annotations from ClinVar")
   parser.add_argument("--dbnsfp",action = "store_true", help="Annotate VCF with annotations from database of non-synonymous functional predictions")
   parser.add_argument("--tcga",action = "store_true", help="Annotate VCF with variant frequencies from the The Cancer Genome Atlas")
   parser.add_argument("--civic",action = "store_true", help="Annotate VCF with annotations from the Clinical Interpretation of Variants in Cancer database")
   parser.add_argument("--cbmdb",action = "store_true", help="Annotate VCF with annotations from the Cancer bioMarkers database")
   parser.add_argument("--icgc",action = "store_true", help="Annotate VCF with known variants found in the ICGC-PCAWG sequencing project")
   parser.add_argument("--cancer_hotspots",action = "store_true", help="Annotate VCF with mutation hotspots from cancerhotspots.org")
   parser.add_argument("--uniprot",action = "store_true", help="Annotate VCF with protein functional features from the UniProt Knowledgebase")
   parser.add_argument("--pcgr_onco_xref",action = "store_true", help="Annotate VCF with transcript annotations from PCGR (targeted drugs, protein complexes, cancer gene associations, etc)")
   
Ejemplo n.º 5
0
#!/usr/bin/env python

import argparse
import pcgrutils
import cyvcf
import os

# Module-level logger named 'pcgr-flatten-multisample', obtained from the
# project's pcgrutils helper; used by flatten_vcf below.
logger = pcgrutils.getlogger('pcgr-flatten-multisample')


def __main__():
    """
    Command-line entry point: parse the two positional arguments
    (vcf_file, output_postfix) and hand them to flatten_vcf.
    """
    positional_arguments = (
        ('vcf_file',
         'Multi-sample, bgzipped VCF file with annotated query variants (SNVs/InDels)'
         ),
        ('output_postfix',
         'Postfix of output file (e.g. \'mutect.pass.annotated.vcf\')'),
    )

    cli = argparse.ArgumentParser(
        description='Flatten multisample VCF file to single sample VCF files')
    for argument_name, argument_help in positional_arguments:
        cli.add_argument(argument_name, help=argument_help)
    parsed = cli.parse_args()

    flatten_vcf(parsed.vcf_file, parsed.output_postfix)


def flatten_vcf(query_vcf, output_postfix):

    logger.info('Read query VCF using cyvcf - ' + str(query_vcf))
    vcf_reader = cyvcf.Reader(open(query_vcf, 'r'))
Ejemplo n.º 6
0
def _pcgr_shell_quote(value):
    """Return *value* quoted so that a POSIX shell reads it as one literal word."""
    try:
        from shlex import quote  # Python 3.3+
    except ImportError:
        from pipes import quote  # Python 2 fallback
    return quote(str(value))


def _pcgr_run_shell(command, logger):
    """
    Run *command* through os.system and log a warning when the exit status
    is non-zero. The status is returned, but a failure is deliberately not
    turned into an error so that the overall outcome of the caller is the
    same as before the status was inspected.
    """
    status = os.system(command)
    if status != 0:
        logger.warning('Command exited with non-zero status ' + str(status) +
                       ': ' + str(command))
    return status


def _pcgr_sample_free_commands(input_vcf, target_vcf):
    """
    Build the five shell pipelines that write a copy of *input_vcf* limited
    to the first eight columns (i.e. without FORMAT/sample columns) to
    *target_vcf*: '##' meta lines, the '#CHROM' header, then records with
    a leading 'chr' removed - numeric chromosomes first (numeric sort),
    then X/Y/M, then everything else. A '.gz' input is streamed through
    'bgzip -dc'.
    """
    source = _pcgr_shell_quote(input_vcf)
    target = _pcgr_shell_quote(target_vcf)
    gzipped = input_vcf.endswith('.gz')

    def egrep(arguments):
        # Either read the plain file directly or decompress it on the fly
        if gzipped:
            return 'bgzip -dc ' + source + ' | egrep ' + arguments
        return 'egrep ' + arguments + ' ' + source

    records = egrep('-v \'^#\'') + ' | sed \'s/^chr//\' | cut -f1-8'
    lexical_sort = ' | sort -k1,1 -k2,2n -k3,3 -k4,4 >> ' + target
    return [
        egrep('\'^##\'') + ' > ' + target,
        egrep('\'^#CHROM\'') + ' | cut -f1-8 >> ' + target,
        records +
        ' | egrep \'^[0-9]\' | sort -k1,1n -k2,2n -k3,3 -k4,4 >> ' + target,
        records + ' | egrep -v \'^[0-9]\' | egrep \'^[XYM]\'' + lexical_sort,
        records + ' | egrep -v \'^[0-9]\' | egrep -v \'^[XYM]\'' +
        lexical_sort,
    ]


def verify_pcgr_input(pcgr_directory, input_vcf, input_cna, tumor_dp_tag,
                      tumor_af_tag, normal_dp_tag, normal_af_tag,
                      call_conf_tag):
    """
    Check the input files to PCGR (query VCF and tab-separated file with copy
    number segments) and prepare the VCF for annotation. An input given as
    the string 'None' is treated as absent and skipped.

    For the VCF:
    1. Run EBIvariation/vcf-validator (VCF v4.2); the verdict is read from
       the validator output file via is_valid_vcf
    2. Check that no INFO tags in the query VCF coincide with those
       generated by PCGR (check_existing_vcf_info_tags)
    3. Call check_ad_dp_tags with the tumor/normal depth, allelic fraction
       and call confidence tags (its return value is not inspected here)
    4. Scan for sites with multiple alternative alleles (e.g. 'A,T') and
       log each of them
    5. Write a sorted copy of the VCF without sample columns, run
       'vt decompose' on it if multiallelic sites were found, then bgzip
       and tabix-index the result

    All output is written to /workdir/output/. Paths are shell-quoted before
    being placed in a command line, and a non-zero exit status of a
    file-producing command is logged as a warning.

    For the copy number file, is_valid_cna_segment_file is run.

    Returns -1 if VCF validation or the INFO tag check fails; otherwise the
    result of is_valid_cna_segment_file when a copy number file was given,
    or 0 when it was not.
    """
    logger = pcgrutils.getlogger('pcgr-check-input')

    if not input_vcf == 'None':
        output_dir = '/workdir/output/'
        vcf_suffix = r'(\.vcf$|\.vcf\.gz$)'
        vcf_basename = os.path.basename(input_vcf)
        input_vcf_pcgr_ready = output_dir + re.sub(
            vcf_suffix, '.pcgr_ready.tmp.vcf', vcf_basename)
        input_vcf_pcgr_ready_decomposed = output_dir + re.sub(
            vcf_suffix, '.pcgr_ready.vcf', vcf_basename)
        vcf_validation_output_file = output_dir + re.sub(
            vcf_suffix, '.vcf_validator_output', vcf_basename)
        decompose_log = output_dir + 'decompose.log'

        logger.info('Validating VCF file with EBIvariation/vcf-validator')
        quoted_vcf = _pcgr_shell_quote(input_vcf)
        quoted_validation_output = _pcgr_shell_quote(
            vcf_validation_output_file)
        if input_vcf.endswith('.gz'):
            command_v42 = ('bgzip -dc ' + quoted_vcf +
                           ' | vcf_validator --version v4.2 > ' +
                           quoted_validation_output)
        else:
            command_v42 = ('vcf_validator --input ' + quoted_vcf +
                           ' --version v4.2 > ' + quoted_validation_output)
        # Exit status is intentionally not checked: the verdict is taken
        # from the validator's output file
        os.system(command_v42)
        validation_results = is_valid_vcf(vcf_validation_output_file)

        if not validation_results['validation_status']:
            error_string_42 = '\n'.join(validation_results['error_messages'])
            validation_status = 'VCF file is NOT valid according to v4.2 specification'
            logger.error(validation_status + ':\n' + str(error_string_42))
            return -1
        logger.info('VCF file ' + str(input_vcf) +
                    ' is valid according to v4.2 specification')

        tag_check = check_existing_vcf_info_tags(input_vcf, pcgr_directory,
                                                 logger)
        if tag_check == -1:
            return -1
        logger.info('No query VCF INFO tags coincide with PCGR INFO tags')

        vcf = VCF(input_vcf)
        check_ad_dp_tags(vcf, tumor_dp_tag, tumor_af_tag, normal_dp_tag,
                         normal_af_tag, call_conf_tag, logger)
        multiallelic_alt = False
        for rec in vcf:
            if len(rec.ALT) > 1:
                # Position and ALT string are only needed for the message
                POS = rec.start + 1
                alt = ",".join(str(n) for n in rec.ALT)
                logger.warning("Multiallelic site detected:" +
                               str(rec.CHROM) + '\t' + str(POS) + '\t' +
                               str(rec.REF) + '\t' + str(alt))
                multiallelic_alt = True

        for command in _pcgr_sample_free_commands(input_vcf,
                                                  input_vcf_pcgr_ready):
            _pcgr_run_shell(command, logger)

        quoted_ready = _pcgr_shell_quote(input_vcf_pcgr_ready)
        quoted_decomposed = _pcgr_shell_quote(input_vcf_pcgr_ready_decomposed)
        if multiallelic_alt:
            logger.info(
                'Decomposing multi-allelic sites in input VCF file using \'vt decompose\''
            )
            _pcgr_run_shell(
                'vt decompose -s ' + quoted_ready + ' > ' +
                quoted_decomposed + ' 2> ' + decompose_log, logger)
        else:
            _pcgr_run_shell('cp ' + quoted_ready + ' ' + quoted_decomposed,
                            logger)
        _pcgr_run_shell('bgzip -f ' + quoted_decomposed, logger)
        _pcgr_run_shell(
            'tabix -p vcf ' +
            _pcgr_shell_quote(input_vcf_pcgr_ready_decomposed + '.gz'),
            logger)
        # Remove the intermediate VCF and the vt log
        _pcgr_run_shell('rm -f ' + quoted_ready + ' ' + decompose_log, logger)

    if not input_cna == 'None':
        ret = is_valid_cna_segment_file(input_cna, logger)
        return ret

    return 0