Ejemplo n.º 1
0
def extract_snpEff(vcf_line):
    '''
    Collect the non-synonymous SnpEff annotations carried by one VCF line.

    The INFO value stored under 'ANN' is split on ',' into individual
    annotations, and each annotation is split on '|'. Sub-fields 3, 6 and 10
    are read as the gene, the feature (transcript) and the amino acid change.
    An annotation is kept only when the gene and the amino acid change are
    both non-empty and the change looks like "p.<letters><digits><letters>"
    with differing letter groups before and after the digits.

    Returns three parallel lists: (genes, amino acid changes, features).
    All three are empty when the line has no 'ANN' value.
    Raises IndexError if an annotation has fewer than 11 '|'-separated fields.
    '''
    genes, aa_changes, features = [], [], []

    ann_value = genome.Vcf_line(vcf_line).get_info_value('ANN')
    if not ann_value:
        return genes, aa_changes, features

    for annotation in ann_value.split(','):

        fields = annotation.split('|')
        gene, feature, aa_change = fields[3], fields[6], fields[10]

        if not (gene and aa_change):
            continue

        # Only do non-syn variants
        hit = re.search(r'p\.([a-zA-Z]+)[0-9]+([a-zA-Z]+)', aa_change)
        if hit is None:
            continue

        aa_before, aa_after = hit.groups()
        if aa_before != aa_after:
            genes.append(gene)
            aa_changes.append(aa_change)
            features.append(feature)

    return genes, aa_changes, features
Ejemplo n.º 2
0
def remove_vcf_illegal_lines(invcf, outvcf):
    '''
    Strip records whose ALT looks like <XXX> but whose INFO has no END value.

    In VarDict v1.7, such lines exist and will cause bedtools to fail.
    The input is scanned first; reading stops at the first offending record.
    Only when one is found is the input read a second time and copied to
    outvcf, header lines included, with every offending record left out.

    Returns outvcf if the input had at least one illegal line (and the
    filtered file was written); otherwise returns False and writes nothing.
    '''

    def stripped_lines(handle):
        # Yield right-stripped lines until the first empty one (EOF or a
        # blank line), using readline() only.
        return iter(lambda: handle.readline().rstrip(), '')

    def is_illegal(record):
        parsed = genome.Vcf_line(record)
        return bool(
            re.match(r'<\w+>', parsed.altbase)
            and not parsed.get_info_value('END')
        )

    # Pass 1: look for the first illegal record after the leading header.
    found_illegal = False
    with genome.open_textfile(invcf) as vcf:
        in_header = True
        for record in stripped_lines(vcf):
            if in_header and record.startswith('#'):
                continue
            in_header = False
            if is_illegal(record):
                found_illegal = True
                break

    if not found_illegal:
        return False

    # Pass 2: copy the header verbatim and every legal record.
    with genome.open_textfile(invcf) as vcf, open(outvcf, 'w') as out:
        in_header = True
        for record in stripped_lines(vcf):
            if in_header and record.startswith('#'):
                out.write(record + '\n')
                continue
            in_header = False
            if not is_illegal(record):
                out.write(record + '\n')

    return outvcf
Ejemplo n.º 3
0
def split_into_snv_and_indel(infile, snv_out, indel_out):
    '''
    Split one VCF file into an SNV file and an indel file.

    The leading header lines (those starting with '#') are copied to both
    outputs. Each record is then classified per ALT allele:
      * REF and ALT both one base long       -> written to snv_out
      * exactly one of REF / ALT one base    -> written to indel_out
      * both longer than one base            -> written to neither file
    Records whose ALT holds several alleles separated by ',' (or, failing
    that, by '/') are decomposed into one output record per allele, with the
    ALT column (index 4 of the tab-split line) replaced by that allele.

    Args:
        infile: path of the input VCF, opened through genome.open_textfile.
        snv_out: path of the SNV output file (overwritten).
        indel_out: path of the indel output file (overwritten).
    '''

    # Distinct handle names so the path parameters are not rebound.
    with genome.open_textfile(infile) as vcf_in, \
            open(snv_out, 'w') as snv_fh, \
            open(indel_out, 'w') as indel_fh:

        def route(ref, alt, record):
            # Send one single-allele record to the matching output, if any.
            if len(ref) == 1 and len(alt) == 1:
                snv_fh.write(record + '\n')
            elif len(ref) == 1 or len(alt) == 1:
                indel_fh.write(record + '\n')

        line_i = vcf_in.readline().rstrip()

        # Header goes to both outputs.
        while line_i.startswith('#'):
            snv_fh.write(line_i + '\n')
            indel_fh.write(line_i + '\n')
            line_i = vcf_in.readline().rstrip()

        while line_i:

            vcf_i = genome.Vcf_line(line_i)
            ref = vcf_i.refbase
            alt = vcf_i.altbase

            if ',' in alt:
                alt_alleles = alt.split(',')
            elif '/' in alt:
                alt_alleles = alt.split('/')
            else:
                alt_alleles = None

            if alt_alleles is None:
                # Single ALT allele: the record is written unchanged.
                route(ref, alt, line_i)
            else:
                item = line_i.split('\t')
                for altbase_i in alt_alleles:
                    item[4] = altbase_i
                    # Classify on the individual allele, not the joined ALT
                    # string: the joined string is never one character long,
                    # so testing it would drop every decomposed deletion.
                    route(ref, altbase_i, '\t'.join(item))

            line_i = vcf_in.readline().rstrip()
Ejemplo n.º 4
0
 # Add the SCORE description before the #CHROM line
 vcfout.write('##FORMAT=<ID=SCORE,Number=1,Type=Float,Description="SomaticSeq Probability (either fraction or Phred)">\n')
 
 # Locate the tumor sample by name among the tab-separated header columns.
 tumor_column = line_in.split('\t').index(tumor)
 # Subtracting 9 turns the column index into a sample index; presumably this
 # skips the 9 fixed VCF columns (CHROM..FORMAT) -- confirm against the header.
 tumor_idx = tumor_column - 9
 # Only when the tumor is the second sample is a normal sample assumed, in
 # column 9 (sample index 0); otherwise both normal values are None.
 (normal_column, normal_idx) = (9, 0) if tumor_idx == 1 else (None, None)
 
 # This is the #CHROM line
 vcfout.write( line_in + '\n' )
 
 # Strip only the newline so that other trailing whitespace is preserved.
 line_in = vcfin.readline().rstrip('\n')
 
 # Move COMBO and NUM_TOOLS from INFO to Tumor Sample, and move QUAL to the Tumor Sample as well
 # The loop ends at the first empty line returned by readline().
 while line_in:
     
     vcf_line_in = genome.Vcf_line( line_in )
     
     # New INFO: keep every INFO item except NUM_TOOLS=... and any item that
     # starts with caller_string.
     new_info = []        
     for info_item in vcf_line_in.get_info_items():
         if not ( info_item.startswith('NUM_TOOLS=') or info_item.startswith(caller_string) ):
             new_info.append( info_item )
     
     # An INFO column with nothing left is written as the '.' placeholder.
     if new_info == []:
         new_info_line = '.'
     else:
         new_info_line = ';'.join( new_info )
     
     # FORMAT: when somaticseq_trained is truthy, append the caller_string key,
     # NUM_TOOLS and SCORE to the existing FORMAT field.
     if somaticseq_trained:
         new_format_field = vcf_line_in.field + ':{}:NUM_TOOLS:SCORE'.format( caller_string )
Ejemplo n.º 5
0
def vcf2tsv(is_vcf=None,
            is_bed=None,
            is_pos=None,
            nbam_fn=None,
            tbam_fn=None,
            truth=None,
            cosmic=None,
            dbsnp=None,
            mutect=None,
            varscan=None,
            jsm=None,
            sniper=None,
            vardict=None,
            muse=None,
            lofreq=None,
            scalpel=None,
            strelka=None,
            tnscope=None,
            platypus=None,
            dedup=True,
            min_mq=1,
            min_bq=5,
            min_caller=0,
            ref_fa=None,
            p_scale=None,
            outfile=None):

    # Convert contig_sequence to chrom_seq dict:
    fai_file = ref_fa + '.fai'
    chrom_seq = genome.faiordict2contigorder(fai_file, 'fai')

    # Determine input format:
    if is_vcf:
        mysites = is_vcf
    elif is_bed:
        mysites = is_bed
    elif is_pos:
        mysites = is_pos
    else:
        mysites = fai_file
        logger.info('No position supplied. Will evaluate the whole genome.')

    # Re-scale output or not:
    if p_scale == None:
        logger.info('NO RE-SCALING')
    elif p_scale.lower() == 'phred':
        p_scale = 'phred'
    elif p_scale.lower() == 'fraction':
        p_scale = 'fraction'
    else:
        p_scale = None
        logger.info('NO RE-SCALING')

        # Define NaN and Inf:
    nan = float('nan')
    inf = float('inf')
    pattern_chr_position = genome.pattern_chr_position

    ## Running
    with genome.open_textfile(mysites) as my_sites, open(outfile,
                                                         'w') as outhandle:

        my_line = my_sites.readline().rstrip()

        nbam = pysam.AlignmentFile(nbam_fn, reference_filename=ref_fa)
        tbam = pysam.AlignmentFile(tbam_fn, reference_filename=ref_fa)
        ref_fa = pysam.FastaFile(ref_fa)

        if truth:
            truth = genome.open_textfile(truth)
            truth_line = genome.skip_vcf_header(truth)

        if cosmic:
            cosmic = genome.open_textfile(cosmic)
            cosmic_line = genome.skip_vcf_header(cosmic)

        if dbsnp:
            dbsnp = genome.open_textfile(dbsnp)
            dbsnp_line = genome.skip_vcf_header(dbsnp)

        # 10 Incorporate callers: get thru the #'s
        if mutect:
            mutect = genome.open_textfile(mutect)
            mutect_line = genome.skip_vcf_header(mutect)

        if varscan:
            varscan = genome.open_textfile(varscan)
            varscan_line = genome.skip_vcf_header(varscan)

        if jsm:
            jsm = genome.open_textfile(jsm)
            jsm_line = genome.skip_vcf_header(jsm)

        if sniper:
            sniper = genome.open_textfile(sniper)
            sniper_line = genome.skip_vcf_header(sniper)

        if vardict:
            vardict = genome.open_textfile(vardict)
            vardict_line = genome.skip_vcf_header(vardict)

        if muse:
            muse = genome.open_textfile(muse)
            muse_line = genome.skip_vcf_header(muse)

        if lofreq:
            lofreq = genome.open_textfile(lofreq)
            lofreq_line = genome.skip_vcf_header(lofreq)

        if scalpel:
            scalpel = genome.open_textfile(scalpel)
            scalpel_line = genome.skip_vcf_header(scalpel)

        if strelka:
            strelka = genome.open_textfile(strelka)
            strelka_line = genome.skip_vcf_header(strelka)

        if tnscope:
            tnscope = genome.open_textfile(tnscope)
            tnscope_line = genome.skip_vcf_header(tnscope)

        if platypus:
            platypus = genome.open_textfile(platypus)
            platypus_line = genome.skip_vcf_header(platypus)

        # Get through all the headers:
        while my_line.startswith('#') or my_line.startswith('track='):
            my_line = my_sites.readline().rstrip()

        # First coordinate, for later purpose of making sure the input is sorted properly
        coordinate_i = re.match(genome.pattern_chr_position, my_line)
        coordinate_i = coordinate_i.group() if coordinate_i else ''

        # First line:
        outhandle.write(out_header.replace('{', '').replace('}', '') + '\n')

        while my_line:

            # If VCF, get all the variants with the same coordinate into a list:
            if is_vcf:

                my_vcf = genome.Vcf_line(my_line)

                my_coordinates = [(my_vcf.chromosome, my_vcf.position)]

                variants_at_my_coordinate = []

                alt_bases = my_vcf.altbase.split(',')
                for alt_i in alt_bases:
                    vcf_i = copy(my_vcf)
                    vcf_i.altbase = alt_i
                    variants_at_my_coordinate.append(vcf_i)

                # As long as the "coordinate" stays the same, it will keep reading until it's different.
                while my_coordinates[0] == (my_vcf.chromosome,
                                            my_vcf.position):

                    my_line = my_sites.readline().rstrip()
                    my_vcf = genome.Vcf_line(my_line)

                    ########## This block is code is to ensure the input VCF file is properly sorted ##
                    coordinate_j = re.match(genome.pattern_chr_position,
                                            my_line)
                    coordinate_j = coordinate_j.group() if coordinate_j else ''

                    if genome.whoisbehind(coordinate_i, coordinate_j,
                                          chrom_seq) == 1:
                        raise Exception(
                            '{} does not seem to be properly sorted.'.format(
                                mysites))

                    coordinate_i = coordinate_j
                    ###################################################################################

                    if my_coordinates[0] == (my_vcf.chromosome,
                                             my_vcf.position):

                        alt_bases = my_vcf.altbase.split(',')
                        for alt_i in alt_bases:

                            vcf_i = copy(my_vcf)
                            vcf_i.altbase = alt_i
                            variants_at_my_coordinate.append(vcf_i)

            elif is_bed:
                bed_item = my_line.split('\t')
                my_coordinates = genomic_coordinates(bed_item[0],
                                                     int(bed_item[1]) + 1,
                                                     int(bed_item[2]))

            elif is_pos:
                pos_item = my_line.split('\t')
                my_coordinates = genomic_coordinates(pos_item[0],
                                                     int(pos_item[1]),
                                                     int(pos_item[1]))

            elif fai_file:
                fai_item = my_line.split('\t')
                my_coordinates = genomic_coordinates(fai_item[0], 1,
                                                     int(fai_item[1]))

            ##### ##### ##### ##### ##### #####
            for my_coordinate in my_coordinates:

                ######## If VCF, can get ref base, variant base, as well as other identifying information ########
                if is_vcf:

                    ref_bases = []
                    alt_bases = []
                    indel_lengths = []
                    all_my_identifiers = []

                    for variant_i in variants_at_my_coordinate:

                        ref_base = variant_i.refbase
                        first_alt = variant_i.altbase.split(',')[0]
                        indel_length = len(first_alt) - len(ref_base)

                        ref_bases.append(ref_base)
                        alt_bases.append(first_alt)
                        indel_lengths.append(indel_length)

                        # Extract these information if they exist in the VCF file, but they could be re-written if dbSNP/COSMIC are supplied.
                        if_dbsnp = 1 if re.search(r'rs[0-9]+',
                                                  variant_i.identifier) else 0
                        if_cosmic = 1 if re.search(r'COS[MN][0-9]+',
                                                   variant_i.identifier) else 0
                        if_common = 1 if variant_i.get_info_value(
                            'COMMON') == '1' else 0
                        num_cases = variant_i.get_info_value(
                            'CNT') if variant_i.get_info_value('CNT') else nan

                        if variant_i.identifier == '.':
                            my_identifier_i = set()
                        else:
                            my_identifier_i = variant_i.identifier.split(';')
                            my_identifier_i = set(my_identifier_i)

                        all_my_identifiers.append(my_identifier_i)

                ## If not, 1) get ref_base, first_alt from other VCF files.
                #          2) Create placeholders for dbSNP and COSMIC that can be overwritten with dbSNP/COSMIC VCF files (if provided)
                else:
                    variants_at_my_coordinate = [
                        None
                    ]  # Just to have something to iterate
                    ref_base = first_alt = indel_length = None

                    # Could be re-written if dbSNP/COSMIC are supplied. If not, they will remain NaN.
                    if_dbsnp = if_cosmic = if_common = num_cases = nan

                # Keep track of NumCallers:
                num_callers = 0

                #################################### Find the same coordinate in those VCF files ####################################
                if mutect:
                    got_mutect, mutect_variants, mutect_line = genome.find_vcf_at_coordinate(
                        my_coordinate, mutect_line, mutect, chrom_seq)
                if varscan:
                    got_varscan, varscan_variants, varscan_line = genome.find_vcf_at_coordinate(
                        my_coordinate, varscan_line, varscan, chrom_seq)
                if jsm:
                    got_jsm, jsm_variants, jsm_line = genome.find_vcf_at_coordinate(
                        my_coordinate, jsm_line, jsm, chrom_seq)
                if sniper:
                    got_sniper, sniper_variants, sniper_line = genome.find_vcf_at_coordinate(
                        my_coordinate, sniper_line, sniper, chrom_seq)
                if vardict:
                    got_vardict, vardict_variants, vardict_line = genome.find_vcf_at_coordinate(
                        my_coordinate, vardict_line, vardict, chrom_seq)
                if muse:
                    got_muse, muse_variants, muse_line = genome.find_vcf_at_coordinate(
                        my_coordinate, muse_line, muse, chrom_seq)
                if lofreq:
                    got_lofreq, lofreq_variants, lofreq_line = genome.find_vcf_at_coordinate(
                        my_coordinate, lofreq_line, lofreq, chrom_seq)
                if scalpel:
                    got_scalpel, scalpel_variants, scalpel_line = genome.find_vcf_at_coordinate(
                        my_coordinate, scalpel_line, scalpel, chrom_seq)
                if strelka:
                    got_strelka, strelka_variants, strelka_line = genome.find_vcf_at_coordinate(
                        my_coordinate, strelka_line, strelka, chrom_seq)
                if tnscope:
                    got_tnscope, tnscope_variants, tnscope_line = genome.find_vcf_at_coordinate(
                        my_coordinate, tnscope_line, tnscope, chrom_seq)
                if platypus:
                    got_platypus, platypus_variants, platypus_line = genome.find_vcf_at_coordinate(
                        my_coordinate, platypus_line, platypus, chrom_seq)
                if truth:
                    got_truth, truth_variants, truth_line = genome.find_vcf_at_coordinate(
                        my_coordinate, truth_line, truth, chrom_seq)
                if dbsnp:
                    got_dbsnp, dbsnp_variants, dbsnp_line = genome.find_vcf_at_coordinate(
                        my_coordinate, dbsnp_line, dbsnp, chrom_seq)
                if cosmic:
                    got_cosmic, cosmic_variants, cosmic_line = genome.find_vcf_at_coordinate(
                        my_coordinate, cosmic_line, cosmic, chrom_seq)

                # Now, use pysam to look into the BAM file(s), variant by variant from the input:
                for ith_call, my_call in enumerate(variants_at_my_coordinate):

                    if is_vcf:
                        # The particular line in the input VCF file:
                        variant_id = ((my_call.chromosome, my_call.position),
                                      my_call.refbase, my_call.altbase)

                        ref_base = ref_bases[ith_call]
                        first_alt = alt_bases[ith_call]
                        indel_length = indel_lengths[ith_call]
                        my_identifiers = all_my_identifiers[ith_call]

                    else:
                        variant_id = ((my_coordinate[0], my_coordinate[1]),
                                      ref_base, first_alt)

                    #################### Collect Caller Vcf ####################:
                    if mutect:
                        mutect_classification, nlod, tlod, tandem, ecnt = annotate_caller.MuTect(
                            variant_id, mutect_variants)
                        num_callers += mutect_classification
                    else:
                        mutect_classification = nlod = tlod = tandem = ecnt = nan

                    if varscan:
                        varscan_classification = annotate_caller.VarScan(
                            variant_id, varscan_variants)
                        num_callers += varscan_classification
                    else:
                        varscan_classification = nan

                    if jsm:
                        jointsnvmix2_classification, score_jointsnvmix2 = annotate_caller.JSM(
                            variant_id, jsm_variants)
                        num_callers += jointsnvmix2_classification
                    else:
                        jointsnvmix2_classification = score_jointsnvmix2 = nan

                    if sniper:
                        sniper_classification, score_somaticsniper = annotate_caller.SomaticSniper(
                            variant_id, sniper_variants)
                        num_callers += sniper_classification
                    else:
                        sniper_classification = score_somaticsniper = nan

                    if vardict:
                        vardict_classification, msi, msilen, shift3, score_vardict = annotate_caller.VarDict(
                            variant_id, vardict_variants)
                        num_callers += vardict_classification
                    else:
                        vardict_classification = msi = msilen = shift3 = score_vardict = nan

                    if muse:
                        muse_classification = annotate_caller.MuSE(
                            variant_id, muse_variants)
                        num_callers += muse_classification
                    else:
                        muse_classification = nan

                    if lofreq:
                        lofreq_classification = annotate_caller.LoFreq(
                            variant_id, lofreq_variants)
                        num_callers += lofreq_classification
                    else:
                        lofreq_classification = nan

                    if scalpel:
                        scalpel_classification = annotate_caller.Scalpel(
                            variant_id, scalpel_variants)
                        num_callers += scalpel_classification
                    else:
                        scalpel_classification = nan

                    if strelka:
                        strelka_classification, somatic_evs, qss, tqss = annotate_caller.Strelka(
                            variant_id, strelka_variants)
                        num_callers += strelka_classification
                    else:
                        strelka_classification = somatic_evs = qss = tqss = nan

                    if tnscope:
                        tnscope_classification = annotate_caller.TNscope(
                            variant_id, tnscope_variants)
                        num_callers += tnscope_classification
                    else:
                        tnscope_classification = nan

                    if platypus:
                        platypus_classification = annotate_caller.countPASS(
                            variant_id, platypus_variants)
                        num_callers += platypus_classification
                    else:
                        platypus_classification = nan

                    # Potentially write the output only if it meets this threshold:
                    if num_callers >= min_caller:

                        ########## Ground truth file ##########
                        if truth:
                            if variant_id in truth_variants:
                                judgement = 1
                                my_identifiers.add('TruePositive')
                            else:
                                judgement = 0
                                my_identifiers.add('FalsePositive')
                        else:
                            judgement = nan

                        ########## dbSNP ########## Will overwrite dbSNP info from input VCF file
                        if dbsnp:
                            if_dbsnp, if_common, rsID = annotate_caller.dbSNP(
                                variant_id, dbsnp_variants)
                            for ID_i in rsID:
                                my_identifiers.add(ID_i)

                        ########## COSMIC ########## Will overwrite COSMIC info from input VCF file
                        if cosmic:
                            if_cosmic, num_cases, cosmicID = annotate_caller.COSMIC(
                                variant_id, cosmic_variants)
                            for ID_i in cosmicID:
                                my_identifiers.add(ID_i)

                        ########## ######### ######### INFO EXTRACTION FROM BAM FILES ########## ######### #########
                        nBamFeatures = sequencing_features.from_bam(
                            nbam, my_coordinate, ref_base, first_alt, min_mq,
                            min_bq)
                        tBamFeatures = sequencing_features.from_bam(
                            tbam, my_coordinate, ref_base, first_alt, min_mq,
                            min_bq)

                        n_ref = nBamFeatures['ref_for'] + nBamFeatures[
                            'ref_rev']
                        n_alt = nBamFeatures['alt_for'] + nBamFeatures[
                            'alt_rev']
                        t_ref = tBamFeatures['ref_for'] + tBamFeatures[
                            'ref_rev']
                        t_alt = tBamFeatures['alt_for'] + tBamFeatures[
                            'alt_rev']
                        sor = sequencing_features.somaticOddRatio(
                            n_ref, n_alt, t_ref, t_alt)

                        # Calculate VarScan'2 SCC directly without using VarScan2 output:
                        try:
                            score_varscan2 = genome.p2phred(
                                stats.fisher_exact(
                                    ((t_alt, n_alt), (t_ref, n_ref)),
                                    alternative='greater')[1])
                        except ValueError:
                            score_varscan2 = nan

                        # Homopolymer eval:
                        homopolymer_length, site_homopolymer_length = sequencing_features.from_genome_reference(
                            ref_fa, my_coordinate, ref_base, first_alt)

                        # Fill the ID field of the TSV/VCF
                        my_identifiers = ';'.join(
                            my_identifiers) if my_identifiers else '.'

                        ###
                        out_line = out_header.format( \
                        CHROM                   = my_coordinate[0],                                                    \
                        POS                     = my_coordinate[1],                                                    \
                        ID                      = my_identifiers,                                                      \
                        REF                     = ref_base,                                                            \
                        ALT                     = first_alt,                                                           \
                        if_MuTect               = mutect_classification,                                               \
                        if_VarScan2             = varscan_classification,                                              \
                        if_JointSNVMix2         = jointsnvmix2_classification,                                         \
                        if_SomaticSniper        = sniper_classification,                                               \
                        if_VarDict              = vardict_classification,                                              \
                        MuSE_Tier               = muse_classification,                                                 \
                        if_LoFreq               = lofreq_classification,                                               \
                        if_Scalpel              = scalpel_classification,                                              \
                        if_Strelka              = strelka_classification,                                              \
                        if_TNscope              = tnscope_classification,                                              \
                        if_Platypus             = platypus_classification,                                             \
                        Strelka_Score           = somatic_evs,                                                         \
                        Strelka_QSS             = qss,                                                                 \
                        Strelka_TQSS            = tqss,                                                                \
                        VarScan2_Score          = rescale(score_varscan2,      'phred', p_scale, 1001),                \
                        SNVMix2_Score           = rescale(score_jointsnvmix2,  'phred', p_scale, 1001),                \
                        Sniper_Score            = rescale(score_somaticsniper, 'phred', p_scale, 1001),                \
                        VarDict_Score           = rescale(score_vardict,       'phred', p_scale, 1001),                \
                        if_dbsnp                = if_dbsnp,                                                            \
                        COMMON                  = if_common,                                                           \
                        if_COSMIC               = if_cosmic,                                                           \
                        COSMIC_CNT              = num_cases,                                                           \
                        Consistent_Mates        = tBamFeatures['consistent_mates'],                                    \
                        Inconsistent_Mates      = tBamFeatures['inconsistent_mates'],                                  \
                        N_DP                    = nBamFeatures['dp'],                                                  \
                        nBAM_REF_MQ             = '%g' % nBamFeatures['ref_mq'],                                       \
                        nBAM_ALT_MQ             = '%g' % nBamFeatures['alt_mq'],                                       \
                        nBAM_Z_Ranksums_MQ      = '%g' % nBamFeatures['z_ranksums_mq'],                                \
                        nBAM_REF_BQ             = '%g' % nBamFeatures['ref_bq'],                                       \
                        nBAM_ALT_BQ             = '%g' % nBamFeatures['alt_bq'],                                       \
                        nBAM_Z_Ranksums_BQ      = '%g' % nBamFeatures['z_ranksums_bq'],                                \
                        nBAM_REF_NM             = '%g' % nBamFeatures['ref_NM'],                                       \
                        nBAM_ALT_NM             = '%g' % nBamFeatures['alt_NM'],                                       \
                        nBAM_NM_Diff            = '%g' % nBamFeatures['NM_Diff'],                                      \
                        nBAM_REF_Concordant     = nBamFeatures['ref_concordant_reads'],                                \
                        nBAM_REF_Discordant     = nBamFeatures['ref_discordant_reads'],                                \
                        nBAM_ALT_Concordant     = nBamFeatures['alt_concordant_reads'],                                \
                        nBAM_ALT_Discordant     = nBamFeatures['alt_discordant_reads'],                                \
                        nBAM_Concordance_FET    = rescale(nBamFeatures['concordance_fet'], 'fraction', p_scale, 1001), \
                        N_REF_FOR               = nBamFeatures['ref_for'],                                             \
                        N_REF_REV               = nBamFeatures['ref_rev'],                                             \
                        N_ALT_FOR               = nBamFeatures['alt_for'],                                             \
                        N_ALT_REV               = nBamFeatures['alt_rev'],                                             \
                        nBAM_StrandBias_FET     = rescale(nBamFeatures['strandbias_fet'], 'fraction', p_scale, 1001),  \
                        nBAM_Z_Ranksums_EndPos  = '%g' % nBamFeatures['z_ranksums_endpos'],                            \
                        nBAM_REF_Clipped_Reads  = nBamFeatures['ref_SC_reads'],                                        \
                        nBAM_ALT_Clipped_Reads  = nBamFeatures['alt_SC_reads'],                                        \
                        nBAM_Clipping_FET       = rescale(nBamFeatures['clipping_fet'], 'fraction', p_scale, 1001),    \
                        nBAM_MQ0                = nBamFeatures['MQ0'],                                                 \
                        nBAM_Other_Reads        = nBamFeatures['noise_read_count'],                                    \
                        nBAM_Poor_Reads         = nBamFeatures['poor_read_count'],                                     \
                        nBAM_REF_InDel_3bp      = nBamFeatures['ref_indel_3bp'],                                       \
                        nBAM_REF_InDel_2bp      = nBamFeatures['ref_indel_2bp'],                                       \
                        nBAM_REF_InDel_1bp      = nBamFeatures['ref_indel_1bp'],                                       \
                        nBAM_ALT_InDel_3bp      = nBamFeatures['alt_indel_3bp'],                                       \
                        nBAM_ALT_InDel_2bp      = nBamFeatures['alt_indel_2bp'],                                       \
                        nBAM_ALT_InDel_1bp      = nBamFeatures['alt_indel_1bp'],                                       \
                        M2_NLOD                 = nlod,                                                                \
                        M2_TLOD                 = tlod,                                                                \
                        M2_STR                  = tandem,                                                              \
                        M2_ECNT                 = ecnt,                                                                \
                        SOR                     = sor,                                                                 \
                        MSI                     = msi,                                                                 \
                        MSILEN                  = msilen,                                                              \
                        SHIFT3                  = shift3,                                                              \
                        MaxHomopolymer_Length   = homopolymer_length,                                                  \
                        SiteHomopolymer_Length  = site_homopolymer_length,                                             \
                        T_DP                    = tBamFeatures['dp'],                                                  \
                        tBAM_REF_MQ             = '%g' % tBamFeatures['ref_mq'],                                       \
                        tBAM_ALT_MQ             = '%g' % tBamFeatures['alt_mq'],                                       \
                        tBAM_Z_Ranksums_MQ      = '%g' % tBamFeatures['z_ranksums_mq'],                                \
                        tBAM_REF_BQ             = '%g' % tBamFeatures['ref_bq'],                                       \
                        tBAM_ALT_BQ             = '%g' % tBamFeatures['alt_bq'],                                       \
                        tBAM_Z_Ranksums_BQ      = '%g' % tBamFeatures['z_ranksums_bq'],                                \
                        tBAM_REF_NM             = '%g' % tBamFeatures['ref_NM'],                                       \
                        tBAM_ALT_NM             = '%g' % tBamFeatures['alt_NM'],                                       \
                        tBAM_NM_Diff            = '%g' % tBamFeatures['NM_Diff'],                                      \
                        tBAM_REF_Concordant     = tBamFeatures['ref_concordant_reads'],                                \
                        tBAM_REF_Discordant     = tBamFeatures['ref_discordant_reads'],                                \
                        tBAM_ALT_Concordant     = tBamFeatures['alt_concordant_reads'],                                \
                        tBAM_ALT_Discordant     = tBamFeatures['alt_discordant_reads'],                                \
                        tBAM_Concordance_FET    = rescale(tBamFeatures['concordance_fet'], 'fraction', p_scale, 1001), \
                        T_REF_FOR               = tBamFeatures['ref_for'],                                             \
                        T_REF_REV               = tBamFeatures['ref_rev'],                                             \
                        T_ALT_FOR               = tBamFeatures['alt_for'],                                             \
                        T_ALT_REV               = tBamFeatures['alt_rev'],                                             \
                        tBAM_StrandBias_FET     = rescale(tBamFeatures['strandbias_fet'], 'fraction', p_scale, 1001),  \
                        tBAM_Z_Ranksums_EndPos  = '%g' % tBamFeatures['z_ranksums_endpos'],                            \
                        tBAM_REF_Clipped_Reads  = tBamFeatures['ref_SC_reads'],                                        \
                        tBAM_ALT_Clipped_Reads  = tBamFeatures['alt_SC_reads'],                                        \
                        tBAM_Clipping_FET       = rescale(tBamFeatures['clipping_fet'], 'fraction', p_scale, 1001),    \
                        tBAM_MQ0                = tBamFeatures['MQ0'],                                                 \
                        tBAM_Other_Reads        = tBamFeatures['noise_read_count'],                                    \
                        tBAM_Poor_Reads         = tBamFeatures['poor_read_count'],                                     \
                        tBAM_REF_InDel_3bp      = tBamFeatures['ref_indel_3bp'],                                       \
                        tBAM_REF_InDel_2bp      = tBamFeatures['ref_indel_2bp'],                                       \
                        tBAM_REF_InDel_1bp      = tBamFeatures['ref_indel_1bp'],                                       \
                        tBAM_ALT_InDel_3bp      = tBamFeatures['alt_indel_3bp'],                                       \
                        tBAM_ALT_InDel_2bp      = tBamFeatures['alt_indel_2bp'],                                       \
                        tBAM_ALT_InDel_1bp      = tBamFeatures['alt_indel_1bp'],                                       \
                        InDel_Length            = indel_length,                                                        \
                        TrueVariant_or_False    = judgement )

                        # Print it out to stdout:
                        outhandle.write(out_line + '\n')

            # Read into the next line:
            if not is_vcf:
                my_line = my_sites.readline().rstrip()

        ##########  Close all open files if they were opened  ##########
        opened_files = (ref_fa, nbam, tbam, truth, cosmic, dbsnp, mutect,
                        varscan, jsm, sniper, vardict, muse, lofreq, scalpel,
                        strelka, tnscope, platypus)
        [opened_file.close() for opened_file in opened_files if opened_file]
Ejemplo n.º 6
0
def vcf2tsv(is_vcf=None,
            is_bed=None,
            is_pos=None,
            bam_fn=None,
            truth=None,
            cosmic=None,
            dbsnp=None,
            mutect=None,
            varscan=None,
            vardict=None,
            lofreq=None,
            scalpel=None,
            strelka=None,
            dedup=True,
            min_mq=1,
            min_bq=5,
            min_caller=0,
            ref_fa=None,
            p_scale=None,
            outfile=None):
    '''
    Write one TSV row of caller classifications and BAM-derived features for
    each candidate site, using a single BAM file.

    The sites come from the first truthy argument among is_vcf, is_bed and
    is_pos (checked in that order). If none is given, the FASTA index
    (ref_fa + '.fai') is used instead and every position it describes is
    evaluated.

    Parameters:
        is_vcf, is_bed, is_pos: path of the file listing the sites.
        bam_fn: path handed to pysam.AlignmentFile.
        truth, cosmic, dbsnp: optional VCF paths used to annotate each row.
        mutect, varscan, vardict, lofreq, scalpel, strelka: optional caller
            VCF paths. Each classification is added to the caller count.
        dedup: accepted for interface compatibility; not used in this function.
        min_mq, min_bq: forwarded to sequencing_features.from_bam.
        min_caller: a row is written only if the summed caller
            classifications reach this value.
        ref_fa: path of the reference FASTA. Required despite the None
            default, because ref_fa + '.fai' is read unconditionally.
        p_scale: 'phred' or 'fraction' (case-insensitive). Anything else,
            including None, disables re-scaling.
        outfile: path of the TSV file to create.

    Raises:
        Exception: if genome.whoisbehind reports that the input VCF
            coordinates are out of order.

    Every handle opened here is closed on exit, including when an error
    interrupts the run.
    '''
    # Function-scope import so the block does not depend on a file-level import.
    from contextlib import ExitStack

    # Contig order comes from the FASTA index sitting next to the reference:
    fai_file = ref_fa + '.fai'
    chrom_seq = genome.faiordict2contigorder(fai_file, 'fai')

    # Determine input format:
    if is_vcf:
        mysites = is_vcf
    elif is_bed:
        mysites = is_bed
    elif is_pos:
        mysites = is_pos
    else:
        mysites = fai_file
        logger.info('No position supplied. Will evaluate the whole genome.')

    # Re-scale output or not:
    if p_scale is None:
        logger.info('NO RE-SCALING')
    elif p_scale.lower() == 'phred':
        p_scale = 'phred'
    elif p_scale.lower() == 'fraction':
        p_scale = 'fraction'
    else:
        p_scale = None
        logger.info('NO RE-SCALING')

    nan = float('nan')
    pattern_chr_position = genome.pattern_chr_position

    ## Running
    # The ExitStack owns every handle opened below, so they are all released
    # even when an exception (e.g. unsorted input) aborts the run.
    with ExitStack() as stack:

        my_sites = stack.enter_context(genome.open_textfile(mysites))
        outhandle = stack.enter_context(open(outfile, 'w'))

        my_line = my_sites.readline().rstrip()

        bam = pysam.AlignmentFile(bam_fn, reference_filename=ref_fa)
        stack.callback(bam.close)

        # From here on, ref_fa is the open FASTA handle rather than the path.
        ref_fa = pysam.FastaFile(ref_fa)
        stack.callback(ref_fa.close)

        # From here on, each supplied path variable is rebound to its open
        # handle, and *_line holds the first line after the VCF header.
        if truth:
            truth, truth_line = _open_vcf_past_header(truth, stack)

        if cosmic:
            cosmic, cosmic_line = _open_vcf_past_header(cosmic, stack)

        if dbsnp:
            dbsnp, dbsnp_line = _open_vcf_past_header(dbsnp, stack)

        # 6 Incorporate callers: get thru the #'s
        if mutect:
            mutect, mutect_line = _open_vcf_past_header(mutect, stack)

        if varscan:
            varscan, varscan_line = _open_vcf_past_header(varscan, stack)

        if vardict:
            vardict, vardict_line = _open_vcf_past_header(vardict, stack)

        if lofreq:
            lofreq, lofreq_line = _open_vcf_past_header(lofreq, stack)

        if scalpel:
            scalpel, scalpel_line = _open_vcf_past_header(scalpel, stack)

        if strelka:
            strelka, strelka_line = _open_vcf_past_header(strelka, stack)

        # Get through all the headers:
        while my_line.startswith(('#', 'track=')):
            my_line = my_sites.readline().rstrip()

        # First coordinate, for later purpose of making sure the input is sorted properly
        coordinate_i = re.match(pattern_chr_position, my_line)
        coordinate_i = coordinate_i.group() if coordinate_i else ''

        # Header row: the format template with its braces stripped.
        outhandle.write(out_header.replace('{', '').replace('}', '') + '\n')

        while my_line:

            # If VCF, get all the variants with the same coordinate into a list:
            if is_vcf:

                my_vcf = genome.Vcf_line(my_line)
                my_coordinates = [(my_vcf.chromosome, my_vcf.position)]

                variants_at_my_coordinate = []

                # Multi-allelic records are split into one entry per ALT allele.
                alt_bases = my_vcf.altbase.split(',')
                for alt_i in alt_bases:
                    vcf_i = copy(my_vcf)
                    vcf_i.altbase = alt_i
                    variants_at_my_coordinate.append(vcf_i)

                # As long as the "coordinate" stays the same, it will keep reading until it's different.
                while my_coordinates[0] == (my_vcf.chromosome,
                                            my_vcf.position):

                    my_line = my_sites.readline().rstrip()
                    my_vcf = genome.Vcf_line(my_line)

                    ########## This block of code is to ensure the input VCF file is properly sorted ##
                    coordinate_j = re.match(pattern_chr_position, my_line)
                    coordinate_j = coordinate_j.group() if coordinate_j else ''

                    if genome.whoisbehind(coordinate_i, coordinate_j,
                                          chrom_seq) == 1:
                        raise Exception(
                            '{} does not seem to be properly sorted.'.format(
                                mysites))

                    coordinate_i = coordinate_j
                    ###################################################################################

                    if my_coordinates[0] == (my_vcf.chromosome,
                                             my_vcf.position):

                        alt_bases = my_vcf.altbase.split(',')
                        for alt_i in alt_bases:

                            vcf_i = copy(my_vcf)
                            vcf_i.altbase = alt_i
                            variants_at_my_coordinate.append(vcf_i)

            elif is_bed:
                bed_item = my_line.split('\t')
                # The BED start column is shifted by +1 before being handed over.
                my_coordinates = genomic_coordinates(bed_item[0],
                                                     int(bed_item[1]) + 1,
                                                     int(bed_item[2]))

            elif is_pos:
                pos_item = my_line.split('\t')
                my_coordinates = genomic_coordinates(pos_item[0],
                                                     int(pos_item[1]),
                                                     int(pos_item[1]))

            else:
                # Whole-genome mode: each .fai line gives a contig and its length.
                fai_item = my_line.split('\t')
                my_coordinates = genomic_coordinates(fai_item[0], 1,
                                                     int(fai_item[1]))

            ##### ##### ##### ##### ##### #####
            for my_coordinate in my_coordinates:

                ######## If VCF, can get ref base, variant base, as well as other identifying information ########
                if is_vcf:

                    ref_bases = []
                    alt_bases = []
                    indel_lengths = []
                    all_my_identifiers = []

                    # One (if_dbsnp, if_cosmic, if_common, num_cases) tuple per
                    # variant, so each output row carries its own record's
                    # flags instead of those of the last variant at the site.
                    all_known_site_flags = []

                    for variant_i in variants_at_my_coordinate:

                        ref_base = variant_i.refbase
                        first_alt = variant_i.altbase.split(',')[0]
                        indel_length = len(first_alt) - len(ref_base)

                        ref_bases.append(ref_base)
                        alt_bases.append(first_alt)
                        indel_lengths.append(indel_length)

                        # Extract these information if they exist in the VCF file, but they could be re-written if dbSNP/COSMIC are supplied.
                        if_dbsnp = 1 if re.search(r'rs[0-9]+',
                                                  variant_i.identifier) else 0
                        if_cosmic = 1 if re.search(r'COS[MN][0-9]+',
                                                   variant_i.identifier) else 0
                        if_common = 1 if variant_i.get_info_value(
                            'COMMON') == '1' else 0
                        num_cases = variant_i.get_info_value('CNT') or nan

                        all_known_site_flags.append(
                            (if_dbsnp, if_cosmic, if_common, num_cases))

                        if variant_i.identifier == '.':
                            my_identifier_i = set()
                        else:
                            my_identifier_i = set(
                                variant_i.identifier.split(';'))

                        all_my_identifiers.append(my_identifier_i)

                ## If not, 1) get ref_base, first_alt from other VCF files.
                #          2) Create placeholders for dbSNP and COSMIC that can be overwritten with dbSNP/COSMIC VCF files (if provided)
                else:
                    # Just to have something to iterate
                    variants_at_my_coordinate = [None]
                    ref_base = first_alt = indel_length = None

                    # Could be re-written if dbSNP/COSMIC are supplied. If not, they will remain NaN.
                    if_dbsnp = if_cosmic = if_common = num_cases = nan

                #################################### Find the same coordinate in those VCF files ####################################
                if mutect:
                    got_mutect, mutect_variants, mutect_line = genome.find_vcf_at_coordinate(
                        my_coordinate, mutect_line, mutect, chrom_seq)
                if varscan:
                    got_varscan, varscan_variants, varscan_line = genome.find_vcf_at_coordinate(
                        my_coordinate, varscan_line, varscan, chrom_seq)
                if vardict:
                    got_vardict, vardict_variants, vardict_line = genome.find_vcf_at_coordinate(
                        my_coordinate, vardict_line, vardict, chrom_seq)
                if lofreq:
                    got_lofreq, lofreq_variants, lofreq_line = genome.find_vcf_at_coordinate(
                        my_coordinate, lofreq_line, lofreq, chrom_seq)
                if scalpel:
                    got_scalpel, scalpel_variants, scalpel_line = genome.find_vcf_at_coordinate(
                        my_coordinate, scalpel_line, scalpel, chrom_seq)
                if strelka:
                    got_strelka, strelka_variants, strelka_line = genome.find_vcf_at_coordinate(
                        my_coordinate, strelka_line, strelka, chrom_seq)
                if truth:
                    got_truth, truth_variants, truth_line = genome.find_vcf_at_coordinate(
                        my_coordinate, truth_line, truth, chrom_seq)
                if dbsnp:
                    got_dbsnp, dbsnp_variants, dbsnp_line = genome.find_vcf_at_coordinate(
                        my_coordinate, dbsnp_line, dbsnp, chrom_seq)
                if cosmic:
                    got_cosmic, cosmic_variants, cosmic_line = genome.find_vcf_at_coordinate(
                        my_coordinate, cosmic_line, cosmic, chrom_seq)

                # Now, use pysam to look into the BAM file, variant by variant from the input:
                for ith_call, my_call in enumerate(variants_at_my_coordinate):

                    if is_vcf:
                        # The particular line in the input VCF file:
                        variant_id = ((my_call.chromosome, my_call.position),
                                      my_call.refbase, my_call.altbase)

                        ref_base = ref_bases[ith_call]
                        first_alt = alt_bases[ith_call]
                        indel_length = indel_lengths[ith_call]
                        my_identifiers = all_my_identifiers[ith_call]
                        if_dbsnp, if_cosmic, if_common, num_cases = all_known_site_flags[
                            ith_call]

                    else:
                        variant_id = ((my_coordinate[0], my_coordinate[1]),
                                      ref_base, first_alt)

                        # No input record to take IDs from; start empty so the
                        # truth/dbSNP/COSMIC annotations below have a set to
                        # add to and the ID column can still be rendered.
                        my_identifiers = set()

                    # Reset num_caller to 0 for each variant in the same coordinate
                    num_callers = 0

                    #################### Collect Caller Vcf ####################:
                    if mutect:
                        mutect_classification, tlod, ecnt = annotate_caller.ssMuTect(
                            variant_id, mutect_variants)
                        num_callers += mutect_classification
                    else:
                        mutect_classification = tlod = ecnt = nan

                    if varscan:
                        varscan_classification, score_varscan2 = annotate_caller.ssVarScan(
                            variant_id, varscan_variants)
                        num_callers += varscan_classification
                    else:
                        varscan_classification = score_varscan2 = nan

                    if vardict:
                        # t_pmean, t_pstd and t_qstd are returned by ssVarDict
                        # but do not appear in the output row below.
                        vardict_classification, msi, msilen, shift3, t_pmean, t_pstd, t_qstd = annotate_caller.ssVarDict(
                            variant_id, vardict_variants)
                        num_callers += vardict_classification
                    else:
                        vardict_classification = msi = msilen = shift3 = t_pmean = t_pstd = t_qstd = nan

                    if lofreq:
                        lofreq_classification = annotate_caller.ssLoFreq(
                            variant_id, lofreq_variants)
                        num_callers += lofreq_classification
                    else:
                        lofreq_classification = nan

                    if scalpel:
                        scalpel_classification = annotate_caller.ssScalpel(
                            variant_id, scalpel_variants)
                        num_callers += scalpel_classification
                    else:
                        scalpel_classification = nan

                    if strelka:
                        strelka_classification = annotate_caller.ssStrelka(
                            variant_id, strelka_variants)
                        num_callers += strelka_classification
                    else:
                        strelka_classification = nan

                    # Potentially write the output only if it meets this threshold:
                    if num_callers >= min_caller:

                        ########## Ground truth file ##########
                        if truth:
                            if variant_id in truth_variants.keys():
                                judgement = 1
                                my_identifiers.add('TruePositive')
                            else:
                                judgement = 0
                                my_identifiers.add('FalsePositive')
                        else:
                            judgement = nan

                        ########## dbSNP ########## Will overwrite dbSNP info from input VCF file
                        if dbsnp:
                            if_dbsnp, if_common, rsID = annotate_caller.dbSNP(
                                variant_id, dbsnp_variants)
                            for ID_i in rsID:
                                my_identifiers.add(ID_i)

                        ########## COSMIC ########## Will overwrite COSMIC info from input VCF file
                        if cosmic:
                            if_cosmic, num_cases, cosmicID = annotate_caller.COSMIC(
                                variant_id, cosmic_variants)
                            for ID_i in cosmicID:
                                my_identifiers.add(ID_i)

                        ########## ######### INFO EXTRACTION FROM BAM FILES ########## #########
                        tBamFeatures = sequencing_features.from_bam(
                            bam, my_coordinate, ref_base, first_alt, min_mq,
                            min_bq)

                        # Homopolymer eval:
                        homopolymer_length, site_homopolymer_length = sequencing_features.from_genome_reference(
                            ref_fa, my_coordinate, ref_base, first_alt)

                        # Linguistic sequence complexity in a +/-80bp window, but substring calculation stops at 20-bp substring.
                        # max(0, ...) keeps the fetch start from going negative near the contig start.
                        seq_span_80bp = ref_fa.fetch(
                            my_coordinate[0], max(0, my_coordinate[1] - 41),
                            my_coordinate[1] + 40)
                        seq_left_80bp = ref_fa.fetch(
                            my_coordinate[0], max(0, my_coordinate[1] - 81),
                            my_coordinate[1])
                        seq_right_80bp = ref_fa.fetch(my_coordinate[0],
                                                      my_coordinate[1],
                                                      my_coordinate[1] + 81)

                        # Sequences of 20 bases or fewer get NaN instead of a score.
                        if len(seq_span_80bp) > 20:
                            LC_spanning = sequencing_features.subLC(
                                seq_span_80bp, 20)
                        else:
                            LC_spanning = nan

                        if len(seq_left_80bp) > 20:
                            left_LC = sequencing_features.subLC(
                                seq_left_80bp, 20)
                        else:
                            left_LC = nan

                        if len(seq_right_80bp) > 20:
                            right_LC = sequencing_features.subLC(
                                seq_right_80bp, 20)
                        else:
                            right_LC = nan

                        # NOTE(review): min() is order-dependent when one side
                        # is NaN (a NaN left flank yields NaN, a NaN right
                        # flank yields the left value). Kept as-is; confirm
                        # whether that asymmetry is intended.
                        LC_adjacent = min(left_LC, right_LC)

                        LC_spanning_phred = genome.p2phred(1 - LC_spanning, 40)
                        LC_adjacent_phred = genome.p2phred(1 - LC_adjacent, 40)

                        # Fill the ID field of the TSV/VCF
                        id_field = ';'.join(
                            my_identifiers) if my_identifiers else '.'

                        ###
                        out_line = out_header.format(
                            CHROM                      = my_coordinate[0],
                            POS                        = my_coordinate[1],
                            ID                         = id_field,
                            REF                        = ref_base,
                            ALT                        = first_alt,
                            if_MuTect                  = mutect_classification,
                            if_Strelka                 = strelka_classification,
                            if_VarScan2                = varscan_classification,
                            if_VarDict                 = vardict_classification,
                            if_LoFreq                  = lofreq_classification,
                            if_Scalpel                 = scalpel_classification,
                            VarScan2_Score             = rescale(score_varscan2,      'phred', p_scale, 1001),
                            if_dbsnp                   = if_dbsnp,
                            COMMON                     = if_common,
                            if_COSMIC                  = if_cosmic,
                            COSMIC_CNT                 = num_cases,
                            Consistent_Mates           = tBamFeatures['consistent_mates'],
                            Inconsistent_Mates         = tBamFeatures['inconsistent_mates'],
                            Seq_Complexity_Span        = LC_spanning_phred,
                            Seq_Complexity_Adj         = LC_adjacent_phred,
                            M2_TLOD                    = tlod,
                            M2_ECNT                    = ecnt,
                            MSI                        = msi,
                            MSILEN                     = msilen,
                            SHIFT3                     = shift3,
                            MaxHomopolymer_Length      = homopolymer_length,
                            SiteHomopolymer_Length     = site_homopolymer_length,
                            T_DP                       = tBamFeatures['dp'],
                            tBAM_REF_MQ                = '%g' % tBamFeatures['ref_mq'],
                            tBAM_ALT_MQ                = '%g' % tBamFeatures['alt_mq'],
                            tBAM_p_MannWhitneyU_MQ     = '%g' % tBamFeatures['p_mannwhitneyu_mq'],
                            tBAM_REF_BQ                = '%g' % tBamFeatures['ref_bq'],
                            tBAM_ALT_BQ                = '%g' % tBamFeatures['alt_bq'],
                            tBAM_p_MannWhitneyU_BQ     = '%g' % tBamFeatures['p_mannwhitneyu_bq'],
                            tBAM_REF_NM                = '%g' % tBamFeatures['ref_NM'],
                            tBAM_ALT_NM                = '%g' % tBamFeatures['alt_NM'],
                            tBAM_NM_Diff               = '%g' % tBamFeatures['NM_Diff'],
                            tBAM_REF_Concordant        = tBamFeatures['ref_concordant_reads'],
                            tBAM_REF_Discordant        = tBamFeatures['ref_discordant_reads'],
                            tBAM_ALT_Concordant        = tBamFeatures['alt_concordant_reads'],
                            tBAM_ALT_Discordant        = tBamFeatures['alt_discordant_reads'],
                            tBAM_Concordance_FET       = rescale(tBamFeatures['concordance_fet'], 'fraction', p_scale, 1001),
                            T_REF_FOR                  = tBamFeatures['ref_for'],
                            T_REF_REV                  = tBamFeatures['ref_rev'],
                            T_ALT_FOR                  = tBamFeatures['alt_for'],
                            T_ALT_REV                  = tBamFeatures['alt_rev'],
                            tBAM_StrandBias_FET        = rescale(tBamFeatures['strandbias_fet'], 'fraction', p_scale, 1001),
                            tBAM_p_MannWhitneyU_EndPos = '%g' % tBamFeatures['p_mannwhitneyu_endpos'],
                            tBAM_REF_Clipped_Reads     = tBamFeatures['ref_SC_reads'],
                            tBAM_ALT_Clipped_Reads     = tBamFeatures['alt_SC_reads'],
                            tBAM_Clipping_FET          = rescale(tBamFeatures['clipping_fet'], 'fraction', p_scale, 1001),
                            tBAM_MQ0                   = tBamFeatures['MQ0'],
                            tBAM_Other_Reads           = tBamFeatures['noise_read_count'],
                            tBAM_Poor_Reads            = tBamFeatures['poor_read_count'],
                            tBAM_REF_InDel_3bp         = tBamFeatures['ref_indel_3bp'],
                            tBAM_REF_InDel_2bp         = tBamFeatures['ref_indel_2bp'],
                            tBAM_REF_InDel_1bp         = tBamFeatures['ref_indel_1bp'],
                            tBAM_ALT_InDel_3bp         = tBamFeatures['alt_indel_3bp'],
                            tBAM_ALT_InDel_2bp         = tBamFeatures['alt_indel_2bp'],
                            tBAM_ALT_InDel_1bp         = tBamFeatures['alt_indel_1bp'],
                            InDel_Length               = indel_length,
                            TrueVariant_or_False       = judgement)

                        # Write the row to the output file:
                        outhandle.write(out_line + '\n')

            # Read into the next line (in VCF mode the look-ahead loop above
            # has already advanced my_line):
            if not is_vcf:
                my_line = my_sites.readline().rstrip()


def _open_vcf_past_header(path, stack):
    '''
    Open the VCF at path, register its close() on stack, and return the
    handle together with whatever genome.skip_vcf_header returns for it.
    '''
    handle = genome.open_textfile(path)

    # Register before touching the file so a failure while skipping the
    # header still closes the handle.
    stack.callback(handle.close)

    return handle, genome.skip_vcf_header(handle)
Ejemplo n.º 7
0
def convert(infile, outfile):
    """Copy a VCF file to ``outfile`` while repairing non-compliant fields.

    The in-line comments attribute the quirks being repaired to VarScan2
    output. Header lines are copied, with the DP4 and AD FORMAT declarations
    replaced. For every record:

    * ``/`` separators in ALT become ``,`` and characters other than word
      characters, ``,`` and ``.`` are dropped from ALT;
    * REF is truncated at its first non-word character;
    * adjacent duplicate ALT alleles are collapsed, an ALT allele equal to
      REF is removed, and every ALT allele after the first that does not
      start with REF is removed;
    * when FORMAT carries both AD and RD, the RD key is dropped and the AD
      value of the first (and, if present, second) sample becomes ``RD,AD``;
    * records whose REF contains anything other than G, C, T, A or U
      (case-insensitive) are not written.

    Reading stops at the first empty line or at end of file.

    Args:
        infile: Path of the input VCF, opened with ``genome.open_textfile``.
        outfile: Path of the VCF file to write.

    Raises:
        Exception: If a record has no sample columns.
    """

    with genome.open_textfile(infile) as vcf, open(outfile, 'w') as vcfout:

        line_i = vcf.readline().rstrip()

        # Copy the header, swapping in corrected DP4 / AD declarations:
        while line_i.startswith('#'):

            if line_i.startswith('##FORMAT=<ID=DP4,'):
                line_i = '##FORMAT=<ID=DP4,Number=4,Type=Integer,Description="# high-quality ref-forward bases, ref-reverse, alt-forward and alt-reverse bases">'

            elif line_i.startswith('##FORMAT=<ID=AD,'):
                line_i = '##FORMAT=<ID=AD,Number=.,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed">'

            vcfout.write(line_i + '\n')

            line_i = vcf.readline().rstrip()

        # Doing the work here:
        while line_i:

            vcf_i = genome.Vcf_line(line_i)

            num_samples = len(vcf_i.samples)
            if num_samples == 0:
                raise Exception('No sample information here.')

            if num_samples > 2:
                # Terminate the warning with a newline so that repeated
                # warnings do not run together on stderr.
                sys.stderr.write(
                    'We found more than 2 sammples in this VCF file. It may be messed up, but I\'ll just assume the first 2 samples mean anything at all'
                    + '\n')

            paired = num_samples >= 2

            # Replace the wrong "G/A" with the correct "G,A" in ALT column:
            vcf_i.altbase = vcf_i.altbase.replace('/', ',')

            # vcf-validator is not going to accept multiple sequences in the REF, as is the case in VarScan2's indel output:
            vcf_i.refbase = re.sub(r'[^\w].*$', '', vcf_i.refbase)

            # Get rid of non-compliant characters in the ALT column:
            vcf_i.altbase = re.sub(r'[^\w,.]', '', vcf_i.altbase)

            # Eliminate adjacent duplicate entries in ALT. The word boundaries
            # make sure only whole alleles are compared, so "A,AT" is left
            # alone instead of being collapsed into "AT".
            vcf_i.altbase = re.sub(r'\b(\w+)(?:,\1\b)+', r'\1', vcf_i.altbase)

            # Eliminate ALT entries when it matches with the REF column, to address vcf-validator complaints:
            if ',' in vcf_i.altbase:
                alt_item = vcf_i.altbase.split(',')

                if vcf_i.refbase in alt_item:
                    # Only the first occurrence is dropped.
                    alt_item.remove(vcf_i.refbase)

                # To fix this vcf-validator complaints:
                # Could not parse the allele(s) [GTC], first base does not match the reference
                # The first ALT is always kept. The list is rebuilt rather
                # than popped from while iterating, because popping shifts the
                # indices of the alleles that have not been examined yet.
                alt_item = alt_item[:1] + [
                    alt_i for alt_i in alt_item[1:]
                    if alt_i.startswith(vcf_i.refbase)
                ]
                vcf_i.altbase = ','.join(alt_item)

            # Combine AD:RD into AD:
            format_items = vcf_i.get_sample_variable()
            if 'AD' in format_items and 'RD' in format_items:

                rd_sm1 = vcf_i.get_sample_value('RD', 0)
                ad_sm1 = vcf_i.get_sample_value('AD', 0)

                try:
                    rd_sm2 = vcf_i.get_sample_value('RD', 1)
                    ad_sm2 = vcf_i.get_sample_value('AD', 1)
                except IndexError:
                    rd_sm2 = ad_sm2 = 0

                # Both indices refer to the layout *before* RD is removed, so
                # AD has to be overwritten before RD is popped.
                idx_ad = format_items.index('AD')
                idx_rd = format_items.index('RD')
                format_items.pop(idx_rd)
                vcf_i.field = ':'.join(format_items)

                item_normal = vcf_i.samples[0].split(':')
                item_normal[idx_ad] = '{},{}'.format(rd_sm1, ad_sm1)
                item_normal.pop(idx_rd)
                vcf_i.samples[0] = ':'.join(item_normal)

                if paired:

                    item_tumor = vcf_i.samples[1].split(':')
                    item_tumor[idx_ad] = '{},{}'.format(rd_sm2, ad_sm2)
                    item_tumor.pop(idx_rd)
                    vcf_i.samples[1] = ':'.join(item_tumor)

            # Reform the line:
            line_i = '\t'.join(
                (vcf_i.chromosome, str(vcf_i.position), vcf_i.identifier,
                 vcf_i.refbase, vcf_i.altbase, vcf_i.qual, vcf_i.filters,
                 vcf_i.info, vcf_i.field, '\t'.join(vcf_i.samples)))

            # VarScan2 output a line with REF allele as "M". GATK CombineVariants complain about that.
            if not re.search(r'[^GCTAU]', vcf_i.refbase, re.I):
                vcfout.write(line_i + '\n')

            # Next line:
            line_i = vcf.readline().rstrip()
Ejemplo n.º 8
0
def convert(infile, snv_out, indel_out, is_tnscope):
    """Split a VCF into an SNV file and an indel file, one ALT per record.

    ``##`` header lines and the ``#CHROM`` line are copied to both outputs;
    the ``##INFO=<ID=SOR,`` declaration has ``Float`` replaced by ``String``.

    Records with a single ALT are copied unchanged: to the SNV file when REF
    and ALT are both one base long, to the indel file when exactly one of
    them is one base long, and dropped otherwise.

    Records with several comma-separated ALT alleles are expanded into one
    record per allele. The INFO column of those records is rebuilt from the
    per-allele NLOD/TLOD values plus STR/ECNT, and a GT that is neither
    ``0/0`` nor ``0/1`` is rewritten to ``0/1`` in each of the two sample
    columns. The same SNV/indel routing is then applied per allele.

    Reading stops at the first empty line or at end of file.

    Args:
        infile: Path of the input VCF, opened with ``genome.open_textfile``.
        snv_out: Path of the SNV VCF file to write.
        indel_out: Path of the indel VCF file to write.
        is_tnscope: When true, the ``##normal_sample=`` / ``##tumor_sample=``
            header lines are not required.

    Raises:
        ValueError: If ``is_tnscope`` is false and the header does not name
            both samples, or a named sample is absent from ``#CHROM``.
    """

    info_to_split = 'NLOD', 'TLOD'
    info_to_keep = 'STR', 'ECNT'

    normal_name = tumor_name = None

    # Distinct handle names keep the path arguments from being shadowed.
    with genome.open_textfile(infile) as vcf_in, \
            open(snv_out, 'w') as snv_fh, \
            open(indel_out, 'w') as indel_fh:

        line_i = vcf_in.readline().rstrip()

        while line_i.startswith('##'):

            if line_i.startswith('##normal_sample='):
                normal_name = line_i.split('=')[1]

            if line_i.startswith('##tumor_sample='):
                tumor_name = line_i.split('=')[1]

            if line_i.startswith('##INFO=<ID=SOR,'):
                line_i = re.sub(r'Float', 'String', line_i)

            snv_fh.write(line_i + '\n')
            indel_fh.write(line_i + '\n')

            line_i = vcf_in.readline().rstrip()

        # This line will be #CHROM:
        snv_fh.write(line_i + '\n')
        indel_fh.write(line_i + '\n')
        header = line_i.split('\t')

        # The indices are not used further down; the lookups are kept because
        # they reject a file whose declared sample names are not in #CHROM.
        if is_tnscope:
            normal_index, tumor_index = 1, 0
        else:
            if normal_name is None or tumor_name is None:
                raise ValueError(
                    'The VCF header has no ##normal_sample= and/or ##tumor_sample= line.'
                )
            normal_index = header.index(normal_name) - 9
            tumor_index = header.index(tumor_name) - 9

        # This will be the first variant line:
        line_i = vcf_in.readline().rstrip()

        while line_i:

            vcf_i = genome.Vcf_line(line_i)

            if ',' not in vcf_i.altbase:

                if len(vcf_i.refbase) == 1 and len(vcf_i.altbase) == 1:
                    snv_fh.write(line_i + '\n')
                elif len(vcf_i.refbase) == 1 or len(vcf_i.altbase) == 1:
                    indel_fh.write(line_i + '\n')

            else:
                alt_bases = vcf_i.altbase.split(',')

                # Per-allele INFO values; None marks a key whose value could
                # not be split.
                measures = []
                for measure_i in info_to_split:
                    try:
                        measures.append(
                            vcf_i.get_info_value(measure_i).split(','))
                    except AttributeError:
                        measures.append(None)

                still_measures = []
                for measure_i in info_to_keep:
                    try:
                        still_measures.append(vcf_i.get_info_value(measure_i))
                    except AttributeError:
                        still_measures.append(None)

                # These do not depend on the allele, so build them once.
                # Both "missing" markers (None and False) are skipped.
                still_infos = [
                    '{}={}'.format(info_variable, info_value)
                    for info_variable, info_value in zip(
                        info_to_keep, still_measures)
                    if info_value not in (None, False)
                ]

                # Each sample is judged by its OWN genotype. The original
                # tested GT0 in the second sample's condition.
                sample_columns = []
                for sample_idx in (0, 1):
                    genotype = vcf_i.get_sample_value('GT', idx=sample_idx)
                    if genotype not in ('0/0', '0/1'):
                        sample_columns.append(
                            re.sub(r'^[^:]+', '0/1',
                                   vcf_i.samples[sample_idx]))
                    else:
                        sample_columns.append(vcf_i.samples[sample_idx])

                for ith_base, altbase_i in enumerate(alt_bases):

                    split_infos = [
                        '{}={}'.format(info_variable, info_value[ith_base])
                        for info_variable, info_value in zip(
                            info_to_split, measures)
                        if info_value is not None
                    ]

                    info_string = ';'.join(split_infos + still_infos)

                    new_line = '\t'.join(
                        (vcf_i.chromosome, str(vcf_i.position),
                         vcf_i.identifier, vcf_i.refbase, altbase_i,
                         vcf_i.qual, vcf_i.filters, info_string, vcf_i.field,
                         sample_columns[0], sample_columns[1]))

                    if len(vcf_i.refbase) == 1 and len(altbase_i) == 1:
                        snv_fh.write(new_line + '\n')
                    elif len(vcf_i.refbase) == 1 or len(altbase_i) == 1:
                        indel_fh.write(new_line + '\n')

            line_i = vcf_in.readline().rstrip()
Ejemplo n.º 9
0
        vcf_out.write(line_i + '\n')
        line_i = vcf_in.readline().rstrip()

    vcf_out.write(line_i + '\n')

    # This line will be #CHROM:
    header = line_i.split('\t')
    sample_index = header.index(sample) - 9

    # This will be the first variant line:
    line_i = vcf_in.readline().rstrip()

    while line_i:

        vcf_i = genome.Vcf_line(line_i)

        if vcf_i.filters == 'PASS':

            refMQ = float(vcf_i.get_sample_value('refMQ', sample_index))
            altMQ = float(vcf_i.get_sample_value('altMQ', sample_index))
            refBQ = float(vcf_i.get_sample_value('refBQ', sample_index))
            altBQ = float(vcf_i.get_sample_value('altBQ', sample_index))
            refNM = float(vcf_i.get_sample_value('refNM', sample_index))
            altNM = float(vcf_i.get_sample_value('altNM', sample_index))
            fetSB = float(vcf_i.get_sample_value('fetSB', sample_index))
            fetCD = float(vcf_i.get_sample_value('fetCD', sample_index))
            zMQ = float(vcf_i.get_sample_value('zMQ', sample_index))
            zBQ = float(vcf_i.get_sample_value('zBQ', sample_index))
            MQ0 = int(vcf_i.get_sample_value('MQ0', sample_index))
            VAF = float(vcf_i.get_sample_value('VAF', sample_index))
Ejemplo n.º 10
0
def convert(infile, snv_out, indel_out):
    """Split a single-sample VCF into SNV and indel files, one ALT per record.

    ``##`` header lines and the ``#CHROM`` line are each copied once to both
    outputs. A FILTER value that is exactly ``germline_risk`` is rewritten to
    ``PASS`` (the in-line comment explains there is no matched normal).

    Records with a single ALT are otherwise copied unchanged. Records with
    several comma-separated ALT alleles are expanded into one record per
    allele; their INFO column is rebuilt from the per-allele NLOD/TLOD values
    plus STR/ECNT, and a GT that is neither ``0/0`` nor ``0/1`` is rewritten
    to ``0/1`` in the first sample column.

    A record goes to the SNV file when REF and ALT are both one base long and
    to the indel file in every other case.

    Reading stops at the first empty line or at end of file.

    Args:
        infile: Path of the input VCF, opened with ``genome.open_textfile``.
        snv_out: Path of the SNV VCF file to write.
        indel_out: Path of the indel VCF file to write.
    """

    info_to_split = 'NLOD', 'TLOD'
    info_to_keep = 'STR', 'ECNT'

    # Distinct handle names keep the path arguments from being shadowed.
    with genome.open_textfile(infile) as vcf_in, \
            open(snv_out, 'w') as snv_fh, \
            open(indel_out, 'w') as indel_fh:

        line_i = vcf_in.readline().rstrip()

        # Each header line is written exactly once. The original wrote the
        # line both before and after reading the next one, which duplicated
        # every "##" line but the first.
        while line_i.startswith('##'):

            snv_fh.write(line_i + '\n')
            indel_fh.write(line_i + '\n')

            line_i = vcf_in.readline().rstrip()

        # This line will be #CHROM:
        snv_fh.write(line_i + '\n')
        indel_fh.write(line_i + '\n')

        # This will be the first variant line:
        line_i = vcf_in.readline().rstrip()

        while line_i:

            vcf_i = genome.Vcf_line(line_i)

            # If "germlinerisk" is the only flag, then make it PASS since there is no matched normal
            if vcf_i.filters == 'germline_risk':
                vcf_i.filters = 'PASS'

            if ',' not in vcf_i.altbase:

                # The raw columns are reused so that everything except the
                # FILTER column is passed through untouched.
                item = line_i.split('\t')
                if item[6] == 'germline_risk':
                    item[6] = 'PASS'

                new_line = '\t'.join(item)

                if len(vcf_i.refbase) == 1 and len(vcf_i.altbase) == 1:
                    snv_fh.write(new_line + '\n')
                else:
                    indel_fh.write(new_line + '\n')

            else:
                alt_bases = vcf_i.altbase.split(',')

                # Per-allele INFO values; None marks a key whose value could
                # not be split.
                measures = []
                for measure_i in info_to_split:
                    try:
                        measures.append(
                            vcf_i.get_info_value(measure_i).split(','))
                    except AttributeError:
                        measures.append(None)

                still_measures = []
                for measure_i in info_to_keep:
                    try:
                        still_measures.append(vcf_i.get_info_value(measure_i))
                    except AttributeError:
                        still_measures.append(None)

                # These do not depend on the allele, so build them once.
                # Both "missing" markers (None and False) are skipped.
                still_infos = [
                    '{}={}'.format(info_variable, info_value)
                    for info_variable, info_value in zip(
                        info_to_keep, still_measures)
                    if info_value not in (None, False)
                ]

                GT0 = vcf_i.get_sample_value('GT', idx=0)
                if GT0 not in ('0/0', '0/1'):
                    sample_0 = re.sub(r'^[^:]+', '0/1', vcf_i.samples[0])
                else:
                    sample_0 = vcf_i.samples[0]

                for ith_base, altbase_i in enumerate(alt_bases):

                    split_infos = [
                        '{}={}'.format(info_variable, info_value[ith_base])
                        for info_variable, info_value in zip(
                            info_to_split, measures)
                        if info_value is not None
                    ]

                    info_string = ';'.join(split_infos + still_infos)

                    new_line = '\t'.join(
                        (vcf_i.chromosome, str(vcf_i.position),
                         vcf_i.identifier, vcf_i.refbase, altbase_i,
                         vcf_i.qual, vcf_i.filters, info_string, vcf_i.field,
                         sample_0))

                    if len(vcf_i.refbase) == 1 and len(altbase_i) == 1:
                        snv_fh.write(new_line + '\n')
                    else:
                        indel_fh.write(new_line + '\n')

            line_i = vcf_in.readline().rstrip()
Ejemplo n.º 11
0
 
 while my_line.startswith('##'):
     outfile.write( my_line + '\n' )
     my_line = infile.readline().rstrip()
     
 # This is to read through and copy the #CHROM line
 assert my_line.startswith('#CHROM')
 outfile.write('##INFO=<ID=COORDINATES,Number=.,Type=Integer,Description="Coordinates of the bases">\n')
 outfile.write('##INFO=<ID=PDP,Number=.,Type=Integer,Description="Phased DP, one for reference, and each of the variant calls.">\n')
 outfile.write( my_line + '\n' )
 my_line = infile.readline().rstrip()
 
 # Get into the bulk of the VCF file
 while my_line:
     
     my_vcf = genome.Vcf_line( my_line )
     
     if len( my_vcf.refbase ) == 1:
         my_coordinates = [(my_vcf.chromosome, my_vcf.position)]
         base_options = [[ my_vcf.refbase ]]
         base_options[0].extend( my_vcf.altbase.split(',') )
         vcf_lines = [ my_line ]
         filter_status = [ my_vcf.filters ]
         
     elif len( my_vcf.refbase ) == len( my_vcf.altbase ):
         pass
     
     while (my_coordinates[-1][0] == my_vcf.chromosome) and (my_vcf.position - my_coordinates[-1][1] <= threshold):
         
         my_line = infile.readline().rstrip()
         my_vcf = genome.Vcf_line( my_line )
Ejemplo n.º 12
0
def split_into_snv_and_indel(infile, snv_out, indel_out):
    """Split a VCF into an SNV file and an indel file, one ALT per record.

    Header lines (anything starting with ``#``) are copied to both outputs.

    A record whose ALT holds a single allele is copied unchanged: to the SNV
    file when REF and ALT are both one base long, to the indel file when
    exactly one of them is one base long, and dropped otherwise.

    A record whose ALT holds several alleles, separated by ``,`` and/or
    ``/``, is expanded into one record per allele with the same routing.
    An allele where both REF and ALT are longer than one base is passed to
    ``complex2indel.translate``; if that yields a REF/ALT pair sharing the
    first base with one side exactly one base long, the translated record is
    written to the indel file with POS shifted by the returned offset.

    Reading stops at the first empty line or at end of file.

    Args:
        infile: Path of the input VCF, opened with ``genome.open_textfile``.
        snv_out: Path of the SNV VCF file to write.
        indel_out: Path of the indel VCF file to write.
    """

    # Distinct handle names keep the path arguments from being shadowed.
    with genome.open_textfile(infile) as vcf_in, \
            open(snv_out, 'w') as snv_fh, \
            open(indel_out, 'w') as indel_fh:

        line_i = vcf_in.readline().rstrip()

        while line_i.startswith('#'):

            snv_fh.write(line_i + '\n')
            indel_fh.write(line_i + '\n')

            line_i = vcf_in.readline().rstrip()

        while line_i:

            vcf_i = genome.Vcf_line(line_i)
            ref_is_single = len(vcf_i.refbase) == 1

            if (',' not in vcf_i.altbase) and ('/' not in vcf_i.altbase):

                if ref_is_single and len(vcf_i.altbase) == 1:
                    snv_fh.write(line_i + '\n')
                elif ref_is_single or len(vcf_i.altbase) == 1:
                    indel_fh.write(line_i + '\n')

            else:

                item = line_i.split('\t')

                # Normalising "/" to "," first means an ALT that mixes both
                # separators is still broken into individual alleles.
                alt_bases = vcf_i.altbase.replace('/', ',').split(',')

                for altbase_i in alt_bases:

                    # Work on a copy so every allele starts from the
                    # original columns.
                    item_j = list(item)

                    if ref_is_single and len(altbase_i) == 1:
                        item_j[4] = altbase_i
                        snv_fh.write('\t'.join(item_j) + '\n')

                    elif ref_is_single or len(altbase_i) == 1:
                        item_j[4] = altbase_i
                        indel_fh.write('\t'.join(item_j) + '\n')

                    else:
                        complex_variant = complex2indel.translate(
                            vcf_i.refbase, altbase_i)

                        if not complex_variant:
                            continue

                        (new_ref, new_alt), offset = complex_variant

                        if new_ref[0] == new_alt[0] and (
                                len(new_ref) == 1 or len(new_alt) == 1):

                            item_j[3] = new_ref
                            item_j[4] = new_alt

                            # This *may* cause the output VCF file to go out of order
                            if offset != 0:
                                item_j[1] = str(int(item[1]) + offset)

                            indel_fh.write('\t'.join(item_j) + '\n')

            line_i = vcf_in.readline().rstrip()
Ejemplo n.º 13
0
def convert(infile, snv_out, indel_out):
    """Rewrite a VarDict VCF into separate SNV and indel VCF files.

    Header handling: selected ``##`` declarations are patched (LSEQ/RSEQ
    ``Number``, BIAS ``Number``, PSTD/QSTD/SOR ``Type``), extra INFO flag and
    DP4 FORMAT declarations are appended, and the ``#CHROM`` line is copied
    unchanged. A ``#CHROM`` line with 10 columns means one sample, 11 columns
    means two samples.

    Record handling (records whose REF equals ALT are dropped):

    * non-GCTA characters in REF and ALT become ``N``;
    * the RD and ALD (or AD, when ALD is absent) FORMAT values are removed
      and appended as a single ``DP4`` value ``RD,ALD``;
    * ``END=<digits>`` is removed from INFO;
    * with two samples, the second input sample column is written first;
    * ``TYPE=SNV`` goes to the SNV file, ``TYPE=Deletion`` and
      ``TYPE=Insertion`` go to the indel file, and ``TYPE=Complex`` with REF
      and ALT of equal length is written to the SNV file as one record per
      differing base. Everything else is dropped.

    Reading stops at the first empty line or at end of file.

    Args:
        infile: Path of the input VCF, opened with ``genome.open_textfile``.
        snv_out: Path of the SNV VCF file to write.
        indel_out: Path of the indel VCF file to write.

    Raises:
        ValueError: If a record has to be converted but the ``#CHROM`` line
            had neither 10 nor 11 columns, or if FORMAT lacks RD or AD/ALD.
    """

    addition_header = (
        '##INFO=<ID=Germline,Number=0,Type=Flag,Description="VarDict Germline">',
        '##INFO=<ID=StrongSomatic,Number=0,Type=Flag,Description="VarDict Strong Somatic">',
        '##INFO=<ID=LikelySomatic,Number=0,Type=Flag,Description="VarDict Likely Somatic">',
        '##INFO=<ID=LikelyLOH,Number=0,Type=Flag,Description="VarDict Likely LOH">',
        '##INFO=<ID=StrongLOH,Number=0,Type=Flag,Description="VarDict Strong LOH">',
        '##INFO=<ID=AFDiff,Number=0,Type=Flag,Description="VarDict AF Diff">',
        '##INFO=<ID=Deletion,Number=0,Type=Flag,Description="VarDict Deletion">',
        '##INFO=<ID=SampleSpecific,Number=0,Type=Flag,Description="VarDict SampleSpecific">',
        '##FORMAT=<ID=DP4,Number=4,Type=Integer,Description="# high-quality ref-forward bases, ref-reverse, alt-forward and alt-reverse bases">',
    )

    def merge_depths(sample_string, idx_rd, idx_ad):
        """Move a sample's RD and AD/ALD values into a trailing DP4 value.

        ``idx_ad`` is the position of AD/ALD after RD has been removed.
        """
        values = sample_string.split(':')
        dp4 = values.pop(idx_rd)
        dp4 = dp4 + ',' + values.pop(idx_ad)
        values.append(dp4)
        return ':'.join(values)

    with genome.open_textfile(infile) as vcf, open(
            snv_out, 'w') as snpout, open(indel_out, 'w') as indelout:

        line_i = vcf.readline().rstrip()

        while line_i.startswith('##'):

            if re.match(r'^##INFO=<ID=(LSEQ|RSEQ),', line_i):
                line_i = line_i.replace('Number=G', 'Number=1')

            elif line_i.startswith('##FORMAT=<ID=BIAS,'):
                line_i = line_i.replace('Number=1', 'Number=.')

            elif line_i.startswith(('##FORMAT=<ID=PSTD,',
                                    '##FORMAT=<ID=QSTD,',
                                    '##INFO=<ID=SOR,')):
                line_i = line_i.replace('Type=Float', 'Type=String')

            snpout.write(line_i + '\n')
            indelout.write(line_i + '\n')
            line_i = vcf.readline().rstrip()

        for item_i in addition_header:
            snpout.write(item_i + '\n')
            indelout.write(item_i + '\n')

        # This is the #CHROM line
        num_header = len(line_i.split('\t'))

        if num_header == 10:
            paired = False
        elif num_header == 11:
            paired = True
        else:
            # Unknown layout: only an error once a record actually needs it.
            paired = None

        # NOTE(review): the #CHROM line is copied as-is, while two-sample
        # records below are written with the two sample columns swapped.
        # Confirm downstream consumers expect that column order.
        snpout.write(line_i + '\n')
        indelout.write(line_i + '\n')

        line_i = vcf.readline().rstrip()
        while line_i:

            vcfcall = genome.Vcf_line(line_i)

            # Fix the occasional error where ALT and REF are the same:
            if vcfcall.refbase != vcfcall.altbase:

                if paired is None:
                    raise ValueError(
                        'Expected 10 or 11 columns in the #CHROM line, but found {}.'
                        .format(num_header))

                # In the REF/ALT field, non-GCTA characters should be changed to N to fit the VCF standard:
                vcfcall.refbase = re.sub(r'[^GCTA]', 'N', vcfcall.refbase,
                                         flags=re.I)
                vcfcall.altbase = re.sub(r'[^GCTA]', 'N', vcfcall.altbase,
                                         flags=re.I)

                ## To be consistent with other tools, Combine AD:RD or ALD:RD into DP4.
                # VarDict puts Tumor first and Normal next
                # Also, the old version has no ALD (somatic.pl). The new version has ALD (paired.pl).
                format_field = vcfcall.field.split(':')
                idx_rd = format_field.index('RD')
                format_field.pop(idx_rd)

                # As right now, the old version has no ALD. The new version has ALD.
                # If the VCF has no ALD, then the AD means the same thing ALD is supposed to mean.
                # The index is looked up AFTER RD was removed, matching the
                # order in which merge_depths pops the sample values.
                try:
                    idx_ad = format_field.index('ALD')
                except ValueError:
                    idx_ad = format_field.index('AD')
                format_field.pop(idx_ad)

                format_field.append('DP4')
                new_format_string = ':'.join(format_field)

                tumor_sample = merge_depths(vcfcall.samples[0], idx_rd,
                                            idx_ad)
                if paired:
                    normal_sample = merge_depths(vcfcall.samples[1], idx_rd,
                                                 idx_ad)
                    sample_columns = [normal_sample, tumor_sample]
                else:
                    sample_columns = [tumor_sample]

                # VarDict's END tag has caused problem with GATK CombineVariants. Simply get rid of it.
                # Filtering whole INFO entries also removes an END tag that
                # is the last entry (no trailing ";") and leaves keys that
                # merely end in "END" untouched.
                vcfcall.info = ';'.join(
                    entry for entry in vcfcall.info.split(';')
                    if not re.fullmatch(r'END=[0-9]+', entry)) or '.'

                trailing_columns = [
                    vcfcall.qual, vcfcall.filters, vcfcall.info,
                    new_format_string
                ] + sample_columns

                # Write to snp and indel into different files:
                if 'TYPE=SNV' in vcfcall.info:
                    line_i = '\t'.join([
                        vcfcall.chromosome,
                        str(vcfcall.position), vcfcall.identifier,
                        vcfcall.refbase, vcfcall.altbase
                    ] + trailing_columns)
                    snpout.write(line_i + '\n')

                elif 'TYPE=Deletion' in vcfcall.info or 'TYPE=Insertion' in vcfcall.info:
                    line_i = '\t'.join([
                        vcfcall.chromosome,
                        str(vcfcall.position), vcfcall.identifier,
                        vcfcall.refbase, vcfcall.altbase
                    ] + trailing_columns)
                    indelout.write(line_i + '\n')

                elif 'TYPE=Complex' in vcfcall.info and (len(
                        vcfcall.refbase) == len(vcfcall.altbase)):

                    # Emit one SNV record per position where the bases differ.
                    for offset, (ref_i, alt_i) in enumerate(
                            zip(vcfcall.refbase, vcfcall.altbase)):

                        if ref_i != alt_i:
                            line_i = '\t'.join([
                                vcfcall.chromosome,
                                str(vcfcall.position + offset),
                                vcfcall.identifier, ref_i, alt_i
                            ] + trailing_columns)

                            snpout.write(line_i + '\n')

            # Continue:
            line_i = vcf.readline().rstrip()
Ejemplo n.º 14
0
def vcfs2variants(vcf_files, bam_files, sample_names):
    """Collect the variants of several VCF files into one dictionary.

    The three arguments are parallel sequences: the i-th VCF file is read
    together with the i-th BAM file and reported under the i-th sample name.

    Each variant is keyed by ``(contig, position, ref, alt)``. The first time
    a key is seen, its ``GENES``, ``AAChange``, ``TRANSCRIPT`` and
    ``DATABASE`` entries are filled from ``extract_snpEff`` and
    ``extract_dbsnp_cosmic``. For every sample in which the variant appears,
    an entry named after the sample holds ``FILTER`` (the FILTER column split
    on ``;``), ``VAF``, ``VDP`` and ``DP`` as obtained from ``vaf_from_bam``.
    ``VAF`` is ``math.nan`` when the total depth is zero.

    Reading of each VCF stops at the first empty line or at end of file.

    Args:
        vcf_files: Paths of the VCF files.
        bam_files: Paths of the BAM files, one per VCF file.
        sample_names: Sample names, one per VCF file.

    Returns:
        dict: The variant dictionary described above.

    Raises:
        AssertionError: If the three sequences differ in length.
    """

    assert len(vcf_files) == len(sample_names) == len(bam_files)

    variantDict = {}
    for vcf_file_i, bam_file_i, sample_name_i in zip(vcf_files, bam_files,
                                                     sample_names):

        with genome.open_textfile(vcf_file_i) as vcf, pysam.AlignmentFile(
                bam_file_i) as bam:

            line_i = vcf.readline().rstrip()
            while line_i.startswith('#'):
                line_i = vcf.readline().rstrip()

            while line_i:

                # Only the raw columns are needed here; no separate parsed
                # record object is built for this loop.
                item = line_i.split('\t')

                contig_i = item[0]
                pos_i = int(item[1])
                refbase = item[3]
                altbase = item[4]
                filter_i = item[6].split(';')

                genes, amino_acid_changes, txn_ids = extract_snpEff(line_i)
                dbsnp_cosmic_ids = extract_dbsnp_cosmic(line_i)

                variant_id = (contig_i, pos_i, refbase, altbase)

                vdp, rdp, odp, totaldp = vaf_from_bam(bam, (contig_i, pos_i),
                                                      refbase, altbase, 1)

                try:
                    vaf_i = vdp / totaldp
                except ZeroDivisionError:
                    vaf_i = math.nan

                # Annotations are stored once, from the first file that
                # reports the variant.
                if variant_id not in variantDict:
                    variantDict[variant_id] = {
                        'GENES': genes,
                        'AAChange': amino_acid_changes,
                        'TRANSCRIPT': txn_ids,
                        'DATABASE': dbsnp_cosmic_ids,
                    }

                variantDict[variant_id][sample_name_i] = {
                    'FILTER': filter_i,
                    'VAF': vaf_i,
                    'VDP': vdp,
                    'DP': totaldp
                }

                line_i = vcf.readline().rstrip()

    return variantDict
Ejemplo n.º 15
0
def catch_up(line_1, line_2, file_1, file_2, output_vcf, id_1, id_2, id_12):
    """Advance two VCF streams until ``whoisbehind`` tells the loop to stop.

    While ``whoisbehind`` returns 0, the current record of the first file is
    tagged with ``id_1`` and written (unless its FILTER column is
    ``PrintEmALL``), and the next line of ``file_1`` is read. While it
    returns 1, the current record of the second file is tagged with ``id_2``,
    always written, and the next line of ``file_2`` is read. The loop ends
    when ``whoisbehind`` returns 10.

    Tagging appends the tag to the ``;``-separated ID column and removes a
    leading ``.`` placeholder.

    Relies on the module-level names ``idx_id``, ``idx_filter`` and
    ``whoisbehind``.

    Args:
        line_1: Current line of the first VCF.
        line_2: Current line of the second VCF.
        file_1: Open handle of the first VCF, positioned after ``line_1``.
        file_2: Open handle of the second VCF, positioned after ``line_2``.
        output_vcf: Open handle that receives the tagged records.
        id_1: Tag for records written from the first file.
        id_2: Tag for records written from the second file.
        id_12: Tag for the record written once the loop has stopped.

    Returns:
        ``42`` when the chromosome of both current records is the empty
        string (presumably both files are exhausted, TODO confirm against
        ``genome.Vcf_line``). Otherwise the first file's record is tagged
        with ``id_12`` and written, and the tuple ``(line_1, line_2)`` is
        returned, with ``line_1`` being the tagged line without a trailing
        newline.
    """

    def tag_id(item, tag):
        """Append ``tag`` to the ID column of ``item``, dropping a leading '.'."""
        id_item = item[idx_id].split(';')
        id_item.append(tag)
        # The dot is escaped so that only the "." placeholder is stripped,
        # never a real one-character ID.
        item[idx_id] = re.sub(r'^\.;', '', ';'.join(id_item))

    vcf_1 = genome.Vcf_line(line_1)
    vcf_2 = genome.Vcf_line(line_2)

    coord_1 = [vcf_1.chromosome, vcf_1.position]
    coord_2 = [vcf_2.chromosome, vcf_2.position]

    # NOTE(review): looks like leftover debugging output; kept because
    # callers may rely on it being on stdout. Confirm before removing.
    print(coord_1, coord_2)

    is_behind = whoisbehind(coord_1, coord_2)

    # As long as the coordinates are not the same, and both files are not finished:
    while is_behind != 10:

        # If 1st VCF is behind:
        if is_behind == 0:

            item_1 = line_1.rstrip('\n').split('\t')

            # Write, unless...
            if item_1[idx_filter] != 'PrintEmALL':

                tag_id(item_1, id_1)
                line_1 = '\t'.join(item_1)

                output_vcf.write(line_1 + '\n')

            line_1 = file_1.readline()
            vcf_1 = genome.Vcf_line(line_1)
            coord_1 = [vcf_1.chromosome, vcf_1.position]

        # If 2nd VCF is behind:
        elif is_behind == 1:

            item_2 = line_2.rstrip('\n').split('\t')

            # Records of the 2nd file are written regardless of FILTER.
            tag_id(item_2, id_2)
            line_2 = '\t'.join(item_2)

            output_vcf.write(line_2 + '\n')

            line_2 = file_2.readline()
            vcf_2 = genome.Vcf_line(line_2)
            coord_2 = [vcf_2.chromosome, vcf_2.position]

        is_behind = whoisbehind(coord_1, coord_2)

    # Returns the value of the function:
    if coord_1[0] == coord_2[0] == '':
        result = 42
    else:

        item_1 = line_1.rstrip('\n').split('\t')

        tag_id(item_1, id_12)
        line_1 = '\t'.join(item_1)

        output_vcf.write(line_1 + '\n')

        result = (
            line_1,
            line_2,
        )

    return result