コード例 #1
0
ファイル: concat.py プロジェクト: zprh/somaticseq
def vcf(infileList, outfile):
    '''
    Concatenate several VCF-style text files into one output file.

    For each input, the leading run of lines that start with '#' is treated
    as its header. That header is copied only while processing the first
    input; for every later input it is dropped. Every line after the leading
    run is copied from every input, in the order given.

    Args:
        infileList: iterable of input paths, each opened with
            genome.open_textfile.
        outfile: path of the file to write (opened in 'w' mode).

    Returns:
        0, always.
    '''
    keep_header = True

    with open(outfile, 'w') as merged:
        for path in infileList:
            with genome.open_textfile(path) as source:

                in_header = True
                for text in source:
                    if in_header:
                        if text.startswith('#'):
                            if keep_header:
                                merged.write(text)
                            continue
                        # First non-'#' line: the rest of the file is body,
                        # even if a later line happens to start with '#'.
                        in_header = False

                    merged.write(text)

                # Only the first input contributes its header
                keep_header = False

    return 0
コード例 #2
0
ファイル: concat.py プロジェクト: zprh/somaticseq
def tsv(infileList, outfile):
    '''
    Concatenate several TSV files that each begin with one header line.

    The first line of the first input is written out as the header; the
    first line of every subsequent input is discarded. All remaining lines
    of every input are copied in order.

    Args:
        infileList: iterable of input paths, each opened with
            genome.open_textfile.
        outfile: path of the file to write (opened in 'w' mode).

    Returns:
        0, always.
    '''
    with open(outfile, 'w') as merged:
        for file_number, path in enumerate(infileList):
            with genome.open_textfile(path) as source:
                for line_number, row in enumerate(source):
                    # Line 0 is the header; keep it for the first file only
                    if line_number == 0 and file_number > 0:
                        continue
                    merged.write(row)

    return 0
コード例 #3
0
def convert(infile, outfile):
    '''
    Rewrite a VCF-style file so that a genotype entry leads the FORMAT and
    sample columns.

    Lines starting with '##' are copied (with trailing whitespace removed).
    The next line is taken to be the column-header line: it is copied and
    its tab-separated field count defines how many columns a record has.
    For every following record, 'GT:' is prepended to the field at index 8
    and '0/1:' is prepended to each field from index 9 up to that count.
    Processing stops at the first line that is empty after stripping.

    Args:
        infile: input path, opened with genome.open_textfile.
        outfile: path of the file to write (opened in 'w' mode).
    '''
    with genome.open_textfile(infile) as vcf_in, open(outfile, 'w') as vcf_out:

        # Every line is handled with its trailing whitespace removed
        stripped = (raw.rstrip() for raw in vcf_in)

        # Copy the '##' meta lines; the first other line is the column header
        for column_line in stripped:
            if not column_line.startswith('##'):
                break
            vcf_out.write(column_line + '\n')
        else:
            # Input ran out before any non-'##' line was seen
            column_line = ''

        num_columns = len(column_line.split('\t'))
        vcf_out.write(column_line + '\n')

        for record in stripped:
            if not record:
                break

            fields = record.split('\t')
            fields[8] = 'GT:' + fields[8]

            for column in range(9, num_columns):
                fields[column] = '0/1:' + fields[column]

            vcf_out.write('\t'.join(fields) + '\n')
コード例 #4
0
ファイル: copy_TextFile.py プロジェクト: zprh/somaticseq
def copy(infile, outfile):
    '''
    Copy the text of infile into outfile, one line at a time.

    Args:
        infile: input path, opened with genome.open_textfile.
        outfile: path of the file to write (opened in 'w' mode).
    '''
    with genome.open_textfile(infile) as source, open(outfile, 'w') as target:
        for text in source:
            target.write(text)
コード例 #5
0
def convert(infile, outfile):
    '''
    Copy a VCF-style file, replacing unexpected characters in the REF column.

    The leading run of lines starting with '#' is copied as-is (trailing
    whitespace removed). In each following record, every character of the
    field at index 3 (REF) that is not G, C, T or A (case-insensitive) is
    replaced by 'N'. Processing stops at the first line that is empty after
    stripping.

    Args:
        infile: input path, opened with genome.open_textfile.
        outfile: path of the file to write (opened in 'w' mode).
    '''
    ref_column = 3
    non_gcta = re.compile(r'[^GCTA]', flags=re.I)

    with genome.open_textfile(infile) as vcf, open(outfile, 'w') as vcfout:

        in_header = True
        for raw in vcf:
            text = raw.rstrip()

            if in_header:
                if text.startswith('#'):
                    vcfout.write(text + '\n')
                    continue
                in_header = False

            if not text:
                break

            # Non-GCTA characters in REF become N to fit the VCF standard
            columns = text.split('\t')
            columns[ref_column] = non_gcta.sub('N', columns[ref_column])
            vcfout.write('\t'.join(columns) + '\n')
コード例 #6
0
ファイル: modify_Strelka.py プロジェクト: zprh/somaticseq
def convert(infile, outfile):
    '''
    Rewrite a VCF-style file so that a genotype entry leads the FORMAT and
    sample columns.

    '##' lines are copied with trailing whitespace removed. The line after
    them is treated as the column-header line: it is copied and the number
    of its tab-separated fields fixes the record width. In each later
    record 'GT:' is prepended to field index 8 and '0/1:' to every field
    from index 9 up to that width. Reading ends at the first line that is
    empty after stripping.

    Args:
        infile: input path, opened with genome.open_textfile.
        outfile: path of the file to write (opened in 'w' mode).
    '''
    with genome.open_textfile(infile) as vcf_in, open(outfile, 'w') as vcf_out:

        # Lazily strip trailing whitespace from each input line
        stripped = (raw.rstrip() for raw in vcf_in)

        for column_line in stripped:
            if not column_line.startswith('##'):
                # This is the #CHROM line
                break
            vcf_out.write(column_line + '\n')
        else:
            # No non-'##' line exists in the input
            column_line = ''

        num_columns = len(column_line.split('\t'))
        vcf_out.write(column_line + '\n')

        for record in stripped:
            if not record:
                break

            fields = record.split('\t')
            fields[8] = 'GT:' + fields[8]

            for column in range(9, num_columns):
                fields[column] = '0/1:' + fields[column]

            vcf_out.write('\t'.join(fields) + '\n')
コード例 #7
0
def tsv(infileList, outfile, bgzip=False):
    '''
    Concatenate several TSV files that each begin with one header line.

    The header (first line) is written from the first input only; the first
    line of each later input is dropped. All other lines are copied in order.

    Args:
        infileList: iterable of input paths, each opened with
            genome.open_textfile.
        outfile: path of the file to write (opened in 'w' mode).
        bgzip: when true, bgzip_compress(outfile, True) is called after
            writing and the returned name gets a '.gz' suffix.

    Returns:
        outfile, or outfile + '.gz' when bgzip is true.
    '''
    with open(outfile, 'w') as merged:
        for file_number, path in enumerate(infileList):
            with genome.open_textfile(path) as source:
                for line_number, row in enumerate(source):
                    # Line 0 is the header; keep it for the first file only
                    if line_number == 0 and file_number > 0:
                        continue
                    merged.write(row)

    if not bgzip:
        return outfile

    bgzip_compress(outfile, True)
    return outfile + '.gz'
コード例 #8
0
def vcf(infileList, outfile, bgzip=False):
    '''
    Concatenate several VCF-style text files into one output file.

    The leading run of '#' lines of the first input is copied as the header;
    the leading '#' run of each later input is skipped. Lines after that run
    are copied from every input in order.

    Args:
        infileList: iterable of input paths, each opened with
            genome.open_textfile.
        outfile: path of the file to write (opened in 'w' mode).
        bgzip: when true, bgzip_compress(outfile, True) is called after
            writing and the returned name gets a '.gz' suffix.

    Returns:
        outfile, or outfile + '.gz' when bgzip is true.
    '''
    keep_header = True

    with open(outfile, 'w') as merged:
        for path in infileList:
            with genome.open_textfile(path) as source:

                in_header = True
                for text in source:
                    if in_header:
                        if text.startswith('#'):
                            if keep_header:
                                merged.write(text)
                            continue
                        # Header of this file is over from here on
                        in_header = False

                    merged.write(text)

                # Only the first input contributes its header
                keep_header = False

    if not bgzip:
        return outfile

    bgzip_compress(outfile, True)
    return outfile + '.gz'
コード例 #9
0
def spreader(infileList, outfiles, chunk=4, bgzip=False, threads=1):
    '''
    Distribute the lines of the input files across the output files,
    "chunk" lines at a time, in round-robin order.

    E.g., if an input is a fastq file and there are 3 outputs, the first 4
    lines go to the 1st output, the next 4 lines to the 2nd output, the next
    4 lines to the 3rd output, and then the next 4 lines go back to the 1st
    output, and so on. The rotation restarts at the 1st output for each
    input file.

    Args:
        infileList: iterable of input paths, each opened with
            genome.open_textfile.
        outfiles: sequence of output paths (each opened in 'w' mode).
        chunk: number of consecutive lines written to one output before
            moving to the next; must be at least 1.
        bgzip: when true, bgzip_compress is mapped over outfiles in a
            multiprocessing pool after writing.
        threads: number of pool processes used when bgzip is true.

    Returns:
        outfiles when bgzip is false; otherwise the list of values returned
        by bgzip_compress for each output path.

    Raises:
        ValueError: if chunk is smaller than 1 or outfiles is empty (either
            would otherwise make the distribution loop spin forever on a
            non-empty input).
    '''
    if chunk < 1:
        raise ValueError('chunk must be at least 1, got {!r}'.format(chunk))
    if not outfiles:
        raise ValueError('at least one output file is required')

    outs = []
    try:
        for out_path in outfiles:
            outs.append(open(out_path, 'w'))

        for infile in infileList:
            with genome.open_textfile(infile) as text_in:
                line_i = text_in.readline()
                while line_i:
                    for out_i in outs:
                        for _ in range(chunk):
                            # Input exhausted in the middle of a round
                            if not line_i:
                                break
                            out_i.write(line_i)
                            line_i = text_in.readline()
    finally:
        # Close whatever was opened, even if reading or writing failed
        for out_i in outs:
            out_i.close()

    if bgzip:
        pool = Pool(processes=threads)
        try:
            bash_async = pool.map_async(bgzip_compress, outfiles)
            actual_outfiles = bash_async.get()
        finally:
            pool.close()
    else:
        actual_outfiles = outfiles

    return actual_outfiles
コード例 #10
0
def convert(infile, outfile):
    '''
    Copy a VCF-style file, replacing unexpected characters in the REF column.

    Leading '#' lines are copied with trailing whitespace removed. For each
    record after them, any character in field index 3 (REF) other than
    G, C, T or A (case-insensitive) is turned into 'N'. Reading ends at the
    first line that is empty after stripping.

    Args:
        infile: input path, opened with genome.open_textfile.
        outfile: path of the file to write (opened in 'w' mode).
    '''
    ref_column = 3
    non_gcta = re.compile(r'[^GCTA]', flags=re.I)

    with genome.open_textfile(infile) as vcf, open(outfile, 'w') as vcfout:

        in_header = True
        for raw in vcf:
            text = raw.rstrip()

            # VCF header
            if in_header:
                if text.startswith('#'):
                    vcfout.write(text + '\n')
                    continue
                in_header = False

            if not text:
                break

            # Non-GCTA characters in REF become N to fit the VCF standard
            columns = text.split('\t')
            columns[ref_column] = non_gcta.sub('N', columns[ref_column])
            vcfout.write('\t'.join(columns) + '\n')
コード例 #11
0
ファイル: copy_TextFile.py プロジェクト: bioinform/somaticseq
def copy(infile, outfile):
    '''
    Write every line of infile to outfile unchanged.

    Args:
        infile: input path, opened with genome.open_textfile.
        outfile: path of the file to write (opened in 'w' mode).
    '''
    with genome.open_textfile(infile) as source, open(outfile, 'w') as target:
        target.writelines(source)
コード例 #12
0
def remove_vcf_illegal_lines(invcf, outvcf):
    '''
    In VarDict v1.7, there are lines with <XXX> in ALT without END in INFO,
    which (per the original author's note) cause bedtools to fail.

    The input is scanned once for such a line. If one is found, the input is
    read a second time and copied to outvcf with every such line left out.
    Both passes stop at the first line that is empty after stripping.

    Args:
        invcf: input VCF path, opened with genome.open_textfile.
        outvcf: path written (in 'w' mode) only when an illegal line exists.

    Returns:
        outvcf if the input has at least one illegal line, otherwise False.
    '''

    def is_illegal(text):
        # Symbolic ALT such as <DEL> combined with a falsy END info value
        record = genome.Vcf_line(text)
        return re.match(r'<\w+>', record.altbase) and (not record.get_info_value('END'))

    # Pass 1: detect whether anything needs to be removed
    found = False
    with genome.open_textfile(invcf) as vcf:
        in_header = True
        for raw in vcf:
            text = raw.rstrip()

            if in_header:
                if text.startswith('#'):
                    continue
                in_header = False

            if not text:
                break

            if is_illegal(text):
                found = True
                break

    if not found:
        return False

    # Pass 2: copy header and every legal record
    with genome.open_textfile(invcf) as vcf, open(outvcf, 'w') as out:
        in_header = True
        for raw in vcf:
            text = raw.rstrip()

            if in_header:
                if text.startswith('#'):
                    out.write(text + '\n')
                    continue
                in_header = False

            if not text:
                break

            if not is_illegal(text):
                out.write(text + '\n')

    return outvcf
コード例 #13
0
ファイル: concat.py プロジェクト: zprh/somaticseq
def bed(infileList, outfile):
    '''
    Concatenate the given files into outfile, copying every line unchanged.

    Args:
        infileList: iterable of input paths, each opened with
            genome.open_textfile.
        outfile: path of the file to write (opened in 'w' mode).

    Returns:
        0, always.
    '''
    with open(outfile, 'w') as merged:
        for path in infileList:
            with genome.open_textfile(path) as source:
                merged.writelines(source)

    return 0
コード例 #14
0
ファイル: splitVcf.py プロジェクト: whiteorchid/somaticseq
def split_into_snv_and_indel(infile, snv_out, indel_out):
    '''
    Split a VCF file into one file of SNVs and one file of indels.

    The leading '#' header lines are written to both outputs. Each record is
    classified by allele length: REF and ALT both of length 1 is an SNV;
    otherwise, if either REF or ALT has length 1, it is an indel; records
    where both are longer than 1 are written to neither file.

    A record whose ALT lists several alleles (separated by ',' or, failing
    that, '/') is expanded into one output line per allele, with column
    index 4 replaced by that single allele, and each line is classified by
    its own allele. Reading stops at the first line that is empty after
    stripping.

    Args:
        infile: input VCF path, opened with genome.open_textfile.
        snv_out: path of the SNV output file (opened in 'w' mode).
        indel_out: path of the indel output file (opened in 'w' mode).
    '''
    # Distinct handle names so the path arguments are not shadowed
    with genome.open_textfile(infile) as vcf_in, \
            open(snv_out, 'w') as snv_handle, \
            open(indel_out, 'w') as indel_handle:

        def emit(ref, alt, text):
            # Route one single-allele line to the SNV or indel output
            if len(ref) == 1 and len(alt) == 1:
                snv_handle.write(text + '\n')
            elif len(ref) == 1 or len(alt) == 1:
                indel_handle.write(text + '\n')

        line_i = vcf_in.readline().rstrip()

        while line_i.startswith('#'):
            snv_handle.write(line_i + '\n')
            indel_handle.write(line_i + '\n')
            line_i = vcf_in.readline().rstrip()

        while line_i:

            vcf_i = genome.Vcf_line(line_i)

            if ',' in vcf_i.altbase:
                alt_bases = vcf_i.altbase.split(',')
            elif '/' in vcf_i.altbase:
                alt_bases = vcf_i.altbase.split('/')
            else:
                alt_bases = None

            if alt_bases is None:
                emit(vcf_i.refbase, vcf_i.altbase, line_i)
            else:
                item = line_i.split('\t')
                for altbase_i in alt_bases:
                    item[4] = altbase_i
                    # Classify by this allele, not by the full multi-allelic
                    # ALT string (which is never of length 1).
                    emit(vcf_i.refbase, altbase_i, '\t'.join(item))

            line_i = vcf_in.readline().rstrip()
コード例 #15
0
def bed(infileList, outfile, bgzip=False):
    '''
    Concatenate the given files into outfile, copying every line unchanged.

    Args:
        infileList: iterable of input paths, each opened with
            genome.open_textfile.
        outfile: path of the file to write (opened in 'w' mode).
        bgzip: when true, bgzip_compress(outfile, True) is called after
            writing and the returned name gets a '.gz' suffix.

    Returns:
        outfile, or outfile + '.gz' when bgzip is true.
    '''
    with open(outfile, 'w') as merged:
        for path in infileList:
            with genome.open_textfile(path) as source:
                merged.writelines(source)

    if not bgzip:
        return outfile

    bgzip_compress(outfile, True)
    return outfile + '.gz'
コード例 #16
0
ファイル: concat.py プロジェクト: bioinform/somaticseq
def tsv(infileList, outfile):
    '''
    Concatenate several TSV files that each begin with one header line.

    Only the first input's first line is written as the header; the first
    line of each later input is skipped. Every other line is copied in order.

    Args:
        infileList: iterable of input paths, each opened with
            genome.open_textfile.
        outfile: path of the file to write (opened in 'w' mode).
    '''
    with open(outfile, 'w') as merged:
        for file_number, path in enumerate(infileList):
            with genome.open_textfile(path) as source:
                for line_number, row in enumerate(source):
                    # Line 0 is the header; keep it for the first file only
                    if line_number == 0 and file_number > 0:
                        continue
                    merged.write(row)
コード例 #17
0
ファイル: concat.py プロジェクト: bioinform/somaticseq
def vcf(infileList, outfile):
    '''
    Concatenate several VCF-style text files into one output file.

    The leading '#' lines of the first input become the header of the
    output; the leading '#' lines of later inputs are dropped. Everything
    after that leading run is copied from each input in order.

    Args:
        infileList: iterable of input paths, each opened with
            genome.open_textfile.
        outfile: path of the file to write (opened in 'w' mode).
    '''
    keep_header = True

    with open(outfile, 'w') as merged:
        for path in infileList:
            with genome.open_textfile(path) as source:

                in_header = True
                for text in source:
                    if in_header:
                        if text.startswith('#'):
                            if keep_header:
                                merged.write(text)
                            continue
                        # Header of this file is over from here on
                        in_header = False

                    merged.write(text)

                # Only the first input contributes its header
                keep_header = False
コード例 #18
0
ファイル: modify_VarScan2.py プロジェクト: zprh/somaticseq
def _clean_alt_alleles(refbase, altbase):
    '''
    Return a comma-joined ALT string with duplicate and non-compliant
    alleles removed.

    Duplicates are dropped allele by allele (first occurrence kept), so an
    allele that merely starts with another one (e.g. "TA,A") is left intact.
    When more than one allele remains, an allele equal to refbase is removed,
    and every allele after the first that does not start with refbase is
    removed as well.
    '''
    alleles = []
    for allele in altbase.split(','):
        if allele not in alleles:
            alleles.append(allele)

    if len(alleles) > 1:
        # ALT must not repeat REF (vcf-validator complaint)
        if refbase in alleles:
            alleles.remove(refbase)

        # vcf-validator: "Could not parse the allele(s) [GTC], first base
        # does not match the reference". Build a new list rather than
        # popping while iterating, which shifts the remaining indices.
        alleles = alleles[:1] + [
            allele for allele in alleles[1:] if allele.startswith(refbase)
        ]

    return ','.join(alleles)


def convert(infile, outfile):
    '''
    Rewrite a VarScan2 VCF file into a more standards-compliant form.

    Header lines are copied, except that the ##FORMAT definitions of DP4 and
    AD are replaced by fixed definitions. For each record:

    * '/' separators in ALT are turned into ','.
    * REF is truncated at its first non-word character.
    * Characters other than word characters, ',' and '.' are removed from ALT.
    * Duplicate ALT alleles, ALT alleles equal to REF, and secondary ALT
      alleles not starting with REF are removed.
    * If FORMAT has both AD and RD, RD is removed from FORMAT and from the
      first one or two samples, and their AD becomes "RD,AD".
    * The record is written only if REF consists solely of G, C, T, A or U
      (case-insensitive).

    Reading stops at the first line that is empty after stripping.

    Args:
        infile: input VCF path, opened with genome.open_textfile.
        outfile: path of the file to write (opened in 'w' mode).

    Raises:
        Exception: if a record has no sample columns.
    '''
    with genome.open_textfile(infile) as vcf, open(outfile, 'w') as vcfout:

        line_i = vcf.readline().rstrip()

        # Copy the header, substituting the DP4 and AD FORMAT definitions:
        while line_i.startswith('#'):

            if line_i.startswith('##FORMAT=<ID=DP4,'):
                line_i = '##FORMAT=<ID=DP4,Number=4,Type=Integer,Description="# high-quality ref-forward bases, ref-reverse, alt-forward and alt-reverse bases">'

            elif line_i.startswith('##FORMAT=<ID=AD,'):
                line_i = '##FORMAT=<ID=AD,Number=.,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed">'

            vcfout.write(line_i + '\n')

            line_i = vcf.readline().rstrip()

        # Doing the work here:
        while line_i:

            vcf_i = genome.Vcf_line(line_i)

            num_samples = len(vcf_i.samples)
            if num_samples == 1:
                paired = False

            elif num_samples == 2:
                paired = True

            elif num_samples > 2:
                sys.stderr.write(
                    'We found more than 2 sammples in this VCF file. It may be messed up, but I\'ll just assume the first 2 samples mean anything at all'
                )
                paired = True

            elif num_samples == 0:
                raise Exception('No sample information here.')

            # Replace the wrong "G/A" with the correct "G,A" in ALT column:
            vcf_i.altbase = vcf_i.altbase.replace('/', ',')

            # vcf-validator is not going to accept multiple sequences in the REF, as is the case in VarScan2's indel output:
            vcf_i.refbase = re.sub(r'[^\w].*$', '', vcf_i.refbase)

            # Get rid of non-compliant characters in the ALT column:
            vcf_i.altbase = re.sub(r'[^\w,.]', '', vcf_i.altbase)

            # Eliminate duplicate entries in ALT, entries matching REF, and
            # secondary entries whose first bases do not match REF:
            vcf_i.altbase = _clean_alt_alleles(vcf_i.refbase, vcf_i.altbase)

            # Combine AD:RD into AD:
            format_items = vcf_i.get_sample_variable()
            if 'AD' in format_items and 'RD' in format_items:

                rd_sm1 = vcf_i.get_sample_value('RD', 0)
                ad_sm1 = vcf_i.get_sample_value('AD', 0)

                try:
                    rd_sm2 = vcf_i.get_sample_value('RD', 1)
                    ad_sm2 = vcf_i.get_sample_value('AD', 1)
                except IndexError:
                    # Single-sample record: there is no second sample
                    rd_sm2 = ad_sm2 = 0

                idx_ad = format_items.index('AD')
                idx_rd = format_items.index('RD')
                format_items.pop(idx_rd)
                vcf_i.field = ':'.join(format_items)

                # AD is overwritten before RD is popped, so idx_ad is still
                # valid regardless of which of the two comes first.
                item_normal = vcf_i.samples[0].split(':')
                item_normal[idx_ad] = '{},{}'.format(rd_sm1, ad_sm1)
                item_normal.pop(idx_rd)
                vcf_i.samples[0] = ':'.join(item_normal)

                if paired:

                    item_tumor = vcf_i.samples[1].split(':')
                    item_tumor[idx_ad] = '{},{}'.format(rd_sm2, ad_sm2)
                    item_tumor.pop(idx_rd)
                    vcf_i.samples[1] = ':'.join(item_tumor)

            # Reform the line:
            line_i = '\t'.join(
                (vcf_i.chromosome, str(vcf_i.position), vcf_i.identifier,
                 vcf_i.refbase, vcf_i.altbase, vcf_i.qual, vcf_i.filters,
                 vcf_i.info, vcf_i.field, '\t'.join((vcf_i.samples))))

            # VarScan2 output a line with REF allele as "M". GATK CombineVariants complain about that.
            if not re.search(r'[^GCTAU]', vcf_i.refbase, re.I):
                vcfout.write(line_i + '\n')

            # Next line:
            line_i = vcf.readline().rstrip()
コード例 #19
0
def vcf2tsv(is_vcf=None, is_bed=None, is_pos=None, bam_fn=None, truth=None, cosmic=None, dbsnp=None, mutect=None, varscan=None, vardict=None, lofreq=None, scalpel=None, strelka=None, dedup=True, min_mq=1, min_bq=5, min_caller=0, ref_fa=None, p_scale=None, outfile=None):

    # Convert contig_sequence to chrom_seq dict:
    fai_file  = ref_fa + '.fai'
    chrom_seq = genome.faiordict2contigorder(fai_file, 'fai')

    # Determine input format:
    if is_vcf:
        mysites = is_vcf
    elif is_bed:
        mysites = is_bed
    elif is_pos:
        mysites = is_pos
    else:
        mysites = fai_file
        logger.info('No position supplied. Will evaluate the whole genome.')

    # Re-scale output or not:
    if p_scale == None:
        logger.info('NO RE-SCALING')
    elif p_scale.lower() == 'phred':
        p_scale = 'phred'
    elif p_scale.lower() == 'fraction':
        p_scale = 'fraction'
    else:
        p_scale = None
        logger.info('NO RE-SCALING')


    # Define NaN and Inf:
    nan = float('nan')
    inf = float('inf')
    pattern_chr_position = genome.pattern_chr_position


    ## Running
    with genome.open_textfile(mysites) as my_sites, open(outfile, 'w') as outhandle:

        my_line = my_sites.readline().rstrip()

        bam    = pysam.AlignmentFile(bam_fn, reference_filename=ref_fa)
        ref_fa = pysam.FastaFile(ref_fa)

        if truth:
            truth = genome.open_textfile(truth)
            truth_line = genome.skip_vcf_header( truth )

        if cosmic:
            cosmic = genome.open_textfile(cosmic)
            cosmic_line = genome.skip_vcf_header( cosmic )

        if dbsnp:
            dbsnp = genome.open_textfile(dbsnp)
            dbsnp_line = genome.skip_vcf_header( dbsnp )

        # 6 Incorporate callers: get thru the #'s
        if mutect:
            mutect = genome.open_textfile(mutect)
            mutect_line = genome.skip_vcf_header( mutect )

        if varscan:
            varscan = genome.open_textfile(varscan)
            varscan_line = genome.skip_vcf_header( varscan )

        if vardict:
            vardict = genome.open_textfile(vardict)
            vardict_line = genome.skip_vcf_header( vardict )

        if lofreq:
            lofreq = genome.open_textfile(lofreq)
            lofreq_line = genome.skip_vcf_header( lofreq )

        if scalpel:
            scalpel = genome.open_textfile(scalpel)
            scalpel_line = genome.skip_vcf_header( scalpel )

        if strelka:
            strelka = genome.open_textfile(strelka)
            strelka_line = genome.skip_vcf_header( strelka )


        # Get through all the headers:
        while my_line.startswith('#') or my_line.startswith('track='):
            my_line = my_sites.readline().rstrip()

        # First coordinate, for later purpose of making sure the input is sorted properly
        coordinate_i = re.match( genome.pattern_chr_position, my_line )
        coordinate_i = coordinate_i.group() if coordinate_i else ''

        # First line:
        outhandle.write( out_header.replace('{','').replace('}','')  + '\n' )

        while my_line:

            # If VCF, get all the variants with the same coordinate into a list:
            if is_vcf:

                my_vcf = genome.Vcf_line( my_line )
                my_coordinates = [(my_vcf.chromosome, my_vcf.position)]

                variants_at_my_coordinate = []

                alt_bases = my_vcf.altbase.split(',')
                for alt_i in alt_bases:
                    vcf_i = copy(my_vcf)
                    vcf_i.altbase = alt_i
                    variants_at_my_coordinate.append( vcf_i )


                # As long as the "coordinate" stays the same, it will keep reading until it's different.
                while my_coordinates[0] == (my_vcf.chromosome, my_vcf.position):

                    my_line = my_sites.readline().rstrip()
                    my_vcf = genome.Vcf_line( my_line )

                    ########## This block is code is to ensure the input VCF file is properly sorted ##
                    coordinate_j = re.match( genome.pattern_chr_position, my_line )
                    coordinate_j = coordinate_j.group() if coordinate_j else ''

                    if genome.whoisbehind(coordinate_i, coordinate_j, chrom_seq) == 1:
                        raise Exception( '{} does not seem to be properly sorted.'.format(mysites) )

                    coordinate_i = coordinate_j
                    ###################################################################################
                    
                    if my_coordinates[0] == (my_vcf.chromosome, my_vcf.position):

                        alt_bases = my_vcf.altbase.split(',')
                        for alt_i in alt_bases:

                            vcf_i = copy(my_vcf)
                            vcf_i.altbase = alt_i
                            variants_at_my_coordinate.append( vcf_i )

            elif is_bed:
                bed_item = my_line.split('\t')
                my_coordinates = genomic_coordinates( bed_item[0], int(bed_item[1])+1, int(bed_item[2]) )

            elif is_pos:
                pos_item = my_line.split('\t')
                my_coordinates = genomic_coordinates( pos_item[0], int(pos_item[1]), int(pos_item[1]) )

            elif fai_file:
                fai_item = my_line.split('\t')
                my_coordinates = genomic_coordinates( fai_item[0], 1, int(fai_item[1]) )

            ##### ##### ##### ##### ##### #####
            for my_coordinate in my_coordinates:

                ######## If VCF, can get ref base, variant base, as well as other identifying information ########
                if is_vcf:

                    ref_bases = []
                    alt_bases = []
                    indel_lengths = []
                    all_my_identifiers = []

                    for variant_i in variants_at_my_coordinate:

                        ref_base = variant_i.refbase
                        first_alt = variant_i.altbase.split(',')[0]
                        indel_length = len(first_alt) - len(ref_base)

                        ref_bases.append( ref_base )
                        alt_bases.append( first_alt )
                        indel_lengths.append( indel_length )

                        # Extract these information if they exist in the VCF file, but they could be re-written if dbSNP/COSMIC are supplied.
                        if_dbsnp  = 1 if re.search(r'rs[0-9]+', variant_i.identifier) else 0
                        if_cosmic = 1 if re.search(r'COS[MN][0-9]+', variant_i.identifier) else 0
                        if_common = 1 if variant_i.get_info_value('COMMON') == '1' else 0
                        num_cases = variant_i.get_info_value('CNT') if variant_i.get_info_value('CNT') else nan

                        if variant_i.identifier == '.':
                            my_identifier_i = set()
                        else:
                            my_identifier_i = variant_i.identifier.split(';')
                            my_identifier_i = set( my_identifier_i )

                        all_my_identifiers.append( my_identifier_i )

                ## If not, 1) get ref_base, first_alt from other VCF files.
                #          2) Create placeholders for dbSNP and COSMIC that can be overwritten with dbSNP/COSMIC VCF files (if provided)
                else:
                    variants_at_my_coordinate = [None] # Just to have something to iterate
                    ref_base = first_alt = indel_length = None

                    # Could be re-written if dbSNP/COSMIC are supplied. If not, they will remain NaN.
                    if_dbsnp = if_cosmic = if_common = num_cases = nan

                # Keep track of NumCallers:
                num_callers = 0

                #################################### Find the same coordinate in those VCF files ####################################
                if mutect:   got_mutect,  mutect_variants,  mutect_line  = genome.find_vcf_at_coordinate(my_coordinate, mutect_line,  mutect,  chrom_seq)
                if varscan:  got_varscan, varscan_variants, varscan_line = genome.find_vcf_at_coordinate(my_coordinate, varscan_line, varscan, chrom_seq)
                if vardict:  got_vardict, vardict_variants, vardict_line = genome.find_vcf_at_coordinate(my_coordinate, vardict_line, vardict, chrom_seq)
                if lofreq:   got_lofreq,  lofreq_variants,  lofreq_line  = genome.find_vcf_at_coordinate(my_coordinate, lofreq_line,  lofreq,  chrom_seq)
                if scalpel:  got_scalpel, scalpel_variants, scalpel_line = genome.find_vcf_at_coordinate(my_coordinate, scalpel_line, scalpel, chrom_seq)
                if strelka:  got_strelka, strelka_variants, strelka_line = genome.find_vcf_at_coordinate(my_coordinate, strelka_line, strelka, chrom_seq)
                if truth:    got_truth,   truth_variants,   truth_line   = genome.find_vcf_at_coordinate(my_coordinate, truth_line,   truth,   chrom_seq)
                if dbsnp:    got_dbsnp,   dbsnp_variants,   dbsnp_line   = genome.find_vcf_at_coordinate(my_coordinate, dbsnp_line,   dbsnp,   chrom_seq)
                if cosmic:   got_cosmic,  cosmic_variants,  cosmic_line  = genome.find_vcf_at_coordinate(my_coordinate, cosmic_line,  cosmic,  chrom_seq)

                # Now, use pysam to look into the tBAM file(s), variant by variant from the input:
                for ith_call, my_call in enumerate( variants_at_my_coordinate ):

                    if is_vcf:
                        # The particular line in the input VCF file:
                        variant_id = ( (my_call.chromosome, my_call.position), my_call.refbase, my_call.altbase )

                        ref_base       = ref_bases[ith_call]
                        first_alt      = alt_bases[ith_call]
                        indel_length   = indel_lengths[ith_call]
                        my_identifiers = all_my_identifiers[ith_call]

                    else:
                        variant_id = ( (my_coordinate[0], my_coordinate[1]), ref_base, first_alt )


                    #################### Collect Caller Vcf ####################:
                    if mutect:
                        mutect_classification, tlod, ecnt = annotate_caller.ssMuTect(variant_id, mutect_variants)
                        num_callers += mutect_classification
                    else:
                        mutect_classification = tlod = ecnt = nan


                    if varscan:
                        varscan_classification, score_varscan2 = annotate_caller.ssVarScan(variant_id, varscan_variants)
                        num_callers += varscan_classification
                    else:
                        varscan_classification = score_varscan2 = nan


                    if vardict:
                        vardict_classification, msi, msilen, shift3, t_pmean, t_pstd, t_qstd = annotate_caller.ssVarDict(variant_id, vardict_variants)
                        num_callers += vardict_classification
                    else:
                        vardict_classification = msi = msilen = shift3 = t_pmean = t_pstd = t_qstd = nan


                    if lofreq:
                        lofreq_classification = annotate_caller.ssLoFreq(variant_id, lofreq_variants)
                        num_callers += lofreq_classification
                    else:
                        lofreq_classification = nan


                    if scalpel:
                        scalpel_classification = annotate_caller.ssScalpel(variant_id, scalpel_variants)
                        num_callers += scalpel_classification
                    else:
                        scalpel_classification = nan


                    if strelka:
                        strelka_classification = annotate_caller.ssStrelka(variant_id, strelka_variants)
                        num_callers += strelka_classification
                    else:
                        strelka_classification = nan


                    # Potentially write the output only if it meets this threshold:
                    if num_callers >= min_caller:

                        ########## Ground truth file ##########
                        if truth:
                            if variant_id in truth_variants.keys():
                                judgement = 1
                                my_identifiers.add('TruePositive')
                            else:
                                judgement = 0
                                my_identifiers.add('FalsePositive')
                        else:
                            judgement = nan


                        ########## dbSNP ########## Will overwrite dbSNP info from input VCF file
                        if dbsnp:
                            if_dbsnp, if_common, rsID = annotate_caller.dbSNP(variant_id, dbsnp_variants)
                            for ID_i in rsID:
                                my_identifiers.add( ID_i )


                        ########## COSMIC ########## Will overwrite COSMIC info from input VCF file
                        if cosmic:
                            if_cosmic, num_cases, cosmicID = annotate_caller.COSMIC(variant_id, cosmic_variants)
                            for ID_i in cosmicID:
                                my_identifiers.add( ID_i )


                        ########## ######### INFO EXTRACTION FROM BAM FILES ########## #########
                        # Tumor tBAM file:
                        tBamFeatures = sequencing_features.from_bam(bam, my_coordinate, ref_base, first_alt, min_mq, min_bq)

                        # Homopolymer eval:
                        homopolymer_length, site_homopolymer_length = sequencing_features.from_genome_reference(ref_fa, my_coordinate, ref_base, first_alt)

                        # Fill the ID field of the TSV/VCF
                        my_identifiers = ';'.join(my_identifiers) if my_identifiers else '.'

                        ###
                        out_line = out_header.format( \
                        CHROM                   = my_coordinate[0],                                                    \
                        POS                     = my_coordinate[1],                                                    \
                        ID                      = my_identifiers,                                                      \
                        REF                     = ref_base,                                                            \
                        ALT                     = first_alt,                                                           \
                        if_MuTect               = mutect_classification,                                               \
                        if_Strelka              = strelka_classification,                                              \
                        if_VarScan2             = varscan_classification,                                              \
                        if_VarDict              = vardict_classification,                                              \
                        if_LoFreq               = lofreq_classification,                                               \
                        if_Scalpel              = scalpel_classification,                                              \
                        VarScan2_Score          = rescale(score_varscan2,      'phred', p_scale, 1001),                \
                        if_dbsnp                = if_dbsnp,                                                            \
                        COMMON                  = if_common,                                                           \
                        if_COSMIC               = if_cosmic,                                                           \
                        COSMIC_CNT              = num_cases,                                                           \
                        Consistent_Mates        = tBamFeatures['consistent_mates'],                                    \
                        Inconsistent_Mates      = tBamFeatures['inconsistent_mates'],                                  \
                        M2_TLOD                 = tlod,                                                                \
                        M2_ECNT                 = ecnt,                                                                \
                        MSI                     = msi,                                                                 \
                        MSILEN                  = msilen,                                                              \
                        SHIFT3                  = shift3,                                                              \
                        MaxHomopolymer_Length   = homopolymer_length,                                                  \
                        SiteHomopolymer_Length  = site_homopolymer_length,                                             \
                        T_DP                    = tBamFeatures['dp'],                                                  \
                        tBAM_REF_MQ             = '%g' % tBamFeatures['ref_mq'],                                       \
                        tBAM_ALT_MQ             = '%g' % tBamFeatures['alt_mq'],                                       \
                        tBAM_Z_Ranksums_MQ      = '%g' % tBamFeatures['z_ranksums_mq'],                                \
                        tBAM_REF_BQ             = '%g' % tBamFeatures['ref_bq'],                                       \
                        tBAM_ALT_BQ             = '%g' % tBamFeatures['alt_bq'],                                       \
                        tBAM_Z_Ranksums_BQ      = '%g' % tBamFeatures['z_ranksums_bq'],                                \
                        tBAM_REF_NM             = '%g' % tBamFeatures['ref_NM'],                                       \
                        tBAM_ALT_NM             = '%g' % tBamFeatures['alt_NM'],                                       \
                        tBAM_NM_Diff            = '%g' % tBamFeatures['NM_Diff'],                                      \
                        tBAM_REF_Concordant     = tBamFeatures['ref_concordant_reads'],                                \
                        tBAM_REF_Discordant     = tBamFeatures['ref_discordant_reads'],                                \
                        tBAM_ALT_Concordant     = tBamFeatures['alt_concordant_reads'],                                \
                        tBAM_ALT_Discordant     = tBamFeatures['alt_discordant_reads'],                                \
                        tBAM_Concordance_FET    = rescale(tBamFeatures['concordance_fet'], 'fraction', p_scale, 1001), \
                        T_REF_FOR               = tBamFeatures['ref_for'],                                             \
                        T_REF_REV               = tBamFeatures['ref_rev'],                                             \
                        T_ALT_FOR               = tBamFeatures['alt_for'],                                             \
                        T_ALT_REV               = tBamFeatures['alt_rev'],                                             \
                        tBAM_StrandBias_FET     = rescale(tBamFeatures['strandbias_fet'], 'fraction', p_scale, 1001),  \
                        tBAM_Z_Ranksums_EndPos  = '%g' % tBamFeatures['z_ranksums_endpos'],                            \
                        tBAM_REF_Clipped_Reads  = tBamFeatures['ref_SC_reads'],                                        \
                        tBAM_ALT_Clipped_Reads  = tBamFeatures['alt_SC_reads'],                                        \
                        tBAM_Clipping_FET       = rescale(tBamFeatures['clipping_fet'], 'fraction', p_scale, 1001),    \
                        tBAM_MQ0                = tBamFeatures['MQ0'],                                                 \
                        tBAM_Other_Reads        = tBamFeatures['noise_read_count'],                                    \
                        tBAM_Poor_Reads         = tBamFeatures['poor_read_count'],                                     \
                        tBAM_REF_InDel_3bp      = tBamFeatures['ref_indel_3bp'],                                       \
                        tBAM_REF_InDel_2bp      = tBamFeatures['ref_indel_2bp'],                                       \
                        tBAM_REF_InDel_1bp      = tBamFeatures['ref_indel_1bp'],                                       \
                        tBAM_ALT_InDel_3bp      = tBamFeatures['alt_indel_3bp'],                                       \
                        tBAM_ALT_InDel_2bp      = tBamFeatures['alt_indel_2bp'],                                       \
                        tBAM_ALT_InDel_1bp      = tBamFeatures['alt_indel_1bp'],                                       \
                        InDel_Length            = indel_length,                                                        \
                        TrueVariant_or_False    = judgement )

                        # Print it out to stdout:
                        outhandle.write(out_line + '\n')

            # Read into the next line:
            if not is_vcf:
                my_line = my_sites.readline().rstrip()

        ##########  Close all open files if they were opened  ##########
        opened_files = (ref_fa, bam, truth, cosmic, dbsnp, mutect, varscan, vardict, lofreq, scalpel, strelka)
        [opened_file.close() for opened_file in opened_files if opened_file]
コード例 #20
0
def convert(infile, snv_out, indel_out, is_tnscope):
    """Split a two-sample MuTect2/TNscope VCF into an SNV VCF and an indel VCF.

    Header lines are copied to both outputs; on the ``##INFO=<ID=SOR,`` line
    every ``Float`` is replaced by ``String``.  A record with a single ALT is
    copied unchanged.  A record with comma-separated ALTs is written as one
    record per ALT, with the INFO column rebuilt from ``NLOD``/``TLOD`` (one
    value per ALT) and ``STR``/``ECNT`` (kept whole), and with any sample
    genotype other than ``0/0`` or ``0/1`` replaced by ``0/1``.

    A record goes to the SNV file when REF and ALT are both one base long, to
    the indel file when exactly one of them is, and is not written otherwise.

    Args:
        infile: Path of the input VCF (opened with ``genome.open_textfile``).
        snv_out: Path of the SNV VCF to write.
        indel_out: Path of the indel VCF to write.
        is_tnscope: When true, the ``##normal_sample=`` / ``##tumor_sample=``
            header lines are not required.

    Raises:
        ValueError: If ``is_tnscope`` is false and the normal or tumor sample
            is not named in the header or is not a column of ``#CHROM``.
    """
    info_to_split = 'NLOD', 'TLOD'
    info_to_keep = 'STR', 'ECNT'

    normal_name = tumor_name = None

    with genome.open_textfile(infile) as vcf_in, \
            open(snv_out, 'w') as snv_vcf, \
            open(indel_out, 'w') as indel_vcf:

        line_i = vcf_in.readline().rstrip()

        while line_i.startswith('##'):

            if line_i.startswith('##normal_sample='):
                normal_name = line_i.split('=')[1]

            if line_i.startswith('##tumor_sample='):
                tumor_name = line_i.split('=')[1]

            if line_i.startswith('##INFO=<ID=SOR,'):
                line_i = re.sub(r'Float', 'String', line_i)

            snv_vcf.write(line_i + '\n')
            indel_vcf.write(line_i + '\n')

            line_i = vcf_in.readline().rstrip()

        # This line will be #CHROM:
        snv_vcf.write(line_i + '\n')
        indel_vcf.write(line_i + '\n')
        header = line_i.split('\t')

        if not is_tnscope:
            # The sample column positions are not used below; this only checks
            # that both samples named in the header exist on the #CHROM line.
            for role, sample_name in (('normal', normal_name), ('tumor', tumor_name)):
                if sample_name is None or sample_name not in header:
                    raise ValueError(
                        'The {} sample ({!r}) is not a column of the #CHROM line.'.format(
                            role, sample_name))

        # This will be the first variant line:
        line_i = vcf_in.readline().rstrip()

        while line_i:

            vcf_i = genome.Vcf_line(line_i)

            if ',' not in vcf_i.altbase:

                if len(vcf_i.refbase) == 1 and len(vcf_i.altbase) == 1:
                    snv_vcf.write(line_i + '\n')
                elif len(vcf_i.refbase) == 1 or len(vcf_i.altbase) == 1:
                    indel_vcf.write(line_i + '\n')

            else:
                alt_bases = vcf_i.altbase.split(',')

                # Per-ALT values; None marks a key whose value cannot be split.
                measures = []
                for measure_i in info_to_split:
                    try:
                        measures.append(vcf_i.get_info_value(measure_i).split(','))
                    except AttributeError:
                        measures.append(None)

                still_measures = []
                for measure_i in info_to_keep:
                    try:
                        still_measures.append(vcf_i.get_info_value(measure_i))
                    except AttributeError:
                        still_measures.append(None)

                # Values equal to False or None are treated as "key absent".
                # (None is what the except branch above stores; the original
                # filter only excluded False and would have written KEY=None.)
                still_infos = [
                    '{}={}'.format(info_variable, info_value)
                    for info_variable, info_value in zip(info_to_keep, still_measures)
                    if info_value not in (None, False)
                ]

                # The sample columns do not depend on which ALT is being
                # written, so normalise them once per record.  Each sample is
                # judged on its OWN genotype (the original tested the first
                # sample's genotype when handling the second sample).
                samples = []
                for sample_idx in (0, 1):
                    genotype = vcf_i.get_sample_value('GT', idx=sample_idx)
                    if genotype in ('0/0', '0/1'):
                        samples.append(vcf_i.samples[sample_idx])
                    else:
                        samples.append(re.sub(r'^[^:]+', '0/1', vcf_i.samples[sample_idx]))

                for ith_base, altbase_i in enumerate(alt_bases):

                    split_infos = [
                        '{}={}'.format(info_variable, info_value[ith_base])
                        for info_variable, info_value in zip(info_to_split, measures)
                        if info_value is not None
                    ]

                    info_string = ';'.join(split_infos + still_infos)

                    new_line = '\t'.join((
                        vcf_i.chromosome, str(vcf_i.position), vcf_i.identifier,
                        vcf_i.refbase, altbase_i, vcf_i.qual, vcf_i.filters,
                        info_string, vcf_i.field, samples[0], samples[1]))

                    if len(vcf_i.refbase) == 1 and len(altbase_i) == 1:
                        snv_vcf.write(new_line + '\n')
                    elif len(vcf_i.refbase) == 1 or len(altbase_i) == 1:
                        indel_vcf.write(new_line + '\n')

            line_i = vcf_in.readline().rstrip()
コード例 #21
0
def convert(infile, outfile):
    """Copy a two-sample VCF, folding the RD sample field into AD and adding GT.

    Header lines are copied as-is, except that the ``##FORMAT=<ID=AD,``
    declaration is replaced by one describing AD as ref and alt depths.  For
    every record whose FORMAT column lists both ``AD`` and ``RD``:

    * ``RD`` is removed from FORMAT and ``GT`` is prepended;
    * in sample columns 9 and 10 the AD value becomes ``"<RD>,<AD>"``, the RD
      value is removed, and a genotype derived from AD / (AD + RD) is
      prepended: above 0.8 gives ``1/1``, above 0.25 gives ``0/1``; at or
      below 0.25 column 9 gets ``0/0`` while column 10 still gets ``0/1``.

    Records without both keys are copied unchanged.  Reading stops at the
    first line that is empty after right-stripping.

    Args:
        infile: Path of the input VCF (opened with ``genome.open_textfile``).
        outfile: Path of the VCF to write.
    """
    format_col = 8
    # (sample column, genotype assigned when the allele fraction is <= 0.25)
    sample_plan = ((9, '0/0'), (10, '0/1'))
    ad_declaration = '##FORMAT=<ID=AD,Number=.,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed">'

    with genome.open_textfile(infile) as reader, open(outfile, 'w') as writer:

        current = reader.readline().rstrip()

        # VCF header
        while current.startswith('#'):
            is_ad_line = current.startswith('##FORMAT=<ID=AD,')
            writer.write((ad_declaration if is_ad_line else current) + '\n')
            current = reader.readline().rstrip()

        while current:
            columns = current.split('\t')
            keys = columns[format_col].split(':')

            if 'AD' in keys and 'RD' in keys:
                ad_at = keys.index('AD')
                rd_at = keys.index('RD')
                del keys[rd_at]

                rewritten = []
                for sample_col, floor_gt in sample_plan:
                    values = columns[sample_col].split(':')
                    alt_depth = int(values[ad_at])
                    ref_depth = int(values[rd_at])
                    depth = alt_depth + ref_depth
                    # No reads at all counts as an allele fraction of zero.
                    vaf = alt_depth / depth if depth else 0

                    if vaf > 0.8:
                        genotype = '1/1'
                    elif vaf > 0.25:
                        genotype = '0/1'
                    else:
                        genotype = floor_gt

                    # Use the original text of both depths, reference first.
                    values[ad_at] = '{},{}'.format(values[rd_at], values[ad_at])
                    del values[rd_at]
                    rewritten.append(':'.join([genotype] + values))

                # Only touch the record once both samples converted cleanly.
                columns[format_col] = 'GT:' + ':'.join(keys)
                for (sample_col, _), text in zip(sample_plan, rewritten):
                    columns[sample_col] = text

            writer.write('\t'.join(columns) + '\n')
            current = reader.readline().rstrip()
コード例 #22
0
def convert(infile, outfile):
    """Rewrite AD/RD sample fields of a two-sample VCF and prepend a genotype.

    Leading ``#`` lines are copied through, with the ``##FORMAT=<ID=AD,``
    line swapped for a fixed AD declaration.  In each following record that
    has both ``AD`` and ``RD`` in its FORMAT column, ``RD`` is dropped from
    FORMAT, ``GT`` is put in front, and both sample columns (9 = normal,
    10 = tumor) get ``AD`` rewritten as ``"<RD>,<AD>"`` plus a leading
    genotype chosen from the fraction AD / (AD + RD).  Other records are
    written back unchanged.  Input ends at the first line that is empty once
    right-stripped.

    Args:
        infile: Path of the input VCF (opened with ``genome.open_textfile``).
        outfile: Path of the VCF to write.
    """
    fmt_col, normal_col, tumor_col = 8, 9, 10
    new_ad_header = '##FORMAT=<ID=AD,Number=.,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed">'

    def rewrite_sample(sample_text, ad_pos, rd_pos, low_vaf_gt):
        """Return the sample column with RD merged into AD and a GT in front."""
        parts = sample_text.split(':')
        n_alt = int(parts[ad_pos])
        n_ref = int(parts[rd_pos])
        n_all = n_alt + n_ref
        fraction = (n_alt / n_all) if n_all else 0

        if fraction > 0.8:
            called = '1/1'
        elif fraction > 0.25:
            called = '0/1'
        else:
            called = low_vaf_gt

        parts[ad_pos] = '{},{}'.format(parts[rd_pos], parts[ad_pos])
        parts.pop(rd_pos)
        return ':'.join([called] + parts)

    with genome.open_textfile(infile) as source, open(outfile, 'w') as sink:

        in_header = True

        # Yields right-stripped lines until the first empty one.
        for text in iter(lambda: source.readline().rstrip(), ''):

            if in_header:
                if text.startswith('#'):
                    if text.startswith('##FORMAT=<ID=AD,'):
                        text = new_ad_header
                    sink.write(text + '\n')
                    continue
                # From the first non-'#' line on, everything is a record.
                in_header = False

            columns = text.split('\t')
            keys = columns[fmt_col].split(':')

            if 'AD' in keys and 'RD' in keys:
                ad_pos = keys.index('AD')
                rd_pos = keys.index('RD')
                keys.pop(rd_pos)

                # The normal may be called 0/0; the tumor is at least 0/1.
                normal_text = rewrite_sample(columns[normal_col], ad_pos, rd_pos, '0/0')
                tumor_text = rewrite_sample(columns[tumor_col], ad_pos, rd_pos, '0/1')

                columns[fmt_col] = 'GT:' + ':'.join(keys)
                columns[normal_col] = normal_text
                columns[tumor_col] = tumor_text

            sink.write('\t'.join(columns) + '\n')
コード例 #23
0
ファイル: modify_MuTect.py プロジェクト: bioinform/somaticseq
def convert(infile, outfile, tbam, nbam, tumor_sample_name=None):
    """Rewrite a MuTect VCF with consistently placed and labelled sample columns.

    Sample names are taken from the ``SM`` values of the BAM headers (through
    ``genome.pysam_header``) and located on the ``#CHROM`` line.  In paired
    mode the normal and tumor columns are copied into the first and second
    sample positions, which are labelled ``NORMAL`` and ``TUMOR``.  Otherwise
    only the tumor column is kept, as the single sample column.  ``##`` lines
    are copied unchanged; other ``#`` lines that are not ``#CHROM`` are not
    written.  Records whose REF contains ``N`` are dropped.

    Args:
        infile: Path of the input VCF (opened with ``genome.open_textfile``).
        outfile: Path of the VCF to write.
        tbam: Tumor BAM, used only to read the tumor sample name.
        nbam: Normal BAM, or a false value; then the normal is looked up
            under the name ``none``.
        tumor_sample_name: Label for the sample column in single-sample
            output.  When omitted, a module-level ``args.tumor_sample_name``
            is used if it exists (the previous, implicit behaviour), else the
            tumor BAM's sample name.

    Raises:
        ValueError: If a sample name is not a column of the ``#CHROM`` line,
            or if a record is reached without any ``#CHROM`` line.
    """
    idx_ref, idx_SM1, idx_SM2 = 3, 9, 10

    # Get tumor and normal sample names from the bam files:
    nbam_header = genome.pysam_header(nbam) if nbam else None
    tbam_header = genome.pysam_header(tbam)

    # When MuTect is run in a "single sample mode," the "normal" will be named "none."
    n_samplename = nbam_header.SM() if nbam else ['none']
    t_samplename = tbam_header.SM()

    if not (len(n_samplename) == 1 and len(t_samplename) == 1):
        sys.stderr.write('There are multiple Sample Names present in the BAM file!')

    n_samplename = n_samplename[0]
    t_samplename = t_samplename[0]

    assert t_samplename or n_samplename

    # NOTE(review): with the 'none' placeholder above this is true whenever
    # both names are non-empty, whether or not nbam was given, so the
    # single-sample branches below are rarely reached.  The original first set
    # paired_mode from nbam and then overwrote it with this test; confirm
    # which of the two was intended before changing the output format.
    paired_mode = bool(t_samplename and n_samplename)

    idxN = idxT = None

    with genome.open_textfile(infile) as vcf, open(outfile, 'w') as vcfout:

        line_i = vcf.readline().rstrip()

        while line_i.startswith('#'):

            if line_i.startswith('##'):
                vcfout.write(line_i + '\n')

            elif line_i.startswith('#CHROM'):
                header_items = line_i.split('\t')

                idxN = header_items.index(n_samplename)
                idxT = header_items.index(t_samplename)

                if paired_mode:
                    header_items[idx_SM1] = 'NORMAL'
                    header_items[idx_SM2] = 'TUMOR'

                else:
                    if tumor_sample_name is None:
                        # Legacy fallback: this function used to read the
                        # script's global argparse namespace directly, which
                        # raised NameError when imported from another module.
                        try:
                            tumor_sample_name = args.tumor_sample_name
                        except NameError:
                            tumor_sample_name = t_samplename

                    # Keep up to the first sample column, then make sure it's labeled the TUMOR sample name
                    header_items = header_items[:idx_SM1 + 1]
                    header_items[idx_SM1] = tumor_sample_name

                vcfout.write('\t'.join(header_items) + '\n')

            line_i = vcf.readline().rstrip()

        if line_i and idxT is None:
            raise ValueError('No #CHROM header line was found before the first record.')

        while line_i:

            items_i = line_i.split('\t')

            if paired_mode:
                items_i[idx_SM1], items_i[idx_SM2] = items_i[idxN], items_i[idxT]

            else:
                items_i = items_i[:idx_SM1] + [items_i[idxT]]

            # Have to get rid of "N" in REF, because after snpSift annotation, it changes the ALT and vcf-validator will complain.
            if 'N' not in items_i[idx_ref]:
                vcfout.write('\t'.join(items_i) + '\n')

            line_i = vcf.readline().rstrip()
コード例 #24
0
def convert(infile, snv_out, indel_out):
    """Split a single-sample MuTect2 VCF into an SNV VCF and an indel VCF.

    Every header line, including ``#CHROM``, is written exactly once to both
    outputs.  A FILTER value of exactly ``germline_risk`` is rewritten to
    ``PASS``.  A record with a single ALT keeps all its other columns.  A
    record with comma-separated ALTs is written as one record per ALT, with
    the INFO column rebuilt from ``NLOD``/``TLOD`` (one value per ALT) and
    ``STR``/``ECNT`` (kept whole), and with a sample genotype other than
    ``0/0`` or ``0/1`` replaced by ``0/1``.

    A record goes to the SNV file when REF and ALT are both one base long and
    to the indel file otherwise.

    Args:
        infile: Path of the input VCF (opened with ``genome.open_textfile``).
        snv_out: Path of the SNV VCF to write.
        indel_out: Path of the indel VCF to write.
    """
    info_to_split = 'NLOD', 'TLOD'
    info_to_keep = 'STR', 'ECNT'

    with genome.open_textfile(infile) as vcf_in, \
            open(snv_out, 'w') as snv_vcf, \
            open(indel_out, 'w') as indel_vcf:

        line_i = vcf_in.readline().rstrip()

        # Copy the '##' lines.  The original also wrote the NEXT line inside
        # this loop, which duplicated every '##' line after the first.
        while line_i.startswith('##'):
            snv_vcf.write(line_i + '\n')
            indel_vcf.write(line_i + '\n')
            line_i = vcf_in.readline().rstrip()

        # This line will be #CHROM (nothing is written for an empty input):
        if line_i:
            snv_vcf.write(line_i + '\n')
            indel_vcf.write(line_i + '\n')

        # This will be the first variant line:
        line_i = vcf_in.readline().rstrip()

        while line_i:

            vcf_i = genome.Vcf_line(line_i)

            # If "germlinerisk" is the only flag, then make it PASS since there is no matched normal
            if vcf_i.filters == 'germline_risk':
                vcf_i.filters = 'PASS'

            if ',' not in vcf_i.altbase:

                item = line_i.split('\t')
                if item[6] == 'germline_risk':
                    item[6] = 'PASS'

                new_line = '\t'.join(item)

                if len(vcf_i.refbase) == 1 and len(vcf_i.altbase) == 1:
                    snv_vcf.write(new_line + '\n')
                else:
                    indel_vcf.write(new_line + '\n')

            else:
                alt_bases = vcf_i.altbase.split(',')

                # Per-ALT values; None marks a key whose value cannot be split.
                measures = []
                for measure_i in info_to_split:
                    try:
                        measures.append(vcf_i.get_info_value(measure_i).split(','))
                    except AttributeError:
                        measures.append(None)

                still_measures = []
                for measure_i in info_to_keep:
                    try:
                        still_measures.append(vcf_i.get_info_value(measure_i))
                    except AttributeError:
                        still_measures.append(None)

                # Values equal to False or None are treated as "key absent".
                # (None is what the except branch above stores; the original
                # filter only excluded False and would have written KEY=None.)
                still_infos = [
                    '{}={}'.format(info_variable, info_value)
                    for info_variable, info_value in zip(info_to_keep, still_measures)
                    if info_value not in (None, False)
                ]

                # The sample column is the same for every ALT of this record.
                GT0 = vcf_i.get_sample_value('GT', idx=0)
                if GT0 in ('0/0', '0/1'):
                    sample_0 = vcf_i.samples[0]
                else:
                    sample_0 = re.sub(r'^[^:]+', '0/1', vcf_i.samples[0])

                for ith_base, altbase_i in enumerate(alt_bases):

                    split_infos = [
                        '{}={}'.format(info_variable, info_value[ith_base])
                        for info_variable, info_value in zip(info_to_split, measures)
                        if info_value is not None
                    ]

                    info_string = ';'.join(split_infos + still_infos)

                    new_line = '\t'.join((
                        vcf_i.chromosome, str(vcf_i.position), vcf_i.identifier,
                        vcf_i.refbase, altbase_i, vcf_i.qual, vcf_i.filters,
                        info_string, vcf_i.field, sample_0))

                    if len(vcf_i.refbase) == 1 and len(altbase_i) == 1:
                        snv_vcf.write(new_line + '\n')
                    else:
                        indel_vcf.write(new_line + '\n')

            line_i = vcf_in.readline().rstrip()
コード例 #25
0
def convert(infile, outfile, tbam, nbam, tumor_sample_name=None):
    """Rewrite a MuTect VCF with consistently placed and labelled sample columns.

    Sample names are taken from the ``SM`` values of the BAM headers (through
    ``genome.pysam_header``) and located on the ``#CHROM`` line.  In paired
    mode the normal and tumor columns are copied into the first and second
    sample positions, which are labelled ``NORMAL`` and ``TUMOR``.  Otherwise
    only the tumor column is kept, as the single sample column.  ``##`` lines
    are copied unchanged; other ``#`` lines that are not ``#CHROM`` are not
    written.  Records whose REF contains ``N`` are dropped.

    Args:
        infile: Path of the input VCF (opened with ``genome.open_textfile``).
        outfile: Path of the VCF to write.
        tbam: Tumor BAM, used only to read the tumor sample name.
        nbam: Normal BAM, or a false value; then the normal is looked up
            under the name ``none``.
        tumor_sample_name: Label for the sample column in single-sample
            output.  When omitted, a module-level ``args.tumor_sample_name``
            is used if it exists (the previous, implicit behaviour), else the
            tumor BAM's sample name.

    Raises:
        ValueError: If a sample name is not a column of the ``#CHROM`` line,
            or if a record is reached without any ``#CHROM`` line.
    """
    idx_ref, idx_SM1, idx_SM2 = 3, 9, 10

    # Get tumor and normal sample names from the bam files:
    nbam_header = genome.pysam_header(nbam) if nbam else None
    tbam_header = genome.pysam_header(tbam)

    # When MuTect is run in a "single sample mode," the "normal" will be named "none."
    n_samplename = nbam_header.SM() if nbam else ['none']
    t_samplename = tbam_header.SM()

    if not (len(n_samplename) == 1 and len(t_samplename) == 1):
        sys.stderr.write(
            'There are multiple Sample Names present in the BAM file!')

    n_samplename = n_samplename[0]
    t_samplename = t_samplename[0]

    assert t_samplename or n_samplename

    # NOTE(review): with the 'none' placeholder above this is true whenever
    # both names are non-empty, whether or not nbam was given, so the
    # single-sample branches below are rarely reached.  The original first set
    # paired_mode from nbam and then overwrote it with this test; confirm
    # which of the two was intended before changing the output format.
    paired_mode = bool(t_samplename and n_samplename)

    idxN = idxT = None

    with genome.open_textfile(infile) as vcf, open(outfile, 'w') as vcfout:

        line_i = vcf.readline().rstrip()

        while line_i.startswith('#'):

            if line_i.startswith('##'):
                vcfout.write(line_i + '\n')

            elif line_i.startswith('#CHROM'):
                header_items = line_i.split('\t')

                idxN = header_items.index(n_samplename)
                idxT = header_items.index(t_samplename)

                if paired_mode:
                    header_items[idx_SM1] = 'NORMAL'
                    header_items[idx_SM2] = 'TUMOR'

                else:
                    if tumor_sample_name is None:
                        # Legacy fallback: this function used to read the
                        # script's global argparse namespace directly, which
                        # raised NameError when imported from another module.
                        try:
                            tumor_sample_name = args.tumor_sample_name
                        except NameError:
                            tumor_sample_name = t_samplename

                    # Keep up to the first sample column, then make sure it's labeled the TUMOR sample name
                    header_items = header_items[:idx_SM1 + 1]
                    header_items[idx_SM1] = tumor_sample_name

                vcfout.write('\t'.join(header_items) + '\n')

            line_i = vcf.readline().rstrip()

        if line_i and idxT is None:
            raise ValueError(
                'No #CHROM header line was found before the first record.')

        while line_i:

            items_i = line_i.split('\t')

            if paired_mode:
                items_i[idx_SM1], items_i[idx_SM2] = items_i[idxN], items_i[idxT]

            else:
                items_i = items_i[:idx_SM1] + [items_i[idxT]]

            # Have to get rid of "N" in REF, because after snpSift annotation, it changes the ALT and vcf-validator will complain.
            if 'N' not in items_i[idx_ref]:
                vcfout.write('\t'.join(items_i) + '\n')

            line_i = vcf.readline().rstrip()
コード例 #26
0
ファイル: splitVcf.py プロジェクト: zprh/somaticseq
def split_into_snv_and_indel(infile, snv_out, indel_out):
    """Route the records of a VCF into an SNV file and an indel file.

    All leading ``#`` lines are copied to both outputs.  A record whose ALT
    holds neither ``,`` nor ``/`` is copied unchanged: to the SNV file when
    REF and ALT are both one base long, to the indel file when exactly one of
    them is, and nowhere otherwise.  A record with several ALTs (split on
    ``,`` if present, else on ``/``) is written once per ALT under the same
    rule; an ALT where both alleles are longer than one base is passed to
    ``complex2indel.translate`` and written to the indel file only if the
    translated alleles share their first base and one of them is a single
    base.  Reading stops at the first line that is empty after
    right-stripping.

    Args:
        infile: Path of the input VCF (opened with ``genome.open_textfile``).
        snv_out: Path of the SNV VCF to write.
        indel_out: Path of the indel VCF to write.
    """
    with genome.open_textfile(infile) as source, \
            open(snv_out, 'w') as snv_sink, \
            open(indel_out, 'w') as indel_sink:

        record = source.readline().rstrip()

        while record.startswith('#'):
            for sink in (snv_sink, indel_sink):
                sink.write(record + '\n')
            record = source.readline().rstrip()

        while record:

            parsed = genome.Vcf_line(record)
            alt_field = parsed.altbase
            ref = parsed.refbase
            ref_is_single = len(ref) == 1

            if ',' not in alt_field and '/' not in alt_field:

                alt_is_single = len(alt_field) == 1
                if ref_is_single and alt_is_single:
                    snv_sink.write(record + '\n')
                elif ref_is_single or alt_is_single:
                    indel_sink.write(record + '\n')

            else:
                fields = record.split('\t')

                # At least one of the two separators is present here; the
                # comma takes precedence when both are.
                separator = ',' if ',' in alt_field else '/'

                for alt in alt_field.split(separator):

                    alt_is_single = len(alt) == 1

                    if ref_is_single or alt_is_single:
                        both_single = ref_is_single and alt_is_single
                        target = snv_sink if both_single else indel_sink
                        derived = copy(fields)
                        derived[4] = alt
                        target.write('\t'.join(derived) + '\n')
                        continue

                    translated = complex2indel.translate(ref, alt)
                    if not translated:
                        continue

                    (new_ref, new_alt), offset = translated

                    # Keep only results shaped like a simple indel.
                    if new_ref[0] != new_alt[0] or (len(new_ref) != 1 and len(new_alt) != 1):
                        continue

                    derived = copy(fields)
                    derived[3] = new_ref
                    derived[4] = new_alt

                    # This *may* cause the output VCF file to go out of order
                    if offset != 0:
                        derived[1] = str(int(fields[1]) + offset)

                    indel_sink.write('\t'.join(derived) + '\n')

            record = source.readline().rstrip()
コード例 #27
0
# Command-line interface of this script: input VCF, BAM, reference FASTA and
# output VCF are required; the phasing threshold is optional (default 1).
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('-infile',    '--input-vcf-file',    type=str, help='Input VCF file',  required=True)
parser.add_argument('-bam',       '--bam-file',          type=str, help='BAM file',        required=True)
# -ref is declared required, so its default=None is never used.
parser.add_argument('-ref',       '--genome-reference',  type=str, help='.fasta file to get the ref base', required=True, default=None)
parser.add_argument('-outfile',   '--output-vcf-file',   type=str, help='Output VCF file', required=True)
parser.add_argument('-threshold', '--phasing-threshold', type=int, help='How far apart do we try to phase', required=False, default=1)

args = parser.parse_args()

# Unpack the parsed options into module-level names.
infile    = args.input_vcf_file
bam       = args.bam_file
ref_fa    = args.genome_reference
outfile   = args.output_vcf_file
threshold = args.phasing_threshold

# Open all four files together so they are closed when the block exits.
# Note: infile, bam, outfile and ref_fa are rebound here from path strings to
# the opened handles.
with genome.open_textfile(infile) as infile, \
pysam.AlignmentFile(bam) as bam, \
open(outfile, 'w') as outfile, \
pysam.FastaFile(ref_fa) as ref_fa:
    
    my_line = infile.readline().rstrip()
    
    # Copy the '##' meta-information lines through unchanged.
    while my_line.startswith('##'):
        outfile.write( my_line + '\n' )
        my_line = infile.readline().rstrip()
        
    # This is to read through and copy the #CHROM line
    # (the assert is skipped when Python runs with -O, so it is only a sanity check).
    assert my_line.startswith('#CHROM')
    # Write the header declarations for the COORDINATES and PDP INFO fields
    # just before the #CHROM line.
    outfile.write('##INFO=<ID=COORDINATES,Number=.,Type=Integer,Description="Coordinates of the bases">\n')
    outfile.write('##INFO=<ID=PDP,Number=.,Type=Integer,Description="Phased DP, one for reference, and each of the variant calls.">\n')
    outfile.write( my_line + '\n' )
コード例 #28
0
def vcfs2variants(vcf_files, bam_files, sample_names):
    """Collect the variants of several VCFs into one dictionary.

    The three arguments are parallel sequences: the i-th VCF is read together
    with the i-th BAM and recorded under the i-th sample name.

    Args:
        vcf_files: Paths of the VCF files (opened with
            ``genome.open_textfile``).
        bam_files: Paths of the BAM files (opened with
            ``pysam.AlignmentFile``).
        sample_names: Key under which each file's per-sample data is stored.

    Returns:
        A dict keyed by ``(contig, position, ref, alt)``.  Each value holds
        ``GENES``, ``AAChange``, ``TRANSCRIPT`` and ``DATABASE`` (taken from
        the first record seen for that variant) plus one entry per sample
        name with ``FILTER``, ``VAF``, ``VDP`` and ``DP``.  ``VAF`` is
        ``math.nan`` when the division by the total depth raises
        ``ZeroDivisionError``.

    Raises:
        AssertionError: If the three sequences differ in length.
    """
    assert len(vcf_files) == len(sample_names) == len(bam_files)

    collected = {}

    for vcf_path, bam_path, sample in zip(vcf_files, bam_files, sample_names):

        with genome.open_textfile(vcf_path) as reader, \
                pysam.AlignmentFile(bam_path) as alignments:

            record = reader.readline().rstrip()

            # Skip the header.
            while record.startswith('#'):
                record = reader.readline().rstrip()

            while record:

                # The parsed object is not used, but the original constructs
                # it for every record, so the call is kept.
                genome.Vcf_line(record)

                fields = record.split('\t')
                contig = fields[0]
                position = int(fields[1])
                ref = fields[3]
                alt = fields[4]
                filters = fields[6].split(';')

                genes, aa_changes, transcripts = extract_snpEff(record)
                database_ids = extract_dbsnp_cosmic(record)

                key = (contig, position, ref, alt)

                variant_depth, _ref_depth, _other_depth, total_depth = vaf_from_bam(
                    alignments, (contig, position), ref, alt, 1)

                try:
                    fraction = variant_depth / total_depth
                except ZeroDivisionError:
                    fraction = math.nan

                # Annotations are stored only the first time a variant is seen.
                entry = collected.setdefault(key, {
                    'GENES': genes,
                    'AAChange': aa_changes,
                    'TRANSCRIPT': transcripts,
                    'DATABASE': database_ids,
                })

                entry[sample] = {
                    'FILTER': filters,
                    'VAF': fraction,
                    'VDP': variant_depth,
                    'DP': total_depth,
                }

                record = reader.readline().rstrip()

    return collected
コード例 #29
0
def convert(infile, outfile):
    """Rewrite a VarScan2 VCF file into a form that VCF validators accept.

    Header lines are copied to ``outfile``, except that the ``DP4`` and ``AD``
    FORMAT definitions are replaced with fixed descriptions. Each variant
    record is then cleaned up:

    * ``/`` separators in ALT are turned into ``,``;
    * REF is truncated at its first non-word character;
    * characters other than word characters, ``,`` and ``.`` are removed
      from ALT;
    * duplicated ALT alleles are removed, as are ALT alleles equal to REF and
      secondary ALT alleles that do not start with REF;
    * separate ``RD`` and ``AD`` sample fields are merged into a single
      ``AD`` field formatted as ``RD,AD``.

    Records whose (cleaned) REF contains anything other than G, C, T, A or U
    (case-insensitive) are not written.

    Args:
        infile: Path of the input VCF, opened with ``genome.open_textfile``.
        outfile: Path of the VCF file to write.

    Raises:
        Exception: If a variant record carries no sample columns.
    """

    with genome.open_textfile(infile) as vcf, open(outfile, 'w') as vcfout:

        line_i = vcf.readline().rstrip()

        # Copy the header, substituting the DP4 and AD FORMAT definitions:
        while line_i.startswith('#'):

            if line_i.startswith('##FORMAT=<ID=DP4,'):
                line_i = '##FORMAT=<ID=DP4,Number=4,Type=Integer,Description="# high-quality ref-forward bases, ref-reverse, alt-forward and alt-reverse bases">'

            elif line_i.startswith('##FORMAT=<ID=AD,'):
                line_i = '##FORMAT=<ID=AD,Number=.,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed">'

            vcfout.write( line_i + '\n')

            line_i = vcf.readline().rstrip()

        # Doing the work here:
        while line_i:

            vcf_i = genome.Vcf_line(line_i)

            num_samples = len( vcf_i.samples )
            if num_samples == 0:
                raise Exception('No sample information here.')

            elif num_samples == 1:
                paired = False

            else:
                if num_samples > 2:
                    sys.stderr.write('We found more than 2 sammples in this VCF file. It may be messed up, but I\'ll just assume the first 2 samples mean anything at all')
                paired = True

            # Replace the wrong "G/A" with the correct "G,A" in ALT column:
            vcf_i.altbase = vcf_i.altbase.replace('/', ',')

            # vcf-validator is not going to accept multiple sequences in the REF, as is the case in VarScan2's indel output:
            vcf_i.refbase = re.sub( r'[^\w].*$', '', vcf_i.refbase )

            # Get rid of non-compliant characters in the ALT column:
            vcf_i.altbase = re.sub(r'[^\w,.]', '', vcf_i.altbase)

            # Eliminate duplicate entries in ALT, keeping the first occurrence.
            # Whole alleles are compared, so "A,AT" is left untouched (a
            # prefix-matching regex would wrongly collapse it into "AT").
            alt_item = []
            for alt_i in vcf_i.altbase.split(','):
                if alt_i not in alt_item:
                    alt_item.append(alt_i)

            if len(alt_item) > 1:

                # Eliminate ALT entries when it matches with the REF column, to address vcf-validator complaints:
                alt_item = [alt_i for alt_i in alt_item if alt_i != vcf_i.refbase]

                # To fix this vcf-validator complaints:
                # Could not parse the allele(s) [GTC], first base does not match the reference
                # The first allele is always kept. The list is rebuilt rather
                # than popped in place, so indices never go stale.
                alt_item = alt_item[:1] + [
                    alt_i for alt_i in alt_item[1:]
                    if alt_i.startswith( vcf_i.refbase )
                ]

            vcf_i.altbase = ','.join(alt_item)

            # Combine AD:RD into AD:
            format_items = vcf_i.get_sample_variable()
            if 'AD' in format_items and 'RD' in format_items:

                rd_sm1 = vcf_i.get_sample_value('RD', 0)
                ad_sm1 = vcf_i.get_sample_value('AD', 0)

                try:
                    rd_sm2 = vcf_i.get_sample_value('RD', 1)
                    ad_sm2 = vcf_i.get_sample_value('AD', 1)
                except IndexError:
                    rd_sm2 = ad_sm2 = 0

                # Indices are taken before RD is removed from the FORMAT list;
                # each sample is updated at idx_ad first and only then loses
                # its RD entry, so both indices stay valid.
                idx_ad = format_items.index('AD')
                idx_rd = format_items.index('RD')
                format_items.pop(idx_rd)
                vcf_i.field = ':'.join(format_items)

                item_normal = vcf_i.samples[0].split(':')
                item_normal[idx_ad] = '{},{}'.format( rd_sm1, ad_sm1 )
                item_normal.pop(idx_rd)
                vcf_i.samples[0] = ':'.join(item_normal)

                if paired:

                    item_tumor = vcf_i.samples[1].split(':')
                    item_tumor[idx_ad] = '{},{}'.format( rd_sm2, ad_sm2 )
                    item_tumor.pop(idx_rd)
                    vcf_i.samples[1] = ':'.join(item_tumor)

            # Reform the line:
            line_i = '\t'.join(( vcf_i.chromosome, str(vcf_i.position), vcf_i.identifier, vcf_i.refbase, vcf_i.altbase, vcf_i.qual, vcf_i.filters, vcf_i.info, vcf_i.field, '\t'.join((vcf_i.samples)) ))

            # VarScan2 output a line with REF allele as "M". GATK CombineVariants complain about that.
            if not re.search(r'[^GCTAU]', vcf_i.refbase, re.I):
                vcfout.write(line_i+'\n')

            # Next line:
            line_i = vcf.readline().rstrip()
コード例 #30
0
# Command-line options for this script. `parser` is created outside this
# excerpt.
parser.add_argument('-infile',  '--vcf-in',   type=str, help='VCF in', required=True)
parser.add_argument('-outfile', '--vcf-out',  type=str, help='VCF out', required=True)
parser.add_argument('-callers', '--callers-classification-string', type=str, help='MVJSD or whatever',  required=True)
parser.add_argument('-tumor',   '--tumor-sample-name', type=str, help='tumor sample name',  required=False, default='TUMOR')
parser.add_argument('-trained', '--somaticseq-trained',    action='store_true', help='If true, will use the QUAL as SomaticSeq score. Otherwise, SCORE will be .', required=False, default=False)


args = parser.parse_args()

# Unpack the parsed options into module-level names.
vcf_in_fn  = args.vcf_in
vcf_out_fn = args.vcf_out
caller_string = args.callers_classification_string
tumor = args.tumor_sample_name
somaticseq_trained = args.somaticseq_trained

with genome.open_textfile(vcf_in_fn) as vcfin, open(vcf_out_fn, 'w') as vcfout:
    
    line_in = vcfin.readline().rstrip('\n')
    
    # Rewrite the '##' meta-information lines.
    # NOTE(review): nothing in the visible lines advances `line_in` inside
    # this loop; the excerpt appears to be cut off before the next readline().
    while line_in.startswith('##'):
        
        # Tag the SomaticSeq version line with a '-SEQC2' suffix.
        if line_in.startswith('##SomaticSeq='):
            line_out = line_in + '-SEQC2'
            
        # Re-declare NUM_TOOLS and the caller-combination key (named by
        # caller_string) as FORMAT fields instead of INFO fields.
        elif line_in.startswith('##INFO=<ID=NUM_TOOLS') or line_in.startswith('##INFO=<ID={COMBO}'.format(COMBO=caller_string)):
            line_out = re.sub('##INFO=', '##FORMAT=', line_in)
            
        else:
            line_out = line_in
        
        vcfout.write( line_out + '\n' )
コード例 #31
0
def convert(infile, snv_out, indel_out):
    """Split a single-sample MuTect2 VCF into separate SNV and indel VCFs.

    Every header line (the ``##`` lines and the column-header line that
    follows them) is written exactly once to both outputs. A record goes to
    the SNV file when REF and ALT are both one base long, and to the indel
    file when exactly one of them is one base long; other records are
    dropped. Multi-allelic records are split into one record per ALT allele,
    with the ``NLOD``/``TLOD`` INFO values split per allele and ``STR``/
    ``ECNT`` carried over. A FILTER value of exactly ``germline_risk`` is
    replaced with ``PASS``.

    Args:
        infile: Path of the input VCF, opened with ``genome.open_textfile``.
        snv_out: Path of the SNV VCF file to write.
        indel_out: Path of the indel VCF file to write.
    """

    info_to_split = 'NLOD', 'TLOD'
    info_to_keep = 'STR', 'ECNT'

    with genome.open_textfile(infile) as vcf_in, open(snv_out, 'w') as snv_handle, open(indel_out, 'w') as indel_handle:

        line_i = vcf_in.readline().rstrip()

        # Copy each '##' line once. Writing the freshly read line inside this
        # loop as well would duplicate every header line after the first.
        while line_i.startswith('##'):

            snv_handle.write( line_i + '\n' )
            indel_handle.write( line_i + '\n' )

            line_i = vcf_in.readline().rstrip()

        # This line will be #CHROM; skip the write for an empty input so that
        # no stray blank line is produced:
        if line_i:
            snv_handle.write( line_i + '\n' )
            indel_handle.write( line_i + '\n' )

        # This will be the first variant line:
        line_i = vcf_in.readline().rstrip()

        while line_i:

            vcf_i = genome.Vcf_line( line_i )

            # If "germlinerisk" is the only flag, then make it PASS since there is no matched normal
            if vcf_i.filters == 'germline_risk':
                vcf_i.filters = 'PASS'

            if ',' not in vcf_i.altbase:

                # The original line is re-emitted as is, apart from FILTER.
                item = line_i.split('\t')
                if item[6] == 'germline_risk':
                    item[6] = 'PASS'

                new_line = '\t'.join( item )

                if len(vcf_i.refbase) == 1 and len(vcf_i.altbase) == 1:
                    snv_handle.write( new_line + '\n' )
                elif len(vcf_i.refbase) == 1 or len(vcf_i.altbase) == 1:
                    indel_handle.write( new_line + '\n' )

            else:
                alt_bases = vcf_i.altbase.split(',')
                measures = []
                still_measures = []

                # Per-allele INFO values; None marks a key whose value could
                # not be split.
                for measure_i in info_to_split:
                    try:
                        measures.append( vcf_i.get_info_value(measure_i).split(',') )
                    except AttributeError:
                        measures.append( None )

                for measure_i in info_to_keep:
                    try:
                        still_measures.append( vcf_i.get_info_value(measure_i) )
                    except AttributeError:
                        still_measures.append( None )

                # NOTE(review): the comparison with False is kept from the
                # original; presumably get_info_value returns False for an
                # absent key - confirm against genome.Vcf_line.
                still_infos = [ '{}={}'.format(info_variable, info_value) for info_variable, info_value in zip(info_to_keep, still_measures) if info_value != False ]

                # The sample column does not depend on the ALT allele, so it
                # is computed once per record. Any genotype other than 0/0 or
                # 0/1 is rewritten as 0/1.
                GT0 = vcf_i.get_sample_value('GT', idx=0)
                if GT0 != '0/0' and GT0 != '0/1':
                    sample_0 = re.sub(r'^[^:]+', '0/1', vcf_i.samples[0])
                else:
                    sample_0 = vcf_i.samples[0]

                for ith_base, altbase_i in enumerate(alt_bases):

                    split_infos = [ '{}={}'.format(info_variable, info_value[ith_base]) for info_variable, info_value in zip(info_to_split, measures) if info_value is not None ]

                    split_infos.extend(still_infos)

                    info_string = ';'.join( split_infos )

                    new_line = '\t'.join(( vcf_i.chromosome, str(vcf_i.position), vcf_i.identifier, vcf_i.refbase, altbase_i, vcf_i.qual, vcf_i.filters, info_string, vcf_i.field, sample_0 ))

                    if len(vcf_i.refbase) == 1 and len(altbase_i) == 1:
                        snv_handle.write( new_line + '\n' )
                    elif len(vcf_i.refbase) == 1 or len(altbase_i) == 1:
                        indel_handle.write( new_line + '\n')

            line_i = vcf_in.readline().rstrip()
コード例 #32
0
# Unpack the filtering thresholds from the parsed command-line arguments.
# `args`, `infile`, `outfile` and `sample` are defined outside this excerpt.
min_refMQ = args.min_refMQ
min_altMQ = args.min_altMQ
min_refBQ = args.min_refBQ
min_altBQ = args.min_altBQ
max_refNM = args.max_refNM
max_altNM = args.max_altNM
max_fetSB = args.max_fetSB
max_fetCD = args.max_fetCD
max_zMQ = args.max_zMQ
max_zBQ = args.max_zBQ
max_MQ0 = args.max_MQ0
min_VAF = args.min_VAF
min_DP = args.min_DP
min_varDP = args.min_varDP

with genome.open_textfile(infile) as vcf_in, open(outfile, 'w') as vcf_out:

    line_i = vcf_in.readline().rstrip()

    # Copy the '##' meta-information lines through unchanged.
    while line_i.startswith('##'):

        vcf_out.write(line_i + '\n')
        line_i = vcf_in.readline().rstrip()

    # The first line that does not start with '##' is copied as well.
    vcf_out.write(line_i + '\n')

    # This line will be #CHROM:
    header = line_i.split('\t')
    # Position of `sample` among the sample columns; the subtraction of 9
    # skips the leading non-sample columns of the header line.
    sample_index = header.index(sample) - 9

    # This will be the first variant line:
コード例 #33
0
def vcf2tsv(is_vcf=None,
            is_bed=None,
            is_pos=None,
            bam_fn=None,
            truth=None,
            cosmic=None,
            dbsnp=None,
            mutect=None,
            varscan=None,
            vardict=None,
            lofreq=None,
            scalpel=None,
            strelka=None,
            dedup=True,
            min_mq=1,
            min_bq=5,
            min_caller=0,
            ref_fa=None,
            p_scale=None,
            outfile=None):
    """Write a TSV of per-site features for single-sample variant calls.

    The sites to evaluate come from the first of ``is_vcf``, ``is_bed`` and
    ``is_pos`` that is set; when none is given, every position listed in the
    ``.fai`` index of ``ref_fa`` is evaluated. For each site the function
    looks up the same coordinate in the optional caller, truth, dbSNP and
    COSMIC VCF files, extracts read-level features from the BAM file and
    sequence features from the reference, and writes one row per variant
    using the module-level ``out_header`` template.

    Args:
        is_vcf: Path of a VCF file listing the sites, or None.
        is_bed: Path of a BED file listing the regions, or None.
        is_pos: Path of a tab-separated contig/position file, or None.
        bam_fn: Path of the BAM file to extract features from.
        truth: Optional path of a ground-truth VCF.
        cosmic: Optional path of a COSMIC VCF.
        dbsnp: Optional path of a dbSNP VCF.
        mutect, varscan, vardict, lofreq, scalpel, strelka: Optional paths
            of the respective caller VCF files.
        dedup: Not used in this function body.
        min_mq: Passed to ``sequencing_features.from_bam``.
        min_bq: Passed to ``sequencing_features.from_bam``.
        min_caller: A row is written only when the summed caller
            classifications reach this value.
        ref_fa: Path of the reference FASTA; ``ref_fa + '.fai'`` must exist.
        p_scale: ``'phred'`` or ``'fraction'`` (case-insensitive) to
            re-scale the values passed through ``rescale``; anything else
            disables re-scaling.
        outfile: Path of the TSV file to write.

    Raises:
        Exception: If the input VCF is not sorted according to the contig
            order of the reference index.
    """

    # Convert contig_sequence to chrom_seq dict:
    fai_file = ref_fa + '.fai'
    chrom_seq = genome.faiordict2contigorder(fai_file, 'fai')

    # Determine input format:
    if is_vcf:
        mysites = is_vcf
    elif is_bed:
        mysites = is_bed
    elif is_pos:
        mysites = is_pos
    else:
        mysites = fai_file
        logger.info('No position supplied. Will evaluate the whole genome.')

    # Re-scale output or not:
    if p_scale is None:
        logger.info('NO RE-SCALING')
    elif p_scale.lower() == 'phred':
        p_scale = 'phred'
    elif p_scale.lower() == 'fraction':
        p_scale = 'fraction'
    else:
        p_scale = None
        logger.info('NO RE-SCALING')

    # Define NaN and Inf:
    nan = float('nan')
    inf = float('inf')
    pattern_chr_position = genome.pattern_chr_position

    ## Running
    with genome.open_textfile(mysites) as my_sites, open(outfile,
                                                         'w') as outhandle:

        my_line = my_sites.readline().rstrip()

        # From here on `ref_fa` and the annotation/caller arguments are
        # rebound from paths to open file handles.
        bam = pysam.AlignmentFile(bam_fn, reference_filename=ref_fa)
        ref_fa = pysam.FastaFile(ref_fa)

        if truth:
            truth = genome.open_textfile(truth)
            truth_line = genome.skip_vcf_header(truth)

        if cosmic:
            cosmic = genome.open_textfile(cosmic)
            cosmic_line = genome.skip_vcf_header(cosmic)

        if dbsnp:
            dbsnp = genome.open_textfile(dbsnp)
            dbsnp_line = genome.skip_vcf_header(dbsnp)

        # 6 Incorporate callers: get thru the #'s
        if mutect:
            mutect = genome.open_textfile(mutect)
            mutect_line = genome.skip_vcf_header(mutect)

        if varscan:
            varscan = genome.open_textfile(varscan)
            varscan_line = genome.skip_vcf_header(varscan)

        if vardict:
            vardict = genome.open_textfile(vardict)
            vardict_line = genome.skip_vcf_header(vardict)

        if lofreq:
            lofreq = genome.open_textfile(lofreq)
            lofreq_line = genome.skip_vcf_header(lofreq)

        if scalpel:
            scalpel = genome.open_textfile(scalpel)
            scalpel_line = genome.skip_vcf_header(scalpel)

        if strelka:
            strelka = genome.open_textfile(strelka)
            strelka_line = genome.skip_vcf_header(strelka)

        # Get through all the headers:
        while my_line.startswith('#') or my_line.startswith('track='):
            my_line = my_sites.readline().rstrip()

        # First coordinate, for later purpose of making sure the input is sorted properly
        coordinate_i = re.match(genome.pattern_chr_position, my_line)
        coordinate_i = coordinate_i.group() if coordinate_i else ''

        # First line:
        outhandle.write(out_header.replace('{', '').replace('}', '') + '\n')

        while my_line:

            # If VCF, get all the variants with the same coordinate into a list:
            if is_vcf:

                my_vcf = genome.Vcf_line(my_line)
                my_coordinates = [(my_vcf.chromosome, my_vcf.position)]

                variants_at_my_coordinate = []

                alt_bases = my_vcf.altbase.split(',')
                for alt_i in alt_bases:
                    vcf_i = copy(my_vcf)
                    vcf_i.altbase = alt_i
                    variants_at_my_coordinate.append(vcf_i)

                # As long as the "coordinate" stays the same, it will keep reading until it's different.
                while my_coordinates[0] == (my_vcf.chromosome,
                                            my_vcf.position):

                    my_line = my_sites.readline().rstrip()
                    my_vcf = genome.Vcf_line(my_line)

                    ########## This block is code is to ensure the input VCF file is properly sorted ##
                    coordinate_j = re.match(genome.pattern_chr_position,
                                            my_line)
                    coordinate_j = coordinate_j.group() if coordinate_j else ''

                    if genome.whoisbehind(coordinate_i, coordinate_j,
                                          chrom_seq) == 1:
                        raise Exception(
                            '{} does not seem to be properly sorted.'.format(
                                mysites))

                    coordinate_i = coordinate_j
                    ###################################################################################

                    if my_coordinates[0] == (my_vcf.chromosome,
                                             my_vcf.position):

                        alt_bases = my_vcf.altbase.split(',')
                        for alt_i in alt_bases:

                            vcf_i = copy(my_vcf)
                            vcf_i.altbase = alt_i
                            variants_at_my_coordinate.append(vcf_i)

            elif is_bed:
                bed_item = my_line.split('\t')
                my_coordinates = genomic_coordinates(bed_item[0],
                                                     int(bed_item[1]) + 1,
                                                     int(bed_item[2]))

            elif is_pos:
                pos_item = my_line.split('\t')
                my_coordinates = genomic_coordinates(pos_item[0],
                                                     int(pos_item[1]),
                                                     int(pos_item[1]))

            elif fai_file:
                fai_item = my_line.split('\t')
                my_coordinates = genomic_coordinates(fai_item[0], 1,
                                                     int(fai_item[1]))

            ##### ##### ##### ##### ##### #####
            for my_coordinate in my_coordinates:

                ######## If VCF, can get ref base, variant base, as well as other identifying information ########
                if is_vcf:

                    ref_bases = []
                    alt_bases = []
                    indel_lengths = []
                    all_my_identifiers = []

                    for variant_i in variants_at_my_coordinate:

                        ref_base = variant_i.refbase
                        first_alt = variant_i.altbase.split(',')[0]
                        indel_length = len(first_alt) - len(ref_base)

                        ref_bases.append(ref_base)
                        alt_bases.append(first_alt)
                        indel_lengths.append(indel_length)

                        # Extract these information if they exist in the VCF file, but they could be re-written if dbSNP/COSMIC are supplied.
                        if_dbsnp = 1 if re.search(r'rs[0-9]+',
                                                  variant_i.identifier) else 0
                        if_cosmic = 1 if re.search(r'COS[MN][0-9]+',
                                                   variant_i.identifier) else 0
                        if_common = 1 if variant_i.get_info_value(
                            'COMMON') == '1' else 0
                        num_cases = variant_i.get_info_value(
                            'CNT') if variant_i.get_info_value('CNT') else nan

                        if variant_i.identifier == '.':
                            my_identifier_i = set()
                        else:
                            my_identifier_i = variant_i.identifier.split(';')
                            my_identifier_i = set(my_identifier_i)

                        all_my_identifiers.append(my_identifier_i)

                ## If not, 1) get ref_base, first_alt from other VCF files.
                #          2) Create placeholders for dbSNP and COSMIC that can be overwritten with dbSNP/COSMIC VCF files (if provided)
                else:
                    variants_at_my_coordinate = [
                        None
                    ]  # Just to have something to iterate
                    ref_base = first_alt = indel_length = None

                    # Could be re-written if dbSNP/COSMIC are supplied. If not, they will remain NaN.
                    if_dbsnp = if_cosmic = if_common = num_cases = nan

                #################################### Find the same coordinate in those VCF files ####################################
                if mutect:
                    got_mutect, mutect_variants, mutect_line = genome.find_vcf_at_coordinate(
                        my_coordinate, mutect_line, mutect, chrom_seq)
                if varscan:
                    got_varscan, varscan_variants, varscan_line = genome.find_vcf_at_coordinate(
                        my_coordinate, varscan_line, varscan, chrom_seq)
                if vardict:
                    got_vardict, vardict_variants, vardict_line = genome.find_vcf_at_coordinate(
                        my_coordinate, vardict_line, vardict, chrom_seq)
                if lofreq:
                    got_lofreq, lofreq_variants, lofreq_line = genome.find_vcf_at_coordinate(
                        my_coordinate, lofreq_line, lofreq, chrom_seq)
                if scalpel:
                    got_scalpel, scalpel_variants, scalpel_line = genome.find_vcf_at_coordinate(
                        my_coordinate, scalpel_line, scalpel, chrom_seq)
                if strelka:
                    got_strelka, strelka_variants, strelka_line = genome.find_vcf_at_coordinate(
                        my_coordinate, strelka_line, strelka, chrom_seq)
                if truth:
                    got_truth, truth_variants, truth_line = genome.find_vcf_at_coordinate(
                        my_coordinate, truth_line, truth, chrom_seq)
                if dbsnp:
                    got_dbsnp, dbsnp_variants, dbsnp_line = genome.find_vcf_at_coordinate(
                        my_coordinate, dbsnp_line, dbsnp, chrom_seq)
                if cosmic:
                    got_cosmic, cosmic_variants, cosmic_line = genome.find_vcf_at_coordinate(
                        my_coordinate, cosmic_line, cosmic, chrom_seq)

                # Now, use pysam to look into the tBAM file(s), variant by variant from the input:
                for ith_call, my_call in enumerate(variants_at_my_coordinate):

                    if is_vcf:
                        # The particular line in the input VCF file:
                        variant_id = ((my_call.chromosome, my_call.position),
                                      my_call.refbase, my_call.altbase)

                        ref_base = ref_bases[ith_call]
                        first_alt = alt_bases[ith_call]
                        indel_length = indel_lengths[ith_call]
                        my_identifiers = all_my_identifiers[ith_call]

                    else:
                        variant_id = ((my_coordinate[0], my_coordinate[1]),
                                      ref_base, first_alt)

                        # No ID column is available for BED/position/whole-genome
                        # input. Start every record from a fresh empty set so the
                        # .add() calls and the join below always have a set to
                        # work on, instead of an unbound name or the joined
                        # string left over from the previous record.
                        my_identifiers = set()

                    # Reset num_caller to 0 for each variant in the same coordinate
                    num_callers = 0

                    #################### Collect Caller Vcf ####################:
                    if mutect:
                        mutect_classification, tlod, ecnt = annotate_caller.ssMuTect(
                            variant_id, mutect_variants)
                        num_callers += mutect_classification
                    else:
                        mutect_classification = tlod = ecnt = nan

                    if varscan:
                        varscan_classification, score_varscan2 = annotate_caller.ssVarScan(
                            variant_id, varscan_variants)
                        num_callers += varscan_classification
                    else:
                        varscan_classification = score_varscan2 = nan

                    if vardict:
                        vardict_classification, msi, msilen, shift3, t_pmean, t_pstd, t_qstd = annotate_caller.ssVarDict(
                            variant_id, vardict_variants)
                        num_callers += vardict_classification
                    else:
                        vardict_classification = msi = msilen = shift3 = t_pmean = t_pstd = t_qstd = nan

                    if lofreq:
                        lofreq_classification = annotate_caller.ssLoFreq(
                            variant_id, lofreq_variants)
                        num_callers += lofreq_classification
                    else:
                        lofreq_classification = nan

                    if scalpel:
                        scalpel_classification = annotate_caller.ssScalpel(
                            variant_id, scalpel_variants)
                        num_callers += scalpel_classification
                    else:
                        scalpel_classification = nan

                    if strelka:
                        strelka_classification = annotate_caller.ssStrelka(
                            variant_id, strelka_variants)
                        num_callers += strelka_classification
                    else:
                        strelka_classification = nan

                    # Potentially write the output only if it meets this threshold:
                    if num_callers >= min_caller:

                        ########## Ground truth file ##########
                        if truth:
                            if variant_id in truth_variants.keys():
                                judgement = 1
                                my_identifiers.add('TruePositive')
                            else:
                                judgement = 0
                                my_identifiers.add('FalsePositive')
                        else:
                            judgement = nan

                        ########## dbSNP ########## Will overwrite dbSNP info from input VCF file
                        if dbsnp:
                            if_dbsnp, if_common, rsID = annotate_caller.dbSNP(
                                variant_id, dbsnp_variants)
                            for ID_i in rsID:
                                my_identifiers.add(ID_i)

                        ########## COSMIC ########## Will overwrite COSMIC info from input VCF file
                        if cosmic:
                            if_cosmic, num_cases, cosmicID = annotate_caller.COSMIC(
                                variant_id, cosmic_variants)
                            for ID_i in cosmicID:
                                my_identifiers.add(ID_i)

                        ########## ######### INFO EXTRACTION FROM BAM FILES ########## #########
                        # Tumor tBAM file:
                        tBamFeatures = sequencing_features.from_bam(
                            bam, my_coordinate, ref_base, first_alt, min_mq,
                            min_bq)

                        # Homopolymer eval:
                        homopolymer_length, site_homopolymer_length = sequencing_features.from_genome_reference(
                            ref_fa, my_coordinate, ref_base, first_alt)

                        # Linguistic sequence complexity in a +/-80bp window, but substring calculation stops at 20-bp substring.
                        seq_span_80bp = ref_fa.fetch(
                            my_coordinate[0], max(0, my_coordinate[1] - 41),
                            my_coordinate[1] + 40)
                        seq_left_80bp = ref_fa.fetch(
                            my_coordinate[0], max(0, my_coordinate[1] - 81),
                            my_coordinate[1])
                        seq_right_80bp = ref_fa.fetch(my_coordinate[0],
                                                      my_coordinate[1],
                                                      my_coordinate[1] + 81)

                        if len(seq_span_80bp) > 20:
                            LC_spanning = sequencing_features.subLC(
                                seq_span_80bp, 20)
                        else:
                            LC_spanning = math.nan

                        if len(seq_left_80bp) > 20:
                            left_LC = sequencing_features.subLC(
                                seq_left_80bp, 20)
                        else:
                            left_LC = math.nan

                        if len(seq_right_80bp) > 20:
                            right_LC = sequencing_features.subLC(
                                seq_right_80bp, 20)
                        else:
                            right_LC = math.nan

                        LC_adjacent = min(left_LC, right_LC)

                        LC_spanning_phred = genome.p2phred(1 - LC_spanning, 40)
                        LC_adjacent_phred = genome.p2phred(1 - LC_adjacent, 40)

                        # Fill the ID field of the TSV/VCF
                        my_identifiers = ';'.join(
                            my_identifiers) if my_identifiers else '.'

                        ###
                        out_line = out_header.format(
                            CHROM                      = my_coordinate[0],
                            POS                        = my_coordinate[1],
                            ID                         = my_identifiers,
                            REF                        = ref_base,
                            ALT                        = first_alt,
                            if_MuTect                  = mutect_classification,
                            if_Strelka                 = strelka_classification,
                            if_VarScan2                = varscan_classification,
                            if_VarDict                 = vardict_classification,
                            if_LoFreq                  = lofreq_classification,
                            if_Scalpel                 = scalpel_classification,
                            VarScan2_Score             = rescale(score_varscan2,      'phred', p_scale, 1001),
                            if_dbsnp                   = if_dbsnp,
                            COMMON                     = if_common,
                            if_COSMIC                  = if_cosmic,
                            COSMIC_CNT                 = num_cases,
                            Consistent_Mates           = tBamFeatures['consistent_mates'],
                            Inconsistent_Mates         = tBamFeatures['inconsistent_mates'],
                            Seq_Complexity_Span        = LC_spanning_phred,
                            Seq_Complexity_Adj         = LC_adjacent_phred,
                            M2_TLOD                    = tlod,
                            M2_ECNT                    = ecnt,
                            MSI                        = msi,
                            MSILEN                     = msilen,
                            SHIFT3                     = shift3,
                            MaxHomopolymer_Length      = homopolymer_length,
                            SiteHomopolymer_Length     = site_homopolymer_length,
                            T_DP                       = tBamFeatures['dp'],
                            tBAM_REF_MQ                = '%g' % tBamFeatures['ref_mq'],
                            tBAM_ALT_MQ                = '%g' % tBamFeatures['alt_mq'],
                            tBAM_p_MannWhitneyU_MQ     = '%g' % tBamFeatures['p_mannwhitneyu_mq'],
                            tBAM_REF_BQ                = '%g' % tBamFeatures['ref_bq'],
                            tBAM_ALT_BQ                = '%g' % tBamFeatures['alt_bq'],
                            tBAM_p_MannWhitneyU_BQ     = '%g' % tBamFeatures['p_mannwhitneyu_bq'],
                            tBAM_REF_NM                = '%g' % tBamFeatures['ref_NM'],
                            tBAM_ALT_NM                = '%g' % tBamFeatures['alt_NM'],
                            tBAM_NM_Diff               = '%g' % tBamFeatures['NM_Diff'],
                            tBAM_REF_Concordant        = tBamFeatures['ref_concordant_reads'],
                            tBAM_REF_Discordant        = tBamFeatures['ref_discordant_reads'],
                            tBAM_ALT_Concordant        = tBamFeatures['alt_concordant_reads'],
                            tBAM_ALT_Discordant        = tBamFeatures['alt_discordant_reads'],
                            tBAM_Concordance_FET       = rescale(tBamFeatures['concordance_fet'], 'fraction', p_scale, 1001),
                            T_REF_FOR                  = tBamFeatures['ref_for'],
                            T_REF_REV                  = tBamFeatures['ref_rev'],
                            T_ALT_FOR                  = tBamFeatures['alt_for'],
                            T_ALT_REV                  = tBamFeatures['alt_rev'],
                            tBAM_StrandBias_FET        = rescale(tBamFeatures['strandbias_fet'], 'fraction', p_scale, 1001),
                            tBAM_p_MannWhitneyU_EndPos = '%g' % tBamFeatures['p_mannwhitneyu_endpos'],
                            tBAM_REF_Clipped_Reads     = tBamFeatures['ref_SC_reads'],
                            tBAM_ALT_Clipped_Reads     = tBamFeatures['alt_SC_reads'],
                            tBAM_Clipping_FET          = rescale(tBamFeatures['clipping_fet'], 'fraction', p_scale, 1001),
                            tBAM_MQ0                   = tBamFeatures['MQ0'],
                            tBAM_Other_Reads           = tBamFeatures['noise_read_count'],
                            tBAM_Poor_Reads            = tBamFeatures['poor_read_count'],
                            tBAM_REF_InDel_3bp         = tBamFeatures['ref_indel_3bp'],
                            tBAM_REF_InDel_2bp         = tBamFeatures['ref_indel_2bp'],
                            tBAM_REF_InDel_1bp         = tBamFeatures['ref_indel_1bp'],
                            tBAM_ALT_InDel_3bp         = tBamFeatures['alt_indel_3bp'],
                            tBAM_ALT_InDel_2bp         = tBamFeatures['alt_indel_2bp'],
                            tBAM_ALT_InDel_1bp         = tBamFeatures['alt_indel_1bp'],
                            InDel_Length               = indel_length,
                            TrueVariant_or_False       = judgement)

                        # Write the row to the output file:
                        outhandle.write(out_line + '\n')

            # Read into the next line (for VCF input the look-ahead loop
            # above has already advanced my_line):
            if not is_vcf:
                my_line = my_sites.readline().rstrip()

        ##########  Close all open files if they were opened  ##########
        opened_files = (ref_fa, bam, truth, cosmic, dbsnp, mutect, varscan,
                        vardict, lofreq, scalpel, strelka)
        for opened_file in opened_files:
            if opened_file:
                opened_file.close()
コード例 #34
0
ファイル: modify_MuTect2.py プロジェクト: zprh/somaticseq
def convert(infile, snv_out, indel_out, is_tnscope):
    """Split a MuTect2 (or TNscope) VCF file into an SNV file and an indel file.

    All ``##`` header lines and the ``#CHROM`` line are copied to both
    outputs; on the ``##INFO=<ID=SOR,`` declaration every ``Float`` is
    replaced with ``String``. Each record is then routed by allele length:
    REF and ALT both one base long goes to the SNV file, exactly one of them
    one base long goes to the indel file, and anything else is dropped.

    Multi-allelic records (comma in ALT) are split into one line per ALT
    allele. For those lines the INFO column is rebuilt from NLOD/TLOD (the
    value at the allele's position) plus STR/ECNT (copied as-is), only the
    first two sample columns are written, and any genotype other than
    ``0/0`` or ``0/1`` is rewritten as ``0/1``.

    Reading stops at end of file or at the first line that is empty after
    stripping trailing whitespace.

    Args:
        infile: Path of the input VCF, opened with ``genome.open_textfile``.
        snv_out: Path of the SNV output file (overwritten).
        indel_out: Path of the indel output file (overwritten).
        is_tnscope: When true, the ``##normal_sample=`` / ``##tumor_sample=``
            header lines are not required.

    Raises:
        ValueError: If ``is_tnscope`` is false and the normal or tumor sample
            name is not declared in the header or is absent from ``#CHROM``.
    """

    info_to_split = 'NLOD', 'TLOD'
    info_to_keep = 'STR', 'ECNT'
    plain_genotypes = '0/0', '0/1'

    # Remain None unless the header declares them.
    normal_name = tumor_name = None

    # Distinct handle names so the path arguments are not shadowed.
    with genome.open_textfile(infile) as vcf_in, \
            open(snv_out, 'w') as snv_fh, \
            open(indel_out, 'w') as indel_fh:

        def write_variant(refbase, altbase, text):
            # SNV: both alleles are a single base.
            # Indel: exactly one allele is a single base.
            # Everything else is intentionally not written anywhere.
            if len(refbase) == 1 and len(altbase) == 1:
                snv_fh.write(text + '\n')
            elif len(refbase) == 1 or len(altbase) == 1:
                indel_fh.write(text + '\n')

        line_i = vcf_in.readline().rstrip()

        while line_i.startswith('##'):

            # maxsplit=1 keeps sample names that themselves contain '='.
            if line_i.startswith('##normal_sample='):
                normal_name = line_i.split('=', 1)[1]

            if line_i.startswith('##tumor_sample='):
                tumor_name = line_i.split('=', 1)[1]

            if line_i.startswith('##INFO=<ID=SOR,'):
                line_i = re.sub(r'Float', 'String', line_i)

            snv_fh.write(line_i + '\n')
            indel_fh.write(line_i + '\n')

            line_i = vcf_in.readline().rstrip()

        # This line will be #CHROM:
        snv_fh.write(line_i + '\n')
        indel_fh.write(line_i + '\n')
        header = line_i.split('\t')

        if not is_tnscope:
            # The sample column positions are not used below, but a header
            # that does not declare both samples, or declares names missing
            # from the #CHROM line, is rejected up front.
            for label, sample_name in (('normal', normal_name),
                                       ('tumor', tumor_name)):
                if sample_name is None:
                    raise ValueError(
                        'No ##{}_sample= line in the header of {}'.format(
                            label, infile))
                if sample_name not in header:
                    raise ValueError(
                        '{} sample {!r} is not in the #CHROM line of {}'.
                        format(label, sample_name, infile))

        # This will be the first variant line:
        line_i = vcf_in.readline().rstrip()

        while line_i:

            vcf_i = genome.Vcf_line(line_i)

            if ',' not in vcf_i.altbase:
                # Single ALT allele: the record is passed through untouched.
                write_variant(vcf_i.refbase, vcf_i.altbase, line_i)

            else:
                alt_bases = vcf_i.altbase.split(',')

                # Per-allele INFO values; None marks a key whose value
                # cannot be split (get_info_value did not return a string).
                measures = []
                for measure_i in info_to_split:
                    try:
                        measures.append(
                            vcf_i.get_info_value(measure_i).split(','))
                    except AttributeError:
                        measures.append(None)

                still_measures = []
                for measure_i in info_to_keep:
                    try:
                        still_measures.append(vcf_i.get_info_value(measure_i))
                    except AttributeError:
                        still_measures.append(None)

                # Identical for every ALT allele, so built once per record.
                # None placeholders are skipped as well so that "KEY=None"
                # is never written; the loose "!= False" test is kept as the
                # original wrote it.
                still_infos = [
                    '{}={}'.format(info_variable, info_value)
                    for info_variable, info_value in zip(
                        info_to_keep, still_measures)
                    if info_value is not None and info_value != False
                ]

                # Each sample's genotype is judged on its own: anything
                # other than 0/0 or 0/1 becomes 0/1 in the split records.
                sample_columns = []
                for sample_idx in (0, 1):
                    genotype = vcf_i.get_sample_value('GT', idx=sample_idx)
                    sample_text = vcf_i.samples[sample_idx]
                    if genotype not in plain_genotypes:
                        sample_text = re.sub(r'^[^:]+', '0/1', sample_text)
                    sample_columns.append(sample_text)

                for ith_base, altbase_i in enumerate(alt_bases):

                    split_infos = [
                        '{}={}'.format(info_variable, info_value[ith_base])
                        for info_variable, info_value in zip(
                            info_to_split, measures)
                        if info_value is not None
                    ]

                    # '.' is the VCF placeholder for an empty INFO column.
                    info_string = ';'.join(split_infos + still_infos) or '.'

                    new_line = '\t'.join(
                        (vcf_i.chromosome, str(vcf_i.position),
                         vcf_i.identifier, vcf_i.refbase, altbase_i,
                         vcf_i.qual, vcf_i.filters, info_string, vcf_i.field,
                         sample_columns[0], sample_columns[1]))

                    write_variant(vcf_i.refbase, altbase_i, new_line)

            line_i = vcf_in.readline().rstrip()
コード例 #35
0
def vcf2tsv(is_vcf=None,
            is_bed=None,
            is_pos=None,
            nbam_fn=None,
            tbam_fn=None,
            truth=None,
            cosmic=None,
            dbsnp=None,
            mutect=None,
            varscan=None,
            jsm=None,
            sniper=None,
            vardict=None,
            muse=None,
            lofreq=None,
            scalpel=None,
            strelka=None,
            tnscope=None,
            platypus=None,
            dedup=True,
            min_mq=1,
            min_bq=5,
            min_caller=0,
            ref_fa=None,
            p_scale=None,
            outfile=None):

    # Convert contig_sequence to chrom_seq dict:
    fai_file = ref_fa + '.fai'
    chrom_seq = genome.faiordict2contigorder(fai_file, 'fai')

    # Determine input format:
    if is_vcf:
        mysites = is_vcf
    elif is_bed:
        mysites = is_bed
    elif is_pos:
        mysites = is_pos
    else:
        mysites = fai_file
        logger.info('No position supplied. Will evaluate the whole genome.')

    # Re-scale output or not:
    if p_scale == None:
        logger.info('NO RE-SCALING')
    elif p_scale.lower() == 'phred':
        p_scale = 'phred'
    elif p_scale.lower() == 'fraction':
        p_scale = 'fraction'
    else:
        p_scale = None
        logger.info('NO RE-SCALING')

        # Define NaN and Inf:
    nan = float('nan')
    inf = float('inf')
    pattern_chr_position = genome.pattern_chr_position

    ## Running
    with genome.open_textfile(mysites) as my_sites, open(outfile,
                                                         'w') as outhandle:

        my_line = my_sites.readline().rstrip()

        nbam = pysam.AlignmentFile(nbam_fn, reference_filename=ref_fa)
        tbam = pysam.AlignmentFile(tbam_fn, reference_filename=ref_fa)
        ref_fa = pysam.FastaFile(ref_fa)

        if truth:
            truth = genome.open_textfile(truth)
            truth_line = genome.skip_vcf_header(truth)

        if cosmic:
            cosmic = genome.open_textfile(cosmic)
            cosmic_line = genome.skip_vcf_header(cosmic)

        if dbsnp:
            dbsnp = genome.open_textfile(dbsnp)
            dbsnp_line = genome.skip_vcf_header(dbsnp)

        # 10 Incorporate callers: get thru the #'s
        if mutect:
            mutect = genome.open_textfile(mutect)
            mutect_line = genome.skip_vcf_header(mutect)

        if varscan:
            varscan = genome.open_textfile(varscan)
            varscan_line = genome.skip_vcf_header(varscan)

        if jsm:
            jsm = genome.open_textfile(jsm)
            jsm_line = genome.skip_vcf_header(jsm)

        if sniper:
            sniper = genome.open_textfile(sniper)
            sniper_line = genome.skip_vcf_header(sniper)

        if vardict:
            vardict = genome.open_textfile(vardict)
            vardict_line = genome.skip_vcf_header(vardict)

        if muse:
            muse = genome.open_textfile(muse)
            muse_line = genome.skip_vcf_header(muse)

        if lofreq:
            lofreq = genome.open_textfile(lofreq)
            lofreq_line = genome.skip_vcf_header(lofreq)

        if scalpel:
            scalpel = genome.open_textfile(scalpel)
            scalpel_line = genome.skip_vcf_header(scalpel)

        if strelka:
            strelka = genome.open_textfile(strelka)
            strelka_line = genome.skip_vcf_header(strelka)

        if tnscope:
            tnscope = genome.open_textfile(tnscope)
            tnscope_line = genome.skip_vcf_header(tnscope)

        if platypus:
            platypus = genome.open_textfile(platypus)
            platypus_line = genome.skip_vcf_header(platypus)

        # Get through all the headers:
        while my_line.startswith('#') or my_line.startswith('track='):
            my_line = my_sites.readline().rstrip()

        # First coordinate, for later purpose of making sure the input is sorted properly
        coordinate_i = re.match(genome.pattern_chr_position, my_line)
        coordinate_i = coordinate_i.group() if coordinate_i else ''

        # First line:
        outhandle.write(out_header.replace('{', '').replace('}', '') + '\n')

        while my_line:

            # If VCF, get all the variants with the same coordinate into a list:
            if is_vcf:

                my_vcf = genome.Vcf_line(my_line)

                my_coordinates = [(my_vcf.chromosome, my_vcf.position)]

                variants_at_my_coordinate = []

                alt_bases = my_vcf.altbase.split(',')
                for alt_i in alt_bases:
                    vcf_i = copy(my_vcf)
                    vcf_i.altbase = alt_i
                    variants_at_my_coordinate.append(vcf_i)

                # As long as the "coordinate" stays the same, it will keep reading until it's different.
                while my_coordinates[0] == (my_vcf.chromosome,
                                            my_vcf.position):

                    my_line = my_sites.readline().rstrip()
                    my_vcf = genome.Vcf_line(my_line)

                    ########## This block is code is to ensure the input VCF file is properly sorted ##
                    coordinate_j = re.match(genome.pattern_chr_position,
                                            my_line)
                    coordinate_j = coordinate_j.group() if coordinate_j else ''

                    if genome.whoisbehind(coordinate_i, coordinate_j,
                                          chrom_seq) == 1:
                        raise Exception(
                            '{} does not seem to be properly sorted.'.format(
                                mysites))

                    coordinate_i = coordinate_j
                    ###################################################################################

                    if my_coordinates[0] == (my_vcf.chromosome,
                                             my_vcf.position):

                        alt_bases = my_vcf.altbase.split(',')
                        for alt_i in alt_bases:

                            vcf_i = copy(my_vcf)
                            vcf_i.altbase = alt_i
                            variants_at_my_coordinate.append(vcf_i)

            elif is_bed:
                bed_item = my_line.split('\t')
                my_coordinates = genomic_coordinates(bed_item[0],
                                                     int(bed_item[1]) + 1,
                                                     int(bed_item[2]))

            elif is_pos:
                pos_item = my_line.split('\t')
                my_coordinates = genomic_coordinates(pos_item[0],
                                                     int(pos_item[1]),
                                                     int(pos_item[1]))

            elif fai_file:
                fai_item = my_line.split('\t')
                my_coordinates = genomic_coordinates(fai_item[0], 1,
                                                     int(fai_item[1]))

            ##### ##### ##### ##### ##### #####
            for my_coordinate in my_coordinates:

                ######## If VCF, can get ref base, variant base, as well as other identifying information ########
                if is_vcf:

                    ref_bases = []
                    alt_bases = []
                    indel_lengths = []
                    all_my_identifiers = []

                    for variant_i in variants_at_my_coordinate:

                        ref_base = variant_i.refbase
                        first_alt = variant_i.altbase.split(',')[0]
                        indel_length = len(first_alt) - len(ref_base)

                        ref_bases.append(ref_base)
                        alt_bases.append(first_alt)
                        indel_lengths.append(indel_length)

                        # Extract these information if they exist in the VCF file, but they could be re-written if dbSNP/COSMIC are supplied.
                        if_dbsnp = 1 if re.search(r'rs[0-9]+',
                                                  variant_i.identifier) else 0
                        if_cosmic = 1 if re.search(r'COS[MN][0-9]+',
                                                   variant_i.identifier) else 0
                        if_common = 1 if variant_i.get_info_value(
                            'COMMON') == '1' else 0
                        num_cases = variant_i.get_info_value(
                            'CNT') if variant_i.get_info_value('CNT') else nan

                        if variant_i.identifier == '.':
                            my_identifier_i = set()
                        else:
                            my_identifier_i = variant_i.identifier.split(';')
                            my_identifier_i = set(my_identifier_i)

                        all_my_identifiers.append(my_identifier_i)

                ## If not, 1) get ref_base, first_alt from other VCF files.
                #          2) Create placeholders for dbSNP and COSMIC that can be overwritten with dbSNP/COSMIC VCF files (if provided)
                else:
                    variants_at_my_coordinate = [
                        None
                    ]  # Just to have something to iterate
                    ref_base = first_alt = indel_length = None

                    # Could be re-written if dbSNP/COSMIC are supplied. If not, they will remain NaN.
                    if_dbsnp = if_cosmic = if_common = num_cases = nan

                # Keep track of NumCallers:
                num_callers = 0

                #################################### Find the same coordinate in those VCF files ####################################
                if mutect:
                    got_mutect, mutect_variants, mutect_line = genome.find_vcf_at_coordinate(
                        my_coordinate, mutect_line, mutect, chrom_seq)
                if varscan:
                    got_varscan, varscan_variants, varscan_line = genome.find_vcf_at_coordinate(
                        my_coordinate, varscan_line, varscan, chrom_seq)
                if jsm:
                    got_jsm, jsm_variants, jsm_line = genome.find_vcf_at_coordinate(
                        my_coordinate, jsm_line, jsm, chrom_seq)
                if sniper:
                    got_sniper, sniper_variants, sniper_line = genome.find_vcf_at_coordinate(
                        my_coordinate, sniper_line, sniper, chrom_seq)
                if vardict:
                    got_vardict, vardict_variants, vardict_line = genome.find_vcf_at_coordinate(
                        my_coordinate, vardict_line, vardict, chrom_seq)
                if muse:
                    got_muse, muse_variants, muse_line = genome.find_vcf_at_coordinate(
                        my_coordinate, muse_line, muse, chrom_seq)
                if lofreq:
                    got_lofreq, lofreq_variants, lofreq_line = genome.find_vcf_at_coordinate(
                        my_coordinate, lofreq_line, lofreq, chrom_seq)
                if scalpel:
                    got_scalpel, scalpel_variants, scalpel_line = genome.find_vcf_at_coordinate(
                        my_coordinate, scalpel_line, scalpel, chrom_seq)
                if strelka:
                    got_strelka, strelka_variants, strelka_line = genome.find_vcf_at_coordinate(
                        my_coordinate, strelka_line, strelka, chrom_seq)
                if tnscope:
                    got_tnscope, tnscope_variants, tnscope_line = genome.find_vcf_at_coordinate(
                        my_coordinate, tnscope_line, tnscope, chrom_seq)
                if platypus:
                    got_platypus, platypus_variants, platypus_line = genome.find_vcf_at_coordinate(
                        my_coordinate, platypus_line, platypus, chrom_seq)
                if truth:
                    got_truth, truth_variants, truth_line = genome.find_vcf_at_coordinate(
                        my_coordinate, truth_line, truth, chrom_seq)
                if dbsnp:
                    got_dbsnp, dbsnp_variants, dbsnp_line = genome.find_vcf_at_coordinate(
                        my_coordinate, dbsnp_line, dbsnp, chrom_seq)
                if cosmic:
                    got_cosmic, cosmic_variants, cosmic_line = genome.find_vcf_at_coordinate(
                        my_coordinate, cosmic_line, cosmic, chrom_seq)

                # Now, use pysam to look into the BAM file(s), variant by variant from the input:
                for ith_call, my_call in enumerate(variants_at_my_coordinate):

                    if is_vcf:
                        # The particular line in the input VCF file:
                        variant_id = ((my_call.chromosome, my_call.position),
                                      my_call.refbase, my_call.altbase)

                        ref_base = ref_bases[ith_call]
                        first_alt = alt_bases[ith_call]
                        indel_length = indel_lengths[ith_call]
                        my_identifiers = all_my_identifiers[ith_call]

                    else:
                        variant_id = ((my_coordinate[0], my_coordinate[1]),
                                      ref_base, first_alt)

                    #################### Collect Caller Vcf ####################:
                    if mutect:
                        mutect_classification, nlod, tlod, tandem, ecnt = annotate_caller.MuTect(
                            variant_id, mutect_variants)
                        num_callers += mutect_classification
                    else:
                        mutect_classification = nlod = tlod = tandem = ecnt = nan

                    if varscan:
                        varscan_classification = annotate_caller.VarScan(
                            variant_id, varscan_variants)
                        num_callers += varscan_classification
                    else:
                        varscan_classification = nan

                    if jsm:
                        jointsnvmix2_classification, score_jointsnvmix2 = annotate_caller.JSM(
                            variant_id, jsm_variants)
                        num_callers += jointsnvmix2_classification
                    else:
                        jointsnvmix2_classification = score_jointsnvmix2 = nan

                    if sniper:
                        sniper_classification, score_somaticsniper = annotate_caller.SomaticSniper(
                            variant_id, sniper_variants)
                        num_callers += sniper_classification
                    else:
                        sniper_classification = score_somaticsniper = nan

                    if vardict:
                        vardict_classification, msi, msilen, shift3, score_vardict = annotate_caller.VarDict(
                            variant_id, vardict_variants)
                        num_callers += vardict_classification
                    else:
                        vardict_classification = msi = msilen = shift3 = score_vardict = nan

                    if muse:
                        muse_classification = annotate_caller.MuSE(
                            variant_id, muse_variants)
                        num_callers += muse_classification
                    else:
                        muse_classification = nan

                    if lofreq:
                        lofreq_classification = annotate_caller.LoFreq(
                            variant_id, lofreq_variants)
                        num_callers += lofreq_classification
                    else:
                        lofreq_classification = nan

                    if scalpel:
                        scalpel_classification = annotate_caller.Scalpel(
                            variant_id, scalpel_variants)
                        num_callers += scalpel_classification
                    else:
                        scalpel_classification = nan

                    if strelka:
                        strelka_classification, somatic_evs, qss, tqss = annotate_caller.Strelka(
                            variant_id, strelka_variants)
                        num_callers += strelka_classification
                    else:
                        strelka_classification = somatic_evs = qss = tqss = nan

                    if tnscope:
                        tnscope_classification = annotate_caller.TNscope(
                            variant_id, tnscope_variants)
                        num_callers += tnscope_classification
                    else:
                        tnscope_classification = nan

                    if platypus:
                        platypus_classification = annotate_caller.countPASS(
                            variant_id, platypus_variants)
                        num_callers += platypus_classification
                    else:
                        platypus_classification = nan

                    # Potentially write the output only if it meets this threshold:
                    if num_callers >= min_caller:

                        ########## Ground truth file ##########
                        if truth:
                            if variant_id in truth_variants:
                                judgement = 1
                                my_identifiers.add('TruePositive')
                            else:
                                judgement = 0
                                my_identifiers.add('FalsePositive')
                        else:
                            judgement = nan

                        ########## dbSNP ########## Will overwrite dbSNP info from input VCF file
                        if dbsnp:
                            if_dbsnp, if_common, rsID = annotate_caller.dbSNP(
                                variant_id, dbsnp_variants)
                            for ID_i in rsID:
                                my_identifiers.add(ID_i)

                        ########## COSMIC ########## Will overwrite COSMIC info from input VCF file
                        if cosmic:
                            if_cosmic, num_cases, cosmicID = annotate_caller.COSMIC(
                                variant_id, cosmic_variants)
                            for ID_i in cosmicID:
                                my_identifiers.add(ID_i)

                        ########## ######### ######### INFO EXTRACTION FROM BAM FILES ########## ######### #########
                        nBamFeatures = sequencing_features.from_bam(
                            nbam, my_coordinate, ref_base, first_alt, min_mq,
                            min_bq)
                        tBamFeatures = sequencing_features.from_bam(
                            tbam, my_coordinate, ref_base, first_alt, min_mq,
                            min_bq)

                        n_ref = nBamFeatures['ref_for'] + nBamFeatures[
                            'ref_rev']
                        n_alt = nBamFeatures['alt_for'] + nBamFeatures[
                            'alt_rev']
                        t_ref = tBamFeatures['ref_for'] + tBamFeatures[
                            'ref_rev']
                        t_alt = tBamFeatures['alt_for'] + tBamFeatures[
                            'alt_rev']
                        sor = sequencing_features.somaticOddRatio(
                            n_ref, n_alt, t_ref, t_alt)

                        # Calculate VarScan'2 SCC directly without using VarScan2 output:
                        try:
                            score_varscan2 = genome.p2phred(
                                stats.fisher_exact(
                                    ((t_alt, n_alt), (t_ref, n_ref)),
                                    alternative='greater')[1])
                        except ValueError:
                            score_varscan2 = nan

                        # Homopolymer eval:
                        homopolymer_length, site_homopolymer_length = sequencing_features.from_genome_reference(
                            ref_fa, my_coordinate, ref_base, first_alt)

                        # Fill the ID field of the TSV/VCF
                        my_identifiers = ';'.join(
                            my_identifiers) if my_identifiers else '.'

                        ###
                        out_line = out_header.format( \
                        CHROM                   = my_coordinate[0],                                                    \
                        POS                     = my_coordinate[1],                                                    \
                        ID                      = my_identifiers,                                                      \
                        REF                     = ref_base,                                                            \
                        ALT                     = first_alt,                                                           \
                        if_MuTect               = mutect_classification,                                               \
                        if_VarScan2             = varscan_classification,                                              \
                        if_JointSNVMix2         = jointsnvmix2_classification,                                         \
                        if_SomaticSniper        = sniper_classification,                                               \
                        if_VarDict              = vardict_classification,                                              \
                        MuSE_Tier               = muse_classification,                                                 \
                        if_LoFreq               = lofreq_classification,                                               \
                        if_Scalpel              = scalpel_classification,                                              \
                        if_Strelka              = strelka_classification,                                              \
                        if_TNscope              = tnscope_classification,                                              \
                        if_Platypus             = platypus_classification,                                             \
                        Strelka_Score           = somatic_evs,                                                         \
                        Strelka_QSS             = qss,                                                                 \
                        Strelka_TQSS            = tqss,                                                                \
                        VarScan2_Score          = rescale(score_varscan2,      'phred', p_scale, 1001),                \
                        SNVMix2_Score           = rescale(score_jointsnvmix2,  'phred', p_scale, 1001),                \
                        Sniper_Score            = rescale(score_somaticsniper, 'phred', p_scale, 1001),                \
                        VarDict_Score           = rescale(score_vardict,       'phred', p_scale, 1001),                \
                        if_dbsnp                = if_dbsnp,                                                            \
                        COMMON                  = if_common,                                                           \
                        if_COSMIC               = if_cosmic,                                                           \
                        COSMIC_CNT              = num_cases,                                                           \
                        Consistent_Mates        = tBamFeatures['consistent_mates'],                                    \
                        Inconsistent_Mates      = tBamFeatures['inconsistent_mates'],                                  \
                        N_DP                    = nBamFeatures['dp'],                                                  \
                        nBAM_REF_MQ             = '%g' % nBamFeatures['ref_mq'],                                       \
                        nBAM_ALT_MQ             = '%g' % nBamFeatures['alt_mq'],                                       \
                        nBAM_Z_Ranksums_MQ      = '%g' % nBamFeatures['z_ranksums_mq'],                                \
                        nBAM_REF_BQ             = '%g' % nBamFeatures['ref_bq'],                                       \
                        nBAM_ALT_BQ             = '%g' % nBamFeatures['alt_bq'],                                       \
                        nBAM_Z_Ranksums_BQ      = '%g' % nBamFeatures['z_ranksums_bq'],                                \
                        nBAM_REF_NM             = '%g' % nBamFeatures['ref_NM'],                                       \
                        nBAM_ALT_NM             = '%g' % nBamFeatures['alt_NM'],                                       \
                        nBAM_NM_Diff            = '%g' % nBamFeatures['NM_Diff'],                                      \
                        nBAM_REF_Concordant     = nBamFeatures['ref_concordant_reads'],                                \
                        nBAM_REF_Discordant     = nBamFeatures['ref_discordant_reads'],                                \
                        nBAM_ALT_Concordant     = nBamFeatures['alt_concordant_reads'],                                \
                        nBAM_ALT_Discordant     = nBamFeatures['alt_discordant_reads'],                                \
                        nBAM_Concordance_FET    = rescale(nBamFeatures['concordance_fet'], 'fraction', p_scale, 1001), \
                        N_REF_FOR               = nBamFeatures['ref_for'],                                             \
                        N_REF_REV               = nBamFeatures['ref_rev'],                                             \
                        N_ALT_FOR               = nBamFeatures['alt_for'],                                             \
                        N_ALT_REV               = nBamFeatures['alt_rev'],                                             \
                        nBAM_StrandBias_FET     = rescale(nBamFeatures['strandbias_fet'], 'fraction', p_scale, 1001),  \
                        nBAM_Z_Ranksums_EndPos  = '%g' % nBamFeatures['z_ranksums_endpos'],                            \
                        nBAM_REF_Clipped_Reads  = nBamFeatures['ref_SC_reads'],                                        \
                        nBAM_ALT_Clipped_Reads  = nBamFeatures['alt_SC_reads'],                                        \
                        nBAM_Clipping_FET       = rescale(nBamFeatures['clipping_fet'], 'fraction', p_scale, 1001),    \
                        nBAM_MQ0                = nBamFeatures['MQ0'],                                                 \
                        nBAM_Other_Reads        = nBamFeatures['noise_read_count'],                                    \
                        nBAM_Poor_Reads         = nBamFeatures['poor_read_count'],                                     \
                        nBAM_REF_InDel_3bp      = nBamFeatures['ref_indel_3bp'],                                       \
                        nBAM_REF_InDel_2bp      = nBamFeatures['ref_indel_2bp'],                                       \
                        nBAM_REF_InDel_1bp      = nBamFeatures['ref_indel_1bp'],                                       \
                        nBAM_ALT_InDel_3bp      = nBamFeatures['alt_indel_3bp'],                                       \
                        nBAM_ALT_InDel_2bp      = nBamFeatures['alt_indel_2bp'],                                       \
                        nBAM_ALT_InDel_1bp      = nBamFeatures['alt_indel_1bp'],                                       \
                        M2_NLOD                 = nlod,                                                                \
                        M2_TLOD                 = tlod,                                                                \
                        M2_STR                  = tandem,                                                              \
                        M2_ECNT                 = ecnt,                                                                \
                        SOR                     = sor,                                                                 \
                        MSI                     = msi,                                                                 \
                        MSILEN                  = msilen,                                                              \
                        SHIFT3                  = shift3,                                                              \
                        MaxHomopolymer_Length   = homopolymer_length,                                                  \
                        SiteHomopolymer_Length  = site_homopolymer_length,                                             \
                        T_DP                    = tBamFeatures['dp'],                                                  \
                        tBAM_REF_MQ             = '%g' % tBamFeatures['ref_mq'],                                       \
                        tBAM_ALT_MQ             = '%g' % tBamFeatures['alt_mq'],                                       \
                        tBAM_Z_Ranksums_MQ      = '%g' % tBamFeatures['z_ranksums_mq'],                                \
                        tBAM_REF_BQ             = '%g' % tBamFeatures['ref_bq'],                                       \
                        tBAM_ALT_BQ             = '%g' % tBamFeatures['alt_bq'],                                       \
                        tBAM_Z_Ranksums_BQ      = '%g' % tBamFeatures['z_ranksums_bq'],                                \
                        tBAM_REF_NM             = '%g' % tBamFeatures['ref_NM'],                                       \
                        tBAM_ALT_NM             = '%g' % tBamFeatures['alt_NM'],                                       \
                        tBAM_NM_Diff            = '%g' % tBamFeatures['NM_Diff'],                                      \
                        tBAM_REF_Concordant     = tBamFeatures['ref_concordant_reads'],                                \
                        tBAM_REF_Discordant     = tBamFeatures['ref_discordant_reads'],                                \
                        tBAM_ALT_Concordant     = tBamFeatures['alt_concordant_reads'],                                \
                        tBAM_ALT_Discordant     = tBamFeatures['alt_discordant_reads'],                                \
                        tBAM_Concordance_FET    = rescale(tBamFeatures['concordance_fet'], 'fraction', p_scale, 1001), \
                        T_REF_FOR               = tBamFeatures['ref_for'],                                             \
                        T_REF_REV               = tBamFeatures['ref_rev'],                                             \
                        T_ALT_FOR               = tBamFeatures['alt_for'],                                             \
                        T_ALT_REV               = tBamFeatures['alt_rev'],                                             \
                        tBAM_StrandBias_FET     = rescale(tBamFeatures['strandbias_fet'], 'fraction', p_scale, 1001),  \
                        tBAM_Z_Ranksums_EndPos  = '%g' % tBamFeatures['z_ranksums_endpos'],                            \
                        tBAM_REF_Clipped_Reads  = tBamFeatures['ref_SC_reads'],                                        \
                        tBAM_ALT_Clipped_Reads  = tBamFeatures['alt_SC_reads'],                                        \
                        tBAM_Clipping_FET       = rescale(tBamFeatures['clipping_fet'], 'fraction', p_scale, 1001),    \
                        tBAM_MQ0                = tBamFeatures['MQ0'],                                                 \
                        tBAM_Other_Reads        = tBamFeatures['noise_read_count'],                                    \
                        tBAM_Poor_Reads         = tBamFeatures['poor_read_count'],                                     \
                        tBAM_REF_InDel_3bp      = tBamFeatures['ref_indel_3bp'],                                       \
                        tBAM_REF_InDel_2bp      = tBamFeatures['ref_indel_2bp'],                                       \
                        tBAM_REF_InDel_1bp      = tBamFeatures['ref_indel_1bp'],                                       \
                        tBAM_ALT_InDel_3bp      = tBamFeatures['alt_indel_3bp'],                                       \
                        tBAM_ALT_InDel_2bp      = tBamFeatures['alt_indel_2bp'],                                       \
                        tBAM_ALT_InDel_1bp      = tBamFeatures['alt_indel_1bp'],                                       \
                        InDel_Length            = indel_length,                                                        \
                        TrueVariant_or_False    = judgement )

                        # Print it out to stdout:
                        outhandle.write(out_line + '\n')

            # Read into the next line:
            if not is_vcf:
                my_line = my_sites.readline().rstrip()

        ##########  Close all open files if they were opened  ##########
        opened_files = (ref_fa, nbam, tbam, truth, cosmic, dbsnp, mutect,
                        varscan, jsm, sniper, vardict, muse, lofreq, scalpel,
                        strelka, tnscope, platypus)
        [opened_file.close() for opened_file in opened_files if opened_file]
コード例 #36
0
ファイル: modify_VarDict.py プロジェクト: zprh/somaticseq
def convert(infile, snv_out, indel_out):
    """Split a VarDict VCF into one SNV VCF and one indel VCF.

    The ``##`` meta lines are copied to both outputs (with a few INFO/FORMAT
    declarations rewritten), extra declarations are appended, and the #CHROM
    line is copied unchanged.  Each record whose REF differs from ALT is then
    cleaned up and routed by the ``TYPE=`` tag found in its INFO column:

    * ``TYPE=SNV`` goes to ``snv_out``.
    * ``TYPE=Deletion`` / ``TYPE=Insertion`` go to ``indel_out``.
    * ``TYPE=Complex`` with REF and ALT of equal length is decomposed into
      one line per mismatching base, written to ``snv_out``.
    * Everything else (including records where REF equals ALT) is dropped.

    Reading stops at the first line that is empty after stripping.

    Args:
        infile: Path of the input VCF, opened with ``genome.open_textfile``.
        snv_out: Path of the SNV output file (overwritten).
        indel_out: Path of the indel output file (overwritten).

    Raises:
        ValueError: If a variant line is reached but the #CHROM line had
            neither 10 (single-sample) nor 11 (paired) columns, or if the
            FORMAT column lacks ``RD`` or lacks both ``ALD`` and ``AD``.
    """

    # Declarations whose Type is rewritten from Float to String.
    float_to_string_prefixes = (
        '##FORMAT=<ID=PSTD,',
        '##FORMAT=<ID=QSTD,',
        '##INFO=<ID=SOR,',
    )

    # Extra declarations written after the input's own ## lines.
    addition_header = (
        '##INFO=<ID=Germline,Number=0,Type=Flag,Description="VarDict Germline">',
        '##INFO=<ID=StrongSomatic,Number=0,Type=Flag,Description="VarDict Strong Somatic">',
        '##INFO=<ID=LikelySomatic,Number=0,Type=Flag,Description="VarDict Likely Somatic">',
        '##INFO=<ID=LikelyLOH,Number=0,Type=Flag,Description="VarDict Likely LOH">',
        '##INFO=<ID=StrongLOH,Number=0,Type=Flag,Description="VarDict Strong LOH">',
        '##INFO=<ID=AFDiff,Number=0,Type=Flag,Description="VarDict AF Diff">',
        '##INFO=<ID=Deletion,Number=0,Type=Flag,Description="VarDict Deletion">',
        '##INFO=<ID=SampleSpecific,Number=0,Type=Flag,Description="VarDict SampleSpecific">',
        '##FORMAT=<ID=DP4,Number=4,Type=Integer,Description="# high-quality ref-forward bases, ref-reverse, alt-forward and alt-reverse bases">',
    )

    # Compiled once; both are applied to every retained record.
    non_gcta = re.compile(r'[^GCTA]', flags=re.I)
    end_tag = re.compile(r'END=[0-9]+;')

    with genome.open_textfile(infile) as vcf, \
            open(snv_out, 'w') as snpout, \
            open(indel_out, 'w') as indelout:

        line_i = vcf.readline().rstrip()

        while line_i.startswith('##'):

            if re.match(r'^##INFO=<ID=(LSEQ|RSEQ),', line_i):
                line_i = line_i.replace('Number=G', 'Number=1')

            elif line_i.startswith('##FORMAT=<ID=BIAS,'):
                line_i = line_i.replace('Number=1', 'Number=.')

            elif line_i.startswith(float_to_string_prefixes):
                line_i = line_i.replace('Type=Float', 'Type=String')

            snpout.write(line_i + '\n')
            indelout.write(line_i + '\n')
            line_i = vcf.readline().rstrip()

        for item_i in addition_header:
            snpout.write(item_i + '\n')
            indelout.write(item_i + '\n')

        # This is the #CHROM line.  9 fixed columns + 1 sample means a
        # single-sample file; + 2 samples means a paired file.
        num_header = len(line_i.split('\t'))

        if num_header == 10:
            paired = False
        elif num_header == 11:
            paired = True
        else:
            # Unknown layout: only an error if variant lines actually follow,
            # so header-only or empty inputs still pass through.
            paired = None

        snpout.write(line_i + '\n')
        indelout.write(line_i + '\n')

        line_i = vcf.readline().rstrip()
        while line_i:

            if paired is None:
                raise ValueError(
                    'Expected 10 (single-sample) or 11 (paired) columns in '
                    'the #CHROM line, but found {}'.format(num_header))

            vcfcall = genome.Vcf_line(line_i)

            # Fix the occasional error where ALT and REF are the same:
            # such records are simply not written.
            if vcfcall.refbase != vcfcall.altbase:

                # In the REF/ALT field, non-GCTA characters should be changed
                # to N to fit the VCF standard:
                vcfcall.refbase = non_gcta.sub('N', vcfcall.refbase)
                vcfcall.altbase = non_gcta.sub('N', vcfcall.altbase)

                ## To be consistent with other tools, combine RD with ALD
                ## (or AD) into DP4.
                # VarDict puts Tumor first and Normal next.
                # The old version (somatic.pl) has no ALD; the new version
                # (paired.pl) has ALD.
                format_field = vcfcall.field.split(':')
                idx_rd = format_field.index('RD')

                tumor_sample = vcfcall.samples[0].split(':')
                tumor_dp4 = tumor_sample.pop(idx_rd)

                if paired:
                    normal_sample = vcfcall.samples[1].split(':')
                    normal_dp4 = normal_sample.pop(idx_rd)

                format_field.pop(idx_rd)

                # If the VCF has no ALD, then AD means what ALD is supposed
                # to mean.  The index is looked up AFTER RD was removed so it
                # matches the already-shortened sample lists.
                try:
                    idx_ad = format_field.index('ALD')
                except ValueError:
                    idx_ad = format_field.index('AD')

                # DP4 = "<RD value>,<ALD-or-AD value>", appended as the last
                # sample field to line up with the appended FORMAT key.
                tumor_sample.append(tumor_dp4 + ',' + tumor_sample.pop(idx_ad))
                sample_columns = [':'.join(tumor_sample)]

                if paired:
                    normal_sample.append(
                        normal_dp4 + ',' + normal_sample.pop(idx_ad))
                    # Normal is emitted before tumor.
                    # NOTE(review): the #CHROM line is copied unchanged, so
                    # confirm its sample order matches this column order.
                    sample_columns.insert(0, ':'.join(normal_sample))

                format_field.pop(idx_ad)
                format_field.append('DP4')
                new_format_string = ':'.join(format_field)

                # VarDict's END tag has caused problem with GATK
                # CombineVariants. Simply get rid of it.
                vcfcall.info = end_tag.sub('', vcfcall.info)

                # Columns shared by every line produced from this record.
                trailing_columns = [vcfcall.qual, vcfcall.filters,
                                    vcfcall.info, new_format_string]
                trailing_columns.extend(sample_columns)

                line_i = '\t'.join(
                    [vcfcall.chromosome, str(vcfcall.position),
                     vcfcall.identifier, vcfcall.refbase, vcfcall.altbase]
                    + trailing_columns)

                # Write snp and indel into different files:
                if 'TYPE=SNV' in vcfcall.info:
                    snpout.write(line_i + '\n')

                elif 'TYPE=Deletion' in vcfcall.info or 'TYPE=Insertion' in vcfcall.info:
                    indelout.write(line_i + '\n')

                elif 'TYPE=Complex' in vcfcall.info and \
                        len(vcfcall.refbase) == len(vcfcall.altbase):

                    # Same-length complex variant: emit one single-base line
                    # per position where REF and ALT disagree.
                    for i, (ref_i, alt_i) in enumerate(
                            zip(vcfcall.refbase, vcfcall.altbase)):

                        if ref_i != alt_i:
                            line_i = '\t'.join(
                                [vcfcall.chromosome,
                                 str(vcfcall.position + i),
                                 vcfcall.identifier, ref_i, alt_i]
                                + trailing_columns)
                            snpout.write(line_i + '\n')

            # Continue:
            line_i = vcf.readline().rstrip()
コード例 #37
0
ファイル: reformat_VCF2SEQC2.py プロジェクト: zprh/somaticseq
# Command-line options: input and output VCF paths, the callers
# classification string, the tumor sample name (default 'TUMOR'), and a flag
# saying whether QUAL should be used as the SomaticSeq score.
parser.add_argument('-infile',  '--vcf-in',   type=str, help='VCF in', required=True)
parser.add_argument('-outfile', '--vcf-out',  type=str, help='VCF out', required=True)
parser.add_argument('-callers', '--callers-classification-string', type=str, help='MVJSD or whatever',  required=True)
parser.add_argument('-tumor',   '--tumor-sample-name', type=str, help='tumor sample name',  required=False, default='TUMOR')
parser.add_argument('-trained', '--somaticseq-trained',    action='store_true', help='If true, will use the QUAL as SomaticSeq score. Otherwise, SCORE will be .', required=False, default=False)


args = parser.parse_args()

# Short local names for the parsed options.
vcf_in_fn  = args.vcf_in
vcf_out_fn = args.vcf_out
caller_string = args.callers_classification_string
tumor = args.tumor_sample_name
somaticseq_trained = args.somaticseq_trained

with genome.open_textfile(vcf_in_fn) as vcfin, open(vcf_out_fn, 'w') as vcfout:
    
    line_in = vcfin.readline().rstrip('\n')
    
    # Walk the '##' meta lines, rewriting selected ones before copying them out.
    while line_in.startswith('##'):
        
        # The '##SomaticSeq=' line gets a '-SEQC2' suffix appended.
        if line_in.startswith('##SomaticSeq='):
            line_out = line_in + '-SEQC2'
            
        # The NUM_TOOLS declaration and the declaration named after the
        # callers classification string are re-declared as FORMAT instead of INFO.
        elif line_in.startswith('##INFO=<ID=NUM_TOOLS') or line_in.startswith('##INFO=<ID={COMBO}'.format(COMBO=caller_string)):
            line_out = re.sub('##INFO=', '##FORMAT=', line_in)
            
        # Every other meta line is copied unchanged.
        else:
            line_out = line_in
        
        vcfout.write( line_out + '\n' )
        # NOTE(review): nothing shown here advances line_in, so the loop as
        # excerpted would not terminate; presumably the next readline follows
        # in the full script — confirm against the original file.
コード例 #38
0
# Each requested pileup-derived FORMAT field contributes one header
# declaration and one FORMAT key.
if args.pileup_DP4:
    header_append.append(
        '##FORMAT=<ID=plDP4,Number=4,Type=Integer,Description="DP4 from pileup: ref forward, ref reverse, alt forward, alt reverse">'
    )
    format_append.append('plDP4')

if args.pileup_variant_allele_frequency:
    header_append.append(
        '##FORMAT=<ID=plVAF,Number=1,Type=Float,Description="Variant allele frequency calculated from pileup">'
    )
    format_append.append('plVAF')

# Start working by opening files.  The input VCF and the output are always
# opened, so a bad path fails here rather than at first use.
my_vcf = genome.open_textfile(my_vcf)
outhandle = open(outfile, 'w')

# Each pileup is opened independently and only when a path was supplied, so
# an absent pileup cannot prevent any other file from being opened.
if Tpileup is not None:
    Tpileup = genome.open_textfile(Tpileup)

if Npileup is not None:
    Npileup = genome.open_textfile(Npileup)

# Prime the first line of whichever pileups are present.
if Npileup:
    npileup_line = Npileup.readline().rstrip('\n')

if Tpileup:
    tpileup_line = Tpileup.readline().rstrip('\n')

# Add the extra headers:
out_vcf_headers = genome.vcf_header_modifier(my_vcf, addons=header_append)
コード例 #39
0
# Header declarations and FORMAT keys to add, filled in per requested option.
header_append = []
format_append = []

if args.pileup_DP4:
    header_append.append('##FORMAT=<ID=plDP4,Number=4,Type=Integer,Description="DP4 from pileup: ref forward, ref reverse, alt forward, alt reverse">')
    format_append.append('plDP4')

if args.pileup_variant_allele_frequency:
    header_append.append('##FORMAT=<ID=plVAF,Number=1,Type=Float,Description="Variant allele frequency calculated from pileup">')
    format_append.append('plVAF')



# Start working by opening files.  The input VCF and the output are always
# opened, so a bad path fails here rather than at first use.
my_vcf    = genome.open_textfile(my_vcf)
outhandle = open(outfile, 'w')

# Each pileup is opened independently and only when a path was supplied, so
# an absent pileup cannot prevent any other file from being opened.
if Tpileup is not None:
    Tpileup = genome.open_textfile(Tpileup)

if Npileup is not None:
    Npileup = genome.open_textfile(Npileup)

# Prime the first line of whichever pileups are present.
if Npileup:
    npileup_line = Npileup.readline().rstrip('\n')

if Tpileup:
    tpileup_line = Tpileup.readline().rstrip('\n')

# Add the extra headers:
out_vcf_headers = genome.vcf_header_modifier( my_vcf, addons=header_append )
コード例 #40
0
# Copy the parsed command-line values into module-level names.
# NOTE(review): from the min_/max_ prefixes these are presumably filtering
# thresholds (mapping quality, base quality, mismatches, FET scores,
# z-scores, MQ0, VAF, depth) — confirm against the argparse definitions.
min_altMQ = args.min_altMQ
min_refBQ = args.min_refBQ
min_altBQ = args.min_altBQ
max_refNM = args.max_refNM
max_altNM = args.max_altNM
max_fetSB = args.max_fetSB
max_fetCD = args.max_fetCD
max_zMQ   = args.max_zMQ
max_zBQ   = args.max_zBQ
max_MQ0   = args.max_MQ0
min_VAF   = args.min_VAF
min_DP    = args.min_DP
min_varDP = args.min_varDP


with genome.open_textfile(infile) as vcf_in, open(outfile, 'w') as vcf_out:
    
    line_i = vcf_in.readline().rstrip()
    
    while line_i.startswith('##'):
        
        vcf_out.write( line_i + '\n' )
        line_i = vcf_in.readline().rstrip()
    
    vcf_out.write( line_i + '\n' )

    # This line will be #CHROM:
    header = line_i.split('\t')
    sample_index = header.index(sample) - 9
    
    # This will be the first variant line: