Ejemplo n.º 1
0
def vcf(infileList, outfile):
    """Concatenate several VCF files into one file with a single header.

    The leading block of '#'-prefixed lines is copied only from the first
    input that actually contains text. All remaining lines of every input
    are copied in the order the inputs are listed.

    Args:
        infileList: iterable of input paths; each one is opened with
            genome.open_textfile().
        outfile: path of the merged file. It is opened in 'w' mode, so any
            existing content is replaced.
    """

    with open(outfile, 'w') as vcfout:

        headerWritten = False

        for file_i in infileList:

            with genome.open_textfile(file_i) as vcfin:

                line_i = vcfin.readline()

                # readline() returns '' only at EOF, so a falsy first line
                # means the file is empty. An empty file must not switch off
                # header writing, otherwise the header would be lost when the
                # first input happens to be empty.
                file_has_content = bool(line_i)

                while line_i.startswith('#'):
                    if not headerWritten:
                        vcfout.write(line_i)
                        # The last line of a file may lack its newline; add
                        # it so the next line written starts on its own row.
                        if not line_i.endswith('\n'):
                            vcfout.write('\n')

                    line_i = vcfin.readline()

                # Turn off header writing once something has been emitted,
                # so a header can only ever appear at the top of the output.
                if file_has_content:
                    headerWritten = True

                while line_i:
                    vcfout.write(line_i)
                    # Same guard as above: never let the final record of one
                    # input fuse with the first record of the next input.
                    if not line_i.endswith('\n'):
                        vcfout.write('\n')
                    line_i = vcfin.readline()
Ejemplo n.º 2
0
def tsv(infileList, outfile):
    """Concatenate several TSV files into one file with a single header row.

    The first line of each input is treated as its header. Only the header
    of the first non-empty input is written; the remaining lines of every
    input are copied in the order the inputs are listed.

    Args:
        infileList: iterable of input paths; each one is opened with
            genome.open_textfile().
        outfile: path of the merged file. It is opened in 'w' mode, so any
            existing content is replaced.
    """

    with open(outfile, 'w') as tsvout:

        headerWritten = False

        for file_i in infileList:

            with genome.open_textfile(file_i) as tsvin:

                # First line is a header
                line_i = tsvin.readline()

                # readline() returns '' only at EOF: the file is empty, so it
                # has no header to offer and must not suppress later headers.
                if not line_i:
                    continue

                if not headerWritten:
                    tsvout.write(line_i)
                    # A header-only file may lack the final newline; add it
                    # so the first data row does not join the header row.
                    if not line_i.endswith('\n'):
                        tsvout.write('\n')

                    # Turn off header writing from now on:
                    headerWritten = True

                line_i = tsvin.readline()

                while line_i:
                    tsvout.write(line_i)
                    # Keep the last row of this file from fusing with the
                    # first row of the next one when its newline is missing.
                    if not line_i.endswith('\n'):
                        tsvout.write('\n')
                    line_i = tsvin.readline()
Ejemplo n.º 3
0
    right_files = [i + '.vcf' + gz for i in chosen_chrom_sequence]

else:
    right_files = (args.input_vcf)

# Open files:
if args.call_method == 'VarDict':
    snpout = open(out_snp, 'w')
    indelout = open(out_indel, 'w')
else:
    vcfout = open(out_vcf, 'w')

### First, get the header. IF there are multiple VCF files (e.g., chromosome by chromosome), use the first file for this purpose:
vcf_header = []
with genome.open_textfile(right_files[0]) as vcf:

    line_i = vcf.readline().rstrip()

    # Save the headers and then sort them:
    vcfheader_filter_info_filter = []
    vcfheader_filter_info_filter.append(
        '##INFO=<ID={0},Number=0,Type=Flag,Description="Indicates if record is a {0} called somatic mutation">'
        .format(args.call_method))

    vcfheader_misc = []

    while line_i.startswith('#'):

        if re.match(r'##fileformat=', line_i):
            vcffileformat = line_i
Ejemplo n.º 4
0
# Collect the add-ons requested on the command line:
#   header_append -- extra '##FORMAT=' meta-information lines for the header
#   format_append -- the matching FORMAT IDs, kept in the same order
header_append = []
format_append = []

if args.pileup_DP4:
    header_append.append('##FORMAT=<ID=plDP4,Number=4,Type=Integer,Description="DP4 from pileup: ref forward, ref reverse, alt forward, alt reverse">')
    format_append.append('plDP4')

if args.pileup_variant_allele_frequency:
    header_append.append('##FORMAT=<ID=plVAF,Number=1,Type=Float,Description="Variant allele frequency calculated from pileup">')
    format_append.append('plVAF')



# Start working by opening files. Each name is rebound from whatever it held
# before (presumably a path -- TODO confirm) to the object returned by the
# opener.
# The first AttributeError aborts the rest of the try body, and every name
# not yet reassigned keeps its previous value.
# NOTE(review): Npileup is opened last, presumably so that a missing (None)
# normal pileup makes open_textfile raise AttributeError without skipping the
# other three -- confirm against genome.open_textfile.
# NOTE(review): if the AttributeError comes from my_vcf or Tpileup instead,
# outhandle is never created and the error is silently ignored here.
try:
    my_vcf    = genome.open_textfile(my_vcf)
    Tpileup   = genome.open_textfile(Tpileup)
    outhandle = open(outfile, 'w')
    Npileup   = genome.open_textfile(Npileup)
except AttributeError:
    pass

# Read the first line of each pileup that is truthy, dropping the trailing
# newline. npileup_line / tpileup_line are not assigned here when the
# corresponding pileup is falsy.
if Npileup:
    npileup_line = Npileup.readline().rstrip('\n')

if Tpileup:
    tpileup_line = Tpileup.readline().rstrip('\n')

# Add the extra headers: my_vcf and the collected '##FORMAT=' lines are passed
# to genome.vcf_header_modifier and its result is kept in out_vcf_headers.
# NOTE(review): the shape of the returned value is not visible here -- confirm.
out_vcf_headers = genome.vcf_header_modifier( my_vcf, addons=header_append )
Ejemplo n.º 5
0
                    '--snv-out',
                    type=str,
                    help='Output VCF file',
                    required=True)

# Parse the arguments:
args = parser.parse_args()

infile = args.input_vcf
indel_out = args.indel_out
snv_out = args.snv_out

info_to_split = 'NLOD', 'TLOD'
info_to_keep = 'STR', 'ECNT'

with genome.open_textfile(infile) as vcf_in, open(
        snv_out, 'w') as snv_out, open(indel_out, 'w') as indel_out:

    line_i = vcf_in.readline().rstrip()

    while line_i.startswith('##'):

        snv_out.write(line_i + '\n')
        indel_out.write(line_i + '\n')

        if line_i.startswith('##normal_sample='):
            normal_name = line_i.split('=')[1]

        if line_i.startswith('##tumor_sample='):
            tumor_name = line_i.split('=')[1]
Ejemplo n.º 6
0
for combo_i in itertools.product( (True, False), repeat = len(tools) ):
    
    # The four zeros represent [Total, dbsnp, COMMON, COSMIC]
    MVJS_combinations[combo_i] = [0, 0, 0, 0]


# Keeping a tab on all those scores
bina_score_tally = {}
subscore_evidence_tally = {}
bonus_knowledge_tally = {}
penalty_knowledge_tally = {}
num_methods_tally = {}


with genome.open_textfile(args.input_vcf) as vcf, open(args.output_vcf, 'w') as vcf_out:
    
    line_i = vcf.readline().rstrip()
    
    while line_i.startswith('#'):
        # Read thru the headers and metadata:
        
        if line_i.startswith('#CHROM'):
            header_item = line_i.split('\t')
            
            if len(header_item) == 11:
                paired_mode = True
                idxN, idxT = 0,1
                
            elif len(header_item) == 10:
                paired_mode = False
Ejemplo n.º 7
0


# Open files:
if args.call_method == 'VarDict':
    snpout   = open(out_snp,   'w')
    indelout = open(out_indel, 'w')
else:
    vcfout   = open(out_vcf,   'w')



# First, get the header. IF there are multiple VCF files (e.g., chromosome by
# chromosome), use the first file for this purpose:
vcf_header = []
with genome.open_textfile(right_files[0]) as vcf:

    line_i = vcf.readline().rstrip()

    # Save the headers and then sort them:
    vcfheader_filter_info_filter = []
    vcfheader_filter_info_filter.append('##INFO=<ID={0},Number=0,Type=Flag,Description="Indicates if record is a {0} called somatic mutation">'.format(args.call_method))

    vcfheader_misc = []

    while line_i.startswith('#'):

        if re.match(r'##fileformat=', line_i):
            vcffileformat = line_i

        elif re.match(r'^##FORMAT=<ID=DP4,', line_i):
Ejemplo n.º 8
0
                    '--output-vcf',
                    type=str,
                    help='Output VCF file',
                    required=True,
                    default=None)
parser.add_argument('-tools',
                    '--individual-mutation-tools',
                    type=str,
                    help='A list tools to sub-sample',
                    nargs='*',
                    required=True)
args = parser.parse_args()

subtools = set(args.individual_mutation_tools)

with genome.open_textfile(args.input_vcf) as vcfin, open(args.output_vcf,
                                                         'w') as vcfout:

    line_i = vcfin.readline().rstrip('\n')
    while line_i.startswith('#'):
        vcfout.write(line_i + '\n')
        line_i = vcfin.readline().rstrip('\n')

    while line_i:

        vcf_i = genome.Vcf_line(line_i)

        if 'FalseNegative' in vcf_i.identifier:
            vcfout.write(line_i + '\n')
        else:
            tools = vcf_i.get_info_value('SOURCES')
parser.add_argument('-threshold',
                    '--phasing-threshold',
                    type=int,
                    help='How far apart do we try to phase',
                    required=False,
                    default=1)

args = parser.parse_args()

infile = args.input_vcf_file
bam = args.bam_file
ref_fa = args.genome_reference
outfile = args.output_vcf_file
threshold = args.phasing_threshold

with genome.open_textfile(infile) as infile, \
pysam.AlignmentFile(bam) as bam, \
open(outfile, 'w') as outfile, \
pysam.FastaFile(ref_fa) as ref_fa:

    my_line = infile.readline().rstrip()

    while my_line.startswith('##'):
        outfile.write(my_line + '\n')
        my_line = infile.readline().rstrip()

    # This is to read through and copy the #CHROM line
    assert my_line.startswith('#CHROM')
    outfile.write(
        '##INFO=<ID=COORDINATES,Number=.,Type=Integer,Description="Coordinates of the bases">\n'
    )
Ejemplo n.º 10
0
# Variant Call Type, i.e., snp or indel
parser.add_argument('-infile', '--input-vcf',  type=str, help='Input VCF file', required=True)
parser.add_argument('-indel',  '--indel-out', type=str, help='Output VCF file', required=True)
parser.add_argument('-snv',    '--snv-out', type=str, help='Output VCF file', required=True)

# Parse the arguments:
args = parser.parse_args()

infile = args.input_vcf
indel_out = args.indel_out
snv_out = args.snv_out

info_to_split = 'NLOD', 'TLOD'
info_to_keep = 'STR', 'ECNT'

with genome.open_textfile(infile) as vcf_in, open(snv_out, 'w') as snv_out, open(indel_out, 'w') as indel_out:
    
    line_i = vcf_in.readline().rstrip()
    
    while line_i.startswith('##'):
        
        snv_out.write( line_i + '\n' )
        indel_out.write( line_i + '\n' )
        
        if line_i.startswith('##normal_sample='):
            normal_name = line_i.split('=')[1]
            
        if line_i.startswith('##tumor_sample='):
            tumor_name = line_i.split('=')[1]
            
        line_i = vcf_in.readline().rstrip()
Ejemplo n.º 11
0
args = parser.parse_args()
infile = args.input_vcf
outfile = args.output_vcf

# Seperate output into snv/snp and indel's:
out_snp_file       = outfile.split(os.sep)
out_snp_file[-1]   = 'snp.' + out_snp_file[-1]

out_indel_file     = outfile.split(os.sep)
out_indel_file[-1] = 'indel.' + out_indel_file[-1]

out_snp   = os.sep.join(out_snp_file)
out_indel = os.sep.join(out_indel_file)


with genome.open_textfile(infile) as vcf, \
open(out_snp, 'w') as snpout, \
open(out_indel, 'w') as indelout:
    
    line_i = vcf.readline().rstrip()
    
    while line_i.startswith('##'):

        if re.match(r'^##INFO=<ID=(LSEQ|RSEQ),', line_i):
            line_i = line_i.replace('Number=G', 'Number=1')
        
        elif line_i.startswith('##FORMAT=<ID=BIAS,'):
            line_i = line_i.replace('Number=1', 'Number=.')
        
        elif line_i.startswith('##FORMAT=<ID=PSTD,') or \
        line_i.startswith('##FORMAT=<ID=QSTD,') or \
Ejemplo n.º 12
0
import genomic_file_handlers as genome

parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)

# Variant Call Type, i.e., snp or indel
parser.add_argument('-infile',  '--input-vcf',  type=str, help='Input VCF file', required=True)
parser.add_argument('-outfile', '--output-vcf', type=str, help='Output VCF file', required=True)

# Parse the arguments:
args = parser.parse_args()
infile = args.input_vcf
outfile = args.output_vcf


with genome.open_textfile(infile) as vcf_in, open(outfile, 'w') as vcf_out:
    
    line_i = vcf_in.readline().rstrip()
    
    while line_i.startswith('##'):
        
        vcf_out.write( line_i + '\n' )
        line_i = vcf_in.readline().rstrip()

    # This is the #CHROM line:
    headers = line_i.split('\t')
    num_columns = len(headers)
    vcf_out.write( line_i + '\n' )
    
    line_i = vcf_in.readline().rstrip()
    while line_i:
Ejemplo n.º 13
0
parser.add_argument('-snv',     '--snv-out',   type=str, help='Output VCF file', required=True)
parser.add_argument('-indel',   '--indel-out', type=str, help='Output VCF file', required=True)

parser.add_argument('-tnscope', '--is-tnscope', action="store_true", help='Actually TNscope VCF', required=False, default=False)

# Parse the arguments:
args = parser.parse_args()

infile = args.input_vcf
indel_out = args.indel_out
snv_out = args.snv_out

info_to_split = 'NLOD', 'TLOD'
info_to_keep = 'STR', 'ECNT'

with genome.open_textfile(infile) as vcf_in, open(snv_out, 'w') as snv_out, open(indel_out, 'w') as indel_out:
    
    line_i = vcf_in.readline().rstrip()
    
    while line_i.startswith('##'):
                
        if line_i.startswith('##normal_sample='):
            normal_name = line_i.split('=')[1]
            
        if line_i.startswith('##tumor_sample='):
            tumor_name = line_i.split('=')[1]
            
        if line_i.startswith('##INFO=<ID=SOR,'):
            line_i = re.sub(r'Float', 'String', line_i)

        snv_out.write( line_i + '\n' )
Ejemplo n.º 14
0
    '--somaticseq-trained',
    action='store_true',
    help=
    'If true, will use the QUAL as SomaticSeq score. Otherwise, SCORE will be .',
    required=False,
    default=False)

args = parser.parse_args()

vcf_in_fn = args.vcf_in
vcf_out_fn = args.vcf_out
caller_string = args.callers_classification_string
tumor = args.tumor_sample_name
somaticseq_trained = args.somaticseq_trained

with genome.open_textfile(vcf_in_fn) as vcfin, open(vcf_out_fn, 'w') as vcfout:

    line_in = vcfin.readline().rstrip('\n')

    while line_in.startswith('##'):

        if line_in.startswith('##SomaticSeq='):
            line_out = line_in + '-SEQC2'

        elif line_in.startswith('##INFO=<ID=NUM_TOOLS') or line_in.startswith(
                '##INFO=<ID={COMBO}'.format(COMBO=caller_string)):
            line_out = re.sub('##INFO=', '##FORMAT=', line_in)

        else:
            line_out = line_in
Ejemplo n.º 15
0
# Collect the add-ons requested on the command line:
#   header_append -- extra '##FORMAT=' meta-information lines for the header
#   format_append -- the matching FORMAT IDs, kept in the same order
header_append = []
format_append = []

if args.pileup_DP4:
    header_append.append('##FORMAT=<ID=plDP4,Number=4,Type=Integer,Description="DP4 from pileup: ref forward, ref reverse, alt forward, alt reverse">')
    format_append.append('plDP4')

if args.pileup_variant_allele_frequency:
    header_append.append('##FORMAT=<ID=plVAF,Number=1,Type=Float,Description="Variant allele frequency calculated from pileup">')
    format_append.append('plVAF')



# Start working by opening files. Each name is rebound from whatever it held
# before (presumably a path -- TODO confirm) to the object returned by the
# opener.
# The first AttributeError aborts the rest of the try body, and every name
# not yet reassigned keeps its previous value.
# NOTE(review): Npileup is opened last, presumably so that a missing (None)
# normal pileup makes open_textfile raise AttributeError without skipping the
# other three -- confirm against genome.open_textfile.
# NOTE(review): if the AttributeError comes from my_vcf or Tpileup instead,
# outhandle is never created and the error is silently ignored here.
try:
    my_vcf    = genome.open_textfile(my_vcf)
    Tpileup   = genome.open_textfile(Tpileup)
    outhandle = open(outfile, 'w')
    Npileup   = genome.open_textfile(Npileup)
except AttributeError:
    pass

# Read the first line of each pileup that is truthy, dropping the trailing
# newline. npileup_line / tpileup_line are not assigned here when the
# corresponding pileup is falsy.
if Npileup:
    npileup_line = Npileup.readline().rstrip('\n')

if Tpileup:
    tpileup_line = Tpileup.readline().rstrip('\n')

# Add the extra headers: my_vcf and the collected '##FORMAT=' lines are passed
# to genome.vcf_header_modifier and its result is kept in out_vcf_headers.
# NOTE(review): the shape of the returned value is not visible here -- confirm.
out_vcf_headers = genome.vcf_header_modifier( my_vcf, addons=header_append )
Ejemplo n.º 16
0
{tBAM_ALT_Clipped_Reads}\t\
{tBAM_Clipping_FET}\t\
{tBAM_MQ0}\t\
{tBAM_Other_Reads}\t\
{tBAM_Poor_Reads}\t\
{tBAM_REF_InDel_3bp}\t\
{tBAM_REF_InDel_2bp}\t\
{tBAM_REF_InDel_1bp}\t\
{tBAM_ALT_InDel_3bp}\t\
{tBAM_ALT_InDel_2bp}\t\
{tBAM_ALT_InDel_1bp}\t\
{InDel_Length}\t\
{TrueVariant_or_False}'

## Running
with genome.open_textfile(mysites) as my_sites, open(outfile,
                                                     'w') as outhandle:

    my_line = my_sites.readline().rstrip()

    bam = pysam.AlignmentFile(bam_fn, reference_filename=ref_fa)
    ref_fa = pysam.FastaFile(ref_fa)

    if truth:
        truth = genome.open_textfile(truth)
        truth_line = truth.readline().rstrip()
        while truth_line.startswith('#'):
            truth_line = truth.readline().rstrip()

    if cosmic:
        cosmic = genome.open_textfile(cosmic)
Ejemplo n.º 17
0
min_altMQ = args.min_altMQ
min_refBQ = args.min_refBQ
min_altBQ = args.min_altBQ
max_refNM = args.max_refNM
max_altNM = args.max_altNM
max_fetSB = args.max_fetSB
max_fetCD = args.max_fetCD
max_zMQ   = args.max_zMQ
max_zBQ   = args.max_zBQ
max_MQ0   = args.max_MQ0
min_VAF   = args.min_VAF
min_DP    = args.min_DP
min_varDP = args.min_varDP


with genome.open_textfile(infile) as vcf_in, open(outfile, 'w') as vcf_out:
    
    line_i = vcf_in.readline().rstrip()
    
    while line_i.startswith('##'):
        
        vcf_out.write( line_i + '\n' )
        line_i = vcf_in.readline().rstrip()
    
    vcf_out.write( line_i + '\n' )

    # This line will be #CHROM:
    header = line_i.split('\t')
    sample_index = header.index(sample) - 9
    
    # This will be the first variant line:
Ejemplo n.º 18
0
parser.add_argument('-infile',  '--vcf-in',   type=str, help='VCF in', required=True)
parser.add_argument('-outfile', '--vcf-out',  type=str, help='VCF out', required=True)
parser.add_argument('-callers', '--callers-classification-string', type=str, help='MVJSD or whatever',  required=True)
parser.add_argument('-tumor',   '--tumor-sample-name', type=str, help='tumor sample name',  required=False, default='TUMOR')
parser.add_argument('-trained', '--somaticseq-trained',    action='store_true', help='If true, will use the QUAL as SomaticSeq score. Otherwise, SCORE will be .', required=False, default=False)


args = parser.parse_args()

vcf_in_fn  = args.vcf_in
vcf_out_fn = args.vcf_out
caller_string = args.callers_classification_string
tumor = args.tumor_sample_name
somaticseq_trained = args.somaticseq_trained

with genome.open_textfile(vcf_in_fn) as vcfin, open(vcf_out_fn, 'w') as vcfout:
    
    line_in = vcfin.readline().rstrip('\n')
    
    while line_in.startswith('##'):
        
        if line_in.startswith('##SomaticSeq='):
            line_out = line_in + '-SEQC2'
            
        elif line_in.startswith('##INFO=<ID=NUM_TOOLS') or line_in.startswith('##INFO=<ID={COMBO}'.format(COMBO=caller_string)):
            line_out = re.sub('##INFO=', '##FORMAT=', line_in)
            
        else:
            line_out = line_in
        
        vcfout.write( line_out + '\n' )