def vcf(infileList, outfile):
    """Concatenate VCF-style text files into ``outfile`` with a single header.

    Each input is opened with ``genome.open_textfile``. The leading run of
    lines that start with ``#`` is treated as that file's header. The header
    is copied only from the first input that contributes any content; the
    leading ``#`` lines of every later input are dropped. All remaining lines
    of every input are copied in the order given.

    Args:
        infileList: Iterable of input file paths, in output order.
        outfile: Path of the file to create (opened with mode ``'w'``).

    Returns:
        None. The concatenated text is written to ``outfile``.
    """
    with open(outfile, 'w') as vcfout:
        header_written = False

        for file_i in infileList:
            with genome.open_textfile(file_i) as vcfin:
                in_header = True
                wrote_any = False

                # readline() returns '' at end of file, which ends the loop.
                for line_i in iter(vcfin.readline, ''):
                    if in_header and line_i.startswith('#'):
                        if header_written:
                            # A previous input already supplied the header.
                            continue
                    else:
                        # Once a non-'#' line is seen, everything after it is
                        # body, even if it happens to start with '#'.
                        in_header = False

                    # A final line lacking its newline would otherwise be
                    # fused with the first line of the next input.
                    if not line_i.endswith('\n'):
                        line_i += '\n'

                    vcfout.write(line_i)
                    wrote_any = True

            # Only stop writing headers once some input actually produced
            # output; an empty input must not suppress the next file's header.
            if wrote_any:
                header_written = True
def tsv(infileList, outfile):
    """Concatenate TSV text files into ``outfile`` with a single header row.

    Each input is opened with ``genome.open_textfile`` and its first line is
    treated as the header row. The header is copied only from the first
    non-empty input; the first line of every later input is dropped. All
    remaining lines of every input are copied in the order given.

    Args:
        infileList: Iterable of input file paths, in output order.
        outfile: Path of the file to create (opened with mode ``'w'``).

    Returns:
        None. The concatenated text is written to ``outfile``.
    """
    with open(outfile, 'w') as tsvout:
        header_written = False

        for file_i in infileList:
            with genome.open_textfile(file_i) as tsvin:
                # First line is a header
                header_line = tsvin.readline()

                if not header_line:
                    # Empty input: nothing to copy, and it must not count as
                    # having supplied the header.
                    continue

                if not header_written:
                    if not header_line.endswith('\n'):
                        header_line += '\n'
                    tsvout.write(header_line)
                    # Turn off header writing from now on:
                    header_written = True

                # readline() returns '' at end of file, which ends the loop.
                for line_i in iter(tsvin.readline, ''):
                    # A final row lacking its newline would otherwise be
                    # fused with the first row of the next input.
                    if not line_i.endswith('\n'):
                        line_i += '\n'
                    tsvout.write(line_i)
right_files = [i + '.vcf' + gz for i in chosen_chrom_sequence] else: right_files = (args.input_vcf) # Open files: if args.call_method == 'VarDict': snpout = open(out_snp, 'w') indelout = open(out_indel, 'w') else: vcfout = open(out_vcf, 'w') ### First, get the header. IF there are multiple VCF files (e.g., chromosome by chromosome), use the first file for this purpose: vcf_header = [] with genome.open_textfile(right_files[0]) as vcf: line_i = vcf.readline().rstrip() # Save the headers and then sort them: vcfheader_filter_info_filter = [] vcfheader_filter_info_filter.append( '##INFO=<ID={0},Number=0,Type=Flag,Description="Indicates if record is a {0} called somatic mutation">' .format(args.call_method)) vcfheader_misc = [] while line_i.startswith('#'): if re.match(r'##fileformat=', line_i): vcffileformat = line_i
header_append = [] format_append = [] if args.pileup_DP4: header_append.append('##FORMAT=<ID=plDP4,Number=4,Type=Integer,Description="DP4 from pileup: ref forward, ref reverse, alt forward, alt reverse">') format_append.append('plDP4') if args.pileup_variant_allele_frequency: header_append.append('##FORMAT=<ID=plVAF,Number=1,Type=Float,Description="Variant allele frequency calculated from pileup">') format_append.append('plVAF') # Start Working by opening files: try: my_vcf = genome.open_textfile(my_vcf) Tpileup = genome.open_textfile(Tpileup) outhandle = open(outfile, 'w') Npileup = genome.open_textfile(Npileup) except AttributeError: pass if Npileup: npileup_line = Npileup.readline().rstrip('\n') if Tpileup: tpileup_line = Tpileup.readline().rstrip('\n') # Add the extra headers: out_vcf_headers = genome.vcf_header_modifier( my_vcf, addons=header_append )
'--snv-out', type=str, help='Output VCF file', required=True) # Parse the arguments: args = parser.parse_args() infile = args.input_vcf indel_out = args.indel_out snv_out = args.snv_out info_to_split = 'NLOD', 'TLOD' info_to_keep = 'STR', 'ECNT' with genome.open_textfile(infile) as vcf_in, open( snv_out, 'w') as snv_out, open(indel_out, 'w') as indel_out: line_i = vcf_in.readline().rstrip() while line_i.startswith('##'): snv_out.write(line_i + '\n') indel_out.write(line_i + '\n') if line_i.startswith('##normal_sample='): normal_name = line_i.split('=')[1] if line_i.startswith('##tumor_sample='): tumor_name = line_i.split('=')[1]
for combo_i in itertools.product( (True, False), repeat = len(tools) ): # The four zeros represent [Total, dbsnp, COMMON, COSMIC] MVJS_combinations[combo_i] = [0, 0, 0, 0] # Keeping a tab on all those scores bina_score_tally = {} subscore_evidence_tally = {} bonus_knowledge_tally = {} penalty_knowledge_tally = {} num_methods_tally = {} with genome.open_textfile(args.input_vcf) as vcf, open(args.output_vcf, 'w') as vcf_out: line_i = vcf.readline().rstrip() while line_i.startswith('#'): # Read thru the headers and metadata: if line_i.startswith('#CHROM'): header_item = line_i.split('\t') if len(header_item) == 11: paired_mode = True idxN, idxT = 0,1 elif len(header_item) == 10: paired_mode = False
# Open files: if args.call_method == 'VarDict': snpout = open(out_snp, 'w') indelout = open(out_indel, 'w') else: vcfout = open(out_vcf, 'w') # First, get the header. IF there are multiple VCF files (e.g., chromosome by # chromosome), use the first file for this purpose: vcf_header = [] with genome.open_textfile(right_files[0]) as vcf: line_i = vcf.readline().rstrip() # Save the headers and then sort them: vcfheader_filter_info_filter = [] vcfheader_filter_info_filter.append('##INFO=<ID={0},Number=0,Type=Flag,Description="Indicates if record is a {0} called somatic mutation">'.format(args.call_method)) vcfheader_misc = [] while line_i.startswith('#'): if re.match(r'##fileformat=', line_i): vcffileformat = line_i elif re.match(r'^##FORMAT=<ID=DP4,', line_i):
'--output-vcf', type=str, help='Output VCF file', required=True, default=None) parser.add_argument('-tools', '--individual-mutation-tools', type=str, help='A list tools to sub-sample', nargs='*', required=True) args = parser.parse_args() subtools = set(args.individual_mutation_tools) with genome.open_textfile(args.input_vcf) as vcfin, open(args.output_vcf, 'w') as vcfout: line_i = vcfin.readline().rstrip('\n') while line_i.startswith('#'): vcfout.write(line_i + '\n') line_i = vcfin.readline().rstrip('\n') while line_i: vcf_i = genome.Vcf_line(line_i) if 'FalseNegative' in vcf_i.identifier: vcfout.write(line_i + '\n') else: tools = vcf_i.get_info_value('SOURCES')
parser.add_argument('-threshold', '--phasing-threshold', type=int, help='How far apart do we try to phase', required=False, default=1) args = parser.parse_args() infile = args.input_vcf_file bam = args.bam_file ref_fa = args.genome_reference outfile = args.output_vcf_file threshold = args.phasing_threshold with genome.open_textfile(infile) as infile, \ pysam.AlignmentFile(bam) as bam, \ open(outfile, 'w') as outfile, \ pysam.FastaFile(ref_fa) as ref_fa: my_line = infile.readline().rstrip() while my_line.startswith('##'): outfile.write(my_line + '\n') my_line = infile.readline().rstrip() # This is to read through and copy the #CHROM line assert my_line.startswith('#CHROM') outfile.write( '##INFO=<ID=COORDINATES,Number=.,Type=Integer,Description="Coordinates of the bases">\n' )
# Variant Call Type, i.e., snp or indel parser.add_argument('-infile', '--input-vcf', type=str, help='Input VCF file', required=True) parser.add_argument('-indel', '--indel-out', type=str, help='Output VCF file', required=True) parser.add_argument('-snv', '--snv-out', type=str, help='Output VCF file', required=True) # Parse the arguments: args = parser.parse_args() infile = args.input_vcf indel_out = args.indel_out snv_out = args.snv_out info_to_split = 'NLOD', 'TLOD' info_to_keep = 'STR', 'ECNT' with genome.open_textfile(infile) as vcf_in, open(snv_out, 'w') as snv_out, open(indel_out, 'w') as indel_out: line_i = vcf_in.readline().rstrip() while line_i.startswith('##'): snv_out.write( line_i + '\n' ) indel_out.write( line_i + '\n' ) if line_i.startswith('##normal_sample='): normal_name = line_i.split('=')[1] if line_i.startswith('##tumor_sample='): tumor_name = line_i.split('=')[1] line_i = vcf_in.readline().rstrip()
args = parser.parse_args() infile = args.input_vcf outfile = args.output_vcf # Seperate output into snv/snp and indel's: out_snp_file = outfile.split(os.sep) out_snp_file[-1] = 'snp.' + out_snp_file[-1] out_indel_file = outfile.split(os.sep) out_indel_file[-1] = 'indel.' + out_indel_file[-1] out_snp = os.sep.join(out_snp_file) out_indel = os.sep.join(out_indel_file) with genome.open_textfile(infile) as vcf, \ open(out_snp, 'w') as snpout, \ open(out_indel, 'w') as indelout: line_i = vcf.readline().rstrip() while line_i.startswith('##'): if re.match(r'^##INFO=<ID=(LSEQ|RSEQ),', line_i): line_i = line_i.replace('Number=G', 'Number=1') elif line_i.startswith('##FORMAT=<ID=BIAS,'): line_i = line_i.replace('Number=1', 'Number=.') elif line_i.startswith('##FORMAT=<ID=PSTD,') or \ line_i.startswith('##FORMAT=<ID=QSTD,') or \
import genomic_file_handlers as genome parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) # Variant Call Type, i.e., snp or indel parser.add_argument('-infile', '--input-vcf', type=str, help='Input VCF file', required=True) parser.add_argument('-outfile', '--output-vcf', type=str, help='Output VCF file', required=True) # Parse the arguments: args = parser.parse_args() infile = args.input_vcf outfile = args.output_vcf with genome.open_textfile(infile) as vcf_in, open(outfile, 'w') as vcf_out: line_i = vcf_in.readline().rstrip() while line_i.startswith('##'): vcf_out.write( line_i + '\n' ) line_i = vcf_in.readline().rstrip() # This is the #CHROM line: headers = line_i.split('\t') num_columns = len(headers) vcf_out.write( line_i + '\n' ) line_i = vcf_in.readline().rstrip() while line_i:
parser.add_argument('-snv', '--snv-out', type=str, help='Output VCF file', required=True) parser.add_argument('-indel', '--indel-out', type=str, help='Output VCF file', required=True) parser.add_argument('-tnscope', '--is-tnscope', action="store_true", help='Actually TNscope VCF', required=False, default=False) # Parse the arguments: args = parser.parse_args() infile = args.input_vcf indel_out = args.indel_out snv_out = args.snv_out info_to_split = 'NLOD', 'TLOD' info_to_keep = 'STR', 'ECNT' with genome.open_textfile(infile) as vcf_in, open(snv_out, 'w') as snv_out, open(indel_out, 'w') as indel_out: line_i = vcf_in.readline().rstrip() while line_i.startswith('##'): if line_i.startswith('##normal_sample='): normal_name = line_i.split('=')[1] if line_i.startswith('##tumor_sample='): tumor_name = line_i.split('=')[1] if line_i.startswith('##INFO=<ID=SOR,'): line_i = re.sub(r'Float', 'String', line_i) snv_out.write( line_i + '\n' )
'--somaticseq-trained', action='store_true', help= 'If true, will use the QUAL as SomaticSeq score. Otherwise, SCORE will be .', required=False, default=False) args = parser.parse_args() vcf_in_fn = args.vcf_in vcf_out_fn = args.vcf_out caller_string = args.callers_classification_string tumor = args.tumor_sample_name somaticseq_trained = args.somaticseq_trained with genome.open_textfile(vcf_in_fn) as vcfin, open(vcf_out_fn, 'w') as vcfout: line_in = vcfin.readline().rstrip('\n') while line_in.startswith('##'): if line_in.startswith('##SomaticSeq='): line_out = line_in + '-SEQC2' elif line_in.startswith('##INFO=<ID=NUM_TOOLS') or line_in.startswith( '##INFO=<ID={COMBO}'.format(COMBO=caller_string)): line_out = re.sub('##INFO=', '##FORMAT=', line_in) else: line_out = line_in
{tBAM_ALT_Clipped_Reads}\t\ {tBAM_Clipping_FET}\t\ {tBAM_MQ0}\t\ {tBAM_Other_Reads}\t\ {tBAM_Poor_Reads}\t\ {tBAM_REF_InDel_3bp}\t\ {tBAM_REF_InDel_2bp}\t\ {tBAM_REF_InDel_1bp}\t\ {tBAM_ALT_InDel_3bp}\t\ {tBAM_ALT_InDel_2bp}\t\ {tBAM_ALT_InDel_1bp}\t\ {InDel_Length}\t\ {TrueVariant_or_False}' ## Running with genome.open_textfile(mysites) as my_sites, open(outfile, 'w') as outhandle: my_line = my_sites.readline().rstrip() bam = pysam.AlignmentFile(bam_fn, reference_filename=ref_fa) ref_fa = pysam.FastaFile(ref_fa) if truth: truth = genome.open_textfile(truth) truth_line = truth.readline().rstrip() while truth_line.startswith('#'): truth_line = truth.readline().rstrip() if cosmic: cosmic = genome.open_textfile(cosmic)
min_altMQ = args.min_altMQ min_refBQ = args.min_refBQ min_altBQ = args.min_altBQ max_refNM = args.max_refNM max_altNM = args.max_altNM max_fetSB = args.max_fetSB max_fetCD = args.max_fetCD max_zMQ = args.max_zMQ max_zBQ = args.max_zBQ max_MQ0 = args.max_MQ0 min_VAF = args.min_VAF min_DP = args.min_DP min_varDP = args.min_varDP with genome.open_textfile(infile) as vcf_in, open(outfile, 'w') as vcf_out: line_i = vcf_in.readline().rstrip() while line_i.startswith('##'): vcf_out.write( line_i + '\n' ) line_i = vcf_in.readline().rstrip() vcf_out.write( line_i + '\n' ) # This line will be #CHROM: header = line_i.split('\t') sample_index = header.index(sample) - 9 # This will be the first variant line:
parser.add_argument('-infile', '--vcf-in', type=str, help='VCF in', required=True) parser.add_argument('-outfile', '--vcf-out', type=str, help='VCF out', required=True) parser.add_argument('-callers', '--callers-classification-string', type=str, help='MVJSD or whatever', required=True) parser.add_argument('-tumor', '--tumor-sample-name', type=str, help='tumor sample name', required=False, default='TUMOR') parser.add_argument('-trained', '--somaticseq-trained', action='store_true', help='If true, will use the QUAL as SomaticSeq score. Otherwise, SCORE will be .', required=False, default=False) args = parser.parse_args() vcf_in_fn = args.vcf_in vcf_out_fn = args.vcf_out caller_string = args.callers_classification_string tumor = args.tumor_sample_name somaticseq_trained = args.somaticseq_trained with genome.open_textfile(vcf_in_fn) as vcfin, open(vcf_out_fn, 'w') as vcfout: line_in = vcfin.readline().rstrip('\n') while line_in.startswith('##'): if line_in.startswith('##SomaticSeq='): line_out = line_in + '-SEQC2' elif line_in.startswith('##INFO=<ID=NUM_TOOLS') or line_in.startswith('##INFO=<ID={COMBO}'.format(COMBO=caller_string)): line_out = re.sub('##INFO=', '##FORMAT=', line_in) else: line_out = line_in vcfout.write( line_out + '\n' )