def sambamba(inbams, outbam, tech='docker', input_parameters=None, remove_inbams=False):
    """Write a job script that merges BAM files with ``sambamba merge``, then run the configured action on it.

    Args:
        inbams: Iterable of host paths of the BAM files to merge.
        outbam: Host path of the merged BAM file.
        tech: Container technology, forwarded to ``container.container_params``.
        input_parameters: Dict of settings. Keys missing from it are filled in
            (in place) from the module-level ``DEFAULT_PARAMS``. ``None`` means
            "use only the defaults".
        remove_inbams: When true, the script removes the input BAMs after the merge.

    Returns:
        Path of the script that was written.
    """
    # A fresh dict per call: defaults are written into it below, so a shared
    # mutable default argument would carry settings over between calls.
    if input_parameters is None:
        input_parameters = {}

    for param_i, default_i in DEFAULT_PARAMS.items():
        input_parameters.setdefault(param_i, default_i)

    # Materialize once; the inputs are read several times below and a
    # one-shot iterator would be empty after the first pass.
    inbams = list(inbams)

    logdir = os.path.join(input_parameters['output_directory'], 'logs')
    outfile = os.path.join(logdir, input_parameters['script'])

    merge_line, fileDict = container.container_params(
        input_parameters['sambamba_image'],
        tech=tech,
        files=inbams + [outbam],
        extra_args=input_parameters['extra_docker_options'])

    mounted_outbam = fileDict[outbam]['mount_path']
    infile_string = ' '.join(fileDict[file_i]['mount_path'] for file_i in inbams)

    with open(outfile, 'w') as out:

        out.write("#!/bin/bash\n\n")

        out.write(f'#$ -o {logdir}\n')
        out.write(f'#$ -e {logdir}\n')
        out.write('#$ -S /bin/bash\n')
        out.write('#$ -l h_vmem={}G\n'.format(input_parameters['MEM']))
        out.write('set -e\n\n')

        # Do not change this: picard_fractional uses this to end the copying.
        out.write('echo -e "Start at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2\n\n')

        out.write(f'{merge_line} \\\n')
        out.write('sambamba merge -t {} {} {}\n\n'.format(
            input_parameters['threads'], mounted_outbam, infile_string))

        # The removal is written with host paths, outside the container command.
        if remove_inbams:
            out.write('rm {}\n\n'.format(' '.join(inbams)))

        out.write('\necho -e "Done at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2\n')

    # "Run" the script that was generated
    command_line = '{} {}'.format(input_parameters['action'], outfile)
    subprocess.call(command_line, shell=True)

    return outfile
def run_SomaticSeq(input_parameters, tech='docker'):
    """Write a job script that runs SomaticSeq in ``single`` mode, then run the configured action on it.

    Args:
        input_parameters: Dict of settings; it is updated in place with the
            defaults below for any missing key. The keys ``bam``,
            ``genome_reference``, ``script``, ``threads``,
            ``extra_docker_options`` and the ``run_*`` caller switches have no
            default here and are read directly.
        tech: Container technology, forwarded to ``container.container_params``.

    Returns:
        Path of the script that was written.
    """
    DEFAULT_PARAMS = {
        'MEM': '4G',
        'inclusion_region': None,
        'exclusion_region': None,
        'output_directory': os.curdir,
        'somaticseq_directory': 'SomaticSeq',
        'action': 'echo',
        'dbsnp': None,
        'cosmic': None,
        'snv_classifier': None,
        'indel_classifier': None,
        'truth_snv': None,
        'truth_indel': None,
        'somaticseq_arguments': '',
        'train_somaticseq': False,
        'somaticseq_algorithm': 'xgboost',
    }

    for param_i, default_i in DEFAULT_PARAMS.items():
        input_parameters.setdefault(param_i, default_i)

    # Only paths that are actually set get registered with the container.
    path_keys = (
        'bam', 'genome_reference', 'output_directory', 'inclusion_region',
        'exclusion_region', 'dbsnp', 'cosmic', 'snv_classifier',
        'indel_classifier', 'truth_snv', 'truth_indel',
    )
    all_paths = [input_parameters[key_i] for key_i in path_keys if input_parameters[key_i]]

    container_line, fileDict = container.container_params(
        f'lethalfang/somaticseq:{VERSION}',
        tech=tech,
        files=all_paths,
        extra_args=input_parameters['extra_docker_options'])

    def mounted(key):
        """Return the in-container path of the file stored under *key*."""
        return fileDict[input_parameters[key]]['mount_path']

    # Mounted paths for all the input files and output directory:
    mounted_genome_reference = mounted('genome_reference')
    mounted_tumor_bam = mounted('bam')
    mounted_outdir = mounted('output_directory')

    outdir = os.path.join(input_parameters['output_directory'], input_parameters['somaticseq_directory'])
    logdir = os.path.join(outdir, 'logs')
    outfile = os.path.join(logdir, input_parameters['script'])

    algorithm = input_parameters['somaticseq_algorithm']

    # (switch in input_parameters, SomaticSeq flag, VCF path below the output directory)
    caller_vcfs = (
        ('run_mutect2', '--mutect2-vcf', 'MuTect2.vcf'),
        ('run_varscan2', '--varscan-vcf', 'VarScan2.vcf'),
        ('run_vardict', '--vardict-vcf', 'VarDict.vcf'),
        ('run_lofreq', '--lofreq-vcf', 'LoFreq.vcf'),
        ('run_scalpel', '--scalpel-vcf', 'Scalpel.vcf'),
        ('run_strelka2', '--strelka-vcf', 'Strelka/results/variants/variants.vcf.gz'),
    )

    os.makedirs(logdir, exist_ok=True)
    with open(outfile, 'w') as out:

        out.write("#!/bin/bash\n\n")

        out.write(f'#$ -o {logdir}\n')
        out.write(f'#$ -e {logdir}\n')
        out.write('#$ -S /bin/bash\n')
        out.write('#$ -l h_vmem={}\n'.format(input_parameters['MEM']))
        out.write('set -e\n\n')

        out.write('echo -e "Start at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2\n\n')

        out.write(f'{container_line} \\\n')
        out.write('/opt/somaticseq/somaticseq/run_somaticseq.py \\\n')

        # Training is only requested here for single-threaded runs.
        if input_parameters['train_somaticseq'] and input_parameters['threads'] == 1:
            out.write(f'--somaticseq-train --algorithm {algorithm} \\\n')

        out.write('--output-directory {} \\\n'.format(
            os.path.join(mounted_outdir, input_parameters['somaticseq_directory'])))
        out.write(f'--genome-reference {mounted_genome_reference} \\\n')

        # Optional resources: every one is passed by its in-container path.
        # (The exclusion region and dbSNP used to be written with the host path
        # and a non-existent 'dbsnp_vcf' key, respectively.)
        for key_i, flag_i in (('inclusion_region', '--inclusion-region'),
                              ('exclusion_region', '--exclusion-region'),
                              ('cosmic', '--cosmic-vcf'),
                              ('dbsnp', '--dbsnp-vcf')):
            if input_parameters[key_i]:
                out.write(f'{flag_i} {mounted(key_i)} \\\n')

        if input_parameters['snv_classifier'] or input_parameters['indel_classifier']:
            out.write(f'--algorithm {algorithm} \\\n')

        for key_i, flag_i in (('snv_classifier', '--classifier-snv'),
                              ('indel_classifier', '--classifier-indel'),
                              ('truth_snv', '--truth-snv'),
                              ('truth_indel', '--truth-indel')):
            if input_parameters[key_i]:
                out.write(f'{flag_i} {mounted(key_i)} \\\n')

        # NOTE: --algorithm may be emitted more than once above; every
        # occurrence carries the same value.
        if algorithm:
            out.write(f'--algorithm {algorithm} \\\n')

        if input_parameters['somaticseq_arguments']:
            out.write('{} \\\n'.format(input_parameters['somaticseq_arguments']))

        out.write('single \\\n')
        out.write(f'--bam-file {mounted_tumor_bam} \\\n')

        for switch_i, flag_i, vcf_i in caller_vcfs:
            if input_parameters[switch_i]:
                out.write(f'{flag_i} {mounted_outdir}/{vcf_i} \\\n')

        out.write('\necho -e "Done at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2\n')

    # "Run" the script that was generated
    command_line = '{} {}'.format(input_parameters['action'], outfile)
    subprocess.call(command_line, shell=True)

    return outfile
def _write_concat_command(out, container_line, infiles, outfile, bgzip=True):
    """Write one containerized ``concat.py`` command that joins *infiles* into *outfile*."""
    out.write(f'{container_line} \\\n')
    if bgzip:
        out.write('concat.py --bgzip-output -infiles \\\n')
    else:
        out.write('concat.py -infiles \\\n')

    # All inputs go on one line, each followed by a space.
    for infile_i in infiles:
        out.write(infile_i + ' ')

    out.write('\\\n')
    out.write(f'-outfile {outfile}\n\n')


def _write_training_command(out, container_line, algorithm, threads, tsv):
    """Write one containerized classifier-training command for the combined *tsv*."""
    out.write(f'{container_line} \\\n')
    if algorithm == 'ada':
        out.write(f'ada_model_builder_ntChange.R {tsv}\n\n')
    else:
        out.write(f'somatic_xgboost.py train -threads {threads} -tsvs {tsv}\n\n')


def merge_results(input_parameters, tech='docker'):
    """Write a job script that concatenates the per-thread results, then run the configured action on it.

    Per-thread inputs are read from ``<output_directory>/<i>/...`` for
    ``i`` in ``1..threads`` and the merged files are written directly into the
    output directory (all paths as seen inside the container).

    Args:
        input_parameters: Dict of settings; it is updated in place with the
            defaults below for any missing key. ``genome_reference``,
            ``threads``, ``extra_docker_options``, ``run_somaticseq`` and the
            ``run_*`` caller switches have no default here and are read directly.
        tech: Container technology, forwarded to ``container.container_params``.

    Returns:
        Path of the script that was written.
    """
    DEFAULT_PARAMS = {
        'MEM': '4G',
        'output_directory': os.curdir,
        'somaticseq_directory': 'SomaticSeq',
        'action': 'echo',
        'script': 'mergeResults.{}.cmd'.format(ts),
        'snv_classifier': None,
        'indel_classifier': None,
        'truth_snv': None,
        'truth_indel': None,
        'somaticseq_arguments': '',
        'train_somaticseq': False,
        'somaticseq_algorithm': 'xgboost',
    }

    for param_i, default_i in DEFAULT_PARAMS.items():
        input_parameters.setdefault(param_i, default_i)

    path_keys = ('genome_reference', 'output_directory', 'snv_classifier',
                 'indel_classifier', 'truth_snv', 'truth_indel')
    all_paths = [input_parameters[key_i] for key_i in path_keys if input_parameters[key_i]]

    container_line, fileDict = container.container_params(
        f'lethalfang/somaticseq:{VERSION}',
        tech=tech,
        files=all_paths,
        extra_args=input_parameters['extra_docker_options'])

    # Mounted paths for all the input files and output directory:
    mounted_outdir = fileDict[input_parameters['output_directory']]['mount_path']

    prjdir = input_parameters['output_directory']
    logdir = os.path.join(prjdir, 'logs')
    outfile = os.path.join(logdir, input_parameters['script'])

    somaticdir = input_parameters['somaticseq_directory']

    def per_thread(relative_path):
        """Return the mounted path of *relative_path* inside each thread directory."""
        return [f'{mounted_outdir}/{i}/{relative_path}'
                for i in range(1, input_parameters['threads'] + 1)]

    def concat(relative_path, merged_name, bgzip=True):
        """Merge the per-thread copies of *relative_path* into *merged_name* in the output directory."""
        _write_concat_command(out, container_line, per_thread(relative_path),
                              f'{mounted_outdir}/{merged_name}', bgzip=bgzip)

    # (switch in input_parameters, per-thread file, merged file name)
    caller_outputs = (
        ('run_mutect2', 'MuTect2.vcf', 'MuTect2.vcf'),
        ('run_varscan2', 'VarScan2.vcf', 'VarScan2.vcf'),
        ('run_vardict', 'VarDict.vcf', 'VarDict.vcf'),
        ('run_lofreq', 'LoFreq.vcf', 'LoFreq.vcf'),
        ('run_scalpel', 'Scalpel.vcf', 'Scalpel.vcf'),
        ('run_strelka2', 'Strelka/results/variants/variants.vcf.gz', 'Strelka.vcf'),
    )

    os.makedirs(logdir, exist_ok=True)
    with open(outfile, 'w') as out:

        out.write("#!/bin/bash\n\n")

        out.write(f'#$ -o {logdir}\n')
        out.write(f'#$ -e {logdir}\n')
        out.write('#$ -S /bin/bash\n')
        out.write('#$ -l h_vmem={}\n'.format(input_parameters['MEM']))
        out.write('set -e\n\n')

        out.write('echo -e "Start at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2\n\n')

        for switch_i, thread_file_i, merged_i in caller_outputs:
            if input_parameters[switch_i]:
                concat(thread_file_i, merged_i)

        ###### SomaticSeq #####
        if input_parameters['run_somaticseq']:

            algorithm = input_parameters['somaticseq_algorithm']
            training = input_parameters['train_somaticseq']

            # The combined Ensemble TSVs are written without --bgzip-output.
            concat(f'{somaticdir}/Ensemble.sSNV.tsv', 'Ensemble.sSNV.tsv', bgzip=False)
            concat(f'{somaticdir}/Ensemble.sINDEL.tsv', 'Ensemble.sINDEL.tsv', bgzip=False)

            # If asked to create classifier, do it here when TSV files are combined
            if training and input_parameters['truth_snv']:
                _write_training_command(out, container_line, algorithm,
                                        input_parameters['threads'],
                                        f'{mounted_outdir}/Ensemble.sSNV.tsv')

            if training and input_parameters['truth_indel']:
                _write_training_command(out, container_line, algorithm,
                                        input_parameters['threads'],
                                        f'{mounted_outdir}/Ensemble.sINDEL.tsv')

            # If in prediction mode, combine SSeq.Classified.sSNV.vcf, else Consensus.sSNV.vcf
            if input_parameters['snv_classifier']:
                concat(f'{somaticdir}/SSeq.Classified.sSNV.vcf', 'SSeq.Classified.sSNV.vcf')
                concat(f'{somaticdir}/SSeq.Classified.sSNV.tsv', 'SSeq.Classified.sSNV.tsv')
            else:
                concat(f'{somaticdir}/Consensus.sSNV.vcf', 'Consensus.sSNV.vcf')

            # If in prediction mode, combine SSeq.Classified.sINDEL.vcf, else Consensus.sINDEL.vcf
            if input_parameters['indel_classifier']:
                concat(f'{somaticdir}/SSeq.Classified.sINDEL.vcf', 'SSeq.Classified.sINDEL.vcf')
                concat(f'{somaticdir}/SSeq.Classified.sINDEL.tsv', 'SSeq.Classified.sINDEL.tsv')
            else:
                concat(f'{somaticdir}/Consensus.sINDEL.vcf', 'Consensus.sINDEL.vcf')

        out.write('\necho -e "Done at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2\n')

    command_line = '{} {}'.format(input_parameters['action'], outfile)
    subprocess.call(command_line, shell=True)

    return outfile
def bwa(input_parameters, tech='docker'):
    """Write a job script that aligns reads with ``bwa mem``, sorts and indexes the BAM, then run the configured action on it.

    A truthy ``in_fastq2`` selects paired-end mode; otherwise only
    ``in_fastq1`` is aligned.

    Args:
        input_parameters: Dict of settings; it is updated in place with the
            module-level ``DEFAULT_PARAMS`` for any missing key. ``MEM`` and
            ``threads`` are used arithmetically, so they must be numbers.
        tech: Container technology, forwarded to ``container.container_params``.

    Returns:
        Path of the script that was written.
    """
    # Fill in the defaults first so that an omitted 'in_fastq2' can come from them.
    for param_i, default_i in DEFAULT_PARAMS.items():
        input_parameters.setdefault(param_i, default_i)

    paired_end = bool(input_parameters['in_fastq2'])

    logdir = os.path.join(input_parameters['output_directory'], 'logs')
    outfile = os.path.join(logdir, input_parameters['script'])

    path_keys = ('output_directory', 'genome_reference', 'in_fastq1', 'in_fastq2')
    all_paths = [input_parameters[key_i] for key_i in path_keys if input_parameters[key_i]]

    bwa_line, fileDict = container.container_params(
        input_parameters['bwa_image'],
        tech=tech,
        files=all_paths,
        extra_args=input_parameters['extra_docker_options'])

    # Mounted paths for all the input files and output directory:
    mounted_outdir = fileDict[input_parameters['output_directory']]['mount_path']
    mounted_reference = fileDict[input_parameters['genome_reference']]['mount_path']
    mounted_fq1 = fileDict[input_parameters['in_fastq1']]['mount_path']

    # The second FASTQ is only registered with the container when it is set,
    # so it must only be looked up in paired-end mode.
    mounted_fq2 = None
    if paired_end:
        mounted_fq2 = fileDict[input_parameters['in_fastq2']]['mount_path']

    with open(outfile, 'w') as out:

        out.write("#!/bin/bash\n\n")

        out.write(f'#$ -o {logdir}\n')
        out.write(f'#$ -e {logdir}\n')
        out.write('#$ -S /bin/bash\n')
        # The requested memory is MEM multiplied by the number of threads.
        out.write('#$ -l h_vmem={}G\n'.format(
            input_parameters['MEM'] * input_parameters['threads']))
        out.write('set -e\n\n')

        out.write('echo -e "Start at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2\n\n')

        # bwa mem | samtools view | samtools sort run as one quoted pipeline
        # inside the container.
        out.write(f'{bwa_line} bash -c \\\n')
        out.write('"bwa mem \\\n')
        out.write('-R \'{}\' \\\n'.format(input_parameters['bam_header']))
        out.write('-M {} -t {} \\\n'.format(
            input_parameters['extra_bwa_arguments'], input_parameters['threads']))
        out.write('{} \\\n'.format(mounted_reference))
        out.write('{} \\\n'.format(mounted_fq1))

        if paired_end:
            out.write('{} \\\n'.format(mounted_fq2))

        out.write('| samtools view -Sbh - \\\n')
        # The sort step gets half of MEM and half of the threads, rounded up.
        out.write(
            '| samtools sort -m {MEM}G --threads {THREADS} -o {DIR}/{OUTFILE}"\n\n'
            .format(MEM=math.ceil(input_parameters['MEM'] / 2),
                    THREADS=math.ceil(input_parameters['threads'] / 2),
                    DIR=mounted_outdir,
                    OUTFILE=input_parameters['out_bam']))

        out.write(f'{bwa_line} \\\n')
        out.write('samtools index -@{} {}\n'.format(
            input_parameters['threads'],
            os.path.join(mounted_outdir, input_parameters['out_bam'])))

        out.write('\necho -e "Done at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2\n')

    # "Run" the script that was generated
    command_line = '{} {}'.format(input_parameters['action'], outfile)
    subprocess.call(command_line, shell=True)

    return outfile
def tumor_normal(input_parameters=DEFAULT_PARAMS, tech='docker'):
    """Write a job script that configures and runs the Strelka2 somatic workflow, then run the configured action on it.

    When ``inclusion_region`` is set, the script first bgzips and
    tabix-indexes that BED file into the output directory and passes the result
    as ``--callRegions``; without it, those steps and the option are left out.

    Args:
        input_parameters: Dict of settings; it is updated in place with the
            module-level ``DEFAULT_PARAMS`` for any missing key. ``MEM`` must
            be a string holding a number of gigabytes with an optional ``G``
            suffix (e.g. ``'4G'``).
        tech: Container technology, forwarded to ``container.container_params``.

    Returns:
        Path of the script that was written.

    Raises:
        AssertionError: If the normal BAM, tumor BAM or genome reference does not exist.
        ValueError: If ``MEM`` is not numeric once the ``G`` suffix is removed.
    """
    for param_i, default_i in DEFAULT_PARAMS.items():
        input_parameters.setdefault(param_i, default_i)

    # The following are required:
    assert os.path.exists(input_parameters['normal_bam'])
    assert os.path.exists(input_parameters['tumor_bam'])
    assert os.path.exists(input_parameters['genome_reference'])

    logdir = os.path.join(input_parameters['output_directory'], 'logs')
    outfile = os.path.join(logdir, input_parameters['script'])

    path_keys = ('normal_bam', 'tumor_bam', 'genome_reference',
                 'output_directory', 'inclusion_region')
    all_paths = [input_parameters[key_i] for key_i in path_keys if input_parameters[key_i]]

    container_line, fileDict = container.container_params(
        input_parameters['strelka2_image'],
        tech=tech,
        files=all_paths,
        extra_args=input_parameters['extra_docker_options'])

    # Mounted paths for all the input files and output directory:
    mounted_genome_reference = fileDict[input_parameters['genome_reference']]['mount_path']
    mounted_tumor_bam = fileDict[input_parameters['tumor_bam']]['mount_path']
    mounted_normal_bam = fileDict[input_parameters['normal_bam']]['mount_path']
    mounted_outdir = fileDict[input_parameters['output_directory']]['mount_path']

    inclusion_region = input_parameters['inclusion_region']

    # Parse the memory setting as a plain number rather than eval()-ing the
    # string; done before the script is opened so a bad value leaves no
    # half-written file behind.
    mem_text = input_parameters['MEM'].rstrip('G')
    try:
        mem_gb = int(mem_text)
    except ValueError:
        mem_gb = float(mem_text)
    call_mem_mb = mem_gb * 1024

    with open(outfile, 'w') as out:

        out.write("#!/bin/bash\n\n")

        out.write(f'#$ -o {logdir}\n')
        out.write(f'#$ -e {logdir}\n')
        out.write('#$ -S /bin/bash\n')
        out.write('#$ -l h_vmem={}\n'.format(input_parameters['MEM']))
        out.write('set -e\n\n')

        out.write('echo -e "Start at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2\n\n')

        bed_gz = None
        if inclusion_region:
            # Make .bed.gz out of .bed files using tabix:
            bed_gz = fileDict[inclusion_region]['filename'] + '.gz'

            tabix_line, tabixDict = container.container_params(
                'lethalfang/tabix:1.7', tech, all_paths)
            tabix_selector = tabixDict[inclusion_region]['mount_path']
            tabix_outdir = tabixDict[input_parameters['output_directory']]['mount_path']

            out.write(
                '{DOCKER_LINE} bash -c "cat {SELECTOR} | bgzip > {OUTDIR}/{BEDGZ}"\n'
                .format(DOCKER_LINE=tabix_line,
                        SELECTOR=tabix_selector,
                        OUTDIR=tabix_outdir,
                        BEDGZ=bed_gz))
            out.write('{DOCKER_LINE} tabix -f {OUTDIR}/{BEDGZ}\n\n'.format(
                DOCKER_LINE=tabix_line, OUTDIR=tabix_outdir, BEDGZ=bed_gz))

        out.write(f'{container_line} \\\n')
        out.write('/opt/strelka/bin/configureStrelkaSomaticWorkflow.py \\\n')
        out.write('--tumorBam={} \\\n'.format(mounted_tumor_bam))
        out.write('--normalBam={} \\\n'.format(mounted_normal_bam))
        out.write('--referenceFasta={} \\\n'.format(mounted_genome_reference))
        out.write('--callMemMb={} \\\n'.format(call_mem_mb))

        # Only restrict the call regions when a BED file was prepared above.
        if bed_gz is not None:
            out.write('--callRegions={}/{} \\\n'.format(mounted_outdir, bed_gz))

        if input_parameters['exome']:
            out.write('--exome \\\n')

        if input_parameters['strelka_config_arguments']:
            out.write('{} \\\n'.format(input_parameters['strelka_config_arguments']))

        out.write('--runDir={}/{}\n\n'.format(mounted_outdir, input_parameters['outdir_name']))

        out.write(f'{container_line} \\\n')
        out.write('{}/{}/runWorkflow.py -m local -j 1 {}\n'.format(
            mounted_outdir,
            input_parameters['outdir_name'],
            input_parameters['strelka_run_arguments']))

        out.write('\necho -e "Done at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2\n')

    # "Run" the script that was generated
    command_line = '{} {}'.format(input_parameters['action'], outfile)
    subprocess.call(command_line, shell=True)

    return outfile
def alienTrimmer(input_parameters, tech='docker'):
    """Write a job script that trims FASTQ reads with AlienTrimmer and bgzips the result, then run the configured action on it.

    A truthy ``in_fastq2`` selects paired-end mode. Inputs whose first FASTQ
    name ends in ``.gz`` are decompressed to temporary files first. All
    intermediate files are removed at the end of the script; the untrimmed
    inputs are removed as well when ``remove_untrimmed`` is set.

    Args:
        input_parameters: Dict of settings; it is updated in place with the
            module-level ``DEFAULT_PARAMS`` for any missing key.
        tech: Container technology, forwarded to ``container.container_params``.

    Returns:
        Path of the script that was written.
    """
    # Fill in the defaults first so that an omitted 'in_fastq2' can come from them.
    for param_i, default_i in DEFAULT_PARAMS.items():
        input_parameters.setdefault(param_i, default_i)

    paired_end = bool(input_parameters['in_fastq2'])

    logdir = os.path.join(input_parameters['output_directory'], 'logs')
    outfile = os.path.join(logdir, input_parameters['script'])

    path_keys = ('output_directory', 'in_fastq1', 'in_fastq2')
    all_paths = [input_parameters[key_i] for key_i in path_keys if input_parameters[key_i]]

    trim_line, fileDict = container.container_params(
        input_parameters['alienTrimmerImage'],
        tech=tech,
        files=all_paths,
        extra_args=input_parameters['extra_docker_options'])

    # The tabix image supplies gunzip/bgzip. It is needed for the final
    # compression whether or not the inputs were gzipped, so set it up
    # unconditionally (it used to exist only for .gz input).
    tabix_line, tabixDict = container.container_params(
        'lethalfang/tabix:1.7', tech, all_paths)
    tabix_outdir = tabixDict[input_parameters['output_directory']]['mount_path']

    # Mounted paths for all the input files and output directory:
    mounted_outdir = fileDict[input_parameters['output_directory']]['mount_path']

    # File names (relative to the output directory) to delete at the end.
    temporary_files = []
    mounted_fq2 = None

    with open(outfile, 'w') as out:

        out.write("#!/bin/bash\n\n")

        out.write(f'#$ -o {logdir}\n')
        out.write(f'#$ -e {logdir}\n')
        out.write('#$ -S /bin/bash\n')
        out.write('#$ -l h_vmem={}G\n'.format(input_parameters['MEM']))
        out.write('set -e\n\n')

        out.write('echo -e "Start at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2\n\n')

        # AlienTrimmer does not do bgzipped fastq files, unfortunately:
        if input_parameters['in_fastq1'].endswith('.gz'):

            out_fastq_1 = uuid.uuid4().hex + '.fastq'
            out_fastq_2 = uuid.uuid4().hex + '.fastq'

            tabix_fq1 = tabixDict[input_parameters['in_fastq1']]['mount_path']

            out.write(f'{tabix_line} bash -c \\\n')
            out.write('"gunzip -c {} > {}/{}"\n\n'.format(
                tabix_fq1, tabix_outdir, out_fastq_1))

            mounted_fq1 = os.path.join(mounted_outdir, out_fastq_1)
            temporary_files.append(out_fastq_1)

            if paired_end:
                tabix_fq2 = tabixDict[input_parameters['in_fastq2']]['mount_path']

                out.write(f'{tabix_line} bash -c \\\n')
                out.write('"gunzip -c {} > {}/{}"\n\n'.format(
                    tabix_fq2, tabix_outdir, out_fastq_2))

                mounted_fq2 = os.path.join(mounted_outdir, out_fastq_2)
                temporary_files.append(out_fastq_2)

        else:
            mounted_fq1 = fileDict[input_parameters['in_fastq1']]['mount_path']

            if paired_end:
                mounted_fq2 = fileDict[input_parameters['in_fastq2']]['mount_path']

        out.write(f'{trim_line} \\\n')
        out.write('/opt/AlienTrimmer_0.4.0/src/AlienTrimmer \\\n')

        if paired_end:
            trimmed_fq1 = uuid.uuid4().hex + '.fastq'
            trimmed_fq2 = uuid.uuid4().hex + '.fastq'
            singleton = uuid.uuid4().hex + '.fastq'

            out.write('-if {} -ir {} \\\n'.format(mounted_fq1, mounted_fq2))
            out.write('-of {}/{} -or {}/{} \\\n'.format(
                mounted_outdir, trimmed_fq1, mounted_outdir, trimmed_fq2))
            out.write('-os {}/{} \\\n'.format(mounted_outdir, singleton))

            temporary_files.extend([trimmed_fq1, trimmed_fq2, singleton])

        else:
            trimmed_fq1 = uuid.uuid4().hex + '.fastq'

            # These two lines called a non-existent str.write(); str.format()
            # is what fills in the placeholders.
            out.write('-i {} \\\n'.format(mounted_fq1))
            out.write('-o {}/{} \\\n'.format(mounted_outdir, trimmed_fq1))

            temporary_files.append(trimmed_fq1)

        out.write('-c {} \\\n'.format(input_parameters['adapter']))
        out.write('-l {}\n\n'.format(input_parameters['minimum_length']))

        # Compress the trimmed output under the requested file names.
        out.write(f'{tabix_line} bash -c \\\n')
        out.write('"cat {}/{} | bgzip -@{} > {}/{}"\n'.format(
            tabix_outdir, trimmed_fq1, input_parameters['threads'],
            tabix_outdir, input_parameters['out_fastq1_name']))

        if paired_end:
            out.write(f'{tabix_line} bash -c \\\n')
            out.write('"cat {}/{} | bgzip -@{} > {}/{}"\n'.format(
                tabix_outdir, trimmed_fq2, input_parameters['threads'],
                tabix_outdir, input_parameters['out_fastq2_name']))

            out.write(f'{tabix_line} bash -c \\\n')
            out.write('"cat {}/{} | bgzip -@{} > {}/{}"\n'.format(
                tabix_outdir, singleton, input_parameters['threads'],
                tabix_outdir, input_parameters['out_singleton_name']))

        out.write('\n')

        # Clean-up is written with host paths, outside the container commands.
        for file_i in temporary_files:
            out.write('rm {}\n'.format(
                os.path.join(input_parameters['output_directory'], file_i)))

        # Remove untrimmed files:
        if input_parameters['remove_untrimmed']:
            out.write('\n')
            out.write('rm {}\n'.format(input_parameters['in_fastq1']))

            if input_parameters['in_fastq2']:
                out.write('rm {}\n'.format(input_parameters['in_fastq2']))

        out.write('\necho -e "Done at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2\n')

    # "Run" the script that was generated
    command_line = '{} {}'.format(input_parameters['action'], outfile)
    subprocess.call(command_line, shell=True)

    return outfile
def tumor_normal(input_parameters, tech='docker'):
    """Write a job script that runs paired tumor/normal VarDict and converts its output to VCF, then run the configured action on it.

    The regions come from ``inclusion_region`` when set; otherwise a
    whole-genome ``genome.bed`` is written into the output directory from the
    reference's ``.fai`` index. The BED is split into smaller regions first
    when ``process_bed`` is set or the mean region length exceeds 50000 bases.

    Args:
        input_parameters: Dict of settings; it is updated in place with the
            module-level ``DEFAULT_PARAMS`` for any missing key.
        tech: Container technology, forwarded to ``container.container_params``.

    Returns:
        Path of the script that was written.

    Raises:
        AssertionError: If the normal BAM, tumor BAM or genome reference does not exist.
    """
    for param_i, default_i in DEFAULT_PARAMS.items():
        input_parameters.setdefault(param_i, default_i)

    # The following are required:
    assert os.path.exists(input_parameters['normal_bam'])
    assert os.path.exists(input_parameters['tumor_bam'])
    assert os.path.exists(input_parameters['genome_reference'])

    logdir = os.path.join(input_parameters['output_directory'], 'logs')
    outfile = os.path.join(logdir, input_parameters['script'])

    path_keys = ('normal_bam', 'tumor_bam', 'genome_reference',
                 'output_directory', 'inclusion_region')
    all_paths = [input_parameters[key_i] for key_i in path_keys if input_parameters[key_i]]

    container_line, fileDict = container.container_params(
        input_parameters['vardict_image'],
        tech=tech,
        files=all_paths,
        extra_args=input_parameters['extra_docker_options'])

    minVAF = input_parameters['minimum_VAF']

    # Tally the regions to decide below whether the BED needs splitting.
    total_bases = 0
    num_lines = 0

    if input_parameters['inclusion_region']:

        bed_file = input_parameters['inclusion_region']

        with open(bed_file) as bed:
            line_i = bed.readline().rstrip()

            # Skip leading "track" header lines.
            while line_i.startswith('track'):
                line_i = bed.readline().rstrip()

            # Reading stops at the first empty line or at end of file.
            while line_i:
                item = line_i.rstrip().split('\t')
                total_bases = total_bases + int(item[2]) - int(item[1])
                num_lines += 1
                line_i = bed.readline().rstrip()

    else:
        # No region given: cover every contig listed in the .fai index.
        fai_file = input_parameters['genome_reference'] + '.fai'
        bed_file = os.path.join(input_parameters['output_directory'], 'genome.bed')

        with open(fai_file) as fai, open(bed_file, 'w') as wgs_bed:
            for line_i in fai:
                item = line_i.split('\t')
                total_bases += int(item[1])
                num_lines += 1
                wgs_bed.write('{}\t{}\t{}\n'.format(item[0], '0', item[1]))

    # However the "bed_file" is defined here, create a dockered line and mount dictionary for it:
    bed_split_line, bedDict = container.container_params(
        'lethalfang/somaticseq:{}'.format(VERSION), tech,
        (bed_file, input_parameters['output_directory']))

    # Mounted paths for all the input files and output directory:
    mounted_genome_reference = fileDict[input_parameters['genome_reference']]['mount_path']
    mounted_tumor_bam = fileDict[input_parameters['tumor_bam']]['mount_path']
    mounted_normal_bam = fileDict[input_parameters['normal_bam']]['mount_path']
    mounted_outdir = fileDict[input_parameters['output_directory']]['mount_path']
    mounted_bed = bedDict[bed_file]['mount_path']

    # BED path as seen from inside the VarDict container. The host path used
    # to be passed when no splitting took place.
    if input_parameters['inclusion_region']:
        vardict_bed = fileDict[input_parameters['inclusion_region']]['mount_path']
    else:
        # genome.bed was written into the output directory above.
        vardict_bed = '{}/genome.bed'.format(mounted_outdir)

    # An empty BED has no mean region length; treat it as "no need to split"
    # instead of dividing by zero.
    needs_split = input_parameters['process_bed'] or (
        num_lines > 0 and total_bases / num_lines > 50000)

    with open(outfile, 'w') as out:

        out.write("#!/bin/bash\n\n")

        out.write(f'#$ -o {logdir}\n')
        out.write(f'#$ -e {logdir}\n')
        out.write('#$ -S /bin/bash\n')
        out.write('#$ -l h_vmem={}\n'.format(input_parameters['MEM']))
        out.write('set -e\n\n')

        out.write('echo -e "Start at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2\n\n')

        # Decide if Bed file needs to be "split" such that each line has a small enough region
        if needs_split:
            out.write(f'{bed_split_line} \\\n')
            out.write('/opt/somaticseq/somaticseq/utilities/split_mergedBed.py \\\n')
            out.write('-infile {} -outfile {}/split_regions.bed\n\n'.format(
                mounted_bed,
                bedDict[input_parameters['output_directory']]['mount_path']))

            vardict_bed = '{}/split_regions.bed'.format(mounted_outdir)

        out.write(f'{container_line} bash -c \\\n')
        out.write('"/opt/VarDict-1.7.0/bin/VarDict \\\n')

        if input_parameters['vardict_arguments']:
            out.write('{} \\\n'.format(input_parameters['vardict_arguments']))

        out.write('-G {} \\\n'.format(mounted_genome_reference))
        out.write('-f {} -h \\\n'.format(minVAF))
        out.write('-b \'{}|{}\' \\\n'.format(mounted_tumor_bam, mounted_normal_bam))
        out.write('-Q 1 -c 1 -S 2 -E 3 -g 4 {} \\\n'.format(vardict_bed))
        out.write('> {}/vardict.var"\n\n'.format(mounted_outdir))

        out.write('\n')

        # Second step: drop the first line of the raw output and convert it to VCF.
        out.write(f'{container_line} \\\n')
        out.write(
            'bash -c "cat {}/vardict.var | awk \'NR!=1\' | /opt/VarDict/testsomatic.R | /opt/VarDict/var2vcf_paired.pl -N \'TUMOR|NORMAL\' -f {} \\\n'
            .format(mounted_outdir, minVAF))
        out.write('> {}/{}"\n\n'.format(mounted_outdir, input_parameters['outfile']))

        out.write('\necho -e "Done at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2\n')

    # "Run" the script that was generated
    command_line = '{} {}'.format(input_parameters['action'], outfile)
    subprocess.call(command_line, shell=True)

    return outfile
def tumor_normal(input_parameters=DEFAULT_PARAMS, tech='docker'):
    """Write a job script that runs ``lofreq somatic`` on a tumor/normal pair, then run the configured action on it.

    Args:
        input_parameters: Dict of settings; it is updated in place with the
            module-level ``DEFAULT_PARAMS`` for any missing key.
            ``inclusion_region`` is optional: the ``-l`` option is only
            written when it is set.
        tech: Container technology, forwarded to ``container.container_params``.

    Returns:
        Path of the script that was written.

    Raises:
        AssertionError: If the tumor BAM, normal BAM, genome reference, dbSNP
            file or its ``.tbi`` index does not exist.
    """
    for param_i, default_i in DEFAULT_PARAMS.items():
        input_parameters.setdefault(param_i, default_i)

    # The following are required:
    assert os.path.exists(input_parameters['tumor_bam'])
    assert os.path.exists(input_parameters['normal_bam'])
    assert os.path.exists(input_parameters['genome_reference'])
    assert os.path.exists(input_parameters['dbsnp_gz'])
    assert os.path.exists(input_parameters['dbsnp_gz'] + '.tbi')

    logdir = os.path.join(input_parameters['output_directory'], 'logs')
    outfile = os.path.join(logdir, input_parameters['script'])

    path_keys = ('tumor_bam', 'normal_bam', 'genome_reference',
                 'output_directory', 'inclusion_region', 'dbsnp_gz')
    all_paths = [input_parameters[key_i] for key_i in path_keys if input_parameters[key_i]]

    container_line, fileDict = container.container_params(
        input_parameters['lofreq_image'],
        tech=tech,
        files=all_paths,
        extra_args=input_parameters['extra_docker_options'])

    # Mounted paths for all the input files and output directory:
    mounted_genome_reference = fileDict[input_parameters['genome_reference']]['mount_path']
    mounted_tumor_bam = fileDict[input_parameters['tumor_bam']]['mount_path']
    mounted_normal_bam = fileDict[input_parameters['normal_bam']]['mount_path']
    mounted_outdir = fileDict[input_parameters['output_directory']]['mount_path']
    mounted_dbsnp_gz = fileDict[input_parameters['dbsnp_gz']]['mount_path']

    # The region file is only registered with the container when it is set,
    # so it must only be looked up in that case.
    mounted_inclusion = None
    if input_parameters['inclusion_region']:
        mounted_inclusion = fileDict[input_parameters['inclusion_region']]['mount_path']

    with open(outfile, 'w') as out:

        out.write("#!/bin/bash\n\n")

        out.write(f'#$ -o {logdir}\n')
        out.write(f'#$ -e {logdir}\n')
        out.write('#$ -S /bin/bash\n')
        out.write('#$ -l h_vmem={}\n'.format(input_parameters['MEM']))
        out.write('set -e\n\n')

        out.write('echo -e "Start at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2\n\n')

        out.write(f'{container_line} \\\n')
        out.write('lofreq somatic \\\n')
        out.write('-t {} \\\n'.format(mounted_tumor_bam))
        out.write('-n {} \\\n'.format(mounted_normal_bam))
        out.write('--call-indels \\\n')

        if mounted_inclusion is not None:
            out.write('-l {} \\\n'.format(mounted_inclusion))

        out.write('-f {} \\\n'.format(mounted_genome_reference))
        out.write('-o {}/{} \\\n'.format(mounted_outdir, input_parameters['out_prefix']))

        if input_parameters['lofreq_arguments']:
            out.write('{} \\\n'.format(input_parameters['lofreq_arguments']))

        out.write('-d {}\n'.format(mounted_dbsnp_gz))

        out.write('\necho -e "Done at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2\n')

    # "Run" the script that was generated
    command_line = '{} {}'.format(input_parameters['action'], outfile)
    subprocess.call(command_line, shell=True)

    return outfile
def tumor_normal(input_parameters, tech='docker'):
    """Generate (and dispatch) a shell script that runs Scalpel in somatic mode.

    The script runs ``scalpel-discovery --somatic`` followed by
    ``scalpel-export --somatic`` in one container invocation, then sorts the
    resulting VCF with ``/opt/vcfsorter.pl`` in a second one.  Every key of
    ``DEFAULT_PARAMS`` missing from ``input_parameters`` is copied into it
    (the dict is modified in place).

    Args:
        input_parameters: Settings dict.  ``normal_bam``, ``tumor_bam``,
            ``genome_reference`` and ``reference_dict`` must exist on disk.
            ``inclusion_region`` is looked up unconditionally for ``--bed``.
        tech: Forwarded to ``container.container_params``.

    Returns:
        Path of the generated script.
    """
    for param_i in DEFAULT_PARAMS:
        if param_i not in input_parameters:
            input_parameters[param_i] = DEFAULT_PARAMS[param_i]

    # The following are required:
    assert os.path.exists(input_parameters['normal_bam'])
    assert os.path.exists(input_parameters['tumor_bam'])
    assert os.path.exists(input_parameters['genome_reference'])
    assert os.path.exists(input_parameters['reference_dict'])

    logdir = os.path.join(input_parameters['output_directory'], 'logs')
    outfile = os.path.join(logdir, input_parameters['script'])

    candidate_paths = (
        input_parameters['normal_bam'],
        input_parameters['tumor_bam'],
        input_parameters['genome_reference'],
        input_parameters['output_directory'],
        input_parameters['inclusion_region'],
        input_parameters['reference_dict'],
    )
    all_paths = [path_i for path_i in candidate_paths if path_i]

    container_line, fileDict = container.container_params(
        input_parameters['scalpel_image'],
        tech=tech,
        files=all_paths,
        extra_args=input_parameters['extra_docker_options'])

    # Mounted paths for all the input files and output directory:
    mounted_genome_reference = fileDict[
        input_parameters['genome_reference']]['mount_path']
    mounted_tumor_bam = fileDict[input_parameters['tumor_bam']]['mount_path']
    mounted_normal_bam = fileDict[input_parameters['normal_bam']]['mount_path']
    mounted_outdir = fileDict[
        input_parameters['output_directory']]['mount_path']
    mounted_reference_dict = fileDict[
        input_parameters['reference_dict']]['mount_path']
    # NOTE(review): this lookup fails if inclusion_region is unset, because
    # unset paths are not passed to container_params above.
    mounted_inclusion = fileDict[
        input_parameters['inclusion_region']]['mount_path']

    with open(outfile, 'w') as out:
        out.write("#!/bin/bash\n\n")
        out.write(f'#$ -o {logdir}\n')
        out.write(f'#$ -e {logdir}\n')
        out.write('#$ -S /bin/bash\n')
        out.write('#$ -l h_vmem={}\n'.format(input_parameters['MEM']))
        out.write('set -e\n\n')
        out.write('echo -e "Start at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2\n\n')

        # Discovery and export share one `bash -c "..."` string.
        out.write(f'{container_line} bash -c \\\n')
        out.write('"/opt/scalpel/scalpel-discovery --somatic \\\n')
        out.write('--ref {} \\\n'.format(mounted_genome_reference))
        out.write('--bed {} \\\n'.format(mounted_inclusion))
        out.write('--normal {} \\\n'.format(mounted_normal_bam))
        out.write('--tumor {} \\\n'.format(mounted_tumor_bam))
        out.write('--window 600 \\\n')
        if input_parameters['scalpel_two_pass']:
            out.write('--two-pass \\\n')
        if input_parameters['scalpel_discovery_arguments']:
            # Positional argument: a keyword argument cannot fill '{}'.
            out.write('{} \\\n'.format(
                input_parameters['scalpel_discovery_arguments']))
        out.write('--dir {}/scalpel && \\\n'.format(mounted_outdir))

        out.write('/opt/scalpel/scalpel-export --somatic \\\n')
        out.write(
            '--db {}/scalpel/main/somatic.db.dir \\\n'.format(mounted_outdir))
        out.write('--ref {} \\\n'.format(mounted_genome_reference))
        out.write('--bed {} \\\n'.format(mounted_inclusion))
        if input_parameters['scalpel_export_arguments']:
            # Skip when unset so a None value is not written as "None".
            out.write('{} \\\n'.format(
                input_parameters['scalpel_export_arguments']))
        out.write('> {}/scalpel/scalpel.vcf"\n\n'.format(mounted_outdir))

        out.write(f'{container_line} bash -c \\\n')
        out.write(
            '"cat {}/scalpel/scalpel.vcf | /opt/vcfsorter.pl {} - \\\n'.format(
                mounted_outdir, mounted_reference_dict))
        out.write('> {}/{}\"\n'.format(mounted_outdir,
                                       input_parameters['outfile']))

        out.write('\necho -e "Done at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2\n')

    # "Run" the script that was generated
    command_line = '{} {}'.format(input_parameters['action'], outfile)
    returnCode = subprocess.call(command_line, shell=True)

    return outfile
def picard(inbams,
           outbam,
           tech='docker',
           input_parameters=None,
           remove_inbams=False):
    """Generate (and dispatch) a script that merges BAMs with Picard MergeSamFiles.

    Picard is run with ``CREATE_INDEX=true``; the script then renames the
    index from ``picard_index_file`` (``outbam`` with its trailing ``m``
    replaced by ``i``) to ``outbam`` plus ``.bai``/``.crai``.

    Args:
        inbams: Iterable of input alignment file paths.
        outbam: Output path; must end in ``.bam`` or ``.cram``.
        tech: Forwarded to ``container.container_params``.
        input_parameters: Settings dict.  Keys missing from it are filled in
            place from ``DEFAULT_PARAMS``.  ``None`` means a fresh empty dict,
            so defaults applied in one call never leak into the next.
        remove_inbams: When true, the script also ``rm``s the input files.

    Returns:
        Path of the generated script.

    Raises:
        Exception: If ``outbam`` ends in neither ``.bam`` nor ``.cram``.
    """
    if input_parameters is None:
        input_parameters = {}

    for param_i in DEFAULT_PARAMS:
        if param_i not in input_parameters:
            input_parameters[param_i] = DEFAULT_PARAMS[param_i]

    logdir = os.path.join(input_parameters['output_directory'], 'logs')
    outfile = os.path.join(logdir, input_parameters['script'])

    all_paths = list(inbams) + [outbam]
    merge_line, fileDict = container.container_params(
        input_parameters['picard_image'],
        tech=tech,
        files=all_paths,
        extra_args=input_parameters['extra_docker_options'])

    mounted_outbam = fileDict[outbam]['mount_path']

    # One "I=<path> " token per input, each with a trailing space.
    infile_string = ''.join(
        'I={} '.format(fileDict[file_i]['mount_path']) for file_i in inbams)

    picard_index_file = re.sub(r'm$', 'i', outbam)
    if outbam.endswith('.bam'):
        samtools_index_file = outbam + '.bai'
    elif outbam.endswith('.cram'):
        samtools_index_file = outbam + '.crai'
    else:
        raise Exception('Output file {} seems wrong.'.format(outbam))

    with open(outfile, 'w') as out:
        out.write("#!/bin/bash\n\n")
        out.write(f'#$ -o {logdir}\n')
        out.write(f'#$ -e {logdir}\n')
        out.write('#$ -S /bin/bash\n')
        out.write('#$ -l h_vmem={}G\n'.format(input_parameters['MEM']))
        out.write('set -e\n\n')
        out.write(
            'echo -e "Start at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2\n\n'
        )  # Do not change this: picard_fractional uses this to end the copying.

        out.write(f'{merge_line} \\\n')
        out.write(
            'java -Xmx{}G -jar /opt/picard.jar MergeSamFiles {} {} ASSUME_SORTED=true CREATE_INDEX=true O={}\n\n'
            .format(input_parameters['MEM'], infile_string,
                    input_parameters['extra_picard_arguments'],
                    mounted_outbam))

        if remove_inbams:
            out.write('rm {}\n\n'.format(' '.join(inbams)))

        out.write('mv {} {}\n\n'.format(picard_index_file,
                                        samtools_index_file))
        out.write('\necho -e "Done at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2\n')

    # "Run" the script that was generated
    command_line = '{} {}'.format(input_parameters['action'], outfile)
    returnCode = subprocess.call(command_line, shell=True)

    return outfile
def picard(input_parameters, tech='docker'):
    """Generate (and dispatch) a script running Picard MarkDuplicatesWithMateCigar.

    The script creates a uniquely named temporary directory under
    ``output_directory``, runs Picard with it as ``TMP_DIR``, optionally
    indexes the output with samtools, and removes the temporary directory.
    Every key of ``DEFAULT_PARAMS`` missing from ``input_parameters`` is
    copied into it (the dict is modified in place).

    Args:
        input_parameters: Settings dict; keys read here include ``in_bam``,
            ``out_bam``, ``output_directory``, ``script``, ``MEM``,
            ``threads``, ``index_bam``, ``picard_image``, ``samtools_image``,
            ``extra_docker_options`` and ``action``.
        tech: Forwarded to ``container.container_params``.

    Returns:
        Path of the generated script.
    """
    for param_i in DEFAULT_PARAMS:
        if param_i not in input_parameters:
            input_parameters[param_i] = DEFAULT_PARAMS[param_i]

    #
    logdir = os.path.join(input_parameters['output_directory'], 'logs')
    outfile = os.path.join(logdir, input_parameters['script'])

    candidate_paths = (input_parameters['output_directory'],
                       input_parameters['in_bam'])
    all_paths = [path_i for path_i in candidate_paths if path_i]

    markdup_line, fileDict = container.container_params(
        input_parameters['picard_image'],
        tech=tech,
        files=all_paths,
        extra_args=input_parameters['extra_docker_options'])
    samtools_line, stDict = container.container_params(
        input_parameters['samtools_image'], tech, [
            input_parameters['output_directory'],
        ], input_parameters['extra_docker_options'])

    # Mounted paths for all the input files and output directory:
    mounted_outdir = fileDict[
        input_parameters['output_directory']]['mount_path']
    mounted_inbam = fileDict[input_parameters['in_bam']]['mount_path']

    # Metrics file name: the input name with only a *trailing* .bam/.cram
    # removed, plus ".markdup".  Anchoring the pattern keeps ".bam"/".cram"
    # substrings elsewhere in the file name intact.
    metrics_name = re.sub(
        r'\.(bam|cram)$', '',
        fileDict[input_parameters['in_bam']]['filename']) + '.markdup'

    tempdir = uuid.uuid4().hex

    with open(outfile, 'w') as out:
        out.write("#!/bin/bash\n\n")
        out.write(f'#$ -o {logdir}\n')
        out.write(f'#$ -e {logdir}\n')
        out.write('#$ -S /bin/bash\n')
        out.write('#$ -l h_vmem={}G\n'.format(input_parameters['MEM']))
        out.write('set -e\n\n')
        out.write(
            'echo -e "Start at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2\n\n'
        )  # Do not change this: fractional uses this to end the copying.

        out.write('mkdir -p {}/{}\n\n'.format(
            input_parameters['output_directory'], tempdir))

        out.write(f'{markdup_line} \\\n')
        out.write(
            'java -Xmx{}G -jar /opt/picard.jar MarkDuplicatesWithMateCigar \\\n'
            .format(input_parameters['MEM']))
        out.write('I={} \\\n'.format(mounted_inbam))
        out.write('M={}/{} \\\n'.format(mounted_outdir, metrics_name))
        out.write('ASSUME_SORT_ORDER=coordinate \\\n')
        out.write('TMP_DIR={}/{} \\\n'.format(mounted_outdir, tempdir))
        out.write('MINIMUM_DISTANCE=1000 \\\n')
        out.write('O={}/{}\n\n'.format(mounted_outdir,
                                       input_parameters['out_bam']))

        if input_parameters['index_bam']:
            out.write(f'{samtools_line} \\\n')
            out.write('samtools index -@{} {}/{}\n\n'.format(
                input_parameters['threads'],
                stDict[input_parameters['output_directory']]['mount_path'],
                input_parameters['out_bam']))

        out.write('rm -r {}/{}\n'.format(input_parameters['output_directory'],
                                         tempdir))
        out.write(
            '\necho -e "Done at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2\n'
        )  # Do not change this: fractional uses this to end the copying.

    # "Run" the script that was generated
    command_line = '{} {}'.format(input_parameters['action'], outfile)
    returnCode = subprocess.call(command_line, shell=True)

    return outfile
def fractional(bed, input_parameters, tech='docker'):
    """Generate (and dispatch) a script that de-duplicates one BED-defined slice.

    The script extracts the reads overlapping ``bed`` into a temporary BAM
    with ``sambamba view``, then appends the body of a de-duplication script
    produced by ``picard`` or ``sambamba`` (the lines from its "Start" echo
    up to, but not including, its "Done" echo), and finally removes the
    temporary BAM.  Every key of ``DEFAULT_PARAMS`` missing from
    ``input_parameters`` is copied into it (the dict is modified in place).

    Args:
        bed: Path to the BED file; its parent directory is used as output
            directory.
        input_parameters: Settings dict; ``software`` must be ``'picard'`` or
            ``'sambamba'``.
        tech: Forwarded to ``container.container_params``.

    Returns:
        Tuple ``(script_path, deduplicated_bam_path)``.

    Raises:
        ValueError: If ``input_parameters['software']`` is not supported.
    """
    for param_i in DEFAULT_PARAMS:
        if param_i not in input_parameters:
            input_parameters[param_i] = DEFAULT_PARAMS[param_i]

    # Fail early with a clear message instead of leaving a half-written
    # script behind and crashing on a missing helper script later.
    software = input_parameters['software']
    if software not in ('picard', 'sambamba'):
        raise ValueError(
            'Unsupported de-duplication software: {!r}'.format(software))

    outdir = str(Path(bed).absolute().parent)
    logdir = os.path.join(outdir, 'logs')
    outfile = os.path.join(logdir, 'markdup_fractional.{}.cmd'.format(ts))

    os.makedirs(logdir, exist_ok=True)

    sambam_line, stDict = container.container_params(
        input_parameters['sambamba_image'], tech, [
            input_parameters['in_bam'],
            bed,
        ], input_parameters['extra_docker_options'])

    mounted_inbam = stDict[input_parameters['in_bam']]['mount_path']
    mounted_bed = stDict[bed]['mount_path']
    mounted_outdir = stDict[bed]['mount_dir']

    temp_split_bam = uuid.uuid4().hex + '.bam'
    split_deduped_bam = uuid.uuid4().hex + '.bam'

    with open(outfile, 'w') as out:
        out.write("#!/bin/bash\n\n")
        out.write(f'#$ -o {logdir}\n')
        out.write(f'#$ -e {logdir}\n')
        out.write('#$ -S /bin/bash\n')
        out.write('#$ -l h_vmem={}G\n'.format(input_parameters['MEM']))
        out.write('set -e\n\n')
        out.write('echo -e "Start at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2\n\n')

        out.write(f'{sambam_line} \\\n')
        out.write('sambamba view -L {} -t {} -f bam -o {} {}\n\n'.format(
            mounted_bed, 1, os.path.join(mounted_outdir, temp_split_bam),
            mounted_inbam))

        # The helper writes its own throw-away script for the slice.
        # NOTE(review): the copied parameters keep the caller's 'action', so
        # the helper also runs that action on its throw-away script --
        # confirm this is intended.
        fractional_parameters = copy(input_parameters)
        fractional_parameters['output_directory'] = outdir
        fractional_parameters['in_bam'] = os.path.join(outdir, temp_split_bam)
        fractional_parameters['out_bam'] = split_deduped_bam
        fractional_parameters['script'] = 'to_be_deleted.{}.cmd'.format(ts)
        fractional_parameters['index_bam'] = False

        if software == 'picard':
            picard(fractional_parameters, tech)
        else:
            fractional_parameters['threads'] = 2
            sambamba(fractional_parameters, tech)

        with open(os.path.join(logdir,
                               fractional_parameters['script'])) as dedup:
            # readline() returns '' at EOF; stop there rather than looping
            # forever if a marker line is missing.
            line_i = dedup.readline()
            while line_i and not line_i.startswith('echo -e "Start'):
                line_i = dedup.readline()

            while line_i and not line_i.startswith('echo -e "Done'):
                out.write(line_i)
                line_i = dedup.readline()

        out.write('rm {}\n'.format(os.path.join(outdir, temp_split_bam)))
        out.write('\necho -e "Done at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2\n')

    # "Run" the script that was generated
    command_line = '{} {}'.format(input_parameters['action'], outfile)
    returnCode = subprocess.call(command_line, shell=True)

    return outfile, os.path.join(outdir, split_deduped_bam)
def sambamba(input_parameters, tech='docker'):
    """Write a script that runs ``sambamba markdup`` in a container, then run it.

    Keys absent from ``input_parameters`` are taken from ``DEFAULT_PARAMS``
    (the dict is updated in place).  The script makes a uniquely named
    temporary directory under ``output_directory``, passes it to sambamba as
    ``--tmpdir`` and removes it afterwards.  Returns the script path.
    """
    for key, value in DEFAULT_PARAMS.items():
        input_parameters.setdefault(key, value)
    cfg = input_parameters

    #
    logdir = os.path.join(cfg['output_directory'], 'logs')
    outfile = os.path.join(logdir, cfg['script'])

    all_paths = [p for p in (cfg['output_directory'], cfg['in_bam']) if p]

    markdup_line, fileDict = container.container_params(
        cfg['sambamba_image'],
        tech=tech,
        files=all_paths,
        extra_args=cfg['extra_docker_options'])

    # Container-side locations of the output directory and the input BAM.
    mounted_outdir = fileDict[cfg['output_directory']]['mount_path']
    mounted_inbam = fileDict[cfg['in_bam']]['mount_path']

    tempdir = uuid.uuid4().hex

    with open(outfile, 'w') as out:
        emit = out.write

        emit("#!/bin/bash\n\n")
        emit(f'#$ -o {logdir}\n')
        emit(f'#$ -e {logdir}\n')
        emit('#$ -S /bin/bash\n')
        emit(f'#$ -l h_vmem={cfg["MEM"]}G\n')
        emit('set -e\n\n')
        # Do not change this: fractional uses this to end the copying.
        emit('echo -e "Start at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2\n\n')

        emit(f'mkdir -p {cfg["output_directory"]}/{tempdir}\n\n')

        emit(f'{markdup_line} \\\n')
        threads = cfg['threads']
        mounted_tmp = os.path.join(mounted_outdir, tempdir)
        mounted_outbam = os.path.join(mounted_outdir, cfg['out_bam'])
        emit(f'sambamba markdup -t {threads} --tmpdir {mounted_tmp} '
             f'{mounted_inbam} {mounted_outbam}\n\n')

        emit(f'rm -r {cfg["output_directory"]}/{tempdir}\n')
        # Do not change this: fractional uses this to end the copying.
        emit('\necho -e "Done at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2\n')

    # "Run" the script that was generated
    command_line = f'{cfg["action"]} {outfile}'
    returnCode = subprocess.call(command_line, shell=True)

    return outfile
def tumor_normal(input_parameters=DEFAULT_PARAMS, tech='docker'):
    """Generate (and dispatch) a shell script that runs MuSE ``call`` and ``sump``.

    Every key of ``DEFAULT_PARAMS`` missing from ``input_parameters`` is
    copied into it (the dict is modified in place).  When an inclusion region
    is set, the script first reduces it to its first three tab-separated
    columns (``bed_3columns.bed`` in the output directory) and passes that
    file to ``MuSE call -l``.

    Args:
        input_parameters: Settings dict.  ``normal_bam``, ``tumor_bam``,
            ``genome_reference``, ``dbsnp_gz`` and ``dbsnp_gz + '.tbi'`` must
            exist on disk.  ``exome`` selects ``-E`` (true) or ``-G`` (false)
            for ``sump``.  ``inclusion_region`` is optional.
        tech: Forwarded to ``container.container_params``.

    Returns:
        Path of the generated script.
    """
    for param_i in DEFAULT_PARAMS:
        if param_i not in input_parameters:
            input_parameters[param_i] = DEFAULT_PARAMS[param_i]

    # The following are required:
    assert os.path.exists(input_parameters['normal_bam'])
    assert os.path.exists(input_parameters['tumor_bam'])
    assert os.path.exists(input_parameters['genome_reference'])
    assert os.path.exists(input_parameters['dbsnp_gz'])
    assert os.path.exists(input_parameters['dbsnp_gz'] + '.tbi')

    logdir = os.path.join(input_parameters['output_directory'], 'logs')
    outfile = os.path.join(logdir, input_parameters['script'])

    candidate_paths = (
        input_parameters['normal_bam'],
        input_parameters['tumor_bam'],
        input_parameters['genome_reference'],
        input_parameters['output_directory'],
        input_parameters['inclusion_region'],
        input_parameters['dbsnp_gz'],
    )
    all_paths = [path_i for path_i in candidate_paths if path_i]

    container_line, fileDict = container.container_params(
        input_parameters['muse_image'],
        tech=tech,
        files=all_paths,
        extra_args=input_parameters['extra_docker_options'])

    # Mounted paths for all the input files and output directory:
    mounted_genome_reference = fileDict[
        input_parameters['genome_reference']]['mount_path']
    mounted_tumor_bam = fileDict[input_parameters['tumor_bam']]['mount_path']
    mounted_normal_bam = fileDict[input_parameters['normal_bam']]['mount_path']
    mounted_outdir = fileDict[
        input_parameters['output_directory']]['mount_path']
    mounted_dbsnp_gz = fileDict[input_parameters['dbsnp_gz']]['mount_path']

    has_region = bool(input_parameters['inclusion_region'])

    with open(outfile, 'w') as out:
        out.write("#!/bin/bash\n\n")
        out.write(f'#$ -o {logdir}\n')
        out.write(f'#$ -e {logdir}\n')
        out.write('#$ -S /bin/bash\n')
        out.write('#$ -l h_vmem={}\n'.format(input_parameters['MEM']))
        out.write('set -e\n\n')
        out.write('echo -e "Start at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2\n\n')

        if has_region:
            # Runs on the host (no container prefix), hence host paths.
            out.write(
                'cat {} | awk -F "\\t" \'{{print $1 "\\t" $2 "\\t" $3}}\' > {}/bed_3columns.bed\n\n'
                .format(input_parameters['inclusion_region'],
                        input_parameters['output_directory']))

        out.write(f'{container_line} \\\n')
        out.write('MuSEv1.0rc_submission_c039ffa call \\\n')
        out.write('-O {}/MuSE \\\n'.format(mounted_outdir))
        if has_region:
            out.write('-l {}/bed_3columns.bed \\\n'.format(mounted_outdir))
        out.write('-f {} \\\n'.format(mounted_genome_reference))
        out.write('{} \\\n'.format(mounted_tumor_bam))
        out.write('{}\n\n'.format(mounted_normal_bam))

        out.write(f'{container_line} \\\n')
        out.write('MuSEv1.0rc_submission_c039ffa sump \\\n')
        out.write('-I {}/MuSE.MuSE.txt \\\n'.format(mounted_outdir))
        if input_parameters['exome']:
            out.write('-E \\\n')
        else:
            out.write('-G \\\n')
        if input_parameters['muse_arguments']:
            # Positional argument: a keyword argument cannot fill '{}'.
            out.write('{} \\\n'.format(input_parameters['muse_arguments']))
        out.write('-O {}/{} \\\n'.format(mounted_outdir,
                                         input_parameters['outfile']))
        out.write('-D {}\n'.format(mounted_dbsnp_gz))

        out.write('\necho -e "Done at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2\n')

    # "Run" the script that was generated
    command_line = '{} {}'.format(input_parameters['action'], outfile)
    returnCode = subprocess.call(command_line, shell=True)

    return outfile
def tumor_normal(input_parameters=DEFAULT_PARAMS, tech='docker'):
    """Generate (and dispatch) a shell script that runs VarScan2 somatic calling.

    The script creates normal and tumor pileups with ``samtools mpileup``,
    runs VarScan ``somatic``, ``processSomatic`` and ``somaticFilter``, and
    deletes the pileups.  Every key of ``DEFAULT_PARAMS`` missing from
    ``input_parameters`` is copied into it (the dict is modified in place).

    Args:
        input_parameters: Settings dict.  ``normal_bam``, ``tumor_bam`` and
            ``genome_reference`` must exist on disk.  ``inclusion_region`` is
            optional (adds ``-l`` to mpileup).  ``minimum_VAF`` is optional:
            ``--min-var-freq`` is only written when it is not None/empty.
        tech: Forwarded to ``container.container_params``.

    Returns:
        Path of the generated script.
    """
    for param_i in DEFAULT_PARAMS:
        if param_i not in input_parameters:
            input_parameters[param_i] = DEFAULT_PARAMS[param_i]

    # The following are required:
    assert os.path.exists(input_parameters['normal_bam'])
    assert os.path.exists(input_parameters['tumor_bam'])
    assert os.path.exists(input_parameters['genome_reference'])

    logdir = os.path.join(input_parameters['output_directory'], 'logs')
    outfile = os.path.join(logdir, input_parameters['script'])

    candidate_paths = (
        input_parameters['normal_bam'],
        input_parameters['tumor_bam'],
        input_parameters['genome_reference'],
        input_parameters['output_directory'],
        input_parameters['inclusion_region'],
    )
    all_paths = [path_i for path_i in candidate_paths if path_i]

    container_line, fileDict = container.container_params(
        input_parameters['varscan2_image'],
        tech=tech,
        files=all_paths,
        extra_args=input_parameters['extra_docker_options'])
    mpileine_line, plDict = container.container_params(
        'lethalfang/samtools:1.7',
        tech=tech,
        files=all_paths,
        extra_args=input_parameters['extra_docker_options'])

    # VarScan itself only touches files inside the output directory.
    mounted_outdir = fileDict[
        input_parameters['output_directory']]['mount_path']

    # Mounted paths for mpileup dockers
    pl_genome_reference = plDict[
        input_parameters['genome_reference']]['mount_path']
    pl_tumor_bam = plDict[input_parameters['tumor_bam']]['mount_path']
    pl_normal_bam = plDict[input_parameters['normal_bam']]['mount_path']
    pl_outdir = plDict[input_parameters['output_directory']]['mount_path']

    if input_parameters['inclusion_region']:
        selector_text = '-l {}'.format(
            plDict[input_parameters['inclusion_region']]['mount_path'])
    else:
        selector_text = ''

    # Output prefix: the output file name without its final extension.
    outname = re.sub(r'\.[a-zA-Z]+$', '', input_parameters['outfile'])

    with open(outfile, 'w') as out:
        out.write("#!/bin/bash\n\n")
        out.write(f'#$ -o {logdir}\n')
        out.write(f'#$ -e {logdir}\n')
        out.write('#$ -S /bin/bash\n')
        out.write('#$ -l h_vmem={}\n'.format(input_parameters['MEM']))
        out.write('set -e\n\n')
        out.write('echo -e "Start at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2\n\n')

        # Identical mpileup command for each sample; normal first.
        for pl_bam, sample in ((pl_normal_bam, 'normal'),
                               (pl_tumor_bam, 'tumor')):
            out.write(f'{mpileine_line} bash -c \\\n')
            out.write('"samtools mpileup \\\n')
            out.write(
                '-B -q {minMQ} -Q {minBQ} {extra_pileup_arguments} {selector_text} -f \\\n'
                .format(minMQ=input_parameters['min_MQ'],
                        minBQ=input_parameters['min_BQ'],
                        extra_pileup_arguments=input_parameters[
                            'varscan_pileup_arguments'],
                        selector_text=selector_text))
            out.write('{} \\\n'.format(pl_genome_reference))
            out.write('{} \\\n'.format(pl_bam))
            out.write('> {}/{}.pileup"\n\n'.format(pl_outdir, sample))

        out.write(f'{container_line} \\\n')
        out.write('java -Xmx{} -jar /VarScan2.3.7.jar somatic \\\n'.format(
            input_parameters['MEM']))
        out.write('{}/normal.pileup \\\n'.format(mounted_outdir))
        out.write('{}/tumor.pileup \\\n'.format(mounted_outdir))

        # Unset options are left out so the script never contains the
        # literal text "None".
        somatic_options = ['{}/{}'.format(mounted_outdir, outname)]
        if input_parameters['varscan_arguments']:
            somatic_options.append(str(input_parameters['varscan_arguments']))
        somatic_options.append('--output-vcf 1')
        minimum_vaf = input_parameters['minimum_VAF']
        if minimum_vaf is not None and minimum_vaf != '':
            somatic_options.append('--min-var-freq {}'.format(minimum_vaf))
        out.write(' '.join(somatic_options) + '\n\n')

        out.write(f'{container_line} \\\n')
        out.write(
            'java -Xmx{} -jar /VarScan2.3.7.jar processSomatic \\\n'.format(
                input_parameters['MEM']))
        out.write('{}/{}.snp.vcf\n\n'.format(mounted_outdir, outname))

        out.write(f'{container_line} \\\n')
        out.write(
            'java -Xmx{} -jar /VarScan2.3.7.jar somaticFilter \\\n'.format(
                input_parameters['MEM']))
        out.write('{}/{}.snp.Somatic.hc.vcf \\\n'.format(
            mounted_outdir, outname))
        out.write('-indel-file {}/{}.indel.vcf \\\n'.format(
            mounted_outdir, outname))
        out.write('-output-file {}/{}.snp.Somatic.hc.filter.vcf\n\n'.format(
            mounted_outdir, outname))

        out.write('rm {}/normal.pileup\n'.format(
            input_parameters['output_directory']))
        out.write('rm {}/tumor.pileup\n'.format(
            input_parameters['output_directory']))

        out.write('\necho -e "Done at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2\n')

    # "Run" the script that was generated
    command_line = '{} {}'.format(input_parameters['action'], outfile)
    returnCode = subprocess.call(command_line, shell=True)

    return outfile
def tumor_normal(input_parameters=DEFAULT_PARAMS, tech='docker'):
    """Write a script that runs JointSNVMix2 (train, then classify) and run it.

    Keys absent from ``input_parameters`` are taken from ``DEFAULT_PARAMS``
    (the dict is updated in place).  The script trains a model, writes a VCF
    header to the output file on the host, then appends the classify output
    after filtering/reformatting it with awk and sorting it with
    ``/opt/vcfsorter.pl``.  When ``threads`` is greater than 1, a bash loop
    intersects the VCF with ``<i>/<i>.bed`` for each ``i`` in 1..threads.
    Returns the script path.
    """
    for key, value in DEFAULT_PARAMS.items():
        input_parameters.setdefault(key, value)
    cfg = input_parameters

    # The following are required:
    for required in ('normal_bam', 'tumor_bam', 'genome_reference',
                     'reference_dict'):
        assert os.path.exists(cfg[required])

    logdir = os.path.join(cfg['output_directory'], 'logs')
    outfile = os.path.join(logdir, cfg['script'])

    all_paths = [
        p for p in (cfg['normal_bam'], cfg['tumor_bam'],
                    cfg['genome_reference'], cfg['output_directory'],
                    cfg['reference_dict']) if p
    ]

    container_line, fileDict = container.container_params(
        cfg['jsm2_image'],
        tech=tech,
        files=all_paths,
        extra_args=cfg['extra_docker_options'])

    # Container-side paths of the inputs and the output directory.
    mounted_genome_reference = fileDict[cfg['genome_reference']]['mount_path']
    mounted_tumor_bam = fileDict[cfg['tumor_bam']]['mount_path']
    mounted_normal_bam = fileDict[cfg['normal_bam']]['mount_path']
    mounted_outdir = fileDict[cfg['output_directory']]['mount_path']
    mounted_reference_dict = fileDict[cfg['reference_dict']]['mount_path']

    # VCF header lines, echoed into the output file one by one.
    vcf_header = (
        '##fileformat=VCFv4.1',
        '##INFO=<ID=AAAB,Number=1,Type=Float,Description="Probability of Joint Genotype AA in Normal and AB in Tumor">',
        '##INFO=<ID=AABB,Number=1,Type=Float,Description="Probability of Joint Genotype AA in Normal and BB in Tumor">',
        '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">',
        '##FORMAT=<ID=RD,Number=1,Type=Integer,Description="Depth of reference-supporting bases (reads1)">',
        '##FORMAT=<ID=AD,Number=1,Type=Integer,Description="Depth of variant-supporting bases (reads2)">',
        '#CHROM\\tPOS\\tID\\tREF\\tALT\\tQUAL\\tFILTER\\tINFO\\tFORMAT\\tNORMAL\\tTUMOR',
    )

    with open(outfile, 'w') as out:
        emit = out.write

        emit("#!/bin/bash\n\n")
        emit(f'#$ -o {logdir}\n')
        emit(f'#$ -e {logdir}\n')
        emit('#$ -S /bin/bash\n')
        emit(f'#$ -l h_vmem={cfg["MEM"]}\n')
        emit('set -e\n\n')
        emit('echo -e "Start at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2\n')
        emit('\n')

        # --- train ---
        emit(f'{container_line} \\\n')
        emit('/opt/JointSNVMix-0.7.5/build/scripts-2.7/jsm.py train joint_snv_mix_two \\\n')
        emit(f'--convergence_threshold {cfg["converge_threshold"]} \\\n')
        emit(f'--skip_size {cfg["skip_size"]} \\\n')
        if cfg['jsm_train_arguments']:
            emit(f'{cfg["jsm_train_arguments"]} \\\n')
        for mounted in (mounted_genome_reference, mounted_normal_bam,
                        mounted_tumor_bam):
            emit(f'{mounted} \\\n')
        emit('/opt/JointSNVMix-0.7.5/config/joint_priors.cfg \\\n')
        emit('/opt/JointSNVMix-0.7.5/config/joint_params.cfg \\\n')
        emit(f'{mounted_outdir}/jsm.parameter.cfg\n')
        emit('\n')

        # --- VCF header: first line truncates (>), the rest append (>>) ---
        for position, header_line in enumerate(vcf_header):
            redirect = '>' if position == 0 else '>>'
            emit("echo -e '{}' {} {}/{}\n".format(header_line, redirect,
                                                  cfg['output_directory'],
                                                  cfg['outfile']))
        emit('\n')

        # --- classify, filter, reformat, sort ---
        emit(f'{container_line} bash -c \\\n')
        emit('"/opt/JointSNVMix-0.7.5/build/scripts-2.7/jsm.py classify joint_snv_mix_two \\\n')
        if cfg['jsm_classify_arguments']:
            emit(f'{cfg["jsm_classify_arguments"]} \\\n')
        for mounted in (mounted_genome_reference, mounted_normal_bam,
                        mounted_tumor_bam):
            emit(f'{mounted} \\\n')
        emit(f'{mounted_outdir}/jsm.parameter.cfg \\\n')
        # The two awk lines are plain literals on purpose: their braces must
        # not go through str.format or an f-string.
        emit(
            '/dev/stdout | awk -F \'\\t\' \'NR!=1 && \\$4!=\\"N\\" && \\$10+\\$11>=0.95\' | \\\n'
        )
        emit(
            'awk -F \'\\t\' \'{print \\$1 \\"\\t\\" \\$2 \\"\\t.\\t\\" \\$3 \\"\\t\\" \\$4 \\"\\t.\\t.\\tAAAB=\\" \\$10 \\";AABB=\\" \\$11 \\"\\tRD:AD\\t\\" \\$5 \\":\\" \\$6 \\"\\t\\" \\$7 \\":\\" \\$8}\' \\\n'
        )
        emit('| /opt/vcfsorter.pl {} - >> {}/{}"\n\n'.format(
            mounted_reference_dict, mounted_outdir, cfg['outfile']))

        if cfg['threads'] > 1:
            bedtool_line, outdir_i = container.container_params(
                'lethalfang/bedtools:2.26.0', tech,
                (cfg['output_directory'], ))
            mounted_bed_outdir = outdir_i[
                cfg['output_directory']]['mount_path']

            emit('\n\ni=1\n')
            emit(f'while [[ $i -le {cfg["threads"]} ]]\n')
            emit('do\n')
            emit(
                ' {DOCKER_LINE} bash -c "bedtools intersect -a {OUTDIR}/{OUTVCF} -b {OUTDIR}/${{i}}/${{i}}.bed -header | uniq > {OUTDIR}/${{i}}/{OUTVCF}"\n'
                .format(DOCKER_LINE=bedtool_line,
                        OUTDIR=mounted_bed_outdir,
                        OUTVCF=cfg['outfile']))
            emit(' i=$(( $i + 1 ))\n')
            emit('done\n')

        emit('\necho -e "Done at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2\n')

    # "Run" the script that was generated
    command_line = f'{cfg["action"]} {outfile}'
    returnCode = subprocess.call(command_line, shell=True)

    return outfile
def trimmomatic(input_parameters, tech='docker'):
    """Generate (and dispatch) a shell script that runs Trimmomatic.

    Paired-end mode (``PE``) is used when ``in_fastq2`` is set, otherwise
    single-end mode (``SE``).  Every key of ``DEFAULT_PARAMS`` missing from
    ``input_parameters`` is copied into it (the dict is modified in place).

    Args:
        input_parameters: Settings dict; keys read here include
            ``in_fastq1``, ``in_fastq2`` (optional), ``out_fastq1_name``,
            ``out_fastq2_name`` (paired-end only), ``adapter``,
            ``minimum_length``, ``remove_untrimmed``, ``threads``, ``MEM``,
            ``output_directory``, ``script`` and ``action``.
        tech: Forwarded to ``container.container_params``.

    Returns:
        Path of the generated script.
    """
    # Apply defaults first so that an 'in_fastq2' supplied only by
    # DEFAULT_PARAMS is honoured when choosing the mode.
    for param_i in DEFAULT_PARAMS:
        if param_i not in input_parameters:
            input_parameters[param_i] = DEFAULT_PARAMS[param_i]

    in_fastq2 = input_parameters.get('in_fastq2')
    paired_end = bool(in_fastq2)

    #
    logdir = os.path.join(input_parameters['output_directory'], 'logs')
    outfile = os.path.join(logdir, input_parameters['script'])

    candidate_paths = (input_parameters['output_directory'],
                       input_parameters['in_fastq1'], in_fastq2)
    all_paths = [path_i for path_i in candidate_paths if path_i]

    trim_line, fileDict = container.container_params(
        input_parameters['trimmomaticImage'],
        tech=tech,
        files=all_paths,
        extra_args=input_parameters['extra_docker_options'])

    # Mounted paths for all the input files and output directory:
    mounted_outdir = fileDict[
        input_parameters['output_directory']]['mount_path']
    mounted_fq1 = fileDict[input_parameters['in_fastq1']]['mount_path']
    # The second FASTQ is only mounted, and only needed, for paired-end
    # input; looking it up unconditionally breaks single-end runs.
    mounted_fq2 = fileDict[in_fastq2]['mount_path'] if paired_end else None

    with open(outfile, 'w') as out:
        out.write("#!/bin/bash\n\n")
        out.write(f'#$ -o {logdir}\n')
        out.write(f'#$ -e {logdir}\n')
        out.write('#$ -S /bin/bash\n')
        out.write('#$ -l h_vmem={}G\n'.format(input_parameters['MEM']))
        out.write('set -e\n\n')
        out.write('echo -e "Start at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2\n\n')

        out.write(f'{trim_line} \\\n')
        out.write(
            'java -Xmx{}G -jar /opt/Trimmomatic/trimmomatic.jar \\\n'.format(
                input_parameters['MEM']))

        if paired_end:
            out.write('PE -threads {} -phred33 \\\n'.format(
                input_parameters['threads']))
            out.write(
                '{FQ1} {FQ2} {DIR}/{PAIR1} {DIR}/{UNPAIR1} {DIR}/{PAIR2} {DIR}/{UNPAIR2} \\\n'
                .format(
                    FQ1=mounted_fq1,
                    FQ2=mounted_fq2,
                    DIR=mounted_outdir,
                    PAIR1=input_parameters['out_fastq1_name'],
                    PAIR2=input_parameters['out_fastq2_name'],
                    UNPAIR1='unpaired.' + input_parameters['out_fastq1_name'],
                    UNPAIR2='unpaired.' + input_parameters['out_fastq2_name']))
        else:
            out.write('SE -threads {} -phred33 \\\n'.format(
                input_parameters['threads']))
            out.write('{FQ1} {DIR}/{PAIR1} \\\n'.format(
                FQ1=mounted_fq1,
                DIR=mounted_outdir,
                PAIR1=input_parameters['out_fastq1_name']))

        out.write(
            'ILLUMINACLIP:{ADAPTER}:2:30:10 LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:{MINLEN}\n'
            .format(ADAPTER=input_parameters['adapter'],
                    MINLEN=input_parameters['minimum_length']))

        # Remove untrimmed files:
        if input_parameters['remove_untrimmed']:
            out.write('\n')
            out.write('rm {}\n'.format(input_parameters['in_fastq1']))
            if paired_end:
                out.write('rm {}\n'.format(in_fastq2))

        out.write('\necho -e "Done at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2\n')

    # "Run" the script that was generated
    command_line = '{} {}'.format(input_parameters['action'], outfile)
    returnCode = subprocess.call(command_line, shell=True)

    return outfile
def tumor_normal(input_parameters=DEFAULT_PARAMS, tech='docker'):
    """Write a script that runs GATK Mutect2 and FilterMutectCalls, then run it.

    Keys absent from ``input_parameters`` are taken from ``DEFAULT_PARAMS``
    (the dict is updated in place).  The script first extracts the tumor and
    normal sample names from the ``SM`` tags of the BAM ``@RG`` header lines
    (via samtools in a container), then calls Mutect2 into
    ``unfiltered.<outfile>`` and filters that into ``<outfile>``.  Returns
    the script path.
    """
    for key, value in DEFAULT_PARAMS.items():
        input_parameters.setdefault(key, value)
    cfg = input_parameters

    # The following are required:
    for required in ('normal_bam', 'tumor_bam', 'genome_reference'):
        assert os.path.exists(cfg[required])

    logdir = os.path.join(cfg['output_directory'], 'logs')
    outfile = os.path.join(logdir, cfg['script'])

    all_paths = [
        p for p in (cfg['normal_bam'], cfg['tumor_bam'],
                    cfg['genome_reference'], cfg['output_directory'],
                    cfg['inclusion_region']) if p
    ]

    container_line, fileDict = container.container_params(
        cfg['mutect2_image'],
        tech=tech,
        files=all_paths,
        extra_args=cfg['extra_docker_options'])

    # Separate samtools containers, one per BAM, for reading sample names.
    tumor_name_line, tumor_bam = container.container_params(
        'lethalfang/samtools:1.7', tech, (cfg['tumor_bam'], ))
    normal_name_line, normal_bam = container.container_params(
        'lethalfang/samtools:1.7', tech, (cfg['normal_bam'], ))

    # Resolve mounted paths
    mounted_genome_reference = fileDict[cfg['genome_reference']]['mount_path']
    mounted_tumor_bam = fileDict[cfg['tumor_bam']]['mount_path']
    mounted_normal_bam = fileDict[cfg['normal_bam']]['mount_path']
    mounted_outdir = fileDict[cfg['output_directory']]['mount_path']

    if cfg['inclusion_region']:
        mounted_inclusion = fileDict[cfg['inclusion_region']]['mount_path']

    # Shell assignment that captures a BAM's sample name into a variable.
    sample_name_cmd = (
        '{var}=`{line} samtools view -H {bam} | egrep -w \'^@RG\' | '
        'grep -Po \'SM:[^\\t$]+\' | sed \'s/SM://\' | uniq | '
        'sed -e \'s/[[:space:]]*$//\'`\n')

    with open(outfile, 'w') as out:
        emit = out.write

        emit("#!/bin/bash\n\n")
        emit(f'#$ -o {logdir}\n')
        emit(f'#$ -e {logdir}\n')
        emit('#$ -S /bin/bash\n')
        emit(f'#$ -l h_vmem={cfg["MEM"]}\n')
        emit('set -e\n\n')
        emit('echo -e "Start at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2\n\n')

        tumor_bam_path = tumor_bam[cfg['tumor_bam']]['mount_path']
        emit(sample_name_cmd.format(var='tumor_name',
                                    line=tumor_name_line,
                                    bam=tumor_bam_path))

        normal_bam_path = normal_bam[cfg['normal_bam']]['mount_path']
        emit(sample_name_cmd.format(var='normal_name',
                                    line=normal_name_line,
                                    bam=normal_bam_path))
        emit('\n')

        # --- Mutect2 ---
        emit(f'{container_line} \\\n')
        emit(f'java -Xmx{cfg["MEM"]} -jar /gatk/gatk.jar Mutect2 \\\n')
        emit('--reference {} \\\n'.format(mounted_genome_reference))
        if cfg['inclusion_region']:
            emit(f'--intervals {mounted_inclusion} \\\n')
        emit(f'--input {mounted_tumor_bam} \\\n')
        emit(f'--input {mounted_normal_bam} \\\n')
        # Plain literals: ${...} is expanded by the shell, not by Python.
        emit('--normal-sample ${normal_name} \\\n')
        emit('--tumor-sample ${tumor_name} \\\n')
        emit(f'--native-pair-hmm-threads {1} \\\n')
        if cfg['mutect2_arguments']:
            emit(f'{cfg["mutect2_arguments"]} \\\n')
        emit(f'--output {mounted_outdir}/unfiltered.{cfg["outfile"]}\n\n')

        # --- FilterMutectCalls ---
        emit(f'{container_line} \\\n')
        emit(f'java -Xmx{cfg["MEM"]} -jar /gatk/gatk.jar FilterMutectCalls \\\n')
        emit(f'--variant {mounted_outdir}/unfiltered.{cfg["outfile"]} \\\n')
        if cfg['mutect2_filter_arguments']:
            emit(f'{cfg["mutect2_filter_arguments"]} \\\n')
        emit(f'--output {mounted_outdir}/{cfg["outfile"]}\n')

        emit('\necho -e "Done at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2\n')

    # "Run" the script that was generated
    command_line = f'{cfg["action"]} {outfile}'
    returnCode = subprocess.call(command_line, shell=True)

    return outfile
def tumor_normal(input_parameters=DEFAULT_PARAMS, tech='docker'):
    """Write a shell script that runs bam-somaticsniper on a tumor/normal pair, then launch it.

    Keys missing from ``input_parameters`` are filled in place from
    ``DEFAULT_PARAMS`` (the caller's dict is modified, as before).

    Keys read from ``input_parameters``:
        normal_bam, tumor_bam, genome_reference: must exist on disk.
        output_directory: the script goes in its ``logs`` sub-directory.
        inclusion_region: mounted into the container when truthy; it is
            not passed to the bam-somaticsniper command line.
        somaticsniper_image, extra_docker_options: forwarded to
            ``container.container_params``.
        script: file name of the generated script.
        MEM: value written to the ``#$ -l h_vmem=`` header.
        min_MQ, somatic_score, prior, somaticsniper_arguments: written as
            ``-q``, ``-Q``, ``-s`` and trailing extra arguments.
        outfile: name of the VCF written inside the output directory.
        threads: when > 1, a bedtools loop is appended that intersects the
            VCF with ``<outdir>/<i>/<i>.bed`` for i = 1..threads.
        action: command prefix used to run the script (e.g. ``echo``).

    Args:
        input_parameters: dict of settings described above.
        tech: container technology, forwarded to ``container.container_params``.

    Returns:
        Path of the generated script.

    Raises:
        AssertionError: if a required input file does not exist.
    """
    for key, default_value in DEFAULT_PARAMS.items():
        input_parameters.setdefault(key, default_value)

    # The following are required. Kept as asserts so callers still see
    # AssertionError; note these checks are skipped under ``python -O``.
    for required_key in ('normal_bam', 'tumor_bam', 'genome_reference'):
        assert os.path.exists(input_parameters[required_key]), \
            '{} does not exist: {}'.format(required_key, input_parameters[required_key])

    logdir = os.path.join(input_parameters['output_directory'], 'logs')
    outfile = os.path.join(logdir, input_parameters['script'])

    # open() below fails with FileNotFoundError if the log directory is missing.
    os.makedirs(logdir, exist_ok=True)

    # Every path that has to be visible inside the container (falsy ones skipped).
    candidate_paths = (
        input_parameters['normal_bam'],
        input_parameters['tumor_bam'],
        input_parameters['genome_reference'],
        input_parameters['output_directory'],
        input_parameters['inclusion_region'],
    )
    all_paths = [path_i for path_i in candidate_paths if path_i]

    container_line, fileDict = container.container_params(
        input_parameters['somaticsniper_image'],
        tech=tech,
        files=all_paths,
        extra_args=input_parameters['extra_docker_options'],
    )

    # Mounted paths for all the input files and output directory:
    mounted_genome_reference = fileDict[input_parameters['genome_reference']]['mount_path']
    mounted_tumor_bam = fileDict[input_parameters['tumor_bam']]['mount_path']
    mounted_normal_bam = fileDict[input_parameters['normal_bam']]['mount_path']
    mounted_outdir = fileDict[input_parameters['output_directory']]['mount_path']

    with open(outfile, 'w') as out:

        # Script header: shebang plus "#$" scheduler directives.
        out.write("#!/bin/bash\n\n")
        out.write(f'#$ -o {logdir}\n')
        out.write(f'#$ -e {logdir}\n')
        out.write('#$ -S /bin/bash\n')
        out.write('#$ -l h_vmem={}\n'.format(input_parameters['MEM']))
        out.write('set -e\n\n')

        out.write('echo -e "Start at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2\n\n')

        # The SomaticSniper invocation, one argument group per continued line.
        out.write(f'{container_line} \\\n')
        out.write('/opt/somatic-sniper/build/bin/bam-somaticsniper \\\n')
        out.write('-q {} -Q {} -s {} -F vcf {} \\\n'.format(
            input_parameters['min_MQ'],
            input_parameters['somatic_score'],
            input_parameters['prior'],
            input_parameters['somaticsniper_arguments'],
        ))
        out.write('-f {} \\\n'.format(mounted_genome_reference))
        out.write('{} \\\n'.format(mounted_tumor_bam))
        out.write('{} \\\n'.format(mounted_normal_bam))
        out.write('{}/{}\n'.format(mounted_outdir, input_parameters['outfile']))

        if input_parameters['threads'] > 1:
            # Split the single VCF into per-thread VCFs by intersecting it
            # with each <outdir>/<i>/<i>.bed file.
            bedtool_line, outdir_i = container.container_params(
                'lethalfang/bedtools:2.26.0',
                tech,
                (input_parameters['output_directory'], ),
            )
            mounted_bed_outdir = outdir_i[input_parameters['output_directory']]['mount_path']

            out.write('\n\ni=1\n')
            out.write('while [[ $i -le {} ]]\n'.format(input_parameters['threads']))
            out.write('do\n')
            out.write(' {DOCKER_LINE} bash -c "bedtools intersect -a {OUTDIR}/{OUTVCF} -b {OUTDIR}/${{i}}/${{i}}.bed -header | uniq > {OUTDIR}/${{i}}/{OUTVCF}"\n'.format(
                DOCKER_LINE=bedtool_line,
                OUTDIR=mounted_bed_outdir,
                OUTVCF=input_parameters['outfile'],
            ))
            out.write(' i=$(( $i + 1 ))\n')
            out.write('done\n')

        out.write('\necho -e "Done at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2\n')

    # "Run" the script that was generated. The exit status is ignored, as in
    # the original code; shell=True is kept because 'action' is a shell
    # command prefix supplied by the caller.
    command_line = '{} {}'.format(input_parameters['action'], outfile)
    subprocess.call(command_line, shell=True)

    return outfile