def make_variant_dict_parallel(vcf_files, bam_files, sample_names, bed_region, nthreads):
    """Build variant dictionaries for ``bed_region`` using ``nthreads`` worker processes.

    The region file is split with ``split_regions.split`` into partial BED
    files inside the system temporary directory.  Each partial region is first
    handed to ``intersect_multiple_vcf_files`` together with ``vcf_files``, and
    each resulting list of partial VCF files is then handed to
    ``make_variant_dict`` together with ``bam_files`` and ``sample_names``.

    Args:
        vcf_files: forwarded unchanged to ``intersect_multiple_vcf_files``.
        bam_files: forwarded unchanged to ``make_variant_dict``.
        sample_names: forwarded unchanged to ``make_variant_dict``.
        bed_region: region file passed to ``split_regions.split``.
        nthreads: number of pieces requested from ``split_regions.split`` and
            number of worker processes in the pool.

    Returns:
        A list with one result of ``make_variant_dict`` per partial region,
        in the order of the partial regions.

    Raises:
        OSError: if a temporary file cannot be removed after a successful run.
        Any exception raised by a worker is re-raised after the temporary
        files have been cleaned up on a best-effort basis.
    """
    partial_regions = split_regions.split(
        bed_region,
        os.path.join(tempfile.gettempdir(), uuid.uuid4().hex + '.bed'),
        nthreads)

    # partitioned_vcf_files is a list of a list of vcf files
    partitioned_vcf_files = []
    succeeded = False
    try:
        # The context manager releases the worker processes even if a task
        # raises; every result has already been collected when it exits.
        with multiprocessing.Pool(nthreads) as pool:
            partitioned_vcf_files = pool.map(
                intersect_multiple_vcf_files,
                [(vcf_files, bed_i) for bed_i in partial_regions])
            variant_dictionaries = pool.map(
                make_variant_dict,
                [(partial_vcf_files, bam_files, sample_names)
                 for partial_vcf_files in partitioned_vcf_files])
        succeeded = True
    finally:
        files_to_be_deleted = list(partial_regions)
        for list_of_vcfs in partitioned_vcf_files:
            files_to_be_deleted.extend(list_of_vcfs)

        for file_i in files_to_be_deleted:
            try:
                os.remove(file_i)
            except OSError:
                # After a successful run a failed removal is reported as
                # before; after a failure it must not hide the real error.
                if succeeded:
                    raise

    return variant_dictionaries
def splitRegions(nthreads, outfiles, bed=None, fai=None):
    """Split a region file into pieces with ``split_bed.split``.

    When no ``bed`` is supplied, one is first derived from ``fai`` with
    ``split_bed.fai2bed``.  A supplied ``bed`` always takes precedence over
    ``fai``.

    Args:
        nthreads: forwarded to ``split_bed.split`` (presumably the number of
            pieces to write -- confirm against ``split_bed``).
        outfiles: output path handed to both ``split_bed.fai2bed`` and
            ``split_bed.split``.
        bed: optional region file to split.
        fai: optional index file used to create the region file when ``bed``
            is not given.

    Returns:
        Whatever ``split_bed.split`` returns.

    Raises:
        ValueError: if neither ``bed`` nor ``fai`` is provided.
    """
    # Raise explicitly instead of using ``assert``: assertions are removed
    # under ``python -O`` and the missing input would reach split_bed.
    if not (bed or fai):
        raise ValueError('Either bed or fai must be provided.')

    if not bed:
        bed = split_bed.fai2bed(fai, outfiles)

    return split_bed.split(bed, outfiles, nthreads)
def splitRegions(input_parameters):
    """Create a genome-wide BED file from the reference index and split it.

    Reads ``genome_reference``, ``output_directory`` and ``threads`` from
    ``input_parameters``.  ``genome.bed`` is written into the output
    directory by ``split_bed.fai2bed``; the pieces are written by
    ``split_bed.split`` under a freshly created, randomly named
    sub-directory of ``TMPDIR``.

    Returns:
        Whatever ``split_bed.split`` returns.
    """
    reference_index = input_parameters['genome_reference'] + '.fai'

    # Scratch directory named after a random uuid4 hex string
    scratch_dir = os.path.join(TMPDIR, uuid.uuid4().hex)
    os.makedirs(scratch_dir, exist_ok=True)

    genome_bed_path = os.path.join(
        input_parameters['output_directory'], 'genome.bed')
    genome_bed = split_bed.fai2bed(reference_index, genome_bed_path)

    split_target = os.path.join(scratch_dir, 'th.bed')
    return split_bed.split(
        genome_bed, split_target, input_parameters['threads'])
##########################################################
# Script entry point: pick (or create) the region file, split it, and prepare
# the log directories for the run and for every thread.
if __name__ == '__main__':

    workflowArguments = run()

    # Use the user-supplied region file, otherwise derive one from the
    # reference index and store it as OUTDIR/genome.bed
    bed_file = workflowArguments['inclusion_region']
    if not bed_file:
        genome_fai = workflowArguments['genome_reference'] + '.fai'
        bed_file = workflowArguments['output_directory'] + os.sep + 'genome.bed'
        split_bed.fai2bed(genome_fai, bed_file)

    outdir = workflowArguments['output_directory']
    n_threads = workflowArguments['threads']

    split_bed.split(bed_file, outdir + os.sep + 'bed', n_threads)
    os.makedirs(outdir + os.sep + 'logs', exist_ok=True)

    for thread_i in range(1, n_threads + 1):

        if n_threads > 1:

            # Add OUTDIR/thread_i for each thread
            thread_dir = outdir + os.sep + str(thread_i)

            perThreadParameter = copy(workflowArguments)
            perThreadParameter['output_directory'] = thread_dir
            perThreadParameter['inclusion_region'] = '{}/{}.bed'.format(
                thread_dir, str(thread_i))

            os.makedirs(thread_dir + os.sep + 'logs', exist_ok=True)
# Package that holds one module per supported somatic mutation caller.
_CALLER_PACKAGE = 'utilities.dockered_pipelines.somatic_mutations'

# Callers that are run once for the whole run (tumor-normal runs only).
# Each entry: (workflowArguments flag, caller module name, script name prefix).
_UNPARALLELIZABLE_CALLERS = (
    ('run_jointsnvmix2', 'JointSNVMix2', 'jsm2'),
    ('run_somaticsniper', 'SomaticSniper', 'somaticsniper'),
)

# Callers that are run once per thread, in this order.
# Each entry: (workflowArguments flag, caller module name, script name prefix,
# caller label used in the dbSNP error message or None when the caller needs
# no dbSNP file, True when the caller is only used for tumor-normal runs).
_PARALLELIZABLE_CALLERS = (
    ('run_mutect2', 'MuTect2', 'mutect2', None, False),
    ('run_scalpel', 'Scalpel', 'scalpel', None, False),
    ('run_vardict', 'VarDict', 'vardict', None, False),
    ('run_varscan2', 'VarScan2', 'varscan2', None, False),
    ('run_lofreq', 'LoFreq', 'lofreq', 'LoFreq', False),
    ('run_muse', 'MuSE', 'muse', 'MuSE', True),
    ('run_strelka2', 'Strelka2', 'strelka', None, False),
)


def _prepare_regions(workflowArguments):
    """Return the region file to use, split it, and create OUTDIR/logs."""
    if workflowArguments['inclusion_region']:
        bed_file = workflowArguments['inclusion_region']
    else:
        split_bed.fai2bed(
            workflowArguments['genome_reference'] + '.fai',
            workflowArguments['output_directory'] + os.sep + 'genome.bed')
        bed_file = workflowArguments['output_directory'] + os.sep + 'genome.bed'

    split_bed.split(bed_file,
                    workflowArguments['output_directory'] + os.sep + 'bed',
                    workflowArguments['threads'])

    os.makedirs(workflowArguments['output_directory'] + os.sep + 'logs',
                exist_ok=True)

    return bed_file


def _set_dbsnp_gz(input_arguments, caller_label):
    """Fill ``input_arguments['dbsnp_gz']`` from ``input_arguments['dbsnp_vcf']``."""
    dbsnp_vcf = input_arguments['dbsnp_vcf']

    if dbsnp_vcf.endswith('.vcf.gz'):
        input_arguments['dbsnp_gz'] = dbsnp_vcf

    elif dbsnp_vcf.endswith('.vcf'):
        input_arguments['dbsnp_gz'] = dbsnp_vcf + '.gz'
        # NOTE: these checks are kept as assertions so the exception type
        # stays AssertionError; they are skipped under ``python -O``.
        assert os.path.exists(input_arguments['dbsnp_gz']), \
            '{} does not exist.'.format(input_arguments['dbsnp_gz'])
        assert os.path.exists(input_arguments['dbsnp_gz'] + '.tbi'), \
            '{} does not exist.'.format(input_arguments['dbsnp_gz'] + '.tbi')

    else:
        raise Exception(
            '{} has no properly bgzipped dbsnp file.'.format(caller_label))


def _make_caller_job(module_name, entry_point, parameters, script,
                     container_tech, dbsnp_label=None):
    """Import one caller module lazily and return the job it creates."""
    import importlib

    # Imported only when the caller is enabled, as the inline imports were.
    caller = importlib.import_module(
        '{}.{}'.format(_CALLER_PACKAGE, module_name))

    input_arguments = copy(parameters)
    input_arguments['script'] = script

    if dbsnp_label is not None:
        _set_dbsnp_gz(input_arguments, dbsnp_label)

    return getattr(caller, entry_point)(input_arguments, container_tech)


def _per_thread_parameters(workflowArguments, thread_i, bed_file):
    """Return the parameters for one thread, creating its sub-directory if needed."""
    perThreadParameter = copy(workflowArguments)

    if workflowArguments['threads'] > 1:

        # Add OUTDIR/thread_i for each thread
        perThreadParameter['output_directory'] = workflowArguments[
            'output_directory'] + os.sep + str(thread_i)
        perThreadParameter['inclusion_region'] = '{}/{}.bed'.format(
            perThreadParameter['output_directory'], str(thread_i))

        os.makedirs(perThreadParameter['output_directory'] + os.sep + 'logs',
                    exist_ok=True)

        # Move 1.bed, 2.bed, ..., n.bed to each thread's subdirectory
        move(
            '{}/{}.bed'.format(workflowArguments['output_directory'], thread_i),
            '{}/{}.bed'.format(perThreadParameter['output_directory'], thread_i))

    else:
        perThreadParameter['inclusion_region'] = bed_file

    return perThreadParameter


def _create_scripts(args, workflowArguments, ts, workflow_tasks, entry_point,
                    pipeline):
    """Create every script of one run type and record the jobs in ``workflow_tasks``.

    ``entry_point`` is the function name looked up in each caller module
    ('tumor_normal' or 'tumor_only'); ``pipeline`` is the module providing
    ``merge_results`` and ``run_SomaticSeq`` for that run type.
    """
    is_paired = entry_point == 'tumor_normal'
    bed_file = _prepare_regions(workflowArguments)

    # Unparallelizables
    if is_paired:
        for flag, module_name, prefix in _UNPARALLELIZABLE_CALLERS:
            if workflowArguments[flag]:
                job = _make_caller_job(
                    module_name, entry_point, workflowArguments,
                    '{}.{}.cmd'.format(prefix, ts), args.container_tech)
                workflow_tasks['caller_jobs'].append(job)

    # Parallelizables
    for thread_i in range(1, workflowArguments['threads'] + 1):

        perThreadParameter = _per_thread_parameters(
            workflowArguments, thread_i, bed_file)

        # Results combiner: a single merging script, created together with
        # the first thread and only when the run is split across threads.
        if workflowArguments['threads'] > 1 and thread_i == 1:
            input_arguments = copy(workflowArguments)
            input_arguments['script'] = 'mergeResults.{}.cmd'.format(ts)
            merging_job = pipeline.merge_results(input_arguments,
                                                 args.container_tech)
            workflow_tasks['merging_jobs'].append(merging_job)

        # Invoke parallelizable callers one by one:
        for flag, module_name, prefix, dbsnp_label, paired_only in _PARALLELIZABLE_CALLERS:
            if paired_only and not is_paired:
                continue
            if workflowArguments[flag]:
                job = _make_caller_job(
                    module_name, entry_point, perThreadParameter,
                    '{}.{}.cmd'.format(prefix, ts), args.container_tech,
                    dbsnp_label)
                workflow_tasks['caller_jobs'].append(job)

        if workflowArguments['run_somaticseq']:
            input_arguments = copy(perThreadParameter)
            input_arguments['script'] = 'somaticSeq.{}.cmd'.format(ts)
            somaticseq_job = pipeline.run_SomaticSeq(input_arguments,
                                                     args.container_tech)
            workflow_tasks['somaticseq_jobs'].append(somaticseq_job)


def _log_created_scripts(workflow_tasks):
    """Log how many scripts of each type were created, then list them."""
    for script_type in workflow_tasks:
        logger.info('{} {} scripts created: '.format(
            len(workflow_tasks[script_type]), script_type))
        for i, script_i in enumerate(workflow_tasks[script_type], 1):
            logger.info('{}) {}'.format(i, script_i))


def make_workflow(args, workflowArguments):
    """Create the SomaticSeq workflow scripts and optionally run them locally.

    Args:
        args: parsed command-line namespace.  All of its attributes are
            logged; ``container_tech`` is passed to every script generator,
            and ``run_workflow_locally`` / ``threads`` control local execution.
        workflowArguments: mapping of workflow settings.  The keys read here
            are 'which' ('paired' for tumor-normal runs, 'single' for
            tumor-only runs), 'inclusion_region', 'genome_reference',
            'output_directory', 'threads', the 'run_<caller>' flags,
            'run_somaticseq', and 'dbsnp_vcf' when LoFreq or MuSE is enabled.
            Any other value of 'which' creates no scripts.

    Returns:
        A dict with the lists 'caller_jobs', 'somaticseq_jobs' and
        'merging_jobs' holding whatever the script generators returned.

    Raises:
        Exception: if LoFreq or MuSE is enabled and 'dbsnp_vcf' ends with
            neither '.vcf.gz' nor '.vcf'.
        AssertionError: if 'dbsnp_vcf' ends with '.vcf' and the matching
            '.gz' or '.gz.tbi' file does not exist.
    """
    logger.info(
        'Create SomaticSeq Workflow Scripts: ' +
        ', '.join(['{}={}'.format(i, vars(args)[i]) for i in vars(args)]))

    # Time stamp shared by every script name of this invocation
    ts = re.sub(r'[:-]', '.',
                datetime.now().isoformat(sep='.', timespec='milliseconds'))

    workflow_tasks = {
        'caller_jobs': [],
        'somaticseq_jobs': [],
        'merging_jobs': []
    }

    ################# TUMOR-NORMAL RUNS #################
    if workflowArguments['which'] == 'paired':
        _create_scripts(args, workflowArguments, ts, workflow_tasks,
                        'tumor_normal', tumor_normal)

    ################# TUMOR-ONLY RUNS #################
    elif workflowArguments['which'] == 'single':
        _create_scripts(args, workflowArguments, ts, workflow_tasks,
                        'tumor_only', tumor_only)

    ##### Log the scripts created #####
    _log_created_scripts(workflow_tasks)

    ########## Execute the workflow ##########
    if args.run_workflow_locally:
        import utilities.dockered_pipelines.run_workflows as run_workflows
        run_workflows.run_workflows(
            (workflow_tasks['caller_jobs'], workflow_tasks['somaticseq_jobs'],
             workflow_tasks['merging_jobs']), args.threads)
        logger.info(
            'SomaticSeq Workflow Done. Check your results. You may remove the {} sub_directories.'
            .format(args.threads))

    return workflow_tasks