def run_merge_paired_reads(input, output, params=None):
    """Merge R1 and R2 reads into an interleaved file for bfast to work on.
    """
    # Update input and output from the global config object.
    params = config['merge_paired_reads_params']
    input = sorted(input)
    params['input_read1'] = input[0]
    params['input_read2'] = input[1]
    params['output'] = output

    # Output dirs for qsub stdout and stderr.
    stdout = config['general_params']['stdout_log_file_dir']
    stderr = config['general_params']['stderr_log_file_dir']

    cmd = ("python %(exec)s --gzipped -1 %(input_read1)s -2 %(input_read2)s | "
           "gzip -c > %(output)s" % params)

    logger.debug("cmd = %s" % (cmd,))

    job_id = utils.safe_qsub_run(cmd, jobname="mergepairs",
                                 nodes=params['qsub_nodes'],
                                 stdout=stdout, stderr=stderr)
    logger.debug("job_id = %s" % (job_id,))
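# Every task in this module submits through utils.safe_qsub_run and treats the
# return value as a job id. The real implementation is not shown here; the
# following is only a minimal sketch of the contract the call sites imply,
# assuming a PBS/Torque-style qsub. All names in it are illustrative.
import subprocess

def _safe_qsub_run_sketch(cmd, jobname, nodes, stdout, stderr, params=""):
    """Submit `cmd` as a batch job via qsub and return the job id string."""
    qsub_cmd = ["qsub", "-N", jobname, "-l", "nodes=%s" % nodes,
                "-o", stdout, "-e", stderr]
    if params:
        qsub_cmd.extend(params.split())
    # qsub reads the job script on stdin and prints the job id on stdout.
    proc = subprocess.Popen(qsub_cmd, stdin=subprocess.PIPE,
                            stdout=subprocess.PIPE, universal_newlines=True)
    out, _ = proc.communicate(cmd)
    return out.strip()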
def run_sickle(input, output, output_singles, params=None):
    """Run sickle to trim ends of reads based on sequence quality and length.

    Also gzips the output to save space.
    """
    input = sorted(input)
    params['input_read1'] = input[0]
    params['input_read2'] = input[1]
    params['input_read1_base'] = os.path.splitext(os.path.basename(input[0]))[0]
    params['input_read2_base'] = os.path.splitext(os.path.basename(input[1]))[0]

    # sickle writes uncompressed output; strip the .gz suffix here and gzip below.
    params['output_read1'] = output[0].split(".gz")[0]
    params['output_read2'] = output[1].split(".gz")[0]
    params['output_singles'] = output_singles

    # Output dirs for qsub stdout and stderr.
    stdout = config['general_params']['stdout_log_file_dir']
    stderr = config['general_params']['stderr_log_file_dir']

    cmd = ("module load %(modules)s\n"
           "%(exec)s pe -t sanger -l %(length)s -q %(quality)s "
           "-f %(input_read1)s -r %(input_read2)s "
           "-o %(output_read1)s -p %(output_read2)s -s %(output_singles)s\n"
           "gzip %(output_read1)s &\n"
           "gzip %(output_read2)s &\n"
           "wait" % params)

    logger.debug("cmd = %s" % (cmd,))

    job_id = utils.safe_qsub_run(cmd, jobname="sickle",
                                 nodes=params['qsub_nodes'],
                                 stdout=stdout, stderr=stderr)
    logger.debug("job_id = %s" % (job_id,))
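# The job script above backgrounds both gzips and then blocks on `wait`, so
# the two outputs compress concurrently before the job exits. A rendered
# example with hypothetical values (real ones come from the sickle config):
def _example_sickle_script():
    params = {"modules": "sickle", "exec": "sickle", "length": 20, "quality": 20,
              "input_read1": "s_R1.fastq.gz", "input_read2": "s_R2.fastq.gz",
              "output_read1": "s_R1.trimmed.fastq",
              "output_read2": "s_R2.trimmed.fastq",
              "output_singles": "s_singles.fastq"}
    return ("module load %(modules)s\n"
            "%(exec)s pe -t sanger -l %(length)s -q %(quality)s "
            "-f %(input_read1)s -r %(input_read2)s "
            "-o %(output_read1)s -p %(output_read2)s -s %(output_singles)s\n"
            "gzip %(output_read1)s &\n"
            "gzip %(output_read2)s &\n"
            "wait" % params)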
def run_write_recalibrated_bam(input, output, bqsr_file, params=None):
    """GATK: write a BAM file with recalibrated base scores.
    """
    params['input'] = input
    params['output'] = output
    params['bqsr_file'] = bqsr_file

    # Output dirs for qsub stdout and stderr.
    stdout = config['general_params']['stdout_log_file_dir']
    stderr = config['general_params']['stderr_log_file_dir']

    cmd = "module load %(modules)s\n" % params
    cmd += ("java -Xmx%(maxjheap)s -Djava.io.tmpdir=%(tmp_dir)s -jar %(jar_file)s "
            "-T PrintReads -I %(input)s -R %(reference_fasta)s -BQSR %(bqsr_file)s "
            "-o %(output)s" % params)

    logger.debug("cmd = %s" % (cmd,))

    job_id = utils.safe_qsub_run(cmd, jobname="writerecal",
                                 nodes=params['qsub_nodes'],
                                 # params=params['qsub_params'],
                                 stdout=stdout, stderr=stderr)
    logger.debug("job_id = %s" % (job_id,))
def run_realign_indel_creator(input, output, params=None):
    """First part of GATK recalibration.
    """
    realign_params = config['gatk_realigner_target_creator_params']
    realign_params['input'] = input
    realign_params['output'] = output

    # Output dirs for qsub stdout and stderr.
    stdout = config['general_params']['stdout_log_file_dir']
    stderr = config['general_params']['stderr_log_file_dir']

    knowns = ""
    for known in realign_params['known_files']:
        knowns += "--known %s " % known
    realign_params['knowns'] = knowns

    cmd = ("java -Xmx%(maxjheap)s -jar %(jar_file)s -nt %(threads)s "
           "-I %(input)s -R %(reference_fasta)s -T RealignerTargetCreator "
           "-o %(output)s %(knowns)s" % realign_params)

    logger.debug("cmd = %s" % (cmd,))

    job_id = utils.safe_qsub_run(cmd, jobname="realignTC",
                                 nodes=realign_params['qsub_nodes'],
                                 stdout=stdout, stderr=stderr)
    logger.debug("job_id = %s" % (job_id,))
def run_indel_realigner(input, output, params=None):
    """Second part of GATK recalibration.
    """
    realign_params = config['gatk_indel_realigner_params']
    realign_params['input'] = input
    realign_params['output'] = output

    knowns = ""
    for known in config['gatk_realigner_target_creator_params']['known_files']:
        knowns += "-known %s " % known
    realign_params['knowns'] = knowns

    realign_params['target_intervals'] = config['gatk_realigner_target_creator_params']['output_file']

    # Output dirs for qsub stdout and stderr.
    stdout = config['general_params']['stdout_log_file_dir']
    stderr = config['general_params']['stderr_log_file_dir']

    cmd = ("java -Xmx%(maxjheap)s -Djava.io.tmpdir=%(tmp_dir)s -jar %(jar_file)s "
           "-I %(input)s -R %(reference_fasta)s -T IndelRealigner "
           "-targetIntervals %(target_intervals)s -o %(output)s %(knowns)s "
           "--consensusDeterminationModel KNOWNS_ONLY -LOD 0.4" % realign_params)

    logger.debug("cmd = %s" % (cmd,))

    job_id = utils.safe_qsub_run(cmd, jobname="indelRe",
                                 nodes=realign_params['qsub_nodes'],
                                 stdout=stdout, stderr=stderr)
    logger.debug("job_id = %s" % (job_id,))
def run_mark_duplicates(input, output, params=None):
    """Set up and run the Picard MarkDuplicates program.
    """
    # Output dirs for qsub stdout and stderr.
    stdout = config['general_params']['stdout_log_file_dir']
    stderr = config['general_params']['stderr_log_file_dir']

    params['input'] = input
    params['output'] = output
    params['metrics_file'] = "%s.metrics" % output

    cmd = "module load %(modules)s\n" % params
    cmd += ("java -Xmx%(maxjheap)s -Djava.io.tmpdir=%(tmp_dir)s -jar %(jar_file)s "
            "INPUT=%(input)s OUTPUT=%(output)s METRICS_FILE=%(metrics_file)s "
            "CREATE_INDEX=true "
            "OPTICAL_DUPLICATE_PIXEL_DISTANCE=%(optical_duplicate_pixel_distance)s" % params)

    job_id = utils.safe_qsub_run(cmd, jobname="markdups",
                                 nodes=params['qsub_nodes'],
                                 stdout=stdout, stderr=stderr)
    logger.debug("job_id = %s" % (job_id,))
def run_base_score_recalibrator(input, output, params=None):
    """GATK base score recalibration.
    """
    realign_params = config['gatk_base_score_recal_params']
    realign_params['input'] = input
    realign_params['output'] = output

    knowns = ""
    for known in config['gatk_base_score_recal_params']['known_files']:
        knowns += "-knownSites %s " % known
    realign_params['knowns'] = knowns

    # Output dirs for qsub stdout and stderr.
    stdout = config['general_params']['stdout_log_file_dir']
    stderr = config['general_params']['stderr_log_file_dir']

    cmd = ("java -Xmx%(maxjheap)s -Djava.io.tmpdir=%(tmp_dir)s -jar %(jar_file)s "
           "-T BaseRecalibrator -I %(input)s -R %(reference_fasta)s "
           "-o %(output)s %(knowns)s" % realign_params)

    logger.debug("cmd = %s" % (cmd,))

    job_id = utils.safe_qsub_run(cmd, jobname="bsRecal",
                                 nodes=realign_params['qsub_nodes'],
                                 stdout=stdout, stderr=stderr)
    logger.debug("job_id = %s" % (job_id,))
def run_index_splitbam(input, output, params=None):
    """Run samtools index on a bam file.
    """
    # Update input and output from the global config object.
    bamindex_params = config['bamindex_params']
    bamindex_params['input'] = input
    bamindex_params['output'] = output

    # Output dirs for qsub stdout and stderr.
    stdout = config['general_params']['stdout_log_file_dir']
    stderr = config['general_params']['stderr_log_file_dir']

    cmd = "%(exec)s index %(input)s %(output)s" % bamindex_params

    job_id = utils.safe_qsub_run(cmd, jobname="splitbamindex",
                                 nodes=bamindex_params['qsub_nodes'],
                                 stdout=stdout, stderr=stderr)
    logger.debug("job_id = %s" % (job_id,))
def run_mutect(input, output, text_file, params=None):
    """Run MuTect.
    """
    params = config['mutect_params']

    assert len(input) == 2, \
        "Not the right number of input files (should be 2, received %i)" % len(input)

    # Figure out which file is normal, which is tumor...should be alphabetical?
    input = sorted(input)
    params['input_normal'] = input[0]
    params['input_tumor'] = input[1]

    params['vcf_file'] = output
    params['call_stats_file'] = text_file
    ## params['coverage_file'] = coverage_file
    params['log_file'] = config['mutect_params']['output_dir']

    # Output dirs for qsub stdout and stderr.
    stdout = config['general_params']['stdout_log_file_dir']
    stderr = config['general_params']['stderr_log_file_dir']

    cmd = ("%(java_exec)s -Xmx%(maxjheap)s -Djava.io.tmpdir=%(tmp_dir)s -jar %(jar_file)s "
           "--analysis_type MuTect --dbsnp %(dbsnp_file)s --cosmic %(cosmic_file)s "
           "--input_file:normal %(input_normal)s --input_file:tumor %(input_tumor)s "
           "--reference_sequence %(reference_fasta)s --out %(call_stats_file)s "
           "--vcf %(vcf_file)s --enable_extended_output" % params)

    logger.debug("cmd = %s" % (cmd,))

    job_id = utils.safe_qsub_run(cmd, jobname="mutect",
                                 nodes=params['qsub_nodes'],
                                 params=params['qsub_params'],
                                 stdout=stdout, stderr=stderr)
    logger.debug("job_id = %s" % (job_id,))
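# The alphabetical normal/tumor assumption above is fragile. A hypothetical
# guard (the _N/_T suffix convention is illustrative, not taken from this
# pipeline) could fail fast instead of silently swapping the pair:
def _split_normal_tumor(paths, normal_tag="_N", tumor_tag="_T"):
    """Return (normal, tumor) paths or raise if the pair is ambiguous."""
    normal = [p for p in paths if normal_tag in os.path.basename(p)]
    tumor = [p for p in paths if tumor_tag in os.path.basename(p)]
    if len(normal) != 1 or len(tumor) != 1:
        raise ValueError("Cannot identify normal/tumor pair in %r" % (paths,))
    return normal[0], tumor[0]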
def run_sort_sam(input, output, sample_name=None, index=None):
    """Run Picard SortSam to convert to a sorted bam file.
    """
    params = dict(sample_name=sample_name, index=index)

    # Update input and output from the global config object.
    picard_params = config['picard_sortsam_params']
    picard_params['input'] = input
    picard_params['output'] = output

    # Output dirs for qsub stdout and stderr.
    stdout = config['general_params']['stdout_log_file_dir']
    stderr = config['general_params']['stderr_log_file_dir']

    cmdline = ("--maxjheap=%(maxjheap)s --jar=%(jar_file)s --input=%(input)s "
               "--output=%(output)s --sort_order=%(sort_order)s SortSam" % picard_params)
    picard_cmd = "python -m ccrngspy.tasks.Picard %s" % cmdline

    logger.debug("params = %s" % (params,))

    job_id = utils.safe_qsub_run(picard_cmd, jobname="sortsam",
                                 nodes=picard_params['qsub_nodes'],
                                 stdout=stdout, stderr=stderr)
    logger.debug("job_id = %s" % (job_id,))
def run_mergebam(input, output, patient_id=None):
    """Merge all bam files for a single patient together (this includes
    matched tumor and normal into one file).

    The separate tumor/normal sample info is encoded in the read groups of
    the BAM files. This is done for future steps (GATK recommends that
    tumor/normal paired data is run through the re-align/re-calibrate step
    together).
    """
    params = dict(patient_id=patient_id)

    # Update input and output from the global config object.
    mergebam_params = config['mergebam_params']
    mergebam_params['input'] = " ".join(input)
    mergebam_params['output'] = output

    # Output dirs for qsub stdout and stderr.
    stdout = config['general_params']['stdout_log_file_dir']
    stderr = config['general_params']['stderr_log_file_dir']

    cmdline = ("--maxjheap=%(maxjheap)s --jar=%(jar_file)s --input %(input)s "
               "--output=%(output)s --sort_order=%(sort_order)s "
               "MergeSamFiles --use_threading=true" % mergebam_params)
    cmd = "python -m ccrngspy.tasks.Picard %s" % cmdline

    logger.debug("cmd = %s" % (cmd,))

    job_id = utils.safe_qsub_run(cmd, jobname="mb%s" % (params['patient_id']),
                                 nodes=mergebam_params['qsub_nodes'],
                                 stdout=stdout, stderr=stderr)
    logger.debug("job_id = %s" % (job_id,))
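# Since the tumor/normal distinction survives only in the @RG header lines
# after this merge, a quick header check can confirm both samples made it in.
# A sketch only: it assumes pysam is available (it is not used elsewhere in
# this pipeline); SM is the standard per-sample tag in @RG lines.
import pysam

def _read_group_samples(bam_path):
    """Return the set of SM (sample) tags in a BAM header."""
    bam = pysam.Samfile(bam_path, "rb")
    try:
        # Old pysam returns the header as a dict; newer versions need to_dict().
        header = bam.header if isinstance(bam.header, dict) else bam.header.to_dict()
        return set(rg.get("SM") for rg in header.get("RG", []))
    finally:
        bam.close()

# e.g. check len(_read_group_samples(output)) == 2 once the merge job finishes.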
def run_fastqc(input, output, params=None):
    """Set up and run the fastqc program.
    """
    params = config['fastqc_params']

    # Output dirs for qsub stdout and stderr.
    stdout = config['general_params']['stdout_log_file_dir']
    stderr = config['general_params']['stderr_log_file_dir']

    try:
        tmp = params['casava']
    except KeyError:
        logger.info("Casava parameter not specified, assuming false.")
        params['casava'] = False

    fastqc_task = FastQC.FastQC(input_files=input,
                                output_directory=params['output_dir'],
                                casava=params['casava'],
                                threads=params['threads'])

    cmd = fastqc_task.make_command()
    logger.debug("cmd = %s" % (cmd,))

    job_id = utils.safe_qsub_run(cmd, jobname="fastqc",
                                 nodes=params['qsub_nodes'],
                                 stdout=stdout, stderr=stderr)
    logger.debug("job_id = %s" % (job_id,))
def run_bwa_sampe(input, output, sample_name, params=None):
    """Run bwa sampe.
    """
    params['output'] = output
    params['sai_R1'] = input[0][0]
    params['sai_R2'] = input[1][0]
    (params['fastq_R1'], params['fastq_R2']) = input[0][1]

    params['read_group_string'] = bwa_helpers.sample_name_to_read_group_string(
        sample_name=sample_name, samples=samples, config=config)

    cmd = "module load %(modules)s\n" % params
    cmd += ('bwa sampe -r "%(read_group_string)s" %(reference_fasta)s '
            '%(sai_R1)s %(sai_R2)s %(fastq_R1)s %(fastq_R2)s | '
            'samtools view -Sb - > %(output)s' % params)

    logger.debug("cmd = %s" % (cmd,))

    # Output dirs for qsub stdout and stderr.
    stdout = config['general_params']['stdout_log_file_dir']
    stderr = config['general_params']['stderr_log_file_dir']

    job_id = utils.safe_qsub_run(cmd, jobname="bwa_sampe",
                                 nodes=params['qsub_nodes'],
                                 stdout=stdout, stderr=stderr)
    logger.debug("job_id = %s" % (job_id,))
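# The nested indexing in run_bwa_sampe implies each element of `input` pairs
# a .sai file with the fastq pair it was aligned from. A hypothetical example
# of that shape (file names are illustrative only):
def _example_bwa_sampe_input():
    input_files = [
        ("sample_R1.sai", ("sample_R1.fastq.gz", "sample_R2.fastq.gz")),
        ("sample_R2.sai", ("sample_R1.fastq.gz", "sample_R2.fastq.gz")),
    ]
    sai_R1 = input_files[0][0]
    sai_R2 = input_files[1][0]
    fastq_R1, fastq_R2 = input_files[0][1]
    return sai_R1, sai_R2, fastq_R1, fastq_R2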
def run_tophat(input, output, params=None):
    """Run Tophat on paired reads.
    """
    # Let a parser handle setting up arguments and options.
    parser = argparse.ArgumentParser()

    # Add Tophat arguments.
    tophat = Tophat.TophatRunner()
    parser = tophat.argparse(parser)

    # Update input and output from the global config object.
    tophat_params = config['tophat_params']
    tophat_params['input'] = input

    # Output dirs for qsub stdout and stderr.
    stdout = config['general_params']['log_file_dir']
    stderr = config['general_params']['log_file_dir']

    ## fastq read files
    tophat_params['file1'] = input[0]
    tophat_params['file2'] = input[1]
    tophat_params['sample'] = params['sample']
    tophat_params['output'] = output

    logger.debug('tophat_params = %s' % (tophat_params,))

    cmdline = ('--bowtie_index=%(bowtie_index)s -1 %(file1)s -2 %(file2)s '
               '-o %(output)s --threads=%(threads)s '
               '--other_params="%(other_params)s"' % tophat_params)

    args = parser.parse_args(shlex.split(cmdline))
    logger.debug("cmdline = %s" % (shlex.split(cmdline),))

    tophat.set_options(args)
    tophat_command = tophat.make_command()

    logger.debug("cmd = %s" % (tophat_command,))
    logger.debug("params = %s" % (params,))

    job_stdout = utils.safe_qsub_run(tophat_command,
                                     jobname="tophat_%s" % params['sample'],
                                     nodes=tophat_params['qsub_nodes'],
                                     params="-v np=%(threads)s" % tophat_params,
                                     stdout=stdout, stderr=stderr)
    logger.debug("stdout = %s" % (job_stdout,))
def run_bfast_postprocess(input, output, flowcell_id=None, sample_name=None, index=None):
    """Run postprocess.
    """
    params = dict(sample_name=sample_name, index=index)

    # Let a parser handle setting up arguments and options.
    parser = argparse.ArgumentParser()

    # Add BFAST arguments.
    bfast = BFAST.BFASTBase()

    # Use the bfast provided by PATH; set using the module functions.
    bfast.set_exec("bfast")

    parser = bfast.argparse(parser)

    # Update input and output from the global config object.
    bfast_params = config['bfast_postprocess_params']
    bfast_params['input'] = input
    bfast_params['output'] = output

    read_group_id = read_group_id_dict[sample_name]
    read_group_string = "@RG\tID:%s\tPL:ILLUMINA\tPU:%s\tLB:%s\tSM:%s" % (
        read_group_id, flowcell_id,
        config['general_params']['library_name'], sample_name)

    # With delete=True the temp file disappears once the handle is closed or
    # garbage collected, so the handle must stay alive until the job has read it.
    read_group_tempfile = tempfile.NamedTemporaryFile(mode="w",
                                                      dir=bfast_params['scratch_dir'],
                                                      delete=True)
    read_group_tempfile.write("%s\n" % read_group_string)
    read_group_tempfile.flush()
    bfast_params['read_group_string'] = read_group_tempfile.name

    # Output dirs for qsub stdout and stderr.
    stdout = config['general_params']['stdout_log_file_dir']
    stderr = config['general_params']['stderr_log_file_dir']

    cmdline = ("--bfast_threads=%(threads)s --bfast_output_file=%(output)s "
               "--bfast_reference_fasta=%(reference_fasta)s postprocess "
               "--bfast_read_group_string=%(read_group_string)s "
               "--bfast_aligned_file=%(input)s" % bfast_params)

    args = parser.parse_args(cmdline.split(" "))

    bfast_command = "module load %(modules)s\n" % bfast_params
    bfast_command += bfast.make_postprocess_command(args)

    job_id = utils.safe_qsub_run(bfast_command,
                                 jobname="bfpp%s%s" % (params['sample_name'], params['index']),
                                 nodes=bfast_params['qsub_nodes'],
                                 stdout=stdout, stderr=stderr)
    logger.debug("job_id = %s" % (job_id,))
def run_bfast_match(input, output, params=None):
    """Run match.
    """
    # Let a parser handle setting up arguments and options.
    parser = argparse.ArgumentParser()

    # Add BFAST arguments.
    bfast = BFAST.BFASTBase()

    # Use the bfast provided by PATH; set using the module functions.
    bfast.set_exec("bfast")

    parser = bfast.argparse(parser)

    # Update input and output from the global config object.
    bfast_params = config['bfast_match_params']
    bfast_params['input'] = input
    bfast_params['output'] = output

    # Output dirs for qsub stdout and stderr.
    stdout = config['general_params']['stdout_log_file_dir']
    stderr = config['general_params']['stderr_log_file_dir']

    bfast_params['sample'] = params['sample_name']
    bfast_params['bfast_ref_index'] = params['bfast_ref_index']

    cmdline = ("--bfast_temp_dir=%(temp_dir)s --bfast_threads=%(threads)s "
               "--bfast_reference_fasta=%(reference_fasta)s "
               "--bfast_output_file=%(output)s match --bfast_gzipped "
               "--bfast_reads_file=%(input)s --bfast_space=0 "
               "--bfast_main_indexes=%(bfast_ref_index)s" % bfast_params)

    args = parser.parse_args(cmdline.split(" "))

    bfast_command = "module load %(modules)s\n" % bfast_params
    bfast_command += bfast.make_match_command(args)

    logger.debug("cmd = %s" % (bfast_command,))

    job_id = utils.safe_qsub_run(bfast_command, jobname="bfmatch",
                                 nodes=bfast_params['qsub_nodes'],
                                 stdout=stdout, stderr=stderr)
    logger.debug("job_id = %s" % (job_id,))
def run_sort_sam(input, output, params=None):
    """Set up and run the Picard SortSam program.

    This task works differently than the others; instead of calling the
    program directly by writing out the command line string needed to run it,
    this runs a python script by calling the main function of
    ccrngspy.tasks.Picard. This is because the Picard code is based off of
    the Galaxy wrapper for Picard, and doesn't work exactly the same as the
    rest.

    2012-03-30 I will consider re-writing it so that it is consistent. (dailykm)
    """
    # Output dirs for qsub stdout and stderr.
    stdout = config['general_params']['log_file_dir']
    stderr = config['general_params']['log_file_dir']

    # Update input and output from the global config object.
    picard_params = config['picard_sortsam_params']
    picard_params['input'] = input
    picard_params['output'] = output

    logger.debug("picard_params = %s" % (picard_params,))

    # Set up using the default arguments, specifying the input and output
    # files since they are required!
    cmdline = ("--maxjheap=%(maxjheap)s --jar=%(jar_file)s --input=%(input)s "
               "--output=%(output)s --sort_order=%(sort_order)s SortSam" % picard_params)
    picard_cmd = "python -m ccrngspy.tasks.Picard %s" % cmdline

    logger.debug("params = %s" % (params,))

    job_stdout = utils.safe_qsub_run(picard_cmd, jobname="sort_sam",
                                     nodes=picard_params['qsub_nodes'],
                                     stdout=stdout, stderr=stderr)
    logger.debug("stdout = %s" % (job_stdout,))
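# For reference, the wrapper invocation built above renders to something like
# the string below; the values are hypothetical, the real ones come from
# config['picard_sortsam_params'].
def _example_sortsam_cmd():
    picard_params = {"maxjheap": "4g", "jar_file": "/path/to/SortSam.jar",
                     "input": "sample.bam", "output": "sample.sorted.bam",
                     "sort_order": "coordinate"}
    return ("python -m ccrngspy.tasks.Picard "
            "--maxjheap=%(maxjheap)s --jar=%(jar_file)s --input=%(input)s "
            "--output=%(output)s --sort_order=%(sort_order)s SortSam"
            % picard_params)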
def run_sort_sam(input, output, params=None):
    """Set up and run the Picard SortSam program.

    This task works differently than the others; instead of calling the
    program directly by writing out the command line string needed to run it,
    this runs a python script by calling the main function of
    ccrngspy.tasks.Picard. This is because the Picard code is based off of
    the Galaxy wrapper for Picard, and doesn't work exactly the same as the
    rest.

    2012-03-30 I will consider re-writing it so that it is consistent. (dailykm)
    """
    # Output dirs for qsub stdout and stderr.
    stdout = config['general_params']['log_file_dir']
    stderr = config['general_params']['log_file_dir']

    # Update input and output from the global config object.
    picard_params = config['picard_sortsam_params']
    picard_params['input'] = input
    picard_params['output'] = output

    logger.debug("picard_params = %s" % (picard_params,))

    # Set up using the default arguments, specifying the input and output
    # files since they are required!
    cmdline = ("--jar=%(jar_file)s --input=%(input)s --output=%(output)s "
               "--sort_order=%(sort_order)s SortSam" % picard_params)
    picard_cmd = "python -m ccrngspy.tasks.Picard %s" % cmdline

    logger.debug("params = %s" % (params,))

    job_stdout = utils.safe_qsub_run(picard_cmd, jobname="sort_sam",
                                     nodes=picard_params['qsub_nodes'],
                                     stdout=stdout, stderr=stderr)
    logger.debug("stdout = %s" % (job_stdout,))
def run_collect_rnaseq_metrics(input, output, sample):
    """Set up and run the Picard CollectRnaSeqMetrics program.

    This task works differently than the others; instead of calling the
    program directly by writing out the command line string needed to run it,
    this runs a python script by calling the main function of
    ccrngspy.tasks.Picard. This is because the Picard code is based off of
    the Galaxy wrapper for Picard, and doesn't work exactly the same as the
    rest.

    2012-03-30 I will consider re-writing it so that it is consistent. (dailykm)
    """
    # Output dirs for qsub stdout and stderr.
    stdout = config['general_params']['log_file_dir']
    stderr = config['general_params']['log_file_dir']

    # Update input and output from the global config object.
    picard_params = config['picard_params']
    picard_params['input'] = input
    picard_params['output'] = output

    # Set up using the default arguments, specifying the input and output
    # files since they are required!
    cmdline = ("--jar=%(jar_file)s --input=%(input)s --output=%(output)s "
               "--ref_flat=%(ref_flat)s --ref_file=%(ref_file)s "
               "CollectRnaSeqMetrics --minimum_length=%(minimum_length)s "
               "--chart_output=%(chart_output)s "
               "--metric_accumulation_level=%(metric_accumulation_level)s "
               "--stop_after=%(stop_after)s" % picard_params)
    picard_cmd = "python -m ccrngspy.tasks.Picard %s" % cmdline

    job_stdout = utils.safe_qsub_run(picard_cmd, jobname="rnaseqmet_%s" % sample,
                                     nodes=picard_params['qsub_nodes'],
                                     stdout=stdout, stderr=stderr)
    logger.debug("stdout = %s" % (job_stdout,))
def run_collect_rnaseq_metrics(input, output, sample):
    """Set up and run the Picard CollectRnaSeqMetrics program.

    This task works differently than the others; instead of calling the
    program directly by writing out the command line string needed to run it,
    this runs a python script by calling the main function of
    ccrngspy.tasks.Picard. This is because the Picard code is based off of
    the Galaxy wrapper for Picard, and doesn't work exactly the same as the
    rest.

    2012-03-30 I will consider re-writing it so that it is consistent. (dailykm)
    """
    # Output dirs for qsub stdout and stderr.
    stdout = config['general_params']['log_file_dir']
    stderr = config['general_params']['log_file_dir']

    # Update input and output from the global config object.
    picard_params = config['picard_params']
    picard_params['input'] = input
    picard_params['output'] = output

    # Set up using the default arguments, specifying the input and output
    # files since they are required!
    cmdline = ("--maxjheap=%(maxjheap)s --jar=%(jar_file)s --input=%(input)s "
               "--output=%(output)s --ref_flat=%(ref_flat)s --ref_file=%(ref_file)s "
               "CollectRnaSeqMetrics --minimum_length=%(minimum_length)s "
               "--chart_output=%(chart_output)s "
               "--metric_accumulation_level=%(metric_accumulation_level)s "
               "--stop_after=%(stop_after)s" % picard_params)
    picard_cmd = "python -m ccrngspy.tasks.Picard %s" % cmdline

    job_stdout = utils.safe_qsub_run(picard_cmd, jobname="rnaseqmet_%s" % sample,
                                     nodes=picard_params['qsub_nodes'],
                                     stdout=stdout, stderr=stderr)
    logger.debug("stdout = %s" % (job_stdout,))
def run_flagstat(input, output, params=None):
    """Run samtools flagstat on a bam file.
    """
    params['input'] = input
    params['output'] = output

    # Output dirs for qsub stdout and stderr.
    stdout = config['general_params']['stdout_log_file_dir']
    stderr = config['general_params']['stderr_log_file_dir']

    cmd = "module load %(modules)s\n" % params
    cmd += "samtools flagstat %(input)s > %(output)s" % params

    job_id = utils.safe_qsub_run(cmd, jobname="flagstat",
                                 nodes=params['qsub_nodes'],
                                 stdout=stdout, stderr=stderr)
    logger.debug("job_id = %s" % (job_id,))
def run_cleansam(input, output, params=None):
    """Clean up a BAM file.
    """
    params['input'] = input
    params['output'] = output

    # Output dirs for qsub stdout and stderr.
    stdout = config['general_params']['stdout_log_file_dir']
    stderr = config['general_params']['stderr_log_file_dir']

    cmd = "module load %(modules)s\n" % params
    cmd += ("java -Xmx%(maxjheap)s -Djava.io.tmpdir=%(tmp_dir)s -jar %(jar_file)s "
            "INPUT=%(input)s OUTPUT=%(output)s CREATE_INDEX=true" % params)

    job_id = utils.safe_qsub_run(cmd, jobname="cleansam",
                                 nodes=params['qsub_nodes'],
                                 stdout=stdout, stderr=stderr)
    logger.debug("job_id = %s" % (job_id,))
def run_fastqc(input, output, params=None):
    """Set up and run fastqc.
    """
    # Let a parser handle setting up arguments and options.
    parser = argparse.ArgumentParser()

    # Add FastQC arguments.
    fastqc = FastQC.FastQC()
    parser = fastqc.argparse(parser)

    # Update input and output from the global config object.
    fastqc_params = config['fastqc_params']
    fastqc_params['input'] = input

    # Output dirs for qsub stdout and stderr.
    stdout = config['general_params']['log_file_dir']
    stderr = config['general_params']['log_file_dir']

    cmdline = "--outdir=%(output_dir)s --threads=%(threads)s %(input)s" % fastqc_params

    args = parser.parse_args(cmdline.split())
    fastqc.set_options(args)

    # Final command to run.
    fastqc_command = fastqc.make_command()

    job_stdout = utils.safe_qsub_run(fastqc_command, jobname="fastqc",
                                     nodes=fastqc_params['qsub_nodes'],
                                     stdout=stdout, stderr=stderr)
    logger.debug("stdout = %s" % (job_stdout,))

    # Post task: touch the output file!
    of = open(output, mode="w")
    of.close()
def run_rum(input, output, params=None):
    """Run RUM on paired reads.
    """
    # Let a parser handle setting up arguments and options.
    parser = argparse.ArgumentParser()

    # Add RUM arguments.
    rum = RUM.RUMrunner()
    parser = rum.argparse(parser)

    # Update input and output from the global config object.
    rum_params = config['rum_params']
    rum_params['input'] = input

    # Output dirs for qsub stdout and stderr.
    stdout = config['general_params']['log_file_dir']
    stderr = config['general_params']['log_file_dir']

    ## fastq read files
    rum_params['file1'] = input[0]
    rum_params['file2'] = input[1]
    rum_params['sample'] = params['sample']

    cmdline = ("--rum_config_file=%(config_file)s --rum_run_name=%(sample)s "
               "--rum_outdir=%(output_dir)s/%(sample)s "
               "--rum_read_files %(file1)s %(file2)s "
               "--rum_chunks=%(chunks)s --rum_ram=%(ram_per_chunk)s" % rum_params)

    args = parser.parse_args(cmdline.split())
    rum.set_options(args)
    rum_command = rum.make_command()

    job_stdout = utils.safe_qsub_run(rum_command, jobname="rum_%s" % params['sample'],
                                     nodes=rum_params['qsub_nodes'],
                                     params="-l walltime=168:00:00",
                                     stdout=stdout, stderr=stderr)
    logger.debug("stdout = %s" % (job_stdout,))
def run_bwa_aln(input, output, params=None):
    """Run bwa aln on individual gzipped fastq files.
    """
    params['input'] = " ".join(input)
    params['output'] = output

    cmd = "module load %(modules)s\n" % params
    cmd += "bwa aln -t %(threads)s %(reference_fasta)s %(input)s > %(output)s" % params

    logger.debug("cmd = %s" % (cmd,))

    # Output dirs for qsub stdout and stderr.
    stdout = config['general_params']['stdout_log_file_dir']
    stderr = config['general_params']['stderr_log_file_dir']

    job_id = utils.safe_qsub_run(cmd, jobname="bwa_aln",
                                 nodes=params['qsub_nodes'],
                                 stdout=stdout, stderr=stderr)
    logger.debug("job_id = %s" % (job_id,))
def run_filterbam(input, output, params=None):
    """Filter a BAM file with samtools view, excluding flagged reads and
    requiring mapping quality >= 1.
    """
    params['input'] = input
    params['output'] = output

    cmd = "module load %(modules)s\n" % params
    cmd += "samtools view -h -F uUfd -q 1 -b %(input)s > %(output)s" % params

    logger.debug("cmd = %s" % (cmd,))

    # Output dirs for qsub stdout and stderr.
    stdout = config['general_params']['stdout_log_file_dir']
    stderr = config['general_params']['stderr_log_file_dir']

    job_id = utils.safe_qsub_run(cmd, jobname="filterbam",
                                 nodes=params['qsub_nodes'],
                                 stdout=stdout, stderr=stderr)
    logger.debug("job_id = %s" % (job_id,))
def run_index_splitbam(input, output, params=None):
    """Run samtools index on a bam file.
    """
    # Update input and output from the global config object.
    params['input'] = input
    params['output'] = output

    # Output dirs for qsub stdout and stderr.
    stdout = config['general_params']['stdout_log_file_dir']
    stderr = config['general_params']['stderr_log_file_dir']

    cmd = "module load %(modules)s\n" % params
    cmd += "samtools index %(input)s %(output)s" % params

    job_id = utils.safe_qsub_run(cmd, jobname="splitbamindex",
                                 nodes=params['qsub_nodes'],
                                 stdout=stdout, stderr=stderr)
    logger.debug("job_id = %s" % (job_id,))
def run_merge_bfastmatch(input, output, sample_name=None, index=None):
    """Merge bfast match files using the bfast bmfmerge utility with default
    parameters.
    """
    # Update input and output from the global config object.
    merge_params = config['bfast_mergematch_params']
    merge_params['input'] = " ".join(input)
    merge_params['output'] = output

    # Output dirs for qsub stdout and stderr.
    stdout = config['general_params']['stdout_log_file_dir']
    stderr = config['general_params']['stderr_log_file_dir']

    cmd = ("module load %(modules)s\n"
           "%(exec)s %(input)s > %(output)s" % merge_params)

    job_id = utils.safe_qsub_run(cmd, jobname="mergebfmatch",
                                 nodes=merge_params['qsub_nodes'],
                                 stdout=stdout, stderr=stderr)
    logger.debug("job_id = %s" % (job_id,))
def run_sortbam(input, output, output_prefix, params=None):
    """Sort a bam file.
    """
    params['input'] = input
    # samtools sort takes an output prefix (without the .bam suffix); the -f
    # option did not work, so the prefix version is used instead.
    params['output'] = output_prefix

    cmd = "module load %(modules)s\n" % params
    cmd += 'samtools sort -m 500M -@ %(threads)s %(input)s %(output)s' % params

    logger.debug("cmd = %s" % (cmd,))

    # Output dirs for qsub stdout and stderr.
    stdout = config['general_params']['stdout_log_file_dir']
    stderr = config['general_params']['stderr_log_file_dir']

    job_id = utils.safe_qsub_run(cmd, jobname="sortbam",
                                 nodes=params['qsub_nodes'],
                                 stdout=stdout, stderr=stderr)
    logger.debug("job_id = %s" % (job_id,))
def run_split_bam(input, output, params=None):
    """Split BAM files into separate tumor/normal files based on read group.
    """
    params['input'] = input
    ## params['output_dir'] = config['gatk_base_score_recal_params']['output_dir']

    # Output dirs for qsub stdout and stderr.
    stdout = config['general_params']['stdout_log_file_dir']
    stderr = config['general_params']['stderr_log_file_dir']

    # The %(SM)s in the output file expression is deliberately not expanded by
    # this string formatting; it is presumably expanded per sample tag by the
    # splitting script itself.
    cmd = '%(exec)s %(input)s --output_dir=%(output_dir)s ' % params
    cmd += '--output_file_expr="%(SM)s.bam"'

    logger.debug("cmd = %s" % (cmd,))

    job_id = utils.safe_qsub_run(cmd, jobname="splitbam",
                                 nodes=params['qsub_nodes'],
                                 # params=params['qsub_params'],
                                 stdout=stdout, stderr=stderr)
    logger.debug("job_id = %s" % (job_id,))
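# The splitting executable (%(exec)s above) is configured elsewhere and not
# shown in this module. Purely as an illustration of what the "%(SM)s.bam"
# output expression implies, here is a rough sketch that splits a BAM by the
# SM tag of each read's read group; it assumes pysam and is not how the
# configured script necessarily works.
def _split_bam_by_sample(bam_path, output_dir):
    """Write one BAM per SM (sample) tag, named '<SM>.bam'."""
    bam = pysam.Samfile(bam_path, "rb")
    header = bam.header if isinstance(bam.header, dict) else bam.header.to_dict()
    rg_to_sm = dict((rg["ID"], rg.get("SM", "unknown"))
                    for rg in header.get("RG", []))
    writers = {}
    for read in bam:
        # Map each read's RG tag back to its sample name.
        tags = dict(read.tags or [])
        sm = rg_to_sm.get(tags.get("RG"), "unknown")
        if sm not in writers:
            out_path = os.path.join(output_dir, "%s.bam" % sm)
            writers[sm] = pysam.Samfile(out_path, "wb", template=bam)
        writers[sm].write(read)
    for writer in writers.values():
        writer.close()
    bam.close()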
def run_bam2cfg(input, output, params=None):
    """Create breakdancer config files.
    """
    # Update input and output from the global config object.
    params['input'] = input
    params['output'] = output

    # Output dirs for qsub stdout and stderr.
    stdout = config['general_params']['stdout_log_file_dir']
    stderr = config['general_params']['stderr_log_file_dir']

    cmd = "module load %(modules)s\n" % params
    cmd += "%(exec)s -g -h %(input)s > %(output)s" % params

    logger.debug("cmd = %s" % (cmd,))

    job_id = utils.safe_qsub_run(cmd, jobname="bam2cfg",
                                 nodes=params['qsub_nodes'],
                                 stdout=stdout, stderr=stderr)
    logger.debug("job_id = %s" % (job_id,))
def run_breakdancer(input, output, bedfile, fastqfile, params=None):
    """Run breakdancer, only to find intra-chromosomal structural variations
    (-t param).
    """
    params['input'] = input
    params['output'] = output
    params['bedfile'] = bedfile
    params['fastqfile'] = fastqfile

    # Output dirs for qsub stdout and stderr.
    stdout = config['general_params']['stdout_log_file_dir']
    stderr = config['general_params']['stderr_log_file_dir']

    cmd = "module load %(modules)s\n" % params
    cmd += ("%(exec)s -g %(bedfile)s -d %(fastqfile)s %(transchrom)s "
            "-r %(min_reads)s %(input)s > %(output)s" % params)

    logger.debug("cmd = %s" % (cmd,))

    job_id = utils.safe_qsub_run(cmd, jobname="breakdancer",
                                 nodes=params['qsub_nodes'],
                                 stdout=stdout, stderr=stderr)
    logger.debug("job_id = %s" % (job_id,))