def _metrics_are_fresh(args, input_path, metrics_file_path):
    """Tell whether metrics previously derived from one input file may be reused.

    Parameters
    ----------
    args : argparse.Namespace
        Only the forceFlag attribute is read.
    input_path : str
        Path of the input file the metric is calculated from.
    metrics_file_path : str
        Path of the metrics file holding previously calculated values.

    Returns
    -------
    bool
        True when forceFlag is not set and utils.target_needs_rebuild reports
        that the metrics file does not need to be rebuilt for this input.
    """
    needs_rebuild = utils.target_needs_rebuild([input_path], metrics_file_path)
    return not args.forceFlag and not needs_rebuild


def _to_int_or_none(value):
    """Return value converted to int, or None when it is empty or not an integer.

    Values reused from the metrics file are strings, so they must be converted
    before they can be compared with the integer SNP limit.
    """
    try:
        return int(value)
    except (TypeError, ValueError):
        return None


def _sum_insert_sizes(bam_path, sample_dir):
    """Sum the absolute inferred insert sizes of properly paired first reads.

    Parameters
    ----------
    bam_path : str
        Path of the sorted BAM file.
    sample_dir : str
        Sample directory, used to hold the temporary samtools output.

    Returns
    -------
    tuple
        (insert_sum, insert_count) over all lines that parse as integers.
    """
    # Only a unique file name is needed; the handle is closed immediately so it
    # does not leak while the external command writes to the same path.
    temp_file = NamedTemporaryFile(delete=False, dir=sample_dir, prefix="tmp.inserts.", mode='w')
    temp_file.close()
    insert_count = 0
    insert_sum = 0
    try:
        # Extract inferred insert sizes (TLEN, column 9 of BAM file) for reads
        # "mapped in proper pair" (2) and "first in pair" (64) = 66
        command.run("samtools view -f 66 " + bam_path + " | cut -f 9 | sed 's/^-//'", temp_file.name)
        with open(temp_file.name) as f:
            for line in f:
                try:
                    insert_sum += int(line)
                    insert_count += 1
                except ValueError:
                    pass  # skip lines that are not integers
    finally:
        # Remove the temporary file even when samtools or the parsing fails
        os.unlink(temp_file.name)
    return insert_sum, insert_count


def collect_metrics(args):
    """Collect the quality metrics and SNP metrics for a sample.

    This function expects, or creates '(*)', the following files arranged
    in the following way:
            reference
                referenceFile.fasta
            samples
                sample_name_one/*.fastq.gz
                sample_name_one/reads.sam
                sample_name_one/reads.sorted.deduped.bam
                sample_name_one/reads.sorted.bam
                sample_name_one/reads.all.pileup
                sample_name_one/var.flt.vcf
                sample_name_one/var.flt_preserved.vcf
                sample_name_one/consensus.fasta
                sample_name_one/consensus_preserved.fasta
                sample_name_one/consensus.vcf
                sample_name_one/consensus_preserved.vcf
                sample_name_one/metrics*

    The input files are created outside of this function. The package
    documentation provides an example of preparing these files based on the
    lambda_virus sequence that is used as one test for this package.

    Each metric is recalculated only when its input file is newer than the
    metrics file or when forceFlag is set; otherwise the value stored in the
    existing metrics file is reused. The metrics file is always rewritten.

    Parameters
    ----------
    args : argparse.Namespace
        referenceFile : File path of the reference fasta file
        sampleDir : Relative or absolute directory of the sample
        consensusFastaFileName : File name of the consensus fasta file which must exist in the sample directory
        consensusPreservedFastaFileName : File name of the consensus preserved fasta file which must exist in the sample directory
        consensusVcfFileName : File name of the consensus vcf file which must exist in the sample directory
        consensusPreservedVcfFileName : File name of the consensus preserved vcf file which must exist in the sample directory
        maxSnps : Maximum allowed number of SNPs per sample
        metricsFile : Output file.  Relative or absolute path to the metrics file
        forceFlag : When true, recalculate all metrics instead of reusing values from an existing metrics file
    """
    utils.print_log_header(classpath=True)
    utils.print_arguments(args)

    #==========================================================================
    # Validate inputs
    #==========================================================================

    # Verify reference fasta file exists and is not empty
    reference_file_path = args.referenceFile
    utils.verify_non_empty_input_files("Reference file", [reference_file_path], error_handler="global")

    sample_dir = args.sampleDir
    utils.verify_non_empty_directory("Sample directory", sample_dir, error_handler="sample", continue_possible=False)

    metrics_file_path = args.metricsFile
    max_allowed_snps = args.maxSnps
    consensus_vcf_file_name = args.consensusVcfFileName
    consensus_preserved_vcf_file_name = args.consensusPreservedVcfFileName
    consensus_fasta_file_name = args.consensusFastaFileName
    consensus_preserved_fasta_file_name = args.consensusPreservedFastaFileName

    sample_id = utils.sample_id_from_dir(sample_dir)

    #==========================================================================
    # Read existing metrics file so some metrics can be reused
    #==========================================================================
    try:
        metrics = utils.read_properties(metrics_file_path)
    except IOError:
        metrics = dict()

    #-------------------------
    verbose_print("# %s %s" % (utils.timestamp(), "Get machine and flowcell from fastq header"))
    #-------------------------
    machine = ""
    flowcell = ""
    fastq_files = fastq.list_fastq_files(sample_dir)
    fastq_files = [f for f in fastq_files if os.path.isfile(f)]  # Exclude broken symlinks
    if not fastq_files:
        handle_error("No fastq files were found.")
    else:
        tags = fastq.extract_metadata_tags(fastq_files[0])
        if tags:
            machine = tags.instrument or ""
            flowcell = tags.flow_cell or ""

    #-------------------------
    verbose_print("# %s %s" % (utils.timestamp(), "Sum file sizes of paired fastq files"))
    #-------------------------
    fastq_file_size = ""
    fastq_file_list = ""
    if fastq_files:
        fastq_file_size = sum(os.path.getsize(path) for path in fastq_files)
        # Make a comma separated list of just the fastq file names without directories
        fastq_file_list = ", ".join(os.path.basename(path) for path in fastq_files)

    #-------------------------
    verbose_print("# %s %s" % (utils.timestamp(), "Calculate number of reads and %mapped from sam file"))
    #-------------------------
    num_reads = ""
    percent_reads_mapped = ""
    sam_path = os.path.join(sample_dir, "reads.sam")
    if verify_input_file("SAM file", sam_path):
        if _metrics_are_fresh(args, sam_path, metrics_file_path):
            num_reads = metrics.get("numberReads", "")
            percent_reads_mapped = metrics.get("percentReadsMapped", "")
        if num_reads and percent_reads_mapped:
            verbose_print("Reusing previously calculated number of reads and %mapped")
        else:
            num_reads = command.run("samtools view -S -c " + sam_path).strip()
            mapped = command.run("samtools view -S -c -F 4 " + sam_path).strip()
            try:
                percent_reads_mapped = "%.2f" % (100.0 * float(mapped) / float(num_reads))
            except (ValueError, ZeroDivisionError):
                # Non-numeric samtools output, or a sam file with zero reads
                percent_reads_mapped = ""
                handle_error("Cannot calculate number of reads and %mapped.")

    #-------------------------
    # Calculate number of duplicate reads from deduped bam file
    #-------------------------
    num_dup_reads = ""
    remove_duplicate_reads = (os.environ.get("RemoveDuplicateReads") or "true").lower()
    if remove_duplicate_reads == "true":
        verbose_print("# %s %s" % (utils.timestamp(), "Calculate number of duplicate reads from deduped bam file"))
        deduped_bam_path = os.path.join(sample_dir, "reads.sorted.deduped.bam")
        if verify_input_file("Deduped BAM file", deduped_bam_path):
            if _metrics_are_fresh(args, deduped_bam_path, metrics_file_path):
                num_dup_reads = metrics.get("numberDupReads", "")
            if num_dup_reads:
                verbose_print("Reusing previously calculated number of duplicate reads")
            else:
                num_dup_reads = command.run("samtools view -S -c -f 1024 " + deduped_bam_path).strip()

    #-------------------------
    verbose_print("# %s %s" % (utils.timestamp(), "Calculate mean insert size from bam file"))
    #-------------------------
    ave_insert_size = ""
    bam_path = os.path.join(sample_dir, "reads.sorted.bam")
    if verify_input_file("BAM file", bam_path):
        if _metrics_are_fresh(args, bam_path, metrics_file_path):
            ave_insert_size = metrics.get("aveInsertSize", "")
        if ave_insert_size:
            verbose_print("Reusing previously calculated mean insert size")
        else:
            insert_sum, insert_count = _sum_insert_sizes(bam_path, sample_dir)
            if insert_count > 0 and insert_sum > 0:
                ave_insert_size = "%.2f" % (float(insert_sum) / float(insert_count))
            else:
                handle_error("Cannot calculate mean insert size.")

    #-------------------------
    verbose_print("# %s %s" % (utils.timestamp(), "Calculate mean depth from pileup file"))
    #-------------------------
    ave_pileup_depth = ""
    pileup_path = os.path.join(sample_dir, "reads.all.pileup")
    if verify_input_file("Pileup file", pileup_path):
        if _metrics_are_fresh(args, pileup_path, metrics_file_path):
            ave_pileup_depth = metrics.get("avePileupDepth", "")
        if ave_pileup_depth:
            verbose_print("Reusing previously calculated mean pileup depth")
        else:
            # The depth is read from the 4th whitespace-separated column of each pileup line
            depth_sum = 0
            with open(pileup_path) as f:
                for line in f:
                    tokens = line.split()
                    try:
                        depth_sum += int(tokens[3])
                    except (ValueError, IndexError):
                        pass  # skip short or malformed lines
            reference_length = 0
            for record in SeqIO.parse(reference_file_path, "fasta"):
                reference_length += len(record)
            if depth_sum > 0 and reference_length > 0:
                ave_pileup_depth = "%.2f" % (float(depth_sum) / float(reference_length))
            else:
                handle_error("Cannot calculate mean pileup depth.")

    #-------------------------
    verbose_print("# %s %s" % (utils.timestamp(), "Count number of high confidence SNP positions from phase 1 vcf file"))
    #-------------------------
    phase1_snps = ""
    excluded_sample = ""
    vcf_path = os.path.join(sample_dir, "var.flt.vcf")
    if verify_input_file("VCF file", vcf_path):
        snp_count = None
        if _metrics_are_fresh(args, vcf_path, metrics_file_path):
            # Reused values are strings; convert so the limit comparison below is numeric
            snp_count = _to_int_or_none(metrics.get("phase1Snps", ""))
        if snp_count is not None:
            verbose_print("Reusing previously calculated phase1 snps")
        else:
            snp_count = count_vcf_file_snps(vcf_path)
        # Flag excessive snps
        if max_allowed_snps > 0 and snp_count > max_allowed_snps:
            excluded_sample = "Excluded"
            handle_error("Excluded: exceeded %i maxsnps." % max_allowed_snps)
        phase1_snps = str(snp_count)

    #-------------------------
    verbose_print("# %s %s" % (utils.timestamp(), "Count number of filter_regions preserved high confidence SNP positions from phase 1 vcf file"))
    #-------------------------
    phase1_snps_preserved = ""
    excluded_sample_preserved = ""
    preserved_vcf_path = os.path.join(sample_dir, "var.flt_preserved.vcf")
    if verify_input_file("VCF file", preserved_vcf_path):
        snp_count_preserved = None
        if _metrics_are_fresh(args, preserved_vcf_path, metrics_file_path):
            # Reused values are strings; convert so the limit comparison below is numeric
            snp_count_preserved = _to_int_or_none(metrics.get("phase1SnpsPreserved", ""))
        if snp_count_preserved is not None:
            verbose_print("Reusing previously calculated preserved phase1 snps")
        else:
            snp_count_preserved = count_vcf_file_snps(preserved_vcf_path)
        # Flag excessive snps
        if max_allowed_snps > 0 and snp_count_preserved > max_allowed_snps:
            excluded_sample_preserved = "Excluded"
            handle_error("Excluded: preserved exceeded %i maxsnps." % max_allowed_snps)
        phase1_snps_preserved = str(snp_count_preserved)

    #-------------------------
    verbose_print("# %s %s" % (utils.timestamp(), "Count number of consensus snps from consensus vcf file"))
    #-------------------------
    phase2_snps = ""
    consensus_vcf_path = os.path.join(sample_dir, consensus_vcf_file_name)
    if verify_input_file("Consensus VCF file", consensus_vcf_path):
        # Omit the phase2 snp count if the sample is excluded.
        # It will be meaningless since this sample's phase1 snps are excluded from the snplist.
        if excluded_sample != "Excluded":
            if _metrics_are_fresh(args, consensus_vcf_path, metrics_file_path):
                phase2_snps = metrics.get("snps", "")
            if phase2_snps:
                verbose_print("Reusing previously calculated phase2 snps")
            else:
                phase2_snps = str(count_vcf_file_snps(consensus_vcf_path))

    #-------------------------
    verbose_print("# %s %s" % (utils.timestamp(), "Count number of preserved consensus snps from consensus vcf file"))
    #-------------------------
    phase2_snps_preserved = ""
    consensus_preserved_vcf_path = os.path.join(sample_dir, consensus_preserved_vcf_file_name)
    if verify_input_file("Consensus VCF file", consensus_preserved_vcf_path):
        # Omit the phase2 snp count if the sample is excluded.
        # It will be meaningless since this sample's phase1 snps are excluded from the snplist.
        if excluded_sample_preserved != "Excluded":
            if _metrics_are_fresh(args, consensus_preserved_vcf_path, metrics_file_path):
                phase2_snps_preserved = metrics.get("snpsPreserved", "")
            if phase2_snps_preserved:
                verbose_print("Reusing previously calculated preserved phase2 snps")
            else:
                phase2_snps_preserved = str(count_vcf_file_snps(consensus_preserved_vcf_path))

    #------------------------------------------
    verbose_print("# %s %s" % (utils.timestamp(), "Count missing positions in the snp matrix"))
    #------------------------------------------
    missing_pos = ""
    consensus_fasta_path = os.path.join(sample_dir, consensus_fasta_file_name)
    if verify_input_file("Consensus fasta file", consensus_fasta_path):
        # Omit the phase2 gap count if the sample is excluded.
        # It will be meaningless since this sample's phase1 snps are excluded from the snplist.
        if excluded_sample != "Excluded":
            if _metrics_are_fresh(args, consensus_fasta_path, metrics_file_path):
                missing_pos = metrics.get("missingPos", "")
            if missing_pos:
                verbose_print("Reusing previously calculated missing positions")
            else:
                missing_pos = str(count_missing_snp_matrix_positions(consensus_fasta_path, sample_id))

    #------------------------------------------
    verbose_print("# %s %s" % (utils.timestamp(), "Count missing positions in the preserved snp matrix"))
    #------------------------------------------
    missing_pos_preserved = ""
    consensus_preserved_fasta_path = os.path.join(sample_dir, consensus_preserved_fasta_file_name)
    if verify_input_file("Consensus fasta file", consensus_preserved_fasta_path):
        # Omit the phase2 gap count if the sample is excluded.
        # It will be meaningless since this sample's phase1 snps are excluded from the snplist.
        if excluded_sample_preserved != "Excluded":
            if _metrics_are_fresh(args, consensus_preserved_fasta_path, metrics_file_path):
                missing_pos_preserved = metrics.get("missingPosPreserved", "")
            if missing_pos_preserved:
                verbose_print("Reusing previously calculated missing positions")
            else:
                missing_pos_preserved = str(count_missing_snp_matrix_positions(consensus_preserved_fasta_path, sample_id))

    #-------------------------
    verbose_print("# %s %s" % (utils.timestamp(), "Print results"))
    #-------------------------
    with open(metrics_file_path, "w") as f:
        print("sample=" + '"' + sample_id + '"', file=f)
        print("fastqFileList=" + '"' + fastq_file_list + '"', file=f)
        print("fastqFileSize=" + str(fastq_file_size), file=f)
        print("machine=" + machine, file=f)
        print("flowcell=" + flowcell, file=f)
        print("numberReads=" + num_reads, file=f)
        print("numberDupReads=" + num_dup_reads, file=f)
        print("percentReadsMapped=" + percent_reads_mapped, file=f)
        print("aveInsertSize=" + ave_insert_size, file=f)
        print("avePileupDepth=" + ave_pileup_depth, file=f)
        print("phase1Snps=" + phase1_snps, file=f)
        print("phase1SnpsPreserved=" + phase1_snps_preserved, file=f)
        print("snps=" + phase2_snps, file=f)
        print("snpsPreserved=" + phase2_snps_preserved, file=f)
        print("missingPos=" + missing_pos, file=f)
        print("missingPosPreserved=" + missing_pos_preserved, file=f)
        print("excludedSample=" + excluded_sample, file=f)
        print("excludedSamplePreserved=" + excluded_sample_preserved, file=f)
        # error_list is module-level state; presumably filled by handle_error -- TODO confirm
        print("errorList=" + '"' + ' '.join(error_list) + '"', file=f)
def run(args): """Run all the steps of the snp pipeline in th correct order. Parameters ---------- args : Namespace referenceFile : str Relative or absolute path to the reference fasta file forceFlag : bool Force processing even when result files already exist and are newer than inputs mirror : str Mode to create a mirror copy of the reference directory and all the sample directories. Possible values: {soft, hard, copy} configFile : str Relative or absolute path to a configuration file for overriding defaults and defining extra parameters for the tools and scripts within the pipeline. jobQueueMgr : str Job queue manager for remote parallel job execution in an HPC environment. Currently "torque" and "grid" are supported. If not specified, the pipeline will execute locally. workDir : str Output directory for the result files. samplesDir : str Relative or absolute path to the parent directory of all the sample directories. samplesFile : str Relative or absolute path to a file listing all of the sample directories. """ global log_dir global job_queue_mgr # Where are we running: grid, torque, or None (local) job_queue_mgr = args.jobQueueMgr # Erase any left-over error log environment variable from a previous run os.environ.pop("errorOutputFile", None) # the 2nd arg avoids an exception when not in dict # Handle output working directory. Create the directory if it does not exist. # Any errors creating the work_dir will not be logged to the error log because # the error log belongs in the work_dir. work_dir = args.workDir try: utils.mkdir_p(work_dir) except OSError as exc: utils.fatal_error("Error: could not create the output directory %s" % work_dir) if not utils.is_directory_writeable(work_dir): utils.fatal_error("Error: output directory % is not writable." 
% work_dir) # The error log is in the main workdir error_output_file = os.path.join(work_dir, "error.log") os.environ["errorOutputFile"] = error_output_file # TODO: copy old error log to old logs directory, because otherwise it will be removed and lost forever if os.path.isfile(error_output_file): os.remove(error_output_file) # Validate reference fasta file reference_file_path = args.referenceFile if not os.path.isfile(reference_file_path): utils.fatal_error("Error: reference file %s does not exist." % reference_file_path) if os.path.getsize(reference_file_path) == 0: utils.fatal_error("Error: reference file %s is empty." % reference_file_path) reference_file_name = os.path.basename(reference_file_path) # Force rebuild flag is passed to all the subtask commands below force_flag = " -f " if args.forceFlag else " " # Create the logs directory with name like "logs-20170215.144253" run_time_stamp = time.strftime('%Y%m%d.%H%M%S', time.localtime()) log_dir = os.path.join(work_dir, "logs-" + run_time_stamp) try: utils.mkdir_p(log_dir) except OSError as exc: utils.fatal_error("Error: could not create the logs directory %s" % log_dir) if not utils.is_directory_writeable(work_dir): utils.fatal_error("Error: logs directory % is not writable." % log_dir) # Handle configuration file, use the specified file, or create a default file if args.configFile: config_file_path = args.configFile if not os.path.isfile(config_file_path): utils.fatal_error("Error: configuration file %s does not exist." % config_file_path) if os.path.getsize(config_file_path) == 0: utils.fatal_error("Error: configuration file %s is empty." 
% config_file_path) shutil.copy2(config_file_path, log_dir) # copy2 tries to preserve timestamps config_params = utils.read_properties(config_file_path, recognize_vars=True) validate_properties(config_params) else: command.run("cfsan_snp_pipeline data configurationFile " + log_dir, outfile=sys.stdout) config_file_path = os.path.join(log_dir, "snppipeline.conf") config_params = utils.read_properties(config_file_path, recognize_vars=True) # Validate the configured aligner choice snp_pipeline_aligner = config_params.get("SnpPipeline_Aligner", "").lower() or "bowtie2" if snp_pipeline_aligner not in ["bowtie2", "smalt"]: utils.fatal_error("Config file error in SnpPipeline_Aligner parameter: only bowtie2 and smalt aligners are supported.") os.environ["SnpPipeline_Aligner"] = snp_pipeline_aligner # Stop the pipeline by default upon single sample errors if not configured either way # The environment variable is used by called processes stop_on_error = config_params.get("StopOnSampleError", "").lower() or "true" os.environ["StopOnSampleError"] = stop_on_error # Convert the stop_on_error flag to boolean for internal use in this function stop_on_error = stop_on_error == "true" # How many CPU cores can we use? max_cpu_cores = config_params.get("MaxCpuCores", None) if max_cpu_cores == "": max_cpu_cores = None if max_cpu_cores: try: max_cpu_cores = int(max_cpu_cores) if max_cpu_cores < 1: utils.fatal_error("Config file error in MaxCpuCores parameter: %s is less than one." % max_cpu_cores) except ValueError: utils.fatal_error("Config file error in MaxCpuCores parameter: %s is not a valid number." 
% max_cpu_cores) if job_queue_mgr is None: # workstation num_local_cpu_cores = psutil.cpu_count() max_cpu_cores = min(num_local_cpu_cores, max_cpu_cores) if max_cpu_cores else num_local_cpu_cores # Put the configuration parameters into the process environment variables os.environ["Bowtie2Build_ExtraParams"] = config_params.get("Bowtie2Build_ExtraParams", "") os.environ["SmaltIndex_ExtraParams"] = config_params.get("SmaltIndex_ExtraParams", "") os.environ["SamtoolsFaidx_ExtraParams"] = config_params.get("SamtoolsFaidx_ExtraParams", "") os.environ["Bowtie2Align_ExtraParams"] = config_params.get("Bowtie2Align_ExtraParams", "") os.environ["SmaltAlign_ExtraParams"] = config_params.get("SmaltAlign_ExtraParams", "") os.environ["SamtoolsSamFilter_ExtraParams"] = config_params.get("SamtoolsSamFilter_ExtraParams", "") os.environ["SamtoolsSort_ExtraParams"] = config_params.get("SamtoolsSort_ExtraParams", "") os.environ["RemoveDuplicateReads"] = config_params.get("RemoveDuplicateReads", "").lower() or "true" os.environ["PicardMarkDuplicates_ExtraParams"] = config_params.get("PicardMarkDuplicates_ExtraParams", "") os.environ["PicardJvm_ExtraParams"] = config_params.get("PicardJvm_ExtraParams", "") os.environ["SamtoolsMpileup_ExtraParams"] = config_params.get("SamtoolsMpileup_ExtraParams", "") os.environ["VarscanMpileup2snp_ExtraParams"] = config_params.get("VarscanMpileup2snp_ExtraParams", "") os.environ["VarscanJvm_ExtraParams"] = config_params.get("VarscanJvm_ExtraParams", "") os.environ["FilterRegions_ExtraParams"] = config_params.get("FilterRegions_ExtraParams", "") os.environ["MergeSites_ExtraParams"] = config_params.get("MergeSites_ExtraParams", "") os.environ["CallConsensus_ExtraParams"] = config_params.get("CallConsensus_ExtraParams", "") os.environ["SnpMatrix_ExtraParams"] = config_params.get("SnpMatrix_ExtraParams", "") os.environ["BcftoolsMerge_ExtraParams"] = config_params.get("BcftoolsMerge_ExtraParams", "") os.environ["SnpReference_ExtraParams"] = 
config_params.get("SnpReference_ExtraParams", "") os.environ["MergeVcfs_ExtraParams"] = config_params.get("MergeVcfs_ExtraParams", "") os.environ["CollectMetrics_ExtraParams"] = config_params.get("CollectMetrics_ExtraParams", "") os.environ["CombineMetrics_ExtraParams"] = config_params.get("CombineMetrics_ExtraParams", "") # Verify the dependencies are available on the path dependencies = ["cfsan_snp_pipeline", snp_pipeline_aligner, "samtools", "java", "tabix", "bgzip", "bcftools"] found_all_dependencies = True for executable in dependencies: if not utils.which(executable): utils.report_error(executable + " is not on the path") found_all_dependencies = False stdout = command.run("java net.sf.varscan.VarScan 2>&1") if "Error" in stdout: utils.report_error("CLASSPATH is not configured with the path to VarScan") found_all_dependencies = False if os.environ["RemoveDuplicateReads"] == "true": stdout = command.run("java picard.cmdline.PicardCommandLine 2>&1") if "Error" in stdout: utils.report_error("CLASSPATH is not configured with the path to Picard") found_all_dependencies = False if not found_all_dependencies: utils.fatal_error("Check the SNP Pipeline installation instructions here: http://snp-pipeline.readthedocs.org/en/latest/installation.html") # Process the sample directory command line option # TODO: detect broken fastq symlinks if args.samplesDir: samples_parent_dir = args.samplesDir.rstrip('/') # strip trailing slash if not utils.verify_non_empty_directory("Samples directory", samples_parent_dir): sys.exit(1) # verify at least one of the subdirectories contains fastq files. dir_sizes = get_sorted_sample_dirs_fastq_sizes(samples_parent_dir) dir_sizes = [(size, path) for size, path in dir_sizes if size > 0] if len(dir_sizes) == 0: utils.fatal_error("Samples directory %s does not contain subdirectories with fastq files." 
% samples_parent_dir) sample_dirs_file = os.path.join(work_dir, "sampleDirectories.txt") persist_sorted_sample_dirs_file(samples_parent_dir, sample_dirs_file) # Process the file of sample directories command line option # TODO: detect broken fastq symlinks if args.samplesFile: sample_dirs_file = args.samplesFile if not os.path.isfile(sample_dirs_file): utils.fatal_error("Error: the file of samples directories, %s, does not exist." % sample_dirs_file) if os.path.getsize(sample_dirs_file) == 0: utils.fatal_error("Error: the file of samples directories, %s, is empty." % sample_dirs_file) rewrite_cleansed_file_of_sample_dirs(sample_dirs_file, os.path.join(work_dir, "sampleDirectories.txt")) sample_dirs_file = os.path.join(work_dir, "sampleDirectories.txt") validate_file_of_sample_dirs(sample_dirs_file) with open(sample_dirs_file) as f: sample_dirs_list = f.read().splitlines() sample_count = len(sample_dirs_list) # -------------------------------------------------------- if job_queue_mgr is None: progress("Step 1 - Prep work") else: print("Step 1 - Prep work") # -------------------------------------------------------- # Mirror the input reference and samples if requested # TODO: make this a pure python solution if args.mirror: if args.mirror == "soft": # soft link, subsequent freshness checks use the timestamp of original file, not the soft link mirror_flag = " -s " elif args.mirror == "hard": # hard link, automatically preserves attributes of the original file mirror_flag = " -l " else: # regular copy, -p explicitly preserves attributes of the original file mirror_flag = " -p " # flush stdout to keep the unbuffered stderr in chronological order with stdout sys.stdout.flush() # Mirror/link the reference work_reference_dir = os.path.join(work_dir, "reference") utils.mkdir_p(work_reference_dir) src_reference_file = os.path.abspath(reference_file_path) cmd = "cp -v -u -f" + mirror_flag + src_reference_file + ' ' + work_reference_dir subprocess.check_call(cmd, shell=True) # 
since we mirrored the reference, we need to update our reference location reference_file_path = os.path.join(work_reference_dir, reference_file_name) # Mirror/link the samples work_samples_parent_dir = os.path.join(work_dir, "samples") for directory in sample_dirs_list: basedir = os.path.basename(directory) work_sample_dir = os.path.join(work_samples_parent_dir, basedir) utils.mkdir_p(work_sample_dir) src_sample_dir = os.path.abspath(directory) # copy without stderr message and without exit error code because the fastq or fq files might not exist cmd = "cp -r -v -u -f" + mirror_flag + src_sample_dir + "/*.fastq* " + work_sample_dir + " 2> /dev/null || true" subprocess.check_call(cmd, shell=True) cmd = "cp -r -v -u -f" + mirror_flag + src_sample_dir + "/*.fq* " + work_sample_dir + " 2> /dev/null || true" subprocess.check_call(cmd, shell=True) # since we mirrored the samples, we need to update our sorted list of samples sample_dirs_file = os.path.join(work_dir, "sampleDirectories.txt") persist_sorted_sample_dirs_file(work_samples_parent_dir, sample_dirs_file) # refresh the list of sample dirs -- now in sorted order with open(sample_dirs_file) as f: sample_dirs_list = f.read().splitlines() # get the *.fastq or *.fq files in each sample directory, possibly compresessed, on one line per sample, ready to feed to bowtie sample_full_path_names_file = os.path.join(work_dir, "sampleFullPathNames.txt") with open(sample_full_path_names_file, 'w') as f: for directory in sample_dirs_list: file_list = fastq.list_fastq_files(directory) print(' '.join(file_list), file=f) # Initialize the job runner if job_queue_mgr is None: runner = JobRunner("local", exception_handler=handle_exception, verbose=args.verbose >= 4) elif job_queue_mgr == "grid": strip_job_array_suffix = config_params.get("GridEngine_StripJobArraySuffix", "true").lower() qsub_extra_params = config_params.get("GridEngine_QsubExtraParams") runner = JobRunner(job_queue_mgr, strip_job_array_suffix == "true", 
qsub_extra_params=qsub_extra_params, verbose=args.verbose >= 4) else: strip_job_array_suffix = config_params.get("Torque_StripJobArraySuffix", "false").lower() qsub_extra_params = config_params.get("Torque_QsubExtraParams") runner = JobRunner(job_queue_mgr, strip_job_array_suffix == "true", qsub_extra_params=qsub_extra_params, verbose=args.verbose >= 4) progress("Step 2 - Index the reference") log_file = os.path.join(log_dir, "indexRef.log") command_line = "cfsan_snp_pipeline index_ref" + force_flag + reference_file_path job_id_index_ref = runner.run(command_line, "indexRef", log_file) progress("Step 3 - Map the sample reads to the reference") # Parse the user-specified aligner parameters to find the number of CPU cores requested, for example, "-p 16" or "-n 16" # Set the default number of CPU cores if the user did not configure a value. if snp_pipeline_aligner == "smalt": extra_params_env_var = "SmaltAlign_ExtraParams" threads_option = "-n" else: extra_params_env_var = "Bowtie2Align_ExtraParams" threads_option = "-p" max_processes, threads_per_process = configure_process_threads(extra_params_env_var, threads_option, 8, max_cpu_cores) parallel_environment = config_params.get("GridEngine_PEname", None) log_file = os.path.join(log_dir, "mapReads.log") command_line = "cfsan_snp_pipeline map_reads" + force_flag + reference_file_path + " {1} {2}" job_id_map_reads = runner.run_array(command_line, "mapReads", log_file, sample_full_path_names_file, max_processes=max_processes, wait_for=[job_id_index_ref], threads=threads_per_process, parallel_environment=parallel_environment) progress("Step 4 - Find sites with SNPs in each sample") if job_queue_mgr in ["grid", "torque"]: time.sleep(1.0 + float(sample_count) / 150) # workaround torque bug when submitting two large consecutive array jobs, potential bug for grid log_file = os.path.join(log_dir, "callSites.log") command_line = "cfsan_snp_pipeline call_sites" + force_flag + reference_file_path + " {1}" job_id_call_sites = 
runner.run_array(command_line, "callSites", log_file, sample_dirs_file, max_processes=max_cpu_cores, wait_for_array=[job_id_map_reads], slot_dependency=True) progress("Step 5 - Filter abnormal SNP regions") log_file = os.path.join(log_dir, "filterRegions.log") extra_params = os.environ.get("FilterRegions_ExtraParams", "") command_line = "cfsan_snp_pipeline filter_regions" + force_flag + "-n var.flt.vcf " + sample_dirs_file + ' ' + reference_file_path + ' ' + extra_params job_id_filter_regions = runner.run(command_line, "filterRegions", log_file, wait_for_array=[job_id_call_sites]) # Starting from here, there are 2 threads: # Thread X.1: the thread processing the original VCF files and corresponding downstream results # Thread X.2: the thread processing the preserved VCF files and corresponding downstream results progress("Step 6.1 - Merge the SNP sites across all samples into the SNP list file") # The mergeSites process creates the filtered list of sample directories. It is the list of samples not having excessive snps. # When running on a workstation, the file exists at this point during the script execution, but on grid or torque, it has not yet been created. However, # we know the path to the file regardless of whether it exists yet. filtered_sample_dirs_file = sample_dirs_file + ".OrigVCF.filtered" # touch $filtered_sample_dirs_file # TODO: why was this touch here in the old run_snp_pipeline.sh script? 
log_file = os.path.join(log_dir, "mergeSites.log") output_file = os.path.join(work_dir, "snplist.txt") extra_params = os.environ.get("MergeSites_ExtraParams", "") command_line = "cfsan_snp_pipeline merge_sites" + force_flag + "-n var.flt.vcf -o " + output_file + ' ' + extra_params + ' ' + sample_dirs_file + ' ' + filtered_sample_dirs_file job_id_merge_sites = runner.run(command_line, "mergeSites", log_file, wait_for=[job_id_filter_regions]) progress("Step 6.2 - Merge the SNP sites across all samples into the SNP list file") # Create another copy of sample directories file, for the thread processing preserved snp files. filtered_sample_dirs_file2 = sample_dirs_file + ".PresVCF.filtered" # touch $filtered_sample_dirs_file2 # TODO: why was this touch here in the old run_snp_pipeline.sh script? log_file = os.path.join(log_dir, "mergeSites_preserved.log") output_file = os.path.join(work_dir, "snplist_preserved.txt") extra_params = os.environ.get("MergeSites_ExtraParams", "") command_line = "cfsan_snp_pipeline merge_sites" + force_flag + "-n var.flt_preserved.vcf -o " + output_file + ' ' + extra_params + ' ' + sample_dirs_file + ' ' + filtered_sample_dirs_file2 job_id_merge_sites2 = runner.run(command_line, "mergeSites_preserved", log_file, wait_for=[job_id_filter_regions]) progress("Step 7.1 - Call the consensus SNPs for each sample") log_file = os.path.join(log_dir, "callConsensus.log") list_file = os.path.join(work_dir, "snplist.txt") output_file = "{1}/consensus.fasta" extra_params = os.environ.get("CallConsensus_ExtraParams", "") command_line = "cfsan_snp_pipeline call_consensus" + force_flag + "-l " + list_file + " -o " + output_file + " --vcfRefName " + reference_file_name + ' ' + extra_params + " --vcfFileName consensus.vcf {1}/reads.all.pileup" job_id_call_consensus = runner.run_array(command_line, "callConsensus", log_file, sample_dirs_file, max_processes=max_cpu_cores, wait_for=[job_id_merge_sites]) progress("Step 7.2 - Call the consensus SNPs for each 
sample") log_file = os.path.join(log_dir, "callConsensus_preserved.log") list_file = os.path.join(work_dir, "snplist_preserved.txt") output_file = "{1}/consensus_preserved.fasta" extra_params = os.environ.get("CallConsensus_ExtraParams", "") command_line = "cfsan_snp_pipeline call_consensus" + force_flag + "-l " + list_file + " -o " + output_file + " -e {1}/var.flt_removed.vcf --vcfRefName " + reference_file_name + ' ' + extra_params + " --vcfFileName consensus_preserved.vcf {1}/reads.all.pileup" job_id_call_consensus2 = runner.run_array(command_line, "callConsensus_preserved", log_file, sample_dirs_file, max_processes=max_cpu_cores, wait_for=[job_id_merge_sites2]) progress("Step 8.1 - Create the SNP matrix") log_file = os.path.join(log_dir, "snpMatrix.log") output_file = os.path.join(work_dir, "snpma.fasta") extra_params = os.environ.get("SnpMatrix_ExtraParams", "") command_line = "cfsan_snp_pipeline snp_matrix" + force_flag + "-c consensus.fasta -o " + output_file + ' ' + extra_params + ' ' + filtered_sample_dirs_file job_id_snp_matrix = runner.run(command_line, "snpMatrix", log_file, wait_for_array=[job_id_call_consensus]) progress("Step 8.2 - Create the SNP matrix") log_file = os.path.join(log_dir, "snpMatrix_preserved.log") output_file = os.path.join(work_dir, "snpma_preserved.fasta") extra_params = os.environ.get("SnpMatrix_ExtraParams", "") command_line = "cfsan_snp_pipeline snp_matrix" + force_flag + "-c consensus_preserved.fasta -o " + output_file + ' ' + extra_params + ' ' + filtered_sample_dirs_file2 job_id_snp_matrix2 = runner.run(command_line, "snpMatrix_preserved", log_file, wait_for_array=[job_id_call_consensus2]) progress("Step 9.1 - Create the reference sequence at SNP sites") log_file = os.path.join(log_dir, "snpReference.log") list_file = os.path.join(work_dir, "snplist.txt") output_file = os.path.join(work_dir, "referenceSNP.fasta") extra_params = os.environ.get("SnpReference_ExtraParams", "") command_line = "cfsan_snp_pipeline snp_reference" + 
force_flag + "-l " + list_file + " -o " + output_file + ' ' + extra_params + ' ' + reference_file_path job_id_snp_reference = runner.run(command_line, "snpReference", log_file, wait_for_array=[job_id_call_consensus]) progress("Step 9.2 - Create the reference sequence at SNP sites") log_file = os.path.join(log_dir, "snpReference_preserved.log") list_file = os.path.join(work_dir, "snplist_preserved.txt") output_file = os.path.join(work_dir, "referenceSNP_preserved.fasta") extra_params = os.environ.get("SnpReference_ExtraParams", "") command_line = "cfsan_snp_pipeline snp_reference" + force_flag + "-l " + list_file + " -o " + output_file + ' ' + extra_params + ' ' + reference_file_path job_id_snp_reference2 = runner.run(command_line, "snpReference_preserved", log_file, wait_for_array=[job_id_call_consensus2]) progress("Step 10.1 - Merge sample VCFs to create the multi-VCF file") if "--vcfFileName" in os.environ.get("CallConsensus_ExtraParams", ""): log_file = os.path.join(log_dir, "mergeVcfs.log") output_file = os.path.join(work_dir, "snpma.vcf") extra_params = os.environ.get("MergeVcfs_ExtraParams", "") command_line = "cfsan_snp_pipeline merge_vcfs" + force_flag + "-o " + output_file + ' ' + extra_params + ' ' + filtered_sample_dirs_file job_id_merge_vcfs = runner.run(command_line, "mergeVcfs", log_file, wait_for_array=[job_id_call_consensus]) else: print("Skipped per CallConsensus_ExtraParams configuration") progress("Step 10.2 - Merge sample VCFs to create the multi-VCF file") if "--vcfFileName" in os.environ.get("CallConsensus_ExtraParams", ""): log_file = os.path.join(log_dir, "mergeVcfs_preserved.log") output_file = os.path.join(work_dir, "snpma_preserved.vcf") extra_params = os.environ.get("MergeVcfs_ExtraParams", "") command_line = "cfsan_snp_pipeline merge_vcfs" + force_flag + "-n consensus_preserved.vcf -o " + output_file + ' ' + extra_params + ' ' + filtered_sample_dirs_file2 job_id_merge_vcfs2 = runner.run(command_line, "mergeVcfs_preserved", log_file, 
wait_for_array=[job_id_call_consensus2]) else: print("Skipped per CallConsensus_ExtraParams configuration") progress("Step 11.1 - Calculate SNP distance matrix") log_file = os.path.join(log_dir, "distance.log") input_file = os.path.join(work_dir, "snpma.fasta") pair_output_file = os.path.join(work_dir, "snp_distance_pairwise.tsv") matrix_output_file = os.path.join(work_dir, "snp_distance_matrix.tsv") command_line = "cfsan_snp_pipeline distance" + force_flag + "-p " + pair_output_file + " -m " + matrix_output_file + ' ' + input_file job_id_distance = runner.run(command_line, "distance", log_file, wait_for=[job_id_snp_matrix]) progress("Step 11.2 - Calculate SNP distance matrix") log_file = os.path.join(log_dir, "distance_preserved.log") input_file = os.path.join(work_dir, "snpma_preserved.fasta") pair_output_file = os.path.join(work_dir, "snp_distance_pairwise_preserved.tsv") matrix_output_file = os.path.join(work_dir, "snp_distance_matrix_preserved.tsv") command_line = "cfsan_snp_pipeline distance" + force_flag + "-p " + pair_output_file + " -m " + matrix_output_file + ' ' + input_file job_id_distance2 = runner.run(command_line, "distance_preserved", log_file, wait_for=[job_id_snp_matrix2]) progress("Step 12 - Collect metrics for each sample") log_file = os.path.join(log_dir, "collectMetrics.log") output_file = "{1}/metrics" extra_params = os.environ.get("CollectMetrics_ExtraParams", "") command_line = "cfsan_snp_pipeline collect_metrics" + force_flag + "-o " + output_file + ' ' + extra_params + " {1} " + reference_file_path job_id_collect_metrics = runner.run_array(command_line, "collectMetrics", log_file, sample_dirs_file, max_processes=max_cpu_cores, wait_for_array=[job_id_call_consensus, job_id_call_consensus2], slot_dependency=True) progress("Step 13 - Combine the metrics across all samples into the metrics table") log_file = os.path.join(log_dir, "combineMetrics.log") output_file = os.path.join(work_dir, "metrics.tsv") extra_params = 
os.environ.get("CombineMetrics_ExtraParams", "") command_line = "cfsan_snp_pipeline combine_metrics" + force_flag + "-n metrics -o " + output_file + ' ' + extra_params + ' ' + sample_dirs_file combine_metrics_job_id = runner.run(command_line, "combineMetrics", log_file, wait_for_array=[job_id_collect_metrics]) # Step 14 - Notify user of any non-fatal errors accumulated during processing if os.path.isfile(error_output_file) and os.path.getsize(error_output_file) > 0 and not stop_on_error: print("\nThere were errors processing some samples.\nSee the log file %s for a summary of errors." % error_output_file, file=sys.stderr) # Exit here to prevent showing the "cfsan_snp_pipeline run finished" message. The jobs are queued, not finished yet. if job_queue_mgr is not None: # HPC sys.exit(0)
def combine_metrics(args):
    """Merge the per-sample metrics files into one tab-separated metrics table.

    Every non-blank line of the sample directories file names one sample
    directory.  The metrics file found in each of those directories becomes
    one row of the merged metrics file:

            samples
                sample_name_one/metrics
            metrics.tsv

    The per-sample metrics files are inputs; only the merged metrics file is
    written here.  When a per-sample metrics file is missing or empty, a
    message saying so is written in place of that sample's row and is also
    reported through utils.sample_warning.

    Parameters
    ----------
    args : argparse.Namespace
        sampleDirsFile : Path to file containing a list of directories -- one per sample
        metricsFileName : File name of the metrics file looked up in each of the sample directories
        mergedMetricsFile : Path to the output merged metrics file
        forceFlag : When true, write the merged file even if utils.target_needs_rebuild
            reports that no rebuild is needed
        spaceHeadings : When false, spaces in the column headings are replaced with underscores
    """
    utils.print_log_header()
    utils.print_arguments(args)

    #==========================================================================
    # Check the inputs and locate the per-sample metrics files
    #==========================================================================
    dirs_list_path = args.sampleDirsFile
    utils.verify_non_empty_input_files("File of sample directories", [dirs_list_path], error_handler="global")

    per_sample_name = args.metricsFileName
    merged_path = args.mergedMetricsFile

    # One metrics file path per non-blank line of the sample directories file
    per_sample_paths = []
    with open(dirs_list_path, "r") as dirs_file:
        for raw_line in dirs_file:
            sample_dir = raw_line.rstrip()
            if sample_dir:
                per_sample_paths.append(os.path.join(sample_dir, per_sample_name))

    #==========================================================================
    # Nothing to do when the merged file is current and no rebuild is forced
    #==========================================================================
    stale = utils.target_needs_rebuild(per_sample_paths, merged_path)
    if not (args.forceFlag or stale):
        verbose_print("# The merged metrics file is already freshly created. Use the -f option to force a rebuild.")
        return

    # Metrics keys in output column order; the flag marks values passed through quoted()
    row_fields = (
        ("sample", True),
        ("fastqFileList", True),
        ("fastqFileSize", False),
        ("machine", False),
        ("flowcell", False),
        ("numberReads", False),
        ("numberDupReads", False),
        ("percentReadsMapped", False),
        ("percentProperPair", False),
        ("aveInsertSize", False),
        ("avePileupDepth", False),
        ("phase1Snps", False),
        ("phase1SnpsPreserved", False),
        ("snps", False),
        ("snpsPreserved", False),
        ("missingPos", False),
        ("missingPosPreserved", False),
        ("excludedSample", False),
        ("excludedSamplePreserved", False),
        ("errorList", True),
    )
    last_index = len(row_fields) - 1

    #==========================================================================
    # Write the heading row, then one row per sample
    #==========================================================================
    with open(merged_path, 'w') as out:
        headings = [
            "Sample", "Fastq Files", "Fastq File Size", "Machine", "Flowcell",
            "Number of Reads", "Duplicate Reads", "Percent of Reads Mapped",
            "Percent Proper Pair", "Average Insert Size", "Average Pileup Depth",
            "Phase1 SNPs", "Phase1 Preserved SNPs", "Phase2 SNPs",
            "Phase2 Preserved SNPs", "Missing SNP Matrix Positions",
            "Missing Preserved SNP Matrix Positions", "Excluded Sample",
            "Excluded Preserved Sample", "Warnings and Errors",
        ]
        heading_line = '\t'.join(headings)
        if not args.spaceHeadings:
            # The tab separators are untouched; only the spaces inside headings change
            heading_line = heading_line.replace(' ', '_')
        out.write(heading_line + '\n')

        for path in per_sample_paths:
            verbose_print("Processing " + path)

            if not os.path.isfile(path):
                problem = "Sample metrics file %s does not exist." % path
            elif os.path.getsize(path) == 0:
                problem = "Sample metrics file %s is empty." % path
            else:
                problem = None

            if problem is not None:
                # The message takes the place of this sample's row in the table
                out.write(problem + '\n')
                utils.sample_warning(problem)
                continue

            props = utils.read_properties(path)

            # Tab after every column except the last, which ends the row
            for index, (key, needs_quotes) in enumerate(row_fields):
                value = props.get(key, "")
                if needs_quotes:
                    value = quoted(value)
                out.write(value + ('\n' if index == last_index else '\t'))
def run(args): """Run all the steps of the snp pipeline in th correct order. Parameters ---------- args : Namespace referenceFile : str Relative or absolute path to the reference fasta file forceFlag : bool Force processing even when result files already exist and are newer than inputs mirror : str Mode to create a mirror copy of the reference directory and all the sample directories. Possible values: {soft, hard, copy} configFile : str Relative or absolute path to a configuration file for overriding defaults and defining extra parameters for the tools and scripts within the pipeline. jobQueueMgr : str Job queue manager for remote parallel job execution in an HPC environment. Currently "torque" and "grid" are supported. If not specified, the pipeline will execute locally. workDir : str Output directory for the result files. samplesDir : str Relative or absolute path to the parent directory of all the sample directories. samplesFile : str Relative or absolute path to a file listing all of the sample directories. purge : bool Purge the intermediate output files when the pipeline completes successfully. """ global log_dir global job_queue_mgr start_time = time.time() # Where are we running: grid, torque, or None (local) job_queue_mgr = args.jobQueueMgr # Erase any left-over error log environment variable from a previous run os.environ.pop("errorOutputFile", None) # the 2nd arg avoids an exception when not in dict # Handle output working directory. Create the directory if it does not exist. # Any errors creating the work_dir will not be logged to the error log because # the error log belongs in the work_dir. work_dir = args.workDir try: utils.mkdir_p(work_dir) except OSError as exc: utils.fatal_error("Error: could not create the output directory %s" % work_dir) if not utils.is_directory_writeable(work_dir): utils.fatal_error("Error: output directory % is not writable." 
% work_dir) # The error log is in the main workdir error_output_file = os.path.join(work_dir, "error.log") os.environ["errorOutputFile"] = error_output_file # TODO: copy old error log to old logs directory, because otherwise it will be removed and lost forever if os.path.isfile(error_output_file): os.remove(error_output_file) # Validate reference fasta file reference_file_path = args.referenceFile if not os.path.isfile(reference_file_path): utils.fatal_error("Error: reference file %s does not exist." % reference_file_path) if os.path.getsize(reference_file_path) == 0: utils.fatal_error("Error: reference file %s is empty." % reference_file_path) reference_file_name = os.path.basename(reference_file_path) # Force rebuild flag is passed to all the subtask commands below force_flag = " -f " if args.forceFlag else " " # Create the logs directory with name like "logs-20170215.144253" run_time_stamp = time.strftime('%Y%m%d.%H%M%S', time.localtime()) log_dir = os.path.join(work_dir, "logs-" + run_time_stamp) try: utils.mkdir_p(log_dir) except OSError as exc: utils.fatal_error("Error: could not create the logs directory %s" % log_dir) if not utils.is_directory_writeable(work_dir): utils.fatal_error("Error: logs directory % is not writable." % log_dir) # Handle configuration file, use the specified file, or create a default file if args.configFile: config_file_path = args.configFile if not os.path.isfile(config_file_path): utils.fatal_error("Error: configuration file %s does not exist." % config_file_path) if os.path.getsize(config_file_path) == 0: utils.fatal_error("Error: configuration file %s is empty." 
% config_file_path) shutil.copy2(config_file_path, log_dir) # copy2 tries to preserve timestamps config_params = utils.read_properties(config_file_path, recognize_vars=True) validate_properties(config_params) else: command.run("cfsan_snp_pipeline data configurationFile " + log_dir, outfile=sys.stdout) config_file_path = os.path.join(log_dir, "snppipeline.conf") config_params = utils.read_properties(config_file_path, recognize_vars=True) # Validate the configured aligner choice snp_pipeline_aligner = config_params.get("SnpPipeline_Aligner", "").lower() or "bowtie2" if snp_pipeline_aligner not in ["bowtie2", "smalt"]: utils.fatal_error( "Config file error in SnpPipeline_Aligner parameter: only bowtie2 and smalt aligners are supported." ) os.environ["SnpPipeline_Aligner"] = snp_pipeline_aligner # Stop the pipeline by default upon single sample errors if not configured either way # The environment variable is used by called processes stop_on_error = config_params.get("StopOnSampleError", "").lower() or "true" os.environ["StopOnSampleError"] = stop_on_error # Convert the stop_on_error flag to boolean for internal use in this function stop_on_error = stop_on_error == "true" # How many CPU cores can we use? max_cpu_cores = config_params.get("MaxCpuCores", None) if max_cpu_cores == "": max_cpu_cores = None if max_cpu_cores: try: max_cpu_cores = int(max_cpu_cores) if max_cpu_cores < 1: utils.fatal_error( "Config file error in MaxCpuCores parameter: %s is less than one." % max_cpu_cores) except ValueError: utils.fatal_error( "Config file error in MaxCpuCores parameter: %s is not a valid number." % max_cpu_cores) if job_queue_mgr is None: # workstation num_local_cpu_cores = psutil.cpu_count() max_cpu_cores = min( num_local_cpu_cores, max_cpu_cores) if max_cpu_cores else num_local_cpu_cores # How many CPU cores per process? 
if job_queue_mgr is None: # workstation cpu_cores_per_process = config_params.get( "CpuCoresPerProcessOnWorkstation", None) if cpu_cores_per_process: try: cpu_cores_per_process = int(cpu_cores_per_process) if cpu_cores_per_process < 1: utils.fatal_error( "Config file error in CpuCoresPerProcessOnWorkstation parameter: %s is less than one." % cpu_cores_per_process) except ValueError: utils.fatal_error( "Config file error in CpuCoresPerProcessOnWorkstation parameter: %s is not a valid number." % cpu_cores_per_process) else: cpu_cores_per_process = min(num_local_cpu_cores, max_cpu_cores) else: # HPC cpu_cores_per_process = config_params.get("CpuCoresPerProcessOnHPC", None) if not cpu_cores_per_process: utils.fatal_error( "Config file error. CpuCoresPerProcessOnHPC parameter must be set to a value." ) else: try: cpu_cores_per_process = int(cpu_cores_per_process) if cpu_cores_per_process < 1: utils.fatal_error( "Config file error in CpuCoresPerProcessOnHPC parameter: %s is less than one." % cpu_cores_per_process) except ValueError: utils.fatal_error( "Config file error in CpuCoresPerProcessOnHPC parameter: %s is not a valid number." 
% cpu_cores_per_process) # Put the configuration parameters into the process environment variables os.environ["Bowtie2Build_ExtraParams"] = config_params.get( "Bowtie2Build_ExtraParams", "") os.environ["SmaltIndex_ExtraParams"] = config_params.get( "SmaltIndex_ExtraParams", "") os.environ["CreateSequenceDictionary_ExtraParams"] = config_params.get( "CreateSequenceDictionary_ExtraParams", "") os.environ["SamtoolsFaidx_ExtraParams"] = config_params.get( "SamtoolsFaidx_ExtraParams", "") os.environ["Bowtie2Align_ExtraParams"] = config_params.get( "Bowtie2Align_ExtraParams", "") os.environ["SmaltAlign_ExtraParams"] = config_params.get( "SmaltAlign_ExtraParams", "") os.environ["SamtoolsSamFilter_ExtraParams"] = config_params.get( "SamtoolsSamFilter_ExtraParams", "") os.environ["SamtoolsSort_ExtraParams"] = config_params.get( "SamtoolsSort_ExtraParams", "") os.environ["SamtoolsIndex_ExtraParams"] = config_params.get( "SamtoolsIndex_ExtraParams", "") os.environ["RemoveDuplicateReads"] = config_params.get( "RemoveDuplicateReads", "").lower() or "true" os.environ["PicardJvm_ExtraParams"] = config_params.get( "PicardJvm_ExtraParams", "") os.environ["PicardMarkDuplicates_ExtraParams"] = config_params.get( "PicardMarkDuplicates_ExtraParams", "") os.environ["EnableLocalRealignment"] = config_params.get( "EnableLocalRealignment", "").lower() or "true" os.environ["GatkJvm_ExtraParams"] = config_params.get( "GatkJvm_ExtraParams", "") os.environ["RealignerTargetCreator_ExtraParams"] = config_params.get( "RealignerTargetCreator_ExtraParams", "") os.environ["IndelRealigner_ExtraParams"] = config_params.get( "IndelRealigner_ExtraParams", "") os.environ["SamtoolsMpileup_ExtraParams"] = config_params.get( "SamtoolsMpileup_ExtraParams", "") os.environ["VarscanMpileup2snp_ExtraParams"] = config_params.get( "VarscanMpileup2snp_ExtraParams", "") os.environ["VarscanJvm_ExtraParams"] = config_params.get( "VarscanJvm_ExtraParams", "") os.environ["FilterRegions_ExtraParams"] = config_params.get( 
"FilterRegions_ExtraParams", "") os.environ["MergeSites_ExtraParams"] = config_params.get( "MergeSites_ExtraParams", "") os.environ["CallConsensus_ExtraParams"] = config_params.get( "CallConsensus_ExtraParams", "") os.environ["SnpMatrix_ExtraParams"] = config_params.get( "SnpMatrix_ExtraParams", "") os.environ["BcftoolsMerge_ExtraParams"] = config_params.get( "BcftoolsMerge_ExtraParams", "") os.environ["SnpReference_ExtraParams"] = config_params.get( "SnpReference_ExtraParams", "") os.environ["MergeVcfs_ExtraParams"] = config_params.get( "MergeVcfs_ExtraParams", "") os.environ["CollectMetrics_ExtraParams"] = config_params.get( "CollectMetrics_ExtraParams", "") os.environ["CombineMetrics_ExtraParams"] = config_params.get( "CombineMetrics_ExtraParams", "") # Verify the dependencies are available on the path print("Checking dependencies...") dependencies = [ "cfsan_snp_pipeline", snp_pipeline_aligner, "java", "tabix", "bgzip", "bcftools" ] found_all_dependencies = True for executable in dependencies: if not utils.which(executable): utils.report_error(executable + " is not on the path") found_all_dependencies = False if not utils.which("samtools"): utils.report_error("samtools is not on the path") found_all_dependencies = False else: version_str = utils.extract_version_str("SAMtools", "samtools 2>&1 > /dev/null") samtools_version = version_str.split()[-1] # just the number if samtools_version < "1.4": utils.report_error( "The installed %s is not supported. Version 1.4 or higher is required." 
% version_str) found_all_dependencies = False jar_file_path = utils.find_path_in_path_list("VarScan", "CLASSPATH") if jar_file_path: stdout = command.run("java -jar " + jar_file_path + " 2>&1") if not jar_file_path or "error" in stdout.lower(): utils.report_error( "CLASSPATH is not configured with the path to VarScan.jar") found_all_dependencies = False picard_required = os.environ[ "RemoveDuplicateReads"] == "true" or os.environ[ "EnableLocalRealignment"] == "true" if picard_required: jar_file_path = utils.find_path_in_path_list("picard", "CLASSPATH") if not jar_file_path: utils.report_error( "CLASSPATH is not configured with the path to picard.jar") found_all_dependencies = False else: stdout = command.run("java -jar " + jar_file_path + " 2>&1") if stdout.lower().startswith("error"): utils.report_error(stdout) found_all_dependencies = False gatk_required = os.environ["EnableLocalRealignment"] == "true" if gatk_required: jar_file_path = utils.find_path_in_path_list("GenomeAnalysisTK", "CLASSPATH") if not jar_file_path: utils.report_error( "CLASSPATH is not configured with the path to GenomeAnalysisTK.jar" ) found_all_dependencies = False else: stdout = command.run("java -jar " + jar_file_path + " --version 2>&1") if stdout.lower().startswith("error"): utils.report_error(stdout) found_all_dependencies = False else: stdout = command.run("java -jar " + jar_file_path + " -T IndelRealigner --version 2>&1") if "not a valid command" in stdout.lower( ) or "indelrealigner is no longer included" in stdout.lower(): utils.report_error( "The installed GATK version does not support indel realignment. Try installing an older release prior to GATK v4." 
) found_all_dependencies = False elif "user error has occurred" in stdout.lower(): utils.report_error(stdout) found_all_dependencies = False if not found_all_dependencies: utils.fatal_error( "Check the SNP Pipeline installation instructions here: http://snp-pipeline.readthedocs.org/en/latest/installation.html" ) else: print("OK") # Process the sample directory command line option # TODO: detect broken fastq symlinks if args.samplesDir: samples_parent_dir = args.samplesDir.rstrip( '/') # strip trailing slash if not utils.verify_non_empty_directory("Samples directory", samples_parent_dir): sys.exit(1) # verify at least one of the subdirectories contains fastq files. dir_sizes = get_sorted_sample_dirs_fastq_sizes(samples_parent_dir) dir_sizes = [(size, path) for size, path in dir_sizes if size > 0] if len(dir_sizes) == 0: utils.fatal_error( "Samples directory %s does not contain subdirectories with fastq files." % samples_parent_dir) sample_dirs_file = os.path.join(work_dir, "sampleDirectories.txt") persist_sorted_sample_dirs_file(samples_parent_dir, sample_dirs_file) # Process the file of sample directories command line option # TODO: detect broken fastq symlinks if args.samplesFile: sample_dirs_file = args.samplesFile if not os.path.isfile(sample_dirs_file): utils.fatal_error( "Error: the file of samples directories, %s, does not exist." % sample_dirs_file) if os.path.getsize(sample_dirs_file) == 0: utils.fatal_error( "Error: the file of samples directories, %s, is empty." 
% sample_dirs_file) rewrite_cleansed_file_of_sample_dirs( sample_dirs_file, os.path.join(work_dir, "sampleDirectories.txt")) sample_dirs_file = os.path.join(work_dir, "sampleDirectories.txt") validate_file_of_sample_dirs(sample_dirs_file) with open(sample_dirs_file) as f: sample_dirs_list = f.read().splitlines() sample_count = len(sample_dirs_list) # -------------------------------------------------------- if job_queue_mgr is None: progress("Step 1 - Prep work") else: print("Step 1 - Prep work") # -------------------------------------------------------- # Mirror the input reference and samples if requested # TODO: make this a pure python solution if args.mirror: if args.mirror == "soft": # soft link, subsequent freshness checks use the timestamp of original file, not the soft link mirror_flag = " -s " elif args.mirror == "hard": # hard link, automatically preserves attributes of the original file mirror_flag = " -l " else: # regular copy, -p explicitly preserves attributes of the original file mirror_flag = " -p " # flush stdout to keep the unbuffered stderr in chronological order with stdout sys.stdout.flush() # Mirror/link the reference work_reference_dir = os.path.join(work_dir, "reference") utils.mkdir_p(work_reference_dir) src_reference_file = os.path.abspath(reference_file_path) cmd = "cp -v -u -f" + mirror_flag + src_reference_file + ' ' + work_reference_dir subprocess.check_call(cmd, shell=True) # since we mirrored the reference, we need to update our reference location reference_file_path = os.path.join(work_reference_dir, reference_file_name) # Mirror/link the samples work_samples_parent_dir = os.path.join(work_dir, "samples") for directory in sample_dirs_list: basedir = os.path.basename(directory) work_sample_dir = os.path.join(work_samples_parent_dir, basedir) utils.mkdir_p(work_sample_dir) src_sample_dir = os.path.abspath(directory) # copy without stderr message and without exit error code because the fastq or fq files might not exist cmd = "cp -r -v 
-u -f" + mirror_flag + src_sample_dir + "/*.fastq* " + work_sample_dir + " 2> /dev/null || true" subprocess.check_call(cmd, shell=True) cmd = "cp -r -v -u -f" + mirror_flag + src_sample_dir + "/*.fq* " + work_sample_dir + " 2> /dev/null || true" subprocess.check_call(cmd, shell=True) # since we mirrored the samples, we need to update our sorted list of samples sample_dirs_file = os.path.join(work_dir, "sampleDirectories.txt") persist_sorted_sample_dirs_file(work_samples_parent_dir, sample_dirs_file) # refresh the list of sample dirs -- now in sorted order with open(sample_dirs_file) as f: sample_dirs_list = f.read().splitlines() # get the *.fastq or *.fq files in each sample directory, possibly compresessed, on one line per sample, ready to feed to bowtie sample_full_path_names_file = os.path.join(work_dir, "sampleFullPathNames.txt") with open(sample_full_path_names_file, 'w') as f: for directory in sample_dirs_list: file_list = fastq.list_fastq_files(directory) print(' '.join(file_list), file=f) # Initialize the job runner if job_queue_mgr is None: runner = JobRunner("local", exception_handler=handle_exception, verbose=args.verbose >= 4) elif job_queue_mgr == "grid": strip_job_array_suffix = config_params.get( "GridEngine_StripJobArraySuffix", "true").lower() qsub_extra_params = config_params.get("GridEngine_QsubExtraParams") runner = JobRunner(job_queue_mgr, strip_job_array_suffix == "true", qsub_extra_params=qsub_extra_params, verbose=args.verbose >= 4) else: strip_job_array_suffix = config_params.get( "Torque_StripJobArraySuffix", "false").lower() qsub_extra_params = config_params.get("Torque_QsubExtraParams") runner = JobRunner(job_queue_mgr, strip_job_array_suffix == "true", qsub_extra_params=qsub_extra_params, verbose=args.verbose >= 4) progress("Step 2 - Index the reference") log_file = os.path.join(log_dir, "indexRef.log") command_line = "cfsan_snp_pipeline index_ref" + force_flag + reference_file_path job_id_index_ref = runner.run(command_line, 
"indexRef", log_file) progress("Step 3 - Map the sample reads to the reference") # Parse the user-specified aligner parameters to find the number of CPU cores requested, for example, "-p 16" or "-n 16" # Set the default number of CPU cores if the user did not configure a value. if snp_pipeline_aligner == "smalt": extra_params_env_var = "SmaltAlign_ExtraParams" threads_option = "-n" else: extra_params_env_var = "Bowtie2Align_ExtraParams" threads_option = "-p" aligner_max_processes, aligner_threads_per_process = utils.configure_process_threads( extra_params_env_var, threads_option, cpu_cores_per_process, max_cpu_cores) samfilter_max_processes, samfilter_threads_per_process = utils.configure_process_threads( "SamtoolsSamFilter_ExtraParams", ["-@", "--threads"], cpu_cores_per_process, max_cpu_cores) samsort_max_processes, samsort_threads_per_process = utils.configure_process_threads( "SamtoolsSort_ExtraParams", ["-@", "--threads"], cpu_cores_per_process, max_cpu_cores) samindex_max_processes, samindex_threads_per_process = utils.configure_process_threads( "SamtoolsIndex_ExtraParams", ["-@"], cpu_cores_per_process, max_cpu_cores) realigner_max_processes, realigner_threads_per_process = utils.configure_process_threads( "RealignerTargetCreator_ExtraParams", ["-nt", "--num_threads"], cpu_cores_per_process, max_cpu_cores) # There are multiple processes within map_reads, each with multiple threads. # The CPU allocation must be enough for the process needing the largest number of threads. 
max_processes_list = [ aligner_max_processes, samfilter_max_processes, samsort_max_processes, samindex_max_processes, realigner_max_processes ] if all([i is None for i in max_processes_list]): max_processes = None else: max_processes = min([i for i in max_processes_list if i is not None]) threads_per_process = max(aligner_threads_per_process, samfilter_threads_per_process, samsort_threads_per_process, samindex_threads_per_process, realigner_threads_per_process) parallel_environment = config_params.get("GridEngine_PEname", None) log_file = os.path.join(log_dir, "mapReads.log") command_line = "cfsan_snp_pipeline map_reads --threads " + str( threads_per_process) + force_flag + reference_file_path + " {1} {2}" job_id_map_reads = runner.run_array( command_line, "mapReads", log_file, sample_full_path_names_file, max_processes=max_processes, wait_for=[job_id_index_ref], threads=threads_per_process, parallel_environment=parallel_environment) progress("Step 4 - Find sites with SNPs in each sample") if job_queue_mgr in ["grid", "torque"]: time.sleep( 1.0 + float(sample_count) / 150 ) # workaround torque bug when submitting two large consecutive array jobs, potential bug for grid log_file = os.path.join(log_dir, "callSites.log") command_line = "cfsan_snp_pipeline call_sites" + force_flag + reference_file_path + " {1}" job_id_call_sites = runner.run_array(command_line, "callSites", log_file, sample_dirs_file, max_processes=max_cpu_cores, wait_for_array=[job_id_map_reads], slot_dependency=True) progress("Step 5 - Filter abnormal SNP regions") log_file = os.path.join(log_dir, "filterRegions.log") extra_params = os.environ.get("FilterRegions_ExtraParams", "") command_line = "cfsan_snp_pipeline filter_regions" + force_flag + "-n var.flt.vcf " + sample_dirs_file + ' ' + reference_file_path + ' ' + extra_params job_id_filter_regions = runner.run(command_line, "filterRegions", log_file, wait_for_array=[job_id_call_sites]) # Starting from here, there are 2 threads: # Thread X.1: the 
thread processing the original VCF files and corresponding downstream results # Thread X.2: the thread processing the preserved VCF files and corresponding downstream results progress( "Step 6.1 - Merge the SNP sites across all samples into the SNP list file" ) # The mergeSites process creates the filtered list of sample directories. It is the list of samples not having excessive snps. # When running on a workstation, the file exists at this point during the script execution, but on grid or torque, it has not yet been created. However, # we know the path to the file regardless of whether it exists yet. filtered_sample_dirs_file = sample_dirs_file + ".OrigVCF.filtered" # touch $filtered_sample_dirs_file # TODO: why was this touch here in the old run_snp_pipeline.sh script? log_file = os.path.join(log_dir, "mergeSites.log") output_file = os.path.join(work_dir, "snplist.txt") extra_params = os.environ.get("MergeSites_ExtraParams", "") command_line = "cfsan_snp_pipeline merge_sites" + force_flag + "-n var.flt.vcf -o " + output_file + ' ' + extra_params + ' ' + sample_dirs_file + ' ' + filtered_sample_dirs_file job_id_merge_sites = runner.run(command_line, "mergeSites", log_file, wait_for=[job_id_filter_regions]) progress( "Step 6.2 - Merge the SNP sites across all samples into the SNP list file" ) # Create another copy of sample directories file, for the thread processing preserved snp files. filtered_sample_dirs_file2 = sample_dirs_file + ".PresVCF.filtered" # touch $filtered_sample_dirs_file2 # TODO: why was this touch here in the old run_snp_pipeline.sh script? 
log_file = os.path.join(log_dir, "mergeSites_preserved.log") output_file = os.path.join(work_dir, "snplist_preserved.txt") extra_params = os.environ.get("MergeSites_ExtraParams", "") command_line = "cfsan_snp_pipeline merge_sites" + force_flag + "-n var.flt_preserved.vcf -o " + output_file + ' ' + extra_params + ' ' + sample_dirs_file + ' ' + filtered_sample_dirs_file2 job_id_merge_sites2 = runner.run(command_line, "mergeSites_preserved", log_file, wait_for=[job_id_filter_regions]) progress("Step 7.1 - Call the consensus SNPs for each sample") log_file = os.path.join(log_dir, "callConsensus.log") list_file = os.path.join(work_dir, "snplist.txt") output_file = "{1}/consensus.fasta" extra_params = os.environ.get("CallConsensus_ExtraParams", "") command_line = "cfsan_snp_pipeline call_consensus" + force_flag + "-l " + list_file + " -o " + output_file + " --vcfRefName " + reference_file_name + ' ' + extra_params + " --vcfFileName consensus.vcf {1}/reads.all.pileup" job_id_call_consensus = runner.run_array(command_line, "callConsensus", log_file, sample_dirs_file, max_processes=max_cpu_cores, wait_for=[job_id_merge_sites]) progress("Step 7.2 - Call the consensus SNPs for each sample") log_file = os.path.join(log_dir, "callConsensus_preserved.log") list_file = os.path.join(work_dir, "snplist_preserved.txt") output_file = "{1}/consensus_preserved.fasta" extra_params = os.environ.get("CallConsensus_ExtraParams", "") command_line = "cfsan_snp_pipeline call_consensus" + force_flag + "-l " + list_file + " -o " + output_file + " -e {1}/var.flt_removed.vcf --vcfRefName " + reference_file_name + ' ' + extra_params + " --vcfFileName consensus_preserved.vcf {1}/reads.all.pileup" job_id_call_consensus2 = runner.run_array(command_line, "callConsensus_preserved", log_file, sample_dirs_file, max_processes=max_cpu_cores, wait_for=[job_id_merge_sites2]) progress("Step 8.1 - Create the SNP matrix") log_file = os.path.join(log_dir, "snpMatrix.log") output_file = os.path.join(work_dir, 
"snpma.fasta") extra_params = os.environ.get("SnpMatrix_ExtraParams", "") command_line = "cfsan_snp_pipeline snp_matrix" + force_flag + "-c consensus.fasta -o " + output_file + ' ' + extra_params + ' ' + filtered_sample_dirs_file job_id_snp_matrix = runner.run(command_line, "snpMatrix", log_file, wait_for_array=[job_id_call_consensus]) progress("Step 8.2 - Create the SNP matrix") log_file = os.path.join(log_dir, "snpMatrix_preserved.log") output_file = os.path.join(work_dir, "snpma_preserved.fasta") extra_params = os.environ.get("SnpMatrix_ExtraParams", "") command_line = "cfsan_snp_pipeline snp_matrix" + force_flag + "-c consensus_preserved.fasta -o " + output_file + ' ' + extra_params + ' ' + filtered_sample_dirs_file2 job_id_snp_matrix2 = runner.run(command_line, "snpMatrix_preserved", log_file, wait_for_array=[job_id_call_consensus2]) progress("Step 9.1 - Create the reference sequence at SNP sites") log_file = os.path.join(log_dir, "snpReference.log") list_file = os.path.join(work_dir, "snplist.txt") output_file = os.path.join(work_dir, "referenceSNP.fasta") extra_params = os.environ.get("SnpReference_ExtraParams", "") command_line = "cfsan_snp_pipeline snp_reference" + force_flag + "-l " + list_file + " -o " + output_file + ' ' + extra_params + ' ' + reference_file_path job_id_snp_reference = runner.run(command_line, "snpReference", log_file, wait_for_array=[job_id_call_consensus]) progress("Step 9.2 - Create the reference sequence at SNP sites") log_file = os.path.join(log_dir, "snpReference_preserved.log") list_file = os.path.join(work_dir, "snplist_preserved.txt") output_file = os.path.join(work_dir, "referenceSNP_preserved.fasta") extra_params = os.environ.get("SnpReference_ExtraParams", "") command_line = "cfsan_snp_pipeline snp_reference" + force_flag + "-l " + list_file + " -o " + output_file + ' ' + extra_params + ' ' + reference_file_path job_id_snp_reference2 = runner.run(command_line, "snpReference_preserved", log_file, 
wait_for_array=[job_id_call_consensus2]) progress("Step 10.1 - Merge sample VCFs to create the multi-VCF file") if "--vcfFileName" in os.environ.get("CallConsensus_ExtraParams", ""): log_file = os.path.join(log_dir, "mergeVcfs.log") output_file = os.path.join(work_dir, "snpma.vcf") extra_params = os.environ.get("MergeVcfs_ExtraParams", "") command_line = "cfsan_snp_pipeline merge_vcfs" + force_flag + "-o " + output_file + ' ' + extra_params + ' ' + filtered_sample_dirs_file job_id_merge_vcfs = runner.run(command_line, "mergeVcfs", log_file, wait_for_array=[job_id_call_consensus]) else: print("Skipped per CallConsensus_ExtraParams configuration") progress("Step 10.2 - Merge sample VCFs to create the multi-VCF file") if "--vcfFileName" in os.environ.get("CallConsensus_ExtraParams", ""): log_file = os.path.join(log_dir, "mergeVcfs_preserved.log") output_file = os.path.join(work_dir, "snpma_preserved.vcf") extra_params = os.environ.get("MergeVcfs_ExtraParams", "") command_line = "cfsan_snp_pipeline merge_vcfs" + force_flag + "-n consensus_preserved.vcf -o " + output_file + ' ' + extra_params + ' ' + filtered_sample_dirs_file2 job_id_merge_vcfs2 = runner.run( command_line, "mergeVcfs_preserved", log_file, wait_for_array=[job_id_call_consensus2]) else: print("Skipped per CallConsensus_ExtraParams configuration") progress("Step 11.1 - Calculate SNP distance matrix") log_file = os.path.join(log_dir, "distance.log") input_file = os.path.join(work_dir, "snpma.fasta") pair_output_file = os.path.join(work_dir, "snp_distance_pairwise.tsv") matrix_output_file = os.path.join(work_dir, "snp_distance_matrix.tsv") command_line = "cfsan_snp_pipeline distance" + force_flag + "-p " + pair_output_file + " -m " + matrix_output_file + ' ' + input_file job_id_distance = runner.run(command_line, "distance", log_file, wait_for=[job_id_snp_matrix]) progress("Step 11.2 - Calculate SNP distance matrix") log_file = os.path.join(log_dir, "distance_preserved.log") input_file = 
os.path.join(work_dir, "snpma_preserved.fasta") pair_output_file = os.path.join(work_dir, "snp_distance_pairwise_preserved.tsv") matrix_output_file = os.path.join(work_dir, "snp_distance_matrix_preserved.tsv") command_line = "cfsan_snp_pipeline distance" + force_flag + "-p " + pair_output_file + " -m " + matrix_output_file + ' ' + input_file job_id_distance2 = runner.run(command_line, "distance_preserved", log_file, wait_for=[job_id_snp_matrix2]) progress("Step 12 - Collect metrics for each sample") log_file = os.path.join(log_dir, "collectMetrics.log") output_file = "{1}/metrics" extra_params = os.environ.get("CollectMetrics_ExtraParams", "") command_line = "cfsan_snp_pipeline collect_metrics" + force_flag + "-o " + output_file + ' ' + extra_params + " {1} " + reference_file_path job_id_collect_metrics = runner.run_array( command_line, "collectMetrics", log_file, sample_dirs_file, max_processes=max_cpu_cores, wait_for_array=[job_id_call_consensus, job_id_call_consensus2], slot_dependency=True) progress( "Step 13 - Combine the metrics across all samples into the metrics table" ) log_file = os.path.join(log_dir, "combineMetrics.log") output_file = os.path.join(work_dir, "metrics.tsv") extra_params = os.environ.get("CombineMetrics_ExtraParams", "") command_line = "cfsan_snp_pipeline combine_metrics" + force_flag + "-n metrics -o " + output_file + ' ' + extra_params + ' ' + sample_dirs_file combine_metrics_job_id = runner.run( command_line, "combineMetrics", log_file, wait_for_array=[job_id_collect_metrics]) # Decide whether to purge the intermediate output files upon successful completion. # Case 1: we are running on the HPC. We always need to submit the purge task. It will decide to do nothing if there were errors. if job_queue_mgr is not None: # HPC need_purge = args.purge # need to submit the purge task, it might decide to do nothing if there were errors # Case 2: we are running locally and we know right now whether there were any errors. 
# Case 2a: We are configured to stop on error, but the fact that we got this far means there were no errors -- so we need to purge. # Case 2b: We are configured to ignore errors, so now we look for evidence of errors and purge if there were no errors. else: errors_detected = os.path.isfile(error_output_file) need_purge = args.purge and not errors_detected if need_purge: progress("Step 14 - Purge the intermediate output files") log_file = os.path.join(log_dir, "purge.log") command_line = "cfsan_snp_pipeline purge " + work_dir purge_job_id = runner.run(command_line, "purge", log_file, wait_for=[combine_metrics_job_id]) # Step 15 - Notify user of any non-fatal errors accumulated during processing if os.path.isfile(error_output_file) and os.path.getsize( error_output_file) > 0 and not stop_on_error: print( "\nThere were errors processing some samples.\nSee the log file %s for a summary of errors." % error_output_file, file=sys.stderr) # Exit here to prevent showing the "cfsan_snp_pipeline run finished" message. The jobs are queued, not finished yet. if job_queue_mgr is not None: # HPC sys.exit(0) else: end_time = time.time() elapsed_time = end_time - start_time print("Elapsed time =", elapsed_time)
def collect_metrics(args):
    """Collect the quality metrics and SNP metrics for a sample.

    This function expects, or creates '(*)', the following files
    arranged in the following way:
            reference
                referenceFile.fasta
            samples
                sample_name_one/*.fastq.gz
                sample_name_one/reads.sam
                sample_name_one/reads.sorted.deduped.bam
                sample_name_one/reads.sorted.bam
                sample_name_one/reads.all.pileup
                sample_name_one/var.flt.vcf
                sample_name_one/var.flt_preserved.vcf
                sample_name_one/consensus.fasta
                sample_name_one/consensus_preserved.fasta
                sample_name_one/consensus.vcf
                sample_name_one/consensus_preserved.vcf
                sample_name_one/metrics*

    The input files are created outside of this function. The package
    documentation provides an example of preparing these files based on the
    lambda_virus sequence that is used as one test for this package.

    A metric already present in an existing metrics file is reused when the
    metrics file does not need a rebuild relative to the corresponding input
    file and ``forceFlag`` is not set; otherwise it is recalculated.  The
    metrics file is always rewritten at the end.

    Parameters
    ----------
    args : argparse.Namespace
        referenceFile : File path of the reference fasta file
        sampleDir : Relative or absolute directory of the sample
        consensusFastaFileName : File name of the consensus fasta file which
            must exist in the sample directory
        consensusPreservedFastaFileName : File name of the consensus preserved
            fasta file which must exist in the sample directory
        consensusVcfFileName : File name of the consensus vcf file which must
            exist in the sample directory
        consensusPreservedVcfFileName : File name of the consensus preserved
            vcf file which must exist in the sample directory
        maxSnps : Maximum allowed number of SNPs per sample
        metricsFile : Output file. Relative or absolute path to the metrics
            file
        forceFlag : When true, every metric is recalculated even if the
            existing metrics file looks fresh
    """

    def percent_of_total(count, total):
        """Format 100*count/total with two decimals, or "" if not computable."""
        try:
            return "%.2f" % (100.0 * float(count) / float(total))
        except (ValueError, ZeroDivisionError):
            # A missing total or a SAM file with zero reads must not crash the
            # run; the empty value is reported as a missing metric below.
            return ""

    def stored_count(text):
        """Convert a count read back from the metrics file to int, else None."""
        try:
            return int(text)
        except (TypeError, ValueError):
            return None

    utils.print_log_header(classpath=True)
    utils.print_arguments(args)

    #==========================================================================
    # Validate inputs
    #==========================================================================

    # Verify reference fasta file exists and is not empty
    reference_file_path = args.referenceFile
    utils.verify_non_empty_input_files("Reference file", [reference_file_path], error_handler="global")

    sample_dir = args.sampleDir
    utils.verify_non_empty_directory("Sample directory", sample_dir, error_handler="sample", continue_possible=False)

    metrics_file_path = args.metricsFile
    max_allowed_snps = args.maxSnps
    consensus_vcf_file_name = args.consensusVcfFileName
    consensus_preserved_vcf_file_name = args.consensusPreservedVcfFileName
    consensus_fasta_file_name = args.consensusFastaFileName
    consensus_preserved_fasta_file_name = args.consensusPreservedFastaFileName

    sample_id = utils.sample_id_from_dir(sample_dir)

    #==========================================================================
    # Read existing metrics file so some metrics can be reused
    #==========================================================================
    try:
        metrics = utils.read_properties(metrics_file_path)
    except IOError:
        metrics = dict()

    #-------------------------
    verbose_print("# %s %s" % (utils.timestamp(), "Get machine and flowcell from fastq header"))
    #-------------------------
    machine = ""
    flowcell = ""
    fastq_files = fastq.list_fastq_files(sample_dir)
    fastq_files = [f for f in fastq_files if os.path.isfile(f)]  # Exclude broken symlinks
    if not fastq_files:
        handle_error("No fastq files were found.")
    else:
        tags = fastq.extract_metadata_tags(fastq_files[0])
        if tags:
            machine = tags.instrument or ""
            flowcell = tags.flow_cell or ""

    #-------------------------
    verbose_print("# %s %s" % (utils.timestamp(), "Sum file sizes of paired fastq files"))
    #-------------------------
    fastq_file_size = ""
    fastq_file_list = ""
    if fastq_files:
        fastq_file_size = sum(os.path.getsize(path) for path in fastq_files)

        # Make a comma separated list of just the fastq file names without directories
        fastq_file_list = ", ".join(os.path.basename(path) for path in fastq_files)

    #-------------------------
    verbose_print("# %s %s" % (utils.timestamp(), "Calculate number of reads, %mapped, %proper pair, and ave insert size from sam file"))
    #-------------------------
    num_reads = ""
    percent_reads_mapped = ""
    percent_proper_pair = ""
    ave_insert_size = ""
    sam_path = os.path.join(sample_dir, "reads.sam")
    if verify_input_file("SAM file", sam_path):
        # Metrics already freshly collected?
        needs_rebuild = utils.target_needs_rebuild([sam_path], metrics_file_path)
        if not args.forceFlag and not needs_rebuild:
            # reuse already fresh metrics
            num_reads = metrics.get("numberReads", "")
            percent_reads_mapped = metrics.get("percentReadsMapped", "")
            percent_proper_pair = metrics.get("percentProperPair", "")
            ave_insert_size = metrics.get("aveInsertSize", "")
        missing_any_metrics = not all([num_reads, percent_reads_mapped, percent_proper_pair, ave_insert_size])
        if not missing_any_metrics:
            verbose_print("Reusing previously calculated number of reads, %mapped, %proper pair, and ave insert size")
        else:
            stats_path = os.path.join(sample_dir, "tmp.sam.stats")
            try:
                command.run("samtools stats " + sam_path, stats_path)
            except subprocess.CalledProcessError:
                pass  # the error message has already been printed to stderr
            try:
                with open(stats_path) as stats:
                    for line in stats:
                        lower_line = line.lower()
                        split_line = line.strip().split('\t')
                        if "raw total sequences:" in lower_line:
                            num_reads = split_line[2]
                        elif "reads mapped:" in lower_line:
                            percent_reads_mapped = percent_of_total(split_line[2], num_reads)
                        elif "reads properly paired:" in lower_line:
                            percent_proper_pair = percent_of_total(split_line[2], num_reads)
                        elif "insert size average:" in lower_line:
                            ave_insert_size = split_line[2]
            finally:
                # Never leave the temporary stats file behind, even when
                # parsing fails part way through.
                if os.path.exists(stats_path):
                    os.unlink(stats_path)

            missing_any_metrics = not all([num_reads, percent_reads_mapped, percent_proper_pair, ave_insert_size])
            if missing_any_metrics:
                missing_list = []
                if not num_reads:
                    missing_list.append("number of reads")
                if not percent_reads_mapped:
                    missing_list.append("percent reads mapped")
                if not percent_proper_pair:
                    missing_list.append("percent proper pair")
                if not ave_insert_size:
                    missing_list.append("ave insert size")
                error_text = "Cannot calculate " + ", ".join(missing_list) + '.'
                handle_error(error_text)

    #-------------------------
    # Calculate number of duplicate reads from deduped bam file
    #-------------------------
    num_dup_reads = ""
    remove_duplicate_reads = os.environ.get("RemoveDuplicateReads") or "true"
    remove_duplicate_reads = remove_duplicate_reads.lower()
    if remove_duplicate_reads == "true":
        verbose_print("# %s %s" % (utils.timestamp(), "Calculate number of duplicate reads from deduped bam file"))
        bam_path = os.path.join(sample_dir, "reads.sorted.deduped.bam")
        if verify_input_file("Deduped BAM file", bam_path):
            # Metrics already freshly collected?
            needs_rebuild = utils.target_needs_rebuild([bam_path], metrics_file_path)
            if not args.forceFlag and not needs_rebuild:
                num_dup_reads = metrics.get("numberDupReads", "")  # reuse already fresh metrics
            if num_dup_reads:
                verbose_print("Reusing previously calculated number of duplicate reads")
            else:
                num_dup_reads = command.run("samtools view -S -c -f 1024 " + bam_path)
                num_dup_reads = num_dup_reads.strip()

    #-------------------------
    verbose_print("# %s %s" % (utils.timestamp(), "Calculate mean depth from pileup file"))
    #-------------------------
    ave_pileup_depth = ""
    pileup_path = os.path.join(sample_dir, "reads.all.pileup")
    if verify_input_file("Pileup file", pileup_path):
        # Metrics already freshly collected?
        needs_rebuild = utils.target_needs_rebuild([pileup_path], metrics_file_path)
        if not args.forceFlag and not needs_rebuild:
            ave_pileup_depth = metrics.get("avePileupDepth", "")  # reuse already fresh metrics
        if ave_pileup_depth:
            verbose_print("Reusing previously calculated mean pileup depth")
        else:
            depth_sum = 0
            with open(pileup_path) as pileup:
                for line in pileup:
                    tokens = line.split()
                    try:
                        depth_sum += int(tokens[3])
                    except (ValueError, IndexError):
                        pass  # skip lines without an integer in the fourth column

            reference_length = sum(len(record) for record in SeqIO.parse(reference_file_path, "fasta"))

            if depth_sum > 0 and reference_length > 0:
                ave_pileup_depth = "%.2f" % (float(depth_sum) / float(reference_length))
            else:
                handle_error("Cannot calculate mean pileup depth.")

    #-------------------------
    verbose_print("# %s %s" % (utils.timestamp(), "Count number of high confidence SNP positions from phase 1 vcf file"))
    #-------------------------
    phase1_snps = ""
    excluded_sample = ""
    vcf_path = os.path.join(sample_dir, "var.flt.vcf")
    if verify_input_file("VCF file", vcf_path):
        # Metrics already freshly collected?
        needs_rebuild = utils.target_needs_rebuild([vcf_path], metrics_file_path)
        if not args.forceFlag and not needs_rebuild:
            phase1_snps = metrics.get("phase1Snps", "")  # reuse already fresh metrics
        # A reused value is text read from the metrics file; convert it so the
        # comparison with max_allowed_snps below is numeric.
        snp_count = stored_count(phase1_snps)
        if snp_count is not None:
            verbose_print("Reusing previously calculated phase1 snps")
        else:
            snp_count = count_vcf_file_snps(vcf_path)

        # Flag excessive snps
        if max_allowed_snps > 0 and snp_count > max_allowed_snps:
            excluded_sample = "Excluded"
            handle_error("Excluded: exceeded %i maxsnps." % max_allowed_snps)
        phase1_snps = str(snp_count)

    #-------------------------
    verbose_print("# %s %s" % (utils.timestamp(), "Count number of filter_regions preserved high confidence SNP positions from phase 1 vcf file"))
    #-------------------------
    phase1_snps_preserved = ""
    excluded_sample_preserved = ""
    vcf_path = os.path.join(sample_dir, "var.flt_preserved.vcf")
    if verify_input_file("VCF file", vcf_path):
        # Metrics already freshly collected?
        needs_rebuild = utils.target_needs_rebuild([vcf_path], metrics_file_path)
        if not args.forceFlag and not needs_rebuild:
            phase1_snps_preserved = metrics.get("phase1SnpsPreserved", "")  # reuse already fresh metrics
        # Same string-to-int normalization as for the original VCF above.
        snp_count = stored_count(phase1_snps_preserved)
        if snp_count is not None:
            verbose_print("Reusing previously calculated preserved phase1 snps")
        else:
            snp_count = count_vcf_file_snps(vcf_path)

        # Flag excessive snps
        if max_allowed_snps > 0 and snp_count > max_allowed_snps:
            excluded_sample_preserved = "Excluded"
            handle_error("Excluded: preserved exceeded %i maxsnps." % max_allowed_snps)
        phase1_snps_preserved = str(snp_count)

    #-------------------------
    verbose_print("# %s %s" % (utils.timestamp(), "Count number of consensus snps from consensus vcf file"))
    #-------------------------
    phase2_snps = ""
    vcf_path = os.path.join(sample_dir, consensus_vcf_file_name)
    if verify_input_file("Consensus VCF file", vcf_path):
        # Omit the phase2 snp count if the sample is excluded.
        # It will be meaningless since this sample's phase1 snps are excluded from the snplist.
        if excluded_sample != "Excluded":
            # Metrics already freshly collected?
            needs_rebuild = utils.target_needs_rebuild([vcf_path], metrics_file_path)
            if not args.forceFlag and not needs_rebuild:
                phase2_snps = metrics.get("snps", "")  # reuse already fresh metrics
            if phase2_snps:
                verbose_print("Reusing previously calculated phase2 snps")
            else:
                phase2_snps = count_vcf_file_snps(vcf_path)
            phase2_snps = str(phase2_snps)

    #-------------------------
    verbose_print("# %s %s" % (utils.timestamp(), "Count number of preserved consensus snps from consensus vcf file"))
    #-------------------------
    phase2_snps_preserved = ""
    vcf_path = os.path.join(sample_dir, consensus_preserved_vcf_file_name)
    if verify_input_file("Consensus VCF file", vcf_path):
        # Omit the phase2 snp count if the sample is excluded.
        # It will be meaningless since this sample's phase1 snps are excluded from the snplist.
        if excluded_sample_preserved != "Excluded":
            # Metrics already freshly collected?
            needs_rebuild = utils.target_needs_rebuild([vcf_path], metrics_file_path)
            if not args.forceFlag and not needs_rebuild:
                phase2_snps_preserved = metrics.get("snpsPreserved", "")  # reuse already fresh metrics
            if phase2_snps_preserved:
                verbose_print("Reusing previously calculated preserved phase2 snps")
            else:
                phase2_snps_preserved = count_vcf_file_snps(vcf_path)
            phase2_snps_preserved = str(phase2_snps_preserved)

    #------------------------------------------
    verbose_print("# %s %s" % (utils.timestamp(), "Count missing positions in the snp matrix"))
    #------------------------------------------
    missing_pos = ""
    fasta_path = os.path.join(sample_dir, consensus_fasta_file_name)
    if verify_input_file("Consensus fasta file", fasta_path):
        # Omit the phase2 gap count if the sample is excluded.
        # It will be meaningless since this sample's phase1 snps are excluded from the snplist.
        if excluded_sample != "Excluded":
            # Metrics already freshly collected?
            needs_rebuild = utils.target_needs_rebuild([fasta_path], metrics_file_path)
            if not args.forceFlag and not needs_rebuild:
                missing_pos = metrics.get("missingPos", "")  # reuse already fresh metrics
            if missing_pos:
                verbose_print("Reusing previously calculated missing positions")
            else:
                missing_pos = count_missing_snp_matrix_positions(fasta_path, sample_id)
            missing_pos = str(missing_pos)

    #------------------------------------------
    verbose_print("# %s %s" % (utils.timestamp(), "Count missing positions in the preserved snp matrix"))
    #------------------------------------------
    missing_pos_preserved = ""
    fasta_path = os.path.join(sample_dir, consensus_preserved_fasta_file_name)
    if verify_input_file("Consensus fasta file", fasta_path):
        # Omit the phase2 gap count if the sample is excluded.
        # It will be meaningless since this sample's phase1 snps are excluded from the snplist.
        if excluded_sample_preserved != "Excluded":
            # Metrics already freshly collected?
            needs_rebuild = utils.target_needs_rebuild([fasta_path], metrics_file_path)
            if not args.forceFlag and not needs_rebuild:
                missing_pos_preserved = metrics.get("missingPosPreserved", "")  # reuse already fresh metrics
            if missing_pos_preserved:
                verbose_print("Reusing previously calculated missing positions")
            else:
                missing_pos_preserved = count_missing_snp_matrix_positions(fasta_path, sample_id)
            missing_pos_preserved = str(missing_pos_preserved)

    #-------------------------
    verbose_print("# %s %s" % (utils.timestamp(), "Print results"))
    #-------------------------
    output_lines = [
        "sample=" + '"' + sample_id + '"',
        "fastqFileList=" + '"' + fastq_file_list + '"',
        "fastqFileSize=" + str(fastq_file_size),
        "machine=" + machine,
        "flowcell=" + flowcell,
        "numberReads=" + num_reads,
        "numberDupReads=" + num_dup_reads,
        "percentReadsMapped=" + percent_reads_mapped,
        "percentProperPair=" + percent_proper_pair,
        "aveInsertSize=" + ave_insert_size,
        "avePileupDepth=" + ave_pileup_depth,
        "phase1Snps=" + phase1_snps,
        "phase1SnpsPreserved=" + phase1_snps_preserved,
        "snps=" + phase2_snps,
        "snpsPreserved=" + phase2_snps_preserved,
        "missingPos=" + missing_pos,
        "missingPosPreserved=" + missing_pos_preserved,
        "excludedSample=" + excluded_sample,
        "excludedSamplePreserved=" + excluded_sample_preserved,
        "errorList=" + '"' + ' '.join(error_list) + '"',
    ]
    with open(metrics_file_path, "w") as f:
        for output_line in output_lines:
            print(output_line, file=f)
def combine_metrics(args):
    """Combine the per-sample metrics files into a single table of metrics
    for all samples.

    This function expects, or creates '(*)', the following files
    arranged in the following way:
            samples
                sample_name_one/metrics
            metrics.tsv

    All the input files are created outside of this function. Before running
    this command, the metrics file for each sample must be created by the
    collect_metrics command.

    The package documentation provides an example of preparing these files
    based on the lambda_virus sequence that is used as one test for this
    package.

    A sample whose metrics file is missing or empty contributes a single
    message line to the table instead of a row of values, and a sample
    warning is issued for it.

    Parameters
    ----------
    args : argparse.Namespace
        sampleDirsFile : Path to file containing a list of directories -- one
            per sample
        metricsFileName : File name of the metrics files which must exist in
            each of the sample directories
        mergedMetricsFile : Path to the output merged metrics file
        forceFlag : When true, rebuild the merged file even if it is fresh
        spaceHeadings : When true, keep spaces in the column headings instead
            of replacing them with underscores
    """
    utils.print_log_header()
    utils.print_arguments(args)

    #==========================================================================
    # Validate inputs
    #==========================================================================
    dirs_list_path = args.sampleDirsFile
    utils.verify_non_empty_input_files("File of sample directories", [dirs_list_path], error_handler="global")

    metrics_file_name = args.metricsFileName
    merged_metrics_path = args.mergedMetricsFile

    # One metrics file path per non-blank line of the sample directories file
    with open(dirs_list_path, "r") as dirs_file:
        stripped_lines = [entry.rstrip() for entry in dirs_file]
    metrics_files = [os.path.join(sample_dir, metrics_file_name) for sample_dir in stripped_lines if sample_dir]

    #==========================================================================
    # Check if merge has already been done
    #==========================================================================
    needs_rebuild = utils.target_needs_rebuild(metrics_files, merged_metrics_path)
    if not (args.forceFlag or needs_rebuild):
        verbose_print("# The merged metrics file is already freshly created. Use the -f option to force a rebuild.")
        return

    #==========================================================================
    # Parse the metrics files and print the tabular results
    #==========================================================================

    # Property names of the unquoted columns, in output order.  They sit
    # between the quoted sample / fastq list columns and the quoted error list.
    unquoted_keys = (
        "fastqFileSize",
        "machine",
        "flowcell",
        "numberReads",
        "numberDupReads",
        "percentReadsMapped",
        "percentProperPair",
        "aveInsertSize",
        "avePileupDepth",
        "phase1Snps",
        "phase1SnpsPreserved",
        "snps",
        "snpsPreserved",
        "missingPos",
        "missingPosPreserved",
        "excludedSample",
        "excludedSamplePreserved",
    )

    with open(merged_metrics_path, 'w') as merged:
        # Emit the column headings
        heading_row = '\t'.join((
            "Sample",
            "Fastq Files",
            "Fastq File Size",
            "Machine",
            "Flowcell",
            "Number of Reads",
            "Duplicate Reads",
            "Percent of Reads Mapped",
            "Percent Proper Pair",
            "Average Insert Size",
            "Average Pileup Depth",
            "Phase1 SNPs",
            "Phase1 Preserved SNPs",
            "Phase2 SNPs",
            "Phase2 Preserved SNPs",
            "Missing SNP Matrix Positions",
            "Missing Preserved SNP Matrix Positions",
            "Excluded Sample",
            "Excluded Preserved Sample",
            "Warnings and Errors",
        ))
        if not args.spaceHeadings:
            # The separators are tabs, so only spaces inside headings change
            heading_row = heading_row.replace(' ', '_')
        merged.write(heading_row + '\n')

        # Read the metrics from each sample, and emit the values
        for metrics_file in metrics_files:
            verbose_print("Processing " + metrics_file)

            if not os.path.isfile(metrics_file):
                problem = "Sample metrics file %s does not exist." % metrics_file
            elif os.path.getsize(metrics_file) == 0:
                problem = "Sample metrics file %s is empty." % metrics_file
            else:
                problem = None

            if problem:
                merged.write(problem + '\n')
                utils.sample_warning(problem)
                continue

            metrics = utils.read_properties(metrics_file)
            row = [
                quoted(metrics.get("sample", "")),
                quoted(metrics.get("fastqFileList", "")),
            ]
            row.extend(metrics.get(key, "") for key in unquoted_keys)
            row.append(quoted(metrics.get("errorList", "")))
            merged.write('\t'.join(row) + '\n')