def validate_file_of_sample_dirs(sample_dirs_file):
    """Verify each of the sample directories in the sample directory file is not
    empty and contains fastq's.

    Parameters
    ----------
    sample_dirs_file : str
        Path to the file of sample directories
    """
    found_error = False

    with open(sample_dirs_file) as f:
        for directory in f:
            directory = directory.strip()
            if not utils.verify_non_empty_directory("Sample directory", directory):
                found_error = True
            else:
                files = fastq.list_fastq_files(directory)
                if len(files) == 0:
                    utils.report_error("Sample directory %s does not contain any fastq files." % directory)
                    found_error = True

    if found_error:
        if os.environ.get("StopOnSampleError") == "true":
            sys.exit(1)
        else:
            log_error("================================================================================")
def get_sorted_sample_dirs_fastq_sizes(samples_parent_dir):
    """Given a parent directory containing multiple sample directories, return
    a list of the sample subdirectories with the size of the fastq files in each.

    Parameters
    ----------
    samples_parent_dir : str
        Path to the parent directory containing multiple sample directories

    Returns
    -------
    dir_sizes : list of tuples
        Sorted list of (size, path) tuples, largest first.
    """
    sub_dirs = [os.path.join(samples_parent_dir, d) for d in os.listdir(samples_parent_dir)
                if os.path.isdir(os.path.join(samples_parent_dir, d))]
    dir_sizes = []
    for d in sub_dirs:
        size = sum(map(os.path.getsize, fastq.list_fastq_files(d)))
        dir_sizes.append((size, d))
    dir_sizes.sort(reverse=True)
    return dir_sizes
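
# A minimal usage sketch for the two helpers above -- illustrative only, not
# called by the pipeline itself. It assumes a hypothetical "samples/" parent
# directory with one subdirectory per sample; get_sorted_sample_dirs_fastq_sizes
# returns (size, path) tuples sorted largest-first, which the pipeline later
# persists to sampleDirectories.txt.
#
#     dir_sizes = get_sorted_sample_dirs_fastq_sizes("samples")
#     for size, path in dir_sizes:
#         print("%12d  %s" % (size, path))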
def collect_metrics(args):
    """Collect the quality metrics and SNP metrics for a sample.

    This function expects, or creates '(*)', the following files arranged
    in the following way:
            reference
                referenceFile.fasta
            samples
                sample_name_one/*.fastq.gz
                sample_name_one/reads.sam
                sample_name_one/reads.sorted.deduped.bam
                sample_name_one/reads.sorted.bam
                sample_name_one/reads.all.pileup
                sample_name_one/var.flt.vcf
                sample_name_one/var.flt_preserved.vcf
                sample_name_one/consensus.fasta
                sample_name_one/consensus_preserved.fasta
                sample_name_one/consensus.vcf
                sample_name_one/consensus_preserved.vcf
                sample_name_one/metrics*

    The input files are created outside of this function. The package
    documentation provides an example of preparing these files based on the
    lambda_virus sequence that is used as one test for this package.

    Parameters
    ----------
    args : argparse.Namespace
        referenceFile : File path of the reference fasta file
        sampleDir : Relative or absolute directory of the sample
        consensusFastaFileName : File name of the consensus fasta file which must exist in the sample directory
        consensusPreservedFastaFileName : File name of the consensus preserved fasta file which must exist in the sample directory
        consensusVcfFileName : File name of the consensus vcf file which must exist in the sample directory
        consensusPreservedVcfFileName : File name of the consensus preserved vcf file which must exist in the sample directory
        maxSnps : Maximum allowed number of SNPs per sample
        metricsFile : Output file. Relative or absolute path to the metrics file
    """
    utils.print_log_header(classpath=True)
    utils.print_arguments(args)

    #==========================================================================
    # Validate inputs
    #==========================================================================

    # Verify reference fasta file exists and is not empty
    reference_file_path = args.referenceFile
    utils.verify_non_empty_input_files("Reference file", [reference_file_path], error_handler="global")

    sample_dir = args.sampleDir
    utils.verify_non_empty_directory("Sample directory", sample_dir, error_handler="sample", continue_possible=False)

    metrics_file_path = args.metricsFile
    max_allowed_snps = args.maxSnps
    consensus_vcf_file_name = args.consensusVcfFileName
    consensus_preserved_vcf_file_name = args.consensusPreservedVcfFileName
    consensus_fasta_file_name = args.consensusFastaFileName
    consensus_preserved_fasta_file_name = args.consensusPreservedFastaFileName

    sample_id = utils.sample_id_from_dir(sample_dir)

    #==========================================================================
    # Read existing metrics file so some metrics can be reused
    #==========================================================================
    try:
        metrics = utils.read_properties(metrics_file_path)
    except IOError:
        metrics = dict()

    #-------------------------
    verbose_print("# %s %s" % (utils.timestamp(), "Get machine and flowcell from fastq header"))
    #-------------------------
    machine = ""
    flowcell = ""
    fastq_files = fastq.list_fastq_files(sample_dir)
    fastq_files = [f for f in fastq_files if os.path.isfile(f)]  # Exclude broken symlinks
    if not fastq_files:
        handle_error("No fastq files were found.")
    else:
        tags = fastq.extract_metadata_tags(fastq_files[0])
        if tags:
            machine = tags.instrument or ""
            flowcell = tags.flow_cell or ""

    #-------------------------
    verbose_print("# %s %s" % (utils.timestamp(), "Sum file sizes of paired fastq files"))
    #-------------------------
    fastq_file_size = ""
    fastq_file_list = ""
    if fastq_files:
        fastq_file_size = sum([os.path.getsize(file) for file in fastq_files])

        # Make a comma separated list of just the fastq file names without directories
        fastq_file_list = [os.path.basename(file) for file in fastq_files]
        fastq_file_list = ", ".join(fastq_file_list)

    #-------------------------
    verbose_print("# %s %s" % (utils.timestamp(), "Calculate number of reads and %mapped from sam file"))
    #-------------------------
    num_reads = ""
    percent_reads_mapped = ""
    file = os.path.join(sample_dir, "reads.sam")
    if verify_input_file("SAM file", file):
        # Metrics already freshly collected?
        needs_rebuild = utils.target_needs_rebuild([file], metrics_file_path)
        if not args.forceFlag and not needs_rebuild:
            num_reads = metrics.get("numberReads", "")  # reuse already fresh metrics
            percent_reads_mapped = metrics.get("percentReadsMapped", "")  # reuse already fresh metrics
        if num_reads and percent_reads_mapped:
            verbose_print("Reusing previously calculated number of reads and %mapped")
        else:
            num_reads = command.run("samtools view -S -c " + file)
            num_reads = num_reads.strip()
            mapped = command.run("samtools view -S -c -F 4 " + file)
            mapped = mapped.strip()
            try:
                percent_reads_mapped = 100.0 * float(mapped) / float(num_reads)
                percent_reads_mapped = "%.2f" % percent_reads_mapped
            except ValueError:
                handle_error("Cannot calculate number of reads and %mapped.")

    #-------------------------
    # Calculate number of duplicate reads from deduped bam file
    #-------------------------
    num_dup_reads = ""
    remove_duplicate_reads = os.environ.get("RemoveDuplicateReads") or "true"
    remove_duplicate_reads = remove_duplicate_reads.lower()
    if remove_duplicate_reads == "true":
        verbose_print("# %s %s" % (utils.timestamp(), "Calculate number of duplicate reads from deduped bam file"))
        file = os.path.join(sample_dir, "reads.sorted.deduped.bam")
        if verify_input_file("Deduped BAM file", file):
            # Metrics already freshly collected?
            needs_rebuild = utils.target_needs_rebuild([file], metrics_file_path)
            if not args.forceFlag and not needs_rebuild:
                num_dup_reads = metrics.get("numberDupReads", "")  # reuse already fresh metrics
            if num_dup_reads:
                verbose_print("Reusing previously calculated number of duplicate reads")
            else:
                num_dup_reads = command.run("samtools view -S -c -f 1024 " + file)
                num_dup_reads = num_dup_reads.strip()

    #-------------------------
    verbose_print("# %s %s" % (utils.timestamp(), "Calculate mean insert size from bam file"))
    #-------------------------
    ave_insert_size = ""
    file = os.path.join(sample_dir, "reads.sorted.bam")
    if verify_input_file("BAM file", file):
        # Metrics already freshly collected?
        needs_rebuild = utils.target_needs_rebuild([file], metrics_file_path)
        if not args.forceFlag and not needs_rebuild:
            ave_insert_size = metrics.get("aveInsertSize", "")  # reuse already fresh metrics
        if ave_insert_size:
            verbose_print("Reusing previously calculated mean insert size")
        else:
            # Extract inferred insert sizes (TLEN, column 9 of BAM file) for reads "mapped in proper pair" (2) and "first in pair" (64) = 66
            tempfile = NamedTemporaryFile(delete=False, dir=sample_dir, prefix="tmp.inserts.", mode='w')
            command.run("samtools view -f 66 " + file + " | cut -f 9 | sed 's/^-//'", tempfile.name)
            insert_count = 0
            insert_sum = 0
            with open(tempfile.name) as f:
                for line in f:
                    try:
                        insert_sum += int(line)
                        insert_count += 1
                    except ValueError:
                        pass
            os.unlink(tempfile.name)
            if insert_count > 0 and insert_sum > 0:
                ave_insert_size = float(insert_sum) / float(insert_count)
                ave_insert_size = "%.2f" % ave_insert_size
            else:
                handle_error("Cannot calculate mean insert size.")

    #-------------------------
    verbose_print("# %s %s" % (utils.timestamp(), "Calculate mean depth from pileup file"))
    #-------------------------
    ave_pileup_depth = ""
    file = os.path.join(sample_dir, "reads.all.pileup")
    if verify_input_file("Pileup file", file):
        # Metrics already freshly collected?
        needs_rebuild = utils.target_needs_rebuild([file], metrics_file_path)
        if not args.forceFlag and not needs_rebuild:
            ave_pileup_depth = metrics.get("avePileupDepth", "")  # reuse already fresh metrics
        if ave_pileup_depth:
            verbose_print("Reusing previously calculated mean pileup depth")
        else:
            depth_sum = 0
            with open(file) as f:
                for line in f:
                    tokens = line.split()
                    try:
                        depth_sum += int(tokens[3])
                    except (ValueError, IndexError):
                        pass
            reference_length = 0
            for record in SeqIO.parse(reference_file_path, "fasta"):
                reference_length += len(record)
            if depth_sum > 0 and reference_length > 0:
                #print("depth_sum=%i" % depth_sum)
                #print("reference_length=%i" % reference_length)
                ave_pileup_depth = float(depth_sum) / float(reference_length)
                ave_pileup_depth = "%.2f" % ave_pileup_depth
            else:
                handle_error("Cannot calculate mean pileup depth.")

    #-------------------------
    verbose_print("# %s %s" % (utils.timestamp(), "Count number of high confidence SNP positions from phase 1 vcf file"))
    #-------------------------
    phase1_snps = ""
    excluded_sample = ""
    file = os.path.join(sample_dir, "var.flt.vcf")
    if verify_input_file("VCF file", file):
        # Metrics already freshly collected?
        needs_rebuild = utils.target_needs_rebuild([file], metrics_file_path)
        if not args.forceFlag and not needs_rebuild:
            phase1_snps = metrics.get("phase1Snps", "")  # reuse already fresh metrics
        if phase1_snps:
            verbose_print("Reusing previously calculated phase1 snps")
        else:
            phase1_snps = count_vcf_file_snps(file)

            # Flag excessive snps
            if max_allowed_snps > 0 and phase1_snps > max_allowed_snps:
                excluded_sample = "Excluded"
                handle_error("Excluded: exceeded %i maxsnps." % max_allowed_snps)
            phase1_snps = str(phase1_snps)

    #-------------------------
    verbose_print("# %s %s" % (utils.timestamp(), "Count number of filter_regions preserved high confidence SNP positions from phase 1 vcf file"))
    #-------------------------
    phase1_snps_preserved = ""
    excluded_sample_preserved = ""
    file = os.path.join(sample_dir, "var.flt_preserved.vcf")
    if verify_input_file("VCF file", file):
        # Metrics already freshly collected?
        needs_rebuild = utils.target_needs_rebuild([file], metrics_file_path)
        if not args.forceFlag and not needs_rebuild:
            phase1_snps_preserved = metrics.get("phase1SnpsPreserved", "")  # reuse already fresh metrics
        if phase1_snps_preserved:
            verbose_print("Reusing previously calculated preserved phase1 snps")
        else:
            phase1_snps_preserved = count_vcf_file_snps(file)

            # Flag excessive snps
            if max_allowed_snps > 0 and phase1_snps_preserved > max_allowed_snps:
                excluded_sample_preserved = "Excluded"
                handle_error("Excluded: preserved exceeded %i maxsnps." % max_allowed_snps)
            phase1_snps_preserved = str(phase1_snps_preserved)

    #-------------------------
    verbose_print("# %s %s" % (utils.timestamp(), "Count number of consensus snps from consensus vcf file"))
    #-------------------------
    phase2_snps = ""
    file = os.path.join(sample_dir, consensus_vcf_file_name)
    if verify_input_file("Consensus VCF file", file):
        # Omit the phase2 snp count if the sample is excluded.
        # It will be meaningless since this sample's phase1 snps are excluded from the snplist.
        if excluded_sample != "Excluded":
            # Metrics already freshly collected?
            needs_rebuild = utils.target_needs_rebuild([file], metrics_file_path)
            if not args.forceFlag and not needs_rebuild:
                phase2_snps = metrics.get("snps", "")  # reuse already fresh metrics
            if phase2_snps:
                verbose_print("Reusing previously calculated phase2 snps")
            else:
                phase2_snps = count_vcf_file_snps(file)
                phase2_snps = str(phase2_snps)

    #-------------------------
    verbose_print("# %s %s" % (utils.timestamp(), "Count number of preserved consensus snps from consensus vcf file"))
    #-------------------------
    phase2_snps_preserved = ""
    file = os.path.join(sample_dir, consensus_preserved_vcf_file_name)
    if verify_input_file("Consensus VCF file", file):
        # Omit the phase2 snp count if the sample is excluded.
        # It will be meaningless since this sample's phase1 snps are excluded from the snplist.
        if excluded_sample_preserved != "Excluded":
            # Metrics already freshly collected?
            needs_rebuild = utils.target_needs_rebuild([file], metrics_file_path)
            if not args.forceFlag and not needs_rebuild:
                phase2_snps_preserved = metrics.get("snpsPreserved", "")  # reuse already fresh metrics
            if phase2_snps_preserved:
                verbose_print("Reusing previously calculated preserved phase2 snps")
            else:
                phase2_snps_preserved = count_vcf_file_snps(file)
                phase2_snps_preserved = str(phase2_snps_preserved)

    #------------------------------------------
    verbose_print("# %s %s" % (utils.timestamp(), "Count missing positions in the snp matrix"))
    #------------------------------------------
    missing_pos = ""
    file = os.path.join(sample_dir, consensus_fasta_file_name)
    if verify_input_file("Consensus fasta file", file):
        # Omit the phase2 gap count if the sample is excluded.
        # It will be meaningless since this sample's phase1 snps are excluded from the snplist.
        if excluded_sample != "Excluded":
            # Metrics already freshly collected?
            needs_rebuild = utils.target_needs_rebuild([file], metrics_file_path)
            if not args.forceFlag and not needs_rebuild:
                missing_pos = metrics.get("missingPos", "")  # reuse already fresh metrics
            if missing_pos:
                verbose_print("Reusing previously calculated missing positions")
            else:
                missing_pos = count_missing_snp_matrix_positions(file, sample_id)
                missing_pos = str(missing_pos)

    #------------------------------------------
    verbose_print("# %s %s" % (utils.timestamp(), "Count missing positions in the preserved snp matrix"))
    #------------------------------------------
    missing_pos_preserved = ""
    file = os.path.join(sample_dir, consensus_preserved_fasta_file_name)
    if verify_input_file("Consensus fasta file", file):
        # Omit the phase2 gap count if the sample is excluded.
        # It will be meaningless since this sample's phase1 snps are excluded from the snplist.
        if excluded_sample_preserved != "Excluded":
            # Metrics already freshly collected?
            needs_rebuild = utils.target_needs_rebuild([file], metrics_file_path)
            if not args.forceFlag and not needs_rebuild:
                missing_pos_preserved = metrics.get("missingPosPreserved", "")  # reuse already fresh metrics
            if missing_pos_preserved:
                verbose_print("Reusing previously calculated missing positions")
            else:
                missing_pos_preserved = count_missing_snp_matrix_positions(file, sample_id)
                missing_pos_preserved = str(missing_pos_preserved)

    #-------------------------
    verbose_print("# %s %s" % (utils.timestamp(), "Print results"))
    #-------------------------
    with open(metrics_file_path, "w") as f:
        print("sample=" + '"' + sample_id + '"', file=f)
        print("fastqFileList=" + '"' + fastq_file_list + '"', file=f)
        print("fastqFileSize=" + str(fastq_file_size), file=f)
        print("machine=" + machine, file=f)
        print("flowcell=" + flowcell, file=f)
        print("numberReads=" + num_reads, file=f)
        print("numberDupReads=" + num_dup_reads, file=f)
        print("percentReadsMapped=" + percent_reads_mapped, file=f)
        print("aveInsertSize=" + ave_insert_size, file=f)
        print("avePileupDepth=" + ave_pileup_depth, file=f)
        print("phase1Snps=" + phase1_snps, file=f)
        print("phase1SnpsPreserved=" + phase1_snps_preserved, file=f)
        print("snps=" + phase2_snps, file=f)
        print("snpsPreserved=" + phase2_snps_preserved, file=f)
        print("missingPos=" + missing_pos, file=f)
        print("missingPosPreserved=" + missing_pos_preserved, file=f)
        print("excludedSample=" + excluded_sample, file=f)
        print("excludedSamplePreserved=" + excluded_sample_preserved, file=f)
        print("errorList=" + '"' + ' '.join(error_list) + '"', file=f)
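
# The metrics file written above is a flat key=value properties file, one metric
# per line (for example, numberReads=842488). A minimal sketch of reading it back
# outside the pipeline, without the utils.read_properties helper -- the path
# "samples/sample_name_one/metrics" is a hypothetical example taken from the
# docstring layout above:
#
#     metrics = {}
#     with open("samples/sample_name_one/metrics") as f:
#         for line in f:
#             key, _, value = line.rstrip("\n").partition("=")
#             metrics[key] = value.strip('"')
#     print(metrics.get("numberReads"), metrics.get("percentReadsMapped"))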
def run(args):
    """Run all the steps of the snp pipeline in the correct order.

    Parameters
    ----------
    args : Namespace
        referenceFile : str
            Relative or absolute path to the reference fasta file
        forceFlag : bool
            Force processing even when result files already exist and are newer than inputs
        mirror : str
            Mode to create a mirror copy of the reference directory and all the sample directories.
            Possible values: {soft, hard, copy}
        configFile : str
            Relative or absolute path to a configuration file for overriding defaults and defining
            extra parameters for the tools and scripts within the pipeline.
        jobQueueMgr : str
            Job queue manager for remote parallel job execution in an HPC environment.  Currently
            "torque" and "grid" are supported.  If not specified, the pipeline will execute locally.
        workDir : str
            Output directory for the result files.
        samplesDir : str
            Relative or absolute path to the parent directory of all the sample directories.
        samplesFile : str
            Relative or absolute path to a file listing all of the sample directories.
    """
    global log_dir
    global job_queue_mgr

    # Where are we running: grid, torque, or None (local)
    job_queue_mgr = args.jobQueueMgr

    # Erase any left-over error log environment variable from a previous run
    os.environ.pop("errorOutputFile", None)  # the 2nd arg avoids an exception when not in dict

    # Handle output working directory.  Create the directory if it does not exist.
    # Any errors creating the work_dir will not be logged to the error log because
    # the error log belongs in the work_dir.
    work_dir = args.workDir
    try:
        utils.mkdir_p(work_dir)
    except OSError as exc:
        utils.fatal_error("Error: could not create the output directory %s" % work_dir)
    if not utils.is_directory_writeable(work_dir):
        utils.fatal_error("Error: output directory %s is not writable." % work_dir)

    # The error log is in the main workdir
    error_output_file = os.path.join(work_dir, "error.log")
    os.environ["errorOutputFile"] = error_output_file
    # TODO: copy old error log to old logs directory, because otherwise it will be removed and lost forever
    if os.path.isfile(error_output_file):
        os.remove(error_output_file)

    # Validate reference fasta file
    reference_file_path = args.referenceFile
    if not os.path.isfile(reference_file_path):
        utils.fatal_error("Error: reference file %s does not exist." % reference_file_path)
    if os.path.getsize(reference_file_path) == 0:
        utils.fatal_error("Error: reference file %s is empty." % reference_file_path)
    reference_file_name = os.path.basename(reference_file_path)

    # Force rebuild flag is passed to all the subtask commands below
    force_flag = " -f " if args.forceFlag else " "

    # Create the logs directory with name like "logs-20170215.144253"
    run_time_stamp = time.strftime('%Y%m%d.%H%M%S', time.localtime())
    log_dir = os.path.join(work_dir, "logs-" + run_time_stamp)
    try:
        utils.mkdir_p(log_dir)
    except OSError as exc:
        utils.fatal_error("Error: could not create the logs directory %s" % log_dir)
    if not utils.is_directory_writeable(log_dir):
        utils.fatal_error("Error: logs directory %s is not writable." % log_dir)

    # Handle configuration file, use the specified file, or create a default file
    if args.configFile:
        config_file_path = args.configFile
        if not os.path.isfile(config_file_path):
            utils.fatal_error("Error: configuration file %s does not exist." % config_file_path)
        if os.path.getsize(config_file_path) == 0:
            utils.fatal_error("Error: configuration file %s is empty." % config_file_path)

        shutil.copy2(config_file_path, log_dir)  # copy2 tries to preserve timestamps
        config_params = utils.read_properties(config_file_path, recognize_vars=True)
        validate_properties(config_params)
    else:
        command.run("cfsan_snp_pipeline data configurationFile " + log_dir, outfile=sys.stdout)
        config_file_path = os.path.join(log_dir, "snppipeline.conf")
        config_params = utils.read_properties(config_file_path, recognize_vars=True)

    # Validate the configured aligner choice
    snp_pipeline_aligner = config_params.get("SnpPipeline_Aligner", "").lower() or "bowtie2"
    if snp_pipeline_aligner not in ["bowtie2", "smalt"]:
        utils.fatal_error("Config file error in SnpPipeline_Aligner parameter: only bowtie2 and smalt aligners are supported.")
    os.environ["SnpPipeline_Aligner"] = snp_pipeline_aligner

    # Stop the pipeline by default upon single sample errors if not configured either way
    # The environment variable is used by called processes
    stop_on_error = config_params.get("StopOnSampleError", "").lower() or "true"
    os.environ["StopOnSampleError"] = stop_on_error

    # Convert the stop_on_error flag to boolean for internal use in this function
    stop_on_error = stop_on_error == "true"

    # How many CPU cores can we use?
    max_cpu_cores = config_params.get("MaxCpuCores", None)
    if max_cpu_cores == "":
        max_cpu_cores = None
    if max_cpu_cores:
        try:
            max_cpu_cores = int(max_cpu_cores)
            if max_cpu_cores < 1:
                utils.fatal_error("Config file error in MaxCpuCores parameter: %s is less than one." % max_cpu_cores)
        except ValueError:
            utils.fatal_error("Config file error in MaxCpuCores parameter: %s is not a valid number." % max_cpu_cores)
    if job_queue_mgr is None:  # workstation
        num_local_cpu_cores = psutil.cpu_count()
        max_cpu_cores = min(num_local_cpu_cores, max_cpu_cores) if max_cpu_cores else num_local_cpu_cores

    # Put the configuration parameters into the process environment variables
    os.environ["Bowtie2Build_ExtraParams"] = config_params.get("Bowtie2Build_ExtraParams", "")
    os.environ["SmaltIndex_ExtraParams"] = config_params.get("SmaltIndex_ExtraParams", "")
    os.environ["SamtoolsFaidx_ExtraParams"] = config_params.get("SamtoolsFaidx_ExtraParams", "")
    os.environ["Bowtie2Align_ExtraParams"] = config_params.get("Bowtie2Align_ExtraParams", "")
    os.environ["SmaltAlign_ExtraParams"] = config_params.get("SmaltAlign_ExtraParams", "")
    os.environ["SamtoolsSamFilter_ExtraParams"] = config_params.get("SamtoolsSamFilter_ExtraParams", "")
    os.environ["SamtoolsSort_ExtraParams"] = config_params.get("SamtoolsSort_ExtraParams", "")
    os.environ["RemoveDuplicateReads"] = config_params.get("RemoveDuplicateReads", "").lower() or "true"
    os.environ["PicardMarkDuplicates_ExtraParams"] = config_params.get("PicardMarkDuplicates_ExtraParams", "")
    os.environ["PicardJvm_ExtraParams"] = config_params.get("PicardJvm_ExtraParams", "")
    os.environ["SamtoolsMpileup_ExtraParams"] = config_params.get("SamtoolsMpileup_ExtraParams", "")
    os.environ["VarscanMpileup2snp_ExtraParams"] = config_params.get("VarscanMpileup2snp_ExtraParams", "")
    os.environ["VarscanJvm_ExtraParams"] = config_params.get("VarscanJvm_ExtraParams", "")
    os.environ["FilterRegions_ExtraParams"] = config_params.get("FilterRegions_ExtraParams", "")
    os.environ["MergeSites_ExtraParams"] = config_params.get("MergeSites_ExtraParams", "")
    os.environ["CallConsensus_ExtraParams"] = config_params.get("CallConsensus_ExtraParams", "")
    os.environ["SnpMatrix_ExtraParams"] = config_params.get("SnpMatrix_ExtraParams", "")
    os.environ["BcftoolsMerge_ExtraParams"] = config_params.get("BcftoolsMerge_ExtraParams", "")
    os.environ["SnpReference_ExtraParams"] = config_params.get("SnpReference_ExtraParams", "")
    os.environ["MergeVcfs_ExtraParams"] = config_params.get("MergeVcfs_ExtraParams", "")
    os.environ["CollectMetrics_ExtraParams"] = config_params.get("CollectMetrics_ExtraParams", "")
    os.environ["CombineMetrics_ExtraParams"] = config_params.get("CombineMetrics_ExtraParams", "")

    # Verify the dependencies are available on the path
    dependencies = ["cfsan_snp_pipeline", snp_pipeline_aligner, "samtools", "java", "tabix", "bgzip", "bcftools"]
    found_all_dependencies = True
    for executable in dependencies:
        if not utils.which(executable):
            utils.report_error(executable + " is not on the path")
            found_all_dependencies = False

    stdout = command.run("java net.sf.varscan.VarScan 2>&1")
    if "Error" in stdout:
        utils.report_error("CLASSPATH is not configured with the path to VarScan")
        found_all_dependencies = False

    if os.environ["RemoveDuplicateReads"] == "true":
        stdout = command.run("java picard.cmdline.PicardCommandLine 2>&1")
        if "Error" in stdout:
            utils.report_error("CLASSPATH is not configured with the path to Picard")
            found_all_dependencies = False

    if not found_all_dependencies:
        utils.fatal_error("Check the SNP Pipeline installation instructions here: http://snp-pipeline.readthedocs.org/en/latest/installation.html")

    # Process the sample directory command line option
    # TODO: detect broken fastq symlinks
    if args.samplesDir:
        samples_parent_dir = args.samplesDir.rstrip('/')  # strip trailing slash
        if not utils.verify_non_empty_directory("Samples directory", samples_parent_dir):
            sys.exit(1)

        # verify at least one of the subdirectories contains fastq files.
        dir_sizes = get_sorted_sample_dirs_fastq_sizes(samples_parent_dir)
        dir_sizes = [(size, path) for size, path in dir_sizes if size > 0]
        if len(dir_sizes) == 0:
            utils.fatal_error("Samples directory %s does not contain subdirectories with fastq files." % samples_parent_dir)

        sample_dirs_file = os.path.join(work_dir, "sampleDirectories.txt")
        persist_sorted_sample_dirs_file(samples_parent_dir, sample_dirs_file)

    # Process the file of sample directories command line option
    # TODO: detect broken fastq symlinks
    if args.samplesFile:
        sample_dirs_file = args.samplesFile
        if not os.path.isfile(sample_dirs_file):
            utils.fatal_error("Error: the file of samples directories, %s, does not exist." % sample_dirs_file)
        if os.path.getsize(sample_dirs_file) == 0:
            utils.fatal_error("Error: the file of samples directories, %s, is empty." % sample_dirs_file)
        rewrite_cleansed_file_of_sample_dirs(sample_dirs_file, os.path.join(work_dir, "sampleDirectories.txt"))
        sample_dirs_file = os.path.join(work_dir, "sampleDirectories.txt")
        validate_file_of_sample_dirs(sample_dirs_file)

    with open(sample_dirs_file) as f:
        sample_dirs_list = f.read().splitlines()
    sample_count = len(sample_dirs_list)

    # --------------------------------------------------------
    if job_queue_mgr is None:
        progress("Step 1 - Prep work")
    else:
        print("Step 1 - Prep work")
    # --------------------------------------------------------

    # Mirror the input reference and samples if requested
    # TODO: make this a pure python solution
    if args.mirror:
        if args.mirror == "soft":
            # soft link, subsequent freshness checks use the timestamp of original file, not the soft link
            mirror_flag = " -s "
        elif args.mirror == "hard":
            # hard link, automatically preserves attributes of the original file
            mirror_flag = " -l "
        else:
            # regular copy, -p explicitly preserves attributes of the original file
            mirror_flag = " -p "

        # flush stdout to keep the unbuffered stderr in chronological order with stdout
        sys.stdout.flush()

        # Mirror/link the reference
        work_reference_dir = os.path.join(work_dir, "reference")
        utils.mkdir_p(work_reference_dir)
        src_reference_file = os.path.abspath(reference_file_path)
        cmd = "cp -v -u -f" + mirror_flag + src_reference_file + ' ' + work_reference_dir
        subprocess.check_call(cmd, shell=True)

        # since we mirrored the reference, we need to update our reference location
        reference_file_path = os.path.join(work_reference_dir, reference_file_name)

        # Mirror/link the samples
        work_samples_parent_dir = os.path.join(work_dir, "samples")
        for directory in sample_dirs_list:
            basedir = os.path.basename(directory)
            work_sample_dir = os.path.join(work_samples_parent_dir, basedir)
            utils.mkdir_p(work_sample_dir)
            src_sample_dir = os.path.abspath(directory)
            # copy without stderr message and without exit error code because the fastq or fq files might not exist
            cmd = "cp -r -v -u -f" + mirror_flag + src_sample_dir + "/*.fastq* " + work_sample_dir + " 2> /dev/null || true"
            subprocess.check_call(cmd, shell=True)
            cmd = "cp -r -v -u -f" + mirror_flag + src_sample_dir + "/*.fq* " + work_sample_dir + " 2> /dev/null || true"
            subprocess.check_call(cmd, shell=True)

        # since we mirrored the samples, we need to update our sorted list of samples
        sample_dirs_file = os.path.join(work_dir, "sampleDirectories.txt")
        persist_sorted_sample_dirs_file(work_samples_parent_dir, sample_dirs_file)

        # refresh the list of sample dirs -- now in sorted order
        with open(sample_dirs_file) as f:
            sample_dirs_list = f.read().splitlines()

    # get the *.fastq or *.fq files in each sample directory, possibly compressed, on one line per sample, ready to feed to bowtie
    sample_full_path_names_file = os.path.join(work_dir, "sampleFullPathNames.txt")
    with open(sample_full_path_names_file, 'w') as f:
        for directory in sample_dirs_list:
            file_list = fastq.list_fastq_files(directory)
            print(' '.join(file_list), file=f)

    # Initialize the job runner
    if job_queue_mgr is None:
        runner = JobRunner("local", exception_handler=handle_exception, verbose=args.verbose >= 4)
    elif job_queue_mgr == "grid":
        strip_job_array_suffix = config_params.get("GridEngine_StripJobArraySuffix", "true").lower()
        qsub_extra_params = config_params.get("GridEngine_QsubExtraParams")
        runner = JobRunner(job_queue_mgr, strip_job_array_suffix == "true", qsub_extra_params=qsub_extra_params, verbose=args.verbose >= 4)
    else:
        strip_job_array_suffix = config_params.get("Torque_StripJobArraySuffix", "false").lower()
        qsub_extra_params = config_params.get("Torque_QsubExtraParams")
        runner = JobRunner(job_queue_mgr, strip_job_array_suffix == "true", qsub_extra_params=qsub_extra_params, verbose=args.verbose >= 4)

    progress("Step 2 - Index the reference")
    log_file = os.path.join(log_dir, "indexRef.log")
    command_line = "cfsan_snp_pipeline index_ref" + force_flag + reference_file_path
    job_id_index_ref = runner.run(command_line, "indexRef", log_file)

    progress("Step 3 - Map the sample reads to the reference")
    # Parse the user-specified aligner parameters to find the number of CPU cores requested, for example, "-p 16" or "-n 16"
    # Set the default number of CPU cores if the user did not configure a value.
    if snp_pipeline_aligner == "smalt":
        extra_params_env_var = "SmaltAlign_ExtraParams"
        threads_option = "-n"
    else:
        extra_params_env_var = "Bowtie2Align_ExtraParams"
        threads_option = "-p"
    max_processes, threads_per_process = configure_process_threads(extra_params_env_var, threads_option, 8, max_cpu_cores)
    parallel_environment = config_params.get("GridEngine_PEname", None)

    log_file = os.path.join(log_dir, "mapReads.log")
    command_line = "cfsan_snp_pipeline map_reads" + force_flag + reference_file_path + " {1} {2}"
    job_id_map_reads = runner.run_array(command_line, "mapReads", log_file, sample_full_path_names_file, max_processes=max_processes, wait_for=[job_id_index_ref], threads=threads_per_process, parallel_environment=parallel_environment)

    progress("Step 4 - Find sites with SNPs in each sample")
    if job_queue_mgr in ["grid", "torque"]:
        time.sleep(1.0 + float(sample_count) / 150)  # workaround torque bug when submitting two large consecutive array jobs, potential bug for grid

    log_file = os.path.join(log_dir, "callSites.log")
    command_line = "cfsan_snp_pipeline call_sites" + force_flag + reference_file_path + " {1}"
    job_id_call_sites = runner.run_array(command_line, "callSites", log_file, sample_dirs_file, max_processes=max_cpu_cores, wait_for_array=[job_id_map_reads], slot_dependency=True)

    progress("Step 5 - Filter abnormal SNP regions")
    log_file = os.path.join(log_dir, "filterRegions.log")
    extra_params = os.environ.get("FilterRegions_ExtraParams", "")
    command_line = "cfsan_snp_pipeline filter_regions" + force_flag + "-n var.flt.vcf " + sample_dirs_file + ' ' + reference_file_path + ' ' + extra_params
    job_id_filter_regions = runner.run(command_line, "filterRegions", log_file, wait_for_array=[job_id_call_sites])

    # Starting from here, there are 2 threads:
    # Thread X.1: the thread processing the original VCF files and corresponding downstream results
    # Thread X.2: the thread processing the preserved VCF files and corresponding downstream results

    progress("Step 6.1 - Merge the SNP sites across all samples into the SNP list file")
    # The mergeSites process creates the filtered list of sample directories.  It is the list of samples not having excessive snps.
    # When running on a workstation, the file exists at this point during the script execution, but on grid or torque,
    # it has not yet been created.  However, we know the path to the file regardless of whether it exists yet.
    filtered_sample_dirs_file = sample_dirs_file + ".OrigVCF.filtered"
    # touch $filtered_sample_dirs_file  # TODO: why was this touch here in the old run_snp_pipeline.sh script?
    log_file = os.path.join(log_dir, "mergeSites.log")
    output_file = os.path.join(work_dir, "snplist.txt")
    extra_params = os.environ.get("MergeSites_ExtraParams", "")
    command_line = "cfsan_snp_pipeline merge_sites" + force_flag + "-n var.flt.vcf -o " + output_file + ' ' + extra_params + ' ' + sample_dirs_file + ' ' + filtered_sample_dirs_file
    job_id_merge_sites = runner.run(command_line, "mergeSites", log_file, wait_for=[job_id_filter_regions])

    progress("Step 6.2 - Merge the SNP sites across all samples into the SNP list file")
    # Create another copy of sample directories file, for the thread processing preserved snp files.
    filtered_sample_dirs_file2 = sample_dirs_file + ".PresVCF.filtered"
    # touch $filtered_sample_dirs_file2  # TODO: why was this touch here in the old run_snp_pipeline.sh script?
    log_file = os.path.join(log_dir, "mergeSites_preserved.log")
    output_file = os.path.join(work_dir, "snplist_preserved.txt")
    extra_params = os.environ.get("MergeSites_ExtraParams", "")
    command_line = "cfsan_snp_pipeline merge_sites" + force_flag + "-n var.flt_preserved.vcf -o " + output_file + ' ' + extra_params + ' ' + sample_dirs_file + ' ' + filtered_sample_dirs_file2
    job_id_merge_sites2 = runner.run(command_line, "mergeSites_preserved", log_file, wait_for=[job_id_filter_regions])

    progress("Step 7.1 - Call the consensus SNPs for each sample")
    log_file = os.path.join(log_dir, "callConsensus.log")
    list_file = os.path.join(work_dir, "snplist.txt")
    output_file = "{1}/consensus.fasta"
    extra_params = os.environ.get("CallConsensus_ExtraParams", "")
    command_line = "cfsan_snp_pipeline call_consensus" + force_flag + "-l " + list_file + " -o " + output_file + " --vcfRefName " + reference_file_name + ' ' + extra_params + " --vcfFileName consensus.vcf {1}/reads.all.pileup"
    job_id_call_consensus = runner.run_array(command_line, "callConsensus", log_file, sample_dirs_file, max_processes=max_cpu_cores, wait_for=[job_id_merge_sites])

    progress("Step 7.2 - Call the consensus SNPs for each sample")
    log_file = os.path.join(log_dir, "callConsensus_preserved.log")
    list_file = os.path.join(work_dir, "snplist_preserved.txt")
    output_file = "{1}/consensus_preserved.fasta"
    extra_params = os.environ.get("CallConsensus_ExtraParams", "")
    command_line = "cfsan_snp_pipeline call_consensus" + force_flag + "-l " + list_file + " -o " + output_file + " -e {1}/var.flt_removed.vcf --vcfRefName " + reference_file_name + ' ' + extra_params + " --vcfFileName consensus_preserved.vcf {1}/reads.all.pileup"
    job_id_call_consensus2 = runner.run_array(command_line, "callConsensus_preserved", log_file, sample_dirs_file, max_processes=max_cpu_cores, wait_for=[job_id_merge_sites2])

    progress("Step 8.1 - Create the SNP matrix")
    log_file = os.path.join(log_dir, "snpMatrix.log")
    output_file = os.path.join(work_dir, "snpma.fasta")
    extra_params = os.environ.get("SnpMatrix_ExtraParams", "")
    command_line = "cfsan_snp_pipeline snp_matrix" + force_flag + "-c consensus.fasta -o " + output_file + ' ' + extra_params + ' ' + filtered_sample_dirs_file
    job_id_snp_matrix = runner.run(command_line, "snpMatrix", log_file, wait_for_array=[job_id_call_consensus])

    progress("Step 8.2 - Create the SNP matrix")
    log_file = os.path.join(log_dir, "snpMatrix_preserved.log")
    output_file = os.path.join(work_dir, "snpma_preserved.fasta")
    extra_params = os.environ.get("SnpMatrix_ExtraParams", "")
    command_line = "cfsan_snp_pipeline snp_matrix" + force_flag + "-c consensus_preserved.fasta -o " + output_file + ' ' + extra_params + ' ' + filtered_sample_dirs_file2
    job_id_snp_matrix2 = runner.run(command_line, "snpMatrix_preserved", log_file, wait_for_array=[job_id_call_consensus2])

    progress("Step 9.1 - Create the reference sequence at SNP sites")
    log_file = os.path.join(log_dir, "snpReference.log")
    list_file = os.path.join(work_dir, "snplist.txt")
    output_file = os.path.join(work_dir, "referenceSNP.fasta")
    extra_params = os.environ.get("SnpReference_ExtraParams", "")
    command_line = "cfsan_snp_pipeline snp_reference" + force_flag + "-l " + list_file + " -o " + output_file + ' ' + extra_params + ' ' + reference_file_path
    job_id_snp_reference = runner.run(command_line, "snpReference", log_file, wait_for_array=[job_id_call_consensus])

    progress("Step 9.2 - Create the reference sequence at SNP sites")
    log_file = os.path.join(log_dir, "snpReference_preserved.log")
    list_file = os.path.join(work_dir, "snplist_preserved.txt")
    output_file = os.path.join(work_dir, "referenceSNP_preserved.fasta")
    extra_params = os.environ.get("SnpReference_ExtraParams", "")
    command_line = "cfsan_snp_pipeline snp_reference" + force_flag + "-l " + list_file + " -o " + output_file + ' ' + extra_params + ' ' + reference_file_path
    job_id_snp_reference2 = runner.run(command_line, "snpReference_preserved", log_file, wait_for_array=[job_id_call_consensus2])

    progress("Step 10.1 - Merge sample VCFs to create the multi-VCF file")
    if "--vcfFileName" in os.environ.get("CallConsensus_ExtraParams", ""):
        log_file = os.path.join(log_dir, "mergeVcfs.log")
        output_file = os.path.join(work_dir, "snpma.vcf")
        extra_params = os.environ.get("MergeVcfs_ExtraParams", "")
        command_line = "cfsan_snp_pipeline merge_vcfs" + force_flag + "-o " + output_file + ' ' + extra_params + ' ' + filtered_sample_dirs_file
        job_id_merge_vcfs = runner.run(command_line, "mergeVcfs", log_file, wait_for_array=[job_id_call_consensus])
    else:
        print("Skipped per CallConsensus_ExtraParams configuration")

    progress("Step 10.2 - Merge sample VCFs to create the multi-VCF file")
    if "--vcfFileName" in os.environ.get("CallConsensus_ExtraParams", ""):
        log_file = os.path.join(log_dir, "mergeVcfs_preserved.log")
        output_file = os.path.join(work_dir, "snpma_preserved.vcf")
        extra_params = os.environ.get("MergeVcfs_ExtraParams", "")
        command_line = "cfsan_snp_pipeline merge_vcfs" + force_flag + "-n consensus_preserved.vcf -o " + output_file + ' ' + extra_params + ' ' + filtered_sample_dirs_file2
        job_id_merge_vcfs2 = runner.run(command_line, "mergeVcfs_preserved", log_file, wait_for_array=[job_id_call_consensus2])
    else:
        print("Skipped per CallConsensus_ExtraParams configuration")

    progress("Step 11.1 - Calculate SNP distance matrix")
    log_file = os.path.join(log_dir, "distance.log")
    input_file = os.path.join(work_dir, "snpma.fasta")
    pair_output_file = os.path.join(work_dir, "snp_distance_pairwise.tsv")
    matrix_output_file = os.path.join(work_dir, "snp_distance_matrix.tsv")
    command_line = "cfsan_snp_pipeline distance" + force_flag + "-p " + pair_output_file + " -m " + matrix_output_file + ' ' + input_file
    job_id_distance = runner.run(command_line, "distance", log_file, wait_for=[job_id_snp_matrix])

    progress("Step 11.2 - Calculate SNP distance matrix")
    log_file = os.path.join(log_dir, "distance_preserved.log")
    input_file = os.path.join(work_dir, "snpma_preserved.fasta")
    pair_output_file = os.path.join(work_dir, "snp_distance_pairwise_preserved.tsv")
    matrix_output_file = os.path.join(work_dir, "snp_distance_matrix_preserved.tsv")
    command_line = "cfsan_snp_pipeline distance" + force_flag + "-p " + pair_output_file + " -m " + matrix_output_file + ' ' + input_file
    job_id_distance2 = runner.run(command_line, "distance_preserved", log_file, wait_for=[job_id_snp_matrix2])

    progress("Step 12 - Collect metrics for each sample")
    log_file = os.path.join(log_dir, "collectMetrics.log")
    output_file = "{1}/metrics"
    extra_params = os.environ.get("CollectMetrics_ExtraParams", "")
    command_line = "cfsan_snp_pipeline collect_metrics" + force_flag + "-o " + output_file + ' ' + extra_params + " {1} " + reference_file_path
    job_id_collect_metrics = runner.run_array(command_line, "collectMetrics", log_file, sample_dirs_file, max_processes=max_cpu_cores, wait_for_array=[job_id_call_consensus, job_id_call_consensus2], slot_dependency=True)

    progress("Step 13 - Combine the metrics across all samples into the metrics table")
    log_file = os.path.join(log_dir, "combineMetrics.log")
    output_file = os.path.join(work_dir, "metrics.tsv")
    extra_params = os.environ.get("CombineMetrics_ExtraParams", "")
    command_line = "cfsan_snp_pipeline combine_metrics" + force_flag + "-n metrics -o " + output_file + ' ' + extra_params + ' ' + sample_dirs_file
    combine_metrics_job_id = runner.run(command_line, "combineMetrics", log_file, wait_for_array=[job_id_collect_metrics])

    # Step 14 - Notify user of any non-fatal errors accumulated during processing
    if os.path.isfile(error_output_file) and os.path.getsize(error_output_file) > 0 and not stop_on_error:
        print("\nThere were errors processing some samples.\nSee the log file %s for a summary of errors." % error_output_file, file=sys.stderr)

    # Exit here to prevent showing the "cfsan_snp_pipeline run finished" message.  The jobs are queued, not finished yet.
    if job_queue_mgr is not None:  # HPC
        sys.exit(0)
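
# A hedged command-line sketch of driving the run() function above through the
# installed console script. The exact flags (-o for the output directory, -s for
# the samples parent directory, -m for the mirror mode) are assumptions based on
# the argparse attributes referenced above (workDir, samplesDir, mirror), not a
# verified copy of the CLI help text:
#
#     cfsan_snp_pipeline run -m soft -o outputDirectory -s samples reference/referenceFile.fasta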
def run(args): """Run all the steps of the snp pipeline in th correct order. Parameters ---------- args : Namespace referenceFile : str Relative or absolute path to the reference fasta file forceFlag : bool Force processing even when result files already exist and are newer than inputs mirror : str Mode to create a mirror copy of the reference directory and all the sample directories. Possible values: {soft, hard, copy} configFile : str Relative or absolute path to a configuration file for overriding defaults and defining extra parameters for the tools and scripts within the pipeline. jobQueueMgr : str Job queue manager for remote parallel job execution in an HPC environment. Currently "torque" and "grid" are supported. If not specified, the pipeline will execute locally. workDir : str Output directory for the result files. samplesDir : str Relative or absolute path to the parent directory of all the sample directories. samplesFile : str Relative or absolute path to a file listing all of the sample directories. purge : bool Purge the intermediate output files when the pipeline completes successfully. """ global log_dir global job_queue_mgr start_time = time.time() # Where are we running: grid, torque, or None (local) job_queue_mgr = args.jobQueueMgr # Erase any left-over error log environment variable from a previous run os.environ.pop("errorOutputFile", None) # the 2nd arg avoids an exception when not in dict # Handle output working directory. Create the directory if it does not exist. # Any errors creating the work_dir will not be logged to the error log because # the error log belongs in the work_dir. work_dir = args.workDir try: utils.mkdir_p(work_dir) except OSError as exc: utils.fatal_error("Error: could not create the output directory %s" % work_dir) if not utils.is_directory_writeable(work_dir): utils.fatal_error("Error: output directory % is not writable." % work_dir) # The error log is in the main workdir error_output_file = os.path.join(work_dir, "error.log") os.environ["errorOutputFile"] = error_output_file # TODO: copy old error log to old logs directory, because otherwise it will be removed and lost forever if os.path.isfile(error_output_file): os.remove(error_output_file) # Validate reference fasta file reference_file_path = args.referenceFile if not os.path.isfile(reference_file_path): utils.fatal_error("Error: reference file %s does not exist." % reference_file_path) if os.path.getsize(reference_file_path) == 0: utils.fatal_error("Error: reference file %s is empty." % reference_file_path) reference_file_name = os.path.basename(reference_file_path) # Force rebuild flag is passed to all the subtask commands below force_flag = " -f " if args.forceFlag else " " # Create the logs directory with name like "logs-20170215.144253" run_time_stamp = time.strftime('%Y%m%d.%H%M%S', time.localtime()) log_dir = os.path.join(work_dir, "logs-" + run_time_stamp) try: utils.mkdir_p(log_dir) except OSError as exc: utils.fatal_error("Error: could not create the logs directory %s" % log_dir) if not utils.is_directory_writeable(work_dir): utils.fatal_error("Error: logs directory % is not writable." % log_dir) # Handle configuration file, use the specified file, or create a default file if args.configFile: config_file_path = args.configFile if not os.path.isfile(config_file_path): utils.fatal_error("Error: configuration file %s does not exist." % config_file_path) if os.path.getsize(config_file_path) == 0: utils.fatal_error("Error: configuration file %s is empty." 
% config_file_path) shutil.copy2(config_file_path, log_dir) # copy2 tries to preserve timestamps config_params = utils.read_properties(config_file_path, recognize_vars=True) validate_properties(config_params) else: command.run("cfsan_snp_pipeline data configurationFile " + log_dir, outfile=sys.stdout) config_file_path = os.path.join(log_dir, "snppipeline.conf") config_params = utils.read_properties(config_file_path, recognize_vars=True) # Validate the configured aligner choice snp_pipeline_aligner = config_params.get("SnpPipeline_Aligner", "").lower() or "bowtie2" if snp_pipeline_aligner not in ["bowtie2", "smalt"]: utils.fatal_error( "Config file error in SnpPipeline_Aligner parameter: only bowtie2 and smalt aligners are supported." ) os.environ["SnpPipeline_Aligner"] = snp_pipeline_aligner # Stop the pipeline by default upon single sample errors if not configured either way # The environment variable is used by called processes stop_on_error = config_params.get("StopOnSampleError", "").lower() or "true" os.environ["StopOnSampleError"] = stop_on_error # Convert the stop_on_error flag to boolean for internal use in this function stop_on_error = stop_on_error == "true" # How many CPU cores can we use? max_cpu_cores = config_params.get("MaxCpuCores", None) if max_cpu_cores == "": max_cpu_cores = None if max_cpu_cores: try: max_cpu_cores = int(max_cpu_cores) if max_cpu_cores < 1: utils.fatal_error( "Config file error in MaxCpuCores parameter: %s is less than one." % max_cpu_cores) except ValueError: utils.fatal_error( "Config file error in MaxCpuCores parameter: %s is not a valid number." % max_cpu_cores) if job_queue_mgr is None: # workstation num_local_cpu_cores = psutil.cpu_count() max_cpu_cores = min( num_local_cpu_cores, max_cpu_cores) if max_cpu_cores else num_local_cpu_cores # How many CPU cores per process? if job_queue_mgr is None: # workstation cpu_cores_per_process = config_params.get( "CpuCoresPerProcessOnWorkstation", None) if cpu_cores_per_process: try: cpu_cores_per_process = int(cpu_cores_per_process) if cpu_cores_per_process < 1: utils.fatal_error( "Config file error in CpuCoresPerProcessOnWorkstation parameter: %s is less than one." % cpu_cores_per_process) except ValueError: utils.fatal_error( "Config file error in CpuCoresPerProcessOnWorkstation parameter: %s is not a valid number." % cpu_cores_per_process) else: cpu_cores_per_process = min(num_local_cpu_cores, max_cpu_cores) else: # HPC cpu_cores_per_process = config_params.get("CpuCoresPerProcessOnHPC", None) if not cpu_cores_per_process: utils.fatal_error( "Config file error. CpuCoresPerProcessOnHPC parameter must be set to a value." ) else: try: cpu_cores_per_process = int(cpu_cores_per_process) if cpu_cores_per_process < 1: utils.fatal_error( "Config file error in CpuCoresPerProcessOnHPC parameter: %s is less than one." % cpu_cores_per_process) except ValueError: utils.fatal_error( "Config file error in CpuCoresPerProcessOnHPC parameter: %s is not a valid number." 
% cpu_cores_per_process) # Put the configuration parameters into the process environment variables os.environ["Bowtie2Build_ExtraParams"] = config_params.get( "Bowtie2Build_ExtraParams", "") os.environ["SmaltIndex_ExtraParams"] = config_params.get( "SmaltIndex_ExtraParams", "") os.environ["CreateSequenceDictionary_ExtraParams"] = config_params.get( "CreateSequenceDictionary_ExtraParams", "") os.environ["SamtoolsFaidx_ExtraParams"] = config_params.get( "SamtoolsFaidx_ExtraParams", "") os.environ["Bowtie2Align_ExtraParams"] = config_params.get( "Bowtie2Align_ExtraParams", "") os.environ["SmaltAlign_ExtraParams"] = config_params.get( "SmaltAlign_ExtraParams", "") os.environ["SamtoolsSamFilter_ExtraParams"] = config_params.get( "SamtoolsSamFilter_ExtraParams", "") os.environ["SamtoolsSort_ExtraParams"] = config_params.get( "SamtoolsSort_ExtraParams", "") os.environ["SamtoolsIndex_ExtraParams"] = config_params.get( "SamtoolsIndex_ExtraParams", "") os.environ["RemoveDuplicateReads"] = config_params.get( "RemoveDuplicateReads", "").lower() or "true" os.environ["PicardJvm_ExtraParams"] = config_params.get( "PicardJvm_ExtraParams", "") os.environ["PicardMarkDuplicates_ExtraParams"] = config_params.get( "PicardMarkDuplicates_ExtraParams", "") os.environ["EnableLocalRealignment"] = config_params.get( "EnableLocalRealignment", "").lower() or "true" os.environ["GatkJvm_ExtraParams"] = config_params.get( "GatkJvm_ExtraParams", "") os.environ["RealignerTargetCreator_ExtraParams"] = config_params.get( "RealignerTargetCreator_ExtraParams", "") os.environ["IndelRealigner_ExtraParams"] = config_params.get( "IndelRealigner_ExtraParams", "") os.environ["SamtoolsMpileup_ExtraParams"] = config_params.get( "SamtoolsMpileup_ExtraParams", "") os.environ["VarscanMpileup2snp_ExtraParams"] = config_params.get( "VarscanMpileup2snp_ExtraParams", "") os.environ["VarscanJvm_ExtraParams"] = config_params.get( "VarscanJvm_ExtraParams", "") os.environ["FilterRegions_ExtraParams"] = config_params.get( "FilterRegions_ExtraParams", "") os.environ["MergeSites_ExtraParams"] = config_params.get( "MergeSites_ExtraParams", "") os.environ["CallConsensus_ExtraParams"] = config_params.get( "CallConsensus_ExtraParams", "") os.environ["SnpMatrix_ExtraParams"] = config_params.get( "SnpMatrix_ExtraParams", "") os.environ["BcftoolsMerge_ExtraParams"] = config_params.get( "BcftoolsMerge_ExtraParams", "") os.environ["SnpReference_ExtraParams"] = config_params.get( "SnpReference_ExtraParams", "") os.environ["MergeVcfs_ExtraParams"] = config_params.get( "MergeVcfs_ExtraParams", "") os.environ["CollectMetrics_ExtraParams"] = config_params.get( "CollectMetrics_ExtraParams", "") os.environ["CombineMetrics_ExtraParams"] = config_params.get( "CombineMetrics_ExtraParams", "") # Verify the dependencies are available on the path print("Checking dependencies...") dependencies = [ "cfsan_snp_pipeline", snp_pipeline_aligner, "java", "tabix", "bgzip", "bcftools" ] found_all_dependencies = True for executable in dependencies: if not utils.which(executable): utils.report_error(executable + " is not on the path") found_all_dependencies = False if not utils.which("samtools"): utils.report_error("samtools is not on the path") found_all_dependencies = False else: version_str = utils.extract_version_str("SAMtools", "samtools 2>&1 > /dev/null") samtools_version = version_str.split()[-1] # just the number if samtools_version < "1.4": utils.report_error( "The installed %s is not supported. Version 1.4 or higher is required." 
% version_str) found_all_dependencies = False jar_file_path = utils.find_path_in_path_list("VarScan", "CLASSPATH") if jar_file_path: stdout = command.run("java -jar " + jar_file_path + " 2>&1") if not jar_file_path or "error" in stdout.lower(): utils.report_error( "CLASSPATH is not configured with the path to VarScan.jar") found_all_dependencies = False picard_required = os.environ[ "RemoveDuplicateReads"] == "true" or os.environ[ "EnableLocalRealignment"] == "true" if picard_required: jar_file_path = utils.find_path_in_path_list("picard", "CLASSPATH") if not jar_file_path: utils.report_error( "CLASSPATH is not configured with the path to picard.jar") found_all_dependencies = False else: stdout = command.run("java -jar " + jar_file_path + " 2>&1") if stdout.lower().startswith("error"): utils.report_error(stdout) found_all_dependencies = False gatk_required = os.environ["EnableLocalRealignment"] == "true" if gatk_required: jar_file_path = utils.find_path_in_path_list("GenomeAnalysisTK", "CLASSPATH") if not jar_file_path: utils.report_error( "CLASSPATH is not configured with the path to GenomeAnalysisTK.jar" ) found_all_dependencies = False else: stdout = command.run("java -jar " + jar_file_path + " --version 2>&1") if stdout.lower().startswith("error"): utils.report_error(stdout) found_all_dependencies = False else: stdout = command.run("java -jar " + jar_file_path + " -T IndelRealigner --version 2>&1") if "not a valid command" in stdout.lower( ) or "indelrealigner is no longer included" in stdout.lower(): utils.report_error( "The installed GATK version does not support indel realignment. Try installing an older release prior to GATK v4." ) found_all_dependencies = False elif "user error has occurred" in stdout.lower(): utils.report_error(stdout) found_all_dependencies = False if not found_all_dependencies: utils.fatal_error( "Check the SNP Pipeline installation instructions here: http://snp-pipeline.readthedocs.org/en/latest/installation.html" ) else: print("OK") # Process the sample directory command line option # TODO: detect broken fastq symlinks if args.samplesDir: samples_parent_dir = args.samplesDir.rstrip( '/') # strip trailing slash if not utils.verify_non_empty_directory("Samples directory", samples_parent_dir): sys.exit(1) # verify at least one of the subdirectories contains fastq files. dir_sizes = get_sorted_sample_dirs_fastq_sizes(samples_parent_dir) dir_sizes = [(size, path) for size, path in dir_sizes if size > 0] if len(dir_sizes) == 0: utils.fatal_error( "Samples directory %s does not contain subdirectories with fastq files." % samples_parent_dir) sample_dirs_file = os.path.join(work_dir, "sampleDirectories.txt") persist_sorted_sample_dirs_file(samples_parent_dir, sample_dirs_file) # Process the file of sample directories command line option # TODO: detect broken fastq symlinks if args.samplesFile: sample_dirs_file = args.samplesFile if not os.path.isfile(sample_dirs_file): utils.fatal_error( "Error: the file of samples directories, %s, does not exist." % sample_dirs_file) if os.path.getsize(sample_dirs_file) == 0: utils.fatal_error( "Error: the file of samples directories, %s, is empty." 
% sample_dirs_file) rewrite_cleansed_file_of_sample_dirs( sample_dirs_file, os.path.join(work_dir, "sampleDirectories.txt")) sample_dirs_file = os.path.join(work_dir, "sampleDirectories.txt") validate_file_of_sample_dirs(sample_dirs_file) with open(sample_dirs_file) as f: sample_dirs_list = f.read().splitlines() sample_count = len(sample_dirs_list) # -------------------------------------------------------- if job_queue_mgr is None: progress("Step 1 - Prep work") else: print("Step 1 - Prep work") # -------------------------------------------------------- # Mirror the input reference and samples if requested # TODO: make this a pure python solution if args.mirror: if args.mirror == "soft": # soft link, subsequent freshness checks use the timestamp of original file, not the soft link mirror_flag = " -s " elif args.mirror == "hard": # hard link, automatically preserves attributes of the original file mirror_flag = " -l " else: # regular copy, -p explicitly preserves attributes of the original file mirror_flag = " -p " # flush stdout to keep the unbuffered stderr in chronological order with stdout sys.stdout.flush() # Mirror/link the reference work_reference_dir = os.path.join(work_dir, "reference") utils.mkdir_p(work_reference_dir) src_reference_file = os.path.abspath(reference_file_path) cmd = "cp -v -u -f" + mirror_flag + src_reference_file + ' ' + work_reference_dir subprocess.check_call(cmd, shell=True) # since we mirrored the reference, we need to update our reference location reference_file_path = os.path.join(work_reference_dir, reference_file_name) # Mirror/link the samples work_samples_parent_dir = os.path.join(work_dir, "samples") for directory in sample_dirs_list: basedir = os.path.basename(directory) work_sample_dir = os.path.join(work_samples_parent_dir, basedir) utils.mkdir_p(work_sample_dir) src_sample_dir = os.path.abspath(directory) # copy without stderr message and without exit error code because the fastq or fq files might not exist cmd = "cp -r -v -u -f" + mirror_flag + src_sample_dir + "/*.fastq* " + work_sample_dir + " 2> /dev/null || true" subprocess.check_call(cmd, shell=True) cmd = "cp -r -v -u -f" + mirror_flag + src_sample_dir + "/*.fq* " + work_sample_dir + " 2> /dev/null || true" subprocess.check_call(cmd, shell=True) # since we mirrored the samples, we need to update our sorted list of samples sample_dirs_file = os.path.join(work_dir, "sampleDirectories.txt") persist_sorted_sample_dirs_file(work_samples_parent_dir, sample_dirs_file) # refresh the list of sample dirs -- now in sorted order with open(sample_dirs_file) as f: sample_dirs_list = f.read().splitlines() # get the *.fastq or *.fq files in each sample directory, possibly compresessed, on one line per sample, ready to feed to bowtie sample_full_path_names_file = os.path.join(work_dir, "sampleFullPathNames.txt") with open(sample_full_path_names_file, 'w') as f: for directory in sample_dirs_list: file_list = fastq.list_fastq_files(directory) print(' '.join(file_list), file=f) # Initialize the job runner if job_queue_mgr is None: runner = JobRunner("local", exception_handler=handle_exception, verbose=args.verbose >= 4) elif job_queue_mgr == "grid": strip_job_array_suffix = config_params.get( "GridEngine_StripJobArraySuffix", "true").lower() qsub_extra_params = config_params.get("GridEngine_QsubExtraParams") runner = JobRunner(job_queue_mgr, strip_job_array_suffix == "true", qsub_extra_params=qsub_extra_params, verbose=args.verbose >= 4) else: strip_job_array_suffix = config_params.get( 
"Torque_StripJobArraySuffix", "false").lower() qsub_extra_params = config_params.get("Torque_QsubExtraParams") runner = JobRunner(job_queue_mgr, strip_job_array_suffix == "true", qsub_extra_params=qsub_extra_params, verbose=args.verbose >= 4) progress("Step 2 - Index the reference") log_file = os.path.join(log_dir, "indexRef.log") command_line = "cfsan_snp_pipeline index_ref" + force_flag + reference_file_path job_id_index_ref = runner.run(command_line, "indexRef", log_file) progress("Step 3 - Map the sample reads to the reference") # Parse the user-specified aligner parameters to find the number of CPU cores requested, for example, "-p 16" or "-n 16" # Set the default number of CPU cores if the user did not configure a value. if snp_pipeline_aligner == "smalt": extra_params_env_var = "SmaltAlign_ExtraParams" threads_option = "-n" else: extra_params_env_var = "Bowtie2Align_ExtraParams" threads_option = "-p" aligner_max_processes, aligner_threads_per_process = utils.configure_process_threads( extra_params_env_var, threads_option, cpu_cores_per_process, max_cpu_cores) samfilter_max_processes, samfilter_threads_per_process = utils.configure_process_threads( "SamtoolsSamFilter_ExtraParams", ["-@", "--threads"], cpu_cores_per_process, max_cpu_cores) samsort_max_processes, samsort_threads_per_process = utils.configure_process_threads( "SamtoolsSort_ExtraParams", ["-@", "--threads"], cpu_cores_per_process, max_cpu_cores) samindex_max_processes, samindex_threads_per_process = utils.configure_process_threads( "SamtoolsIndex_ExtraParams", ["-@"], cpu_cores_per_process, max_cpu_cores) realigner_max_processes, realigner_threads_per_process = utils.configure_process_threads( "RealignerTargetCreator_ExtraParams", ["-nt", "--num_threads"], cpu_cores_per_process, max_cpu_cores) # There are multiple processes within map_reads, each with multiple threads. # The CPU allocation must be enough for the process needing the largest number of threads. 
max_processes_list = [ aligner_max_processes, samfilter_max_processes, samsort_max_processes, samindex_max_processes, realigner_max_processes ] if all([i is None for i in max_processes_list]): max_processes = None else: max_processes = min([i for i in max_processes_list if i is not None]) threads_per_process = max(aligner_threads_per_process, samfilter_threads_per_process, samsort_threads_per_process, samindex_threads_per_process, realigner_threads_per_process) parallel_environment = config_params.get("GridEngine_PEname", None) log_file = os.path.join(log_dir, "mapReads.log") command_line = "cfsan_snp_pipeline map_reads --threads " + str( threads_per_process) + force_flag + reference_file_path + " {1} {2}" job_id_map_reads = runner.run_array( command_line, "mapReads", log_file, sample_full_path_names_file, max_processes=max_processes, wait_for=[job_id_index_ref], threads=threads_per_process, parallel_environment=parallel_environment) progress("Step 4 - Find sites with SNPs in each sample") if job_queue_mgr in ["grid", "torque"]: time.sleep( 1.0 + float(sample_count) / 150 ) # workaround torque bug when submitting two large consecutive array jobs, potential bug for grid log_file = os.path.join(log_dir, "callSites.log") command_line = "cfsan_snp_pipeline call_sites" + force_flag + reference_file_path + " {1}" job_id_call_sites = runner.run_array(command_line, "callSites", log_file, sample_dirs_file, max_processes=max_cpu_cores, wait_for_array=[job_id_map_reads], slot_dependency=True) progress("Step 5 - Filter abnormal SNP regions") log_file = os.path.join(log_dir, "filterRegions.log") extra_params = os.environ.get("FilterRegions_ExtraParams", "") command_line = "cfsan_snp_pipeline filter_regions" + force_flag + "-n var.flt.vcf " + sample_dirs_file + ' ' + reference_file_path + ' ' + extra_params job_id_filter_regions = runner.run(command_line, "filterRegions", log_file, wait_for_array=[job_id_call_sites]) # Starting from here, there are 2 threads: # Thread X.1: the thread processing the original VCF files and corresponding downstream results # Thread X.2: the thread processing the preserved VCF files and corresponding downstream results progress( "Step 6.1 - Merge the SNP sites across all samples into the SNP list file" ) # The mergeSites process creates the filtered list of sample directories. It is the list of samples not having excessive snps. # When running on a workstation, the file exists at this point during the script execution, but on grid or torque, it has not yet been created. However, # we know the path to the file regardless of whether it exists yet. filtered_sample_dirs_file = sample_dirs_file + ".OrigVCF.filtered" # touch $filtered_sample_dirs_file # TODO: why was this touch here in the old run_snp_pipeline.sh script? log_file = os.path.join(log_dir, "mergeSites.log") output_file = os.path.join(work_dir, "snplist.txt") extra_params = os.environ.get("MergeSites_ExtraParams", "") command_line = "cfsan_snp_pipeline merge_sites" + force_flag + "-n var.flt.vcf -o " + output_file + ' ' + extra_params + ' ' + sample_dirs_file + ' ' + filtered_sample_dirs_file job_id_merge_sites = runner.run(command_line, "mergeSites", log_file, wait_for=[job_id_filter_regions]) progress( "Step 6.2 - Merge the SNP sites across all samples into the SNP list file" ) # Create another copy of sample directories file, for the thread processing preserved snp files. 
filtered_sample_dirs_file2 = sample_dirs_file + ".PresVCF.filtered" # touch $filtered_sample_dirs_file2 # TODO: why was this touch here in the old run_snp_pipeline.sh script? log_file = os.path.join(log_dir, "mergeSites_preserved.log") output_file = os.path.join(work_dir, "snplist_preserved.txt") extra_params = os.environ.get("MergeSites_ExtraParams", "") command_line = "cfsan_snp_pipeline merge_sites" + force_flag + "-n var.flt_preserved.vcf -o " + output_file + ' ' + extra_params + ' ' + sample_dirs_file + ' ' + filtered_sample_dirs_file2 job_id_merge_sites2 = runner.run(command_line, "mergeSites_preserved", log_file, wait_for=[job_id_filter_regions]) progress("Step 7.1 - Call the consensus SNPs for each sample") log_file = os.path.join(log_dir, "callConsensus.log") list_file = os.path.join(work_dir, "snplist.txt") output_file = "{1}/consensus.fasta" extra_params = os.environ.get("CallConsensus_ExtraParams", "") command_line = "cfsan_snp_pipeline call_consensus" + force_flag + "-l " + list_file + " -o " + output_file + " --vcfRefName " + reference_file_name + ' ' + extra_params + " --vcfFileName consensus.vcf {1}/reads.all.pileup" job_id_call_consensus = runner.run_array(command_line, "callConsensus", log_file, sample_dirs_file, max_processes=max_cpu_cores, wait_for=[job_id_merge_sites]) progress("Step 7.2 - Call the consensus SNPs for each sample") log_file = os.path.join(log_dir, "callConsensus_preserved.log") list_file = os.path.join(work_dir, "snplist_preserved.txt") output_file = "{1}/consensus_preserved.fasta" extra_params = os.environ.get("CallConsensus_ExtraParams", "") command_line = "cfsan_snp_pipeline call_consensus" + force_flag + "-l " + list_file + " -o " + output_file + " -e {1}/var.flt_removed.vcf --vcfRefName " + reference_file_name + ' ' + extra_params + " --vcfFileName consensus_preserved.vcf {1}/reads.all.pileup" job_id_call_consensus2 = runner.run_array(command_line, "callConsensus_preserved", log_file, sample_dirs_file, max_processes=max_cpu_cores, wait_for=[job_id_merge_sites2]) progress("Step 8.1 - Create the SNP matrix") log_file = os.path.join(log_dir, "snpMatrix.log") output_file = os.path.join(work_dir, "snpma.fasta") extra_params = os.environ.get("SnpMatrix_ExtraParams", "") command_line = "cfsan_snp_pipeline snp_matrix" + force_flag + "-c consensus.fasta -o " + output_file + ' ' + extra_params + ' ' + filtered_sample_dirs_file job_id_snp_matrix = runner.run(command_line, "snpMatrix", log_file, wait_for_array=[job_id_call_consensus]) progress("Step 8.2 - Create the SNP matrix") log_file = os.path.join(log_dir, "snpMatrix_preserved.log") output_file = os.path.join(work_dir, "snpma_preserved.fasta") extra_params = os.environ.get("SnpMatrix_ExtraParams", "") command_line = "cfsan_snp_pipeline snp_matrix" + force_flag + "-c consensus_preserved.fasta -o " + output_file + ' ' + extra_params + ' ' + filtered_sample_dirs_file2 job_id_snp_matrix2 = runner.run(command_line, "snpMatrix_preserved", log_file, wait_for_array=[job_id_call_consensus2]) progress("Step 9.1 - Create the reference sequence at SNP sites") log_file = os.path.join(log_dir, "snpReference.log") list_file = os.path.join(work_dir, "snplist.txt") output_file = os.path.join(work_dir, "referenceSNP.fasta") extra_params = os.environ.get("SnpReference_ExtraParams", "") command_line = "cfsan_snp_pipeline snp_reference" + force_flag + "-l " + list_file + " -o " + output_file + ' ' + extra_params + ' ' + reference_file_path job_id_snp_reference = runner.run(command_line, "snpReference", log_file, 
wait_for_array=[job_id_call_consensus]) progress("Step 9.2 - Create the reference sequence at SNP sites") log_file = os.path.join(log_dir, "snpReference_preserved.log") list_file = os.path.join(work_dir, "snplist_preserved.txt") output_file = os.path.join(work_dir, "referenceSNP_preserved.fasta") extra_params = os.environ.get("SnpReference_ExtraParams", "") command_line = "cfsan_snp_pipeline snp_reference" + force_flag + "-l " + list_file + " -o " + output_file + ' ' + extra_params + ' ' + reference_file_path job_id_snp_reference2 = runner.run(command_line, "snpReference_preserved", log_file, wait_for_array=[job_id_call_consensus2]) progress("Step 10.1 - Merge sample VCFs to create the multi-VCF file") if "--vcfFileName" in os.environ.get("CallConsensus_ExtraParams", ""): log_file = os.path.join(log_dir, "mergeVcfs.log") output_file = os.path.join(work_dir, "snpma.vcf") extra_params = os.environ.get("MergeVcfs_ExtraParams", "") command_line = "cfsan_snp_pipeline merge_vcfs" + force_flag + "-o " + output_file + ' ' + extra_params + ' ' + filtered_sample_dirs_file job_id_merge_vcfs = runner.run(command_line, "mergeVcfs", log_file, wait_for_array=[job_id_call_consensus]) else: print("Skipped per CallConsensus_ExtraParams configuration") progress("Step 10.2 - Merge sample VCFs to create the multi-VCF file") if "--vcfFileName" in os.environ.get("CallConsensus_ExtraParams", ""): log_file = os.path.join(log_dir, "mergeVcfs_preserved.log") output_file = os.path.join(work_dir, "snpma_preserved.vcf") extra_params = os.environ.get("MergeVcfs_ExtraParams", "") command_line = "cfsan_snp_pipeline merge_vcfs" + force_flag + "-n consensus_preserved.vcf -o " + output_file + ' ' + extra_params + ' ' + filtered_sample_dirs_file2 job_id_merge_vcfs2 = runner.run( command_line, "mergeVcfs_preserved", log_file, wait_for_array=[job_id_call_consensus2]) else: print("Skipped per CallConsensus_ExtraParams configuration") progress("Step 11.1 - Calculate SNP distance matrix") log_file = os.path.join(log_dir, "distance.log") input_file = os.path.join(work_dir, "snpma.fasta") pair_output_file = os.path.join(work_dir, "snp_distance_pairwise.tsv") matrix_output_file = os.path.join(work_dir, "snp_distance_matrix.tsv") command_line = "cfsan_snp_pipeline distance" + force_flag + "-p " + pair_output_file + " -m " + matrix_output_file + ' ' + input_file job_id_distance = runner.run(command_line, "distance", log_file, wait_for=[job_id_snp_matrix]) progress("Step 11.2 - Calculate SNP distance matrix") log_file = os.path.join(log_dir, "distance_preserved.log") input_file = os.path.join(work_dir, "snpma_preserved.fasta") pair_output_file = os.path.join(work_dir, "snp_distance_pairwise_preserved.tsv") matrix_output_file = os.path.join(work_dir, "snp_distance_matrix_preserved.tsv") command_line = "cfsan_snp_pipeline distance" + force_flag + "-p " + pair_output_file + " -m " + matrix_output_file + ' ' + input_file job_id_distance2 = runner.run(command_line, "distance_preserved", log_file, wait_for=[job_id_snp_matrix2]) progress("Step 12 - Collect metrics for each sample") log_file = os.path.join(log_dir, "collectMetrics.log") output_file = "{1}/metrics" extra_params = os.environ.get("CollectMetrics_ExtraParams", "") command_line = "cfsan_snp_pipeline collect_metrics" + force_flag + "-o " + output_file + ' ' + extra_params + " {1} " + reference_file_path job_id_collect_metrics = runner.run_array( command_line, "collectMetrics", log_file, sample_dirs_file, max_processes=max_cpu_cores, wait_for_array=[job_id_call_consensus, 
job_id_call_consensus2], slot_dependency=True) progress( "Step 13 - Combine the metrics across all samples into the metrics table" ) log_file = os.path.join(log_dir, "combineMetrics.log") output_file = os.path.join(work_dir, "metrics.tsv") extra_params = os.environ.get("CombineMetrics_ExtraParams", "") command_line = "cfsan_snp_pipeline combine_metrics" + force_flag + "-n metrics -o " + output_file + ' ' + extra_params + ' ' + sample_dirs_file combine_metrics_job_id = runner.run( command_line, "combineMetrics", log_file, wait_for_array=[job_id_collect_metrics]) # Decide whether to purge the intermediate output files upon successful completion. # Case 1: we are running on the HPC. We always need to submit the purge task. It will decide to do nothing if there were errors. if job_queue_mgr is not None: # HPC need_purge = args.purge # need to submit the purge task, it might decide to do nothing if there were errors # Case 2: we are running locally and we know right now whether there were any errors. # Case 2a: We are configured to stop on error, but the fact that we got this far means there were no errors -- so we need to purge. # Case 2b: We are configured to ignore errors, so now we look for evidence of errors and purge if there were no errors. else: errors_detected = os.path.isfile(error_output_file) need_purge = args.purge and not errors_detected if need_purge: progress("Step 14 - Purge the intermediate output files") log_file = os.path.join(log_dir, "purge.log") command_line = "cfsan_snp_pipeline purge " + work_dir purge_job_id = runner.run(command_line, "purge", log_file, wait_for=[combine_metrics_job_id]) # Step 15 - Notify user of any non-fatal errors accumulated during processing if os.path.isfile(error_output_file) and os.path.getsize( error_output_file) > 0 and not stop_on_error: print( "\nThere were errors processing some samples.\nSee the log file %s for a summary of errors." % error_output_file, file=sys.stderr) # Exit here to prevent showing the "cfsan_snp_pipeline run finished" message. The jobs are queued, not finished yet. if job_queue_mgr is not None: # HPC sys.exit(0) else: end_time = time.time() elapsed_time = end_time - start_time print("Elapsed time =", elapsed_time)
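
# Minimal sketch of how the steps above are chained together (hypothetical step names;
# the real calls appear in the function above). runner.run() returns a job id and
# runner.run_array() returns an array-job id; passing those ids to wait_for or
# wait_for_array makes each step start only after its prerequisites finish. As in the
# calls above, "{1}" stands for one line of the per-sample parameter file.
#
#     job_a = runner.run("cfsan_snp_pipeline step_a ...", "stepA", log_a)
#     job_b = runner.run_array("cfsan_snp_pipeline step_b {1}", "stepB", log_b,
#                              sample_dirs_file, wait_for=[job_a])
#     job_c = runner.run("cfsan_snp_pipeline step_c ...", "stepC", log_c,
#                        wait_for_array=[job_b])
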
def collect_metrics(args): """Collect the quality metrics and SNP metrics for a sample. This function expects, or creates '(*)', the following files arranged in the following way: reference referenceFile.fasta samples sample_name_one/*.fastq.gz sample_name_one/reads.sam sample_name_one/reads.sorted.deduped.bam sample_name_one/reads.sorted.bam sample_name_one/reads.all.pileup sample_name_one/var.flt.vcf sample_name_one/var.flt_preserved.vcf sample_name_one/consensus.fasta sample_name_one/consensus_preserved.fasta sample_name_one/consensus.vcf sample_name_one/consensus_preserved.vcf sample_name_one/metrics* The input files are created outside of this function. The package documentation provides an example of preparing these files based on the lambda_virus sequence that is used as one test for this package. Parameters ---------- args : argparse.Namespace referenceFile : File path of the reference fasta file sampleDir : Relative or absolute directory of the sample consensusFastaFileName : File name of the consensus fasta file which must exist in the sample directory consensusPreservedFastaFileName : File name of the consensus preserved fasta file which must exist in the sample directory consensusVcfFileName : File name of the consensus vcf file which must exist in the sample directory consensusPreservedVcfFileName : File name of the consensus preserved vcf file which must exist in the sample directory maxSnps : Maximum allowed number of SNPs per sample metricsFile : Output file. Relative or absolute path to the metrics file """ utils.print_log_header(classpath=True) utils.print_arguments(args) #========================================================================== # Validate inputs #========================================================================== # Verify reference fasta file exists and is not empty reference_file_path = args.referenceFile utils.verify_non_empty_input_files("Reference file", [reference_file_path], error_handler="global") sample_dir = args.sampleDir utils.verify_non_empty_directory("Sample directory", sample_dir, error_handler="sample", continue_possible=False) metrics_file_path = args.metricsFile max_allowed_snps = args.maxSnps consensus_vcf_file_name = args.consensusVcfFileName consensus_preserved_vcf_file_name = args.consensusPreservedVcfFileName consensus_fasta_file_name = args.consensusFastaFileName consensus_preserved_fasta_file_name = args.consensusPreservedFastaFileName sample_id = utils.sample_id_from_dir(sample_dir) #========================================================================== # Read existing metrics file so some metrics can be reused #========================================================================== try: metrics = utils.read_properties(metrics_file_path) except IOError: metrics = dict() #------------------------- verbose_print( "# %s %s" % (utils.timestamp(), "Get machine and flowcell from fastq header")) #------------------------- machine = "" flowcell = "" fastq_files = fastq.list_fastq_files(sample_dir) fastq_files = [f for f in fastq_files if os.path.isfile(f)] # Exclude broken symlinks if not fastq_files: handle_error("No fastq files were found.") else: tags = fastq.extract_metadata_tags(fastq_files[0]) if tags: machine = tags.instrument or "" flowcell = tags.flow_cell or "" #------------------------- verbose_print("# %s %s" % (utils.timestamp(), "Sum file sizes of paired fastq files")) #------------------------- fastq_file_size = "" fastq_file_list = "" if fastq_files: fastq_file_size = sum([os.path.getsize(file) for file in 
fastq_files]) # Make a comma separated list of just the fastq file names without directories fastq_file_list = [os.path.basename(file) for file in fastq_files] fastq_file_list = ", ".join(fastq_file_list) #------------------------- verbose_print("# %s %s" % (utils.timestamp( ), "Calculate number of reads, %mapped, %proper pair, and ave insert size from sam file" )) #------------------------- num_reads = "" percent_reads_mapped = "" percent_proper_pair = "" ave_insert_size = "" file = os.path.join(sample_dir, "reads.sam") if verify_input_file("SAM file", file): # Metrics already freshly collected? needs_rebuild = utils.target_needs_rebuild([file], metrics_file_path) if not args.forceFlag and not needs_rebuild: num_reads = metrics.get("numberReads", "") # reuse already fresh metrics percent_reads_mapped = metrics.get( "percentReadsMapped", "") # reuse already fresh metrics percent_proper_pair = metrics.get( "percentProperPair", "") # reuse already fresh metrics ave_insert_size = metrics.get("aveInsertSize", "") # reuse already fresh metrics missing_any_metrics = not all([ num_reads, percent_reads_mapped, percent_proper_pair, ave_insert_size ]) if not missing_any_metrics: verbose_print( "Reusing previously calculated number of reads, %mapped, %proper pair, and ave insert size" ) else: tempfile_path = os.path.join(sample_dir, "tmp.sam.stats") try: command.run("samtools stats " + file, tempfile_path) except subprocess.CalledProcessError: pass # the error message has already been printed to stderr with open(tempfile_path) as f: for line in f: lower_line = line.lower() split_line = line.strip().split('\t') if "raw total sequences:" in lower_line: num_reads = split_line[2] continue if "reads mapped:" in lower_line: reads_mapped = split_line[2] try: percent_reads_mapped = 100.0 * float( reads_mapped) / float(num_reads) percent_reads_mapped = "%.2f" % percent_reads_mapped except ValueError: percent_reads_mapped = "" continue if "reads properly paired:" in lower_line: proper_pairs = split_line[2] try: percent_proper_pair = 100.0 * float( proper_pairs) / float(num_reads) percent_proper_pair = "%.2f" % percent_proper_pair except ValueError: percent_proper_pair = "" continue if "insert size average:" in lower_line: ave_insert_size = split_line[2] continue os.unlink(tempfile_path) missing_any_metrics = not all([ num_reads, percent_reads_mapped, percent_proper_pair, ave_insert_size ]) if missing_any_metrics: missing_list = [] if not num_reads: missing_list.append("number of reads") if not percent_reads_mapped: missing_list.append("percent reads mapped") if not percent_proper_pair: missing_list.append("percent proper pair") if not ave_insert_size: missing_list.append("ave insert size") error_text = "Cannot calculate " + ", ".join( missing_list) + '.' handle_error(error_text) #------------------------- # Calculate number of duplicate reads from deduped bam file #------------------------- num_dup_reads = "" remove_duplicate_reads = os.environ.get("RemoveDuplicateReads") or "true" remove_duplicate_reads = remove_duplicate_reads.lower() if remove_duplicate_reads == "true": verbose_print( "# %s %s" % (utils.timestamp(), "Calculate number of duplicate reads from deduped bam file")) file = os.path.join(sample_dir, "reads.sorted.deduped.bam") if verify_input_file("Deduped BAM file", file): # Metrics already freshly collected? 
needs_rebuild = utils.target_needs_rebuild([file], metrics_file_path) if not args.forceFlag and not needs_rebuild: num_dup_reads = metrics.get("numberDupReads", "") # reuse already fresh metrics if num_dup_reads: verbose_print( "Reusing previously calculated number of duplicate reads") else: num_dup_reads = command.run("samtools view -S -c -f 1024 " + file) num_dup_reads = num_dup_reads.strip() #------------------------- verbose_print("# %s %s" % (utils.timestamp(), "Calculate mean depth from pileup file")) #------------------------- ave_pileup_depth = "" file = os.path.join(sample_dir, "reads.all.pileup") if verify_input_file("Pileup file", file): # Metrics already freshly collected? needs_rebuild = utils.target_needs_rebuild([file], metrics_file_path) if not args.forceFlag and not needs_rebuild: ave_pileup_depth = metrics.get("avePileupDepth", "") # reuse already fresh metrics if ave_pileup_depth: verbose_print("Reusing previously calculated mean pileup depth") else: depth_sum = 0 with open(file) as f: for line in f: tokens = line.split() try: depth_sum += int(tokens[3]) except (ValueError, IndexError): pass reference_length = 0 for record in SeqIO.parse(reference_file_path, "fasta"): reference_length += len(record) if depth_sum > 0 and reference_length > 0: #print("depth_sum=%i" % depth_sum); #print("reference_length=%i" % reference_length) ave_pileup_depth = float(depth_sum) / float(reference_length) ave_pileup_depth = "%.2f" % ave_pileup_depth else: handle_error("Cannot calculate mean pileup depth.") #------------------------- verbose_print("# %s %s" % (utils.timestamp( ), "Count number of high confidence SNP positions from phase 1 vcf file")) #------------------------- phase1_snps = "" excluded_sample = "" file = os.path.join(sample_dir, "var.flt.vcf") if verify_input_file("VCF file", file): # Metrics already freshly collected? needs_rebuild = utils.target_needs_rebuild([file], metrics_file_path) if not args.forceFlag and not needs_rebuild: phase1_snps = metrics.get("phase1Snps", "") # reuse already fresh metrics if phase1_snps: verbose_print("Reusing previously calculated phase1 snps") else: phase1_snps = count_vcf_file_snps(file) # Flag excessive snps if max_allowed_snps > 0 and phase1_snps > max_allowed_snps: excluded_sample = "Excluded" handle_error("Excluded: exceeded %i maxsnps." % max_allowed_snps) phase1_snps = str(phase1_snps) #------------------------- verbose_print("# %s %s" % (utils.timestamp( ), "Count number of filter_regions preserved high confidence SNP positions from phase 1 vcf file" )) #------------------------- phase1_snps_preserved = "" excluded_sample_preserved = "" file = os.path.join(sample_dir, "var.flt_preserved.vcf") if verify_input_file("VCF file", file): # Metrics already freshly collected? needs_rebuild = utils.target_needs_rebuild([file], metrics_file_path) if not args.forceFlag and not needs_rebuild: phase1_snps_preserved = metrics.get( "phase1SnpsPreserved", "") # reuse already fresh metrics if phase1_snps_preserved: verbose_print( "Reusing previously calculated preserved phase1 snps") else: phase1_snps_preserved = count_vcf_file_snps(file) # Flag excessive snps if max_allowed_snps > 0 and phase1_snps_preserved > max_allowed_snps: excluded_sample_preserved = "Excluded" handle_error("Excluded: preserved exceeded %i maxsnps." 
% max_allowed_snps) phase1_snps_preserved = str(phase1_snps_preserved) #------------------------- verbose_print("# %s %s" % (utils.timestamp(), "Count number of consensus snps from consensus vcf file")) #------------------------- phase2_snps = "" file = os.path.join(sample_dir, consensus_vcf_file_name) if verify_input_file("Consensus VCF file", file): # Omit the phase2 snp count if the sample is excluded. # It will be meaningless since this sample's phase1 snps are excluded from the snplist. if excluded_sample != "Excluded": # Metrics already freshly collected? needs_rebuild = utils.target_needs_rebuild([file], metrics_file_path) if not args.forceFlag and not needs_rebuild: phase2_snps = metrics.get("snps", "") # reuse already fresh metrics if phase2_snps: verbose_print("Reusing previously calculated phase2 snps") else: phase2_snps = count_vcf_file_snps(file) phase2_snps = str(phase2_snps) #------------------------- verbose_print( "# %s %s" % (utils.timestamp(), "Count number of preserved consensus snps from consensus vcf file")) #------------------------- phase2_snps_preserved = "" file = os.path.join(sample_dir, consensus_preserved_vcf_file_name) if verify_input_file("Consensus VCF file", file): # Omit the phase2 snp count if the sample is excluded. # It will be meaningless since this sample's phase1 snps are excluded from the snplist. if excluded_sample_preserved != "Excluded": # Metrics already freshly collected? needs_rebuild = utils.target_needs_rebuild([file], metrics_file_path) if not args.forceFlag and not needs_rebuild: phase2_snps_preserved = metrics.get( "snpsPreserved", "") # reuse already fresh metrics if phase2_snps_preserved: verbose_print( "Reusing previously calculated preserved phase2 snps") else: phase2_snps_preserved = count_vcf_file_snps(file) phase2_snps_preserved = str(phase2_snps_preserved) #------------------------------------------ verbose_print( "# %s %s" % (utils.timestamp(), "Count missing positions in the snp matrix")) #------------------------------------------ missing_pos = "" file = os.path.join(sample_dir, consensus_fasta_file_name) if verify_input_file("Consensus fasta file", file): # Omit the phase2 gap count if the sample is excluded. # It will be meaningless since this sample's phase1 snps are excluded from the snplist. if excluded_sample != "Excluded": # Metrics already freshly collected? needs_rebuild = utils.target_needs_rebuild([file], metrics_file_path) if not args.forceFlag and not needs_rebuild: missing_pos = metrics.get("missingPos", "") # reuse already fresh metrics if missing_pos: verbose_print( "Reusing previously calculated missing positions") else: missing_pos = count_missing_snp_matrix_positions( file, sample_id) missing_pos = str(missing_pos) #------------------------------------------ verbose_print("# %s %s" % (utils.timestamp(), "Count missing positions in the preserved snp matrix")) #------------------------------------------ missing_pos_preserved = "" file = os.path.join(sample_dir, consensus_preserved_fasta_file_name) if verify_input_file("Consensus fasta file", file): # Omit the phase2 gap count if the sample is excluded. # It will be meaningless since this sample's phase1 snps are excluded from the snplist. if excluded_sample_preserved != "Excluded": # Metrics already freshly collected? 
needs_rebuild = utils.target_needs_rebuild([file], metrics_file_path) if not args.forceFlag and not needs_rebuild: missing_pos_preserved = metrics.get( "missingPosPreserved", "") # reuse already fresh metrics if missing_pos_preserved: verbose_print( "Reusing previously calculated missing positions") else: missing_pos_preserved = count_missing_snp_matrix_positions( file, sample_id) missing_pos_preserved = str(missing_pos_preserved) #------------------------- verbose_print("# %s %s" % (utils.timestamp(), "Print results")) #------------------------- with open(metrics_file_path, "w") as f: print("sample=" + '"' + sample_id + '"', file=f) print("fastqFileList=" + '"' + fastq_file_list + '"', file=f) print("fastqFileSize=" + str(fastq_file_size), file=f) print("machine=" + machine, file=f) print("flowcell=" + flowcell, file=f) print("numberReads=" + num_reads, file=f) print("numberDupReads=" + num_dup_reads, file=f) print("percentReadsMapped=" + percent_reads_mapped, file=f) print("percentProperPair=" + percent_proper_pair, file=f) print("aveInsertSize=" + ave_insert_size, file=f) print("avePileupDepth=" + ave_pileup_depth, file=f) print("phase1Snps=" + phase1_snps, file=f) print("phase1SnpsPreserved=" + phase1_snps_preserved, file=f) print("snps=" + phase2_snps, file=f) print("snpsPreserved=" + phase2_snps_preserved, file=f) print("missingPos=" + missing_pos, file=f) print("missingPosPreserved=" + missing_pos_preserved, file=f) print("excludedSample=" + excluded_sample, file=f) print("excludedSamplePreserved=" + excluded_sample_preserved, file=f) print("errorList=" + '"' + ' '.join(error_list) + '"', file=f)
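
# For reference, the metrics file written above is a simple key=value properties file,
# which utils.read_properties() reads back in so previously computed metrics can be
# reused on a re-run. A purely hypothetical example for one sample (all values made up):
#
#   sample="sample_name_one"
#   fastqFileList="sample1_R1.fastq.gz, sample1_R2.fastq.gz"
#   fastqFileSize=123456789
#   machine=M01234
#   flowcell=000000000-ABCDE
#   numberReads=800000
#   numberDupReads=12345
#   percentReadsMapped=98.32
#   percentProperPair=97.10
#   aveInsertSize=250.5
#   avePileupDepth=35.27
#   phase1Snps=42
#   phase1SnpsPreserved=40
#   snps=42
#   snpsPreserved=40
#   missingPos=3
#   missingPosPreserved=3
#   excludedSample=
#   excludedSamplePreserved=
#   errorList=""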