def create_bcl2fastq_command_line(run_id, command_path, input_run_data_path, fastq_output_dir,
                                  samplesheet_csv_path, tmp_path, nb_mismatch, conf):

    nb_threads = str(get_cpu_count(conf))

    if command_path is None:
        final_command_path = 'bcl2fastq'
    else:
        final_command_path = command_path

    # Build the argument list
    args = []
    args.append(final_command_path)
    args.extend(['--loading-threads', nb_threads])
    args.extend(['--demultiplexing-threads', nb_threads])
    args.extend(['--processing-threads', nb_threads])
    args.extend(['--writing-threads', nb_threads])
    args.extend(['--sample-sheet', samplesheet_csv_path])
    args.extend(['--barcode-mismatches', str(nb_mismatch)])

    # Common parameters, set by default
    args.extend(['--input-dir', quote(input_run_data_path + '/Data/Intensities/BaseCalls')])
    args.extend(['--output-dir', quote(fastq_output_dir)])

    if common.is_conf_value_equals_true(BCL2FASTQ_WITH_FAILED_READS_KEY, conf):
        args.append('--with-failed-reads')

    # Specific parameters
    args.extend(['--runfolder-dir', quote(input_run_data_path)])
    args.extend(['--interop-dir', quote(fastq_output_dir + '/InterOp')])
    args.extend(['--min-log-level', 'TRACE'])
    # args.extend(['--stats-dir', fastq_output_dir + '/Stats'])
    # args.extend(['--reports-dir', fastq_output_dir + '/Reports'])

    # Set the compression level
    if common.is_conf_key_exists(BCL2FASTQ_COMPRESSION_LEVEL_KEY, conf):
        level_str = conf[BCL2FASTQ_COMPRESSION_LEVEL_KEY].strip()
        try:
            level_int = int(level_str)
            if 0 < level_int < 10:
                args.extend(['--fastq-compression-level', str(level_int)])
        except ValueError:
            pass

    # Add additional arguments defined in the configuration
    if common.is_conf_key_exists(BCL2FASTQ_ADDITIONNAL_ARGUMENTS_KEY, conf):
        additional_args = conf[BCL2FASTQ_ADDITIONNAL_ARGUMENTS_KEY]
        additional_args = re.sub('\\s+', ' ', additional_args).strip()
        args.extend(additional_args.split(' '))

    # Redirect bcl2fastq output to files
    args.extend(['>', quote(tmp_path + '/bcl2fastq_output_' + run_id + '.out')])
    args.extend(['2>', quote(tmp_path + '/bcl2fastq_output_' + run_id + '.err')])

    return " ".join(args)
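
# --- Illustrative sketch (not part of Aozan) ---------------------------------
# The command line above is returned as a single string that embeds '>' and
# '2>' redirections: it therefore has to be interpreted by a shell (for
# instance through os.system), and quote() protects paths containing spaces or
# shell metacharacters. The helper below is a minimal, hypothetical example of
# that same pattern; 'echo' and the log file names are placeholders only.
def _example_run_command_with_redirection(tmp_path, run_id):
    # Local imports so the sketch stays self-contained
    import os
    from pipes import quote

    args = ['echo', quote('demultiplexing run ' + run_id)]
    # Redirect stdout and stderr to log files, as the bcl2fastq command line does
    args.extend(['>', quote(tmp_path + '/bcl2fastq_output_' + run_id + '.out')])
    args.extend(['2>', quote(tmp_path + '/bcl2fastq_output_' + run_id + '.err')])

    cmd = ' '.join(args)
    # os.system() hands the string to /bin/sh, which performs the redirections
    return os.system(cmd) == 0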
def is_sync_step_enable(conf):
    """Check if all parameters required by the synchronization step are defined.

    Arguments:
        conf: configuration dictionary
    """

    # Synchronization step is enabled
    if common.is_conf_value_equals_true(SYNC_STEP_KEY, conf):

        # The bcl path must be defined
        if common.is_conf_key_exists(BCL_DATA_PATH_KEY, conf):
            bcl_path = conf[BCL_DATA_PATH_KEY]

            # The bcl path must not be a sequencer output path
            for path in hiseq_run.get_hiseq_data_paths(conf):
                if path == bcl_path:
                    error('Configuration error.',
                          'Basecalling path and hiseq output data path must be different: ' + bcl_path, conf)
                    return False

            return True

    return False
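
# --- Illustrative sketch (not part of Aozan) ---------------------------------
# is_sync_step_enable() only returns True when the sync step is switched on,
# the basecalling path is defined, and that path differs from every sequencer
# output path. The hypothetical configuration below would satisfy those
# conditions, assuming is_conf_value_equals_true() accepts the string 'true'
# and that no HiSeq output path equals '/data/bcl' (both paths are placeholders).
def _example_sync_conf():
    conf = {}
    conf[SYNC_STEP_KEY] = 'true'          # enable the synchronization step
    conf[BCL_DATA_PATH_KEY] = '/data/bcl'  # must differ from the sequencer output paths
    return conf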
def demux(run_id, conf):
    """Demultiplex a run.

    Arguments:
        run_id: The run id
        conf: configuration dictionary
    """

    start_time = time.time()
    common.log('INFO', 'Demux step: Starting', conf)

    reports_data_base_path = conf[REPORTS_DATA_PATH_KEY]
    reports_data_path = common.get_report_run_data_path(run_id, conf)

    samplesheet_filename = build_samplesheet_filename(run_id, conf)
    bcl2fastq_samplesheet_path = conf[TMP_PATH_KEY] + '/' + samplesheet_filename + '.csv'

    input_run_data_path = common.get_input_run_data_path(run_id, conf)

    if input_run_data_path is None:
        return False

    fastq_output_dir = conf[FASTQ_DATA_PATH_KEY] + '/' + run_id

    basecall_stats_prefix = 'basecall_stats_'
    basecall_stats_file = basecall_stats_prefix + run_id + '.tar.bz2'

    # Check if the root input bcl data directory exists
    if not os.path.exists(input_run_data_path):
        error("Basecalling data directory does not exist",
              "Basecalling data directory does not exist: " + str(input_run_data_path), conf)
        return False

    # Check if the root input FASTQ data directory exists
    if not common.is_dir_exists(FASTQ_DATA_PATH_KEY, conf):
        error("FASTQ data directory does not exist",
              "FASTQ data directory does not exist: " + conf[FASTQ_DATA_PATH_KEY], conf)
        return False

    # Check if the bcl2fastq samplesheets path exists
    if not common.is_dir_exists(BCL2FASTQ_SAMPLESHEETS_PATH_KEY, conf):
        error("Bcl2fastq samplesheet directory does not exist",
              "Bcl2fastq samplesheet directory does not exist: " + conf[BCL2FASTQ_SAMPLESHEETS_PATH_KEY], conf)
        return False

    # Check if the bcl2fastq base directory exists (only in standalone mode)
    if not common.is_conf_value_equals_true(BCL2FASTQ_USE_DOCKER_KEY, conf):
        if not common.is_dir_exists(BCL2FASTQ_PATH_KEY, conf):
            error("Bcl2fastq directory does not exist",
                  "Bcl2fastq directory does not exist: " + conf[BCL2FASTQ_PATH_KEY], conf)
            return False

    # Check if the temporary directory exists
    if not common.is_dir_exists(TMP_PATH_KEY, conf):
        error("Temporary directory does not exist",
              "Temporary directory does not exist: " + conf[TMP_PATH_KEY], conf)
        return False

    # Check if reports_data_path exists
    if not os.path.exists(reports_data_base_path):
        error("Report directory does not exist",
              "Report directory does not exist: " + reports_data_base_path, conf)
        return False

    # Create the report directory for the run if it does not exist
    if not os.path.exists(reports_data_path):
        os.mkdir(reports_data_path)

    # Check if the basecall stats archive already exists
    if os.path.exists(reports_data_path + '/' + basecall_stats_file):
        error('Basecall stats archive already exists for run ' + run_id,
              'Basecall stats archive already exists for run ' + run_id + ': ' + basecall_stats_file, conf)
        return False

    # Check if the output directory already exists
    if os.path.exists(fastq_output_dir):
        error("FASTQ output directory already exists for run " + run_id,
              'FASTQ output directory already exists for run ' + run_id + ': ' + fastq_output_dir, conf)
        return False

    # Compute disk usage and free disk space to check if enough space is available
    input_path_du = common.du(input_run_data_path)
    output_df = common.df(conf[FASTQ_DATA_PATH_KEY])
    du_factor = float(conf[DEMUX_SPACE_FACTOR_KEY])
    space_needed = input_path_du * du_factor

    common.log("WARNING", "Demux step: input disk usage: " + str(input_path_du), conf)
    common.log("WARNING", "Demux step: output disk free: " + str(output_df), conf)
    common.log("WARNING", "Demux step: space needed: " + str(space_needed), conf)

    common.log("CONFIG", "Bcl2fastq Docker mode: " + str(
        common.is_conf_value_equals_true(Settings.BCL2FASTQ_USE_DOCKER_KEY, conf)), conf)

    # Check if enough free space is available
    if output_df < space_needed:
        error("Not enough disk space to perform demultiplexing for run " + run_id,
              "Not enough disk space to perform demultiplexing for run " + run_id +
              '.\n%.2f Gb' % (space_needed / 1024 / 1024 / 1024) + ' is needed (factor x' + str(du_factor) +
              ') on ' + fastq_output_dir + '.', conf)
        return False

    # Load the RunInfo object
    run_info = RunInfo.parse(input_run_data_path + '/RunInfo.xml')

    # Load the samplesheet
    samplesheet, original_samplesheet_path = load_samplesheet(run_id, input_run_data_path, samplesheet_filename, conf)

    if samplesheet is None:
        return False

    # Update the samplesheet
    if not update_samplesheet(samplesheet, run_id, run_info.getFlowCellLaneCount(), conf):
        return False

    # Check the samplesheet
    check_result, samplesheet_warnings = check_samplesheet(samplesheet, run_id, run_info.getFlowCell(), conf)
    if not check_result:
        return False

    # Get the number of allowed mismatches
    nb_mismatch = get_bcl2fastq_mismatches(samplesheet, conf[BCL2FASTQ_MISMATCHES_KEY])

    # Write the final samplesheet
    if not write_bcl2fastq_samplesheet(samplesheet, bcl2fastq_samplesheet_path, conf):
        return False

    # Run demultiplexing
    if common.is_conf_value_equals_true(Settings.BCL2FASTQ_USE_DOCKER_KEY, conf):
        # With a Docker image
        if not demux_run_with_docker(run_id, input_run_data_path, fastq_output_dir, bcl2fastq_samplesheet_path,
                                     nb_mismatch, conf):
            return False
    else:
        if not demux_run_standalone(run_id, input_run_data_path, fastq_output_dir, bcl2fastq_samplesheet_path,
                                    nb_mismatch, conf):
            return False

    # Check if the output directory has been created
    if not os.path.exists(fastq_output_dir):
        error("Error while demultiplexing run " + run_id + ' on ' + common.get_instrument_name(run_id, conf),
              'Error while demultiplexing run ' + run_id + '.\n' +
              'The output directory of bcl2fastq has not been created: ' + fastq_output_dir, conf)
        return False

    # Check that the output path is a directory and not a file
    if os.path.isfile(fastq_output_dir):
        error("Error while demultiplexing run " + run_id + ' on ' + common.get_instrument_name(run_id, conf),
              'Error while demultiplexing run ' + run_id + '.\n' +
              'The output directory of bcl2fastq is a file instead of a directory: ' + fastq_output_dir, conf)
        return False

    # Copy the bcl2fastq logs to the output directory
    cmd = 'cp ' + quote(conf[TMP_PATH_KEY]) + '/bcl2fastq_output_' + run_id + '.* ' + quote(fastq_output_dir)
    common.log("INFO", "exec: " + cmd, conf)
    if os.system(cmd) != 0:
        error("Error while copying bcl2fastq log to the output FASTQ directory for run " + run_id,
              'Error while copying bcl2fastq log to the output FASTQ directory.\nCommand line:\n' + cmd, conf)
        return False

    # The output directory must be read only
    if not common.chmod_files_in_dir(fastq_output_dir, ".fastq", conf):
        error("Error while setting the output FASTQ directory to read only for run " + run_id,
              'Error while setting the output FASTQ directory to read only: ' + fastq_output_dir, conf)
        return False

    if not check_if_output_fastq_files_exists(fastq_output_dir):
        error("Error with bcl2fastq execution for run " + run_id,
              "Error with bcl2fastq execution for run " + run_id + ", no FASTQ file found in " + fastq_output_dir,
              conf)
        return False

    # Copy the samplesheet to the output directory
    cmd = 'cp -p ' + quote(bcl2fastq_samplesheet_path) + ' ' + quote(fastq_output_dir + '/SampleSheet.csv')
    common.log("INFO", "exec: " + cmd, conf)
    if os.system(cmd) != 0:
        error("Error while copying samplesheet file to FASTQ directory for run " + run_id,
              'Error while copying samplesheet file to FASTQ directory.\nCommand line:\n' + cmd, conf)
        return False

    # Create archives of the demultiplexing statistics
    if not archive_demux_stat(run_id, fastq_output_dir, reports_data_path, basecall_stats_file,
                              basecall_stats_prefix, bcl2fastq_samplesheet_path, conf):
        return False

    # Archive the samplesheet
    if not archive_samplesheet(run_id, original_samplesheet_path, bcl2fastq_samplesheet_path, conf):
        return False

    # Remove the temporary samplesheet file
    if os.path.exists(bcl2fastq_samplesheet_path):
        os.remove(bcl2fastq_samplesheet_path)

    # Create the index.html file
    common.create_html_index_file(conf, run_id, [Settings.HISEQ_STEP_KEY, Settings.DEMUX_STEP_KEY])

    df_in_bytes = common.df(fastq_output_dir)
    du_in_bytes = common.du(fastq_output_dir)
    df = df_in_bytes / (1024 * 1024 * 1024)
    du = du_in_bytes / (1024 * 1024 * 1024)

    common.log("WARNING", "Demux step: output disk free after demux: " + str(df_in_bytes), conf)
    common.log("WARNING", "Demux step: space used by demux: " + str(du_in_bytes), conf)

    duration = time.time() - start_time

    msg = 'Ending demultiplexing with ' + nb_mismatch + ' mismatch(es) for run ' + run_id + '.' + \
          '\nJob finished at ' + common.time_to_human_readable(time.time()) + \
          ' without error in ' + common.duration_to_human_readable(duration) + '.\n\n' + \
          'FASTQ files for this run ' + \
          'can be found in the following directory:\n ' + fastq_output_dir

    if samplesheet_warnings.size() > 0:
        msg += '\n\nSamplesheet warnings:'
        for warn in samplesheet_warnings:
            msg += "\n - " + warn

    # Add the path to the report if reports.url exists
    if common.is_conf_key_exists(REPORTS_URL_KEY, conf):
        msg += '\n\nRun reports can be found at following location:\n ' + conf[REPORTS_URL_KEY] + '/' + run_id

    msg += '\n\nFor this task %.2f GB has been used and %.2f GB is still free.' % (du, df)

    common.send_msg('[Aozan] Ending demultiplexing for run ' + run_id + ' on ' +
                    common.get_instrument_name(run_id, conf), msg, False, conf)
    common.log('INFO', 'Demux step: successful in ' + common.duration_to_human_readable(duration), conf)

    return True
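
# --- Illustrative sketch (not part of Aozan) ---------------------------------
# demux() estimates the space required for the FASTQ output as the disk usage
# of the input run multiplied by the demux space factor, and refuses to start
# when the target file system has less free space than that. The helper below
# restates that check in isolation; the byte values in the docstring are
# illustrative placeholders.
def _example_enough_space_for_demux(input_du_bytes, output_free_bytes, du_factor):
    """Return True when the free space on the output file system is at least
    'du_factor' times the disk usage of the input run directory.

    >>> _example_enough_space_for_demux(100 * 1024 ** 3, 250 * 1024 ** 3, 2.0)
    True
    >>> _example_enough_space_for_demux(100 * 1024 ** 3, 150 * 1024 ** 3, 2.0)
    False
    """
    return output_free_bytes >= float(du_factor) * input_du_bytes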
def load_samplesheet(run_id, input_run_data_path, samplesheet_filename, conf):
    """Load the samplesheet.

    Arguments:
        run_id: The run id
        input_run_data_path: The input run data path
        samplesheet_filename: samplesheet filename
        conf: configuration dictionary

    Return:
        a Samplesheet object and the original path of the samplesheet
    """

    run_info_path = input_run_data_path + '/RunInfo.xml'

    if not os.path.isfile(run_info_path):
        error("No RunInfo.xml file found for run " + run_id,
              "No RunInfo.xml file found for run " + run_id + ': ' + run_info_path + '.\n', conf)
        return None, None

    run_info = RunInfo.parse(run_info_path)
    flow_cell_id = run_info.getFlowCell()

    common.log("INFO", "Flowcell id: " + flow_cell_id, conf)
    common.log("INFO", "Samplesheet format: " + str(conf[BCL2FASTQ_SAMPLESHEET_FORMAT_KEY]), conf)

    try:
        if common.is_conf_value_defined(BCL2FASTQ_SAMPLESHEET_FORMAT_KEY, 'xls', conf):
            return load_samplesheet_using_extension(conf, samplesheet_filename, 'xls', input_run_data_path, run_id)

        elif common.is_conf_value_defined(BCL2FASTQ_SAMPLESHEET_FORMAT_KEY, 'csv', conf):
            return load_samplesheet_using_extension(conf, samplesheet_filename, 'csv', input_run_data_path, run_id)

        elif common.is_conf_value_defined(BCL2FASTQ_SAMPLESHEET_FORMAT_KEY, 'xlsx', conf):
            return load_samplesheet_using_extension(conf, samplesheet_filename, 'xlsx', input_run_data_path, run_id)

        elif common.is_conf_value_defined(BCL2FASTQ_SAMPLESHEET_FORMAT_KEY, 'command', conf):

            action_error_msg = 'Error while creating Bcl2fastq CSV samplesheet file'
            if not common.is_conf_key_exists(BCL2FASTQ_SAMPLESHEET_GENERATOR_COMMAND_KEY, conf):
                error(action_error_msg + ' for run ' + run_id, action_error_msg + ': the command is empty.', conf)
                return None, None

            input_samplesheet_generated_path = conf[TMP_PATH_KEY] + '/' + samplesheet_filename + 'generated.csv'

            cmd = conf[BCL2FASTQ_SAMPLESHEET_GENERATOR_COMMAND_KEY] + ' ' + run_id + ' ' + \
                  quote(input_samplesheet_generated_path)
            common.log("INFO", "exec: " + cmd, conf)
            if os.system(cmd) != 0:
                error(action_error_msg + ' for run ' + run_id, action_error_msg + '.\nCommand line:\n' + cmd, conf)
                return None, None

            if not os.path.exists(input_samplesheet_generated_path):
                error(action_error_msg + ' for run ' + run_id,
                      action_error_msg + ', the external command did not create the Bcl2fastq CSV file:\n' + cmd, conf)
                return None, None

            # Load the CSV samplesheet file
            samplesheet = SampleSheetCSVReader(input_samplesheet_generated_path).read()

            # Remove the generated samplesheet
            os.unlink(input_samplesheet_generated_path)

            return samplesheet, None

        else:
            error('Error while creating Bcl2fastq CSV samplesheet file for run ' + run_id,
                  'No method to get the Bcl2fastq samplesheet file has been defined. Please, set the ' +
                  '"bcl2fastq.samplesheet.format" property.\n', conf)
            return None, None

    except AozanException, exp:
        print StringUtils.stackTraceToString(exp)
        error("Error reading samplesheet: " + samplesheet_filename, exp.getMessage(), conf)
        return None, None
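
# --- Illustrative sketch (not part of Aozan) ---------------------------------
# When the samplesheet format is set to 'command', the configured generator is
# invoked as: <command> <run_id> <output_csv_path>, and it must create a
# bcl2fastq CSV samplesheet at <output_csv_path>. The function below is a
# hypothetical, minimal generator honouring that calling convention; the
# section header, columns and sample row are placeholders, not a real
# samplesheet, and real generators would typically query a LIMS or database.
def _example_samplesheet_generator(run_id, output_csv_path):
    lines = [
        '[Data]',
        'Sample_ID,Sample_Name,index',
        'S1,sample1_' + run_id + ',ACGTACGT',
    ]
    f = open(output_csv_path, 'w')
    try:
        f.write('\n'.join(lines) + '\n')
    finally:
        f.close()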
def sync(run_id, conf):
    """Synchronize a run.

    Arguments:
        run_id: the run id
        conf: configuration dictionary
    """

    start_time = time.time()
    common.log('INFO', 'Sync step: Starting', conf)

    bcl_data_path = conf[BCL_DATA_PATH_KEY]
    reports_data_base_path = conf[REPORTS_DATA_PATH_KEY]
    output_path = bcl_data_path + '/' + run_id

    # Check if rsync exists in the PATH
    if not common.exists_in_path("rsync"):
        error("Can't find all needed commands in PATH env var",
              "Can't find all needed commands in PATH env var. Unable to find: rsync command.", conf)
        return False

    # Check if reports_data_path exists
    if not os.path.exists(reports_data_base_path):
        error("Report directory does not exist",
              "Report directory does not exist: " + reports_data_base_path, conf)
        return False

    # Check if there is enough space to store the reports
    if common.df(reports_data_base_path) < 10 * 1024 * 1024 * 1024:
        error("Not enough disk space to store aozan reports for run " + run_id,
              "Not enough disk space to store aozan reports for run " + run_id +
              '.\nNeed more than 10 Gb on ' + reports_data_base_path + '.', conf)
        return False

    # Do the synchronization
    if not partial_sync(run_id, True, conf):
        return False

    # Rename the partial sync directory to the final run BCL directory
    if os.path.exists(output_path + '.tmp'):
        os.rename(output_path + '.tmp', output_path)

    # Check used and free space
    df_in_bytes = common.df(bcl_data_path)
    du_in_bytes = common.du(output_path)
    df = df_in_bytes / (1024 * 1024 * 1024)
    du = du_in_bytes / (1024 * 1024 * 1024)

    common.log("WARNING", "Sync step: output disk free after sync: " + str(df_in_bytes), conf)
    common.log("WARNING", "Sync step: space used by sync: " + str(du_in_bytes), conf)

    duration = time.time() - start_time

    msg = 'Ending synchronization for run ' + run_id + '.\n' + \
          'Job finished at ' + common.time_to_human_readable(time.time()) + \
          ' without error in ' + common.duration_to_human_readable(duration) + '.\n\n' + \
          'Run output files (without .cif files) can be found in the following directory:\n ' + output_path

    # Add the path to the report if reports.url exists
    if common.is_conf_key_exists(REPORTS_URL_KEY, conf):
        msg += '\n\nRun reports can be found at following location:\n ' + conf[REPORTS_URL_KEY] + '/' + run_id

    msg += '\n\nFor this task %.2f GB has been used and %.2f GB is still free.' % (du, df)

    common.send_msg('[Aozan] Ending synchronization for run ' + run_id + ' on ' +
                    common.get_instrument_name(run_id, conf), msg, False, conf)
    common.log('INFO', 'Sync step: successful in ' + common.duration_to_human_readable(duration), conf)

    return True
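
# --- Illustrative sketch (not part of Aozan) ---------------------------------
# sync() relies on partial_sync() (defined elsewhere) and on the
# "copy to '<run>.tmp', then rename" pattern, so that a run directory only
# appears under its final name once the copy has completed. The helper below
# is a hypothetical, simplified version of that pattern using rsync; it is not
# the real partial_sync() implementation and omits its exclusion rules (such
# as skipping .cif files).
def _example_sync_then_rename(source_dir, output_path):
    import os
    from pipes import quote

    tmp_output_path = output_path + '.tmp'

    # Trailing slash: copy the content of source_dir into tmp_output_path
    cmd = 'rsync -a ' + quote(source_dir + '/') + ' ' + quote(tmp_output_path)
    if os.system(cmd) != 0:
        return False

    # Only expose the run under its final name once the copy has succeeded
    os.rename(tmp_output_path, output_path)
    return True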
    # Write the XML report
    try:
        qc.writeXMLReport(report, qc_output_dir + '/' + run_id + '.xml')
    except AozanException, exp:
        error("Error while computing QC XML report for run " + run_id + ".", common.exception_msg(exp, conf), conf)
        return False
    except Throwable, exp:
        error("Error while computing QC XML report for run " + run_id + ".", common.exception_msg(exp, conf), conf)
        return False

    # Remove the tmp extension of the temporary QC directory
    os.rename(qc_output_dir, qc_output_dir[:-len(tmp_extension)])
    qc_output_dir = qc_output_dir[:-len(tmp_extension)]

    # Write the HTML report
    html_report_file = qc_output_dir + '/' + run_id + '.html'
    try:
        if not common.is_conf_key_exists(QC_REPORT_STYLESHEET_KEY, conf):
            qc.writeReport(report, None, html_report_file)
        else:
            qc.writeReport(report, conf[QC_REPORT_STYLESHEET_KEY], html_report_file)
    except AozanException, exp:
        error("Error while computing QC HTML report for run " + run_id + ".", common.exception_msg(exp, conf), conf)
        return False
    except Throwable, exp:
        error("Error while computing QC HTML report for run " + run_id + ".", common.exception_msg(exp, conf), conf)
        return False

    # Check if the report has been generated
    if not os.path.exists(html_report_file):
        error("Error while computing QC report for run " + run_id + ".", "No HTML report generated", conf)
        return False