def _exit_if_read_span_out_of_bounds(label, read_type, start_index, length,
                                     read_info_by_read_type):
    """Exit through martian when a start/length span overruns its read.

    Does nothing when either ``start_index`` or ``length`` is None.

    Args:
        label: Prefix used in the error message (e.g. "Barcode", "UMI").
        read_type: Key into ``read_info_by_read_type``.
        start_index: Start position of the span within the read, or None.
        length: Length of the span, or None.
        read_info_by_read_type: Mapping from read name to its read-info
            dict; the 'read_length' entry of the selected read is consulted.
    """
    if start_index is None or length is None:
        return
    end_index = start_index + length
    if end_index > read_info_by_read_type[read_type]['read_length']:
        martian.exit("%s out of bounds (%s:%d-%d)" %
                     (label, read_type, start_index, end_index))


def check_read_params(args, runinfo):
    """Validate barcode, sample-index and UMI read settings against a run.

    Every failed check is reported through ``martian.exit``.

    Args:
        args: Stage arguments. Reads bc_read_type, bc_start_index, bc_length,
            si_read_type, lanes, umi_read_type, umi_start_index and
            umi_length.
        runinfo: Passed to ``tk_bcl.load_run_info`` to obtain per-read info.
    """
    read_info, _ = tk_bcl.load_run_info(runinfo)
    read_info_by_read_type = {r['read_name']: r for r in read_info}

    # verify barcode
    if args.bc_read_type is None:
        martian.exit("Barcode read must be specified.")
    if args.bc_read_type not in read_info_by_read_type:
        martian.exit("Barcode read not found in run folder: %s" %
                     args.bc_read_type)
    _exit_if_read_span_out_of_bounds("Barcode", args.bc_read_type,
                                     args.bc_start_index, args.bc_length,
                                     read_info_by_read_type)

    # if sample index reads not generated, must specify lanes to demux
    if args.si_read_type not in read_info_by_read_type and not args.lanes:
        martian.exit(
            "Lanes must be specified if no sample index reads were generated"
        )

    # if UMI present, do bounds check. Without a UMI read type there is no
    # read to check against, so the bounds check is skipped instead of
    # failing with a KeyError on a None lookup.
    if args.umi_read_type is not None:
        if args.umi_read_type not in read_info_by_read_type:
            martian.exit("UMI read type not found in run folder: %s" %
                         args.umi_read_type)
        _exit_if_read_span_out_of_bounds("UMI", args.umi_read_type,
                                         args.umi_start_index,
                                         args.umi_length,
                                         read_info_by_read_type)
def main(args, outs):
    """Resolve the input sample sheet and write its transformed form.

    The first spec carrying a 'csv' entry is copied to
    ``outs.input_samplesheet`` and used as input; when no spec has one, a CSV
    is built from the specs and ``outs.input_samplesheet`` is set to None.
    The sheet is then transformed into ``outs.samplesheet`` and the
    'dual_indexed' entry of the result is stored on
    ``outs.dual_indexed_samplesheet``.

    Args:
        args: Stage arguments. Reads specs, run_path, si_read_type, project.
        outs: Stage outputs. Reads samplesheet; writes input_samplesheet and
            dual_indexed_samplesheet.
    """
    sample_specs = args.specs
    runinfo_xml = tk_preflight.check_runinfo_xml(args.run_path)
    samplesheet_dir = os.path.dirname(outs.samplesheet)

    supplied_csvs = [s['csv'] for s in sample_specs if s.get('csv')]
    if supplied_csvs:
        csv_path = supplied_csvs[0]
        shutil.copy(csv_path, outs.input_samplesheet)
    else:
        csv_path = make_csv_from_specs(sample_specs, runinfo_xml,
                                       samplesheet_dir)
        outs.input_samplesheet = None

    read_info, _ = tk_bcl.load_run_info(runinfo_xml)
    _, rc_i2_read, _ = tk_bcl.get_rta_version(args.run_path)

    reads_by_name = {r['read_name']: r for r in read_info}

    def read_length_of(read_name):
        # A read missing from the run contributes a length of 0.
        record = reads_by_name.get(read_name)
        return 0 if record is None else record['read_length']

    r1_length = read_length_of('R1')
    r2_length = read_length_of('R2')

    rc_sample_index = (args.si_read_type == 'I2' and rc_i2_read)
    lane_count = tk_lane.get_flowcell_lane_count(runinfo_xml)

    sheet_info = tk_sheet.transform_samplesheet(
        csv_path,
        outs.samplesheet,
        flowcell_lane_count=lane_count,
        r1_read_length=r1_length,
        r2_read_length=r2_length,
        rc_sample_index=rc_sample_index,
        project_name=args.project)
    outs.dual_indexed_samplesheet = sheet_info['dual_indexed']
def check_dual_index(args, runinfo):
    """Exit with guidance when a dual-index flowcell has no usable index2.

    This assumes that the spreadsheet and runinfo have been validated
    already.

    Nothing is checked when ``args.ignore_dual_index`` is set or when
    ``tk_bcl.is_real_dual_index_flowcell`` reports the run is not
    dual-indexed. Otherwise ``martian.exit`` is called when no spec carries a
    'csv' entry, when the sheet is a simple CSV sample sheet, or when it is
    an IEM sample sheet without a dual index.

    Args:
        args: Stage arguments. Reads ignore_dual_index and specs.
        runinfo: Passed to ``tk_bcl.load_run_info``.
    """
    # if the ignore_dual_index flag is set, then just proceed; we'll
    # ignore downstream. Checked first so RunInfo is not parsed needlessly.
    if args.ignore_dual_index:
        return

    read_info, _ = tk_bcl.load_run_info(runinfo)
    if not tk_bcl.is_real_dual_index_flowcell(read_info):
        return

    # from here on, ignore_dual_index is false and the flowcell is dual-index
    needs_iem_sheet_msg = (
        "Dual-index flowcell detected. Please add the --ignore-dual-index "
        "option to proceed, or use an Illumina Experiment Manager-formatted "
        "sample sheet with an index2 column for the second index.")

    # check input samplesheet for index2. If not present, complain
    csv_specs = [spec for spec in args.specs if spec.get('csv')]
    if not csv_specs:
        martian.exit(needs_iem_sheet_msg)

    csv_path = csv_specs[0]['csv']
    is_iem = tk_sheet.file_is_iem_samplesheet(csv_path)
    is_csv = tk_sheet.file_is_simple_samplesheet(csv_path)

    if is_csv:
        martian.exit(needs_iem_sheet_msg)

    if is_iem and not tk_sheet.iem_has_dual_index(csv_path):
        martian.exit(
            "Dual-index flowcell detected. Please add the --ignore-dual-index "
            "option to proceed, or add an index2 column to the supplied samplesheet."
        )
def main(args, outs):
    """Record run-level demultiplexing settings on ``outs``.

    run_path must be the top-level Illumina flowcell directory.

    Args:
        args: Stage arguments. Reads run_path; also handed to
            ``_split_by_tile``.
        outs: Stage outputs. Writes si_read_type, rc_i2_read and
            split_by_tile.
    """
    run_dir = args.run_path
    if not os.path.exists(run_dir):
        martian.throw("Run directory does not exist: %s" % run_dir)

    runinfo_xml = os.path.join(run_dir, "RunInfo.xml")
    read_info, _ = tk_bcl.load_run_info(runinfo_xml)
    outs.si_read_type = get_si_read_type(read_info)

    rta_version, rc_i2_read, bcl_params = tk_bcl.get_rta_version(run_dir)
    martian.log_info("BCL folder RTA Version: %s" % rta_version)
    martian.log_info("BCL params: %s" % str(bcl_params))
    martian.log_info("RC'ing i2 read: %s" % str(rc_i2_read))
    outs.rc_i2_read = rc_i2_read

    tile_split = _split_by_tile(args)
    martian.log_info("Splitting by tile: %s" % str(tile_split))
    outs.split_by_tile = tile_split
def run_bcl2fastq(args, outs):
    """Run bcl2fastq over a run folder using the supplied sample sheet.

    Chooses the bases mask (``args.bases_mask`` when given, otherwise one
    derived from RunInfo.xml), records the read-type map, i2 orientation,
    selected bcl2fastq version and command line on ``outs``, then invokes the
    tool. Failures are reported through ``martian.exit`` / ``martian.throw``.

    Args:
        args: Stage arguments. Reads run_path, output_path,
            interop_output_path, bases_mask, si_read_type,
            dual_indexed_samplesheet, ignore_dual_index, samplesheet_path,
            bcl2fastq1_args, bcl2fastq2_args and num_threads.
        outs: Stage outputs. fastq_path / interop_path are overwritten when
            the matching override is given in ``args``; file_read_types_map,
            rc_i2_read, bcl2fastq_version and bcl2fastq_args are written.

    Raises:
        KeyError: If ``_TENX_LD_LIBRARY_PATH`` is not set in the environment.
    """
    input_dir = os.path.join(args.run_path, "Data", "Intensities",
                             "BaseCalls")
    if args.output_path:
        outs.fastq_path = args.output_path
    output_dir = outs.fastq_path

    if args.interop_output_path:
        outs.interop_path = args.interop_output_path
    interop_dir = outs.interop_path

    martian.log_info("Running bcl2fastq on run: %s" % args.run_path)
    martian.log_info("FASTQ output dir: %s" % output_dir)

    run_info_xml = os.path.join(args.run_path, "RunInfo.xml")
    read_info, _ = tk_bcl.load_run_info(run_info_xml)

    if not args.bases_mask:
        use_bases_mask_val = tk_bcl.make_bases_mask_val(
            read_info,
            sample_index_read=args.si_read_type,
            dual_indexed=args.dual_indexed_samplesheet,
            ignore_dual_index=args.ignore_dual_index)
    else:
        use_bases_mask_val = args.bases_mask

    outs.file_read_types_map = tk_bcl.get_bcl2fastq_read_type_map(
        read_info,
        sample_index_read=args.si_read_type,
        dual_indexed=args.dual_indexed_samplesheet,
        ignore_dual_index=args.ignore_dual_index)

    # Determine the RTA version of the run and whether this instrument
    # requires i2 to be RC'd
    (rta_version, rc_i2_read,
     bcl_params) = tk_bcl.get_rta_version(args.run_path)
    outs.rc_i2_read = rc_i2_read
    martian.log_info("BCL folder RTA Version: %s" % rta_version)
    martian.log_info("BCL params: %s" % str(bcl_params))

    # Determine the best available bcl2fastq version to use
    # Will call martian.exit() with an error message if there isn't
    # a compatible version available
    hostname = socket.gethostname()
    (major_ver, full_ver) = tk_bcl.check_bcl2fastq(hostname, rta_version)
    outs.bcl2fastq_version = full_ver
    martian.log_info("Using bcl2fastq version: %s" % full_ver)
    martian.log_info("RC'ing i2 read: %s" % str(rc_i2_read))

    # Restore the LD_LIBRARY_PATH set aside by sourceme.bash/shell10x.
    # Only do this for the environment in which BCL2FASTQ will run.
    new_environ = dict(os.environ)
    new_environ['LD_LIBRARY_PATH'] = os.environ['_TENX_LD_LIBRARY_PATH']

    if major_ver == tk_bcl.BCL2FASTQ_V1:
        martian.exit(
            "bcl2fastq 1.8.4 is not currently supported. Please install bcl2fastq2, or use the 10x 'demux' pipeline instead."
        )

        # configure
        cmd = [
            "configureBclToFastq.pl",
            "--use-bases-mask=" + use_bases_mask_val,
            "--fastq-cluster-count", "20000000",
            "--input-dir=" + input_dir,
            "--output-dir=" + output_dir,
            "--no-eamss",
            "--ignore-missing-bcl",
            "--ignore-missing-control",
            "--ignore-missing-stats",
            "--sample-sheet=" + args.samplesheet_path,
        ]
        cmd += remove_deprecated_args(args.bcl2fastq1_args, major_ver,
                                      full_ver)

        martian.log_info("Running bcl2fastq v1 setup command:")
        martian.log_info(" ".join(cmd))
        outs.bcl2fastq_args = " ".join(cmd)

        try:
            ret = tk_proc.call(cmd, env=new_environ)
        except OSError:
            martian.throw(
                "configureBclToFastq.pl not found on path -- make sure you've added it to your environment"
            )

        if ret != 0:
            martian.throw("configureBclToFastq.pl failed. Exiting.")

        # Run the actual makefiles
        makefile = os.path.join(output_dir, "Makefile")
        if not os.path.exists(makefile):
            martian.throw("BclToFastq Makefile not found where expected: %s" %
                          makefile)

        martian.log_info("Running Makefile...")
        mk_cmd = ["make", "-C", output_dir, "-j", str(args.num_threads)]
        martian.log_info(" ".join(mk_cmd))
        ret = tk_proc.call(mk_cmd, env=new_environ)

        if ret > 0:
            martian.throw(
                "Running the BclToFastq Makefile failed with code: %d. Exiting"
                % ret)
        elif ret < 0:
            # A negative return code -N means "terminated by signal N";
            # negate it so the message shows the real signal number.
            martian.throw("Bcl2Fastq was killed with signal %d." % -ret)

    elif major_ver == tk_bcl.BCL2FASTQ_V2:
        if not os.path.exists(outs.interop_path):
            os.makedirs(outs.interop_path)
        if not os.path.exists(outs.fastq_path):
            os.makedirs(outs.fastq_path)

        # minimum-trimmed-read-length and mask-short-adapter-reads must be
        # our call (SIs, UMIs). Cap at 8 to ensure min is at sample-index,
        # if extra base grabbed for QC purposes (I8n, for example).
        min_read_length = min(8, min(x["read_length"] for x in read_info))

        cmd = [
            "bcl2fastq",
            "--minimum-trimmed-read-length", str(min_read_length),
            "--mask-short-adapter-reads", str(min_read_length),
            "--create-fastq-for-index-reads",
            "--ignore-missing-positions",
            "--ignore-missing-filter",
            "--ignore-missing-bcls",
            #'-r', str(args.__threads), '-w', str(args.__threads),
            "--use-bases-mask=" + use_bases_mask_val,
            "-R", args.run_path,
            "--output-dir=" + output_dir,
            "--interop-dir=" + interop_dir,
            "--sample-sheet=" + args.samplesheet_path,
        ]
        cmd += remove_deprecated_args(args.bcl2fastq2_args, major_ver,
                                      full_ver)
        outs.bcl2fastq_args = " ".join(cmd)

        martian.log_info("Running bcl2fastq2: %s" % (" ".join(cmd)))

        try:
            ret = tk_proc.call(cmd, env=new_environ)
        except OSError:
            martian.throw(
                "bcl2fastq not found on PATH -- make sure you've added it to your environment"
            )

        if ret > 0:
            # Point the user at the _stderr file two directory levels above
            # the path martian.make_path returns for '_stderr'.
            files_path = os.path.abspath(martian.make_path('_stderr'))
            enclosing_path = os.path.dirname(os.path.dirname(files_path))
            stderr_path = os.path.join(enclosing_path, '_stderr')
            martian.exit(
                "bcl2fastq exited with an error. You may have specified an invalid command-line option. See the full error here:\n%s"
                % stderr_path)
        elif ret < 0:
            # subprocess.call returns negative code (on UNIX): bcl2fastq
            # killed by external signal. Negate to report the signal number.
            martian.exit("bcl2fastq was killed with signal %d." % -ret)
def process_raw_ilmn_data(args, outs):
    """Convert a raw Illumina run folder to FASTQs and rename the outputs.

    run_path must be the top-level Illumina run directory.

    Runs whichever bcl2fastq major version ``tk_bcl.check_bcl2fastq``
    selects, writing under ``outs.raw_fastq_path``, then globs the produced
    FASTQ files and hands them to ``rename_fastq_files``. Failures are
    reported through ``martian.throw`` / ``martian.exit``.

    Args:
        args: Stage arguments. Reads run_path, tile_suffix ('*' means no
            per-tile split), num_threads and __threads.
        outs: Stage outputs. Reads raw_fastq_path.

    Raises:
        KeyError: If ``_TENX_LD_LIBRARY_PATH`` is not set in the environment.
    """
    input_dir = os.path.join(args.run_path, "Data", "Intensities",
                             "BaseCalls")
    output_dir = outs.raw_fastq_path

    martian.log_info("Running bcl2fastq on run: %s" % args.run_path)
    martian.log_info("FASTQ output dir: %s" % output_dir)

    if not os.path.exists(args.run_path):
        martian.throw("Run directory does not exist: %s" % args.run_path)

    run_info_xml = os.path.join(args.run_path, "RunInfo.xml")
    read_info, flowcell = tk_bcl.load_run_info(run_info_xml)
    use_bases_mask_val = tk_bcl.make_bases_mask_val(read_info)

    # Determine the RTA version of the run and whether this instrument
    # requires i2 to RC'd
    (rta_version, _, bcl_params) = tk_bcl.get_rta_version(args.run_path)
    martian.log_info("BCL folder RTA Version: %s" % rta_version)
    martian.log_info("BCL params: %s" % str(bcl_params))

    # Determine the best available bcl2fastq version to use
    # Will call martian.exit() with an error message if there isn't
    # a compatible version available
    hostname = socket.gethostname()
    (major_ver, full_ver) = tk_bcl.check_bcl2fastq(hostname, rta_version)
    martian.log_info("Using bcl2fastq version: %s" % full_ver)

    tile_split = args.tile_suffix != '*'

    try:
        # Internal use only. Move aside Illumina sample sheet so
        # bcl2fastq doesn't use it. For customers, there is a pre-flight
        # check to make sure there is no sample sheet in the places
        # bcl2fastq looks for it.
        # The import is only a presence probe: when it fails, the rename
        # below is skipped.
        import kitten

        # Older RTA put sheet into Data/Intensities/BaseCalls while
        # newer RTA put sheet at top of the BCL folder. Check both.
        for ss_dir in [args.run_path, input_dir]:
            ilmn_sample_sheet = os.path.join(ss_dir, "SampleSheet.csv")
            mv_sample_sheet = os.path.join(ss_dir, "IlluminaSampleSheet.csv")
            if os.path.exists(ilmn_sample_sheet):
                martian.log_info("Renaming the Illumina sample sheet")
                os.rename(ilmn_sample_sheet, mv_sample_sheet)
    except ImportError:
        pass

    # Restore the LD_LIBRARY_PATH set aside by sourceme.bash/shell10x.
    # Only do this for the environment in which BCL2FASTQ will run.
    new_environ = dict(os.environ)
    new_environ['LD_LIBRARY_PATH'] = os.environ['_TENX_LD_LIBRARY_PATH']

    if major_ver == tk_bcl.BCL2FASTQ_V1:
        if tile_split:
            martian.throw(
                "Cannot support NovaSeq demux scheme on bcl2fastq v1.  Exiting."
            )

        # configure
        # write bigger fastq chunks to avoid blow-up of chunks
        cmd = [
            "configureBclToFastq.pl",
            "--fastq-cluster-count", "20000000",
            "--no-eamss",
            "--use-bases-mask=" + use_bases_mask_val,
            "--input-dir=" + input_dir,
            "--output-dir=" + output_dir,
        ]

        martian.log_info("Running bcl2fastq setup command:")
        martian.log_info(" ".join(cmd))

        try:
            ret = tenkit.log_subprocess.call(cmd, env=new_environ)
        except OSError:
            martian.throw(
                "configureBclToFastq.pl not found on path -- make sure you've added it to your environment"
            )

        if ret != 0:
            martian.throw("configureBclToFastq.pl failed. Exiting.")

        # Run the actual makefiles
        makefile = os.path.join(output_dir, "Makefile")
        if not os.path.exists(makefile):
            martian.throw("BclToFastq Makefile not found where expected: %s" %
                          makefile)

        martian.log_info("Running Makefile...")
        mk_cmd = ["make", "-C", output_dir, "-j", str(args.num_threads)]
        martian.log_info(" ".join(mk_cmd))
        ret = tenkit.log_subprocess.call(mk_cmd, env=new_environ)

        if ret > 0:
            martian.throw(
                "running the BclToFastq Makefile failed with code: %d. Exiting"
                % ret)
        elif ret < 0:
            # A negative return code -N means "terminated by signal N";
            # negate it so the message shows the real signal number.
            martian.throw("Bcl2Fastq was killed with signal %d." % -ret)

    elif major_ver == tk_bcl.BCL2FASTQ_V2:
        if tile_split:
            proj_output_dir = os.path.join(output_dir,
                                           "Tile%s" % args.tile_suffix,
                                           "Project_%s" % flowcell)
        else:
            proj_output_dir = os.path.join(output_dir,
                                           "Project_%s" % flowcell)

        fastq_output_dir = os.path.join(proj_output_dir, "fastq")
        interop_output_dir = os.path.join(proj_output_dir, "interop")

        if not os.path.exists(fastq_output_dir):
            os.makedirs(fastq_output_dir)

        if not os.path.exists(interop_output_dir):
            os.makedirs(interop_output_dir)

        min_read_length = min(x["read_length"] for x in read_info)

        tiles_regex = None
        if tile_split:
            flowcell_info = tk_lane.get_flowcell_layout(run_info_xml)
            if flowcell_info.tile_length is None:
                martian.throw(
                    "Cannot determine tile name length from RunInfo.xml")

            # Match any tile whose name ends in the requested suffix.
            tiles_regex_prefix = "[0-9]" * (flowcell_info.tile_length - 1)
            tiles_regex = "%s%s" % (tiles_regex_prefix, args.tile_suffix)

        cmd = [
            "bcl2fastq",
            "--minimum-trimmed-read-length", str(min_read_length),
            # PIPELINES-1140 - required in bcl2fastq 2.17 to generate correct index read fastqs
            "--mask-short-adapter-reads", str(min_read_length),
            # LONGRANGER-121 - ignore missing bcl data
            "--ignore-missing-bcls",
            "--ignore-missing-filter",
            "--ignore-missing-positions",
            "--ignore-missing-controls",
            '-r', str(args.__threads),
            '-w', str(args.__threads),
            # TENKIT-72 avoid CPU oversubscription
            '-p', str(args.__threads),
            "--use-bases-mask=" + use_bases_mask_val,
            "-R", args.run_path,
            "--output-dir=" + fastq_output_dir,
            "--interop-dir=" + interop_output_dir,
        ]
        if tile_split:
            # Restrict this invocation to the selected tiles.
            cmd.append("--tiles=" + tiles_regex)

        martian.log_info("Running bcl2fastq 2: %s" % (" ".join(cmd)))

        try:
            ret = tenkit.log_subprocess.call(cmd, env=new_environ)
        except OSError:
            martian.throw(
                "bcl2fastq not found on PATH -- make sure you've added it to your environment"
            )

        if ret > 0:
            martian.exit("bcl2fastq failed. Exiting.")
        elif ret < 0:
            # Negative return code: killed by a signal; report its number.
            martian.exit("bcl2fastq was killed with signal %d." % -ret)

    # Glob over all lanes - demultiplex handles whether to collapse them
    if tile_split:
        fastq_glob = os.path.join(output_dir, "Tile*", "Project_" + flowcell,
                                  "*", "*.fastq*")
    else:
        fastq_glob = os.path.join(output_dir, "Project_" + flowcell, "*",
                                  "*.fastq*")
    start_fastq_files = glob.glob(fastq_glob)

    # File renaming -- bcl2fastq names the reads R1, R2, R3, R4
    # Use our conventions to make them R1, I1, I2, R2, as the case may be.
    rename_fastq_files(read_info, start_fastq_files)