def main(args, outs):
    """Run the demux preflight checks on a sequencer run folder.

    Every failed check ends the stage through ``martian.exit``.

    Args:
        args: stage arguments; reads ``run_path``, ``allow_no_barcodes``
            and ``check_executables``.
        outs: stage outputs; not used here.
    """
    hostname = socket.gethostname()

    print("Checking run folder...")
    tk_preflight.check_rta_complete(args.run_path)

    print("Checking RunInfo.xml...")
    runinfo = tk_preflight.check_runinfo_xml(args.run_path)
    if not args.allow_no_barcodes:
        ok, msg = check_reads(runinfo)
        if not ok:
            martian.exit(msg)

    print("Checking system environment...")
    ok, msg = tk_preflight.check_ld_library_path()
    if not ok:
        martian.exit(msg)

    # The SampleSheet.csv check below is skipped whenever the `kitten`
    # module can be imported. The answer cannot change between loop
    # iterations, so the import is probed once up front.
    try:
        import kitten  # noqa: F401 -- imported only to test availability
        external = False
    except ImportError:
        external = True

    # Presence of SampleSheet.csv interferes with demux.
    # Ask customer to move it. Under older RTA, bcl2fastq looks for it
    # in Data/Intensities/BaseCalls while under newer RTA, it looks for it
    # at the top of the run folder.
    bc_dir = os.path.join(args.run_path, "Data", "Intensities", "BaseCalls")
    for ss_dir in [args.run_path, bc_dir]:
        ilmn_sample_sheet = os.path.join(ss_dir, "SampleSheet.csv")
        if external and os.path.exists(ilmn_sample_sheet):
            martian.exit(
                "On machine: %s, SampleSheet.csv found in run folder that would interfere with demux:\n%s\nPlease move, rename, or delete the file and run demux again."
                % (hostname, ilmn_sample_sheet))

    if args.check_executables:
        print("Checking bcl2fastq...")
        # Determine the RTA version of the run and whether this instrument
        # requires i2 to be RC'd
        (rta_version, rc_i2_read, bcl_params) = tenkit.bcl.get_rta_version(args.run_path)
        martian.log_info("RTA Version: %s" % rta_version)
        martian.log_info("BCL Params: %s" % str(bcl_params))

        # Determine the best available bcl2fastq version to use
        # Will call martian.exit() with an error message if there isn't
        # a compatible version available
        (major_ver, full_ver) = tenkit.bcl.check_bcl2fastq(hostname, rta_version)
        martian.log_info("Running bcl2fastq mode: %s. Version: %s" % (major_ver, full_ver))

    # NOTE(review): nesting was reconstructed from whitespace-collapsed
    # source; this check is assumed to run regardless of check_executables.
    ok, msg = tk_preflight.check_open_fh()
    if not ok:
        martian.exit(msg)
def main(args, outs):
    """Validate output format, sample ID and FASTQ inputs before the run.

    Every failed check ends the stage through ``martian.exit``.

    Args:
        args: stage arguments; reads ``output_format``, ``read_group``,
            ``sample_id`` and ``sample_def`` (a list of dicts with
            ``read_path``, ``lanes`` and optionally ``library_id``).
        outs: stage outputs; not used here.
    """
    hostname = socket.gethostname()

    if args.output_format == 'bam' and args.read_group is None:
        martian.exit(
            "Please specify a read_group to populate the @RG field of the BAM file"
        )

    if args.sample_id is not None:
        if not re.match(r"^[\w-]+$", args.sample_id):
            martian.exit(
                "Sample name may only contain letters, numbers, underscores, and dashes: "
                + args.sample_id)

    for sample_def in args.sample_def:
        read_path = sample_def["read_path"]
        # Without this guard a None read_path fails on .startswith() with
        # an AttributeError instead of a readable message.
        if not read_path:
            martian.exit("Must specify a read_path containing FASTQs.")
        if not read_path.startswith('/'):
            martian.exit(
                "Specified FASTQ folder must be an absolute path: %s" %
                read_path)
        if not os.path.exists(read_path):
            martian.exit(
                "On machine: %s, specified FASTQ folder does not exist: %s" %
                (hostname, read_path))
        if not os.access(read_path, os.X_OK):
            martian.exit(
                "On machine: %s, longranger does not have permission to open FASTQ folder: %s"
                % (hostname, read_path))
        if not os.listdir(read_path):
            martian.exit("Specified FASTQ folder is empty: " + read_path)

        library_id = sample_def.get("library_id")
        if library_id is not None:
            if not re.match(r"^[\w-]+$", library_id):
                martian.exit(
                    "Library name may only contain letters, numbers, underscores, and dashes: "
                    + library_id)

        lanes = sample_def["lanes"]
        if lanes is not None:
            for lane in lanes:
                if not tk_preflight.is_int(lane):
                    martian.exit(
                        "Lanes must be a comma-separated list of numbers.")

        ok, msg = tk_preflight.check_sample_indices(sample_def)
        if not ok:
            martian.exit(msg)

    # Check open file handles limit
    ok, msg = tk_preflight.check_open_fh()
    if not ok:
        martian.exit(msg)

    martian.log_info(tk_preflight.record_package_versions())
def main(args, outs):
    """Preflight for a BCL run folder ahead of demultiplexing.

    Walks through the run-folder, environment, barcode-whitelist,
    bcl2fastq, read-spec, samplesheet and dual-index checks in order.
    Failures reported as an (ok, message) pair are turned into
    ``martian.exit`` calls here.

    Args:
        args: stage arguments; reads ``run_path``, ``barcode_whitelist``,
            ``check_executables`` and ``bcl2fastq2_args``, and is passed
            whole to the sibling helpers.
        outs: stage outputs; not used here.
    """
    host = socket.gethostname()

    print("Checking run folder...")
    tk_preflight.check_rta_complete(args.run_path)

    print("Checking RunInfo.xml...")
    run_info = tk_preflight.check_runinfo_xml(args.run_path)

    print("Checking system environment...")
    passed, reason = tk_preflight.check_ld_library_path()
    if not passed:
        martian.exit(reason)

    print("Checking barcode whitelist...")
    tk_preflight.check_barcode_whitelist(args.barcode_whitelist)

    if args.check_executables:
        print("Checking bcl2fastq...")
        rta_version, rc_i2_read, bcl_params = tk_bcl.get_rta_version(
            args.run_path)
        martian.log_info("RTA Version: %s" % rta_version)
        martian.log_info("BCL Params: %s" % str(bcl_params))

        versions = tk_bcl.check_bcl2fastq(host, rta_version)
        (major_ver, full_ver) = versions
        martian.log_info(
            "Running bcl2fastq mode: %s. Version: %s" % (major_ver, full_ver))

    lane_splitting_disabled = '--no-lane-splitting' in args.bcl2fastq2_args
    if lane_splitting_disabled:
        martian.exit("The --no-lane-splitting option is not supported.")

    print("Emitting run information...")
    martian.log_info("-------mkfastq diagnostic start-------")
    emit_info(args)

    print("Checking read specification...")
    check_read_params(args, run_info)
    martian.log_info("-------mkfastq diagnostic end-------")

    print("Checking samplesheet specs...")
    check_specs(args)

    print("Checking for dual index flowcell...")
    check_dual_index(args, run_info)

    passed, reason = tk_preflight.check_open_fh()
    if not passed:
        martian.exit(reason)
def main(args, outs):
    """Preflight for a BCL run folder, its output folders and thread count.

    Failures reported as an (ok, message) pair are turned into
    ``martian.exit`` calls here.

    Args:
        args: stage arguments; reads ``run_path``, ``barcode_whitelist``,
            ``check_executables``, ``output_path``,
            ``interop_output_path`` and ``max_bcl2fastq_threads``.
        outs: stage outputs; not used here.
    """
    host = socket.gethostname()

    print("Checking run folder...")
    tk_preflight.check_rta_complete(args.run_path)

    print("Checking RunInfo.xml...")
    tk_preflight.check_runinfo_xml(args.run_path)

    print("Checking system environment...")
    passed, reason = tk_preflight.check_ld_library_path()
    if not passed:
        martian.exit(reason)

    print("Checking barcode whitelist...")
    tk_preflight.check_barcode_whitelist(args.barcode_whitelist)

    if args.check_executables:
        print("Checking bcl2fastq...")
        rta_version, rc_i2_read, bcl_params = tk_bcl.get_rta_version(
            args.run_path)
        martian.log_info("RTA Version: %s" % rta_version)
        martian.log_info("BCL Params: %s" % str(bcl_params))

        versions = tk_bcl.check_bcl2fastq(host, rta_version)
        (major_ver, full_ver) = versions
        martian.log_info(
            "Running bcl2fastq mode: %s. Version: %s" % (major_ver, full_ver))

    # NOTE(review): nesting was reconstructed from whitespace-collapsed
    # source; everything below is assumed to sit at function level.
    passed, reason = tk_preflight.check_open_fh()
    if not passed:
        martian.exit(reason)

    # Both optional folders need the same write + traverse permissions.
    needed = os.W_OK | os.X_OK

    out_dir = args.output_path
    if out_dir is not None:
        tk_preflight.check_folder_or_create(
            "--output-dir", out_dir, host, permission=needed)

    interop_dir = args.interop_output_path
    if interop_dir is not None:
        tk_preflight.check_folder_or_create(
            "--interop-dir", interop_dir, host, permission=needed)

    if args.max_bcl2fastq_threads < 1:
        martian.exit("Cannot run bcl2fastq with zero threads.")
def check_environment():
    """Hand the result of the open-file-handle check to ``check``."""
    fh_result = tk_preflight.check_open_fh()
    check(fh_result)
def main(args, outs):
    """Validate all pipeline arguments before any real work starts.

    Checks run in this order: sample ID, FASTQ inputs, reference, sex,
    open-file-handle limit, targets, target blacklist, restrict_locus,
    pre-called VCF, variant-calling mode, ground-truth files. Every
    failure ends the stage through ``martian.exit``.

    Args:
        args: stage arguments; reads ``sample_id``, ``sample_def``,
            ``reference_path``, ``sex``, ``check_executables``,
            ``targets``, ``target_blacklist``, ``restrict_locus``,
            ``vc_precalled``, ``vc_mode``, ``vc_ground_truth``,
            ``sv_min_qv`` and ``sv_ground_truth``.
        outs: stage outputs; not used here.
    """
    hostname = socket.gethostname()

    # Sample ID / pipestance name
    if args.sample_id is not None:
        if not re.match(r"^[\w-]+$", args.sample_id):
            martian.exit("Sample name may only contain letters, numbers, underscores, and dashes: " + args.sample_id)

    # FASTQ input
    for sample_def in args.sample_def:
        read_path = sample_def["read_path"]
        if not read_path:
            martian.exit("Must specify a read_path containing FASTQs.")
        if not read_path.startswith('/'):
            martian.exit("Specified FASTQ folder must be an absolute path: %s" % read_path)
        if not os.path.exists(read_path):
            martian.exit("On machine: %s, specified FASTQ folder does not exist: %s" % (hostname, read_path))
        if not os.access(read_path, os.X_OK):
            martian.exit("On machine: %s, longranger does not have permission to open FASTQ folder: %s" % (hostname, read_path))
        if not os.listdir(read_path):
            martian.exit("Specified FASTQ folder is empty: " + read_path)

        library_id = sample_def.get("library_id")
        if library_id is not None:
            if not re.match(r"^[\w-]+$", library_id):
                martian.exit("Library name may only contain letters, numbers, underscores, and dashes: " + library_id)

        lanes = sample_def["lanes"]
        if lanes is not None:
            for lane in lanes:
                if not is_int(lane):
                    martian.exit("Lanes must be a comma-separated list of numbers.")

        ok, msg = tk_preflight.check_sample_indices(sample_def)
        if not ok:
            martian.exit(msg)

    # Reference
    MAX_CONTIGS = 1000
    ok, msg = tk_preflight.check_refdata(args.reference_path, MAX_CONTIGS)
    if ok:
        martian.log_info(msg)
    else:
        martian.exit(msg)

    # Sex (given reference)
    if args.sex is not None:
        if args.sex.lower() not in ["m", "male", "f", "female"]:
            martian.exit("Sex of sample must be 'm', 'male', 'f', or 'female'.")
    else:
        # No sex given: the reference itself must list the chromosomes
        # used for sex determination. Load the list once and reuse it.
        male_chrom = tenkit.reference.load_male_chromosomes(args.reference_path)
        if male_chrom is None:
            martian.exit("Must specify sex of sample, or use a reference package that includes a sex_chromosomes.tsv file.\nFor more details, see http://support.10xgenomics.com/genome-exome/software/pipelines/latest/advanced/references")

        ref = tenkit.reference.open_reference(args.reference_path)
        for m in male_chrom:
            if m not in ref:
                martian.exit("Reference issue in sex_chromosomes.tsv. Male-specific chromosome '%s' does not exist in reference" % m)

        auto_chrom = tenkit.reference.load_autosomal_chromosomes(args.reference_path)
        if auto_chrom is None:
            martian.exit("No autosomal chromosome listed in sex_chromosomes.tsv. Please list an autosomal chromosome to use as a reference for sex determination")
        for a in auto_chrom:
            if a not in ref:
                martian.exit("Reference issue in sex_chromosomes.tsv. Autosomal chromosome '%s' does not exist in reference" % a)

    # Open file handles limit - per LONGRANGER-1758, only check this on the execution machine.
    # We can tell if we're on the execution machine by looking at args.check_executables
    if args.check_executables:
        ok, msg = tk_preflight.check_open_fh()
        if not ok:
            martian.exit(msg)

    # Targets
    if args.targets is not None:
        tk_preflight.check_file("targets", args.targets, hostname)
        tk_preflight.check_bed(args.targets, args.reference_path)
        if args.target_blacklist is None:
            print("\nWARNING: You selected targeted mode but did not provide a --cnvfilter.\nPlease note this may result in a high number of false positive CNV calls.\nFor more details, see http://support.10xgenomics.com/genome-exome/software\n")

    # Target blacklist
    if args.target_blacklist is not None:
        tk_preflight.check_file("cnvfilter", args.target_blacklist, hostname)
        tk_preflight.check_bed(args.target_blacklist, args.reference_path)

    # Restrict locus
    if tenkit.reference.is_tenx(args.reference_path):
        if args.restrict_locus is not None:
            if not re.match(r"^chr[A-Za-z0-9]{1,2}:[0-9]+\.\.[0-9]+$", args.restrict_locus):
                martian.exit("restrict_locus must be of the form 'chrXX:start..end'.")

    # Pre-called
    if args.vc_precalled is not None:
        tk_preflight.check_file("pre-called VCF", args.vc_precalled, hostname)
        check_vcf(args.vc_precalled, args)

    # VC mode
    if not re.match(r"^(disable|freebayes|gatk:/.*\.jar|precalled:/.*\.vcf)$", args.vc_mode):
        martian.exit("vc_mode must be of the form 'freebayes', 'gatk:/path/to/gatk_jar_file.jar', 'disable'.")

    if args.vc_precalled is None and args.vc_mode == "disable":
        martian.exit("Because you have not provided a pre-called VCF file, variant calling cannot be disabled.")

    # Split on the first colon only: the pattern above permits colons
    # inside the path, and an unbounded split would truncate such a path.
    vc_args = args.vc_mode.split(":", 1)
    vc_mode = vc_args[0]
    if vc_mode == "precalled":
        if args.vc_precalled is not None:
            martian.exit("Please specify a pre-called VCF file using only one method.")
        precalled_vars_path = vc_args[1]
        tk_preflight.check_file("pre-called VCF", precalled_vars_path, hostname)
        check_vcf(precalled_vars_path, args)
    elif vc_mode == "gatk":
        jar_path = vc_args[1]
        if not jar_path.startswith('/'):
            martian.exit("Specified GATK jar file must be an absolute path: %s" % jar_path)
        if not os.path.exists(jar_path):
            martian.exit("On machine: %s, specified GATK jar file does not exist: %s" % (hostname, jar_path))
        if os.path.isdir(jar_path):
            martian.exit("Please specify a GATK jar file, not a folder.")
        # NOTE(review): nesting was reconstructed from whitespace-collapsed
        # source; both GATK checks are assumed to be gated on
        # check_executables -- confirm against the original file.
        if args.check_executables:
            check_gatk(jar_path, hostname)
            check_gatk_ref(args.reference_path)

    # VC ground truth
    if args.vc_ground_truth is not None:
        tk_preflight.check_file("VCF ground truth", args.vc_ground_truth, hostname)
        check_vcf(args.vc_ground_truth, args)

    # SV min QV
    if args.sv_min_qv is not None and args.sv_min_qv < 0:
        martian.exit("sv_min_qv must be a positive integer.")

    # SV ground truth
    if args.sv_ground_truth is not None:
        tk_preflight.check_file("SV ground truth", args.sv_ground_truth, hostname)

    martian.log_info(tk_preflight.record_package_versions())
def main(args, outs):
    """Validate barcode settings, FASTQ inputs and downsampling options.

    Also collects the input FASTQ files, checks them, estimates the mean
    read length and total read count from them, and issues the
    ``mean_read_length`` alert. Every failed validation ends the stage
    through ``martian.exit``; an unknown ``input_mode`` goes through
    ``martian.throw``.

    Args:
        args: stage arguments; reads ``barcode_whitelist``,
            ``sample_def``, ``check_executables``, ``input_mode`` and
            ``downsample``.
        outs: stage outputs; not used here.
    """
    hostname = socket.gethostname()
    tk_preflight.record_package_versions()

    ## no barcode whitelist
    if args.barcode_whitelist is None:
        martian.exit("No barcode whitelist specified.")

    ## there must be a barcode in each sample
    ## and it should be 16 bases long
    ## and it should be on read 1 or read 2
    for sd in args.sample_def:
        if sd.get("bc_length", 0) != 16 or sd.get("bc_in_read", 3) not in [1, 2]:
            martian.exit("Barcode must be 16 bases and on read1 or read2.")

    print("Checking FASTQ folder...")
    for sample_def in args.sample_def:
        read_path = sample_def["read_path"]
        if not read_path:
            martian.exit("Must specify a read_path containing FASTQs.")
        if not read_path.startswith('/'):
            martian.exit(
                "Specified FASTQ folder must be an absolute path: %s" %
                read_path)
        if not os.path.exists(read_path):
            martian.exit(
                "On machine: %s, specified FASTQ folder does not exist: %s" %
                (hostname, read_path))
        if not os.access(read_path, os.X_OK):
            martian.exit(
                "On machine: %s, supernova does not have permission to open FASTQ folder: %s"
                % (hostname, read_path))
        if not os.listdir(read_path):
            martian.exit("Specified FASTQ folder is empty: " + read_path)

        library_id = sample_def.get("library_id")
        if library_id is not None:
            if not re.match(r"^[\w-]+$", library_id):
                martian.exit(
                    "Library name may only contain letters, numbers, underscores, and dashes: "
                    + library_id)

        lanes = sample_def["lanes"]
        if lanes is not None:
            for lane in lanes:
                if not is_int(lane):
                    martian.exit(
                        "Lanes must be a comma-separated list of numbers.")

    # Open file handles limit - per SUPERNOVA-152, only check this on the execution machine.
    # We can tell if we're on the execution machine by looking at args.check_executables
    if args.check_executables:
        ok, msg = tk_preflight.check_open_fh()
        if not ok:
            martian.exit(msg)

    ## compile a list of fastq files
    fastq_files = []
    if args.input_mode == "BCL_PROCESSOR":
        # Validate the sample_def fields are correct
        for (idx, sample_item) in enumerate(args.sample_def):
            check_key(idx, sample_item, "sample_indices", [list, type(None)])
            check_key(idx, sample_item, "read_path", [str, unicode])
            check_key(idx, sample_item, "lanes", [list, type(None)])

        main_read_type = "RA"
        find_func = tk_fasta.find_input_fastq_files_10x_preprocess
        for read_chunk in args.sample_def:
            sample_index_strings, msg = tk_preflight.check_sample_indices(
                read_chunk)
            if sample_index_strings is None:
                martian.exit(msg)

            path = read_chunk['read_path']
            lanes = read_chunk['lanes']
            for sample_index in sample_index_strings:
                reads = find_func(path, main_read_type, sample_index, lanes)
                fastq_files.extend(reads)
    elif args.input_mode == "ILMN_BCL2FASTQ":
        # Validate the sample_def fields are correct
        for (idx, sample_item) in enumerate(args.sample_def):
            check_key(idx, sample_item, "read_path", [str, unicode])
            check_key(idx, sample_item, "lanes", [list, type(None)])
            check_key(idx, sample_item, "sample_names", [list, type(None)])

        find_func = tk_fasta.find_input_fastq_files_bcl2fastq_demult
        for idx, read_chunk in enumerate(args.sample_def):
            sample_names = read_chunk['sample_names']
            # check_key above is given type(None) as an accepted type, but
            # iterating None below would raise a bare TypeError; report it
            # as a user error.
            if sample_names is None:
                martian.exit(
                    "Entry {} in sample_def missing required field: sample_names".format(idx))
            path = read_chunk['read_path']
            lanes = read_chunk['lanes']
            for sample_name in sample_names:
                reads = find_func(path, "R1", sample_name, lanes)
                fastq_files.extend(reads)
                reads = find_func(path, "R3", sample_name, lanes)
                fastq_files.extend(reads)
    else:
        martian.throw("Unrecognized input_mode: %s" % args.input_mode)

    ## if we found nothing then break
    if len(fastq_files) == 0:
        martian.exit(
            "No input FASTQs were found with the requested lanes and sample indices."
        )

    ## make sure they are okay first
    check_fastqs(fastq_files)

    total_reads = 0.0
    global_avg = 0.0
    for fn in fastq_files:
        reads_fn, avg_read_len_fn = estimate_read_count_and_length(
            fn, num_reads=1000)
        total_reads += reads_fn
        global_avg += avg_read_len_fn
    # Unweighted mean of the per-file averages; fastq_files is non-empty
    # here because the empty case exits above.
    global_avg = global_avg / len(fastq_files)

    martian.log_info(
        "Estimated read length = %.1f, Estimated total read input = %.1f" %
        (global_avg, total_reads))

    preflight_alert = alerts.AlertLogger(stage="preflight")
    preflight_alert.issue("mean_read_length", global_avg)

    # verify type and range for downsampling parameters
    # Note that non-numerical values for bc_subsample_rate and target_reads in mro trickle down as 'None'
    if args.downsample is not None:
        bc_subsample_rate = args.downsample.get("bc_subsample_rate", None)
        if bc_subsample_rate is not None:
            if not isinstance(bc_subsample_rate, (float, int)):
                martian.exit(
                    "Specified barcode fraction: %s is not a fraction. Please specify a valid float between 0 and 1."
                    % str(bc_subsample_rate))
            if bc_subsample_rate <= 0 or bc_subsample_rate > 1:
                martian.exit(
                    "Specified barcode fraction: %s is not between 0 and 1. Please specify a valid float between 0 and 1."
                    % str(bc_subsample_rate))
            if abs(bc_subsample_rate) < 1e-5:
                martian.exit(
                    "Specified barcode fraction: %s is too close to 0 and thus impractical."
                    % str(bc_subsample_rate))

        target_reads = args.downsample.get("target_reads", None)
        if target_reads is not None:
            if not isinstance(target_reads, (int, float)):
                martian.exit(
                    "Specified maxreads: %s is not a number. Please specify an integer larger than one for maxreads"
                    % str(target_reads))
            if target_reads < 1:
                martian.exit(
                    "Specified maxreads: %s is less than one. Please specify an integer larger than one for maxreads"
                    % str(target_reads))
def main(args, outs):
    """Validate sample ID, numeric options, FASTQ inputs and the reference.

    Every failed check ends the stage through ``martian.exit``; short
    (but not too short) primary contigs only raise ``martian.alarm``.

    Args:
        args: stage arguments; reads ``sample_id``, ``force_cells``,
            ``cnv_params``, ``downsample``, ``sample_def``,
            ``fastq_mode``, ``reference_path`` and ``check_executables``.
        outs: stage outputs; not used here.
    """
    hostname = socket.gethostname()

    # Sample ID / pipestance name
    if args.sample_id is not None:
        if not re.match(r"^[\w-]+$", args.sample_id):
            martian.exit("Sample name may only contain letters, numbers, underscores, and dashes: " + args.sample_id)

    # Check numerical options
    # types are already checked by mrp so only need to check ranges
    if args.force_cells is not None and (args.force_cells < 1 or
                                         args.force_cells > 20000):
        martian.exit("MRO parameter force_cells must be a positive integer"
                     " <= 20000.")

    # check min_ploidy, max_ploidy
    if args.cnv_params is not None:
        min_ploidy = args.cnv_params.get("min_ploidy", None)
        max_ploidy = args.cnv_params.get("max_ploidy", None)
        if min_ploidy is not None and min_ploidy <= 0:
            martian.exit("Command line argument soft-min-avg-ploidy must be a "
                         "positive real number.")
        if max_ploidy is not None and (max_ploidy <= 0 or max_ploidy > 8.0):
            martian.exit("Command line argument soft-max-avg-ploidy must be a "
                         "positive real number <= 8.")
        if (min_ploidy is not None and max_ploidy is not None
                and max_ploidy <= min_ploidy):
            martian.exit("Command line arguments must satisfy "
                         "soft-min-avg-ploidy < soft-max-avg-ploidy.")

    # check downsample options
    if args.downsample is not None and len(args.downsample.keys()) > 0:
        keys = list(args.downsample.keys())
        if len(keys) > 1:
            martian.exit("Please supply either maxreads or downsample but not "
                         "both.")
        key = keys[0]
        value = args.downsample[key]
        param_map = {"target_reads": "maxreads", "gigabases": "downsample"}
        # Compare the converted number, not the raw value, so the range
        # test agrees with the conversion. float(None) raises TypeError,
        # which is treated as a bad value too.
        try:
            bad_value = float(value) < 1e-12
        except (TypeError, ValueError):
            bad_value = True
        if bad_value:
            # Fall back to the raw key so an unexpected key still yields a
            # readable message instead of a KeyError.
            cs_key = param_map.get(key, key)
            martian.exit("Command line argument %s must be a positive number" %
                         cs_key)

    # FASTQ input
    for idx, sample_def in enumerate(args.sample_def):
        read_path = sample_def["read_path"]
        if not read_path:
            martian.exit("Must specify a read_path containing FASTQs.")
        if not read_path.startswith('/'):
            martian.exit("Specified FASTQ folder must be an absolute path: %s" % read_path)
        if not os.path.exists(read_path):
            martian.exit("On machine: %s, specified FASTQ folder does not exist: %s" % (hostname, read_path))
        if not os.access(read_path, os.X_OK):
            martian.exit("On machine: %s, longranger does not have permission to open FASTQ folder: %s" % (hostname, read_path))
        if not os.listdir(read_path):
            martian.exit("Specified FASTQ folder is empty: " + read_path)

        library_id = sample_def.get("library_id")
        if library_id is not None:
            if not re.match(r"^[\w-]+$", library_id):
                martian.exit("Library name may only contain letters, numbers, underscores, and dashes: " + library_id)

        lanes = sample_def["lanes"]
        if lanes is not None:
            for lane in lanes:
                if not tk_preflight.is_int(lane):
                    martian.exit("Lanes must be a comma-separated list of numbers.")

        if args.fastq_mode == "BCL_PROCESSOR":
            sample_indices, msg = tk_preflight.check_sample_indices(sample_def)
            if sample_indices is None:
                martian.exit(msg)

            find_func = tk_fasta.find_input_fastq_files_10x_preprocess
            reads = []
            for sample_index in sample_indices:
                # process interleaved reads
                reads.extend(find_func(read_path, "RA", sample_index, lanes))
            if len(reads) == 0:
                martian.exit("No input FASTQs were found for the requested parameters.")
        elif args.fastq_mode == "ILMN_BCL2FASTQ":
            sample_names = sample_def.get("sample_names", None)
            if sample_names is None:
                martian.exit("Entry {} in sample_def missing required field: sample_names".format(idx))
            find_func = tk_fasta.find_input_fastq_files_bcl2fastq_demult
            reads1 = []
            reads2 = []
            for sample_name in sample_names:
                r1 = find_func(read_path, "R1", sample_name, lanes)
                r2 = find_func(read_path, "R2", sample_name, lanes)
                if len(r1) != len(r2):
                    martian.exit("Entry {} in sample_defs are missing input FASTQs.".format(idx))
                reads1.extend(r1)
                reads2.extend(r2)
            if len(reads1) == 0 and len(reads2) == 0:
                martian.exit("No input FASTQs were found for the requested parameters.")
        else:
            martian.exit("Unrecognized fastq_mode: {}".format(args.fastq_mode))

    # Reference
    ok, msg = tk_preflight.check_refdata(args.reference_path, max_contigs=None)
    if ok:
        martian.log_info(msg)
    else:
        martian.exit(msg)

    contig_defs_json_path = os.path.join(args.reference_path, "fasta",
                                         "contig-defs.json")
    faidx_path = os.path.join(args.reference_path, "fasta", "genome.fa.fai")
    error_msg = contig_manager.verify_contig_defs(contig_defs_json_path,
                                                  faidx_path)
    if error_msg is not None:
        martian.exit(error_msg)

    try:
        ref = contig_manager.contig_manager(args.reference_path)
    except Exception as e:
        martian.exit("Unexpected error occurred.\n%s" % str(e))

    # too many contigs
    primary = ref.primary_contigs(allow_sex_chromosomes=True)
    num_primary_contigs = len(primary)
    if num_primary_contigs > 100:
        martian.exit("There can be at most 100 primary contigs.")

    # contig length checks
    chrom_length_dict = ref.get_contig_lengths()
    contig_length_exit = 500 * 1000
    contig_length_warn = 10 ** 7
    offending_contigs_warn = []
    offending_contigs_exit = []
    for c in primary:
        clen = chrom_length_dict[c]
        if clen < contig_length_exit:
            offending_contigs_exit.append(c)
        elif clen < contig_length_warn:
            offending_contigs_warn.append(c)
    if len(offending_contigs_exit) > 0:
        martian.exit("Primary contig(s) \"%s\" are shorter than %d bases. "
                     "Every primary contig must be at least %d bases "
                     "in length." % (",".join(offending_contigs_exit),
                                     contig_length_exit, contig_length_exit))
    elif (not args.check_executables) and len(offending_contigs_warn) > 0:
        martian.alarm("Primary contig(s) \"%s\" are shorter than %d bases. "
                      "Every primary contig is recommended to be at least %d bases "
                      "in length." % (",".join(offending_contigs_warn),
                                      contig_length_warn, contig_length_warn))

    # Open file handles limit
    if args.check_executables:
        ok, msg = tk_preflight.check_open_fh()
        if not ok:
            martian.exit(msg)

    martian.log_info(tk_preflight.record_package_versions())
def check_filehandle_limit():
    """Run the open-file-handle check and exit the stage if it fails."""
    passed, reason = tk_preflight.check_open_fh()
    if passed:
        return
    martian.exit(reason)
def main(args, outs):
    """Validate sample info, FASTQ inputs, references and chemistry.

    Every failed check ends the stage through ``martian.exit``.

    Args:
        args: stage arguments; reads ``sample_def``, ``reference_path``,
            ``vdj_reference_path``, ``chemistry_name``,
            ``custom_chemistry_def``, ``check_executables``,
            ``recovered_cells`` and ``force_cells``.
        outs: stage outputs; not used here.
    """
    hostname = socket.gethostname()

    print("Checking sample info...")
    ok, msg = tk_preflight.check_gem_groups(args.sample_def)
    if not ok:
        martian.exit(msg)

    print("Checking FASTQ folder...")
    for sample_def in args.sample_def:
        read_path = sample_def["read_path"]
        # Without this guard a None read_path fails on .startswith() with
        # an AttributeError instead of a readable message.
        if not read_path:
            martian.exit("Must specify a read_path containing FASTQs.")
        if not read_path.startswith('/'):
            martian.exit(
                "Specified FASTQ folder must be an absolute path: %s" %
                read_path)
        if not os.path.exists(read_path):
            martian.exit(
                "On machine: %s, specified FASTQ folder does not exist: %s" %
                (hostname, read_path))
        if not os.access(read_path, os.X_OK):
            martian.exit(
                "On machine: %s, cellranger does not have permission to open FASTQ folder: %s"
                % (hostname, read_path))
        if not os.listdir(read_path):
            martian.exit("Specified FASTQ folder is empty: " + read_path)

        lanes = sample_def["lanes"]
        if lanes is not None:
            for lane in lanes:
                if not is_int(lane):
                    martian.exit(
                        "Lanes must be a comma-separated list of numbers.")

        ok, msg = tk_preflight.check_sample_indices(sample_def)
        if not ok:
            martian.exit(msg)

    if args.reference_path is None and args.vdj_reference_path is None:
        martian.exit(
            "Must specify either reference_path or vdj_reference_path.")

    print("Checking transcriptome...")
    if args.reference_path is not None:
        ok, msg = cr_preflight.check_refdata(args.reference_path)
        if not ok:
            martian.exit(msg)

    if args.vdj_reference_path is not None:
        ok, msg = vdj_preflight.check_refdata(args.vdj_reference_path)
        if not ok:
            martian.exit(msg)

    print("Checking chemistry...")
    ok, msg = cr_chem.check_chemistry_defs()
    if not ok:
        martian.exit(msg)

    ok, msg = cr_chem.check_chemistry_arg(args.chemistry_name)
    if not ok:
        martian.exit(msg)

    if args.chemistry_name == cr_chem.CUSTOM_CHEMISTRY_NAME:
        ok, msg = cr_chem.check_chemistry_def(args.custom_chemistry_def)
        if not ok:
            martian.exit(msg)

    # Open file handles limit - per CELLRANGER-824, only check this on the execution machine.
    # We can tell if we're on the execution machine by looking at args.check_executables
    if args.check_executables:
        print("Checking system environment...")
        ok, msg = tk_preflight.check_open_fh()
        if not ok:
            martian.exit(msg)

    print("Checking optional arguments...")
    if args.recovered_cells is not None and args.force_cells is not None:
        martian.exit(
            "Cannot specify both --force-cells and --expect-cells (or --cells) in the same run."
        )

    cr_preflight.record_package_versions()
def main(args, outs):
    """Validate barcode settings and FASTQ inputs, then check read length.

    Collects the input FASTQ files, checks them and estimates the mean
    read length from them. A mean below 125 ends the stage; a mean below
    149 raises ``martian.alarm``. Every failed validation ends the stage
    through ``martian.exit``; an unknown ``input_mode`` goes through
    ``martian.throw``.

    Args:
        args: stage arguments; reads ``barcode_whitelist``,
            ``sample_def`` and ``input_mode``.
        outs: stage outputs; not used here.
    """
    hostname = socket.gethostname()
    tk_preflight.record_package_versions()

    ## no barcode whitelist
    if args.barcode_whitelist is None:
        martian.exit("No barcode whitelist specified.")

    ## there must be a barcode in each sample
    ## and it should be 16 bases long
    ## and it should be on read 1 or read 2
    for sd in args.sample_def:
        if sd.get("bc_length", 0) != 16 or sd.get("bc_in_read", 3) not in [1, 2]:
            martian.exit("Barcode must be 16 bases and on read1 or read2.")

    print("Checking FASTQ folder...")
    for sample_def in args.sample_def:
        read_path = sample_def["read_path"]
        if not read_path:
            martian.exit("Must specify a read_path containing FASTQs.")
        if not read_path.startswith('/'):
            martian.exit(
                "Specified FASTQ folder must be an absolute path: %s" %
                read_path)
        if not os.path.exists(read_path):
            martian.exit(
                "On machine: %s, specified FASTQ folder does not exist: %s" %
                (hostname, read_path))
        if not os.access(read_path, os.X_OK):
            martian.exit(
                "On machine: %s, supernova does not have permission to open FASTQ folder: %s"
                % (hostname, read_path))
        if not os.listdir(read_path):
            martian.exit("Specified FASTQ folder is empty: " + read_path)

        library_id = sample_def.get("library_id")
        if library_id is not None:
            if not re.match(r"^[\w-]+$", library_id):
                martian.exit(
                    "Library name may only contain letters, numbers, underscores, and dashes: "
                    + library_id)

        lanes = sample_def["lanes"]
        if lanes is not None:
            for lane in lanes:
                if not is_int(lane):
                    martian.exit(
                        "Lanes must be a comma-separated list of numbers.")

    # Open file handles limit
    ok, msg = tk_preflight.check_open_fh()
    if not ok:
        martian.exit(msg)

    ## compile a list of fastq files
    fastq_files = []
    if args.input_mode == "BCL_PROCESSOR":
        # Validate the sample_def fields are correct
        for (idx, sample_item) in enumerate(args.sample_def):
            check_key(idx, sample_item, "sample_indices", [list, type(None)])
            check_key(idx, sample_item, "read_path", [str, unicode])
            check_key(idx, sample_item, "lanes", [list, type(None)])

        main_read_type = "RA"
        find_func = tk_fasta.find_input_fastq_files_10x_preprocess
        for read_chunk in args.sample_def:
            sample_index_strings, msg = tk_preflight.check_sample_indices(
                read_chunk)
            if sample_index_strings is None:
                martian.exit(msg)

            path = read_chunk['read_path']
            lanes = read_chunk['lanes']
            for sample_index in sample_index_strings:
                reads = find_func(path, main_read_type, sample_index, lanes)
                fastq_files.extend(reads)
    elif args.input_mode == "ILMN_BCL2FASTQ":
        # Validate the sample_def fields are correct
        for (idx, sample_item) in enumerate(args.sample_def):
            check_key(idx, sample_item, "read_path", [str, unicode])
            check_key(idx, sample_item, "lanes", [list, type(None)])
            check_key(idx, sample_item, "sample_names", [list, type(None)])

        find_func = tk_fasta.find_input_fastq_files_bcl2fastq_demult
        for idx, read_chunk in enumerate(args.sample_def):
            sample_names = read_chunk['sample_names']
            # check_key above is given type(None) as an accepted type, but
            # iterating None below would raise a bare TypeError; report it
            # as a user error.
            if sample_names is None:
                martian.exit(
                    "Entry {} in sample_def missing required field: sample_names".format(idx))
            path = read_chunk['read_path']
            lanes = read_chunk['lanes']
            for sample_name in sample_names:
                reads = find_func(path, "R1", sample_name, lanes)
                fastq_files.extend(reads)
                reads = find_func(path, "R3", sample_name, lanes)
                fastq_files.extend(reads)
    else:
        martian.throw("Unrecognized input_mode: %s" % args.input_mode)

    ## if we found nothing then break
    if len(fastq_files) == 0:
        martian.exit(
            "No input FASTQs were found with the requested lanes and sample indices."
        )

    ## make sure they are okay first
    check_fastqs(fastq_files)

    total_reads = 0.0
    global_avg = 0.0
    for fn in fastq_files:
        reads_fn, avg_read_len_fn = estimate_read_count_and_length(
            fn, num_reads=1000)
        total_reads += reads_fn
        global_avg += avg_read_len_fn
    # Unweighted mean of the per-file averages; fastq_files is non-empty
    # here because the empty case exits above.
    global_avg = global_avg / len(fastq_files)

    martian.log_info(
        "Estimated read length = %.1f, Estimated total read input = %.1f" %
        (global_avg, total_reads))

    exit_msg = "We observe many reads shorter than 125 bases. The ideal read length for Supernova is 150 bases. Reads shorter than the ideal length are likely to yield a lower quality assembly, and the algorithm has not been tested on short reads. Because reads are too short, execution will be terminated."
    warn_msg = "We observe many reads shorter than 150 bases.The ideal read length for Supernova is 150 bases. Reads shorter than the ideal length are likely to yield a lower quality assembly."
    if global_avg < 125:
        martian.exit(exit_msg)
    elif global_avg < 149:
        martian.alarm(warn_msg)