def post(self):
    self._posted = True
    ## if alarm file does not exist, do nothing
    if os.path.exists(self._alarms_file):
        with open(self._alarms_file, 'r') as fp:
            alerts = json.loads(fp.read())
    else:
        return
    meta_alarm = []
    exit_str = ''
    for k, v in alerts.iteritems():
        if not k in self.SN_ALERT_HANDLERS:
            meta_alarm.append("unknown key {} in {} (BUG)".format(
                k, self._alarms_file))
        elif k == self.SN_EXIT:
            exit_str = ';'.join(v)
        else:
            handler = self.SN_ALERT_HANDLERS[k]
            for post in v:
                handler(post)
    for alarm in meta_alarm:
        martian.alarm(alarm)
    if len(exit_str) > 0:
        self.exit(exit_str)
def main(args, outs):
    raw_profiles, mask = coverage_matrix.load_matrix(
        args.raw_singlecell_profiles, args.reference_path)
    # Get mappability, GC content
    ncells = raw_profiles[0].shape[0]
    # Default: (intercept, linear, quadratic) = (1.0, 0.0, 0.0)
    # Assign the defaults up front so the outs are well-defined even if
    # estimate_gc_bias raises before the coefficients are unpacked.
    (intercept, linear, quadratic) = (1.0, 0.0, 0.0)
    # Sum up all single-cell profiles
    try:
        print('DEBUG 0')
        result = estimate_gc_bias(args.raw_singlecell_profiles, args.tracks,
                                  args.reference_path)
        print('DEBUG result')
        print(result)
        (quadratic, linear, intercept) = result['Summary']['quadratic_coefficients']
        print('DEBUG intercept=%f, linear=%f, quadratic=%f' %
              (intercept, linear, quadratic))
    except Exception as error:
        martian.alarm(
            "stages/copy_number_processor/estimate_gc_bias_coefficients/__init__ encountered an exception. Error: %s"
            % repr(error))
    # try/except
    #
    # Export scale factor and GC bias coefficients
    outs.linear = linear
    outs.quadratic = quadratic
def split(args):
    '''We just align each chunk independently -- joining will happen in the join step of SORT_READS'''
    # Pull some reads from fastq files -- bail out if it's less than 25bp
    fastq_tests = [x['read1'] for x in args.chunks]
    for fastq_test in fastq_tests:
        with open(fastq_test) as in_file:
            reader = tk_fasta.read_generator_fastq(in_file)
            # Sample the first 10 reads; only the last read sampled is length-checked.
            for name, read, qual in itertools.islice(reader, 10):
                continue
            if len(read) < MIN_READ_LENGTH:
                martian.alarm(
                    "BWA-MEM can't handle reads <25bp -- reads will be unmapped."
                )
                continue

    # estimated amount of memory needed to process genome is 2x(num gigabases)+4GB
    reference_pyfasta = tenkit.reference.open_reference(args.reference_path)
    reference_bases = sum(
        len(reference_pyfasta[contig]) for contig in reference_pyfasta)
    base_mem_in_gb = int(math.ceil(2 * reference_bases / (1024.0**3)))
    mem_in_gb = base_mem_in_gb + 4

    chunks = [{
        'chunk': x,
        '__threads': args.num_threads,
        '__mem_gb': mem_in_gb
    } for x in args.chunks]

    return {'chunks': chunks}
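## Worked example of the memory estimate in split() above (assumed numbers,
## illustrative only -- not pipeline code): for a human-sized reference of
## roughly 3.1e9 bases,
##   base_mem_in_gb = ceil(2 * 3.1e9 / 1024**3) = ceil(5.8) = 6
##   mem_in_gb      = 6 + 4 = 10
## so each alignment chunk would request about 10 GB from the scheduler.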
def issue(self, metric, value, format_string=""):
    for alert in self.alerts:
        ## find the right metric
        if alert["metric"] == metric:
            ## should we trigger?
            if (alert["compare"] == ">") ^ (value < alert["threshold"]):
                ## optional formatting of alert message with format_string or value
                if len(format_string) == 0:
                    format_string = str(value)
                message = alert["message"].replace("{}", format_string)
                ## issue an alert
                if alert["action"] == "alarm":
                    martian.alarm(message)
                elif alert["action"] == "exit":
                    martian.exit(message)
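## A minimal, self-contained sketch of the trigger rule used in issue() above,
## with hypothetical alert dicts (illustrative only, not pipeline code):
## ">" alerts fire when the value is at or above the threshold, and "<" alerts
## fire when the value is below it.
def _would_trigger_example(alert, value):
    return (alert["compare"] == ">") ^ (value < alert["threshold"])

assert _would_trigger_example({"compare": ">", "threshold": 0.2}, 0.3)
assert not _would_trigger_example({"compare": ">", "threshold": 0.2}, 0.1)
assert _would_trigger_example({"compare": "<", "threshold": 100}, 50)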
def main(args, outs):
    normalized_singlecell_profiles, mask = coverage_matrix.load_matrix(
        args.normalized_singlecell_profiles, args.reference_path)
    print('DEBUG generate_final_clusters/__init__.main():')
    print('normalized_singlecell_profiles[0].shape')
    print(normalized_singlecell_profiles[0].shape)
    ncells = normalized_singlecell_profiles[0].shape[0]
    # Default result: all cells in a single cluster
    results = [range(ncells)]
    try:
        if args.skip_clustering:
            print('Skipping clustering.')
        else:
            ## NOTE: this is a temporary short circuit of clustering when there are more than
            ## 500 cells. We will revisit this module and fix the issue later.
            if True:  # ncells < 500:
                # results = cluster_jedna.cluster(normalized_singlecell_profiles, mask, n_merge=25, score_cutoff=10)
                results = cluster_jedna.cluster(normalized_singlecell_profiles,
                                                mask, n_merge=25, score_cutoff=5)
            else:
                martian.alarm(
                    "Too many cells for clustering. Putting all cells in one cluster."
                )
            # if ncells else
        # if skip_clustering else
    except Exception as error:
        martian.alarm(
            "Clustering encountered an exception. Putting all cells in one cluster. Error: %s"
            % repr(error))
    # try/except
    #
    out_file = open(outs.clusters, 'w')
    out_file.write(tenkit.safe_json.safe_jsonify(results))
    out_file.close()
def main(args, outs):
    with open(args.summary) as sf:
        metrics = json.load(sf)
    with open(CRDNA_ALARMS) as af:
        alarms = json.load(af)
    howbadisit = tenkit.alarms.evaluate_alarms(alarms, metrics)
    with open(outs.alarms, 'w') as of:
        of.write(tenkit.safe_json.safe_jsonify(howbadisit))
    with open(outs.alarms_summary, 'w') as sf:
        sf.write("10X Genomics -- Pipeline Run Details\n")
        sf.write("-" * 40 + "\n")
        sf.write("Sample ID: %s\n" % args.sample_id)
        sf.write("Reference: %s\n" % args.reference_path)
        for oopsie in howbadisit:
            sf.write("ALERT: %s is %s. %s\n" %
                     (oopsie["title"], oopsie["formatted_value"],
                      oopsie["message"]))
    if len(howbadisit) > 0:
        martian.alarm(
            "There were %i sequencing alerts. Look at alarms_summary.txt for details.\n"
            % len(howbadisit))
def join(args, outs, chunk_defs, chunk_outs):
    cell_barcodes = {}
    full_counts = {}
    for chunk_out in chunk_outs:
        if not chunk_out.counts:
            continue
        for (species, bc_counts) in chunk_out.counts.iteritems():
            if not species in full_counts:
                full_counts[species] = {}
            for (bc, count) in bc_counts.iteritems():
                full_counts[species][bc] = count

    for species in full_counts.iterkeys():
        sorted_data = sorted(full_counts[species].items(),
                             key=lambda item: item[1],
                             reverse=True)
        bc_counts = [item[1] for item in sorted_data]

        ## Define cells by first taking one log width. Then refine by choosing
        ## one log width from the 99th percentile amongst the cells
        max_count = bc_counts[0]
        min_count = float(max_count) / np.power(10, args.log_width)
        count_99th = np.percentile([x for x in bc_counts if x >= min_count], 99)
        min_count = float(count_99th) / np.power(10, args.log_width)

        # implement force_cells if supplied to overrule min_count
        if args.force_cells is not None and args.force_cells > 0:
            index = min(args.force_cells, len(bc_counts)) - 1
            min_count = max(bc_counts[index], 1)
            martian.log_info("Using force_cells")

        if not species in cell_barcodes:
            cell_barcodes[species] = {}
        for i, (bc, count) in enumerate(sorted_data, start=1):
            if count < min_count:
                break
            cell_barcodes[species][bc] = count
            if i >= MAX_CELLS:
                martian.log_info("%s: hit maximum number of cells "\
                    "(%d)"%(species, MAX_CELLS))
                min_count = count
                break

        martian.log_info("%s: max count %d, min count %d" %
                         (species, max_count, min_count))

        # some logging
        ncell = len(cell_barcodes[species])
        nobs = len(bc_counts)
        if len(cell_barcodes[species]) > 0:
            mean = np.mean(cell_barcodes[species].values())
            median = np.median(cell_barcodes[species].values())
        else:
            mean = 0
            median = 0
        print("{}: {} cells of {} obs, cell barcode reads: "
              "mean = {:.2f}, median = {:.1f}".format(species, ncell, nobs,
                                                      mean, median))

    # alarm user
    with open(CRDNA_ALARMS) as af:
        alarms = json.load(af)
    # filter alarms
    alarms = [
        alarm for alarm in alarms
        if alarm['id'] in ['not_enough_cells', 'too_many_cells']
    ]
    num_cells = sum([len(bc_cts) for bc_cts in cell_barcodes.itervalues()])
    alarm_results = tk_alarms.evaluate_alarms(alarms, {'num_cells': num_cells})
    for a in alarm_results:
        martian.alarm("%s is %s. %s\n" %
                      (a["title"], a["formatted_value"], a["message"]))

    outs.cell_barcodes = cell_barcodes
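## A small runnable sketch of the two-pass "log width" threshold used in join()
## above, with hypothetical barcode counts (illustrative only, not pipeline
## code): the first pass keeps barcodes within one log-width of the top count,
## the second pass re-anchors the threshold at the 99th percentile of those counts.
import numpy as np

def _cell_threshold_example(bc_counts, log_width):
    # bc_counts: barcode read counts sorted in descending order
    min_count = float(bc_counts[0]) / np.power(10, log_width)
    count_99th = np.percentile([x for x in bc_counts if x >= min_count], 99)
    return float(count_99th) / np.power(10, log_width)

# e.g. _cell_threshold_example([50000, 48000, 45000, 400, 30], log_width=2) ~= 500,
# so barcodes with roughly 500 or more reads would be called as cells.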
def __del__(self):
    if not self._posted:
        martian.alarm(
            "C++ alarms were not posted, but left in file (BUG).")
    else:
        self.check_delete()
def main(args, outs):
    hostname = socket.gethostname()

    # Sample ID / pipestance name
    if args.sample_id is not None:
        if not re.match("^[\w-]+$", args.sample_id):
            martian.exit("Sample name may only contain letters, numbers, underscores, and dashes: " + args.sample_id)

    # Check numerical options
    # types are already checked by mrp so only need to check ranges
    if args.force_cells is not None and (args.force_cells < 1 or
                                         args.force_cells > 20000):
        martian.exit("MRO parameter force_cells must be a positive integer"\
            " <= 20000.")

    # check min_ploidy, max_ploidy
    if args.cnv_params is not None:
        min_ploidy = args.cnv_params.get("min_ploidy", None)
        max_ploidy = args.cnv_params.get("max_ploidy", None)
        if min_ploidy is not None and min_ploidy <= 0:
            martian.exit("Command line argument soft-min-avg-ploidy must be a "\
                "positive real number.")
        if max_ploidy is not None and (max_ploidy <= 0 or max_ploidy > 8.0):
            martian.exit("Command line argument soft-max-avg-ploidy must be a "\
                "positive real number <= 8.")
        if (min_ploidy is not None and max_ploidy is not None and
                max_ploidy <= min_ploidy):
            martian.exit("Command line arguments must satisfy "\
                "soft-min-avg-ploidy < soft-max-avg-ploidy.")

    # check downsample options
    if args.downsample is not None and len(args.downsample.keys()) > 0:
        keys = args.downsample.keys()
        if len(keys) > 1:
            martian.exit("Please supply either maxreads or downsample but not "\
                "both.")
        key = keys[0]
        value = args.downsample[key]
        param_map = {"target_reads": "maxreads", "gigabases": "downsample"}
        bad_value = False
        try:
            bad_value = float(value) < 1e-12
        except ValueError:
            bad_value = True
        if bad_value:
            cs_key = param_map[key]
            martian.exit("Command line argument %s must be a positive number" % cs_key)

    # FASTQ input
    for idx, sample_def in enumerate(args.sample_def):
        read_path = sample_def["read_path"]
        if not read_path:
            martian.exit("Must specify a read_path containing FASTQs.")
        if not read_path.startswith('/'):
            martian.exit("Specified FASTQ folder must be an absolute path: %s" % read_path)
        if not os.path.exists(read_path):
            martian.exit("On machine: %s, specified FASTQ folder does not exist: %s" % (hostname, read_path))
        if not os.access(read_path, os.X_OK):
            martian.exit("On machine: %s, longranger does not have permission to open FASTQ folder: %s" % (hostname, read_path))
        if not os.listdir(read_path):
            martian.exit("Specified FASTQ folder is empty: " + read_path)

        library_id = sample_def.get("library_id")
        if library_id is not None:
            if not re.match("^[\w-]+$", library_id):
                martian.exit("Library name may only contain letters, numbers, underscores, and dashes: " + library_id)

        lanes = sample_def["lanes"]
        if lanes is not None:
            for lane in lanes:
                if not tk_preflight.is_int(lane):
                    martian.exit("Lanes must be a comma-separated list of numbers.")

        if args.fastq_mode == "BCL_PROCESSOR":
            sample_indices, msg = tk_preflight.check_sample_indices(sample_def)
            if sample_indices is None:
                martian.exit(msg)

            find_func = tk_fasta.find_input_fastq_files_10x_preprocess
            reads = []
            for sample_index in sample_indices:
                # process interleaved reads
                reads.extend(find_func(read_path, "RA", sample_index, lanes))

            if len(reads) == 0:
                martian.exit("No input FASTQs were found for the requested parameters.")
        elif args.fastq_mode == "ILMN_BCL2FASTQ":
            sample_names = sample_def.get("sample_names", None)
            if sample_names is None:
                martian.exit("Entry {} in sample_def missing required field: sample_names".format(idx))
            find_func = tk_fasta.find_input_fastq_files_bcl2fastq_demult
            reads1 = []
            reads2 = []
            for sample_name in sample_names:
                r1 = find_func(read_path, "R1", sample_name, lanes)
                r2 = find_func(read_path, "R2", sample_name, lanes)
                if len(r1) != len(r2):
                    martian.exit("Entry {} in sample_defs are missing input FASTQs.".format(idx))
                reads1.extend(r1)
                reads2.extend(r2)

            if len(reads1) == 0 and len(reads2) == 0:
                martian.exit("No input FASTQs were found for the requested parameters.")
        else:
            martian.exit("Unrecognized fastq_mode: {}".format(args.fastq_mode))

    # Reference
    ok, msg = tk_preflight.check_refdata(args.reference_path, max_contigs=None)
    if ok:
        martian.log_info(msg)
    else:
        martian.exit(msg)

    contig_defs_json_path = os.path.join(args.reference_path, "fasta",
                                         "contig-defs.json")
    faidx_path = os.path.join(args.reference_path, "fasta", "genome.fa.fai")
    error_msg = contig_manager.verify_contig_defs(contig_defs_json_path,
                                                  faidx_path)
    if error_msg is not None:
        martian.exit(error_msg)

    try:
        ref = contig_manager.contig_manager(args.reference_path)
    except Exception as e:
        martian.exit("Unexpected error occurred.\n%s" % str(e))

    # too many contigs
    primary = ref.primary_contigs(allow_sex_chromosomes=True)
    num_primary_contigs = len(primary)
    if num_primary_contigs > 100:
        martian.exit("There can be at most 100 primary contigs.")

    # contig length checks
    chrom_length_dict = ref.get_contig_lengths()

    contig_length_exit = 500 * 1000
    contig_length_warn = 10 ** 7
    offending_contigs_warn = []
    offending_contigs_exit = []
    for c in primary:
        clen = chrom_length_dict[c]
        if clen < contig_length_exit:
            offending_contigs_exit.append(c)
        elif clen < contig_length_warn:
            offending_contigs_warn.append(c)
    if len(offending_contigs_exit) > 0:
        martian.exit("Primary contig(s) \"%s\" are shorter than %d bases. "\
            "Every primary contig must be at least %d bases "\
            "in length."%(",".join(offending_contigs_exit),
            contig_length_exit, contig_length_exit))
    elif (not args.check_executables) and len(offending_contigs_warn) > 0:
        martian.alarm("Primary contig(s) \"%s\" are shorter than %d bases. "\
            "Every primary contig is recommended to be at least %d bases "\
            "in length."%(",".join(offending_contigs_warn),
            contig_length_warn, contig_length_warn))

    # Open file handles limit
    if args.check_executables:
        ok, msg = tk_preflight.check_open_fh()
        if not ok:
            martian.exit(msg)

    martian.log_info(tk_preflight.record_package_versions())
def main(args, outs):
    """Combine reads from multiple input FASTQ files, and potentially trim.

    Demultiplex outputs a series of FASTQ files with filenames of the form:
    read-[RA|I1|I2]_si-AGTAACGT_lane-001_chunk_001.fastq[.gz].
    """

    def check_key(n, dict_in, name, tys):
        if not dict_in.has_key(name):
            martian.exit("Entry %d in sample_def missing required field: %s" % (n, name))
        if not (type(dict_in[name]) in tys):
            martian.exit("Entry %d in sample_def for '%s' has incorrect type -- expecting %s, got %s" % (n, name, str(tys), type(dict_in[name])))

    if args.downsample is not None:
        if len(args.downsample.keys()) > 1:
            martian.exit("More than one downsampling mode requested. Please select a single downsampling mode")
        (k, v) = args.downsample.items()[0]
        if not k in ["gigabases", "subsample_rate", "target_reads"]:
            martian.exit("Unrecognized downsampling mode: %s" % k)

    # Check for self-consistent gem_group settings in the sample_def entries
    gem_groups = [x['gem_group'] for x in args.sample_def]
    all_null = all([x is None for x in gem_groups])
    all_int = all([type(x) is int for x in gem_groups])
    if not (all_null or all_int):
        martian.exit("Inconsistent gem_group tags. Please specify all gem_group tags as null, or all gem_group tags with an integer")

    # If all gem_groups are set to null, then set them all to 1
    if all_null:
        for sample_item in args.sample_def:
            sample_item['gem_group'] = 1

    # Predicted input bases
    total_seq_bases = 0
    # Predicted input reads
    total_input_reads = 0

    # verify input mode upfront
    if args.input_mode not in ["BCL_PROCESSOR", "ILMN_BCL2FASTQ"]:
        martian.throw("Unrecognized input_mode: %s" % args.input_mode)

    for (idx, sample_item) in enumerate(args.sample_def):
        # validate fields
        check_key(idx, sample_item, "read_path", [str, unicode])
        check_key(idx, sample_item, "lanes", [list, type(None)])
        check_key(idx, sample_item, "gem_group", [int, type(None)])
        if args.input_mode == "BCL_PROCESSOR":
            check_key(idx, sample_item, "sample_indices", [list, type(None)])
        elif args.input_mode == "ILMN_BCL2FASTQ":
            check_key(idx, sample_item, "sample_names", [list, type(None)])

    interleaved_read_type = "RA"

    chunks = []
    read_groups = set()

    for read_chunk in args.sample_def:
        # Each sample_def entry can have a separate pre-applied downsampling rate.
        # We adjust the estimated data in that chunk to account for this subsampling.
        chunk_subsample_rate = read_chunk.get('subsample_rate', 1.0)

        bc_in_read = {}
        if read_chunk.has_key('bc_in_read'):
            if read_chunk['bc_in_read'] is not None:
                bc_in_read['bc_in_read'] = read_chunk['bc_in_read']
                bc_in_read['bc_length'] = read_chunk['bc_length']

        path = read_chunk['read_path']
        lanes = read_chunk['lanes']
        gem_group = read_chunk['gem_group']
        unbarcoded = read_chunk.get('unbarcoded')
        sample_id = args.sample_id
        library_id = read_chunk.get('library_id', 'MissingLibrary')

        # split on BCL_PROCESSOR / ILMN_BCL2FASTQ
        # the main difference is that BCL_PROCESSOR uses interleaved reads and labels FASTQs by sample index;
        # whereas ILMN_BCL2FASTQ uses R1/R2 and labels by sample name
        if args.input_mode == "BCL_PROCESSOR":
            sample_index_strings, msg = tk_preflight.check_sample_indices(read_chunk)
            if sample_index_strings is None:
                martian.exit(msg)

            sample_seq_bases = 0
            read_length = 100  # Should be overwritten below

            find_func = tk_fasta.find_input_fastq_files_10x_preprocess
            for sample_index in sample_index_strings:
                # process interleaved reads
                reads = find_func(path, interleaved_read_type, sample_index, lanes)
                for read in reads:
                    _, predicted_seq_bases, read_length = fastq_data_estimate(read)
                    sample_seq_bases += predicted_seq_bases

            sample_seq_bases = chunk_subsample_rate * sample_seq_bases
            bp_per_read_pair = 2 * read_length

            martian.log_info("Input data: Predict %f GB from %s. (%d bp per read pair)" %
                             (float(sample_seq_bases)/1e9, path, bp_per_read_pair))
            total_seq_bases += sample_seq_bases
            total_input_reads += float(sample_seq_bases)/read_length

            for sample_index in sample_index_strings:
                reads = find_func(path, interleaved_read_type, sample_index, lanes)

                # TODO confirm that this works with cellranger
                si_read, bc_read = ("I1", "I2")
                if 'barcode_read' in read_chunk and read_chunk['barcode_read'] == 'I1':
                    si_read, bc_read = ("I2", "I1")
                sis = find_func(path, si_read, sample_index, lanes)

                # allow empty sample index case if all reads in lane are same sample
                if sis is None or sis == []:
                    sis = [None] * len(reads)

                if not unbarcoded:
                    barcodes = find_func(path, bc_read, sample_index, lanes)
                    if len(barcodes) == 0:
                        barcodes = [None] * len(reads)
                else:
                    barcodes = [None] * len(reads)

                # calculate chunks
                for r, b, si in zip(reads, barcodes, sis):
                    (flowcell, lane) = get_run_data(r)
                    rg_string = tk_bam.pack_rg_string(sample_id, library_id,
                                                      gem_group, flowcell, lane)
                    new_chunk = {
                        'read1': r, 'read2': None, 'reads_interleaved': True,
                        'barcode': b, 'sample_index': si,
                        'barcode_reverse_complement': False,
                        'gem_group': gem_group,
                        'subsample_rate': chunk_subsample_rate,
                        'read_group': rg_string
                    }
                    new_chunk.update(bc_in_read)
                    chunks.append(new_chunk)
                    read_groups.add(rg_string)

        elif args.input_mode == "ILMN_BCL2FASTQ":
            sample_names = read_chunk['sample_names']

            read_length1 = None
            read_length2 = None
            sample_seq_bases = 0
            find_func = tk_fasta.find_input_fastq_files_bcl2fastq_demult

            for sample_name in sample_names:
                # process read 1
                reads = find_func(path, "R1", sample_name, lanes)
                for read in reads:
                    _, predicted_seq_bases, read_length1 = fastq_data_estimate(read)
                    sample_seq_bases += predicted_seq_bases
                # process read 2
                reads = find_func(path, "R2", sample_name, lanes)
                for read in reads:
                    _, predicted_seq_bases, read_length2 = fastq_data_estimate(read)
                    sample_seq_bases += predicted_seq_bases

            if read_length1 is None and read_length2 is None:
                martian.exit("No input FASTQs were found for the requested parameters.")
            elif read_length1 is None:
                martian.exit("No input FASTQs were found for Read1.")
            elif read_length2 is None:
                martian.exit("No input FASTQs were found for Read2.")

            sample_seq_bases = chunk_subsample_rate * sample_seq_bases
            bp_per_read_pair = read_length1 + read_length2

            martian.log_info("Input data: Predict %f GB from %s. (%d bp per read pair)" %
                             (float(sample_seq_bases)/1e9, path, bp_per_read_pair))
            total_seq_bases += sample_seq_bases
            total_input_reads += float(sample_seq_bases)*2/(read_length1 + read_length2)

            for sample_name in sample_names:
                r1_reads = find_func(path, "R1", sample_name, lanes)
                r2_reads = find_func(path, "R2", sample_name, lanes)

                # TODO confirm that this works with cellranger
                si_read, bc_read = ("I1", "I2")
                if 'barcode_read' in read_chunk and read_chunk['barcode_read'] == 'I1':
                    si_read, bc_read = ("I2", "I1")
                sis = find_func(path, si_read, sample_name, lanes)

                # allow empty sample index case if all reads in lane are same sample
                if sis is None or sis == []:
                    sis = [None] * len(r1_reads)

                # in Chromium chemistry... there shouldn't be separate barcode reads...
                if not unbarcoded:
                    barcodes = find_func(path, bc_read, sample_name, lanes)
                    if len(barcodes) == 0:
                        barcodes = [None] * len(r1_reads)
                else:
                    barcodes = [None] * len(r1_reads)

                # again, with Chromium, the barcodes should be an array of Nones, but
                # just in case...
                if not (len(r1_reads) == len(r2_reads) == len(barcodes)):
                    martian.log_info("Read 1 files: %s" % str(r1_reads))
                    martian.log_info("Read 2 files: %s" % str(r2_reads))
                    martian.log_info("Barcode files: %s" % str(barcodes))
                    martian.exit("Read1, Read2, and Barcode files are mismatched. Exiting pipeline")

                # calculate chunks
                for r1, r2, b, si in zip(r1_reads, r2_reads, barcodes, sis):
                    (flowcell, lane) = get_run_data(r1)
                    rg_string = tk_bam.pack_rg_string(sample_id, library_id,
                                                      gem_group, flowcell, lane)
                    new_chunk = {
                        'read1': r1, 'read2': r2, 'reads_interleaved': False,
                        'barcode': b, 'sample_index': si,
                        'barcode_reverse_complement': False,
                        'gem_group': gem_group,
                        'subsample_rate': chunk_subsample_rate,
                        'read_group': rg_string
                    }
                    new_chunk.update(bc_in_read)
                    chunks.append(new_chunk)
                    read_groups.add(rg_string)

    martian.log_info("Input data: Predict %f total GB" % (float(total_seq_bases)/1e9))

    if len(chunks) == 0:
        martian.exit("No input FASTQs were found for the requested parameters.")

    #
    # Downsampling setup
    #

    # The total available raw gigabases of input data (est_gb) and the base
    # pairs per read pair (bp_per_read_pair) are estimated above.
    (est_gb, bp_per_read_pair) = (float(total_seq_bases)/1e9, bp_per_read_pair)

    downsample = args.downsample if args.downsample is not None else {}

    # Possible BC subsampling -- try to get the requested amount of data _after_ bc subsampling
    est_gb_post_bc = est_gb * downsample.get("bc_subsample_rate", 1.0)

    # Aim high to ensure that we won't be left with too few reads
    fudge_factor = 1.00

    downsample_succeeded = True

    if downsample.has_key("gigabases"):
        read_sample_rate = min(1.0, fudge_factor * downsample['gigabases'] / est_gb_post_bc)
        requested_read_pairs = int(1e9 * downsample['gigabases'] / bp_per_read_pair)
        downsample_succeeded = downsample['gigabases'] > est_gb_post_bc
    elif downsample.has_key("target_reads"):
        requested_read_pairs = int(downsample['target_reads'] / 2)
        est_read_pair_post_bc = 1e9 * est_gb_post_bc / bp_per_read_pair
        read_sample_rate = min(1.0, fudge_factor * requested_read_pairs / est_read_pair_post_bc)
        downsample_succeeded = requested_read_pairs > est_read_pair_post_bc
    elif downsample.has_key("subsample_rate"):
        read_sample_rate = min(1.0, downsample["subsample_rate"] / downsample.get("bc_subsample_rate", 1.0))
        requested_read_pairs = None
    else:
        if len(downsample.keys()) > 0:
            martian.exit("Unrecognized downsample request: %s.\n Please use 'gigabases', 'target_reads', or 'subsample_rate'" % str(downsample))
        read_sample_rate = 1.0
        requested_read_pairs = None

    ## Alert if user requests analysis on too many reads
    ## Three CS scenarios:
    ## no downsampling
    ## "gigabases" downsampling
    ## "target_reads" downsampling
    READ_THRESHOLD = 5*1000*1000*1000
    est_reads_post_ds = (requested_read_pairs*2 if requested_read_pairs is not None
                         else total_input_reads)
    martian.log_info("Estimate %.3f M reads entering pipeline" % (est_reads_post_ds/1e6))
    if est_reads_post_ds > READ_THRESHOLD:
        martian.alarm("We will be processing data from %.1f billion reads "\
            "and the pipeline run time will likely exceed 24 hours. Please "\
            "consult the 10x support website for guidance on run times. You "\
            "can reduce the number of reads using the downsample/maxreads "\
            "command-line option." % (est_reads_post_ds/1e9))

    martian.log_info("Downsampling request: %s" % str(downsample))
    martian.log_info("Base pairs per read pair: %s" % bp_per_read_pair)
    martian.log_info("Estimated Input: %.2f GB, Initial Downsample Rate: %.3f. Requested total reads: %s" %
                     (est_gb, read_sample_rate, str(requested_read_pairs)))

    # Copy over the per-chunk subsample rates
    if read_sample_rate is not None:
        for chunk in chunks:
            chunk['subsample_rate'] = chunk.get('subsample_rate', 1.0) * read_sample_rate
            if downsample.has_key("bc_subsample_rate"):
                chunk["bc_subsample_rate"] = downsample["bc_subsample_rate"]

    outs.requested_read_pairs = requested_read_pairs

    martian.log_info("Input reads: %s" % str(chunks))
    outs.chunks = chunks
    outs.read_groups = [rg for rg in read_groups]

    downsample_info = {}
    downsample_info['available_gb'] = est_gb
    downsample_info['requested_gb'] = downsample.get('gigabases', None)
    downsample_info['requested_rate'] = read_sample_rate
    downsample_info['post_downsample_gb'] = (float(requested_read_pairs * bp_per_read_pair) / 1e9
                                             if requested_read_pairs is not None else None)
    downsample_info['downsample_succeeded'] = downsample_succeeded

    with open(outs.downsample_info, 'w') as downsample_out:
        tenkit.safe_json.dump_numpy(downsample_info, downsample_out)

    check_fastqs(outs.chunks)

    # Give out full path to BC whitelist
    if args.barcode_whitelist:
        outs.barcode_whitelist_path = bc_utils.barcode_whitelist_path(args.barcode_whitelist)
    else:
        outs.barcode_whitelist_path = None
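## Worked example of the "gigabases" downsampling arithmetic above (assumed
## numbers, illustrative only -- not pipeline code): with est_gb_post_bc = 120 GB
## available, bp_per_read_pair = 300, and downsample = {"gigabases": 30}:
##   read_sample_rate     = min(1.0, 1.00 * 30 / 120) = 0.25
##   requested_read_pairs = int(1e9 * 30 / 300)       = 100,000,000
## Each chunk's subsample_rate is then multiplied by 0.25 before alignment.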
def main(args, outs):
    hostname = socket.gethostname()
    tk_preflight.record_package_versions()

    ## no barcode whitelist
    if args.barcode_whitelist is None:
        martian.exit("No barcode whitelist specified.")

    ## there must be a barcode in each sample
    ## and it should be 16 bases long
    ## and it should be on read 1 or read 2
    for sd in args.sample_def:
        if sd.get("bc_length", 0) != 16 or sd.get("bc_in_read", 3) not in [1, 2]:
            martian.exit("Barcode must be 16 bases and on read1 or read2.")

    print "Checking FASTQ folder..."
    for sample_def in args.sample_def:
        read_path = sample_def["read_path"]
        if not read_path:
            martian.exit("Must specify a read_path containing FASTQs.")
        if not read_path.startswith('/'):
            martian.exit(
                "Specified FASTQ folder must be an absolute path: %s" % read_path)
        if not os.path.exists(read_path):
            martian.exit(
                "On machine: %s, specified FASTQ folder does not exist: %s" %
                (hostname, read_path))
        if not os.access(read_path, os.X_OK):
            martian.exit(
                "On machine: %s, supernova does not have permission to open FASTQ folder: %s"
                % (hostname, read_path))
        if not os.listdir(read_path):
            martian.exit("Specified FASTQ folder is empty: " + read_path)

        library_id = sample_def.get("library_id")
        if library_id is not None:
            if not re.match("^[\w-]+$", library_id):
                martian.exit(
                    "Library name may only contain letters, numbers, underscores, and dashes: "
                    + library_id)

        lanes = sample_def["lanes"]
        if lanes is not None:
            for lane in lanes:
                if not is_int(lane):
                    martian.exit(
                        "Lanes must be a comma-separated list of numbers.")

    # Open file handles limit
    ok, msg = tk_preflight.check_open_fh()
    if not ok:
        martian.exit(msg)

    ## compile a list of fastq files
    fastq_files = []
    if args.input_mode == "BCL_PROCESSOR":
        # Validate the sample_def fields are correct
        for (idx, sample_item) in enumerate(args.sample_def):
            # validate
            check_key(idx, sample_item, "sample_indices", [list, type(None)])
            check_key(idx, sample_item, "read_path", [str, unicode])
            check_key(idx, sample_item, "lanes", [list, type(None)])

        main_read_type = "RA"
        find_func = tk_fasta.find_input_fastq_files_10x_preprocess

        for read_chunk in args.sample_def:
            sample_index_strings, msg = tk_preflight.check_sample_indices(
                read_chunk)
            if sample_index_strings is None:
                martian.exit(msg)

            path = read_chunk['read_path']
            lanes = read_chunk['lanes']

            for sample_index in sample_index_strings:
                reads = find_func(path, main_read_type, sample_index, lanes)
                fastq_files.extend(reads)
    elif args.input_mode == "ILMN_BCL2FASTQ":
        # Validate the sample_def fields are correct
        for (idx, sample_item) in enumerate(args.sample_def):
            # validate
            check_key(idx, sample_item, "read_path", [str, unicode])
            check_key(idx, sample_item, "lanes", [list, type(None)])
            check_key(idx, sample_item, "sample_names", [list, type(None)])

        find_func = tk_fasta.find_input_fastq_files_bcl2fastq_demult

        for read_chunk in args.sample_def:
            sample_names = read_chunk['sample_names']
            path = read_chunk['read_path']
            lanes = read_chunk['lanes']

            for sample_name in sample_names:
                reads = find_func(path, "R1", sample_name, lanes)
                fastq_files.extend(reads)
                reads = find_func(path, "R3", sample_name, lanes)
                fastq_files.extend(reads)
    else:
        martian.throw("Unrecognized input_mode: %s" % args.input_mode)

    ## if we found nothing then break
    if len(fastq_files) == 0:
        martian.exit(
            "No input FASTQs were found with the requested lanes and sample indices."
        )

    ## make sure they are okay first
    check_fastqs(fastq_files)

    total_reads = 0.0
    global_avg = 0.0
    num_files = 0
    for fn in fastq_files:
        reads_fn, avg_read_len_fn = estimate_read_count_and_length(
            fn, num_reads=1000)
        total_reads += reads_fn
        global_avg += avg_read_len_fn
        num_files += 1
    global_avg = global_avg / num_files
    martian.log_info(
        "Estimated read length = %.1f, Estimated total read input = %.1f" %
        (global_avg, total_reads))

    exit_msg = "We observe many reads shorter than 125 bases. The ideal read length for Supernova is 150 bases. Reads shorter than the ideal length are likely to yield a lower quality assembly, and the algorithm has not been tested on short reads. Because reads are too short, execution will be terminated."
    warn_msg = "We observe many reads shorter than 150 bases. The ideal read length for Supernova is 150 bases. Reads shorter than the ideal length are likely to yield a lower quality assembly."
    if global_avg < 125:
        martian.exit(exit_msg)
    elif global_avg < 149:
        martian.alarm(warn_msg)
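## Worked example of the read-length gating above (assumed numbers, illustrative
## only -- not pipeline code): global_avg is the unweighted mean of the per-file
## average read lengths, e.g. three FASTQs averaging 140, 148 and 151 bases give
##   global_avg = (140 + 148 + 151) / 3 = 146.3
## which is >= 125 but < 149, so the stage warns but does not exit.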
def main(args, outs):
    # Analysis parameters
    lead_trim = args.trim_length
    analysis_params = {}
    analysis_params['lead_trim'] = lead_trim
    analysis_params['analysis_version'] = martian.get_pipelines_version()

    analysis_params_output_file = open(outs.analysis_params, 'w')
    analysis_params_output_file.write(json.dumps(analysis_params))
    analysis_params_output_file.close()

    # Summary metrics
    summary_metrics = {}

    # Add longranger version to summary so we get it everywhere the summary.json goes
    # We'll also pipe it to the customer csv
    summary_metrics['longranger_version'] = martian.get_pipelines_version()

    basic_metrics = json.load(open(args.basic_results, 'r'))
    for (k, v) in basic_metrics.items():
        summary_metrics[k] = v

    # Get the set of instrument IDs observed in the BAM file
    if args.bam_file is not None:
        instrument_ids = get_instrument_ids(args.bam_file)
        summary_metrics['instrument_ids'] = ";".join(instrument_ids)
    else:
        summary_metrics['instrument_ids'] = ''

    # Copy over single_partition results
    sp_metrics = json.load(open(args.single_partition_results, 'r'))
    for (k, v) in sp_metrics.items():
        summary_metrics[k] = v

    # Load the duplicate summary results
    # only include the overall dup rate in the customer metrics
    dup_metrics = json.load(open(args.duplicate_summary, 'r'))
    key = 'full_use_bcs'
    dup_counts = dup_metrics[key]
    if dup_counts is None:
        key = 'full_ignore_bcs'
        dup_counts = dup_metrics[key]

    mean_tag = "mean_dup_rate"
    sd_tag = "sd_dup_rate"
    optical_tag = "optical_dup_rate"
    dup_frac_tag = "dup_fraction"

    if dup_counts:
        dd = {int(k): v for (k, v) in dup_counts.items()}
        n_dups = sum([v * (k - 1) for (k, v) in dd.items() if k > 1])
        n_non_dups = sum(dd.values())
        mean_dup_rate = tk_stats.robust_divide(float(n_dups + n_non_dups), n_non_dups)
        summary_metrics[mean_tag] = mean_dup_rate

        # Customer facing dup rate on 0 - 1 scale
        summary_metrics[dup_frac_tag] = (mean_dup_rate - 1.0) / mean_dup_rate

        optical_dup_count = dup_metrics['optical_' + key]['count']
        summary_metrics[optical_tag] = tk_stats.robust_divide(
            float(optical_dup_count), n_non_dups)

        sd_terms = [(k - mean_dup_rate)**2.0 * v for (k, v) in dd.items()]
        sd_dup_rate = math.sqrt(
            tk_stats.robust_divide(sum(sd_terms), sum(dd.values())))
        summary_metrics[sd_tag] = sd_dup_rate
    else:
        summary_metrics[dup_frac_tag] = 0.0
        summary_metrics[mean_tag] = 1.0
        summary_metrics[sd_tag] = 0.0
        summary_metrics[optical_tag] = 0.0

    # Load the bias results
    bias_results = json.load(open(args.coverage_results, 'r'))

    summary_depth_info = bias_results['summary_depth_info']
    mean_depth, median_depth, zero_cov_fract = get_depth_info(
        summary_depth_info)
    on_target_bases = get_on_target_bases(summary_depth_info)
    depth_positional_cv = get_depth_positional_cv(summary_depth_info,
                                                  COVERAGE_TRIM_TAIL)

    summary_depth_info = bias_results['summary_depth_info_deduped']
    mean_depth_deduped, median_depth_deduped, garb = get_depth_info(
        summary_depth_info)
    depth_positional_cv_deduped = get_depth_positional_cv(
        summary_depth_info, COVERAGE_TRIM_TAIL)

    # low coverage tail for customers, based on deduped coverage profile
    summary_metrics['low_cov_' + str(CUSTOMER_LEFT_TAIL_COVERAGE)] = get_depth_tail_fract(
        summary_depth_info, CUSTOMER_LEFT_TAIL_COVERAGE, left_tail=True)

    if bias_results['target_info'] != {}:
        target_info = bias_results['target_info']
        summary_metrics['fraction_on_target'] = tk_stats.robust_divide(
            float(target_info['on_target_bases']), target_info['total_bases'])
    else:
        summary_metrics['fraction_on_target'] = None

    summary_metrics['detected_sex'] = bias_results.get('detected_sex')
    summary_metrics['mean_depth'] = mean_depth
    summary_metrics['male_chromosome_copies'] = bias_results.get(
        'male_chromosome_copies')
    summary_metrics['median_depth'] = median_depth
    summary_metrics['mean_depth_deduped'] = mean_depth_deduped
    summary_metrics['median_depth_deduped'] = median_depth_deduped
    summary_metrics['on_target_bases'] = on_target_bases
    summary_metrics['depth_positional_cv'] = depth_positional_cv
    summary_metrics['depth_positional_cv_deduped'] = depth_positional_cv_deduped
    summary_metrics['zero_cov_fract'] = zero_cov_fract

    # Compute fraction of reads in high-coverage spikes
    cov_data = bias_results['summary_depth_info_deduped']
    _, conf_median, _ = get_depth_info(cov_data)
    conf_median = max(conf_median, 1)
    cov_variance = conf_median + (conf_median * depth_positional_cv_deduped)**2
    cov_sigma = math.sqrt(cov_variance)
    high_cutoff = conf_median + 5.0 * cov_sigma
    cov_data = {int(k): v for (k, v) in cov_data.iteritems()}
    total = sum(float(k * v) for (k, v) in cov_data.iteritems())
    outlier = sum(
        float(k * v) for (k, v) in cov_data.iteritems() if k > high_cutoff)
    summary_metrics['high_coverage_pileup_fraction'] = tk_stats.robust_divide(
        outlier, total)

    # Add metrics from variant_results
    if not (args.variant_results is None):
        with open(args.variant_results) as variant_results_file:
            variant_results = json.load(variant_results_file)
            summary_metrics.update(variant_results)

    # Copy of coalescence results
    coa_metrics = json.load(open(args.filter_barcodes_results))
    for (k, v) in coa_metrics.items():
        summary_metrics[k] = v

    if not (args.sv_results is None):
        with open(args.sv_results) as sv_results_file:
            sv_results = json.load(sv_results_file)
            summary_metrics.update(sv_results)

    if not (args.short_del_results is None):
        with open(args.short_del_results) as short_del_results_file:
            short_del_results = json.load(short_del_results_file)
            new_res = {}
            for k, v in short_del_results.iteritems():
                new_res['short_del_' + k] = v
            summary_metrics.update(new_res)

    # Length mass results
    # Only copy scalar results
    if args.length_mass_results is not None:
        with open(args.length_mass_results) as length_mass_file:
            lm_results = json.load(length_mass_file)
            for (k, v) in lm_results.iteritems():
                if type(v) == str or type(v) == int or type(v) == float or v is None:
                    summary_metrics[k] = v

    # Reference genome information
    summary_metrics['reference_name'] = reference_name = tenkit.reference.get_genome(
        args.reference_path)
    ref_fasta = tenkit.reference.open_reference(args.reference_path)
    summary_metrics['reference_contigs'] = reference_contigs = len(ref_fasta)
    summary_metrics['reference_bases'] = reference_bases = sum(
        len(ref_fasta[contig]) for contig in ref_fasta)
    martian.log_info("Reference: %s, %d contigs, %d bases" %
                     (reference_name, reference_contigs, reference_bases))

    # Check for SV blacklist (only check if SV calling is enabled)
    summary_metrics['sv_blacklist_present'] = True
    if not (args.sv_results is None):
        if not tenkit.reference.is_tenx(args.reference_path):
            if not os.path.exists(
                    tenkit.reference.get_sv_blacklist(args.reference_path)):
                summary_metrics['sv_blacklist_present'] = False
                martian.alarm(
                    "WARNING: Pipeline run without a region blacklist for SV calling. SV calls may contain many false positives due to problematic regions in the reference."
                )

    # Gelbead lot information
    if not (args.lot_info is None):
        with open(args.lot_info) as lot_info_file:
            lot_info_results = json.load(lot_info_file)
            summary_metrics.update(lot_info_results)

    # Downsampling information
    if not (args.downsample_info is None):
        with open(args.downsample_info) as downsample_info_file:
            downsample_info_results = json.load(downsample_info_file)
            summary_metrics.update(downsample_info_results)

    # Summary metrics are now finalized -- evaluate alarms
    # Select alarm file -- right now we always use the same one
    alarm_rules = tenkit.alarms.load_rules(args.targets)
    alarms = tenkit.alarms.evaluate_alarms(alarm_rules, summary_metrics)

    # Write alarm file
    with open(outs.alarms, 'w') as alarms_output_file:
        alarms_output_file.write(tenkit.safe_json.safe_jsonify(alarms))

    # Log alarms to martian
    with open(outs.alarms_summary, 'w') as al_summary_file:
        def wl(s):
            al_summary_file.write(s + "\n")

        wl("10X Genomics - Pipeline Run Details")
        wl("-" * 40)
        wl("Sample ID: %s" % args.sample_id)
        wl("Genome: %s" % tenkit.reference.get_genome(args.reference_path))
        wl("Reference Path: %s" % args.reference_path)
        wl("Targets file: %s" % args.targets)

        if alarms is not None and len(alarms) > 0:
            wl("")
            wl("Sequencing Metric Alarms:")
            wl("-" * 40)
            for alarm in alarms:
                wl("%s [%s] -- %s" %
                   (alarm['level'], alarm['title'], alarm['message']))
        else:
            wl("")
            wl("No alarms raised.")

    summary_output_file = open(outs.summary, 'w')
    summary_output_file.write(
        tenkit.safe_json.safe_jsonify(summary_metrics, pretty=True))
    summary_output_file.close()

    # Generate CS summary metrics CSV
    sv_calls_metric = "num_calls"
    metrics_key_map = copy.deepcopy(CS_METRICS_KEY_MAP)
    metrics_key_map.append([sv_calls_metric, "large_sv_calls"])
    if args.targets is None:
        metrics_key_map.append(
            ["short_del_calledDEL_num_calls", "short_deletion_calls"])
    else:
        metrics_key_map.append(
            ["short_del_total_del_numPosCalls", "short_deletion_calls"])
    generate_summary_cs_csv(metrics_key_map, summary_metrics, outs.summary_cs)
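## Worked example of the high-coverage pileup cutoff computed above (assumed
## numbers, illustrative only -- not pipeline code): with a deduped median depth
## of 60 and depth_positional_cv_deduped = 0.3,
##   cov_variance = 60 + (60 * 0.3)**2 = 384
##   cov_sigma    = sqrt(384)          ~ 19.6
##   high_cutoff  = 60 + 5.0 * 19.6    ~ 158
## so bases at depths above ~158x count toward high_coverage_pileup_fraction.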
def main(args, outs):
    normalized_profiles = []
    raw_profiles, mask = coverage_matrix.load_matrix(
        args.raw_singlecell_profiles, args.reference_path)
    print('len(mask)=%d' % len(mask))
    print('len(raw_profiles)=%d' % len(raw_profiles))
    chromosomes = coverage_matrix.list_primary_contigs(
        args.raw_singlecell_profiles, args.reference_path)
    print('chromosomes:')
    print(chromosomes)
    n_chrom = len(chromosomes)
    #
    # Get mappability, GC content:
    bin_parameters = []
    vesna.load_track_parameters(args.tracks, bin_parameters)
    n_cells = raw_profiles[0].shape[0]
    linear = args.linear
    quadratic = args.quadratic
    gc0 = 0.45  # TODO: Replace this with mean of GC in good bins across entire genome
    #
    remove = []
    for chrom_index, chrom_name in enumerate(chromosomes):
        try:
            mappability = get_mappability(bin_parameters, chrom_name,
                                          ordered_chromosomes)
            gc_gc0 = get_gc(bin_parameters, gc0, chrom_name,
                            ordered_chromosomes)
            print('len(mappability)=%d' % len(mappability))
            print('len(gc_gc0)=%d' % len(gc_gc0))
            print('raw_profiles[chrom_index].shape:')
            print(raw_profiles[chrom_index].shape)
            expectation = mappability * (1.0 + linear * gc_gc0 +
                                         quadratic * gc_gc0 * gc_gc0)
            #print('expectation')
            #print(expectation.tolist())
            tmp = np.zeros(raw_profiles[chrom_index].shape, dtype='float')
            for cell in range(n_cells):
                #print('tmp[cell, :] before:')
                #print(tmp[cell, :].tolist())
                tmp[cell, :] = raw_profiles[chrom_index][cell, :] / expectation
                tmp[cell, tmp[cell, :] < 0.0] = 0.0
                #print('tmp[cell, :] after:')
                #print(tmp[cell, :].tolist())
            # for cell
            normalized_profiles.append(tmp)
        except Exception as error:
            martian.alarm(
                "stages/copy_number_processor/normalize_gc_bias/__init__ encountered an exception. Error: %s"
                % repr(error))
            print(
                "stages/copy_number_processor/normalize_gc_bias/__init__ encountered an exception. Error: %s"
                % repr(error))
            print(
                'Removing chrom_name=%s, chrom_index=%d (absent from input raw profiles)'
                % (chrom_name, chrom_index))
            remove.append(chrom_name)
        # try/except
    # for chrom
    for chrom_name in remove:
        if chrom_name in chromosomes:
            chromosomes.remove(chrom_name)
        # if chrom_name
    # for chrom_name
    #
    # Export normalized cell profiles
    bin_size = 20000  # TODO: Fetch this value from input raw_profiles h5 file
    tracks = pd.HDFStore(args.tracks, 'r')
    coverage_matrix.store_matrix(file_name=outs.normalized_singlecell_profiles,
                                 chroms=chromosomes,
                                 profiles=normalized_profiles,
                                 tracks=tracks,
                                 window_size=bin_size)
    tracks.close()
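## A small sketch of the per-bin GC correction applied above (hypothetical bin
## values, illustrative only -- not pipeline code; gc_gc0 is assumed here to be
## the bin GC fraction centered on gc0 = 0.45):
##   expectation = mappability * (1.0 + linear * gc_gc0 + quadratic * gc_gc0**2)
##   normalized  = raw_count / expectation          (clamped at 0)
## e.g. mappability = 0.9, gc_gc0 = 0.05, linear = 0.4, quadratic = -1.0:
##   expectation = 0.9 * (1.0 + 0.02 - 0.0025) ~ 0.916
##   a raw bin count of 100 becomes 100 / 0.916 ~ 109.2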