def join(args, outs, chunk_defs, chunk_outs):
    """Run the preflight validation for this stage.

    Checks, in order: the sample id, the factorization setting, the
    normalization method, the reference (directory layout first, then file
    formats), the aggregation CSV and, when ``args.check_executables`` is
    set, the open file handle limit.  The recorded package versions are
    logged last.

    ``outs``, ``chunk_defs`` and ``chunk_outs`` are accepted but not used.
    """
    # Sample ID / pipestance name, then factorization.
    check_sample_id(args.sample_id)
    check_factorization(args.factorization)

    # Normalization must be one of the supported methods.
    normalization_ok = args.normalization in ALLOWED_NORMALIZATIONS
    if not normalization_ok:
        allowed = ", ".join(ALLOWED_NORMALIZATIONS)
        martian.exit(
            "Unsupported normalization method provided. Options are {}.".format(allowed))

    # Reference: directory structure and timestamps first ...
    refdata_ok, refdata_msg = check_refdata(args.reference_path, max_contigs=None)
    if not refdata_ok:
        martian.exit(refdata_msg)
    else:
        martian.log_info(refdata_msg)
    # ... then usability and file formats.
    check_reference_format(args.reference_path)

    # The aggregation CSV is mandatory.
    if args.aggr_csv is None:
        martian.exit("aggregation csv must be provided")
    check_aggr_csv(args.aggr_csv, args.reference_path)

    # Open file handles limit.
    if args.check_executables:
        check_filehandle_limit()

    package_versions = tk_preflight.record_package_versions()
    martian.log_info(package_versions)
def run_assembly(fastq_pref, fasta_pref, args):
    """Assemble the ``vdj_asm asm`` command line from ``args`` and run it.

    The command is echoed to stderr before it is executed with
    ``subprocess.check_call`` in the current working directory, so a
    non-zero exit status raises ``subprocess.CalledProcessError``.
    """
    # NOTE: Martian stores the gem_group dict keys as strings
    gem_group_key = str(args.gem_group)

    valued_options = [
        ('--kmers=', args.min_kmer_count),
        ('--min-contig=', args.min_contig_len),
        ('--npaths=', args.npaths),
        ('--nx=', args.nx),
        ('--min-qual=', args.min_qual),
        ('--score-factor=', args.score_factor),
        ('--qual-factor=', args.qual_factor),
        ('--min-sw-score=', args.min_sw_score),
        ('--rt-error=', args.rt_error),
        ('--subsample-rate=', args.subsample_rate[gem_group_key]),
    ]
    cmd = ['vdj_asm', 'asm', fastq_pref, fasta_pref]
    cmd.extend(flag + str(value) for flag, value in valued_options)

    if not cr_chem.has_umis(args.chemistry_def):
        martian.log_info('Assembly without UMIs is not fully supported.')

    if not args.use_sw:
        cmd.append('--fast-align')

    if args.min_readpairs_per_umi is not None:
        # Per-gem-group cutoff on the number of reads per UMI.
        min_umi_reads = args.min_readpairs_per_umi[gem_group_key]
        cmd.append('--min-umi-reads=' + str(min_umi_reads))

    sys.stderr.write('Running ' + ' '.join(cmd) + '\n')
    subprocess.check_call(cmd, cwd=os.getcwd())
def get_downsample_info(downsample, total_seq_bases):
    """Return information about input versus requested GB when downsampling.

    Args:
        downsample: ``None`` for no downsampling, or a mapping that may carry
            a ``'gigabases'`` target or a ``'subsample_rate'`` fraction.
            ``'gigabases'`` takes precedence when both are set.
        total_seq_bases: number of sequenced bases available.

    Returns:
        dict with keys ``available_gb``, ``requested_gb``, ``requested_rate``,
        ``post_downsample_gb`` and ``downsample_succeeded``.
        ``downsample_succeeded`` is False only when more gigabases were
        requested than are available.
    """
    available_gb = total_seq_bases / 1e9
    requested_gb = None
    requested_rate = None
    downsample_succeeded = True
    # Default to "nothing removed". This also covers a mapping that carries
    # neither a gigabases nor a subsample_rate value, which previously left
    # post_downsample_gb unbound and raised UnboundLocalError at the return.
    post_downsample_gb = available_gb

    if downsample is not None:
        if downsample.get('gigabases') is not None:
            requested_gb = float(downsample['gigabases'])
            post_downsample_gb = min(available_gb, requested_gb)
            if available_gb < requested_gb:
                martian.log_info(
                    "Downsample requested more GB than was available; will not downsample."
                )
                downsample_succeeded = False
        elif downsample.get('subsample_rate') is not None:
            requested_rate = float(downsample['subsample_rate'])
            post_downsample_gb = available_gb * requested_rate

    return {
        'available_gb': available_gb,
        'requested_gb': requested_gb,
        'requested_rate': requested_rate,
        'post_downsample_gb': post_downsample_gb,
        'downsample_succeeded': downsample_succeeded,
    }
def run_model(dirname, targeted=False):
    """Run the length/mass model optimizer on the files in ``dirname``.

    The executable reads ``input.R`` and ``init.R`` from ``dirname`` and is
    told to write ``output.csv`` there.  ``targeted`` selects the
    ``len_mass_model_targeted`` executable with 600 iterations instead of
    ``len_mass_model`` with 900.

    Both stdout and stderr of the child process are captured and logged.

    Returns:
        True when the process exits with status 0, False otherwise.
    """
    if targeted:
        exe = "len_mass_model_targeted"
        iters = 600
    else:
        exe = 'len_mass_model'
        iters = 900

    cmd = [
        exe,
        'optimize',
        'iter=%d' % iters,
        'data',
        "file=" + os.path.join(dirname, "input.R"),
        'output',
        'file=' + os.path.join(dirname, "output.csv"),
        "refresh=10",
        "init=" + os.path.join(dirname, "init.R"),
        "random",
        "seed=4072180573",
    ]

    print(" ".join(cmd))

    # stderr must be piped as well; without it communicate() always returns
    # None for stderr and the second log call below records nothing useful.
    proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    stdoutdata, stderrdata = proc.communicate()
    martian.log_info(stdoutdata)
    martian.log_info(stderrdata)

    return proc.returncode == 0
def split(args):
    """Estimate the memory the join phase needs and request it.

    The estimate is driven by the number of profile bins (each contig is
    rounded up to a whole number of ``args.profile_bin_size`` bins) and the
    number of tree nodes derived from the line count of
    ``args.per_cell_summary_metrics``.

    Returns a stage definition with no chunks and a join request for the
    computed ``__mem_gb`` and 2 threads.
    """
    contig_lengths = get_contig_info(args)["contig_lengths"]

    # Every contig contributes its full bins plus one partial bin if needed.
    total_bins = 0
    for n_bases in contig_lengths.values():
        full_bins, leftover = divmod(n_bases, args.profile_bin_size)
        total_bins += full_bins + (1 if leftover > 0 else 0)

    with open(args.per_cell_summary_metrics) as summary_file:
        summary_line_count = sum(1 for _ in summary_file)
    total_nodes = 2 * (summary_line_count - 1) - 1
    martian.log_info("Bins: %d, Nodes: %d" % (total_bins, total_nodes))

    # Observed worst-case dlconverter memory scenario is when the pipeline is
    # copying two elements: the complete float64 read depth and the int8
    # ploidy.  In theory that is 18 bytes per (node, bin), the observed figure
    # is closer to 21x, and 24x was chosen to be safe (estimated hit for a
    # 1000-cell dataset: 7GB).
    # Update 5/29/18: test dataset JD-100_79 requested slightly more memory
    # than expected, so the factor was raised to 26x and a flat buffer was
    # added (the code below adds 6 GB).  This is ad hoc, but it is better to
    # err on the side of not crashing.
    bytes_per_gb = 1024.0 * 1024.0 * 1024.0
    mem_bytes = total_nodes * total_bins * 26
    mem_gb = int(math.ceil(mem_bytes / bytes_per_gb)) + 6
    martian.log_info("Bins: %d, Nodes: %d, Bytes: %d" % (total_bins, total_nodes, mem_bytes))

    join_request = {'__mem_gb': mem_gb, '__threads': 2}
    return {'chunks': [], 'join': join_request}
def join(args, outs, chunk_defs, chunk_outs):
    """Write the contig info JSON and run ``dlconverter`` to build the .dloupe file.

    The contig info is dumped to ``outs.contig_info_json`` and passed to
    ``dlconverter`` together with the input paths taken from ``args``.  Gene
    annotations are only passed when the path reported by
    ``tk_ref.get_loupe_genes`` exists.

    If ``dlconverter`` exits with a non-zero status,
    ``outs.output_for_dloupe`` is set to None and the failure is reported
    through ``martian.throw`` with the converter's output.
    """
    contig_info = get_contig_info(args)
    with open(outs.contig_info_json, 'w') as outfile:
        json.dump(contig_info, outfile)

    call = [
        "dlconverter",
        args.sample_id,
        "--output", outs.output_for_dloupe,
        "--description", args.sample_desc,
        "--node-profile-h5", args.normalized_node_profiles,
        "--contig-info-json", outs.contig_info_json,
        "--merged-bed", args.node_cnv_calls,
        "--tree-data", args.tree_data,
        "--tracks", args.tracks,
        "--per-cell-summary", args.per_cell_summary_metrics,
    ]

    gene_annotation_path = tk_ref.get_loupe_genes(args.reference_path)
    if os.path.exists(gene_annotation_path):
        call.extend(["--gene-annotations", gene_annotation_path])

    # the sample desc may be unicode, so send the whole
    # set of args str utf-8 to check_output
    unicode_call = [arg.encode('utf-8') for arg in call]

    martian.log_info("Running dlconverter: %s" % " ".join(call))
    try:
        results = tk_subproc.check_output(unicode_call)
        martian.log_info("dlconverter output: %s" % results)
    # "as" form: valid on Python 2.6+ and 3, unlike the old comma form.
    except subprocess.CalledProcessError as e:
        outs.output_for_dloupe = None
        martian.throw("Could not generate .dloupe file: \n%s" % e.output)
def main(args, outs):
    """Merge the per-stage metric JSON files and write the JSON and CSV summaries.

    Every non-None path in ``args.metrics`` is loaded and merged into one
    dict (later files win on key collisions).  NaN values are replaced with
    None, the pipeline version is added, and the result is written to
    ``outs.metrics``.  The CSV summary in ``outs.metrics_csv`` is compiled
    from the metric registry, first per library (when ``args.library_info``
    is given) and then per species of the reference.
    """
    metrics = {}
    for metrics_path in args.metrics:
        if metrics_path is None:
            continue
        with open(metrics_path, 'r') as f:
            metrics.update(json.load(f))

    # Normalize "NaN" values. Only existing keys are reassigned, so the dict
    # does not change size while it is being walked.
    for name in metrics:
        current = metrics[name]
        is_nan = str(current) == 'NaN' or (isinstance(current, float) and np.isnan(current))
        if is_nan:
            metrics[name] = None

    # add version info
    metrics['cellranger-atac_version'] = martian.get_pipelines_version()

    if metrics:
        martian.log_info('Writing out summary_metrics')
        with open(outs.metrics, 'w') as outfile:
            outfile.write(tenkit.safe_json.safe_jsonify(metrics, pretty=True))

    # compile summary.csv metrics
    registry = MetricAnnotations()
    csv_metrics = {}

    # load library info and fake libraries as species
    if args.library_info is not None:
        # NOTE: unpickling runs arbitrary code; library_info is assumed to be
        # produced by a trusted upstream stage.
        with open(args.library_info, 'r') as f:
            library_info = pickle.load(f)
        library_ids = []
        for library_key in library_info.keys():
            library_ids.append(library_info[library_key]['library_id'])
        per_library = registry.compile_summary_metrics(metrics, species_list=library_ids)
        csv_metrics.update(per_library)

    # load species level metrics
    reference = ReferenceManager(args.reference_path)
    per_species = registry.compile_summary_metrics(metrics, species_list=reference.list_species())
    csv_metrics.update(per_species)

    write_dict_to_csv(outs.metrics_csv, csv_metrics, sort=True)
def log_ps( pid = None ):
    """Log the top of a memory-sorted ``ps`` listing for post-mortem diagnosis.

    Runs ``ps`` sorted by descending RSS and logs the header plus the
    processes with the largest resident memory (``MAX_LINES`` lines in
    total), so that other memory-hungry processes on the machine can be
    spotted.  Failures of the ``ps`` command are recorded in the log text
    instead of being raised.

    Args:
        pid: optional process id of the Supernova process, mentioned in the
            log preamble when given.
    """
    MAX_LINES = 6
    ps_cmd = ["ps", "--sort=-rss", "-eo", "pid,pmem,rss,uid,cmd"]
    ps_log = textwrap.dedent("""
    ---- START of ps output ----
    Output of the 'ps' command is intended to diagnose whether there are
    other memory intensive processes running on the same machine.
    Process names are limited to the first five characters, and Supernova
    processes show up as "exe."
    Note that the Supernova process that failed here will *not* be in the list,
    as it has already exited.
    The RSS column describes the physical memory used by the process in kB.
    """)
    if pid is not None:
        ps_log += "Note that he process id for this Supernova process was " + str(pid) + "\n\n"
    ps_log += "Running command " + " ".join(ps_cmd) + "\n"

    # Bound up front so the except handler can mention it safely even when
    # check_output itself is what failed.
    line = None
    try:
        ps_out = subprocess.check_output( ps_cmd )
        # Output is sorted by descending RSS, so the header and the largest
        # processes are the FIRST lines, not the last ones.
        for line in ps_out.split("\n")[:MAX_LINES]:
            # cmd is the last column and may contain spaces; cap the split so
            # the whole command stays in the fifth field.
            fields = line.split(None, 4)
            if len(fields) == 5:
                ps_log += "\t%8s %5s %12s %s %5.5s\n" % tuple(fields)
            elif fields:
                ps_log += " ".join(fields) + "\n"
    except Exception as e:
        # Best effort only: diagnostics must never take the stage down.
        ps_log += "Running ps command failed: " + str(e)
        if line is not None:
            ps_log += " on line " + line
        ps_log += "\n"
    ps_log += "---- END of ps output ----\n"
    martian.log_info( ps_log )
def run_assembly(fastq_pref, fasta_pref, args):
    """Assemble the ``vdj_asm asm`` command line from ``args`` and run it.

    For paired-end chemistries the per-gem-group UMI read cutoff is doubled;
    otherwise the cutoff is used as is and ``--single-end`` is added.  The
    command is echoed to stderr and executed through
    ``tk_subproc.check_call`` in the current working directory.
    """
    valued_options = [
        ('--kmers=', args.min_kmer_count),
        ('--min-contig=', args.min_contig_len),
        ('--min-qual=', args.min_qual),
        ('--score-factor=', args.score_factor),
        ('--qual-factor=', args.qual_factor),
        ('--min-sw-score=', args.min_sw_score),
        ('--rt-error=', args.rt_error),
    ]
    cmd = ['vdj_asm', 'asm', fastq_pref, fasta_pref]
    cmd.extend(flag + str(value) for flag, value in valued_options)

    if not cr_chem.has_umis(args.chemistry_def):
        martian.log_info('Assembly without UMIs is not fully supported.')

    # The cutoff dict is keyed by the gem group as a string.
    cutoff = args.min_readpairs_per_umi[str(args.gem_group)]
    paired = cr_chem.is_paired_end(args.chemistry_def)
    min_umi_reads = 2 * cutoff if paired else cutoff
    cmd.append('--min-umi-reads=' + str(min_umi_reads))
    if not paired:
        cmd.append('--single-end')

    if args.use_unmapped:
        cmd.append('--use-unmapped')

    sys.stderr.write('Running ' + ' '.join(cmd) + '\n')
    tk_subproc.check_call(cmd, cwd=os.getcwd())
def prune(matrix, num_analysis_bcs=None, random_state=None):
    """Drop barcodes without counts, optionally after subsampling barcodes.

    The global NumPy RNG is seeded first (with ``random_state``, or 0 when it
    is None), even if ``matrix`` is None.  When ``num_analysis_bcs`` is
    truthy, at most that many barcodes are drawn without replacement and
    kept in index order.  Barcodes with no positive entry in ``matrix.m``
    are then removed.

    Returns:
        None when ``matrix`` is None, otherwise the matrix returned by
        ``select_barcodes``.
    """
    seed = random_state if random_state is not None else 0
    np.random.seed(seed)

    if matrix is None:
        return None

    if num_analysis_bcs:
        total_bcs = len(matrix.bcs)
        sample_size = min(num_analysis_bcs, total_bcs)
        drawn = np.random.choice(np.arange(total_bcs), size=sample_size, replace=False)
        matrix = matrix.select_barcodes(np.sort(drawn))

    bcs_before = matrix.bcs_dim

    # keep barcodes that have at least one peak left
    nonzero_per_bc = (matrix.m > 0).sum(axis=0)
    has_peak = np.squeeze(np.array(nonzero_per_bc != 0))
    surviving = np.where(has_peak)[0]
    matrix = matrix.select_barcodes(surviving)

    bcs_after = matrix.bcs_dim
    martian.log_info("filtered out {} barcodes".format(bcs_before - bcs_after))
    return matrix
def main(args, outs):
    """Run cutadapt on one chunk and record the trimming metrics.

    The read output paths get the LZ4 suffix appended, cutadapt's stdout is
    captured in a ``cutadapt_stdout`` file next to ``outs.chunked_reporter``,
    and ``outs.read2s`` is cleared when the chunk has no read 2 input.  On
    a zero exit status the trimming metrics are parsed from the captured
    output into a ``VdjReporter`` that is saved to
    ``outs.chunked_reporter``; otherwise only an error is logged.
    """
    paired_end = cr_chem.is_paired_end(args.chemistry_def)

    # Write compressed files
    outs.read1s += cr_constants.LZ4_SUFFIX
    outs.read2s += cr_constants.LZ4_SUFFIX

    cutadapt_out = os.path.join(os.path.dirname(outs.chunked_reporter), 'cutadapt_stdout')
    with open(cutadapt_out, 'w') as cut_stdout:
        status = run_cutadapt(args, outs.read1s, outs.read2s, args.chemistry_def, cut_stdout)

    # Identity test: comparing to None with "==" is unidiomatic and can be
    # fooled by objects that override __eq__.
    if args.read2s_chunk is None:
        outs.read2s = None

    if status != 0:
        martian.log_info('Error while running cutadapt')
    else:
        reporter = vdj_report.VdjReporter(primers=cr_utils.get_primers_from_dicts(args.primers))
        get_vdj_trim_metrics(reporter, cutadapt_out, paired_end)
        reporter.save(outs.chunked_reporter)
def join(args, outs, chunk_defs, chunk_outs):
    """Run ``crconverter`` to produce the .cloupe file, unless it must be skipped.

    When ``do_not_make_cloupe`` reports a reason to skip,
    ``outs.output_for_cloupe`` is set to None and nothing else happens.
    Otherwise the contig info is written to ``contig_info.json`` and
    ``crconverter`` is invoked with the inputs from ``args``; metrics,
    aggregation CSV, gem group index and gene annotation types are passed
    only when available.

    A non-zero exit status clears ``outs.output_for_cloupe`` and is reported
    through ``martian.throw`` with the converter's output.
    """
    if do_not_make_cloupe(args):
        outs.output_for_cloupe = None
        return

    reference = ReferenceManager(args.reference_path)

    contig_info_fn = martian.make_path("contig_info.json")
    with open(contig_info_fn, 'w') as contig_file:
        json.dump(get_contig_info(args.reference_path), contig_file)

    gem_group_index_json = get_gem_group_index_json(args, outs)

    quoted_description = '"' + args.sample_desc + '"'
    call = ["crconverter", args.sample_id, args.pipestance_type]
    call += ["--matrix", args.feature_barcode_matrix]
    call += ["--analysis", args.analysis]
    call += ["--output", outs.output_for_cloupe]
    call += ["--description", quoted_description]
    call += ["--peaks", args.peaks]
    call += ["--fragmentsindex", args.fragments_index]
    call += ["--geneannotations", reference.genes]
    call += ["--contiginfo", contig_info_fn]

    # Optional inputs are only passed when they are present.
    optional_inputs = (
        ("--metrics", args.metrics_json),
        ("--aggregation", args.aggregation_csv),
        ("--gemgroups", gem_group_index_json),
    )
    for flag, value in optional_inputs:
        if value is not None:
            call.extend([flag, value])

    transcript_gene_types = get_annotation_gene_types(args)
    if transcript_gene_types is not None:
        call.extend(["--geneannotationtypes", ",".join(transcript_gene_types)])

    # The sample desc may be unicode, so the arguments handed to
    # check_output are utf-8 encoded ...
    encoded_call = [piece.encode('utf-8') for piece in call]

    # ... while the un-encoded 'call' is logged, because log_info itself
    # attempts to encode the message (TODO: should log_info figure out the
    # encoding of the input string).
    martian.log_info("Running crconverter: %s" % " ".join(call))
    try:
        converter_output = tk_subproc.check_output(encoded_call)
        martian.log_info("crconverter output: %s" % converter_output)
    except subprocess.CalledProcessError as err:
        outs.output_for_cloupe = None
        martian.throw("Could not generate .cloupe file: \n%s" % err.output)
def split(args):
    """Plan the barcode-compatibility check across libraries.

    Counts the FASTQ chunks per gem group and library type.  If no gem group
    has more than one library type there is nothing to compare, and an empty
    chunk list is returned so the main phase is skipped.  Otherwise every
    input chunk is annotated with its share of the read budget
    (``args.num_reads_to_check_barcode`` or the module default), split
    evenly over the chunks of the same gem group and library type.
    """
    # {gem_group: {library_type: count_of_fastq_files}}
    fastq_counts = defaultdict(lambda: defaultdict(int))
    for chunk in args.chunks:
        fastq_counts[chunk["gem_group"]][chunk["library_type"]] += 1

    multiple_libraries = any(
        len(per_library) > 1 for per_library in fastq_counts.values())
    if not multiple_libraries:
        martian.log_info(
            'Single library in input. No need to check barcode compatibility.')
        # `[]` for the chunks will skip the main
        return {'chunks': [], 'join': {}}

    if args.num_reads_to_check_barcode is None:
        read_budget = cr_constants.NUM_READS_TO_CHECK_BARCODE
    else:
        read_budget = args.num_reads_to_check_barcode

    chunks = []
    for chunk in args.chunks:
        # The input chunk dict itself is annotated and reused as the chunk def.
        siblings = fastq_counts[chunk["gem_group"]][chunk["library_type"]]
        chunk['num_reads_per_chunk_to_check_barcode'] = int(
            tk_stats.robust_divide(read_budget, siblings))
        chunks.append(chunk)

    return {'chunks': chunks, 'join': {'__mem_gb': 4}}
def process_fastq_chunk_no_demult(seq_iters, filenames, file_cache, _interleave_map, summary_counts, max_reads = -1):
    """Write reads to their output streams without demultiplexing.

    The iterators in ``seq_iters`` are advanced in lockstep.  For every read
    set, read ``i`` is written to the stream of
    ``filenames[interleave_map[i]]`` (the identity mapping is used when
    ``_interleave_map`` is None), and the counter for
    ``DEMULTIPLEX_INVALID_SAMPLE_INDEX`` in ``summary_counts`` is
    incremented.  Progress is logged every 100,000 read sets, and
    processing stops after ``max_reads`` read sets when that is positive.
    """
    interleave_map = _interleave_map
    if interleave_map is None:
        interleave_map = range(len(seq_iters))

    processed = 0
    for read_set in itertools.izip(*seq_iters):
        # Without demultiplexing every read set is counted under the
        # invalid-sample-index bucket.
        summary_counts[DEMULTIPLEX_INVALID_SAMPLE_INDEX] += 1

        # Streams are looked up again for every read set.
        streams = [file_cache.get(name) for name in filenames]
        for position, read in enumerate(read_set):
            destination = streams[interleave_map[position]]
            read.write(destination)

        processed += 1
        if processed % 10**5 == 0:
            martian.log_info("Reads processed %i" % processed)

        if 0 < max_reads <= processed:
            break
def _read_stitched_coverage(profiles_h5, reference_path, window_size, mask_data=None):
    """Load profiles, mask and rebin them, and return the stitched coverage.

    Args:
        profiles_h5: path of the profiles HDF5 file.
        reference_path: reference passed through to ``ProfilesData2``.
        window_size: requested window size; the profiles are aggregated by
            ``window_size / profiles.get_window_size()``.
        mask_data: optional object passed as ``reuse_mask_from``.

    Returns:
        ``(coverage, profiles)`` where ``coverage`` is the third element
        returned by ``get_stitched_coverage`` with sex chromosomes excluded.
    """
    # Load profiles into memory.
    profiles = crdna_profiles.ProfilesData2(
        profiles_h5,
        reference_path,
        load_conf_filter=False,
        reuse_mask_from=mask_data)

    # Apply the mappability mask.
    total_bins, unmasked_bins = profiles.apply_mask(
        use_default_mask=True, use_conf_filter=False)
    martian.log_info("%s: (%d/%d) unmasked bins" % (profiles_h5, unmasked_bins, total_bins))

    # Rebin data to the requested window size.
    # NOTE(review): with integer operands under Python 2 this is a floor
    # division - confirm that is intended.
    rebin_factor = window_size / profiles.get_window_size()
    profiles.aggregate(rebin_factor)

    # Only autosomes are included in the stitched coverage.
    _unused_a, _unused_b, coverage = profiles.get_stitched_coverage(
        allow_sex_chromosomes=False)

    return coverage, profiles
def pick_common_indexes(self, fastqs):
    """Split the observed sample indices into common ("good") and noise ones.

    Indices are ranked by observation count.  The median count of the top
    indices that together exceed 90% of all observations sets the scale: an
    index is good when its count is above ``max(median / 250, 20)``.  When
    more than ``MAX_INDICES`` indices were seen, the threshold is raised to
    at least the count of the index at rank ``MAX_INDICES``.

    Returns:
        ``(good_bcs, noise_bcs)``, both in descending count order.
    """
    index_counts = self.get_index_counts(fastqs)
    ranked = sorted(index_counts.items(), key=lambda item: item[1], reverse=True)
    total_counts = sum(count for (_, count) in ranked)

    # Position of the index at which the running total first exceeds 90% of
    # all observations (the last position if it never does, 0 when empty).
    c90 = 0
    running_total = 0
    for position, (_, count) in enumerate(ranked):
        c90 = position
        running_total += count
        if running_total > 0.90 * total_counts:
            break

    # Median number of observations among the indices up to that position.
    top_counts = [count for (_, count) in ranked[:(c90 + 1)]]
    num_obs_good_bcs = numpy.median(top_counts)
    martian.log_info("Median counts of good barcodes in 2e6 reads: %s" % num_obs_good_bcs)

    min_obs_bc = max(num_obs_good_bcs / 250, 20)

    # Only demultiplex a reasonable number of sample indices.
    if len(ranked) > MAX_INDICES:
        min_obs_bc = max(min_obs_bc, ranked[MAX_INDICES][1])

    good_bcs = []
    noise_bcs = []
    for index, count in ranked:
        if count > min_obs_bc:
            good_bcs.append(index)
        if count <= min_obs_bc:
            noise_bcs.append(index)
    return (good_bcs, noise_bcs)
def main(args, outs):
    """Run ``crconverter`` to produce the .cloupe file, unless it must be skipped.

    When ``do_not_make_cloupe`` reports a reason to skip,
    ``outs.output_for_cloupe`` is set to None and nothing else happens.
    Metrics, aggregation CSV and gem group index are passed to the converter
    only when they are truthy.

    A non-zero exit status clears ``outs.output_for_cloupe`` and is reported
    through ``martian.throw`` with the converter's output.
    """
    if do_not_make_cloupe(args):
        outs.output_for_cloupe = None
        return

    gem_group_index_json = get_gem_group_index_json(args, outs)

    call = [
        "crconverter",
        args.sample_id,
        args.pipestance_type,
        "--matrix", args.filtered_gene_bc_matrices_h5,
        "--analysis", get_analysis_h5_path(args),
        "--output", outs.output_for_cloupe,
        "--description", args.sample_desc,
    ]

    if args.metrics_json:
        call.extend(["--metrics", args.metrics_json])
    if args.aggregation_csv:
        call.extend(["--aggregation", args.aggregation_csv])
    if gem_group_index_json:
        call.extend(["--gemgroups", gem_group_index_json])

    martian.log_info("Running crconverter: %s" % " ".join(call))
    try:
        results = subprocess.check_output(call)
        martian.log_info("crconverter output: %s" % results)
    # "as" form: valid on Python 2.6+ and 3, unlike the old comma form.
    except subprocess.CalledProcessError as e:
        outs.output_for_cloupe = None
        martian.throw("Could not generate .cloupe file: \n%s" % e.output)
def main(args, outs):
    """Generate a .cloupe file from an existing count or aggr pipestance.

    Validates the pipestance type and path, requires an ``analysis.h5`` file
    in one of the known locations, rejects pipestances that contain the
    pre-1.2 ``CELLRANGER_PD`` / ``CELLRANGER_CS`` directories, and then runs
    ``crconverter`` on the pipestance.

    A non-zero converter exit status clears ``outs.output_for_cloupe`` and is
    reported through ``martian.throw`` with the converter's output.
    """
    if args.pipestance_type != "count" and args.pipestance_type != "aggr":
        martian.exit("The type argument must be one of: count, aggr")

    if args.pipestance_type == "count":
        pname = "SC_RNA_COUNTER_CS"
    if args.pipestance_type == "aggr":
        pname = "SC_RNA_AGGREGATOR_CS"

    pipestance_exists = os.path.exists(args.pipestance_path)
    if not pipestance_exists:
        martian.exit("Invalid pipestance path: %s" % args.pipestance_path)

    # check to see if an analysis file exists.  If it doesn't, then
    # this is likely a barnyard sample, and we cannot generate a
    # .loupe file (CELLRANGER-773);
    analysis_h5_path = os.path.join(args.pipestance_path, "outs/analysis/analysis.h5")

    # 1.2.0 location only
    internal_count_h5_path = os.path.join(
        args.pipestance_path,
        "SC_RNA_COUNTER_CS/SC_RNA_COUNTER/SC_RNA_ANALYZER/SUMMARIZE_ANALYSIS/fork0/files/analysis/analysis.h5"
    )
    internal_aggr_h5_path = os.path.join(
        args.pipestance_path,
        "SC_RNA_AGGREGATOR_CS/SC_RNA_AGGREGATOR/SC_RNA_ANALYZER/SUMMARIZE_ANALYSIS/fork0/files/analysis/analysis.h5"
    )

    if not os.path.exists(analysis_h5_path) \
            and not os.path.exists(internal_count_h5_path) \
            and not os.path.exists(internal_aggr_h5_path):
        martian.exit(
            "Could not find single-species analysis HDF5 file. " +
            "Loupe Cell Browser files are not generated for multi-species experiments."
        )

    # has to be 1.2 or higher
    cellranger_pd_before_1_2_path = os.path.join(args.pipestance_path, "CELLRANGER_PD")
    cellranger_cs_before_1_2_path = os.path.join(args.pipestance_path, "CELLRANGER_CS")
    if os.path.exists(cellranger_pd_before_1_2_path) or os.path.exists(
            cellranger_cs_before_1_2_path):
        martian.exit(
            "mkloupe is only supported for Cell Ranger 1.2 and later.")

    call = [
        "crconverter",
        args.sample_id,
        pname,
        "--pipestance", args.pipestance_path,
        "--output", outs.output_for_cloupe,
    ]

    martian.log_info("Running crconverter: %s" % " ".join(call))
    try:
        results = subprocess.check_output(call)
        martian.log_info("crconverter output: %s" % results)
    # "as" form: valid on Python 2.6+ and 3, unlike the old comma form.
    except subprocess.CalledProcessError as e:
        outs.output_for_cloupe = None
        martian.throw("Could not generate .cloupe file: \n%s" % e.output)
def split(args):
    """Chunk the single-transcript genes for per-chunk processing.

    Transcripts on ``args.valid_chroms`` are grouped by ``name2``.  Genes
    with exactly one transcript whose length lies within
    ``[args.lower_size_cutoff, args.upper_size_cutoff]`` are kept, and their
    gene-pred strings are handed to ``grouper`` in groups of 20 to form the
    chunks.

    Exits through ``martian.exit`` when ``args.kit_type`` is neither ``5'``
    nor ``3'``.
    """
    # validate
    if args.kit_type not in ["5'", "3'"]:
        martian.exit("Kit type is not one of 5' or 3'.")

    # group by gene
    transcripts = get_gene_pred_dict(args.transcripts)
    allowed_chroms = set(args.valid_chroms)
    by_gene = defaultdict(list)
    for transcript in transcripts.itervalues():
        if transcript.chromosome in allowed_chroms:
            by_gene[transcript.name2].append(transcript)

    singletons = []
    for gene_transcripts in by_gene.itervalues():
        if len(gene_transcripts) != 1:
            continue
        only = gene_transcripts[0]
        if args.lower_size_cutoff <= len(only) <= args.upper_size_cutoff:
            singletons.append(only)

    avg = np.mean([len(tx) for tx in singletons])
    med = np.median([len(tx) for tx in singletons])
    martian.log_info(
        '{} singleton genes under consideration (avg size = {} median size = {}'
        .format(len(singletons), avg, med))

    # Lazily render each singleton as its gene-pred string.
    gene_pred_strings = (tx.get_gene_pred() for tx in singletons)

    chunks = []
    for group in grouper(gene_pred_strings, 20):
        chunks.append({'tx_subset': list(group), '__mem_gb': 4})
    return {'chunks': chunks, 'join': {'__mem_gb': 32}}
def main(args, outs):
    """Run ``crconverter`` to produce the .cloupe file, unless it must be skipped.

    When ``do_not_make_cloupe`` reports a reason to skip,
    ``outs.output_for_cloupe`` is set to None and nothing else happens.
    Metrics, aggregation CSV and gem group index are passed to the converter
    only when they are truthy.  The arguments are utf-8 encoded before the
    converter is invoked.

    A non-zero exit status clears ``outs.output_for_cloupe`` and is reported
    through ``martian.throw`` with the converter's output.
    """
    if do_not_make_cloupe(args):
        outs.output_for_cloupe = None
        return

    gem_group_index_json = get_gem_group_index_json(args, outs)

    call = [
        "crconverter",
        args.sample_id,
        args.pipestance_type,
        "--matrix", args.filtered_gene_bc_matrices_h5,
        "--analysis", get_analysis_h5_path(args),
        "--output", outs.output_for_cloupe,
        "--description", args.sample_desc,
    ]

    if args.metrics_json:
        call.extend(["--metrics", args.metrics_json])
    if args.aggregation_csv:
        call.extend(["--aggregation", args.aggregation_csv])
    if gem_group_index_json:
        call.extend(["--gemgroups", gem_group_index_json])

    # the sample desc may be unicode, so send the whole
    # set of args str utf-8 to check_output
    unicode_call = [arg.encode('utf-8') for arg in call]

    # but keep the arg 'call' here because log_info inherently
    # attempts to encode the message... (TODO: should log_info
    # figure out the encoding of the input string)
    martian.log_info("Running crconverter: %s" % " ".join(call))
    try:
        results = tk_subproc.check_output(unicode_call)
        martian.log_info("crconverter output: %s" % results)
    # "as" form: valid on Python 2.6+ and 3, unlike the old comma form.
    except subprocess.CalledProcessError as e:
        outs.output_for_cloupe = None
        martian.throw("Could not generate .cloupe file: \n%s" % e.output)
def join(args, outs, chunk_defs, chunk_outs):
    """Combine the per-chunk peak annotation CSVs into ``outs.peak_annotation``.

    When there are no chunks, or the first chunk definition is flagged
    ``skip``, the output is set to None instead.  Chunks without a peak
    annotation are left out of the combination.
    """
    skipped = (not chunk_defs) or chunk_defs[0].skip
    if skipped:
        martian.log_info('Skipping peak annotation')
        outs.peak_annotation = None
        return

    annotation_files = []
    for chunk_out in chunk_outs:
        if chunk_out.peak_annotation is not None:
            annotation_files.append(chunk_out.peak_annotation)
    combine_csv(annotation_files, outs.peak_annotation)
def main(args, outs):
    """Validate the pipeline inputs before any real work starts.

    Checks that a read group is given for BAM output, that the sample and
    library names only use letters, numbers, underscores and dashes, that
    every FASTQ folder is an absolute, existing, accessible and non-empty
    path, that lanes are integers, and that the sample indices and the open
    file handle limit are acceptable.  Every failed check is reported
    through ``martian.exit``.  The package versions are logged at the end.
    """
    name_pattern = r"^[\w-]+$"
    hostname = socket.gethostname()

    if args.output_format == 'bam' and args.read_group is None:
        martian.exit(
            "Please specify a read_group to populate the @RG field of the BAM file"
        )

    if args.sample_id is not None and not re.match(name_pattern, args.sample_id):
        martian.exit(
            "Sample name may only contain letters, numbers, underscores, and dashes: "
            + args.sample_id)

    for sample_def in args.sample_def:
        fastq_dir = sample_def["read_path"]
        if not fastq_dir.startswith('/'):
            martian.exit(
                "Specified FASTQ folder must be an absolute path: %s" % fastq_dir)
        if not os.path.exists(fastq_dir):
            martian.exit(
                "On machine: %s, specified FASTQ folder does not exist: %s" %
                (hostname, fastq_dir))
        if not os.access(fastq_dir, os.X_OK):
            martian.exit(
                "On machine: %s, longranger does not have permission to open FASTQ folder: %s"
                % (hostname, fastq_dir))
        if not os.listdir(fastq_dir):
            martian.exit("Specified FASTQ folder is empty: " + fastq_dir)

        library_id = sample_def.get("library_id")
        if library_id is not None and not re.match(name_pattern, library_id):
            martian.exit(
                "Library name may only contain letters, numbers, underscores, and dashes: "
                + library_id)

        lanes = sample_def["lanes"]
        if lanes is not None:
            for lane in lanes:
                if not tk_preflight.is_int(lane):
                    martian.exit(
                        "Lanes must be a comma-separated list of numbers.")

        indices_ok, indices_msg = tk_preflight.check_sample_indices(sample_def)
        if not indices_ok:
            martian.exit(indices_msg)

    # Check open file handles limit
    fh_ok, fh_msg = tk_preflight.check_open_fh()
    if not fh_ok:
        martian.exit(fh_msg)

    martian.log_info(tk_preflight.record_package_versions())
def main(args, outs):
    """For each slice produce a fasta file sampling reads from that slice.

    We split our section of the genome into a bunch of windows of
    ``args.window_size`` bases.  For each window we sample an identical
    number of paired end reads.  The name of each read encodes the true
    position that it was sampled from.

    Reads ``args.basic_stats`` and ``args.insert_sizes`` (both JSON), writes
    the sampled reads to ``chnk.fasta`` and sets ``outs.tmp`` to that path
    and ``outs.samples_per_bin`` to the number of samples per window.
    """
    # Grab basic stats for the read lengths and quality scores
    with open(args.basic_stats) as stats_fp:
        stats = json.load(stats_fp)

    # Fix the random seed
    np.random.seed(0)

    # Info is a map we use everywhere to track the sampling parameters.
    # r1_len: the length of read1
    # r2_len: the length of read2
    # insert_size_map: a map of insert-size (as a string) to frequency
    # q_score_map: a map of quality score (as a string) to frequency
    info = {'r1_len': stats['r1_len'], 'r2_len': stats['r2_len']}
    info['q_score_map'] = {
        '30': stats['bc_q30_bases'],
        '20': stats['bc_q20_bases'] - stats['bc_q30_bases'],
        '0': stats['bc_tot_bases'] - stats['bc_q20_bases'],
    }

    with open(args.insert_sizes) as stats_is_fp:
        info['insert_size_map'] = json.load(stats_is_fp)['60']

    # How many samples will we make from each window?
    samples = int(
        round(2.0 * args.target_coverage *
              (float(args.window_size) / (stats['r1_len'] + stats['r2_len']))))
    martian.log_info("Using %i samples per %i bin" % (samples, args.window_size))

    output_path = martian.make_path("chnk.fasta")

    # The context manager closes the output even if sampling fails midway;
    # the input files above are likewise no longer left open.
    with open(output_path, "w") as output:
        ref = reference.open_reference(args.reference_path)

        # Loop over every window in every locus.
        for (chrom, start, end) in (tk_io.get_locus_info(l) for l in args.loci):
            cur = start
            while cur < end:
                # Sample |samples| reads from chrom:cur-chrom:cur+window_size
                # and put the results in the output file
                perbin(chrom, cur, ref, output, info, args.window_size, samples)
                cur += args.window_size

    outs.tmp = output_path
    outs.samples_per_bin = samples
def log_ps( ):
    """Log the top of a memory-sorted ``ps`` listing.

    Runs ``ps`` sorted by descending RSS and logs its first ``MAX_LINES``
    lines (header included) as pid, %mem, rss, command name (truncated to
    five characters) and uid.  This is best-effort diagnostics: a failing
    ``ps`` is noted in the log text rather than raised.
    """
    MAX_LINES = 6
    ps_cmd = ["ps", "--sort=-rss", "-eo", "pid,pmem,rss,comm,uid"]
    ps_log = "Running command " + " ".join(ps_cmd) + "\n"
    try:
        ps_out = subprocess.check_output( ps_cmd )
    except Exception as e:
        # Not a bare except: KeyboardInterrupt/SystemExit must still propagate.
        ps_log += "Running ps command failed: %s\n" % e
    else:
        for line in ps_out.split("\n")[:MAX_LINES]:
            fields = line.split()
            if len(fields) < 5:
                # Blank or truncated line; nothing sensible to print.
                continue
            # comm sits between rss and uid and may itself contain spaces,
            # so anchor the fixed columns from both ends of the line.
            pid, pmem, rss = fields[:3]
            uid = fields[-1]
            comm = " ".join(fields[3:-1])
            ps_log += "%8s %5s %12s %5.5s %s\n" % (pid, pmem, rss, comm, uid)
    martian.log_info( ps_log )
def join(args, outs, chunk_defs, chunk_outs):
    """Merge the chunk summary managers and write the final summary JSON.

    The ``misc_sm`` outputs of all chunks are combined, further summary
    metrics are computed on the combined manager, and the dict of its
    ``'metrics'`` summarizer is written to ``outs.summary`` as pretty JSON.
    """
    martian.log_info("Combining miscellaneous summary managers")
    chunk_managers = []
    for chunk_out in chunk_outs:
        chunk_managers.append(chunk_out.misc_sm)
    misc_sm = combine_summary_managers(chunk_managers)

    martian.log_info("Computing summary metrics")
    compute_summary_metrics(misc_sm)

    with open(outs.summary, 'w') as summary_file:
        metrics_dict = misc_sm.get_summarizer('metrics').dict
        serialized = tenkit.safe_json.safe_jsonify(metrics_dict, pretty=True)
        summary_file.write(serialized)
def gen_metric_helptext(self, keys):
    """Collect help text entries for the registered metrics among ``keys``.

    Returns a list of ``[full_name, [help_description]]`` entries, in the
    order of ``keys``, for every key found in ``self.metric_data`` whose
    help description is not None.  Keys that are not registered are logged
    and skipped.
    """
    helptext = []
    for key in keys:
        if key not in self.metric_data:
            martian.log_info(
                '{} not found in registered metrics'.format(key))
            continue

        info = self.metric_data[key]
        if info.help_description is None:
            continue
        helptext.append([info.full_name, [info.help_description]])
    return helptext
def load_fragments_filtered(self, fn, bcs_to_use):
    """Load fragment data for the coalescence calculation.

    Reads the ``bc``, ``num_reads``, ``chrom`` and ``start_pos`` columns from
    ``fn`` and keeps only the fragments whose barcode is in ``bcs_to_use``.
    """
    martian.log_info("loading fragment data")

    # Only the barcode is filtered on; there is no num_reads condition.
    return kt_hdf.read_data_frame_filtered(
        fn,
        lambda chunk: chunk.bc.isin(bcs_to_use),
        query_cols=['bc', 'num_reads', 'chrom', 'start_pos'])
def do_not_make_cloupe(args):
    """Return True if this stage should not attempt to generate a .cloupe file.

    The reasons, checked in order, are: secondary analysis was disabled, the
    analysis folder is missing, the filtered gene-barcode matrix is missing,
    or the matrix holds more than one genome.  The reason is logged.
    """
    if args.no_secondary_analysis:
        martian.log_info(
            "Skipping .cloupe generation by instruction (--no-secondary-analysis)"
        )
        return True
    if args.analysis is None:
        martian.log_info(
            "Skipping .cloupe generation due to missing analysis folder")
        return True
    # A None path is a missing matrix too; without this guard
    # os.path.exists(None) raises TypeError on Python 2.
    if args.filtered_gene_bc_matrices_h5 is None or not os.path.exists(
            args.filtered_gene_bc_matrices_h5):
        martian.log_info(
            "Skipping .cloupe generation due to missing or zero-length gene-barcode matrix"
        )
        return True
    genomes = cr_matrix.GeneBCMatrices.load_genomes_from_h5(
        args.filtered_gene_bc_matrices_h5)
    if len(genomes) > 1:
        martian.log_info(
            "Skipping .cloupe generation due to multiple species in the gene-barcode matrix"
        )
        return True
    return False
def do_not_make_cloupe(args):
    """Tell whether .cloupe generation has to be skipped for this run.

    Returns True, after logging the reason, when secondary analysis was
    disabled, the analysis file or the feature-barcode matrix is missing,
    or the reference lists more than one species.  Returns False otherwise.
    """
    # The checks are ordered; later ones are only evaluated when all the
    # earlier ones passed.
    skip_reason = None
    if args.no_secondary_analysis:
        skip_reason = "Skipping .cloupe generation by instruction (--no-secondary-analysis)"
    elif args.analysis is None or not os.path.exists(args.analysis):
        skip_reason = "Skipping .cloupe generation due to missing analysis hdf5 file"
    elif args.feature_barcode_matrix is None or not os.path.exists(
            args.feature_barcode_matrix):
        skip_reason = "Skipping .cloupe generation due to missing or zero-length feature-barcode matrix"
    elif len(ReferenceManager(args.reference_path).list_species()) > 1:
        skip_reason = "Skipping .cloupe generation as the sample is composed of multiple genomes"

    if skip_reason is None:
        return False
    martian.log_info(skip_reason)
    return True
def estimate_mean_coverage(targets_file, bam_in, read_filter=lambda x: True):
    """Estimate the mean depth of ``bam_in`` from a random sample of regions.

    With a targets file, up to ``EXONS_SAMPLE_COVERAGE`` non-empty target
    regions are sampled without replacement.  Without one, fixed-size
    windows are placed at random over the genome, with chromosomes chosen
    in proportion to their length.  The RNG is seeded with 0 in both cases,
    so the sample is reproducible.

    Args:
        targets_file: path of the target regions file, or None for the
            whole-genome window mode.
        bam_in: open BAM handle; its ``lengths`` and ``references`` are used.
        read_filter: predicate passed to ``mean_coverage_region``.

    Returns:
        The mean of the per-region mean coverages as a float, or 1.0 when
        the targets file has no non-empty region.
    """
    if targets_file is not None:
        # Close the targets file once it has been parsed instead of leaking
        # the handle.
        with open(targets_file) as targets_fh:
            target_regions_dict = tk_io.get_target_regions_dict(targets_fh)

        # Pick a random sample of target regions to estimate overall depth on
        targets = [(chrom, start, end)
                   for (chrom, regions) in target_regions_dict.items()
                   for (start, end) in regions if end - start > 0]
        if len(targets) == 0:
            martian.log_info("No non-empty target regions")
            return 1.0

        np.random.seed(0)
        regions_to_sample = min(EXONS_SAMPLE_COVERAGE, len(targets))
        region_indices = np.random.choice(len(targets), regions_to_sample, replace=False)
        sample_targets = [targets[idx] for idx in region_indices]

    else:
        # Pick a series of random intervals on the genome to measure coverage
        np.random.seed(0)

        if sum(bam_in.lengths) < 1e6:
            num_windows = WGS_WINDOWS_SMALL_GENOME
        else:
            num_windows = WGS_WINDOWS_SAMPLE_COVERAGE

        # Builtin float instead of the deprecated np.float alias (same dtype).
        chrom_probs = np.array(bam_in.lengths, dtype=float) / sum(
            bam_in.lengths)
        rand_chroms = np.random.choice(len(bam_in.lengths),
                                       num_windows,
                                       replace=True,
                                       p=chrom_probs)

        # Keep the window start inside the chromosome; short chromosomes
        # always start at 0.
        starts = [
            np.random.randint(max(bam_in.lengths[chrom] - WGS_WINDOW_SIZE, 1))
            for chrom in rand_chroms
        ]
        sample_targets = [(bam_in.references[chrom], start,
                           min(start + WGS_WINDOW_SIZE, bam_in.lengths[chrom]))
                          for (chrom, start) in zip(rand_chroms, starts)]

    mean_depth = float(
        np.mean([
            mean_coverage_region(bam_in, region, read_filter)
            for region in sample_targets
        ]))
    return mean_depth