def join(args, outs, chunk_defs, chunk_outs):
    ref = contig_manager.contig_manager(args.reference_path)
    chroms = ref.primary_contigs(allow_sex_chromosomes=True)
    profiles, gc, mask = load_data(args.raw_profiles, args.tracks, chroms)

    gc_norm_params = json.load(open(args.gc_norm_params, "r"))
    scale = gc_norm_params["scale"]
    linear = gc_norm_params["linear"]
    quadratic = gc_norm_params["quadratic"]

    norm_profiles = gc_normalize(profiles, gc, linear, quadratic, chroms)

    bin_size = coverage_matrix.get_bin_size(args.raw_profiles)
    coverage_matrix.store_matrix(file_name=outs.normalized_profiles,
                                 chroms=chroms,
                                 profiles=norm_profiles,
                                 tracks=None,
                                 window_size=bin_size,
                                 masks=mask,
                                 dtype="float32")

    ## carry constants over from the raw profiles and append the GC params
    store = pd.HDFStore(outs.normalized_profiles, "a")
    constants = load_h5(args.raw_profiles, "constants")
    store["constants"] = constants
    store["/gc_params/scale"] = pd.Series(scale)
    store["/gc_params/linear"] = pd.Series(linear)
    store["/gc_params/quadratic"] = pd.Series(quadratic)
    store.close()
def split(args):
    ref = contig_manager.contig_manager(args.reference_path)
    constants = load_h5(args.sc_norm_profiles, "constants").to_dict()
    ncells = constants["ncells"]
    window_size = constants["window_size"]

    # maximum memory usage is the maximum of these four values:
    #   sumbins = sum(len(c) for c in primary_contigs)/window_size
    #   maxbins = max(len(c) for c in all_contigs)/window_size
    #   X + Q + H = ((2*sizeof(i8) + 2*sizeof(f32)) * ncells * sumbins)
    #   occupancy = sizeof(f32) * levels(=6) * (ncells - 1) * sumbins / nchunks(=100)
    #   het = X + Q + H + occupancy
    #   X + Y + Z = ((2*sizeof(float)) * ncells * maxbins)
    #   merged_bed = sc_cnv_calls_bed + internal_cnv_calls_bed
    #   unmerged_bed = sc_unmerged_cnv_calls_bed + internal_unmerged_cnv_calls_bed
    # * NOTE: ask for double the matrix sizes to account for intermediate values
    f32sz = 4
    sumbins = sum(ref.contig_lengths[c]/window_size + 1
                  for c in ref.primary_contigs())
    maxbins = max(ref.contig_lengths[c]/window_size + 1
                  for c in ref.list_all_contigs())
    XQH_mem_gb = float((2 + 2*f32sz) * ncells * sumbins)/1e9
    occ_mem_gb = float(f32sz * 6 * (ncells - 1) * sumbins/100)/1e9
    het_mem_gb = XQH_mem_gb + occ_mem_gb
    XYZ_mem_gb = 2 * float(f32sz * ncells * maxbins) / 1e9
    merged_bed_gb = os.path.getsize(args.sc_cnv_calls)/1e9 + \
        os.path.getsize(args.internal_cnv_calls)/1e9 + 1
    unmerged_bed_gb = os.path.getsize(args.sc_unmerged_cnv_calls)/1e9 + \
        os.path.getsize(args.internal_unmerged_cnv_calls)/1e9 + 1
    mem_gb = int(np.ceil(max(het_mem_gb, XYZ_mem_gb, merged_bed_gb,
                             unmerged_bed_gb))) + 3
    return {'chunks': [], 'join': {'__mem_gb': mem_gb}}
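# NOTE (editorial sketch): a rough sanity check of the memory formula above,
# using made-up numbers -- ncells, window_size, and the contig lengths below
# are hypothetical and not taken from any real reference.
def _example_split_mem_estimate():
    ncells = 1000
    window_size = 20000
    primary_lengths = [249e6, 243e6, 198e6]      # assumed primary contigs
    all_lengths = primary_lengths + [0.2e6]      # plus one small scaffold

    f32sz = 4
    sumbins = sum(int(l / window_size) + 1 for l in primary_lengths)
    maxbins = max(int(l / window_size) + 1 for l in all_lengths)

    XQH_mem_gb = float((2 + 2*f32sz) * ncells * sumbins)/1e9     # ~0.35 GB
    occ_mem_gb = float(f32sz * 6 * (ncells - 1) * sumbins/100)/1e9
    het_mem_gb = XQH_mem_gb + occ_mem_gb
    XYZ_mem_gb = 2 * float(f32sz * ncells * maxbins) / 1e9       # ~0.10 GB
    return int(np.ceil(max(het_mem_gb, XYZ_mem_gb))) + 3         # -> 4 GB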
def split(args):
    ref = contig_manager.contig_manager(args.reference_path)

    ## every primary chromosome gets its own chunk
    ## all the secondary pieces are in one chunk
    chrom_chunks = []
    non_primary_chunk = []
    for chrom in ref.list_all_contigs():
        if ref.is_primary_contig(chrom, allow_sex_chromosomes=True):
            chrom_chunks.append([chrom])
        else:
            non_primary_chunk.append(chrom)
    if len(non_primary_chunk) > 0:
        chrom_chunks.append(non_primary_chunk)

    chrom_sizes = ref.contig_lengths
    max_size = 0
    for chroms in chrom_chunks:
        chunk_size = sum([chrom_sizes[chrom] for chrom in chroms])
        max_size = max(max_size, chunk_size)

    nbcs = 0
    for v in args.cell_barcodes.itervalues():
        nbcs += len(v)

    max_mat_size = 4*nbcs*max_size/args.window_size
    chunk_mem_gb = int(np.ceil((1.0*max_mat_size/1e9) + 1))
    join_mem_gb = int(np.ceil(1.0*max_mat_size/1e9 +
                              1.0*sum(chrom_sizes.values())/args.window_size/1e9 + 1))

    chunk_defs = [{'chroms': chroms, '__mem_gb': chunk_mem_gb}
                  for chroms in chrom_chunks]
    return {'chunks': chunk_defs, 'join': {'__mem_gb': join_mem_gb}}
def get_contig_info(args):
    manager = contig_manager.contig_manager(args.reference_path)
    contig_info = {"contig_order": {}, "contig_lengths": {}}
    contig_lengths = manager.get_contig_lengths()
    for idx, contig in enumerate(manager.contigs["primary_contigs"]):
        contig_info["contig_order"][contig] = idx
        contig_info["contig_lengths"][contig] = contig_lengths[contig]
    contig_info["species"] = manager.list_species()
    return contig_info
def split(args):
    ref = contig_manager.contig_manager(args.reference_path)
    chroms = ref.list_all_contigs()
    max_chrom_size = max([ref.contig_lengths[chrom] for chrom in chroms])

    constants = load_h5(args.profiles, "constants").to_dict()
    ncells = constants["ncells"]
    window_size = constants["window_size"]

    max_mat_size_gb = float(2 * ncells * max_chrom_size / window_size) / 1e9 * 4
    mem_gb = int(np.ceil(max_mat_size_gb * 4 + 1))
    return {'chunks': [], 'join': {'__mem_gb': mem_gb}}
def _load_hdf5_file(self, filename, reference_path):
    self._contig_manager = contig_manager.contig_manager(reference_path)
    self._contig_list = self._contig_manager.primary_contigs(
        allow_sex_chromosomes=True)
    store = pd.HDFStore(filename, "r")
    self._window_size = store['constants']['window_size']
    self._conf_filter = {}
    for chrom in self._contig_list:
        cmask = (store["/CONF/" + chrom].values > crdna.constants.CONFIDENT_BIN_THRESHOLD) & \
                (store["/N/" + chrom].values < 1.0/self._window_size)
        self._conf_filter[chrom] = cmask
    # for chrom
    store.close()
def split(args):
    MAX_CHUNKS = 30
    MIN_CELLS_PER_CHUNK = 100

    ## TODO: store ncells in the profiles.h5 as a constant so we don't have
    ## to do this to get the number of cells
    ref = contig_manager.contig_manager(args.reference_path)
    chrom = ref.primary_contigs(allow_sex_chromosomes=True)[0]
    store = pd.HDFStore(args.profiles, "r")
    ncells, _ = store["/contigs/" + chrom].shape
    store.close()

    ## no cells, do nothing!
    if ncells < 1:
        return {'chunks': [], 'join': {}}

    nchunks = np.clip(
        ncells / MIN_CELLS_PER_CHUNK + int(ncells % MIN_CELLS_PER_CHUNK != 0),
        1, MAX_CHUNKS)
    cells_per_chunk = ncells / nchunks + int(ncells % nchunks != 0)

    mat_size_gb = coverage_matrix.get_genome_matrix_size_gb(args.profiles)
    chunk_mem_gb = int(np.ceil(4 * mat_size_gb / ncells * cells_per_chunk + 1))
    join_mem_gb = int(np.ceil(4 * mat_size_gb + 1))

    ## if this is a multi species sample do nothing
    if len(ref.list_species()) > 1:
        return {
            'chunks': [{
                'chunk': {'start': 0, 'end': ncells, 'ncells': ncells}
            }],
            'join': {'__mem_gb': join_mem_gb}
        }

    chunk_defs = [{
        'chunk': {
            'start': i,
            'end': min(i + cells_per_chunk, ncells),
            'ncells': ncells
        },
        '__mem_gb': chunk_mem_gb
    } for i in xrange(0, ncells, cells_per_chunk)]
    return {'chunks': chunk_defs, 'join': {'__mem_gb': join_mem_gb}}
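# NOTE (editorial sketch): the chunking arithmetic above relies on the
# Python 2 idiom `n / d + int(n % d != 0)` for integer ceiling division;
# a standalone illustration with a hypothetical cell count.
def _example_ceiling_division():
    ncells = 1234                        # hypothetical
    MIN_CELLS_PER_CHUNK = 100
    nchunks = ncells / MIN_CELLS_PER_CHUNK + int(ncells % MIN_CELLS_PER_CHUNK != 0)
    assert nchunks == 13                 # ceil(1234 / 100)
    cells_per_chunk = ncells / nchunks + int(ncells % nchunks != 0)
    assert cells_per_chunk == 95         # ceil(1234 / 13)
    # every cell lands in exactly one chunk
    assert len(range(0, ncells, cells_per_chunk)) == nchunks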
def split(args):
    ref = contig_manager.contig_manager(args.reference_path)
    contig_lengths = ref.get_contig_lengths()

    target_regions = None
    all_loci = []
    for (chrom_name, chrom_size) in contig_lengths.iteritems():
        all_loci.extend(
            generate_chrom_loci(target_regions, chrom_name, chrom_size,
                                100000000))

    locus_sets = pack_loci(all_loci)
    chunk_defs = [{'loci': loci} for loci in locus_sets]
    return {'chunks': chunk_defs}
def split(args):
    ref = contig_manager.contig_manager(args.reference_path)
    contig_lengths = ref.get_contig_lengths()

    target_regions = None
    all_loci = []
    for (chrom_name, chrom_size) in contig_lengths.iteritems():
        all_loci.extend(
            generate_chrom_loci(target_regions, chrom_name, chrom_size,
                                tenkit.constants.PARALLEL_LOCUS_SIZE))

    locus_sets = pack_loci(all_loci)
    chunk_defs = [{'loci': loci, '__mem_gb': 12} for loci in locus_sets]
    return {'chunks': chunk_defs, 'join': {'__mem_gb': 12}}
def main(args, outs):
    ref = contig_manager.contig_manager(args.reference_path)
    chroms = ref.primary_contigs(allow_sex_chromosomes=True)

    ## read in calls as dataframe
    calls = pd.read_csv(args.cnv_calls, sep="\t",
                        names=["chrom", "start", "end", "ploidy",
                               "confidence", "cluster_index"])

    ## figure out dimensions of cnv_tracks and gather chrom data
    nclusters = len(np.unique(calls["cluster_index"].values))
    window_size = args.window_size
    contig_sizes = ref.get_contig_lengths()
    chrom_bin_sizes = {}
    nbins = 0
    chrom_offset = {}
    for chrom in chroms:
        chrom_offset[chrom] = nbins
        csize = contig_sizes[chrom]
        cbins = csize / window_size + int(csize % window_size != 0)
        chrom_bin_sizes[chrom] = cbins
        nbins += cbins

    cnv_tracks = np.zeros((nclusters, nbins), dtype="int32")
    for chrom in chroms:
        p = ref.expected_ploidy(chrom, args.sex)
        csize = chrom_bin_sizes[chrom]
        cnv_tracks[:, chrom_offset[chrom]:chrom_offset[chrom] + csize] = p

    nclusters = calls["cluster_index"].unique().shape[0]
    for ci in xrange(nclusters):
        print ci
        cluster_calls = calls[calls["cluster_index"] == ci]
        for _, row in cluster_calls.iterrows():
            offset = chrom_offset[row["chrom"]]
            assert (row["start"] - 1) % window_size == 0
            #assert row["end"] % window_size == 0
            sbin = offset + (row["start"] - 1) / window_size
            ebin = offset + row["end"] / window_size
            cnv_tracks[ci, sbin:ebin] = row["ploidy"]

    out_store = pd.HDFStore(outs.cnv_tracks, "w")
    out_store["cnv_tracks"] = pd.DataFrame(cnv_tracks)
    out_store.close()
def split(args):
    ctg_mgr = contig_manager.contig_manager(args.reference_path)

    ## every primary chromosome gets its own chunk
    ## all the secondary pieces are in one chunk
    chrom_chunks = []
    non_primary_chunk = []
    for chrom in ctg_mgr.list_all_contigs():
        if ctg_mgr.is_primary_contig(chrom, allow_sex_chromosomes=True):
            chrom_chunks.append([chrom])
        else:
            non_primary_chunk.append(chrom)
    if len(non_primary_chunk) > 0:
        chrom_chunks.append(non_primary_chunk)

    chunk_defs = [{'chroms': chroms, '__mem_gb': 12}
                  for chroms in chrom_chunks]
    return {'chunks': chunk_defs, 'join': {'__mem_gb': 12}}
def join(args, outs, chunk_defs, chunk_outs):
    store = pd.HDFStore(outs.profiles, "w")

    ## put cell barcodes in canonical order
    bc_list = []
    for v in args.cell_barcodes.itervalues():
        bc_list.extend(v.keys())
    # for v
    bc_list = list(set(bc_list))
    bc_list.sort()
    store["barcodes"] = pd.Series(bc_list)

    # load ref to determine primary-ness
    all_chroms = []
    masks = []
    genomebins = 0
    ncells = None
    for chunk_out, chunk_def in zip(chunk_outs, chunk_defs):
        chroms = chunk_def.chroms
        all_chroms.extend(chroms)
        profile_chunk = pd.HDFStore(chunk_out.profiles, "r")
        for chrom in chroms:
            mask = profile_chunk["/masks/" + chrom]
            masks.extend(mask)
            store["/contigs/" + chrom] = profile_chunk["/contigs/" + chrom]
            store["/masks/" + chrom] = mask
            genomebins += profile_chunk["constants"]["genomebins"]
            if ncells is None:
                ncells = profile_chunk["constants"]["ncells"]
        # for chrom
        profile_chunk.close()
    # for chunk_out, chunk_def

    ## store the window size in the h5
    store["constants"] = pd.Series({"window_size": args.window_size,
                                    "ncells": ncells,
                                    "genomebins": genomebins})

    ref = contig_manager.contig_manager(args.reference_path)
    write_mask_bed(outs.mappable_regions, store, all_chroms, args.window_size,
                   ref, args)
    store.close()
def estimate_cnv_confidence_score_v2(raw_profiles, cnv_calls, reference_path,
                                     logp, bin_size):
    """Calculate a CNV confidence score (log(posterior)) for each CNV call
    using the pre-computed logp matrix of per-bin confidence scores."""
    ref = contig_manager.contig_manager(reference_path)
    chrom_names = ref.primary_contigs(allow_sex_chromosomes=True)

    PER_BIN_MAX_SCORE = 100.0

    scores = np.zeros(len(cnv_calls), dtype='int32')
    for i, cnv_call in enumerate(cnv_calls.itertuples()):
        #
        # the create cnv tracks module already sets confidence to zero for
        # masked bins; just use that confidence if it's already set
        #
        if cnv_call.Confidence == 0.0:
            continue
        chrom_name = cnv_call.Chr
        chrom_index = chrom_names.index(chrom_name)
        start = int(round(cnv_call.Start / bin_size))
        # end in BED file is exclusive
        end = int(round(cnv_call.End / bin_size))
        cell = cnv_call.NodeID
        #
        start = max([0, start])
        n_bins = raw_profiles[chrom_index].shape[1]
        end = min([end, n_bins])
        # start can == end in the case where a CNV call happens only on the
        # terminal bin. This was found by randomly breaking up a reference
        # such that a mappable bin is cut in two. This is extremely unlikely
        # in a 'real' reference since bins at the end of contigs will be
        # unmappable and/or have the same ploidy as the neighboring bins.
        # The pipeline steps that lead to this are in
        # CREATE_CNV_TRACKS_AND_BED, which converts cluster_data.h5 into
        # cnv_calls.bed.
        if start == end:
            # special case: CNV call on a single bin
            score = np.nansum(logp[chrom_index][cell, start])
            score = np.clip(score, 0, PER_BIN_MAX_SCORE)
            scores[i] = min(np.round(score * 100), np.iinfo("uint8").max)
        else:
            score = np.nansum(logp[chrom_index][cell, start:end])
            score = np.clip(score, 0, PER_BIN_MAX_SCORE*(end - start))
            scores[i] = min(np.round(score/(end - start)*100),
                            np.iinfo("uint8").max)
    cnv_calls['Confidence'] = scores
    return cnv_calls
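# NOTE (editorial sketch): a worked example of the multi-bin scoring branch
# above, with invented per-bin log-posterior values (not real pipeline output).
def _example_confidence_score():
    PER_BIN_MAX_SCORE = 100.0
    segment_logp = np.array([0.8, 1.2, np.nan, 0.6])   # 4-bin segment, one cell
    start, end = 0, 4
    score = np.nansum(segment_logp)                    # 2.6, NaN ignored
    score = np.clip(score, 0, PER_BIN_MAX_SCORE * (end - start))
    confidence = min(np.round(score / (end - start) * 100),
                     np.iinfo("uint8").max)            # mean per-bin score x100, capped at 255
    assert confidence == 65
    return confidence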
def split(args):
    if args.input is None or args.barcode_whitelist is None:
        chunk_defs = [{'chunk_start': "0", 'chunk_end': "0", '__mem_gb': 1}]
        return {'chunks': chunk_defs, 'join': {'__mem_gb': 1}}

    ref = contig_manager.contig_manager(args.reference_path)
    species_list = ref.list_species()
    if (args.force_cells is not None and args.force_cells > 0 and
            len(species_list) > 1):
        martian.exit("force_cells can only be used for single species reference.")

    min_chunks = 10
    bam_in = tk_bam.create_bam_infile(args.input)
    chunks = tk_bam.chunk_bam_records(bam_in, chunk_split_func,
                                      chunk_size_gb=8.0,
                                      min_chunks=min_chunks)
    # 0.03 =~ 26meg = 1M bcs * (sizeof(int64) + 18)
    join_mem_gb = int(np.ceil(0.03 * (len(chunks) + 1) + 1))
    return {'chunks': chunks, 'join': {'__mem_gb': join_mem_gb}}
def join(args, outs, chunk_defs, chunk_outs):
    args.coerce_strings()
    outs.coerce_strings()

    ref = contig_manager.contig_manager(args.reference_path)
    chroms = ref.primary_contigs(allow_sex_chromosomes=True)

    ## Load data
    store = pd.HDFStore(args.cnv_tracks, "r")
    Q = store["/cnv_tracks"].values
    sf = store["/scale_factor"]
    rpb = store["/reads_per_bin"]
    segment_windows = store["constants"]["segment_windows"]
    store.close()

    if args.tracks is None:
        gmask = np.ones(Q.shape[1], dtype=bool)
    else:
        gmask = []
        maptrack = pd.HDFStore(args.tracks, "r")
        for chrom in chroms:
            x = maptrack["/map/" + chrom].values
            ## TODO make this consistent across stages
            gmask.extend(x > MAPPABILITY_THRESHOLD)
        maptrack.close()
        gmask = np.array(gmask)

    ## Aggregate all cells to the same resolution and compute L1 norm
    Q_agg = np.round(aggregate_matrix(Q[:, gmask].astype("float32"),
                                      segment_windows) /
                     segment_windows).astype("int32")

    distances, Z = compute_linkage(Q_agg)

    out_store = pd.HDFStore(outs.data, "w")
    out_store["/Z"] = pd.DataFrame(Z)
    out_store["distances"] = pd.Series(distances)
    out_store["constants"] = pd.Series({"segment_windows": segment_windows})
    out_store["scale_factor"] = sf
    out_store["reads_per_bin"] = rpb
    out_store.close()
def _load_hdf5_file(self, filename, reference_path, reuse_mask_from=None):
    X, Xm = load_matrix(filename, reference_path)
    if len(X) == 0:
        raise ValueError(
            "Loading profiles from %s returned zero contigs matching the "
            "reference at %s" % (filename, reference_path))

    # it's low cost to hang on to a contig_manager instance
    self._contig_manager = contig_manager.contig_manager(reference_path)

    # this list has to match the list inside load_matrix
    primary_contigs = self._contig_manager.primary_contigs(
        allow_sex_chromosomes=True)

    store = pd.HDFStore(filename, "r")
    self._window_size = store['constants']['window_size']
    if 'barcodes' in store:
        self._barcodes = np.array(store['barcodes'])
        self._num_cells = len(self._barcodes)
    else:
        self._barcodes = None
        self._num_cells = len(X[0])

    profile_contigs = set(list_all_contigs(store))

    i = 0
    self._contig_list = []
    self._contig_coverage = {}
    self._contig_mask = {}
    self._contig_idx = {}
    for chrom in primary_contigs:
        if chrom in profile_contigs:
            self._contig_list.append(chrom)
            self._contig_coverage[chrom] = X[i]
            if reuse_mask_from:
                self._contig_mask[chrom] = reuse_mask_from._contig_mask[chrom]
            else:
                self._contig_mask[chrom] = Xm[i]
            nbins = X[i].shape[1]
            self._contig_idx[chrom] = np.arange(1, nbins * self._window_size,
                                                self._window_size)
            i = i + 1
    store.close()
def calculate_logposterior_matrix(raw_profiles, poisson_expectations, mask,
                                  cnv_calls, reference_path, bin_size):
    """Create an ncell x nbins matrix of log(posterior) values."""
    ref = contig_manager.contig_manager(reference_path)
    chrom_names = ref.primary_contigs(allow_sex_chromosomes=True)

    logp = []
    ncells = 0
    for chrom_index in xrange(len(raw_profiles)):
        if ncells == 0:
            ncells = raw_profiles[chrom_index].shape[0]
        nbins = raw_profiles[chrom_index].shape[1]
        logp.append(np.zeros((ncells, nbins), dtype='float32'))

    for cnv_call in cnv_calls.itertuples():
        #
        # the create cnv tracks module already sets confidence to zero for
        # masked bins; just use that confidence if it's already set
        #
        if cnv_call.Confidence == 0.0:
            continue
        chrom_name = cnv_call.Chr
        chrom_index = chrom_names.index(chrom_name)
        ploidy = int(round(cnv_call.CopyNumber))
        assert ploidy >= 0, 'Negative ploidy: %s' % repr(cnv_call)
        start = int(round(cnv_call.Start / bin_size))
        # end in BED file is exclusive
        end = int(round(cnv_call.End / bin_size))
        cell = cnv_call.NodeID
        #
        start = max(0, start)
        n_bins = raw_profiles[chrom_index].shape[1]
        end = min(end, n_bins)

        scores = get_segment_scores(raw_profiles[chrom_index][cell, :],
                                    poisson_expectations[chrom_index][cell, :],
                                    mask[chrom_index], start, end, ploidy)
        logp[chrom_index][cell, start:end] = scores
    # for cnv_call
    return logp
def write_sorted_bed(chunk_getter, outfilename):
    # NOTE: relies on `chunk_outs` and `args` from the enclosing join() scope;
    # this helper is presumably defined inside the stage's join function.
    with open(outfilename, 'w') as out_file:
        for chunk in chunk_outs:
            if not os.path.exists(chunk_getter(chunk)):
                continue
            # if !exists
            with open(chunk_getter(chunk), 'r') as in_file:
                shutil.copyfileobj(in_file, out_file, 1024 * 1024)
        # for chunk

    ref = contig_manager.contig_manager(args.reference_path)
    chroms = ref.primary_contigs(allow_sex_chromosomes=True)
    chrom_index = dict([(c, i) for i, c in enumerate(chroms)])

    cnv_df = pd.read_csv(outfilename, sep="\t", names=COLUMN_NAMES)
    cnv_df["chrom_index"] = cnv_df["Chr"].apply(chrom_index.get)
    cnv_df.sort_values(by=["chrom_index", "Start", "End"], inplace=True)
    cnv_df.to_csv(outfilename, sep="\t", columns=COLUMN_NAMES, header=False,
                  index=False)
def main(args, outs):
    ref = contig_manager.contig_manager(args.reference_path)
    args.coerce_strings()
    outs.coerce_strings()

    # Bail out if there are no valid barcodes
    if args.barcode_whitelist is None or args.input is None:
        outs.summary = None
        return

    bam_in = tk_bam.create_bam_infile(args.input)
    bam_chunk = tk_bam.read_bam_chunk(bam_in,
                                      (args.chunk_start, args.chunk_end))
    # Skip reads without a barcode
    bam_chunk_filt = itertools.ifilter(read_has_barcode, bam_chunk)
    bc_read_iter = itertools.groupby(bam_chunk_filt,
                                     lambda x: crdna_io.get_read_barcode(x))

    counts = {}
    for bc, reads in bc_read_iter:
        for r in reads:
            contig = bam_in.references[r.tid]
            species = ref.species_from_contig(contig)
            if not species in counts:
                counts[species] = {}
            if not bc in counts[species]:
                counts[species][bc] = 0
            if r.is_secondary or r.is_supplementary:
                ## we are ignoring alternate alignments
                continue
            if (r.is_unmapped or
                    r.mapping_quality < CELL_DETECT_MAPQ_THRESHOLD or
                    r.is_duplicate):
                ## skip reads that are unmapped, poor mapping quality, or dups
                continue
            counts[species][bc] += 1
    outs.counts = counts
def split(args):
    args.coerce_strings()
    #
    ctg_mgr = contig_manager.contig_manager(args.reference_path)
    chroms = ctg_mgr.primary_contigs(allow_sex_chromosomes=False)
    #
    # Handle case when clusters = None
    if args.clusters is None:
        ncells = coverage_matrix.get_num_cells(args.coverage_profile,
                                               args.reference_path)
        clusters = [[x] for x in xrange(ncells)]
    else:
        f = open(args.clusters)
        clusters = json.load(f)

    cart_prod = []
    for chrom in chroms:
        for ci, cluster in enumerate(clusters):
            chunk_def = {'chrom': chrom, 'cluster_index': ci}
            cart_prod.append(chunk_def)
        # for cluster
    # for chrom
    #
    # Split these pieces into at most MAX_CHUNKS chunks
    MAX_CHUNKS = 100
    npieces = len(cart_prod)
    pieces_per_chunk = npieces / MAX_CHUNKS + int(npieces % MAX_CHUNKS != 0)

    chunks = []
    start = 0
    while start < npieces:
        chunk_def = {"chroms": [], "cluster_indices": []}
        end = min(start + pieces_per_chunk, npieces)
        for i in xrange(start, end):
            chunk_def["chroms"].append(cart_prod[i]["chrom"])
            chunk_def["cluster_indices"].append(cart_prod[i]["cluster_index"])
        chunks.append(chunk_def)
        start += pieces_per_chunk

    assert len(chunks) <= MAX_CHUNKS
    return {'chunks': chunks}
def split(args):
    with open(args.cnv_calls, 'r') as infile:
        nodes = {l.rstrip().split('\t')[3] for l in infile}
    num_nodes = len(nodes)

    store = pd.HDFStore(args.raw_profiles)
    ref = contig_manager.contig_manager(args.reference_path)
    chroms = ref.primary_contigs(allow_sex_chromosomes=True)
    max_chrom_nbins = max(store["/contigs/" + chrom].shape[1]
                          for chrom in chroms)
    store.close()

    MAX_CHUNKS = 30
    MIN_NODES_PER_CHUNK = 5
    nchunks = np.clip(np.ceil(1.0 * num_nodes / MIN_NODES_PER_CHUNK), 1,
                      MAX_CHUNKS)
    nodes_per_chunk = max(1, int(np.ceil(1.0 * num_nodes / nchunks)))

    chromsz_gb = 1.0 * max_chrom_nbins * max(1, num_nodes) / 1e9
    matsize_gb = (
        1.0 * coverage_matrix.get_genome_matrix_size_gb(args.raw_profiles) *
        nodes_per_chunk / max(1, num_nodes))
    unmerged_gb = int(np.ceil(os.path.getsize(args.unmerged_cnv_calls) / 1e9))
    chunk_mem_gb = int(np.ceil(6 * max(matsize_gb, chromsz_gb) + 2))
    join_mem_gb = int(np.ceil(6 * unmerged_gb + 2))

    chunk_defs = [{
        'chunk': {
            'start': i,
            'end': min(i + nodes_per_chunk, num_nodes)
        },
        '__mem_gb': chunk_mem_gb
    } for i in xrange(0, num_nodes, nodes_per_chunk)]
    return {'chunks': chunk_defs, 'join': {'__mem_gb': join_mem_gb}}
def join(args, outs, chunk_defs, chunk_outs):
    ref = contig_manager.contig_manager(args.reference_path)

    ## only run normalization for single species samples
    species_list = ref.list_species()
    if len(species_list) == 1:
        chroms = ref.primary_contigs(allow_sex_chromosomes=True)
        profiles, mask, _ = load_genome_data(args.raw_profiles, args.tracks,
                                             chroms)
        gc = load_gc_data(args.tracks, chroms)
        scale, linear, quadratic = estimate_gc_normalization(profiles, gc,
                                                             mask)
    else:
        ncells = coverage_matrix.get_num_cells(args.raw_profiles,
                                               args.reference_path)
        scale = [1.0] * ncells
        linear = [0.0] * ncells
        quadratic = [0.0] * ncells

    with open(outs.gc_norm_params, "w") as out:
        gc_norm_data = {
            "scale": scale,
            "linear": linear,
            "quadratic": quadratic
        }
        json.dump(gc_norm_data, out, indent=4)
def join(args, outs, chunk_defs, chunk_outs):
    args.coerce_strings()
    outs.coerce_strings()

    ref = contig_manager.contig_manager(args.reference_path)
    chroms = ref.primary_contigs(allow_sex_chromosomes=True)

    ## load genome data and cell profiles
    X, gmask, bdy = load_genome_data(
        args.profiles, args.tracks, chroms,
        mappability_threshold=crdna.constants.MAPPABILITY_THRESHOLD)

    # compute chromosome boundaries after masking by gmask
    cbdy = np.zeros_like(bdy)
    for i in xrange(1, len(bdy)):
        cbdy[i] = (gmask[0:bdy[i]].sum())

    ## load GC info and create GC emission track
    gctrack = []
    store = pd.HDFStore(args.tracks, "r")
    for chrom in chroms:
        gctrack.extend(store["/GC/" + chrom].values)
    gctrack = np.array(gctrack)[gmask]
    store.close()

    store = pd.HDFStore(args.ll_ratios, "r")
    llrs = store["/llrs"].values
    store.close()

    nbins = gmask.sum()
    ncells = X.shape[0]

    ## Heuristics to define breakpoints
    ll_threshold = 5
    delta_threshold = 0.10

    Y_quant = np.zeros((ncells, nbins), dtype="int8")
    scale_factor = np.zeros(ncells)
    windows_per_cell = []

    gc_norm_params = json.load(open(args.gc_norm_params, "r"))

    print "Starting loop over cells"
    sys.stdout.flush()
    for i in xrange(ncells):
        print "-" * 80
        print "Cell", i
        sys.stdout.flush()

        ## genome profile
        y = X[i][gmask]
        ## log likelihood ratio profile
        ll = llrs[i]
        ## GC coefficients
        gc_linear = gc_norm_params["linear"][i]
        gc_quadratic = gc_norm_params["quadratic"][i]

        ## GC correction track for cell
        xi = parabola(gctrack, crdna.constants.GC_ORIGIN, gc_linear,
                      gc_quadratic)
        xi_low = parabola(crdna.constants.MIN_GC, crdna.constants.GC_ORIGIN,
                          gc_linear, gc_quadratic)
        xi_high = parabola(crdna.constants.MAX_GC, crdna.constants.GC_ORIGIN,
                           gc_linear, gc_quadratic)
        xi[gctrack < crdna.constants.MIN_GC] = xi_low
        xi[gctrack > crdna.constants.MAX_GC] = xi_high

        ## Define breakpoints ##
        bp_cands2 = get_breakpoint_positions(y, ll, xi,
                                             ll_threshold=ll_threshold,
                                             delta_threshold=delta_threshold)
        assert bp_cands2[0] == 0, "genome start must be breakpoint"
        assert bp_cands2[-1] == y.shape[0], "genome end must be breakpoint"

        ## define segments using breakpoints
        segment_bdy = []
        for j in xrange(len(bp_cands2) - 1):
            segment_bdy.append((bp_cands2[j], bp_cands2[j + 1]))

        ## add chromosome boundaries as mandatory breakpoints
        segment_bdy = break_segments_at_points(segment_bdy, cbdy,
                                               verbose=False)
        validate_segment_intervals(segment_bdy, cbdy)

        ## aggregate bins within a segment to resolution given by window
        ## and compute segment mean read counts and lengths
        window = int(
            np.round(crdna.constants.BREAKPOINT_READ_THRESHOLD /
                     np.median(y[y > 0])))
        window = np.clip(window, 1, None)
        windows_per_cell.append(window)

        segment_means = []
        segment_lengths = []
        for s, e in segment_bdy:
            segment = y[s:e]
            xi_piece = xi[s:e]
            length = e - s
            agg = []
            xi_agg = []
            j = 0
            while j < length:
                piece = segment[j:j + window]
                assert len(piece) > 0, "%d, %d-%d" % (j, s, e)
                corr = float(window) / len(piece)
                agg.append(corr * piece.sum())
                xi_agg.append(xi_piece[j:j + window].mean())
                j += window
            agg = np.array(agg)
            xi_agg = np.array(xi_agg)
            ## remove outliers
            med = np.median(agg)
            mad = np.abs(agg - med)
            mmad = np.median(mad)
            outlier_mask = mad <= 5 * mmad
            segment_means.append(
                np.sum(agg[outlier_mask]) / np.sum(xi_agg[outlier_mask]))
            segment_lengths.append(e - s)
        segment_means = np.array(segment_means)
        segment_lengths = np.array(segment_lengths)

        ## Find the scaling factor to produce integer ploidies
        ## Heuristics
        # max ploidy to assign to initially chosen long segment
        max_ploidy_long = 10
        # max value of segment mean to consider "zero ploidy"
        zero_ploidy_count = crdna.constants.BREAKPOINT_READ_THRESHOLD / 4.0
        # longest segment with segment mean > zero_ploidy_count
        # that we will push to zero ploidy
        max_segment_push_to_zero = 200
        # prior params
        prior_params = {
            "prior_mean": args.params.get("prior_mean", 2.0),
            "prior_std": args.params.get("prior_std", 1.0)
        }
        min_ploidy = args.params.get("min_ploidy", None)
        max_ploidy = args.params.get("max_ploidy", None)

        lam_best = find_best_scale_v14(
            y, segment_bdy, segment_means, segment_lengths, window,
            max_ploidy_long=max_ploidy_long,
            zero_ploidy_count=zero_ploidy_count,
            prior_params=prior_params,
            max_segment_push_to_zero=max_segment_push_to_zero,
            min_ploidy=min_ploidy, max_ploidy=max_ploidy,
            verbose=True)

        print "Scaling factor:"
        print lam_best
        assert lam_best > 0.001, "run away to zero"
        scale_factor[i] = lam_best / window

        ## Compute the ploidy vector
        ploidy = get_ploidy_vector(y, segment_means, segment_bdy, lam_best)

        ## Set max ploidy at 127
        ploidy = np.clip(ploidy, None, np.iinfo("int8").max)

        print "Ploidies encountered"
        print Counter(ploidy).most_common()
        Y_quant[i, :] = ploidy.astype("int8")

    ## store in output h5
    out_store = pd.HDFStore(outs.denoised_profiles, "w")
    out_store["/constants"] = pd.Series(
        {"segment_windows": int(np.median(windows_per_cell))})
    out_store["/scale_factor"] = pd.Series(scale_factor)
    out_store["/quantized"] = pd.DataFrame(Y_quant)
    out_store.close()
def join(args, outs, chunk_defs, chunk_outs):
    args.coerce_strings()
    outs.coerce_strings()

    ref = contig_manager.contig_manager(args.reference_path)
    chroms = ref.primary_contigs(allow_sex_chromosomes=True)

    store = pd.HDFStore(args.cluster_data, "r")
    windows = store["windows"]
    Q = store["quantized"].values
    #
    # due to the int8 conversion and the use of -127 as a special value,
    # unpredictable bad things will happen if Q > 126;
    # in practice we don't expect such a thing to ever occur
    #
    martian.log_info("Found %d bins with Q>126" % (Q > 126).sum())
    Q[Q > 126] = 126
    constants = store["constants"]
    store.close()

    ## cnv track
    ncells = Q.shape[0]
    store = pd.HDFStore(args.tracks, "r")
    window_size = store["constants"]["window_size"]
    nbins = 0
    for chrom in chroms:
        nbins += store["/map/" + chrom].shape[0]
    C = MISSING_VALUE * np.ones((ncells, nbins), dtype="int8")
    #
    # The C array is filled out with the following convention:
    # * unmasked bins have positive ploidies
    # * masked bins with imputed ploidies are recorded with negative ploidies
    #
    chrom_start = 0
    masked_chrom_start = 0
    chrom_bdy = {}
    for chrom in chroms:
        ctrack = store["/map/" + chrom].values
        cmask = ctrack > crdna.constants.MAPPABILITY_THRESHOLD
        chrom_end = chrom_start + len(cmask)
        masked_chrom_end = masked_chrom_start + cmask.sum()
        chrom_bdy[chrom] = (chrom_start, chrom_end)
        C[:, chrom_start:chrom_end][:, cmask] = \
            Q[:, masked_chrom_start:masked_chrom_end]
        impute_ploidies_for_chromosome_nocall_boundaries(C, chrom_start,
                                                         chrom_end,
                                                         window_size)
        chrom_start = chrom_end
        masked_chrom_start = masked_chrom_end
    store.close()

    in_store = pd.HDFStore(args.cluster_data, "r")
    out_store = pd.HDFStore(outs.cnv_tracks, "w")
    out_store["/cnv_tracks"] = pd.DataFrame(C)
    out_store["/windows"] = windows
    out_store["constants"] = constants
    out_store["/ploidy_conf"] = in_store["/ploidy_conf"]
    out_store["/reads_per_bin"] = in_store["/reads_per_bin"]
    out_store["/scale_factor"] = in_store["/scale_factor"]
    out_store.close()
    in_store.close()

    ## break up profile into segments and write to BED
    with open(outs.cnv_calls, "w") as out_bed, \
         open(outs.unmerged_cnv_calls, "w") as out_unmerged_bed:
        for cell in xrange(ncells):
            for chrom in chroms:
                chrom_start, chrom_end = chrom_bdy[chrom]
                ## chrom piece of CNV
                chrom_piece = C[cell, chrom_start:chrom_end]
                for b in get_event_blocks_v2(cell, chrom, chrom_piece,
                                             window_size, ref):
                    out_bed.write("\t".join(map(str, b)) + os.linesep)
                for b in get_event_blocks_v2(cell, chrom, chrom_piece,
                                             window_size, ref,
                                             merge_imputed_blocks=False):
                    out_unmerged_bed.write("\t".join(map(str, b)) + os.linesep)
def join(args, outs, chunk_defs, chunk_outs):
    num_sc_bcs = 0
    num_qual_reads = 0
    num_sc_reads = 0
    bc_counts = {}

    ## compute species_list
    ref = contig_manager.contig_manager(args.reference_path)
    species_list = ref.list_species()
    species_list.sort()

    ## doublet rate estimation
    total_unique_cell_barcodes = set()
    total_cell_barcodes = []
    species_counts = {}
    for (species, species_barcodes) in args.cell_barcodes.iteritems():
        species_counts[species] = 0
        for bc in species_barcodes.iterkeys():
            total_cell_barcodes.append(bc)
            total_unique_cell_barcodes.add(bc)
            species_counts[species] += 1
    counts = species_counts.values()

    observed_doublets = len(total_cell_barcodes) - len(total_unique_cell_barcodes)
    observed_doublet_rate = tk_stats.robust_divide(
        observed_doublets, float(len(total_cell_barcodes)))
    inferred_doublets = float('NaN')
    inferred_doublet_rate = float('NaN')
    if len(species_counts) > 1:
        inferred_doublets = _infer_multiplets_from_observed(
            observed_doublets, counts[0], counts[1])
        inferred_doublet_rate = tk_stats.robust_divide(
            float(inferred_doublets), float(len(total_cell_barcodes)))

    ## combine barnyard_hits chunks
    combine_csv([c.barnyard_hits for c in chunk_outs], outs.barnyard_hits,
                header_lines=1)

    ## aggregate summary.json from chunks
    raw_bc_on_whitelist = 0
    for j, chunk_out in enumerate(chunk_outs):
        if chunk_out.summary is None:
            continue
        chunk_summary = json.loads(open(chunk_out.summary).read())
        num_sc_bcs += chunk_summary['num_sc_bcs']
        num_qual_reads += chunk_summary['num_sc_qual_reads']
        num_sc_reads += chunk_summary['num_sc_reads']
        raw_bc_on_whitelist += chunk_summary['raw_bc_on_whitelist']
        chunk_bc_counts_file = open(chunk_out.barcode_histogram)
        chunk_bc_counts = json.loads(chunk_bc_counts_file.read())
        bc_counts.update(chunk_bc_counts)

    ## combine barnyard chunks
    combine_csv([c.barnyard for c in chunk_outs], outs.barnyard,
                header_lines=1)

    n_reads = np.array(bc_counts.values())
    max_val = np.percentile(n_reads, 99.99) * 1.3
    min_val = n_reads.min()
    num_bins = 400
    step = math.ceil((max_val - min_val)/num_bins)
    if max_val - min_val < 1e-6:
        bins = np.array([min_val, min_val + 1])
    else:
        bins = np.arange(min_val, max_val, step)
    (hist, edges) = np.histogram(n_reads, bins=bins)
    bc_hist = {int(edges[i]): hist[i] for i in range(len(bins) - 1)}

    cells = 0
    for (speci, cell_list) in args.cell_barcodes.iteritems():
        cells += len(cell_list)

    summary_info = {}
    summary_info['cells_detected'] = cells
    summary_info['num_sc_bcs'] = num_sc_bcs
    summary_info['num_sc_qual_reads'] = num_qual_reads
    summary_info['num_sc_reads'] = num_sc_reads
    summary_info['fract_sc_reads'] = tk_stats.robust_divide(num_sc_reads,
                                                            num_qual_reads)
    summary_info['observed_doublets'] = observed_doublets
    summary_info['observed_doublet_rate'] = observed_doublet_rate
    summary_info['inferred_doublets'] = inferred_doublets
    summary_info['inferred_doublet_rate'] = inferred_doublet_rate

    ## compute stats from barnyard file
    barnyard_df = pd.read_csv(outs.barnyard)
    bkeys = ["amp_rate", "library_complexity", "dup_ratio", "mapped",
             "mapped_frac"]
    for species in species_list:
        if len(species_list) == 1:
            key_suffix = ""
        else:
            key_suffix = "_" + species
        is_cell_filter = barnyard_df["is_%s_cell_barcode" % species] == 1
        species_barcodes = args.cell_barcodes.get(species, {})
        ## compute quartiles, min, mean, max and CV
        for bkey in bkeys:
            vals = barnyard_df[bkey][is_cell_filter]
            for pct in [25, 50, 75]:
                summary_key = bkey + key_suffix + ("_cells_p%d" % pct)
                summary_info[summary_key] = tk_stats.robust_percentile(vals, pct)
            summary_key = bkey + key_suffix + "_cells_cv"
            summary_info[summary_key] = tk_stats.robust_divide(vals.std(),
                                                               vals.mean())
            summary_key = bkey + key_suffix + "_cells_mean"
            summary_info[summary_key] = vals.mean()
            summary_key = bkey + key_suffix + "_cells_min"
            summary_info[summary_key] = vals.min()
            summary_key = bkey + key_suffix + "_cells_max"
            summary_info[summary_key] = vals.max()

    ## tabulate waste metrics from the barnyard file
    waste_keys = ["no_barcode", "non_cell_barcode", "unmapped",
                  "low_mapq_lt_%d" % PROFILE_MAPQ_THRESHOLD, "dups",
                  "denominator", "unusable_read"]
    bh_df = pd.read_csv(outs.barnyard)

    # calculate median percent unmapped (defined as
    # unmapped / (denominator - non_cell_barcode - no_barcode))
    barnyard_cell_df = bh_df[~(bh_df.cell_id == 'None')]
    unmapped_frac = 1.0 * barnyard_cell_df['unmapped'] / barnyard_cell_df['denominator']
    unmapped_frac = unmapped_frac.fillna(0)
    median_unmapped_frac = unmapped_frac.median()

    waste_totals = {}
    sum_waste_keys = 0.0
    for key in waste_keys:
        waste_totals[key] = float(bh_df[key].sum())
        if key != "denominator":
            sum_waste_keys += waste_totals[key]
    for level, key in enumerate(waste_keys):
        if key == "denominator":
            continue
        summary_info["waste_%s_reads" % key] = waste_totals[key]
        summary_info["frac_waste_%s" % key] = tk_stats.robust_divide(
            waste_totals[key], waste_totals["denominator"])
    summary_info["waste_total_reads"] = sum_waste_keys
    summary_info["frac_waste_total"] = tk_stats.robust_divide(
        sum_waste_keys, waste_totals["denominator"])
    summary_info['frac_raw_bc_on_whitelist'] = \
        float(raw_bc_on_whitelist)/waste_totals["denominator"]
    summary_info['median_unmapped_frac'] = median_unmapped_frac

    ## compute leakage metric and add to summary_info
    if len(species_list) == 2:
        compute_leakage(outs.barnyard_hits, ref, summary_info)

    with open(outs.summary, 'w') as summary_file:
        summary_file.write(tenkit.safe_json.safe_jsonify(summary_info,
                                                         pretty=True))

    with open(outs.barcode_histogram, 'w') as bc_hist_file:
        bc_hist_file.write(tenkit.safe_json.safe_jsonify(bc_hist))

    # logging
    print tenkit.safe_json.safe_jsonify(summary_info, pretty=True)
def main(args, outs):
    #min_insert_size = 0
    #max_insert_size = 1e4

    ## sc purity threshold: what fraction of contamination by another species
    ## will we tolerate
    SC_PURITY_THRESHOLD = 0.95

    args.coerce_strings()
    outs.coerce_strings()

    # Bail out if there are no valid barcodes
    if args.barcode_whitelist is None or args.input is None:
        outs.summary = None
        return

    ## group bam records by barcode NO_BARCODE/raw barcode tag/processed barcode tag
    bam_in = tk_bam.create_bam_infile(args.input)
    bam_chunk = tk_bam.read_bam_chunk(bam_in, (args.chunk_start, args.chunk_end))
    bc_read_iter = itertools.groupby(bam_chunk, groupbybarcode)

    ## compute species_list
    refs = bam_in.references
    ref = contig_manager.contig_manager(args.reference_path)
    species_list = ref.list_species()
    has_species_info = (species_list != [""])
    species_list.sort()
    genome_size = sum(ref.get_contig_lengths().values())

    ## index cells of each species
    cell_index = {}
    for sp in species_list:
        bc_list = args.cell_barcodes.get(sp, {}).keys()
        bc_list.sort()
        for i, b in enumerate(bc_list):
            y = cell_index.get(b, "")
            if len(y) == 0:
                cell_index[b] = "%s_cell_%d" % (sp, i)
            else:
                cell_index[b] = y + "_" + "%s_cell_%d" % (sp, i)

    ## construct and write header for barnyard file
    barnyard_file = open(outs.barnyard, 'w')
    barnyard_header = (['BC'] + ["cell_id"] +
        [s + ("_" if has_species_info else "") + "reads_mapq_60" for s in species_list] +
        [s + ("_" if has_species_info else "") + "contigs" for s in species_list] +
        ['mapped', 'num_mapped_bases', 'soft_clip_frac', 'insert_p50',
         'num_mapped_pos', 'mapped_frac', 'amp_rate', 'library_complexity',
         'dup_ratio', 'num_pairs'] +
        ["is_%s_cell_barcode" % s for s in species_list])
    ## wasted data categories
    waste_keys = ["no_barcode", "non_cell_barcode", "unmapped",
                  "low_mapq_lt_%d" % PROFILE_MAPQ_THRESHOLD, "dups",
                  "denominator", "unusable_read"]
    fractional_waste_keys = ["no_barcode_frac", "non_cell_barcode_frac",
                             "unmapped_frac",
                             "low_mapq_lt_%d_frac" % PROFILE_MAPQ_THRESHOLD,
                             "dups_frac"]
    barnyard_header.extend(waste_keys)
    barnyard_header.extend(fractional_waste_keys)
    barnyard_file.write(",".join(barnyard_header) + "\n")

    ## construct and write header for barnyard_hits file
    barnyard_hits_file = open(outs.barnyard_hits, "w")
    bh_header = ["barcode", "is_whitelisted"]
    bh_header.extend(["is_%s_cell_barcode" % s for s in species_list])
    bh_header.extend([refname for refname in bam_in.references])
    barnyard_hits_file.write(",".join(bh_header) + "\n")

    # For each barcode, count
    #   per each contig, number per each window (for each window size)
    #   number per species (if available in contig), number per species
    # TODO: Add detailed matrix by contigs, windows output
    num_sc_bcs = 0
    num_qual_reads = 0
    num_sc_reads = 0
    ploidy = 2
    bc_hist = {}

    ## count number of raw barcodes that exactly match whitelist
    ## without any error correction
    raw_bc_on_whitelist = 0

    # dup_summary = json.load(open(args.duplicate_summary))
    # pcr_dup_fraction = dup_summary['dup_fraction']['pcr']
    #barcode_whitelist = bc_utils.load_barcode_whitelist(args.barcode_whitelist)

    for bc, reads in bc_read_iter:
        ## collect various forms of wasted data here per barcode
        wastebin = defaultdict(int)
        bh_hits = [0 for _ in bam_in.references]
        dup_count = 1
        non_dup = 1
        bc_count = 0
        num_per_species = defaultdict(int)
        contigs_per_species = defaultdict(set)
        total_reads_by_clip = np.zeros(2, dtype=float)
        insert_length = []
        num_pairs = 0
        num_mapped = 0
        num_mapped_bases = 0
        pos_set = set([])
        for r in reads:
            ## secondary/supplementary are never counted towards anything
            if r.is_secondary or r.is_supplementary:
                continue

            ## include everything in the denominator
            wastebin["denominator"] += 1

            ## how many reads have >= 10 soft clipped bases
            if r.cigartuples is not None:
                cigar_dict = dict(r.cigartuples)
                soft_clip_index = int(cigar_dict.get(4, 0) >= 10)
                total_reads_by_clip[soft_clip_index] += 1

            if barnyard_hits_include(r):
                bh_hits[r.tid] += 1

            ## non-whitelisted barcodes count as wasted data
            if not "-" in bc:
                wastebin["no_barcode"] += 1
                continue

            if bc[:-2] == r.get_tag(RAW_BARCODE_TAG):
                raw_bc_on_whitelist += 1

            is_cell_bc_read = True

            ## waste hierarchy
            ## if not a cell or if read doesn't belong to species, then waste
            ## else if not mapped, then waste
            ## else if mapq < 30, then waste
            ## else if dup, then waste

            ## if this is a contaminant read from a different species
            ## it is wasted
            contig = refs[r.tid]
            read_species = ref.species_from_contig(contig)
            if (not (read_species in args.cell_barcodes) or
                    not (bc in args.cell_barcodes[read_species])):
                wastebin["non_cell_barcode"] += 1
                is_cell_bc_read = False
            elif r.is_unmapped:
                wastebin["unmapped"] += 1
            elif r.mapq < PROFILE_MAPQ_THRESHOLD:
                wastebin["low_mapq_lt_%d" % PROFILE_MAPQ_THRESHOLD] += 1
            elif r.is_duplicate:
                wastebin["dups"] += 1

            bad_map_or_dup = (r.is_unmapped or
                              (r.mapq < PROFILE_MAPQ_THRESHOLD) or
                              r.is_duplicate)

            if is_cell_bc_read:
                bc_count += 1
                # if (stringent_read_filter(r, True) and
                #         not(r.is_unmapped) and not(r.mate_is_unmapped)):
                #     if r.is_duplicate:
                #         dup_count += 1
                #     else:
                #         non_dup += 1
                if r.has_tag(DUPLICATE_COUNT_TAG):
                    dup_count += r.get_tag(DUPLICATE_COUNT_TAG)
                    non_dup += 1
            elif bad_map_or_dup:
                # unusable reads are those with non-cell barcodes that are
                # also any of unmapped, low mapq, or dups
                wastebin['unusable_read'] += 1

            ## whether we have a cell barcode or not, count these stats
            if not bad_map_or_dup:
                num_mapped += 1
                num_mapped_bases += r.reference_length
                pos_set.add((r.reference_name, r.reference_start/1000))

                ## if read is part of a proper pair, only count read or its pair
                if r.is_proper_pair:
                    if r.is_read1:
                        insert_length.append(r.template_length)
                        num_pairs += 1
                    else:
                        continue

            ## Use MAPQ >= 60 to get accurate mappings only for barnyard stuff
            if r.mapq < 60:
                continue
            num_qual_reads += 1
            if has_species_info:
                num_per_species[read_species] += 1
                contigs_per_species[read_species].add(contig)
        ## end of loop over reads in this barcode

        assert wastebin['denominator'] - wastebin['no_barcode'] - wastebin['unusable_read'] == \
            num_mapped + wastebin["low_mapq_lt_%d" % PROFILE_MAPQ_THRESHOLD] + \
            wastebin['unmapped'] + wastebin['dups']

        ## compute the library complexity and amp rate
        ## NOTE: insert length is hardcoded as 250, so the amp rate is really
        ## the library complexity in different units
        num_amplicons = num_mapped - num_pairs
        dup_ratio = tk_stats.robust_divide(float(dup_count + non_dup),
                                           float(non_dup))
        library_complexity = tk_stats.robust_divide(num_amplicons,
                                                    (dup_ratio - 1.0)*2)
        amp_rate = tk_stats.robust_divide(
            float(library_complexity * DEFAULT_AMPLICON_LENGTH),
            float(ploidy * genome_size))
        bc_hist[bc] = bc_count
        map_rate = tk_stats.robust_divide(float(num_mapped),
                                          wastebin["denominator"])

        ## write row to barnyard_hits file
        bh_row = [bc, int("-" in bc)]
        for s in species_list:
            bh_row.append(int(s in args.cell_barcodes and
                              bc in args.cell_barcodes[s]))
        bh_row.extend(bh_hits)
        barnyard_hits_file.write(",".join(map(str, bh_row)) + "\n")

        ## write row to barnyard file
        barnyard_row = ([bc, cell_index.get(bc, "None")] +
            [num_per_species[s] for s in species_list] +
            [len(contigs_per_species[s]) for s in species_list] +
            [num_mapped, num_mapped_bases] +
            [tk_stats.robust_divide(total_reads_by_clip[1],
                                    sum(total_reads_by_clip)),
             np.median(insert_length) if len(insert_length) else np.nan,
             len(pos_set), map_rate, amp_rate, library_complexity, dup_ratio,
             num_pairs])
        for speci in species_list:
            barnyard_row.append(int((speci in args.cell_barcodes) and
                                    (bc in args.cell_barcodes[speci])))
        for key in waste_keys:
            fkey = key + "_frac"
            if fkey in fractional_waste_keys:
                wastebin[fkey] = tk_stats.robust_divide(
                    float(wastebin[key]), float(wastebin["denominator"]))
        barnyard_row.extend([wastebin[x] for x in waste_keys])
        barnyard_row.extend([wastebin[x] for x in fractional_waste_keys])
        barnyard_file.write(",".join(map(str, barnyard_row)) + "\n")

        ## metrics relating to purity - only for multi species
        if has_species_info and len(species_list) >= 2:
            counts_by_species = [float(num_per_species[s]) for s in species_list]
            major_species_index = np.argmax(counts_by_species)
            major_species = species_list[major_species_index]
            species_purity = tk_stats.robust_divide(
                counts_by_species[major_species_index],
                np.sum(counts_by_species))
            if species_purity >= SC_PURITY_THRESHOLD:
                num_sc_bcs += 1
                num_sc_reads += num_per_species[major_species]
    ## END of loop over barcodes

    summary_info = {}
    summary_info['num_sc_bcs'] = num_sc_bcs
    summary_info['num_sc_qual_reads'] = num_qual_reads
    summary_info['num_sc_reads'] = num_sc_reads
    summary_info['raw_bc_on_whitelist'] = raw_bc_on_whitelist

    barnyard_file.close()
    barnyard_hits_file.close()

    with open(outs.summary, 'w') as summary_file:
        summary_file.write(tenkit.safe_json.safe_jsonify(summary_info))

    with open(outs.barcode_histogram, 'w') as bc_hist_file:
        bc_hist_file.write(tenkit.safe_json.safe_jsonify(bc_hist))
def main(args, outs): """Compute a CNV confidence score from the profile for a specific choice of cluster and contig.""" martian.log_info('Entering __init__.main()') node_start = args.chunk['start'] # exclusive end node_end = args.chunk['end'] raw_profiles, mask = coverage_matrix.load_matrix(args.raw_profiles, args.reference_path, start_cell=node_start, end_cell=node_end) bin_size = coverage_matrix.get_bin_size(args.raw_profiles) ## read in CNV data for nodes of interest node_column = COLUMN_NAMES.index("NodeID") cnv_calls = read_cnv_data(args.cnv_calls, node_start, node_end, node_column) # scale = get_scaling_factors(raw_profiles, cnv_calls) with open(args.gc_norm_params, "r") as handle: gc_norm_params = json.load(handle) linear = gc_norm_params["linear"] quadratic = gc_norm_params["quadratic"] # ref = contig_manager.contig_manager(args.reference_path) # # Get mappability, GC content: bin_parameters = [] vesna.load_track_parameters(args.tracks, bin_parameters, ref) # logp, cnv_calls2 = ccs.process_cnv_calls(raw_profiles, mask, bin_parameters, args.reference_path, args.sex, scale, linear, quadratic, cnv_calls, bin_size) export_segments(outs.cnvs, cnv_calls2, node_start) # free some memory del cnv_calls del cnv_calls2 # # Compute confidence values for unmerged, broken-up CNV calls # unmerged_cnv_calls = read_cnv_data(args.unmerged_cnv_calls, node_start, node_end, node_column) _, unmerged_cnv_calls2 = ccs.process_cnv_calls(raw_profiles, mask, bin_parameters, args.reference_path, args.sex, scale, linear, quadratic, unmerged_cnv_calls, bin_size, logp=logp) export_segments(outs.unmerged_cnvs, unmerged_cnv_calls2, node_start) # # Debugging: # martian.log_info('Leaving __init__.main()') martian.log_info('.' * 80)
def estimate_gc_bias(profiles, tracks, reference_path):
    ## load genome tracks and profiles skipping sex chromosomes
    ref = contig_manager.contig_manager(reference_path)
    chroms = ref.primary_contigs(allow_sex_chromosomes=False)

    maptrack = pd.HDFStore(tracks, "r")
    cmask = []
    gctrack = []
    bdy = [0]
    mtrack = []
    for chrom in chroms:
        x = maptrack["/map/" + chrom].values > MAPPABILITY_THRESHOLD
        cmask.extend(x)
        z = bdy[-1] + len(x)
        gctrack.extend(maptrack["/GC/" + chrom].values)
        mtrack.extend(maptrack["/map/" + chrom].values)
        bdy.append(z)
    cmask = np.array(cmask)
    maptrack.close()
    gctrack = np.array(gctrack)
    mtrack = np.array(mtrack)
    bdy = np.array(bdy)
    nbins = bdy[-1]

    pstore = pd.HDFStore(profiles, "r")
    ncells = len(pstore["/barcodes"].values)
    X = np.zeros((ncells, nbins), dtype="int32")
    for ci, chrom in enumerate(chroms):
        X[:, bdy[ci]:bdy[ci+1]] = pstore["/contigs/" + chrom].values
    pstore.close()

    ## genome wide profile of all cells @ GC_RES resolution
    ## restricted to mappable regions
    y = aggregate_counts(X.sum(axis=0)[cmask], GC_RES).astype(float)
    y /= y.mean()
    gc = aggregate_counts(gctrack[cmask], GC_RES)/GC_RES

    gcbins = np.linspace(MIN_GC, MAX_GC, NUM_GC_BINS + 1)
    gc_vals = 0.5 * (gcbins[1:] + gcbins[:-1])
    gc_bin_index = np.searchsorted(gcbins, gc)
    gc0 = np.nanmean(gc_vals)

    ## group data points by GC bins and compute the median
    x_vals = []
    y_vals = []
    for bi in xrange(1, NUM_GC_BINS + 1):
        bin_filter = gc_bin_index == bi
        num_data_points = bin_filter.sum()
        if num_data_points < MIN_POINTS_PER_BIN:
            continue
        bin_gc = gc_vals[bi-1]
        bin_val = np.median(y[bin_filter])
        x_vals.append(bin_gc)
        y_vals.append(bin_val)
    # for bi
    x_vals = np.array(x_vals) - gc0

    ## fit to ax^2 + bx + c
    a, b, c = np.polyfit(x_vals, y_vals, 2)

    ## GC metric is mean absolute deviation away from 1.0
    gc_metric = np.abs(np.array(y_vals) - 1.0).sum() / len(y_vals)

    ## store gc data in summary
    summary = {}
    summary["GC_content"] = x_vals
    summary["scaled_read_counts"] = y_vals
    summary["quadratic_coefficients"] = [a, b, c]
    summary["gc_cells_only"] = gc_metric
    summary["gc0"] = gc0
    #with open(outs.summary, "w") as out:
    #    json.dump(summary, out, indent=4)
    #
    return {'GCMetric': gc_metric, 'Summary': summary}
def main(args, outs):
    hostname = socket.gethostname()

    # Sample ID / pipestance name
    if args.sample_id is not None:
        if not re.match("^[\w-]+$", args.sample_id):
            martian.exit("Sample name may only contain letters, numbers, "
                         "underscores, and dashes: " + args.sample_id)

    # Check numerical options
    # types are already checked by mrp so only need to check ranges
    if args.force_cells is not None and (args.force_cells < 1 or
                                         args.force_cells > 20000):
        martian.exit("MRO parameter force_cells must be a positive integer "
                     "<= 20000.")

    # check min_ploidy, max_ploidy
    if args.cnv_params is not None:
        min_ploidy = args.cnv_params.get("min_ploidy", None)
        max_ploidy = args.cnv_params.get("max_ploidy", None)
        if min_ploidy is not None and min_ploidy <= 0:
            martian.exit("Command line argument soft-min-avg-ploidy must be "
                         "a positive real number.")
        if max_ploidy is not None and (max_ploidy <= 0 or max_ploidy > 8.0):
            martian.exit("Command line argument soft-max-avg-ploidy must be "
                         "a positive real number <= 8.")
        if (min_ploidy is not None and max_ploidy is not None and
                max_ploidy <= min_ploidy):
            martian.exit("Command line arguments must satisfy "
                         "soft-min-avg-ploidy < soft-max-avg-ploidy.")

    # check downsample options
    if args.downsample is not None and len(args.downsample.keys()) > 0:
        keys = args.downsample.keys()
        if len(keys) > 1:
            martian.exit("Please supply either maxreads or downsample but "
                         "not both.")
        key = keys[0]
        value = args.downsample[key]
        param_map = {"target_reads": "maxreads", "gigabases": "downsample"}
        bad_value = False
        try:
            float(value)
            bad_value = value < 1e-12
        except ValueError:
            bad_value = True
        if bad_value:
            cs_key = param_map[key]
            martian.exit("Command line argument %s must be a positive number"
                         % cs_key)

    # FASTQ input
    for idx, sample_def in enumerate(args.sample_def):
        read_path = sample_def["read_path"]
        if not read_path:
            martian.exit("Must specify a read_path containing FASTQs.")
        if not read_path.startswith('/'):
            martian.exit("Specified FASTQ folder must be an absolute path: %s"
                         % read_path)
        if not os.path.exists(read_path):
            martian.exit("On machine: %s, specified FASTQ folder does not "
                         "exist: %s" % (hostname, read_path))
        if not os.access(read_path, os.X_OK):
            martian.exit("On machine: %s, longranger does not have permission "
                         "to open FASTQ folder: %s" % (hostname, read_path))
        if not os.listdir(read_path):
            martian.exit("Specified FASTQ folder is empty: " + read_path)

        library_id = sample_def.get("library_id")
        if library_id is not None:
            if not re.match("^[\w-]+$", library_id):
                martian.exit("Library name may only contain letters, numbers, "
                             "underscores, and dashes: " + library_id)

        lanes = sample_def["lanes"]
        if lanes is not None:
            for lane in lanes:
                if not tk_preflight.is_int(lane):
                    martian.exit("Lanes must be a comma-separated list of "
                                 "numbers.")

        if args.fastq_mode == "BCL_PROCESSOR":
            sample_indices, msg = tk_preflight.check_sample_indices(sample_def)
            if sample_indices is None:
                martian.exit(msg)
            find_func = tk_fasta.find_input_fastq_files_10x_preprocess
            reads = []
            for sample_index in sample_indices:
                # process interleaved reads
                reads.extend(find_func(read_path, "RA", sample_index, lanes))
            if len(reads) == 0:
                martian.exit("No input FASTQs were found for the requested "
                             "parameters.")
        elif args.fastq_mode == "ILMN_BCL2FASTQ":
            sample_names = sample_def.get("sample_names", None)
            if sample_names is None:
                martian.exit("Entry {} in sample_def missing required field: "
                             "sample_names".format(idx))
            find_func = tk_fasta.find_input_fastq_files_bcl2fastq_demult
            reads1 = []
            reads2 = []
            for sample_name in sample_names:
                r1 = find_func(read_path, "R1", sample_name, lanes)
                r2 = find_func(read_path, "R2", sample_name, lanes)
                if len(r1) != len(r2):
                    martian.exit("Entry {} in sample_defs are missing input "
                                 "FASTQs.".format(idx))
                reads1.extend(r1)
                reads2.extend(r2)
            if len(reads1) == 0 and len(reads2) == 0:
                martian.exit("No input FASTQs were found for the requested "
                             "parameters.")
        else:
            martian.exit("Unrecognized fastq_mode: {}".format(args.fastq_mode))

    # Reference
    ok, msg = tk_preflight.check_refdata(args.reference_path, max_contigs=None)
    if ok:
        martian.log_info(msg)
    else:
        martian.exit(msg)

    contig_defs_json_path = os.path.join(args.reference_path, "fasta",
                                         "contig-defs.json")
    faidx_path = os.path.join(args.reference_path, "fasta", "genome.fa.fai")
    error_msg = contig_manager.verify_contig_defs(contig_defs_json_path,
                                                  faidx_path)
    if error_msg is not None:
        martian.exit(error_msg)

    try:
        ref = contig_manager.contig_manager(args.reference_path)
    except Exception as e:
        martian.exit("Unexpected error occurred.\n%s" % str(e))

    # too many contigs
    primary = ref.primary_contigs(allow_sex_chromosomes=True)
    num_primary_contigs = len(primary)
    if num_primary_contigs > 100:
        martian.exit("There can be at most 100 primary contigs.")

    # contig length checks
    chrom_length_dict = ref.get_contig_lengths()
    contig_length_exit = 500 * 1000
    contig_length_warn = 10 ** 7
    offending_contigs_warn = []
    offending_contigs_exit = []
    for c in primary:
        clen = chrom_length_dict[c]
        if clen < contig_length_exit:
            offending_contigs_exit.append(c)
        elif clen < contig_length_warn:
            offending_contigs_warn.append(c)
    if len(offending_contigs_exit) > 0:
        martian.exit("Primary contig(s) \"%s\" are shorter than %d bases. "
                     "Every primary contig must be at least %d bases "
                     "in length." % (",".join(offending_contigs_exit),
                                     contig_length_exit, contig_length_exit))
    elif (not args.check_executables) and len(offending_contigs_warn) > 0:
        martian.alarm("Primary contig(s) \"%s\" are shorter than %d bases. "
                      "Every primary contig is recommended to be at least %d "
                      "bases in length." % (",".join(offending_contigs_warn),
                                            contig_length_warn,
                                            contig_length_warn))

    # Open file handles limit
    if args.check_executables:
        ok, msg = tk_preflight.check_open_fh()
        if not ok:
            martian.exit(msg)

    martian.log_info(tk_preflight.record_package_versions())
def join(args, outs, chunk_defs, chunk_outs):
    ## merge gc params jsons
    node_gc_params = {}
    sc_gc_params = json.load(open(args.sc_gc_params, "r"))
    internal_gc_params = json.load(open(args.internal_gc_params, "r"))
    ncells = len(sc_gc_params['linear'])
    nnodes = 2*ncells - 1
    for key in ["scale", "linear", "quadratic"]:
        node_gc_params[key] = sc_gc_params[key] + internal_gc_params[key]
    with open(outs.node_gc_params, "w") as out:
        json.dump(node_gc_params, out, indent=4)

    ref = contig_manager.contig_manager(args.reference_path)
    chroms = ref.primary_contigs(allow_sex_chromosomes=True)
    index_chrom = dict([(str(i), c) for i, c in enumerate(chroms)])
    chrom_index = dict([(c, str(i)) for i, c in enumerate(chroms)])

    tmp = martian.make_path('tmp.bed')
    tmp_dir = os.path.dirname(tmp)
    tmp_sorted = martian.make_path('tmp_sorted.bed')
    calls = [[args.sc_cnv_calls, args.internal_cnv_calls],
             [args.sc_unmerged_cnv_calls, args.internal_unmerged_cnv_calls]]
    out_calls = [outs.node_cnv_calls, outs.node_unmerged_cnv_calls]
    for calls, out in zip(calls, out_calls):
        with open(tmp, 'w') as outf:
            for f in calls:
                for l in open(f):
                    fields = l.split()
                    # offset internal node indices by ncells
                    if f == calls[1]:
                        fields[3] = str(int(fields[3]) + ncells)
                    # fix type of confidence field to integer
                    fields[-1] = str(int(float(fields[-1])))
                    # replace index number at start for sorting
                    fields[0] = chrom_index[fields[0]]
                    outf.write('\t'.join(fields) + '\n')

        no_unicode = dict(LC_ALL='C')
        tmp_mem_gib = max(1, int(np.ceil(float(os.path.getsize(tmp)) / (1024**3))))
        try:
            subprocess.check_call(['sort', '-k1,1n', '-k2,2n', '-k3,3n',
                                   '--parallel=1',  # force sort to use 1 thread
                                   '-S', '{}G'.format(tmp_mem_gib),
                                   '-T', tmp_dir,
                                   '-o', tmp_sorted, tmp],
                                  env=no_unicode, stderr=sys.stderr)
        # on some systems, --parallel is unavailable
        except subprocess.CalledProcessError:
            subprocess.check_call(['sort', '-k1,1n', '-k2,2n', '-k3,3n',
                                   # will by default only use 1 thread
                                   '-S', '{}G'.format(tmp_mem_gib),
                                   '-T', tmp_dir,
                                   '-o', tmp_sorted, tmp],
                                  env=no_unicode, stderr=sys.stderr)

        # strip index column into outfile
        with open(out, 'w') as outf:
            version = martian.get_pipelines_version()
            outf.write("#cellranger-dna {}\n".format(version))
            outf.write("#reference genome: {}\n".format(args.reference_path))
            outf.write("#chrom\tstart\tend\tid\tcopy_number\tevent_confidence\n")
            for l in open(tmp_sorted):
                l = l.split('\t')
                l[0] = index_chrom[l[0]]
                outf.write('\t'.join(l))
    os.remove(tmp)
    os.remove(tmp_sorted)

    ## cnv tracks file
    sc_windows = load_h5(args.sc_cnv_tracks, "windows")
    internal_windows = load_h5(args.internal_cnv_tracks, "windows")
    windows = sc_windows.append(internal_windows).values
    constants = load_h5(args.sc_cnv_tracks, "constants")
    sc_ploidy_conf = scale_confidence_score(
        load_h5(args.sc_cnv_tracks, "ploidy_conf").values)
    internal_ploidy_conf = scale_confidence_score(
        load_h5(args.internal_cnv_tracks, "ploidy_conf").values)
    sc_scale_factor = load_h5(args.sc_cnv_tracks, "scale_factor")
    internal_scale_factor = load_h5(args.internal_cnv_tracks, "scale_factor")
    sc_rpb = load_h5(args.sc_cnv_tracks, "reads_per_bin")
    internal_rpb = load_h5(args.internal_cnv_tracks, "reads_per_bin")

    X = load_h5(args.sc_cnv_tracks, "cnv_tracks").values
    nbins = X.shape[1]
    Q = np.zeros((nnodes, nbins), dtype=X.dtype)
    Q[0:ncells, :] = X
    del X
    Q[ncells:, :] = load_h5(args.internal_cnv_tracks, "cnv_tracks").values

    store = pd.HDFStore(outs.node_cnv_tracks, "w")
    store["constants"] = constants
    store["windows"] = sc_windows.append(internal_windows)
    store["ploidy_conf"] = sc_ploidy_conf.append(internal_ploidy_conf)
    store["scale_factor"] = sc_scale_factor.append(internal_scale_factor)
    store["reads_per_bin"] = sc_rpb.append(internal_rpb)
    store["cnv_tracks"] = pd.DataFrame(Q)
    store.close()

    ## Compute heterogeneity and store in tree_data
    ref = contig_manager.contig_manager(args.reference_path)
    chroms = ref.primary_contigs(allow_sex_chromosomes=True)
    if args.tracks is None:
        gmask = np.ones(nbins, dtype=bool)
    else:
        gmask = []
        maptrack = pd.HDFStore(args.tracks, "r")
        for chrom in chroms:
            gmask.extend(maptrack["/map/" + chrom].values > MAPPABILITY_THRESHOLD)
        maptrack.close()
        gmask = np.array(gmask)

    ## update tree data
    # load tree
    store = pd.HDFStore(args.tree_data, "r")
    Z = store["/Z"].values
    distances = store["/distances"].values
    constants = store["/constants"]
    store.close()

    # Compute the heterogeneity at every *internal* node of the tree;
    # obviously the heterogeneity is zero at every leaf, so don't
    # store a bunch of zeros
    levels = 6
    het = compute_heterogeneity(Q, Z, gmask, windows, levels=levels)
    del Q

    # dump to disk
    store = pd.HDFStore(outs.tree_data, "w")
    store["Z"] = pd.DataFrame(Z)
    store["het"] = pd.DataFrame(het)
    store["distances"] = pd.Series(distances)
    store["windows"] = pd.Series(windows)
    store["constants"] = constants
    store.close()
    del het

    ## normalized profiles
    sc_store = pd.HDFStore(args.sc_norm_profiles, "r")
    internal_store = pd.HDFStore(args.internal_norm_profiles, "r")
    out_store = pd.HDFStore(outs.norm_node_profiles, "w")
    out_store["/constants"] = sc_store["/constants"]
    for chrom in chroms:
        ## first do the /contigs
        X = sc_store["/contigs/" + chrom].values
        Y = internal_store["/contigs/" + chrom].values
        assert X.shape[1] == Y.shape[1]
        nbins = X.shape[1]
        Z = np.zeros((2*ncells - 1, nbins), dtype=X.dtype)
        Z[:ncells, :] = X
        Z[ncells:, :] = Y
        out_store["/contigs/" + chrom] = pd.DataFrame(Z)
        del X, Y, Z
        ## next do the /masks
        out_store["/masks/" + chrom] = sc_store["/masks/" + chrom]

    ## gc params
    for key in ["scale", "linear", "quadratic"]:
        out_store["/gc_params/" + key] = pd.concat(
            [sc_store["/gc_params/" + key],
             internal_store["/gc_params/" + key]],
            ignore_index=True)

    ## do the normalization metrics
    out_store["/normalization_metrics"] = sc_store["/normalization_metrics"].append(
        internal_store["/normalization_metrics"], ignore_index=True)

    out_store.close()
    sc_store.close()
    internal_store.close()