def main(args, outs):
    """Estimate the GC-bias coefficients and export them on ``outs``.

    ``estimate_gc_bias`` is run on the raw single-cell profiles and the
    triple stored under ``result['Summary']['quadratic_coefficients']`` is
    unpacked as (quadratic, linear, intercept).  If anything in that step
    raises, a martian alarm is emitted and the neutral defaults
    (intercept, linear, quadratic) = (1.0, 0.0, 0.0) are kept, so the
    exports at the end can never hit an unbound local.

    Args:
        args: stage arguments; ``raw_singlecell_profiles``, ``tracks`` and
            ``reference_path`` are read.
        outs: stage outputs; ``linear`` and ``quadratic`` are assigned.
    """
    raw_profiles, mask = coverage_matrix.load_matrix(
        args.raw_singlecell_profiles, args.reference_path)
    # NOTE(review): ncells is not used below.  The load and the [0] lookup
    # are kept because they make the stage fail early, outside the try, when
    # the profile file cannot be loaded or contains no contig.
    ncells = raw_profiles[0].shape[0]

    # Defaults exported when the estimation below fails.
    (intercept, linear, quadratic) = (1.0, 0.0, 0.0)
    try:
        print('DEBUG 0')
        result = estimate_gc_bias(
            args.raw_singlecell_profiles, args.tracks, args.reference_path)
        print('DEBUG result')
        print(result)
        # Coefficients are stored highest order first.
        (quadratic, linear, intercept) = \
            result['Summary']['quadratic_coefficients']
        print('DEBUG intercept=%f, linear=%f, quadratic=%f' %
              (intercept, linear, quadratic))
    except Exception as error:
        # Best effort by design: report the problem and continue with
        # whatever coefficients are currently bound.
        martian.alarm(
            "stages/copy_number_processor/estimate_gc_bias_coefficients/__init__ encountered an exception. Error: %s"
            % repr(error))

    # Export GC bias coefficients (the intercept is not exported).
    outs.linear = linear
    outs.quadratic = quadratic
def load_data(fn, reference_path):
    """Load the coverage profiles, their masks and the cell barcodes.

    Args:
        fn: path of the HDF5 profile file.  It is read twice: once through
            ``coverage_matrix.load_matrix`` and once directly for the
            ``"barcodes"`` node.
        reference_path: forwarded unchanged to ``load_matrix``.

    Returns:
        A tuple ``(X, Xm, barcodes)`` where ``X`` and ``Xm`` are the two
        values returned by ``load_matrix`` and ``barcodes`` is the
        ``"barcodes"`` node converted to a numpy array.
    """
    X, Xm = coverage_matrix.load_matrix(fn, reference_path)
    store = pd.HDFStore(fn, "r")
    try:
        barcodes = np.array(store["barcodes"])
    finally:
        # Release the file handle even when the barcodes lookup raises.
        store.close()
    return X, Xm, barcodes
def main(args, outs):
    """Write the raw single-cell profiles to the normalized-profiles output.

    The profiles are stored exactly as loaded (no normalization is applied
    in this function), together with the primary contig list, the tracks
    store and the bin size read from the input file.

    Args:
        args: stage arguments; ``raw_singlecell_profiles``,
            ``reference_path`` and ``tracks`` are read.
        outs: stage outputs; ``normalized_singlecell_profiles`` is the path
            handed to ``coverage_matrix.store_matrix``.
    """
    raw_profiles, _mask = coverage_matrix.load_matrix(
        args.raw_singlecell_profiles, args.reference_path)
    chromosomes = coverage_matrix.list_primary_contigs(
        args.raw_singlecell_profiles, args.reference_path)
    print(chromosomes)
    bin_size = coverage_matrix.get_bin_size(args.raw_singlecell_profiles)

    tracks = pd.HDFStore(args.tracks, 'r')
    try:
        coverage_matrix.store_matrix(
            file_name=outs.normalized_singlecell_profiles,
            chroms=chromosomes,
            profiles=raw_profiles,
            tracks=tracks,
            window_size=bin_size)
    finally:
        # Close the tracks file even if writing the matrix fails.
        tracks.close()
def _load_hdf5_file(self, filename, reference_path, reuse_mask_from=None):
    """Populate this instance from an HDF5 profile file.

    Sets ``_contig_manager``, ``_window_size``, ``_barcodes``,
    ``_num_cells``, ``_contig_list`` and the per-contig dictionaries
    ``_contig_coverage``, ``_contig_mask`` and ``_contig_idx``.

    Args:
        filename: path of the HDF5 profile file.
        reference_path: reference used both by ``load_matrix`` and to build
            the contig manager.
        reuse_mask_from: optional object exposing ``_contig_mask``; when
            truthy, its masks are used instead of the ones returned by
            ``load_matrix``.

    Raises:
        ValueError: if ``load_matrix`` returns no contigs.
    """
    X, Xm = load_matrix(filename, reference_path)
    if len(X) == 0:
        raise ValueError(
            "Loading profiles from %s returned zero contigs matching the reference at %s"
            % (filename, reference_path))

    # it's low cost to hang on to a contig_manager instance
    self._contig_manager = contig_manager.contig_manager(reference_path)
    # this list has to match the list inside load_matrix
    primary_contigs = self._contig_manager.primary_contigs(
        allow_sex_chromosomes=True)

    # Only three things are read from the file itself, so the store is
    # closed right after them and on every exit path.
    store = pd.HDFStore(filename, "r")
    try:
        self._window_size = store['constants']['window_size']
        if 'barcodes' in store:
            self._barcodes = np.array(store['barcodes'])
            self._num_cells = len(self._barcodes)
        else:
            # No barcodes stored: take the cell count from the first contig.
            self._barcodes = None
            self._num_cells = len(X[0])
        profile_contigs = set(list_all_contigs(store))
    finally:
        store.close()

    self._contig_list = []
    self._contig_coverage = {}
    self._contig_mask = {}
    self._contig_idx = {}

    # X and Xm hold entries only for primary contigs present in the file,
    # so i advances only when such a contig is consumed.
    i = 0
    for chrom in primary_contigs:
        if chrom not in profile_contigs:
            continue
        self._contig_list.append(chrom)
        self._contig_coverage[chrom] = X[i]
        if reuse_mask_from:
            self._contig_mask[chrom] = reuse_mask_from._contig_mask[chrom]
        else:
            self._contig_mask[chrom] = Xm[i]
        nbins = X[i].shape[1]
        # Positions 1, 1 + window_size, 1 + 2 * window_size, ...
        self._contig_idx[chrom] = np.arange(
            1, nbins * self._window_size, self._window_size)
        i = i + 1
def main(args, outs):
    """Cluster the normalized single-cell profiles and write the result as JSON.

    The fallback result is a single cluster containing every cell index.
    It is written unchanged when ``args.skip_clustering`` is set or when
    ``cluster_jedna.cluster`` raises (a martian alarm is emitted in the
    latter case).

    Args:
        args: stage arguments; ``normalized_singlecell_profiles``,
            ``reference_path`` and ``skip_clustering`` are read.
        outs: stage outputs; ``clusters`` is the path of the JSON file.
    """
    normalized_singlecell_profiles, mask = coverage_matrix.load_matrix(
        args.normalized_singlecell_profiles, args.reference_path)
    print('DEBUG generate_final_clusters/__init__.main():')
    print('normalized_singlecell_profiles[0].shape')
    print(normalized_singlecell_profiles[0].shape)
    ncells = normalized_singlecell_profiles[0].shape[0]

    # list(...) keeps the fallback a concrete list on Python 3 as well,
    # where range() is lazy.
    results = [list(range(ncells))]
    try:
        if args.skip_clustering:
            print('Skipping clustering.')
        else:
            # NOTE: a size guard (ncells < 500) used to short-circuit
            # clustering for large inputs.  It had been disabled with a
            # constant-true condition, so its unreachable "too many cells"
            # branch is dropped here.  The module is to be revisited later.
            results = cluster_jedna.cluster(
                normalized_singlecell_profiles, mask,
                n_merge=25, score_cutoff=5)
    except Exception as error:
        martian.alarm(
            "Clustering encountered an exception. Putting all cells in one cluster. Error: %s"
            % repr(error))

    # The context manager closes the file even if serialization fails.
    with open(outs.clusters, 'w') as out_file:
        out_file.write(tenkit.safe_json.safe_jsonify(results))
def main(args, outs):
    """Call CNV segments for every node on every primary contig and export them.

    For each primary contig (sex chromosomes excluded) and each row ("node")
    of the normalized profile matrix, the profile is masked, merged into
    coarser bins, rescaled by the merged mask and handed to
    ``dvr.partition_profile``.  The returned blocks are tagged with the node
    id and contig name, converted from bin indices to coordinates, and
    collected.  All blocks are written with ``export_segments`` and the
    merged profiles are saved next to the output for debugging.

    Args:
        args: stage arguments; ``params`` (keys ``dvr_segment_length``,
            ``dvr_f_cutoff``, ``dvr_trim_level``, ``dvr_min_length``),
            ``raw_profiles``, ``normalized_profiles`` and ``reference_path``
            are read.
        outs: stage outputs; ``cnvs`` is the segment output path and the
            prefix of the ``_profiles.npy`` debug file.
    """
    # Read in heuristics
    dvr_segment_length = int(args.params["dvr_segment_length"])
    dvr_f_cutoff = args.params["dvr_f_cutoff"]
    dvr_trim_level = args.params["dvr_trim_level"]
    dvr_min_length = int(args.params["dvr_min_length"])
    #
    print('-' * 80)
    print('Entering __init__.main()')
    #
    # Fallbacks and targets for the per-node bin resolution.
    default_n_merge = 50
    default_bin_size = 2e4
    target_bin_count = 200.0
    confident_genome_fraction = 1.0
    raw_profiles, _ = coverage_matrix.load_matrix(args.raw_profiles,
                                                  args.reference_path)
    #
    # Iterate over all chrom_name
    chromosomes = coverage_matrix.list_primary_contigs(
        args.normalized_profiles,
        args.reference_path,
        allow_sex_chromosomes=False)  # TODO: allow sex chromosomes
    # NOTE(review): this store is closed only after the contig loop; an
    # exception inside the loop leaves it open.
    normalized_profiles_h5 = pd.HDFStore(args.normalized_profiles, 'r')
    try:
        original_bin_size = normalized_profiles_h5["constants"]["window_size"]
    except Exception as error:
        # Missing or unreadable constant: fall back to default_bin_size.
        print('__init__.main() caught an error %s' % repr(error))
        original_bin_size = default_bin_size
    # try/except
    #
    segments = []
    profiles_list = []
    for chrom_name in chromosomes:
        tmp = []
        chr_profiles = normalized_profiles_h5['/contigs/' + chrom_name].astype(
            np.float64).values
        mask = coverage_matrix.contig_to_mask(normalized_profiles_h5, chrom_name)
        # One mask entry per bin (column) of the contig's profile matrix.
        assert (mask.shape[0] == chr_profiles.shape[1])
        n_nodes = chr_profiles.shape[0]
        for node_id in xrange(n_nodes):
            # Dynamically determine bin resolution
            # NOTE(review): these two calls depend only on node_id, yet they
            # are repeated for every contig.
            raw_cell_counts = scn.get_single_cell_counts(raw_profiles, node_id)
            n_bins = raw_cell_counts.shape[0]
            n_merge = scn.estimate_n_merge(raw_cell_counts, target_bin_count,
                                           confident_genome_fraction)
            # Use the default when the estimate is NaN, not finite, or
            # larger than the number of available bins.
            if (np.isnan(n_merge) | ~np.isfinite(n_merge) | (n_merge > n_bins)):
                n_merge = default_n_merge
            # if NaN
            bin_size = original_bin_size * n_merge
            #
            # Merge the mask itself with averaging; merged bins whose value
            # is <= 0.2 are set to NaN, which makes the division below yield
            # NaN for them.
            weights = mask.copy()
            weights = weights.astype(float)
            weights = scn.merge_bins_single_cell_single_chrom(weights, n_merge,
                                                              average=True)
            weights[weights <= 0.2] = np.nan
            #
            # Row of chr_profiles; the masking below assigns through it in
            # place (presumably a view into chr_profiles -- confirm).
            cnv_profile = chr_profiles[node_id, :]
            cnv_profile[~mask] = 0.0
            #
            # Merge bins:
            cnv_profile = scn.merge_bins_single_cell_single_chrom(cnv_profile,
                                                                  n_merge,
                                                                  average=True)
            # Rescale by the merged mask so that zeroed (masked) bins do not
            # pull the merged value down.
            cnv_profile /= weights
            #
            # Call CNVs on this profile. The result is an array with the copy number
            # per bin at each node_id.
            #
            block = dvr.partition_profile(cnv_profile,
                                          segment_length=dvr_segment_length,
                                          f_cutoff=dvr_f_cutoff,
                                          trim_level=dvr_trim_level,
                                          min_length=dvr_min_length)
            block['NodeID'] = node_id
            block['Chr'] = chrom_name
            # Convert merged-bin indices to coordinates: Start becomes
            # index * bin_size + 1 and End becomes (index + 1) * bin_size.
            block['Start'] = block['Start'] * bin_size + 1
            block['End'] = (block['End'] + 1) * bin_size
            segments.append(block)
            #
            # Debugging:
            tmp.append(cnv_profile)
            #
        # for node_id
        #
        # Debugging:
        profiles_list.append(tmp)
        #
    # for chrom_name
    normalized_profiles_h5.close()
    #
    export_segments(outs.cnvs, segments)
    #
    # Debugging: one list of merged profiles per contig.
    np.save(outs.cnvs + '_profiles.npy', profiles_list)
    print('Leaving __init__.main()')
    print('.' * 80)
def load_data(file_name, reference_path):
    """Return the ``(profiles, mask)`` pair loaded from ``file_name``.

    Thin wrapper around ``coverage_matrix.load_matrix``; the two values it
    returns are handed back as a tuple.
    """
    loaded_profiles, loaded_mask = coverage_matrix.load_matrix(
        file_name, reference_path)
    pair = (loaded_profiles, loaded_mask)
    return pair
def main(args, outs):
    """Score the CNV calls of one chunk of nodes, merged and unmerged.

    The raw profiles of the nodes in ``args.chunk`` are loaded, the merged
    CNV calls are passed through ``ccs.process_cnv_calls`` and exported, and
    then the unmerged calls are processed the same way, reusing the ``logp``
    returned by the first pass.
    """
    martian.log_info('Entering __init__.main()')

    # Node range of this chunk; the end is exclusive.
    first_node = args.chunk['start']
    stop_node = args.chunk['end']

    raw_profiles, mask = coverage_matrix.load_matrix(
        args.raw_profiles,
        args.reference_path,
        start_cell=first_node,
        end_cell=stop_node)
    bin_size = coverage_matrix.get_bin_size(args.raw_profiles)

    # Read in CNV data for the nodes of interest.
    node_column = COLUMN_NAMES.index("NodeID")
    cnv_calls = read_cnv_data(
        args.cnv_calls, first_node, stop_node, node_column)
    scale = get_scaling_factors(raw_profiles, cnv_calls)

    with open(args.gc_norm_params, "r") as handle:
        gc_norm_params = json.load(handle)
    linear, quadratic = gc_norm_params["linear"], gc_norm_params["quadratic"]

    # Mappability and GC content tracks.
    ref = contig_manager.contig_manager(args.reference_path)
    bin_parameters = []
    vesna.load_track_parameters(args.tracks, bin_parameters, ref)

    def score_calls(calls, **extra):
        # Both passes share every argument except the calls and logp.
        return ccs.process_cnv_calls(
            raw_profiles, mask, bin_parameters, args.reference_path,
            args.sex, scale, linear, quadratic, calls, bin_size, **extra)

    # Pass 1: merged calls.
    logp, cnv_calls2 = score_calls(cnv_calls)
    export_segments(outs.cnvs, cnv_calls2, first_node)
    # Free some memory before the second pass.
    del cnv_calls, cnv_calls2

    # Pass 2: confidence values for the unmerged, broken-up CNV calls.
    unmerged_cnv_calls = read_cnv_data(
        args.unmerged_cnv_calls, first_node, stop_node, node_column)
    _, unmerged_cnv_calls2 = score_calls(unmerged_cnv_calls, logp=logp)
    export_segments(outs.unmerged_cnvs, unmerged_cnv_calls2, first_node)

    martian.log_info('Leaving __init__.main()')
    martian.log_info('.' * 80)
def main(args, outs):
    """Normalize the raw single-cell profiles for mappability and GC bias.

    For every primary contig the expected relative coverage per bin is

        mappability * (1 + linear * g + quadratic * g * g)

    where ``g`` is the value returned by ``get_gc`` for the reference GC
    level ``gc0``.  Each cell's raw counts are divided by that expectation
    and negative results are clamped to zero.  A contig whose processing
    raises is reported (martian alarm plus stdout) and left out of the
    output.  The normalized profiles are written with
    ``coverage_matrix.store_matrix`` using the bin size read from the input
    profile file.

    Args:
        args: stage arguments; ``raw_singlecell_profiles``,
            ``reference_path``, ``tracks``, ``linear`` and ``quadratic``
            are read.
        outs: stage outputs; ``normalized_singlecell_profiles`` is the
            output path.
    """
    raw_profiles, mask = coverage_matrix.load_matrix(
        args.raw_singlecell_profiles, args.reference_path)
    print('len(mask)=%d' % len(mask))
    print('len(raw_profiles)=%d' % len(raw_profiles))
    chromosomes = coverage_matrix.list_primary_contigs(
        args.raw_singlecell_profiles, args.reference_path)
    print('chromosomes:')
    print(chromosomes)

    # Get mappability, GC content:
    bin_parameters = []
    vesna.load_track_parameters(args.tracks, bin_parameters)
    n_cells = raw_profiles[0].shape[0]
    linear = args.linear
    quadratic = args.quadratic
    gc0 = 0.45  # TODO: Replace this with mean of GC in good bins across entire genome

    normalized_profiles = []
    # Contigs that were normalized successfully, in input order; this stays
    # aligned with normalized_profiles.
    kept_chromosomes = []
    for chrom_index, chrom_name in enumerate(chromosomes):
        try:
            # NOTE(review): ordered_chromosomes is not defined inside this
            # function and must exist at module level.  If it does not, the
            # NameError is swallowed by the except below and every contig
            # is dropped.
            mappability = get_mappability(bin_parameters, chrom_name,
                                          ordered_chromosomes)
            gc_gc0 = get_gc(bin_parameters, gc0, chrom_name,
                            ordered_chromosomes)
            print('len(mappability)=%d' % len(mappability))
            print('len(gc_gc0)=%d' % len(gc_gc0))
            print('raw_profiles[chrom_index].shape:')
            print(raw_profiles[chrom_index].shape)
            expectation = mappability * (
                1.0 + linear * gc_gc0 + quadratic * gc_gc0 * gc_gc0)
            tmp = np.zeros(raw_profiles[chrom_index].shape, dtype='float')
            for cell in range(n_cells):
                tmp[cell, :] = raw_profiles[chrom_index][cell, :] / expectation
                # Clamp negative normalized values to zero.
                tmp[cell, tmp[cell, :] < 0.0] = 0.0
            normalized_profiles.append(tmp)
            kept_chromosomes.append(chrom_name)
        except Exception as error:
            # Best effort by design: report the contig and leave it out.
            message = (
                "stages/copy_number_processor/normalize_gc_bias/__init__ encountered an exception. Error: %s"
                % repr(error))
            martian.alarm(message)
            print(message)
            print(
                'Removing chrom_name=%s, chrom_index=%d (absent from input raw profiles)'
                % (chrom_name, chrom_index))

    # Export normalized cell profiles.  The bin size is read from the input
    # profile file instead of being assumed to be 20000.
    bin_size = coverage_matrix.get_bin_size(args.raw_singlecell_profiles)
    tracks = pd.HDFStore(args.tracks, 'r')
    try:
        coverage_matrix.store_matrix(
            file_name=outs.normalized_singlecell_profiles,
            chroms=kept_chromosomes,
            profiles=normalized_profiles,
            tracks=tracks,
            window_size=bin_size)
    finally:
        # Close the tracks file even if writing the matrix fails.
        tracks.close()