def test_resize_kernel():
    """
    Check that resized kernels are square, have the expected clamped size,
    agree between the two resizing interfaces and keep their maximum value
    on the central pixel.
    """
    # Smallest and largest side length allowed for resized kernels
    min_allowed_dim, max_allowed_dim = 5, 101
    side = 15
    middle = side // 2
    # A single bright pixel in the middle makes centering easy to verify
    point_kernel = np.zeros((side, side))
    point_kernel[middle, middle] = 10
    resolutions = [3, 900, 5000, 10000]
    # Every (source, target) resolution pair, source varying slowest
    res_pairs = [(src, tgt) for src in resolutions for tgt in resolutions]
    size_limits = {"min_size": min_allowed_dim, "max_size": max_allowed_dim}
    for kernel_res, signal_res in res_pairs:
        # Expected side length before clamping, brought down to an odd value
        exp_dim = int(side * kernel_res / signal_res)
        if exp_dim % 2 == 0:
            exp_dim -= 1
        # Resize once from explicit resolutions...
        obs_kernel = preproc.resize_kernel(
            point_kernel,
            kernel_res=kernel_res,
            signal_res=signal_res,
            **size_limits,
        )
        # ...and once from the equivalent scaling factor
        obs_kernel_factor = preproc.resize_kernel(
            point_kernel, factor=kernel_res / signal_res, **size_limits
        )
        obs_dim = obs_kernel.shape[0]
        assert obs_dim == obs_kernel.shape[1]
        assert obs_dim == obs_kernel_factor.shape[0]
        assert obs_dim == max(min(max_allowed_dim, exp_dim), min_allowed_dim)
        mid = obs_dim // 2
        assert np.max(obs_kernel) == obs_kernel[mid, mid]
def test_missing_corr(cfg):
    """Test if chromosight's correlation scores are identical to
    scipy.stats' pearsonr values when some bins are missing.

    The kernel is correlated with a 10x zoomed version of itself in which a
    few rows have been blanked out and flagged as missing. The score observed
    at the center pixel is compared to a Pearson coefficient computed by hand
    on the same window, after dropping the masked pixels.

    Parameters
    ----------
    cfg : dict
        Kernel configuration; only ``cfg["kernels"][0]`` is read.
    """
    # We'll correlate the kernel with a zoomed (centered) version
    # of itself
    kernel = np.array(cfg["kernels"][0])
    mat = sp.csr_matrix(cup.resize_kernel(kernel, factor=10))
    # Mask rows in the matrix to simulate missing bins
    mask = sp.csr_matrix(mat.shape, dtype=bool)
    center_m, center_n = np.array(mat.shape) // 2
    # Half extents (center included) of the kernel along rows and columns
    kh, kw = np.array(kernel.shape) // 2 + 1
    miss_rows = np.array([-2, 1, 2])
    miss_rows += center_m
    mat[miss_rows, :] = 0.0
    mask[miss_rows, :] = True
    # Use triu to simulate intrachromosomal matrix
    mat = sp.triu(mat).tocsr()
    # Convolute kernel on the fake map
    corr_mat = cud.normxcorr2(
        mat, kernel, missing_mask=mask, sym_upper=True, full=True
    )[0]
    # Retrieve the center pixel: the correlation between the
    # kernel and the center of the zoomed kernel.
    obs = corr_mat[center_m, center_n]
    # Compute Pearson correlation between the center of the
    # zoomed kernel and the kernel, after removing missing values.
    # Rows are bounded with the row center and kernel height, columns with
    # the column center and kernel width, so that the window has the same
    # shape as the kernel even if it is not square.
    top, bottom = center_m - kh + 1, center_m + kh
    left, right = center_n - kw + 1, center_n + kw
    # The lower triangle (diagonal included) is also flagged as missing
    mask += sp.tril(sp.csr_matrix(np.ones(mask.shape, dtype=bool)))
    flat_mask = mask.toarray()[top:bottom, left:right].flat == 1
    exp = pearsonr(
        mat[top:bottom, left:right].toarray().flat[~flat_mask],
        kernel.flat[~flat_mask],
    )[0]
    assert np.isclose(obs, exp, rtol=0.1)
def cmd_detect(arguments):
    """Run the ``detect`` command: find occurrences of a pattern in a
    contact map and write their coordinates and windows to disk.

    A kernel configuration (preset or user provided) is loaded, the contact
    map is preprocessed, then ``_detect_sub_mat`` is applied to every sub
    matrix for each kernel and each iteration. Detected patterns are
    filtered by separation and by distance to the diagonal before being
    written to the output directory, along with an optional pileup plot.

    Parameters
    ----------
    arguments : dict
        Parsed command line arguments, indexed by option name
        (e.g. ``"--pattern"``, ``"<contact_map>"``, ``"<output>"``).
    """
    # Parse command line arguments for detect
    kernel_config_path = arguments["--kernel-config"]
    dump = arguments["--dump"]
    interchrom = arguments["--inter"]
    iterations = arguments["--iterations"]
    mat_path = arguments["<contact_map>"]
    max_dist = arguments["--max-dist"]
    min_dist = arguments["--min-dist"]
    min_separation = arguments["--min-separation"]
    n_mads = float(arguments["--n-mads"])
    pattern = arguments["--pattern"]
    perc_undetected = arguments["--perc-undetected"]
    precision = arguments["--precision"]
    resize = arguments["--resize-kernel"]
    threads = arguments["--threads"]
    output = arguments["<output>"]
    win_fmt = arguments["--win-fmt"]
    subsample = arguments["--subsample"]
    if subsample == "no":
        subsample = None
    plotting_enabled = not arguments["--no-plotting"]
    smooth_trend = arguments["--smooth-trend"]
    if smooth_trend is None:
        smooth_trend = False
    # If output is not specified, use current directory
    output = pathlib.Path(output) if output else pathlib.Path()
    output.mkdir(exist_ok=True)

    if win_fmt not in ["npy", "json"]:
        sys.stderr.write("Error: --win-fmt must be either json or npy.\n")
        sys.exit(1)
    # Read a user-provided kernel config if custom is true
    # Else, load a preset kernel config for input pattern
    # Configs are JSON files containing all parameter associated with the pattern
    # They are loaded into a dictionary in the form :
    # {"max_iterations": 3, "kernels": [kernel1, kernel2, ...], ...}
    # Where each kernel is a 2D numpy array representing the pattern
    if kernel_config_path is not None:
        custom = True
        # Loading input path as config
        config_path = kernel_config_path
    else:
        custom = False
        # Will use a preset config file matching pattern name
        config_path = pattern

    ### 0: LOAD INPUT
    params = {
        "max_iterations": (iterations, int),
        "precision": (precision, float),
        "max_dist": (max_dist, int),
        "min_dist": (min_dist, int),
        "min_separation": (min_separation, int),
        "max_perc_undetected": (perc_undetected, float),
    }
    kernel_config = cio.load_kernel_config(config_path, custom)
    for param_name, (param_value, param_type) in params.items():
        kernel_config = _override_kernel_config(
            param_name, param_value, param_type, kernel_config
        )

    # NOTE: Temporary warning
    if interchrom:
        sys.stderr.write(
            "WARNING: Detection on interchromosomal matrices is expensive in RAM\n"
        )
    hic_genome = HicGenome(
        mat_path,
        inter=interchrom,
        kernel_config=kernel_config,
        dump=dump,
        smooth=smooth_trend,
    )

    ### 1: Process input signal
    # Adapt size of kernel matrices based on the signal resolution
    if resize:
        for k_idx, mat in enumerate(kernel_config["kernels"]):
            kernel_config["kernels"][k_idx] = resize_kernel(
                mat,
                kernel_res=kernel_config["resolution"],
                signal_res=hic_genome.resolution,
            )
    hic_genome.kernel_config = kernel_config
    # Subsample Hi-C contacts from the matrix, if requested
    # NOTE: Subsampling has to be done before normalisation
    hic_genome.subsample(subsample)
    # Normalize (balance) matrix using ICE
    hic_genome.normalize(n_mads=n_mads)
    # Define how many diagonals should be used in intra-matrices
    hic_genome.compute_max_dist()
    # Split whole genome matrix into intra- and inter- sub matrices. Each sub
    # matrix is processed on the fly (obs / exp, trimming diagonals > max dist)
    hic_genome.make_sub_matrices()

    all_pattern_coords = []
    all_pattern_windows = []

    ### 2: DETECTION ON EACH SUBMATRIX
    n_sub_mats = hic_genome.sub_mats.shape[0]
    # Loop over the different kernel matrices for input pattern
    run_id = 0
    total_runs = (
        len(kernel_config["kernels"]) * kernel_config["max_iterations"]
    )
    # Defaults so that the final progress report is always well defined,
    # even if the loops below do not run at all
    kernel_id = iteration = 0
    sys.stderr.write("Detecting patterns...\n")
    pool = mp.Pool(int(threads))
    # The pool is released whatever happens during detection
    try:
        for kernel_id, kernel_matrix in enumerate(kernel_config["kernels"]):
            # Adjust kernel iteratively
            for iteration in range(kernel_config["max_iterations"]):
                cio.progress(
                    run_id,
                    total_runs,
                    f"Kernel: {kernel_id}, Iteration: {iteration}\n",
                )
                # Apply detection procedure to all sub matrices in parallel
                sub_mat_data = zip(
                    hic_genome.sub_mats.iterrows(),
                    [kernel_config] * n_sub_mats,
                    [kernel_matrix] * n_sub_mats,
                    [dump] * n_sub_mats,
                )
                # Run detection in parallel on different sub matrices, and
                # show progress when gathering results. A dedicated counter
                # is used here so that the iteration number is not
                # overwritten by the number of gathered results.
                # NOTE(review): results arrive unordered, so the label shown
                # is the n-th sub matrix of the table, which is not
                # necessarily the one that just finished.
                sub_mat_results = []
                gathered = pool.imap_unordered(
                    _detect_sub_mat, sub_mat_data, 1
                )
                for n_done, result in enumerate(gathered):
                    chr1 = hic_genome.sub_mats.chr1[n_done]
                    chr2 = hic_genome.sub_mats.chr2[n_done]
                    cio.progress(n_done, n_sub_mats, f"{chr1}-{chr2}")
                    sub_mat_results.append(result)
                # Convert coordinates from chromosome to whole genome bins
                kernel_coords = [
                    hic_genome.get_full_mat_pattern(
                        d["chr1"], d["chr2"], d["coords"]
                    )
                    for d in sub_mat_results
                    if d["coords"] is not None
                ]

                # Gather newly detected pattern coordinates
                try:
                    # Extract surrounding windows for each sub_matrix
                    kernel_windows = np.concatenate(
                        [
                            w["windows"]
                            for w in sub_mat_results
                            if w["windows"] is not None
                        ],
                        axis=0,
                    )
                    all_pattern_coords.append(
                        pd.concat(kernel_coords, axis=0).reset_index(
                            drop=True
                        )
                    )
                    # Add info about kernel and iteration which detected
                    # these patterns
                    all_pattern_coords[-1]["kernel_id"] = kernel_id
                    all_pattern_coords[-1]["iteration"] = iteration
                    all_pattern_windows.append(kernel_windows)
                # If no pattern was found with this kernel
                # skip directly to the next one, skipping iterations
                except ValueError:
                    break

                # Update kernel with patterns detected at current iteration
                kernel_matrix = cid.pileup_patterns(kernel_windows)
                run_id += 1
    finally:
        pool.close()
        pool.join()
    cio.progress(
        run_id, total_runs, f"Kernel: {kernel_id}, Iteration: {iteration}\n"
    )

    # If no pattern detected on any chromosome, with any kernel, exit gracefully
    if not all_pattern_coords:
        sys.stderr.write("No pattern detected ! Exiting.\n")
        sys.exit(0)

    # Combine patterns of all kernel matrices into a single array
    all_pattern_coords = pd.concat(all_pattern_coords, axis=0).reset_index(
        drop=True
    )
    # Combine all windows from different kernels into a single pile of windows
    all_pattern_windows = np.concatenate(all_pattern_windows, axis=0)

    # Compute minimum separation in bins and make sure it has a reasonable value
    separation_bins = int(
        kernel_config["min_separation"] // hic_genome.resolution
    )
    if separation_bins < 1:
        separation_bins = 1
    print(f"Minimum pattern separation is : {separation_bins}")
    # Remove patterns with overlapping windows (smeared patterns)
    distinct_patterns = cid.remove_neighbours(
        all_pattern_coords, win_size=separation_bins
    )

    # Drop patterns that are too close to each other
    all_pattern_coords = all_pattern_coords.loc[distinct_patterns, :]
    all_pattern_windows = all_pattern_windows[distinct_patterns, :, :]

    # Get from bins into basepair coordinates
    coords_1 = hic_genome.bins_to_coords(all_pattern_coords.bin1).reset_index(
        drop=True
    )
    coords_1.columns = [str(col) + "1" for col in coords_1.columns]
    coords_2 = hic_genome.bins_to_coords(all_pattern_coords.bin2).reset_index(
        drop=True
    )
    coords_2.columns = [str(col) + "2" for col in coords_2.columns]

    all_pattern_coords = pd.concat(
        [all_pattern_coords.reset_index(drop=True), coords_1, coords_2], axis=1
    )

    # Filter patterns closer than minimum distance from the diagonal if any
    min_dist_drop_mask = (
        all_pattern_coords.chrom1 == all_pattern_coords.chrom2
    ) & (
        np.abs(all_pattern_coords.start2 - all_pattern_coords.start1)
        < int(kernel_config["min_dist"])
    )
    # Reorder columns at the same time
    all_pattern_coords = all_pattern_coords.loc[
        ~min_dist_drop_mask,
        [
            "chrom1",
            "start1",
            "end1",
            "chrom2",
            "start2",
            "end2",
            "bin1",
            "bin2",
            "kernel_id",
            "iteration",
            "score",
        ],
    ]
    all_pattern_windows = all_pattern_windows[~min_dist_drop_mask, :, :]

    ### 3: WRITE OUTPUT
    sys.stderr.write(f"{all_pattern_coords.shape[0]} patterns detected\n")
    # Save patterns and their coordinates in a tsv file
    cio.write_patterns(
        all_pattern_coords, kernel_config["name"] + "_out", output
    )
    # Save windows as an array in an npy file
    cio.save_windows(
        all_pattern_windows,
        kernel_config["name"] + "_out",
        output,
        format=win_fmt,
    )

    # Generate pileup visualisations if requested
    if plotting_enabled:
        # Compute and plot pileup
        pileup_fname = ("pileup_of_{n}_{pattern}").format(
            pattern=kernel_config["name"], n=all_pattern_windows.shape[0]
        )
        windows_pileup = cid.pileup_patterns(all_pattern_windows)
        pileup_plot(windows_pileup, name=pileup_fname, output=output)
def cmd_generate_config(arguments):
    """Run the ``generate-config`` command: write a kernel configuration
    (JSON file plus one text file per kernel) based on a preset.

    Kernels of the preset are optionally resized to ``--win-size``. If
    ``--click`` is given, it is used as the contact map from which the user
    picks windows interactively; their smoothed pileup then replaces the
    preset kernels.

    Parameters
    ----------
    arguments : dict
        Parsed command line arguments, indexed by option name
        (``"<prefix>"``, ``"--preset"``, ``"--click"``, ``"--n-mads"``,
        ``"--win-size"``).
    """
    # Parse command line arguments for generate_config
    prefix = arguments["<prefix>"]
    pattern = arguments["--preset"]
    click_find = arguments["--click"]
    n_mads = float(arguments["--n-mads"])
    win_size = arguments["--win-size"]

    cfg = cio.load_kernel_config(pattern, False)

    # If prefix involves a directory, create it
    prefix_dir = os.path.dirname(prefix)
    if prefix_dir:
        os.makedirs(prefix_dir, exist_ok=True)

    # If a specific window size is requested, resize all kernels
    if win_size != "auto":
        win_size = int(win_size)
        cfg["kernels"] = [
            resize_kernel(k, factor=win_size / k.shape[0])
            for k in cfg["kernels"]
        ]
    # Otherwise, just inherit window size from the kernel config
    else:
        win_size = cfg["kernels"][0].shape[0]

    # If click mode is enabled, build a kernel from scratch using
    # graphical display, otherwise, just inherit the pattern's kernel
    if click_find:
        hic_genome = HicGenome(
            click_find,
            inter=True,
            kernel_config=cfg,
        )
        # Normalize (balance) the whole genome matrix
        hic_genome.normalize(n_mads=n_mads)
        # enforce full scanning distance in kernel config
        hic_genome.max_dist = hic_genome.matrix.shape[0] * hic_genome.resolution
        # Process each sub-matrix individually (detrend diag for intra)
        hic_genome.make_sub_matrices()
        processed_mat = hic_genome.gather_sub_matrices().tocsr()
        windows = click_finder(processed_mat, half_w=int((win_size - 1) / 2))
        # Pileup all recorded windows and convert to JSON serializable list
        pileup = ndi.gaussian_filter(cid.pileup_patterns(windows), 1)
        cfg["kernels"] = [pileup.tolist()]
        # Show the newly generated kernel to the user. The values are log10
        # transformed so that they match the label of the colorbar.
        # NOTE(review): vmax is computed on the untransformed pileup while
        # the image is on a log scale - confirm this is intended.
        hm = plt.imshow(
            np.log10(pileup),
            vmax=np.percentile(pileup, 99),
            cmap="afmhot_r",
        )
        cbar = plt.colorbar(hm)
        cbar.set_label("Log10 Hi-C contacts")
        plt.title("Manually generated kernel")
        plt.show()

    # Write kernel matrices to files with input prefix and replace kernels
    # by their path in config
    for mat_id, mat in enumerate(cfg["kernels"]):
        mat_path = f"{prefix}.{mat_id+1}.txt"
        np.savetxt(mat_path, mat)
        cfg["kernels"][mat_id] = mat_path

    # Write config to JSON file using prefix
    with open(f"{prefix}.json", "w") as config_handle:
        json.dump(cfg, config_handle, indent=4)
def cmd_quantify(arguments):
    """Run the ``quantify`` command: score a list of 2D positions against
    the kernels of a pattern.

    For each entry of the input 2D BED file, the window centered on the
    middle of both intervals is extracted from the processed contact map and
    its Pearson correlation with each kernel is computed; the best score is
    kept. Scores are written to ``<pattern>_quant.txt`` and the windows are
    saved with ``cio.save_windows`` in the output directory.

    Parameters
    ----------
    arguments : dict
        Parsed command line arguments, indexed by option name
        (e.g. ``"<bed2d>"``, ``"<contact_map>"``, ``"<output>"``).

    Raises
    ------
    ValueError
        If an even ``--win-size`` is requested.
    """
    bed2d_path = arguments["<bed2d>"]
    mat_path = arguments["<contact_map>"]
    output = pathlib.Path(arguments["<output>"])
    n_mads = float(arguments["--n-mads"])
    pattern = arguments["--pattern"]
    inter = arguments["--inter"]
    win_size = arguments["--win-size"]
    if win_size != "auto":
        win_size = int(win_size)
        # Windows are extracted as center +/- half size: an even size cannot
        # be honoured and would only fail later when storing windows.
        if not win_size % 2:
            raise ValueError("--win-size must be odd")
    subsample = arguments["--subsample"]
    # Create directory if it does not exist
    if not output.exists():
        os.makedirs(output, exist_ok=True)
    # Load 6 cols from 2D BED file and infer header
    bed2d = cio.load_bed2d(bed2d_path)
    # Warn user if --inter is disabled but list contains inter patterns
    if not inter and len(bed2d.start1[bed2d.chrom1 != bed2d.chrom2]) > 0:
        sys.stderr.write(
            "Warning: The bed2d file contains interchromosomal patterns. "
            "These patterns will not be scanned unless --inter is used.\n"
        )
    # Parse kernel config
    kernel_config = cio.load_kernel_config(pattern, False)
    # Instantiate and preprocess contact map
    hic_genome = HicGenome(mat_path, inter=inter, kernel_config=kernel_config)
    # enforce full scanning distance in kernel config
    kernel_config["max_dist"] = (
        hic_genome.matrix.shape[0] * hic_genome.resolution
    )
    kernel_config["min_dist"] = 0
    # Notify contact map instance of changes in scanning distance
    hic_genome.kernel_config = kernel_config
    # Subsample Hi-C contacts from the matrix, if requested
    if subsample != "no":
        hic_genome.subsample(subsample)
    # Normalize (balance) matrix using ICE
    hic_genome.normalize(n_mads)
    # Define how many diagonals should be used in intra-matrices
    hic_genome.compute_max_dist()
    # Split whole genome matrix into intra- and inter- sub matrices. Each sub
    # matrix is processed on the fly (obs / exp, trimming diagonals > max dist)
    hic_genome.make_sub_matrices()
    # Initialize output structures
    bed2d["score"] = 0.0
    positions = bed2d.copy()
    if win_size != "auto":
        km = kn = win_size
    else:
        km, kn = kernel_config["kernels"][0].shape
    windows = np.zeros((positions.shape[0], km, kn))
    # For each position, we use the center of the BED interval
    positions["pos1"] = (positions.start1 + positions.end1) // 2
    positions["pos2"] = (positions.start2 + positions.end2) // 2
    # Half height and width of the windows (same for all kernels)
    kh = (km - 1) // 2
    kw = (kn - 1) // 2
    # Use each kernel matrix available for the pattern
    for kernel_id, kernel_matrix in enumerate(kernel_config["kernels"]):
        # Only resize kernel matrix if explicitly requested. The factor is
        # relative to the current size of the kernel, not to the target
        # size (which would always yield a factor of 1).
        if win_size != "auto":
            kernel_matrix = resize_kernel(
                kernel_matrix, factor=win_size / kernel_matrix.shape[0]
            )
        # Iterate over intra- and inter-chromosomal sub-matrices
        for _, mat in hic_genome.sub_mats.iterrows():
            # Filter patterns falling onto this sub-matrix. Work on a copy
            # since bin columns are added below.
            sub_pat = positions.loc[
                (positions.chrom1 == mat.chr1)
                & (positions.chrom2 == mat.chr2)
            ].copy()
            sub_pat_idx = sub_pat.index.values
            # Convert genomic coordinates to bins for horizontal and vertical axes
            for ax in [1, 2]:
                sub_pat_ax = sub_pat.loc[:, [f"chrom{ax}", f"pos{ax}"]].rename(
                    columns={f"chrom{ax}": "chrom", f"pos{ax}": "pos"}
                )
                sub_pat[f"bin{ax}"] = hic_genome.coords_to_bins(sub_pat_ax)
            # Check for nan bins (coords that do not match any Hi-C fragments)
            fall_out = np.isnan(sub_pat["bin1"]) | np.isnan(sub_pat["bin2"])
            if np.any(fall_out):
                n_out = int(np.sum(fall_out))
                sys.stderr.write(
                    f"{n_out} entr{'ies' if n_out > 1 else 'y'} outside "
                    "genomic coordinates of the Hi-C matrix will be ignored.\n"
                )
            # Convert bins from whole genome matrix to sub matrix
            sub_pat = hic_genome.get_sub_mat_pattern(
                mat.chr1, mat.chr2, sub_pat
            )
            m = mat.contact_map.matrix.tocsr()
            # Iterate over patterns from the 2D BED file
            for i, x, y in zip(sub_pat_idx, sub_pat.bin1, sub_pat.bin2):
                # Check if the window goes out of bound
                if np.all(np.isfinite([x, y])) and (
                    x - kh >= 0
                    and x + kh + 1 < m.shape[0]
                    and y - kw >= 0
                    and y + kw + 1 < m.shape[1]
                ):
                    x = int(x)
                    y = int(y)
                    # For each pattern, compute correlation score with all kernels
                    # but only keep the best
                    win = m[x - kh : x + kh + 1, y - kw : y + kw + 1].toarray()
                    try:
                        score = ss.pearsonr(
                            win.flatten(), kernel_matrix.flatten()
                        )[0]
                    # In case of NaNs introduced by division by 0 during detrend
                    except ValueError:
                        score = 0
                    # Label based scalar access: chained assignment
                    # (bed2d["score"][i] = ...) may write to a temporary copy
                    if score > bed2d.at[i, "score"] or kernel_id == 0:
                        bed2d.at[i, "score"] = score
                # Pattern falls outside or at the edge of the matrix
                else:
                    win = np.zeros((km, kn))
                    bed2d.at[i, "score"] = np.nan
                # Windows do not depend on the kernel: store them only once
                if kernel_id == 0:
                    windows[i, :, :] = win
    bed2d.to_csv(
        output / f"{pattern}_quant.txt", sep="\t", header=True, index=False
    )
    cio.save_windows(
        windows,
        f"{pattern}_quant",
        output_dir=output,
        format=arguments["--win-fmt"],
    )
def logo_version(logo, ver):
    """Return a string made of an ASCII rendering of the logo followed by
    the Chromosight version.

    Parameters
    ----------
    logo :
        Logo matrix, passed to ``resize_kernel`` to be shrunk by a factor
        of 0.33 before rendering.
    ver :
        Version identifier inserted in the returned string.
    """
    # Shrink the logo, then render it as uncolored text without printing it
    ascii_logo = print_ascii_mat(
        resize_kernel(logo, factor=0.33, quiet=True),
        colored=False,
        print_str=False,
    )
    return f"{ascii_logo} Chromosight version {ver}"
def cmd_detect(args):
    """Run the ``detect`` command: find occurrences of a pattern in a
    contact map and write their coordinates, windows and pileup.

    A kernel configuration (preset or user provided) is loaded and
    optionally resized, the contact map is preprocessed, then
    ``_detect_sub_mat`` is applied to every sub matrix for each kernel and
    each iteration. Detected patterns are filtered (separation, distance to
    the diagonal, missing p-values), q-values are computed and results are
    written to files starting with the output prefix.

    Parameters
    ----------
    args : dict
        Parsed command line arguments, indexed by option name
        (e.g. ``"--pattern"``, ``"<contact_map>"``, ``"<prefix>"``).

    Raises
    ------
    ValueError
        If an even ``--win-size`` is requested.
    """
    # Parse command line arguments for detect
    dump = args["--dump"]
    norm = args["--norm"]
    interchrom = args["--inter"]
    iterations = args["--iterations"]
    kernel_config_path = args["--kernel-config"]
    mat_path = args["<contact_map>"]
    max_dist = args["--max-dist"]
    min_dist = args["--min-dist"]
    min_separation = args["--min-separation"]
    n_mads = float(args["--n-mads"])
    prefix = args["<prefix>"]
    pattern = args["--pattern"]
    pearson = args["--pearson"]
    perc_zero = args["--perc-zero"]
    perc_undetected = args["--perc-undetected"]
    subsample = args["--subsample"]
    threads = int(args["--threads"])
    tsvd = 0.999 if args["--tsvd"] else None
    win_fmt = args["--win-fmt"]
    win_size = args["--win-size"]
    if subsample == "no":
        subsample = None
    plotting_enabled = not args["--no-plotting"]
    smooth_trend = args["--smooth-trend"]
    if smooth_trend is None:
        smooth_trend = False
    # If prefix involves a directory, crash if it does not exist
    cio.check_prefix_dir(prefix)
    if win_fmt not in ["npy", "json"]:
        sys.stderr.write("Error: --win-fmt must be either json or npy.\n")
        sys.exit(1)
    # Read a user-provided kernel config if custom is true
    # Else, load a preset kernel config for input pattern
    # Configs are JSON files containing all parameter associated with the pattern
    # They are loaded into a dictionary in the form :
    # {"max_iterations": 3, "kernels": [kernel1, kernel2, ...], ...}
    # Where each kernel is a 2D numpy array representing the pattern
    if kernel_config_path is not None:
        custom = True
        # Loading input path as config
        config_path = kernel_config_path
    else:
        custom = False
        # Will use a preset config file matching pattern name
        config_path = pattern

    ### 0: LOAD INPUT
    params = {
        "max_iterations": (iterations, int),
        "pearson": (pearson, float),
        "max_dist": (max_dist, int),
        "min_dist": (min_dist, int),
        "min_separation": (min_separation, int),
        "max_perc_undetected": (perc_undetected, float),
        "max_perc_zero": (perc_zero, float),
    }
    cfg = cio.load_kernel_config(config_path, custom)
    for param_name, (param_value, param_type) in params.items():
        cfg = _override_kernel_config(param_name, param_value, param_type, cfg)

    # Resize kernels if requested
    if win_size != "auto":
        win_size = int(win_size)
        if not win_size % 2:
            raise ValueError("--win-size must be odd")
        cfg["kernels"] = [
            resize_kernel(k, factor=win_size / k.shape[0])
            for k in cfg["kernels"]
        ]

    if interchrom:
        sys.stderr.write(
            "WARNING: Detection on interchromosomal matrices is expensive in RAM\n"
        )
    hic_genome = HicGenome(
        mat_path,
        inter=interchrom,
        kernel_config=cfg,
        dump=dump,
        smooth=smooth_trend,
        sample=subsample,
    )
    ### 1: Process input signal
    hic_genome.kernel_config = cfg
    # Normalize (balance) matrix using ICE
    hic_genome.normalize(norm=norm, n_mads=n_mads, threads=threads)
    # Define how many diagonals should be used in intra-matrices
    hic_genome.compute_max_dist()
    # Split whole genome matrix into intra- and inter- sub matrices. Each sub
    # matrix is processed on the fly (obs / exp, trimming diagonals > max dist)
    hic_genome.make_sub_matrices()

    all_coords = []
    all_windows = []

    ### 2: DETECTION ON EACH SUBMATRIX
    n_sub_mats = hic_genome.sub_mats.shape[0]
    # Loop over the different kernel matrices for input pattern
    run_id = 0
    # Use cfg to inform jobs whether they should run full convolution
    cfg["tsvd"] = tsvd
    total_runs = len(cfg["kernels"]) * cfg["max_iterations"]
    sys.stderr.write("Detecting patterns...\n")
    # A single pool of workers is shared by all kernels and iterations
    # instead of spawning (and never closing) a new one at every round.
    pool = mp.Pool(threads) if threads > 1 else None
    # The pool is released whatever happens during detection
    try:
        for kernel_id, kernel_matrix in enumerate(cfg["kernels"]):
            # Adjust kernel iteratively
            for i in range(cfg["max_iterations"]):
                cio.progress(
                    run_id, total_runs, f"Kernel: {kernel_id}, Iteration: {i}\n"
                )

                # Apply detection procedure to all sub matrices in parallel
                sub_mat_data = zip(
                    hic_genome.sub_mats.iterrows(),
                    [cfg] * n_sub_mats,
                    [kernel_matrix] * n_sub_mats,
                    [dump] * n_sub_mats,
                )
                # Run detection in parallel on different sub matrices, and
                # show progress when gathering results
                sub_mat_results = []
                # Run in multiprocessing subprocesses
                if pool is not None:
                    dispatcher = pool.imap(_detect_sub_mat, sub_mat_data, 1)
                else:
                    dispatcher = map(_detect_sub_mat, sub_mat_data)
                for s, result in enumerate(dispatcher):
                    cio.progress(
                        s, n_sub_mats, f"{result['chr1']}-{result['chr2']}"
                    )
                    sub_mat_results.append(result)

                # Convert coordinates from chromosome to whole genome bins
                kernel_coords = [
                    hic_genome.get_full_mat_pattern(
                        d["chr1"], d["chr2"], d["coords"]
                    )
                    for d in sub_mat_results
                    if d["coords"] is not None
                ]

                # Gather newly detected pattern coordinates
                try:
                    # Extract surrounding windows for each sub_matrix
                    kernel_windows = np.concatenate(
                        [
                            w["windows"]
                            for w in sub_mat_results
                            if w["windows"] is not None
                        ],
                        axis=0,
                    )
                    all_coords.append(
                        pd.concat(kernel_coords, axis=0).reset_index(drop=True)
                    )
                    # Add info about kernel and iteration which detected
                    # these patterns
                    all_coords[-1]["kernel_id"] = kernel_id
                    all_coords[-1]["iteration"] = i
                    all_windows.append(kernel_windows)
                # If no pattern was found with this kernel
                # skip directly to the next one, skipping iterations
                except ValueError:
                    break

                # Update kernel with patterns detected at current iteration
                kernel_matrix = cid.pileup_patterns(kernel_windows)
                run_id += 1
    finally:
        # Finish parallelized part
        if pool is not None:
            pool.close()
            pool.join()
    cio.progress(run_id, total_runs, f"Kernel: {kernel_id}, Iteration: {i}\n")

    # If no pattern detected on any chromosome, with any kernel, exit gracefully
    if not all_coords:
        sys.stderr.write("No pattern detected ! Exiting.\n")
        sys.exit(0)

    # Combine patterns of all kernel matrices into a single array
    all_coords = pd.concat(all_coords, axis=0).reset_index(drop=True)
    # Combine all windows from different kernels into a single pile of windows
    all_windows = np.concatenate(all_windows, axis=0)

    # Compute minimum separation in bins and make sure it has a reasonable value
    separation_bins = int(cfg["min_separation"] // hic_genome.clr.binsize)
    if separation_bins < 1:
        separation_bins = 1
    print(f"Minimum pattern separation is : {separation_bins}")
    # Remove patterns with overlapping windows (smeared patterns)
    distinct_patterns = cid.remove_neighbours(
        all_coords, win_size=separation_bins
    )

    # Drop patterns that are too close to each other
    all_coords = all_coords.loc[distinct_patterns, :]
    all_windows = all_windows[distinct_patterns, :, :]

    # Get from bins into basepair coordinates
    coords_1 = hic_genome.bins_to_coords(all_coords.bin1).reset_index(
        drop=True
    )
    coords_1.columns = [str(col) + "1" for col in coords_1.columns]
    coords_2 = hic_genome.bins_to_coords(all_coords.bin2).reset_index(
        drop=True
    )
    coords_2.columns = [str(col) + "2" for col in coords_2.columns]

    all_coords = pd.concat(
        [all_coords.reset_index(drop=True), coords_1, coords_2], axis=1
    )

    # Filter patterns closer than minimum distance from the diagonal if any
    min_dist_drop_mask = (all_coords.chrom1 == all_coords.chrom2) & (
        np.abs(all_coords.start2 - all_coords.start1) < cfg["min_dist"]
    )
    all_coords = all_coords.loc[~min_dist_drop_mask, :]
    all_windows = all_windows[~min_dist_drop_mask, :, :]
    del min_dist_drop_mask

    # Remove patterns with nan p-values (no contact in window)
    pval_mask = all_coords.pvalue.isnull()
    all_coords = all_coords.loc[~pval_mask, :]
    all_windows = all_windows[~pval_mask, :, :]
    del pval_mask

    # Correct p-values for multiple testing using FDR
    all_coords["qvalue"] = fdr_correction(all_coords["pvalue"])

    # Reorder columns
    all_coords = all_coords.loc[
        :,
        [
            "chrom1",
            "start1",
            "end1",
            "chrom2",
            "start2",
            "end2",
            "bin1",
            "bin2",
            "kernel_id",
            "iteration",
            "score",
            "pvalue",
            "qvalue",
        ],
    ]

    ### 3: WRITE OUTPUT
    sys.stderr.write(f"{all_coords.shape[0]} patterns detected\n")
    # Save patterns and their coordinates in a tsv file
    sys.stderr.write(f"Saving patterns in {prefix}.tsv\n")
    cio.write_patterns(all_coords, prefix)
    # Save windows as an array in an npy file
    sys.stderr.write(f"Saving patterns in {prefix}.{win_fmt}\n")
    cio.save_windows(all_windows, prefix, fmt=win_fmt)

    # Generate pileup visualisations if requested
    if plotting_enabled:
        # Compute and plot pileup
        pileup_title = ("Pileup of {n} {pattern}").format(
            pattern=cfg["name"], n=all_windows.shape[0]
        )
        windows_pileup = cid.pileup_patterns(all_windows)
        # Symmetrize pileup for diagonal patterns
        if not cfg["max_dist"]:
            # Replace nan below diag by 0
            windows_pileup = np.nan_to_num(windows_pileup)
            # Add transpose (the diagonal is subtracted so that it is not
            # counted twice)
            windows_pileup += np.transpose(windows_pileup) - np.diag(
                np.diag(windows_pileup)
            )
        sys.stderr.write(f"Saving pileup plots in {prefix}.pdf\n")
        pileup_plot(windows_pileup, prefix, name=pileup_title)
def cmd_generate_config(args):
    """Run the ``generate-config`` command: write a kernel configuration
    (JSON file plus one text file per kernel) based on a preset.

    Kernels of the preset are optionally resized to ``--win-size``. If
    ``--click`` is given, it is used as the contact map from which the user
    picks windows interactively, either on the whole genome at once or one
    chromosome (pair) at a time when ``--chroms`` is given; the smoothed
    pileup of these windows then replaces the preset kernels.

    Parameters
    ----------
    args : dict
        Parsed command line arguments, indexed by option name
        (e.g. ``"<prefix>"``, ``"--preset"``, ``"--click"``, ``"--chroms"``).

    Raises
    ------
    ValueError
        If an even ``--win-size`` is requested.
    """
    # Parse command line args for generate_config
    prefix = args["<prefix>"]
    pattern = args["--preset"]
    click_find = args["--click"]
    n_mads = float(args["--n-mads"])
    norm = args["--norm"]
    win_size = args["--win-size"]
    threads = int(args["--threads"])
    inter = args["--inter"]
    chroms = args["--chroms"]

    cfg = cio.load_kernel_config(pattern, False)

    # If prefix involves a directory, crash if it does not exist
    cio.check_prefix_dir(prefix)

    # If a specific window size is requested, resize all kernels
    if win_size != "auto":
        win_size = int(win_size)
        if not win_size % 2:
            raise ValueError("--win-size must be odd")
        cfg["kernels"] = [
            resize_kernel(k, factor=win_size / k.shape[0])
            for k in cfg["kernels"]
        ]
    # Otherwise, just inherit window size from the kernel config
    else:
        win_size = cfg["kernels"][0].shape[0]

    # If click mode is enabled, build a kernel from scratch using
    # graphical display, otherwise, just inherit the pattern's kernel
    if click_find:
        hic_genome = HicGenome(click_find, inter=inter, kernel_config=cfg)
        # Normalize (balance) the whole genome matrix
        hic_genome.normalize(norm=norm, n_mads=n_mads, threads=threads)
        # enforce full scanning distance in kernel config
        hic_genome.max_dist = hic_genome.clr.shape[0] * hic_genome.clr.binsize
        # Process each sub-matrix individually (detrend diag for intra)
        hic_genome.make_sub_matrices()
        half_w = int((win_size - 1) / 2)
        # By default, the whole genome is shown at once (takes lots of RAM)
        if chroms is None:
            for _, sub in hic_genome.sub_mats.iterrows():
                sub.contact_map.create_mat()
            processed_mat = hic_genome.gather_sub_matrices().tocsr()
            windows = click_finder(processed_mat, half_w=half_w)
        # If chromosomes were specified, their submatrices are shown one by one
        # taking less memory (but more tedious for the user)
        else:
            chroms = chroms.split(",")
            # Generate chromosome pairs to scan
            if inter:
                chroms = it.combinations_with_replacement(chroms, 2)
            else:
                chroms = [(ch, ch) for ch in chroms]
            windows = []
            for c1, c2 in chroms:
                try:
                    sub_mat = hic_genome.sub_mats.query(
                        "(chr1 == @c1) & (chr2 == @c2)"
                    )["contact_map"].values[0]
                # In case chromosomes have been entered in a different order
                except IndexError:
                    c1, c2 = c2, c1
                    sub_mat = hic_genome.sub_mats.query(
                        "(chr1 == @c1) & (chr2 == @c2)"
                    )["contact_map"].values[0]
                sub_mat.create_mat()
                chrom_wins = click_finder(
                    sub_mat.matrix.tocsr(),
                    half_w=half_w,
                    xlab=c2,
                    ylab=c1,
                )
                windows.append(chrom_wins)
                # Release the matrix before moving on to the next pair
                sub_mat.destroy_mat()
            windows = np.concatenate(windows, axis=0)
        # Pileup all recorded windows and convert to JSON serializable list
        pileup = ndi.gaussian_filter(cid.pileup_patterns(windows), 1)
        cfg["kernels"] = [pileup.tolist()]
        # Show the newly generated kernel to the user. The values are log10
        # transformed so that they match the label of the colorbar.
        # NOTE(review): vmax is computed on the untransformed pileup while
        # the image is on a log scale - confirm this is intended.
        hm = plt.imshow(
            np.log10(pileup), vmax=np.percentile(pileup, 99), cmap="afmhot_r"
        )
        cbar = plt.colorbar(hm)
        cbar.set_label("Log10 Hi-C contacts")
        plt.title("Manually generated kernel")
        plt.show()

    # Write kernel matrices to files with input prefix and replace kernels
    # by their path in config
    for mat_id, mat in enumerate(cfg["kernels"]):
        mat_path = f"{prefix}.{mat_id+1}.txt"
        np.savetxt(mat_path, mat)
        cfg["kernels"][mat_id] = mat_path

    # Write config to JSON file using prefix
    with open(f"{prefix}.json", "w") as config_handle:
        json.dump(cfg, config_handle, indent=4)
def cmd_quantify(args):
    """Score a list of 2D positions against the kernels of a pattern.

    Positions are read from a 2D BED file. For every kernel of the selected
    configuration, each sub matrix of the contact map is processed by
    ``_quantify_sub_mat`` (in a process pool when ``--threads`` > 1) and the
    resulting scores, p-values and windows are stored at the rows of the
    corresponding input positions. For each coordinate, the entry with the
    highest score among kernels is kept. The table (with bins and q-values
    added) and the windows are written using ``<prefix>``; unless
    ``--no-plotting`` is set, a pileup plot is saved as well.

    Parameters
    ----------
    args : dict
        Parsed command line arguments. Keys read here: ``<bed2d>``,
        ``<contact_map>``, ``<prefix>``, ``--n-mads``, ``--pattern``,
        ``--inter``, ``--kernel-config``, ``--perc-zero``,
        ``--perc-undetected``, ``--no-plotting``, ``--threads``, ``--norm``,
        ``--tsvd``, ``--win-fmt``, ``--win-size`` and ``--subsample``.

    Raises
    ------
    ValueError
        If an explicit ``--win-size`` is even.
    SystemExit
        If ``--win-fmt`` is neither ``npy`` nor ``json``.
    """
    bed2d_path = args["<bed2d>"]
    mat_path = args["<contact_map>"]
    prefix = args["<prefix>"]
    n_mads = float(args["--n-mads"])
    pattern = args["--pattern"]
    inter = args["--inter"]
    kernel_config_path = args["--kernel-config"]
    perc_zero = args["--perc-zero"]
    perc_undetected = args["--perc-undetected"]
    plotting_enabled = not args["--no-plotting"]
    threads = int(args["--threads"])
    norm = args["--norm"]
    tsvd = 0.999 if args["--tsvd"] else None
    win_fmt = args["--win-fmt"]
    if win_fmt not in ("npy", "json"):
        sys.stderr.write("Error: --win-fmt must be either json or npy.\n")
        sys.exit(1)
    win_size = args["--win-size"]
    if win_size != "auto":
        win_size = int(win_size)
    subsample = args["--subsample"]

    # If prefix involves a directory, crash if it does not exist
    cio.check_prefix_dir(prefix)
    # Load 6 cols from 2D BED file and infer header
    bed2d = cio.load_bed2d(bed2d_path)
    # Warn user if --inter is disabled but list contains inter patterns
    if not inter and len(bed2d.start1[bed2d.chrom1 != bed2d.chrom2]) > 0:
        sys.stderr.write(
            "Warning: The bed2d file contains interchromosomal patterns. "
            "These patterns will not be scanned unless --inter is used.\n"
        )

    if kernel_config_path is not None:
        custom = True
        # Loading input path as config
        config_path = kernel_config_path
    else:
        custom = False
        # Will use a preset config file matching pattern name
        config_path = pattern
    cfg = cio.load_kernel_config(config_path, custom)

    # Subsample Hi-C contacts from the matrix, if requested
    if subsample == "no":
        subsample = None
    # Instantiate and preprocess contact map
    hic_genome = HicGenome(
        mat_path, inter=inter, kernel_config=cfg, sample=subsample
    )
    # Enforce max scanning distance to pattern at longest distance
    furthest = np.max(bed2d.start2 - bed2d.start1)
    max_diag = hic_genome.clr.shape[0] * hic_genome.clr.binsize
    cfg["max_dist"] = min(furthest, max_diag)
    cfg["min_dist"] = 0
    cfg["tsvd"] = tsvd
    cfg = _override_kernel_config("max_perc_zero", perc_zero, float, cfg)
    cfg = _override_kernel_config(
        "max_perc_undetected", perc_undetected, float, cfg
    )
    # Notify contact map instance of changes in scanning distance
    hic_genome.kernel_config = cfg
    # Normalize (balance) matrix using ICE
    hic_genome.normalize(norm=norm, n_mads=n_mads, threads=threads)

    # Initialize output structures
    bed2d["score"] = np.nan
    bed2d["pvalue"] = np.nan
    positions = bed2d.copy()

    # Only resize kernel matrix if explicitly requested
    km, kn = cfg["kernels"][0].shape
    n_kernels = len(cfg['kernels'])
    if win_size != "auto":
        if not win_size % 2:
            raise ValueError("--win-size must be odd")
        for i, k in enumerate(cfg["kernels"]):
            cfg["kernels"][i] = resize_kernel(k, factor=win_size / km)
        km = kn = win_size
        # Update kernel config after resizing kernels
        hic_genome.kernel_config = cfg

    # Define how many diagonals should be used in intra-matrices
    hic_genome.compute_max_dist()
    # Split whole genome matrix into intra- and inter- sub matrices. Each sub
    # matrix is processed on the fly (obs / exp, trimming diagonals > max dist)
    hic_genome.make_sub_matrices()

    windows = np.full((positions.shape[0], km, kn), np.nan)
    # We will store a copy of coordinates for each kernel
    bed2d_out = [bed2d.copy() for _ in range(n_kernels)]
    windows_out = [windows.copy() for _ in range(n_kernels)]
    # For each position, we use the center of the BED interval
    positions["pos1"] = (positions.start1 + positions.end1) // 2
    positions["pos2"] = (positions.start2 + positions.end2) // 2

    # Use each kernel matrix available for the pattern
    for kernel_id, kernel_matrix in enumerate(cfg["kernels"]):
        cio.progress(kernel_id, n_kernels, f"Kernel: {kernel_id}\n")
        n_sub_mats = hic_genome.sub_mats.shape[0]
        # Retrieve input positions for each submatrix and convert
        # coordinates from whole genome to submatrix.
        sub_pos = [
            _get_chrom_pos(positions, hic_genome, m[1].chr1, m[1].chr2)
            for m in hic_genome.sub_mats.iterrows()
        ]
        # One argument tuple per sub matrix for the quantification procedure
        sub_mat_data = zip(
            hic_genome.sub_mats.iterrows(),
            [cfg] * n_sub_mats,
            [kernel_matrix] * n_sub_mats,
            [s[1] for s in sub_pos],
        )
        # Run quantification (in subprocesses if threads > 1) on the
        # different sub matrices and show progress when gathering results
        sub_mat_results = []
        pool = mp.Pool(threads) if threads > 1 else None
        try:
            if pool is None:
                dispatcher = map(_quantify_sub_mat, sub_mat_data)
            else:
                dispatcher = pool.imap(_quantify_sub_mat, sub_mat_data, 1)
            for s, result in enumerate(dispatcher):
                cio.progress(
                    s, n_sub_mats, f"{result['chr1']}-{result['chr2']}"
                )
                sub_mat_results.append(result)
        finally:
            # A pool is created for every kernel: release its worker
            # processes once results are gathered (or if an error occurred)
            # instead of leaking them.
            if pool is not None:
                pool.terminate()
                pool.join()

        for i, r in enumerate(sub_mat_results):
            # If there were no patterns on that sub matrix, just skip it
            if r['coords'] is None:
                continue
            sub_pat_idx = sub_pos[i][0]
            # Store this kernel's results at the rows of the input patterns
            # located in this sub matrix. `.loc` is used rather than chained
            # indexing (df[col][idx] = ...), which may assign to a temporary
            # copy and leave the frame untouched.
            # NOTE(review): assumes sub_pat_idx holds row labels of bed2d
            # (default integer index) -- confirm against _get_chrom_pos.
            try:
                bed2d_out[kernel_id].loc[sub_pat_idx, 'score'] = (
                    r['coords'].score.values
                )
                bed2d_out[kernel_id].loc[sub_pat_idx, "pvalue"] = (
                    r["coords"].pvalue.values
                )
                windows_out[kernel_id][sub_pat_idx, :, :] = r["windows"]
            # Do nothing if no pattern was detected or matrix
            # is smaller than the kernel (-> patterns is None)
            except AttributeError:
                pass

    # Select the best score for each coordinate (among the different kernels)
    bed2d = pd.concat(bed2d_out, axis=0).reset_index(drop=True)
    windows = np.concatenate(windows_out, axis=0)
    # Rows are sorted by increasing score and the last row of each coordinate
    # is kept. Missing scores are placed first so that a NaN from one kernel
    # never takes precedence over a valid score from another kernel.
    bed2d = (
        bed2d
        .sort_values('score', ascending=True, na_position="first")
        .groupby(['chrom1', 'start1', 'chrom2', 'start2'], sort=False)
        .tail(1)
    )
    # Keep the windows matching the selected rows, in the same order
    windows = windows[bed2d.index, :, :]
    bed2d = bed2d.reset_index(drop=True)

    bed2d["bin1"] = hic_genome.coords_to_bins(
        bed2d.loc[:, ["chrom1", "start1"]].rename(
            columns={"chrom1": "chrom", "start1": "pos"}
        )
    )
    bed2d["bin2"] = hic_genome.coords_to_bins(
        bed2d.loc[:, ["chrom2", "start2"]].rename(
            columns={"chrom2": "chrom", "start2": "pos"}
        )
    )
    bed2d["qvalue"] = fdr_correction(bed2d["pvalue"])
    bed2d = bed2d.loc[
        :,
        [
            "chrom1",
            "start1",
            "end1",
            "chrom2",
            "start2",
            "end2",
            "bin1",
            "bin2",
            "score",
            "pvalue",
            "qvalue",
        ],
    ]
    # Set p-values of invalid scores to nan
    invalid_score = np.isnan(bed2d.score)
    bed2d.loc[invalid_score, "pvalue"] = np.nan
    bed2d.loc[invalid_score, "qvalue"] = np.nan

    # Sort by whole genome coordinates to match input order
    bed2d = (
        bed2d
        .sort_values(['bin1', 'bin2'], ascending=True)
        .reset_index(drop=True)
    )
    cio.write_patterns(bed2d, prefix)
    cio.save_windows(windows, prefix, fmt=win_fmt)

    # Generate pileup visualisations if requested
    if plotting_enabled:
        # Compute and plot pileup
        pileup_title = ("pileup_of_{n}_{pattern}").format(
            pattern=cfg["name"], n=windows.shape[0]
        )
        windows_pileup = cid.pileup_patterns(windows)
        # Symmetrize pileup for diagonal patterns
        if not cfg["max_dist"]:
            # Replace nan below diag by 0
            windows_pileup = np.nan_to_num(windows_pileup)
            # Add transpose (the diagonal is subtracted once so that it is
            # not counted twice)
            windows_pileup += np.transpose(windows_pileup) - np.diag(
                np.diag(windows_pileup)
            )
        sys.stderr.write(f"Saving pileup plots in {prefix}.pdf\n")
        pileup_plot(windows_pileup, prefix, name=pileup_title)