def compute_mc_associations(frg_inf, pos_crd, bin_bnd, n_perm=1000, pos_ids=None, verbose=True):
    """Build the observed and the permuted (background) bin-coverage profiles.

    Reads are split into a positive set and a negative set. The positive set holds
    every read owning at least one fragment that overlaps ``pos_crd`` (or, when
    ``pos_ids`` is given, every read whose ID is listed in it). The observed profile
    counts, per bin, how many positive reads cover that bin. Each permutation draws
    ``n_pos`` reads from the negative set, drops one randomly chosen fragment from
    each drawn read and counts the bins covered by the remaining fragments.

    Args:
        frg_inf: 2D integer array of fragments. Column 0 is the read ID, columns 1:4
            are compared against ``pos_crd`` and columns 2:4 against ``bin_bnd``
            (callers in this file build it from ReadID, Chr, ExtStart, ExtEnd).
            NOTE: column 0 is re-indexed IN PLACE to 1..n_read.
        pos_crd: Coordinate of the positive region; must be empty when ``pos_ids``
            is used.
        bin_bnd: 2D array with one row of boundaries per bin.
        n_perm: Number of random background profiles.
        pos_ids: Optional read IDs defining the positive set. They are matched
            against the IDs AFTER re-indexing.
        verbose: Print a progress message every 200 permutations.

    Returns:
        Tuple ``(prf_pos, prf_rnd, frg_pos, frg_neg)``: the observed profile
        (length n_bin), the background profiles (n_perm x n_bin) and the fragments
        of the positive and negative reads.
    """
    from utilities import hasOL, flatten

    n_bin = bin_bnd.shape[0]

    # re-index reads so IDs can be used directly as list indices (1-based)
    frg_inf[:, 0] = np.unique(frg_inf[:, 0], return_inverse=True)[1] + 1
    n_read = np.max(frg_inf[:, 0])

    # per read: one list of covered bin indices for each of its fragments
    cfb_lst = [list() for _ in range(n_read + 1)]
    for fi in range(frg_inf.shape[0]):
        bin_idx = np.where(hasOL(frg_inf[fi, 2:4], bin_bnd))[0]
        cfb_lst[frg_inf[fi, 0]].append(list(bin_idx))

    # select positive/negative reads
    if pos_ids is not None:
        assert len(pos_crd) == 0
        is_pos = np.isin(frg_inf[:, 0], pos_ids)
    else:
        is_pos = np.where(hasOL(pos_crd, frg_inf[:, 1:4]))[0]
    in_pos_read = np.isin(frg_inf[:, 0], frg_inf[is_pos, 0])
    frg_pos = frg_inf[in_pos_read, :]
    frg_neg = frg_inf[~in_pos_read, :]
    cfb_pos = [cfb_lst[i] for i in np.unique(frg_pos[:, 0])]
    cfb_neg = [cfb_lst[i] for i in np.unique(frg_neg[:, 0])]
    n_pos = len(cfb_pos)
    n_neg = len(cfb_neg)

    # observed profile: fancy-index "+= 1" counts each read at most once per bin
    prf_pos = np.zeros(n_bin)
    for read_cvg in cfb_pos:
        prf_pos[flatten(read_cvg)] += 1

    # background profiles from the negative set
    prf_rnd = np.zeros([n_perm, n_bin])
    # an explicit list is required: np.random.shuffle cannot shuffle a range object
    neg_lst = list(range(n_neg))
    for ei in range(n_perm):
        if verbose and (ei + 1) % 200 == 0:
            print('\t{:d} randomized profiles are computed.'.format(ei + 1))
        np.random.shuffle(neg_lst)
        for rd_idx in neg_lst[:n_pos]:
            f2b_rnd = cfb_neg[rd_idx]
            # shuffle then skip the first entry: one fragment is randomly removed
            np.random.shuffle(f2b_rnd)
            prf_rnd[ei, flatten(f2b_rnd[1:])] += 1

    return prf_pos, prf_rnd, frg_pos, frg_neg
def plot_reads_per_category(config_lst):
    """Plot a bar chart of read counts per processing category and save it as PDF.

    The categories are: sequenced reads, mapped reads, reads with non-viewpoint
    fragments, reads with >0 ROI fragments, reads with >1 ROI fragments and unique
    reads. Sequenced reads are counted over all given configs; viewpoint/ROI
    coordinates and the output path are taken from the first config.

    Args:
        config_lst: List of config dicts. Each needs 'run_id'; the first one also
            needs 'output_file', 'output_dir', 'vp_cnum', 'vp_start', 'vp_end',
            'roi_start' and 'roi_end'. If 'output_file' of the first config is
            None, it is set to a default path inside 'output_dir'.

    Raises:
        AssertionError: If counting the reads of a FASTA file writes to stderr.
    """
    import subprocess
    from matplotlib import pyplot as plt
    from utilities import load_mc4c, hasOL

    # initialization
    configs = config_lst[0]
    if configs['output_file'] is None:
        configs['output_file'] = configs['output_dir'] + '/qc_readCategories_' + configs['run_id'] + '.pdf'

    # load number of sequenced reads
    n_seq = 0
    print('Loading number of sequenced reads from fastq files ...')
    # a separate loop variable keeps `configs` bound to the first config, whose
    # 'output_file' is guaranteed to be set above
    for cfg in config_lst:
        seq_fname = './reads/rd_' + cfg['run_id'] + '.fasta.gz'
        print('\tscanning {:s}'.format(seq_fname))
        # NOTE(review): the command is assembled as a shell string; run IDs must be
        # trusted values.
        cmd_str = 'zgrep ">" ' + seq_fname + ' | wc -l'
        map_prs = subprocess.Popen(cmd_str, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        std_out, std_err = map_prs.communicate()
        # truthiness works for both str and bytes output
        assert not std_err, 'Reading fastq file failed.'
        n_seq += int(std_out.strip())

    # load raw reads
    frg_dp = load_mc4c(config_lst, unique_only=False, valid_only=False, min_mq=0, reindex_reads=True)
    reads_raw = frg_dp[['ReadID', 'Chr', 'ExtStart', 'ExtEnd', 'MQ']].values
    n_map = len(np.unique(reads_raw[:, 0]))
    del frg_dp

    # remove VP fragments
    vp_crd = np.array([configs['vp_cnum'], configs['vp_start'], configs['vp_end']])
    is_vp = hasOL(vp_crd, reads_raw[:, 1:4], offset=0)
    reads_nvp = reads_raw[~is_vp, :]
    n_nvp = len(np.unique(reads_nvp[:, 0]))

    # select ROI reads
    roi_crd = np.array([configs['vp_cnum'], configs['roi_start'], configs['roi_end']])
    is_roi = hasOL(roi_crd, reads_nvp[:, 1:4], offset=0)
    reads_roi = reads_nvp[is_roi, :]
    n_roi = len(np.unique(reads_roi[:, 0]))

    # select informative reads (#frg > 1)
    max_read_id = np.max(reads_roi[:, 0])
    read_n_roi = np.bincount(reads_roi[:, 0], minlength=max_read_id + 1)
    is_inf = np.isin(reads_raw[:, 0], reads_roi[read_n_roi[reads_roi[:, 0]] > 1, 0])
    reads_inf = reads_raw[is_inf, :]
    n_inf = len(np.unique(reads_inf[:, 0]))

    # load unique reads
    frg_dp = load_mc4c(config_lst, unique_only=True, valid_only=True, min_mq=20, reindex_reads=True)
    reads_pcr = frg_dp[['ReadID', 'Chr', 'ExtStart', 'ExtEnd', 'MQ']].values
    n_pcr = len(np.unique(reads_pcr[:, 0]))
    del frg_dp

    # plotting the bar
    name_lst = ['#Sequenced', '#Mapped>0', 'Only non-VP\nfragments', '#ROI>0', '#ROI>1', '#Unique']
    n_bar = len(name_lst)
    clr_map = ['#fd8181', '#fda981', '#fcc631', '#b8c903', '#38c903', '#04f1ba', '#0472f1']
    plt.figure(figsize=(8, 5))
    for cls_idx, n_read in enumerate([n_seq, n_map, n_nvp, n_roi, n_inf, n_pcr]):
        plt.bar(cls_idx, n_read, width=0.8, color=clr_map[cls_idx])
        plt.text(cls_idx, n_read,
                 '{:0.0f}%\n'.format(n_read * 1e2 / n_seq) + '#{:,d}'.format(n_read),
                 verticalalignment='bottom', horizontalalignment='center')

    plt.xticks(range(n_bar), name_lst)
    y_ticks = plt.yticks()[0]
    y_tick_lbl = ['{:0.0f}k'.format(y / 1e3) for y in y_ticks]
    plt.yticks(y_ticks, y_tick_lbl)
    plt.ylabel('#reads')
    plt.xlim([-1, n_bar])
    plt.ylim([0, n_seq * 1.12])
    # counts are pooled over all configs, so the title lists every run id
    plt.title(','.join([cfg['run_id'] for cfg in config_lst]))
    plt.savefig(configs['output_file'], bbox_inches='tight')
def plot_overallProfile(configs, min_n_frg=2):
    """Plot the per-bin read coverage across the ROI for all reads and unique reads.

    The ROI is divided into 200 bins. For each of two datasets (index 0: loaded with
    ``unique_only=False``, index 1: with ``unique_only=True``) the number of reads
    covering each bin is counted, normalised to a percentage of the reads, and drawn
    as a bar plot. Bins overlapping the viewpoint are blanked and highlighted.

    Args:
        configs: Config dict providing 'output_file', 'output_dir', 'run_id',
            'roi_start', 'roi_end', 'vp_cnum', 'vp_start', 'vp_end' and
            'genome_build'. If 'output_file' is None it is set to a default path.
        min_n_frg: Minimum number of non-viewpoint ROI fragments a read must have
            to be kept.
    """
    from matplotlib import pyplot as plt, patches
    from utilities import hasOL, load_mc4c, load_annotation

    # initialization
    if configs['output_file'] is None:
        configs['output_file'] = configs['output_dir'] + '/qc_OverallProfile_' + configs['run_id'] + '.pdf'
    edge_lst = np.linspace(configs['roi_start'], configs['roi_end'], num=201, dtype=np.int64).reshape(-1, 1)
    bin_bnd = np.hstack([edge_lst[:-1], edge_lst[1:] - 1])
    bin_width = bin_bnd[0, 1] - bin_bnd[0, 0]
    bin_cen = np.mean(bin_bnd, axis=1)
    n_bin = bin_bnd.shape[0]
    del edge_lst
    vp_crd = np.array([configs['vp_cnum'], configs['vp_start'], configs['vp_end']])
    roi_crd = np.array([configs['vp_cnum'], configs['roi_start'], configs['roi_end']])

    # loop over datasets
    # builtin int replaces the removed `np.int` alias (it was the same type)
    bin_frq = np.zeros([2, n_bin], dtype=int)
    n_read = np.zeros(2, dtype=int)
    for di in range(2):

        # load MC-HC data
        frg_dp = load_mc4c(configs, unique_only=di != 0, valid_only=True, min_mq=20, reindex_reads=True)
        frg_np = frg_dp[['ReadID', 'Chr', 'ExtStart', 'ExtEnd']].values

        # filter small circles
        is_vp = hasOL(vp_crd, frg_np[:, 1:4], offset=0)
        is_roi = hasOL(roi_crd, frg_np[:, 1:4], offset=0)
        frg_nvp = frg_np[~is_vp & is_roi, :]
        cir_size = np.bincount(frg_nvp[:, 0])[frg_nvp[:, 0]]
        is_inf = np.isin(frg_np[:, 0], frg_nvp[cir_size >= min_n_frg, 0])
        frg_inf = frg_np[is_inf, :]

        # select within roi fragments
        is_roi = hasOL(roi_crd, frg_inf[:, 1:4], offset=0)
        frg_roi = frg_inf[is_roi, :]
        n_read[di] = len(np.unique(frg_roi[:, 0]))

        # looping over bins; each read can contribute only once to a bin
        for bi in range(n_bin):
            is_in = hasOL(bin_bnd[bi, :], frg_roi[:, 2:4])
            bin_frq[di, bi] = len(np.unique(frg_roi[is_in, 0]))

    # bins overlapping the viewpoint and the outer boundaries of that bin range
    vpb_idx = hasOL([configs['vp_start'], configs['vp_end']], bin_bnd)
    vpd_bnd = [bin_bnd[vpb_idx][0, 0], bin_bnd[vpb_idx][-1, 1]]

    # plotting
    plt.figure(figsize=(15, 3))
    plt_h = [None] * 2
    clr_map = ['#d0d0d0', '#43ff14']
    bin_nrm = np.zeros([2, n_bin])
    for di in range(2):
        bin_nrm[di, :] = bin_frq[di, :] * 100.0 / n_read[di]
        bin_nrm[di, vpb_idx] = np.nan
        plt_h[di] = plt.bar(bin_cen, bin_nrm[di, :], width=bin_width, color=clr_map[di], alpha=0.7)

    # add vp area
    y_lim = [0, np.nanmax(bin_nrm) * 1.1]
    plt.gca().add_patch(
        patches.Rectangle([vpd_bnd[0], 0], vpd_bnd[1] - vpd_bnd[0], y_lim[1],
                          linewidth=0, edgecolor='None', facecolor='orange'))

    # add annotations
    ant_pd = load_annotation(configs['genome_build'], roi_crd=roi_crd).reset_index(drop=True)
    for ai in range(ant_pd.shape[0]):
        ant_pos = ant_pd.loc[ai, 'ant_pos']
        plt.text(ant_pos, y_lim[1], ant_pd.loc[ai, 'ant_name'],
                 horizontalalignment='center', verticalalignment='bottom')
        plt.plot([ant_pos, ant_pos], y_lim, ':', color='#bfbfbf', linewidth=1, alpha=0.5)

    # final adjustments
    plt.xlim([configs['roi_start'], configs['roi_end']])
    x_ticks = np.linspace(configs['roi_start'], configs['roi_end'], 20, dtype=np.int64)
    x_tick_label = ['{:0.2f}m'.format(x / 1e6) for x in x_ticks]
    plt.xticks(x_ticks, x_tick_label, rotation=20)
    plt.ylabel('Frequency (% of reads)')
    plt.ylim(y_lim)
    plt.legend(plt_h, [
        'All reads (n={:0,.0f})'.format(n_read[0]),
        'Unique reads (n={:0,.0f})'.format(n_read[1])
    ])
    plt.title('Overall profile (#roiFrg>{:d}, ex. vp), {:s}\n'.format(min_n_frg - 1, configs['run_id']))
    plt.savefig(configs['output_file'], bbox_inches='tight')
def plot_cirSizeDistribution(configs, roi_only=True, uniq_only=True):
    """Plot the distribution of read sizes (number of fragments per read) as PDF.

    Reads are tallied by fragment count (capped at 8) in four rows: read length
    below 1500 bp, below 8000 bp, the rest, and all reads together. Each row is
    drawn as a percentage bar series.

    Args:
        configs: Config dict providing 'run_id', 'output_file' and 'output_dir';
            with ``roi_only`` also 'vp_cnum', 'vp_start', 'vp_end', 'roi_start'
            and 'roi_end'. If 'output_file' is None it is set to a default path.
        roi_only: Keep only fragments inside the ROI that do not overlap the
            viewpoint.
        uniq_only: Forwarded to ``load_mc4c`` as ``unique_only``.
    """
    from matplotlib import pyplot as plt, cm
    from utilities import accum_array, load_mc4c

    # settings: one size class per fragment count, 1..MAX_SIZE
    MAX_SIZE = 8
    edge_lst = np.linspace(1, MAX_SIZE, num=MAX_SIZE)

    # fetch the fragment table
    col_names = ['ReadID', 'Chr', 'ExtStart', 'ExtEnd', 'MQ', 'ReadLength']
    frg_np = load_mc4c(configs, min_mq=20, reindex_reads=True, unique_only=uniq_only)[col_names].values

    # apply the requested filters and remember their names for labels
    filter_lst = ['uniq'] if uniq_only else []
    if roi_only:
        from utilities import hasOL
        chrom = configs['vp_cnum']
        vp_crd = np.array([chrom, configs['vp_start'], configs['vp_end']])
        roi_crd = np.array([chrom, configs['roi_start'], configs['roi_end']])
        keep = ~hasOL(vp_crd, frg_np[:, 1:4], offset=0) & hasOL(roi_crd, frg_np[:, 1:4], offset=0)
        frg_np = frg_np[keep, :]
        filter_lst.extend(['roi', 'ex.vp'])

    # gather the fragments of each read
    read_grp = accum_array(frg_np[:, 0] - 1, frg_np, rebuild_index=True)
    n_grp = len(read_grp)

    # tally reads: rows 0-2 are read-length classes, row 3 is the total
    size_dist = np.zeros([4, len(edge_lst)], dtype=np.int64)
    print('Computing circle size from {:d} reads:'.format(n_grp))
    for grp_no, grp_frgs in enumerate(read_grp):
        if grp_no % 50000 == 0:
            print('\t{:,d}/{:,d} Reads are processed.'.format(grp_no, n_grp))
        frg_count = grp_frgs.shape[0]
        if frg_count == 0:
            continue
        read_len = grp_frgs[0, 5]
        col = np.digitize(min(frg_count, MAX_SIZE), edge_lst) - 1
        if read_len < 1500:
            row = 0
        elif read_len < 8000:
            row = 1
        else:
            row = 2
        size_dist[[row, 3], col] += 1

    # summary counts over the total row
    total_row = size_dist[3, :]
    n_map0 = np.sum(total_row)
    n_map1 = np.sum(total_row[1:])
    n_map2 = np.sum(total_row[2:])

    # draw one bar series per row, progressively narrower
    clr_map = [cm.Blues(x) for x in np.linspace(0.3, 1.0, 3)] + [(1.0, 0.5, 0.25)]
    plt.figure(figsize=(7, 5))
    plt_h = []
    for cls_idx, (cls_cnt, cls_clr) in enumerate(zip(size_dist, clr_map)):
        cls_pct = cls_cnt * 100.0 / np.sum(cls_cnt)
        plt_h.append(plt.bar(edge_lst, cls_pct, width=0.95 - cls_idx / 4.0, color=cls_clr)[0])

    plt.xlim([0, MAX_SIZE + 1])
    plt.xticks(edge_lst)
    plt.xlabel('Read size (#fragment)')
    plt.ylabel('Frequency (%)')

    # title: run id, active filters and mapping statistics
    title_parts = [configs['run_id']]
    if filter_lst:
        title_parts.append(' ({:s})'.format(', '.join(filter_lst)))
    title_parts.append('\n#map>0={:,d};\n'.format(n_map0))
    title_parts.append('#map>1={:,d} ({:0.0f}%); '.format(n_map1, n_map1 * 1e2 / n_map0))
    title_parts.append('#map>2={:,d} ({:0.0f}%)'.format(n_map2, n_map2 * 1e2 / n_map0))
    plt.title(''.join(title_parts))

    lgd_fmt = ['read #bp <1.5kb (n={:,d})', 'read #bp <8kb (n={:,d})', 'read #bp >8kb (n={:,d})', 'All (n={:,d})']
    plt.legend(plt_h, [fmt.format(np.sum(cnt)) for fmt, cnt in zip(lgd_fmt, size_dist)])

    # default output path carries the filter names as suffix
    if configs['output_file'] is None:
        if roi_only or uniq_only:
            suffix = '_{:s}.pdf'.format('-'.join(filter_lst))
        else:
            suffix = '.pdf'
        configs['output_file'] = configs['output_dir'] + '/qc_CirSizeDistribution_' + configs['run_id'] + suffix
    plt.savefig(configs['output_file'], bbox_inches='tight')
def perform_at_across_roi(config_lst, min_n_frg=2, n_perm=1000):
    """Compute and plot a block-by-block association (z-score) matrix across the ROI.

    The ROI is split into 200 bins; around each bin centre a block of three bin
    widths is defined. For every block that does not overlap the viewpoint area,
    ``compute_mc_associations`` provides the observed and permuted coverage of all
    blocks, from which a z-score column is derived. The matrix is saved as PDF.

    Relies on the module-level names ``compute_mc_associations``, ``showprogress``
    and ``MIN_N_POS``.

    Args:
        config_lst: List of config dicts; settings are read from the first one
            ('output_file', 'output_dir', 'roi_start', 'roi_end', 'vp_cnum',
            'prm_start', 'prm_end', 'genome_build'); every entry needs 'run_id'.
        min_n_frg: Forwarded to ``limit_to_roi`` as minimum fragment count.
        n_perm: Number of permutations per block.
    """
    import platform
    import matplotlib
    if platform.system() == 'Linux':
        matplotlib.use('Agg')
    from matplotlib import pyplot as plt, patches
    from matplotlib.colors import LinearSegmentedColormap
    from utilities import load_mc4c, load_annotation, hasOL, flatten, limit_to_roi

    # initialization
    run_id = ','.join([config['run_id'] for config in config_lst])
    configs = config_lst[0]
    if configs['output_file'] is None:
        configs['output_file'] = configs['output_dir'] + '/analysis_atAcrossROI_{:s}.pdf'.format(run_id)

    # create bin list
    edge_lst = np.linspace(configs['roi_start'], configs['roi_end'], num=201, dtype=np.int64).reshape(-1, 1)
    bin_bnd = np.hstack([edge_lst[:-1], edge_lst[1:] - 1])
    bin_w = bin_bnd[0, 1] - bin_bnd[0, 0]
    n_bin = bin_bnd.shape[0]
    del edge_lst

    # make block list: one block of 3 bin widths centred on every bin
    bin_cen = np.mean(bin_bnd, axis=1, dtype=np.int64).reshape(-1, 1)
    blk_crd = np.hstack([
        np.repeat(configs['vp_cnum'], n_bin).reshape(-1, 1),
        bin_cen - int(bin_w * 1.5),
        bin_cen + int(bin_w * 1.5) - 1
    ])
    blk_w = blk_crd[0, 2] - blk_crd[0, 1]
    n_blk = blk_crd.shape[0]

    # define areas (builtin int replaces the removed `np.int` alias)
    roi_cen = np.mean([np.min(configs['prm_start']), np.max(configs['prm_end'])], dtype=int)
    vp_crd = np.array([configs['vp_cnum'], roi_cen - int(bin_w * 1.5), roi_cen + int(bin_w * 1.5)])
    roi_crd = [configs['vp_cnum'], configs['roi_start'], configs['roi_end']]

    # load MC-HC data
    frg_dp = load_mc4c(config_lst, unique_only=True, valid_only=True,
                       min_mq=20, reindex_reads=True, verbose=True)
    read_all = frg_dp[['ReadID', 'Chr', 'ExtStart', 'ExtEnd']].values
    del frg_dp

    # select informative roi-fragments
    read_inf = limit_to_roi(read_all[:, :4], vp_crd=vp_crd, roi_crd=roi_crd, min_n_frg=min_n_frg)
    del read_all

    # re-index reads
    read_inf[:, 0] = np.unique(read_inf[:, 0], return_inverse=True)[1] + 1
    n_read = len(np.unique(read_inf[:, 0]))

    # convert fragments to bin-coverage
    print('Mapping reads to bins ...')
    cfb_lst = [list() for _ in range(n_read + 1)]
    for fi in range(read_inf.shape[0]):
        bin_idx = np.where(hasOL(read_inf[fi, 2:4], bin_bnd))[0]
        cfb_lst[read_inf[fi, 0]].append(bin_idx.tolist())

    # filter reads for (>1 bin cvg); the message was a bare string literal before
    print('Selecting only reads with >1 bins covered')
    valid_lst = []
    for rd_nid in range(1, n_read + 1):
        bin_cvg = np.unique(flatten(cfb_lst[rd_nid]))
        if len(bin_cvg) > 1:
            valid_lst.append(rd_nid)
    read_inf = read_inf[np.isin(read_inf[:, 0], valid_lst), :]

    # reindexing reads
    read_inf[:, 0] = np.unique(read_inf[:, 0], return_inverse=True)[1] + 1
    n_read = np.max(read_inf[:, 0])
    print('{:,d} reads are left after bin-coverage filter.'.format(n_read))

    # get soi info; the index is reset because rows are looked up by position below
    ant_pd = load_annotation(configs['genome_build'], roi_crd=roi_crd).reset_index(drop=True)
    ant_bnd = np.hstack([ant_pd[['ant_pos']].values, ant_pd[['ant_pos']].values])

    # compute score for every block
    print('Computing expected profile for {:d} blocks (required coverage: {:d} reads):'.format(
        n_blk, MIN_N_POS))
    blk_scr = np.full([n_blk, n_blk], fill_value=np.nan)
    y_tick_lbl = [' '] * n_blk
    n_ignored = 0
    for bi in range(n_blk):
        showprogress(bi, n_blk, n_step=20)

        # add axes labels
        ant_idx = np.where(hasOL(blk_crd[bi, 1:], ant_bnd, offset=0))[0]
        if len(ant_idx) > 0:
            y_tick_lbl[bi] = ','.join([ant_pd.loc[i, 'ant_name'] for i in ant_idx])

        # ignore if vp
        if hasOL(blk_crd[bi, :], vp_crd, offset=blk_w)[0]:
            continue

        # compute the observed and background profiles
        blk_obs, blk_rnd, read_pos = compute_mc_associations(
            read_inf, blk_crd[bi, :], blk_crd[:, 1:], n_perm=n_perm, verbose=False)[:3]
        n_pos = len(np.unique(read_pos[:, 0]))
        if n_pos < MIN_N_POS:
            n_ignored += 1
            continue

        # compute the scores; errstate restores the previous numpy error handling
        # afterwards (np.seterr(all=None) leaves it unchanged, i.e. still 'ignore')
        blk_exp = np.mean(blk_rnd, axis=0)
        blk_std = np.std(blk_rnd, axis=0, ddof=0)
        with np.errstate(all='ignore'):
            blk_scr[:, bi] = np.divide(blk_obs - blk_exp, blk_std)

        # remove scores overlapping with positive set
        is_nei = hasOL(blk_crd[bi, 1:], blk_crd[:, 1:], offset=blk_w)
        blk_scr[is_nei, bi] = np.nan

    if n_ignored != 0:
        print('[w] {:d}/{:d} blocks are ignored due to low coverage.'.format(n_ignored, n_blk))

    # plotting the scores
    plt.figure(figsize=(15, 13))
    ax_scr = plt.subplot2grid((40, 40), (0, 0), rowspan=39, colspan=39)
    ax_cmp = plt.subplot2grid((40, 40), (0, 39), rowspan=20, colspan=1)

    # set up color bar
    c_lim = [-6, 6]
    clr_lst = ['#ff1a1a', '#ff7575', '#ffcccc', '#ffffff', '#ffffff', '#ffffff',
               '#ccdfff', '#3d84ff', '#3900f5']
    clr_map = LinearSegmentedColormap.from_list('test', clr_lst, N=9)
    clr_map.set_bad('gray', 0.1)
    norm = matplotlib.colors.Normalize(vmin=c_lim[0], vmax=c_lim[1])
    cbar_h = matplotlib.colorbar.ColorbarBase(ax_cmp, cmap=clr_map, norm=norm)
    cbar_h.ax.set_ylabel('z-score', rotation=90)

    # add score matrix; 'lower' is the documented value for a bottom-left origin
    x_lim = [0, n_blk]
    ax_scr.imshow(blk_scr, extent=x_lim + x_lim, cmap=clr_map, vmin=c_lim[0], vmax=c_lim[1],
                  interpolation='nearest', origin='lower')
    ax_scr.set_xlim(x_lim)
    ax_scr.set_ylim(x_lim)

    # add vp patches
    vp_idx = np.where(hasOL(vp_crd, blk_crd, offset=blk_w))[0]
    ax_scr.add_patch(
        patches.Rectangle([0, vp_idx[0]], n_blk, vp_idx[-1] - vp_idx[0],
                          linewidth=0, edgecolor='None', facecolor='orange'))
    ax_scr.add_patch(
        patches.Rectangle([vp_idx[0], 0], vp_idx[-1] - vp_idx[0], n_blk,
                          linewidth=0, edgecolor='None', facecolor='orange'))

    # adjust ticks: a label shared by several blocks is shown once, at the middle
    # block; the other blocks get a marker character
    for lbl in np.unique(y_tick_lbl):
        if lbl == ' ':
            continue
        idx_lst = np.where(np.isin(y_tick_lbl, lbl))[0]
        if len(idx_lst) > 1:
            kpt_idx = np.mean(idx_lst, dtype=int)
            for idx in idx_lst:
                y_tick_lbl[idx] = 'l'
            y_tick_lbl[kpt_idx] = lbl + ' '

    # final adjustments
    ax_scr.set_xticks(np.arange(n_blk) + 0.5)
    ax_scr.set_yticks(np.arange(n_blk) + 0.5)
    ax_scr.set_xticklabels(y_tick_lbl, rotation=90)
    ax_scr.set_yticklabels(y_tick_lbl)
    ax_scr.set_xlabel('Selected SOIs')
    # data are pooled over all configs, so the title lists every run id
    ax_scr.set_title(
        'Association matrix from {:s}\n'.format(run_id) +
        '#read (#roiFrg>{:d}, ex. vp)={:,d}, '.format(min_n_frg - 1, n_read) +
        'bin-w={:0.0f}; block-w={:0.0f}; #perm={:d}'.format(bin_w, blk_w, n_perm))
    plt.savefig(configs['output_file'], bbox_inches='tight')
def perform_mc_analysis(configs, min_n_frg=2):
    """Compute and plot a bin-by-bin multi-contact z-score matrix for one run.

    The ROI is split into 200 bins. For every bin, reads covering it form the
    positive set; 1000 random samples of equally many negative reads (each with
    one covered bin randomly removed) form the background. Observed coverage of
    every later bin is converted to a z-score against that background and stored
    symmetrically. Viewpoint bins are blanked. The matrix is saved as PDF.

    Args:
        configs: Config dict providing 'output_file', 'output_dir', 'run_id',
            'roi_start', 'roi_end', 'vp_cnum', 'vp_start', 'vp_end' and
            'genome_build'. If 'output_file' is None it is set to a default path.
        min_n_frg: Minimum number of non-viewpoint ROI fragments per read.

    Raises:
        AssertionError: If for some bin the positive set is larger than the
            negative set.
    """
    import platform
    import sys
    if platform.system() == 'Linux':
        import matplotlib
        matplotlib.use('Agg')
    from matplotlib import pyplot as plt, patches
    from matplotlib.colors import LinearSegmentedColormap
    from utilities import load_mc4c, load_annotation, hasOL

    # initialization
    if configs['output_file'] is None:
        configs['output_file'] = configs['output_dir'] + '/analysis_mcTest_' + configs['run_id'] + '.pdf'
    edge_lst = np.linspace(configs['roi_start'], configs['roi_end'], num=201, dtype=np.int64).reshape(-1, 1)
    bin_bnd = np.hstack([edge_lst[:-1], edge_lst[1:] - 1])
    n_bin = bin_bnd.shape[0]
    n_epoch = 1000
    x_lim = [configs['roi_start'], configs['roi_end']]

    # load MC-HC data
    frg_dp = load_mc4c(configs, unique_only=True, valid_only=True, min_mq=20, reindex_reads=False)
    frg_np = frg_dp[['ReadID', 'Chr', 'ExtStart', 'ExtEnd']].values
    del frg_dp

    # select within roi fragments
    vp_crd = [configs['vp_cnum'], configs['vp_start'], configs['vp_end']]
    roi_crd = [configs['vp_cnum'], configs['roi_start'], configs['roi_end']]
    is_vp = hasOL(vp_crd, frg_np[:, 1:4])
    is_roi = hasOL(roi_crd, frg_np[:, 1:4])
    frg_roi = frg_np[~is_vp & is_roi, :]
    del frg_np

    # filter small circles
    cir_size = np.bincount(frg_roi[:, 0])[frg_roi[:, 0]]
    frg_roi = frg_roi[cir_size >= min_n_frg, :]
    n_read = len(np.unique(frg_roi[:, 0]))

    # re-index circles (0-based, used as list index below)
    frg_roi[:, 0] = np.unique(frg_roi[:, 0], return_inverse=True)[1]

    # convert reads to bin coverage (unique covered bins per read)
    cvg_lst = [list() for _ in range(n_read)]
    for fi in range(frg_roi.shape[0]):
        bin_idx = np.where(hasOL(frg_roi[fi, 2:4], bin_bnd))[0]
        cvg_lst[frg_roi[fi, 0]].extend(bin_idx)
    cvg_lst = [np.unique(read_cvg) for read_cvg in cvg_lst]

    # looping over bins
    print('Performing the MC analysis using {:d} reads ...'.format(n_read))
    mat_freq = np.full([n_bin, n_bin], fill_value=np.nan)
    mat_zscr = np.full([n_bin, n_bin], fill_value=np.nan)
    # integer step: "/" would yield a float under true division
    prg_step = max(1, n_bin // 10)
    for bi in range(n_bin):
        if bi % prg_step == 0:
            # progress is written on a single line
            sys.stdout.write('{:0.0f}%, '.format(bi * 100.0 / n_bin))
            sys.stdout.flush()
        is_pos = hasOL(bin_bnd[bi, :], frg_roi[:, 2:4])
        frg_pos = frg_roi[np.isin(frg_roi[:, 0], frg_roi[is_pos, 0]), :]
        frg_neg = frg_roi[~np.isin(frg_roi[:, 0], frg_pos[:, 0]), :]
        ids_pos = np.unique(frg_pos[:, 0])
        ids_neg = np.unique(frg_neg[:, 0])
        n_pos = len(ids_pos)
        n_neg = len(ids_neg)
        assert n_pos <= n_neg
        # bins with fewer than 100 positive reads are skipped (scores stay NaN)
        if n_pos < 100:
            continue

        # calculate the background
        rnd_freq = np.zeros([n_epoch, n_bin])
        for ei in range(n_epoch):
            rnd_lst = np.random.choice(ids_neg, n_pos, replace=False)
            for rd_idx in rnd_lst:
                bin_cvg = cvg_lst[rd_idx]
                n_cvg = len(bin_cvg)
                rnd_freq[ei, bin_cvg] += 1
                # undo one randomly chosen covered bin of this read
                rnd_freq[ei, bin_cvg[np.random.randint(n_cvg)]] -= 1

        # calculate observed
        for bj in range(bi + 1, n_bin):
            is_cov = hasOL(bin_bnd[bj, :], frg_pos[:, 2:4])
            mat_freq[bi, bj] = len(np.unique(frg_pos[is_cov, 0]))
            zscr_avg = np.mean(rnd_freq[:, bj])
            zscr_std = np.std(rnd_freq[:, bj])
            if zscr_std == 0:
                continue
            mat_zscr[bi, bj] = (mat_freq[bi, bj] - zscr_avg) / zscr_std
            mat_zscr[bj, bi] = mat_zscr[bi, bj]
    sys.stdout.write('\n')

    # set vp bins to nan
    is_vp = hasOL([configs['vp_start'], configs['vp_end']], bin_bnd)
    mat_zscr[is_vp, :] = np.nan
    mat_zscr[:, is_vp] = np.nan
    vp_bnd = [bin_bnd[is_vp, 0][0], bin_bnd[is_vp, 1][-1]]

    # plotting
    plt.figure(figsize=(17, 9))
    clr_lst = ['#ff1a1a', '#ff8a8a', '#ffffff', '#ffffff', '#ffffff', '#8ab5ff', '#3900f5']
    clr_map = LinearSegmentedColormap.from_list('test', clr_lst, N=10)
    clr_map.set_bad('gray', 0.05)
    # 'lower' is the documented value for a bottom-left origin
    plt.imshow(mat_zscr, extent=x_lim + x_lim, cmap=clr_map, origin='lower', interpolation='nearest')
    plt.gca().add_patch(
        patches.Rectangle([vp_bnd[0], x_lim[0]], vp_bnd[1] - vp_bnd[0], x_lim[1] - x_lim[0],
                          linewidth=0, edgecolor='None', facecolor='orange'))
    plt.gca().add_patch(
        patches.Rectangle([x_lim[0], vp_bnd[0]], x_lim[1] - x_lim[0], vp_bnd[1] - vp_bnd[0],
                          linewidth=0, edgecolor='None', facecolor='orange'))
    cbar_h = plt.colorbar()
    cbar_h.ax.tick_params(labelsize=14)
    plt.clim(-6, 6)

    # add annotations; index is reset because rows are addressed by position
    ant_pd = load_annotation(configs['genome_build'], roi_crd=roi_crd).reset_index(drop=True)
    for ai in range(ant_pd.shape[0]):
        ant_pos = ant_pd.loc[ai, 'ant_pos']
        plt.text(ant_pos, x_lim[1], ant_pd.loc[ai, 'ant_name'],
                 horizontalalignment='left', verticalalignment='bottom', rotation=60)
        plt.text(x_lim[1], ant_pos, ' ' + ant_pd.loc[ai, 'ant_name'],
                 horizontalalignment='left', verticalalignment='center')
        plt.plot([ant_pos, ant_pos], x_lim, ':', color='#bfbfbf', linewidth=1, alpha=0.4)
        plt.plot(x_lim, [ant_pos, ant_pos], ':', color='#bfbfbf', linewidth=1, alpha=0.4)

    # final adjustments
    plt.xlim(x_lim)
    plt.ylim(x_lim)
    x_ticks = np.linspace(configs['roi_start'], configs['roi_end'], 7, dtype=np.int64)
    x_tick_label = ['{:0.2f}m'.format(x / 1e6) for x in x_ticks]
    plt.xticks(x_ticks, x_tick_label, rotation=0, horizontalalignment='center')
    plt.yticks(x_ticks, x_tick_label, rotation=0)
    plt.title('Multicontact matrix, {:s}\n'.format(configs['run_id']) +
              '#read (#roiFrg>{:d}, ex. vp)={:,d}\n\n\n'.format(min_n_frg - 1, n_read))
    plt.savefig(configs['output_file'], bbox_inches='tight')
def perform_soisoi_analysis(config_lst, min_n_frg=2, n_perm=1000):
    """Compute and plot the SOI-by-SOI association (z-score) matrix.

    For each annotated site of interest (SOI) that does not overlap the viewpoint,
    ``compute_mc_associations`` yields the observed and permuted coverage of all
    SOI windows (three bin widths around each SOI position). The z-scores form one
    column of the matrix, which is drawn with the score printed in every cell and
    saved as PDF.

    Relies on the module-level names ``compute_mc_associations`` and ``MIN_N_POS``.

    Args:
        config_lst: List of config dicts; settings are read from the first one
            ('output_file', 'output_dir', 'roi_start', 'roi_end', 'vp_cnum',
            'vp_start', 'vp_end', 'genome_build', 'bin_width'); every entry needs
            'run_id'.
        min_n_frg: Minimum number of non-viewpoint ROI fragments per read.
        n_perm: Number of permutations per SOI.
    """
    import platform
    import matplotlib
    if platform.system() == 'Linux':
        matplotlib.use('Agg')
    from matplotlib import pyplot as plt
    from matplotlib.colors import LinearSegmentedColormap
    from utilities import load_mc4c, load_annotation, hasOL, flatten

    # initialization
    run_id = ','.join([config['run_id'] for config in config_lst])
    configs = config_lst[0]
    if configs['output_file'] is None:
        configs['output_file'] = configs['output_dir'] + '/analysis_atSOI-SOI_{:s}.pdf'.format(run_id)
    edge_lst = np.linspace(configs['roi_start'], configs['roi_end'], num=201, dtype=np.int64).reshape(-1, 1)
    bin_bnd = np.hstack([edge_lst[:-1], edge_lst[1:] - 1])
    bin_w = bin_bnd[0, 1] - bin_bnd[0, 0]
    del edge_lst

    # load MC-HC data
    frg_dp = load_mc4c(config_lst, unique_only=True, valid_only=True,
                       min_mq=20, reindex_reads=True, verbose=True)
    frg_np = frg_dp[['ReadID', 'Chr', 'ExtStart', 'ExtEnd']].values
    del frg_dp

    # select within roi fragments
    vp_crd = [configs['vp_cnum'], configs['vp_start'], configs['vp_end']]
    roi_crd = [configs['vp_cnum'], configs['roi_start'], configs['roi_end']]
    is_vp = hasOL(vp_crd, frg_np[:, 1:4])
    is_roi = hasOL(roi_crd, frg_np[:, 1:4])
    frg_roi = frg_np[~is_vp & is_roi, :]
    del frg_np

    # filter small reads (>1 roi-frg, ex. vp) and re-index to 1..n_read
    cir_size = np.bincount(frg_roi[:, 0])[frg_roi[:, 0]]
    frg_inf = frg_roi[cir_size >= min_n_frg, :]
    frg_inf[:, 0] = np.unique(frg_inf[:, 0], return_inverse=True)[1] + 1
    n_read = len(np.unique(frg_inf[:, 0]))

    # convert fragments to bin-coverage
    cfb_lst = [list() for _ in range(n_read + 1)]
    for fi in range(frg_inf.shape[0]):
        bin_idx = np.where(hasOL(frg_inf[fi, 2:4], bin_bnd))[0]
        cfb_lst[frg_inf[fi, 0]].append(bin_idx.tolist())

    # filter reads for (>1 bin cvg)
    valid_lst = []
    for rd_nid in range(1, n_read + 1):
        bin_cvg = np.unique(flatten(cfb_lst[rd_nid]))
        if len(bin_cvg) > 1:
            valid_lst.append(rd_nid)
    frg_inf = frg_inf[np.isin(frg_inf[:, 0], valid_lst), :]

    # re-index
    frg_inf[:, 0] = np.unique(frg_inf[:, 0], return_inverse=True)[1] + 1
    n_read = np.max(frg_inf[:, 0])

    # SOI windows (identical for every iteration of the loop below)
    ant_pd = load_annotation(configs['genome_build'], roi_crd=roi_crd).reset_index(drop=True)
    n_ant = ant_pd.shape[0]
    ant_name_lst = ant_pd['ant_name'].values
    ant_pos = ant_pd['ant_pos'].values.reshape(-1, 1)
    ant_bnd = np.hstack([ant_pos - int(bin_w * 1.5), ant_pos + int(bin_w * 1.5)])

    # loop over each SOI
    ant_scr = np.full(shape=[n_ant, n_ant], fill_value=np.nan)
    # builtin int replaces the removed `np.int` alias
    n_pos = np.zeros(n_ant, dtype=int)
    x_tick_lbl = []
    for ai in range(n_ant):
        soi_pd = ant_pd.loc[ai, :]
        soi_crd = [
            soi_pd['ant_cnum'],
            soi_pd['ant_pos'] - int(bin_w * 1.5),
            soi_pd['ant_pos'] + int(bin_w * 1.5)
        ]
        # SOIs overlapping the viewpoint are labelled but not analysed
        if hasOL(vp_crd[1:], soi_crd[1:]):
            x_tick_lbl.append(ant_name_lst[ai])
            continue

        # compute score for annotations
        print('Computing expected profile for {:s}:'.format(soi_pd['ant_name']))
        ant_obs, soi_rnd, frg_pos = compute_mc_associations(frg_inf, soi_crd, ant_bnd, n_perm=n_perm)[:3]
        n_pos[ai] = len(np.unique(frg_pos[:, 0]))
        x_tick_lbl.append('{:s}\n#{:,d}'.format(ant_name_lst[ai], n_pos[ai]))
        del frg_pos

        # check number of positive reads
        if n_pos[ai] <= MIN_N_POS:
            print('[w] #reads (n={:d}) in the positive set is insufficient '.format(n_pos[ai]) +
                  '(required >{:d}). This analysis is ignored ...'.format(MIN_N_POS))
            continue

        # calculate expected profile; errstate restores the previous numpy error
        # handling afterwards (np.seterr(all=None) leaves it unchanged)
        ant_exp = np.mean(soi_rnd, axis=0)
        ant_std = np.std(soi_rnd, axis=0, ddof=0)
        with np.errstate(all='ignore'):
            ant_scr[:, ai] = np.divide(ant_obs - ant_exp, ant_std)

        # set vp and self scores to nan
        is_vp = hasOL(vp_crd[1:], ant_bnd)
        is_soi = hasOL(soi_crd[1:3], ant_bnd)
        ant_scr[is_vp | is_soi, ai] = np.nan

    # plotting
    plt.figure(figsize=(8, 7))
    ax_scr = plt.subplot2grid((40, 40), (0, 0), rowspan=39, colspan=39)
    ax_cmp = plt.subplot2grid((40, 40), (0, 39), rowspan=20, colspan=1)

    # set up colorbar
    c_lim = [-6, 6]
    clr_lst = ['#ff1a1a', '#ff7575', '#ffcccc', '#ffffff', '#ffffff', '#ffffff',
               '#ccdfff', '#3d84ff', '#3900f5']
    clr_map = LinearSegmentedColormap.from_list('test', clr_lst, N=9)
    clr_map.set_bad('gray', 0.2)
    norm = matplotlib.colors.Normalize(vmin=c_lim[0], vmax=c_lim[1])
    cbar_h = matplotlib.colorbar.ColorbarBase(ax_cmp, cmap=clr_map, norm=norm)
    cbar_h.ax.set_ylabel('z-score', rotation=90)
    # colors at both ends of the scale; cells in these colors get white text
    cbar_edge = np.round(cbar_h.cmap(norm(c_lim)), decimals=2)

    # add score matrix; 'lower' is the documented value for a bottom-left origin
    x_lim = [0, n_ant]
    img_h = ax_scr.imshow(ant_scr, extent=x_lim + x_lim, cmap=clr_map,
                          vmin=c_lim[0], vmax=c_lim[1], interpolation='nearest', origin='lower')
    ax_scr.set_xlim(x_lim)
    ax_scr.set_ylim(x_lim)

    # add score values to each box
    for ai in range(n_ant):
        for aj in range(n_ant):
            if np.isnan(ant_scr[ai, aj]):
                continue
            ant_clr = np.round(img_h.cmap(img_h.norm(ant_scr[ai, aj])), decimals=2)
            if np.array_equal(ant_clr, cbar_edge[0]) or np.array_equal(ant_clr, cbar_edge[1]):
                txt_clr = '#ffffff'
            else:
                txt_clr = '#000000'
            ax_scr.text(aj + 0.5, ai + 0.5, '{:+0.1f}'.format(ant_scr[ai, aj]), color=txt_clr,
                        horizontalalignment='center', verticalalignment='center', fontsize=12)

    # final adjustments
    ax_scr.set_xticks(np.arange(n_ant) + 0.5)
    ax_scr.set_yticks(np.arange(n_ant) + 0.5)
    ax_scr.set_xticklabels(x_tick_lbl)
    ax_scr.set_yticklabels(ant_name_lst)
    ax_scr.set_xlabel('Selected SOIs')
    # NOTE(review): the title reports configs['bin_width'], while the analysis uses
    # the locally derived `bin_w`; confirm both agree.
    ax_scr.set_title(
        'Association matrix from {:s}\n'.format(run_id) +
        '#read (#roiFrg>{:d}, ex. vp)={:,d}, '.format(min_n_frg - 1, n_read) +
        'bin-w={:d}; #perm={:d}'.format(configs['bin_width'], n_perm))
    plt.savefig(configs['output_file'], bbox_inches='tight')
def perform_vpsoi_analysis(configs, soi_name, min_n_frg=2, n_perm=1000):
    """Compute and plot the VP-SOI association profile for one site of interest.

    Reads covering the window around the selected SOI (three bin widths) form the
    positive set. Observed versus permuted coverage is computed once per ROI bin
    (200 bins) and once per annotation window via ``compute_mc_associations``, and
    converted to z-scores. The figure shows the observed and expected profiles, a
    z-score strip, and a colored z-score marker per annotation; it is saved as PDF.

    Relies on the module-level names ``compute_mc_associations`` and ``MIN_N_POS``.

    Args:
        configs: Config dict providing 'output_file', 'output_dir', 'run_id',
            'roi_start', 'roi_end', 'vp_cnum', 'vp_start', 'vp_end' and
            'genome_build'. If 'output_file' is None it is set to a default path.
        soi_name: Name of the annotation to use as SOI; must match exactly one
            annotation inside the ROI.
        min_n_frg: Minimum number of non-viewpoint ROI fragments per read.
        n_perm: Number of permutations.

    Returns:
        None. Returns early, without plotting, if the SOI overlaps the viewpoint
        or the positive set has fewer than ``MIN_N_POS`` reads.

    Raises:
        AssertionError: If ``soi_name`` does not match exactly one annotation.
    """
    import platform
    import matplotlib
    if platform.system() == 'Linux':
        matplotlib.use('Agg')
    from matplotlib import pyplot as plt, patches
    from matplotlib.colors import LinearSegmentedColormap
    from utilities import load_mc4c, load_annotation, hasOL, flatten

    # initialization
    if configs['output_file'] is None:
        configs['output_file'] = configs['output_dir'] + \
            '/analysis_atVP-SOI_{:s}_{:s}.pdf'.format(configs['run_id'], soi_name)
    edge_lst = np.linspace(configs['roi_start'], configs['roi_end'], num=201, dtype=np.int64).reshape(-1, 1)
    bin_bnd = np.hstack([edge_lst[:-1], edge_lst[1:] - 1])
    bin_cen = np.mean(bin_bnd, axis=1, dtype=np.int64)
    bin_w = bin_bnd[0, 1] - bin_bnd[0, 0]
    x_lim = [configs['roi_start'], configs['roi_end']]
    y_lim = [0, 10]

    # load MC-HC data
    frg_dp = load_mc4c(configs, unique_only=True, valid_only=True,
                       min_mq=20, reindex_reads=True, verbose=True)
    frg_np = frg_dp[['ReadID', 'Chr', 'ExtStart', 'ExtEnd']].values
    del frg_dp

    # select within roi fragments
    vp_crd = [configs['vp_cnum'], configs['vp_start'], configs['vp_end']]
    roi_crd = [configs['vp_cnum'], configs['roi_start'], configs['roi_end']]
    is_vp = hasOL(vp_crd, frg_np[:, 1:4])
    is_roi = hasOL(roi_crd, frg_np[:, 1:4])
    frg_roi = frg_np[~is_vp & is_roi, :]
    del frg_np

    # filter small circles (>1 roi-frg, ex. vp) and re-index to 1..n_read
    cir_size = np.bincount(frg_roi[:, 0])[frg_roi[:, 0]]
    frg_inf = frg_roi[cir_size >= min_n_frg, :]
    frg_inf[:, 0] = np.unique(frg_inf[:, 0], return_inverse=True)[1] + 1
    n_read = len(np.unique(frg_inf[:, 0]))

    # convert fragments to bin-coverage
    cfb_lst = [list() for _ in range(n_read + 1)]
    for fi in range(frg_inf.shape[0]):
        bin_idx = np.where(hasOL(frg_inf[fi, 2:4], bin_bnd))[0]
        cfb_lst[frg_inf[fi, 0]].append(bin_idx.tolist())

    # filter circles for (>1 bin cvg)
    valid_lst = []
    for rd_nid in range(1, n_read + 1):
        bin_cvg = np.unique(flatten(cfb_lst[rd_nid]))
        if len(bin_cvg) > 1:
            valid_lst.append(rd_nid)
    frg_inf = frg_inf[np.isin(frg_inf[:, 0], valid_lst), :]
    frg_inf[:, 0] = np.unique(frg_inf[:, 0], return_inverse=True)[1] + 1
    n_read = np.max(frg_inf[:, 0])

    # get soi info
    ant_pd = load_annotation(configs['genome_build'], roi_crd=roi_crd).reset_index(drop=True)
    n_ant = ant_pd.shape[0]
    is_in = np.where(np.isin(ant_pd['ant_name'], soi_name))[0]
    assert len(is_in) == 1
    soi_pd = ant_pd.loc[is_in[0], :]
    soi_crd = [
        soi_pd['ant_cnum'],
        soi_pd['ant_pos'] - int(bin_w * 1.5),
        soi_pd['ant_pos'] + int(bin_w * 1.5)
    ]
    if hasOL(soi_crd, vp_crd)[0]:
        # this warning was a bare string literal before and was never shown
        print('[w] Selected SOI coordinate overlaps with the view point. Ignoring the analysis')
        return

    # compute positive profile and backgrounds
    print('Computing expected profile for bins:')
    prf_frq, prf_rnd, frg_pos, _ = compute_mc_associations(frg_inf, soi_crd, bin_bnd, n_perm=n_perm)
    n_pos = len(np.unique(frg_pos[:, 0]))
    prf_obs = prf_frq * 100.0 / n_pos
    print('{:,d} reads are found to cover '.format(n_pos) +
          '{:s} area ({:s}:{:d}-{:d})'.format(soi_pd['ant_name'], soi_pd['ant_chr'], soi_crd[1], soi_crd[2]))

    # check enough #pos
    if n_pos < MIN_N_POS:
        print('[w] #reads in the positive set is insufficient (n={:d}, required >{:d})'.format(
            n_pos, MIN_N_POS))
        print('Analysis is ignored ...')
        return

    # compute scores; errstate restores the previous numpy error handling
    # afterwards (np.seterr(all=None) leaves it unchanged, i.e. still 'ignore')
    nrm_rnd = prf_rnd * 100.0 / n_pos
    prf_exp = np.mean(nrm_rnd, axis=0)
    prf_std = np.std(nrm_rnd, axis=0, ddof=0)
    with np.errstate(all='ignore'):
        bin_scr = np.divide(prf_obs - prf_exp, prf_std)

    # set vp bins to nan
    vp_bnd = [configs['vp_start'], configs['vp_end']]
    is_vp = hasOL(vp_bnd, bin_bnd)
    bin_scr[is_vp] = np.nan

    # compute score for annotations
    print('Computing expected profile for annotations:')
    ant_pos = ant_pd['ant_pos'].values.reshape(-1, 1)
    ant_bnd = np.hstack([ant_pos - int(bin_w * 1.5), ant_pos + int(bin_w * 1.5)])
    ant_obs, soi_rnd = compute_mc_associations(frg_inf, soi_crd, ant_bnd, n_perm=n_perm)[:2]
    ant_exp = np.mean(soi_rnd, axis=0)
    ant_std = np.std(soi_rnd, axis=0, ddof=0)
    with np.errstate(all='ignore'):
        ant_scr = np.divide(ant_obs - ant_exp, ant_std)

    # set vp and self scores to nan
    is_vp = hasOL(vp_bnd, ant_bnd)
    is_soi = hasOL(soi_crd[1:3], ant_bnd)
    ant_scr[is_vp | is_soi] = np.nan

    # plotting
    plt.figure(figsize=(15, 3))
    ax_prf = plt.subplot2grid((20, 40), (0, 0), rowspan=19, colspan=39)
    ax_cmp = plt.subplot2grid((20, 40), (0, 39), rowspan=10, colspan=1)
    ax_scr = plt.subplot2grid((20, 40), (19, 0), rowspan=1, colspan=39)

    # set up colorbar
    c_lim = [-6, 6]
    clr_lst = ['#ff1a1a', '#ff7575', '#ffcccc', '#ffffff', '#ffffff', '#ffffff',
               '#ccdfff', '#3d84ff', '#3900f5']
    clr_map = LinearSegmentedColormap.from_list('test', clr_lst, N=9)
    clr_map.set_bad('gray', 0.05)
    norm = matplotlib.colors.Normalize(vmin=c_lim[0], vmax=c_lim[1])
    cbar_h = matplotlib.colorbar.ColorbarBase(ax_cmp, cmap=clr_map, norm=norm)
    cbar_h.ax.set_ylabel('z-score', rotation=90)

    # profile plot
    ax_prf.plot(bin_cen, prf_obs, color='#5757ff', linewidth=1)
    ax_prf.plot(bin_cen, prf_exp, color='#cccccc', linewidth=1)
    ax_prf.fill_between(bin_cen, prf_exp - prf_std, prf_exp + prf_std, color='#ebebeb', linewidth=0.2)
    ax_prf.add_patch(
        patches.Rectangle([vp_bnd[0], y_lim[0]], vp_bnd[1] - vp_bnd[0], y_lim[1] - y_lim[0],
                          edgecolor='None', facecolor='orange', zorder=100))
    ax_prf.add_patch(
        patches.Rectangle([soi_crd[1], y_lim[0]], soi_crd[2] - soi_crd[1], y_lim[1] - y_lim[0],
                          edgecolor='None', facecolor='green', zorder=100))
    ax_prf.set_xlim(x_lim)
    ax_prf.set_ylim(y_lim)
    ax_prf.set_xticks([])

    # add score plot
    ax_scr.imshow(bin_scr.reshape(1, -1), extent=x_lim + [-500, 500], cmap=clr_map,
                  vmin=c_lim[0], vmax=c_lim[1], interpolation='nearest')
    ax_scr.set_xlim(x_lim)
    ax_scr.set_yticks([])

    # add annotations
    for ai in range(n_ant):
        ax_prf.text(ant_pos[ai], y_lim[1], ant_pd.loc[ai, 'ant_name'],
                    horizontalalignment='center', verticalalignment='bottom', rotation=60)
        ax_prf.plot(ant_pos[[ai, ai]], y_lim, ':', color='#bfbfbf', linewidth=1, alpha=0.4)
        if not np.isnan(ant_scr[ai]):
            # a colormap expects a value normalised to [0, 1]; a raw z-score would
            # saturate at either end of the scale
            ax_prf.add_patch(
                patches.Rectangle([ant_bnd[ai, 0], y_lim[1] - 0.15],
                                  ant_bnd[ai, 1] - ant_bnd[ai, 0], 0.15,
                                  edgecolor='None', facecolor=clr_map(norm(ant_scr[ai])), zorder=10))
            ax_prf.text(ant_pos[ai], y_lim[1] - 0.2, '{:+0.1f}'.format(ant_scr[ai]),
                        horizontalalignment='center', verticalalignment='top',
                        fontweight='bold', fontsize=6)

    # final adjustments
    x_ticks = np.linspace(configs['roi_start'], configs['roi_end'], 7, dtype=np.int64)
    y_ticks = ax_prf.get_yticks()
    x_tick_label = ['{:0.2f}m'.format(x / 1e6) for x in x_ticks]
    y_tick_label = ['{:0.0f}%'.format(y) for y in y_ticks]
    ax_scr.set_xticks(x_ticks)
    ax_scr.set_xticklabels(x_tick_label)
    ax_prf.set_yticklabels(y_tick_label)
    ax_prf.set_ylabel('Percentage of reads')
    ax_prf.set_title(
        'VP-SOI from {:s}, using as SOI {:s}\n'.format(configs['run_id'], soi_name) +
        '#read (#roiFrg>{:d}, ex. vp)={:,d}, '.format(min_n_frg - 1, n_read) +
        '#pos = {:d}\n#perm={:d}\n\n\n'.format(n_pos, n_perm))
    plt.savefig(configs['output_file'], bbox_inches='tight')