# NOTE: these snippets assume a hiclib/mirnylib-style environment, e.g.:
#   import math, pickle
#   import numpy as np
#   import pandas as pd
#   import cooler
#   from mirnylib.genome import Genome
#   from mirnylib.h5dict import h5dict
#   from mirnylib.numutils import observedOverExpected, completeIC, EIG
#   from hiclib import binnedData
# (getResolution, read_cooler, snipper, shuffle_segmentation and get_logger
# are project helpers defined elsewhere.)


def doSaddle(filename, eig, gen):
    """Per-chromosome 5x5 saddle matrices from a cooler file, binned by
    quintiles of the eigenvector `eig`."""
    c = cooler.Cooler(filename)
    gen = Genome("/home/magus/HiC2011/data/" + gen, readChrms=["#", "X"])
    gen.setResolution(getResolution(filename))
    saddles = []
    for chrom in range(gen.chrmCount):
        saddle = np.zeros((5, 5), dtype=float)
        st = gen.chrmStartsBinCont[chrom]
        end = gen.chrmEndsBinCont[chrom]
        cur = c.matrix(balance=False).fetch(gen.idx2label[chrom])
        cur = observedOverExpected(cur)
        # Drop empty rows/columns and the matching eigenvector entries
        mask = np.sum(cur, axis=0) > 0
        cur = cur[mask]
        cur = cur[:, mask]
        GC = eig[st:end]
        GC = GC[mask]
        if len(GC) > 5:
            for i in range(5):
                for j in range(5):
                    G1, G2 = np.percentile(GC, [20 * i, 20 * i + 20])
                    mask1 = (GC > G1) * (GC < G2)
                    G1, G2 = np.percentile(GC, [20 * j, 20 * j + 20])
                    mask2 = (GC > G1) * (GC < G2)
                    saddle[i, j] += cur[np.ix_(mask1, mask2)].mean()
        saddles.append(saddle)
    return saddles
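# A minimal, self-contained sketch of the quintile logic above (synthetic
# data only). Note the strict inequalities drop values that fall exactly on
# a percentile boundary; np.digitize keeps them, shown for comparison.
import numpy as np

rng = np.random.RandomState(0)
E1_demo = rng.randn(1000)

for i in range(5):
    lo, hi = np.percentile(E1_demo, [20 * i, 20 * i + 20])
    quintile = (E1_demo > lo) * (E1_demo < hi)   # as in doSaddle
    print(i, quintile.sum())                     # slightly under 200 per bin

edges = np.percentile(E1_demo, [20, 40, 60, 80])
labels = np.digitize(E1_demo, edges)             # boundary-inclusive 0..4 labels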
def doSaddles(q, E1_values, genome_db):
    """Saddle matrices for a whole-genome heatmap `q`, plus the compartment
    strength log(AA * BB / (AB * BA)) of the chromosome-averaged saddle."""
    saddles = {}
    for chrom in range(genome_db.chrmCount):
        saddle = np.ones((5, 5), dtype=float)
        st = genome_db.chrmStartsBinCont[chrom]
        end = genome_db.chrmEndsBinCont[chrom]
        cur = q[st:end, st:end]
        E1 = E1_values[st:end]
        mask = np.sum(cur, axis=0) > 0
        if sum(mask) > 5:
            cur = cur[mask]
            cur = cur[:, mask]
            cur = observedOverExpected(cur)
            E1 = E1[mask]
            assert cur.shape[0] == cur.shape[1] == len(E1)
            for i in range(5):
                for j in range(5):
                    P1, P2 = np.percentile(E1, [20 * i, 20 * i + 20])
                    mask1 = (E1 > P1) * (E1 < P2)
                    P1, P2 = np.percentile(E1, [20 * j, 20 * j + 20])
                    mask2 = (E1 > P1) * (E1 < P2)
                    if sum(mask1) * sum(mask2) != 0:
                        saddle[i, j] = np.nanmean(cur[np.ix_(mask1, mask2)])
                    else:
                        saddle[i, j] = np.nan
            saddles[genome_db.idx2label[chrom]] = saddle
        else:
            pass  # print("Omitting chromosome", genome_db.idx2label[chrom])

    all_average = np.zeros((5, 5), dtype=float)
    for i in range(5):
        for j in range(5):
            all_average[i, j] = np.average([
                saddles[c][i, j] for c in saddles
                if not np.isnan(saddles[c][i, j])
            ])
    saddles["all_average"] = all_average
    # AA * BB over the two A-B corners; the original divided by
    # all_average[0, -1] twice, which looks like a typo for [-1, 0]
    strength = math.log(all_average[0, 0] * all_average[-1, -1] /
                        (all_average[0, -1] * all_average[-1, 0]))
    return saddles, strength
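# Worked example of the strength formula on a toy saddle, assuming the
# intended denominator is the product of the two A-B corners (AB * BA):
import math
import numpy as np

toy = np.array([[1.6, 1.1, 1.0, 0.9, 0.7],
                [1.1, 1.2, 1.0, 0.9, 0.8],
                [1.0, 1.0, 1.0, 1.0, 1.0],
                [0.9, 0.9, 1.0, 1.2, 1.3],
                [0.7, 0.8, 1.0, 1.3, 1.8]])
strength_demo = math.log(toy[0, 0] * toy[-1, -1] / (toy[0, -1] * toy[-1, 0]))
print(strength_demo)   # log(1.6 * 1.8 / (0.7 * 0.7)) ~ 1.77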
def doSaddleError(filename, eig, gen, correct=False):
    """Genome-wide saddle plus 100 bootstrap replicates (each saddle cell
    resampled with replacement) for error estimation."""
    gen = Genome("/home/magus/HiC2011/data/" + gen, readChrms=["#", "X"])
    data = h5dict(filename, 'r')["heatmap"]
    if correct:
        data = completeIC(data)
    gen.setResolution(getResolution(filename))
    if eig == "GC":
        eig = np.concatenate(gen.GCBin)
    saddle = np.zeros((5, 5), dtype=float)
    permuted = [np.zeros((5, 5), dtype=float) for _ in range(100)]
    for chrom in range(gen.chrmCount):
        st = gen.chrmStartsBinCont[chrom]
        end = gen.chrmEndsBinCont[chrom]
        cur = data[st:end, st:end]
        cur = observedOverExpected(cur)
        mask = np.sum(cur, axis=0) > 0
        cur = cur[mask]
        cur = cur[:, mask]
        GC = eig[st:end]
        GC = GC[mask]
        if len(GC) > 5:
            for i in range(5):
                for j in range(5):
                    G1, G2 = np.percentile(GC, [20 * i, 20 * i + 20])
                    mask1 = (GC > G1) * (GC < G2)
                    G1, G2 = np.percentile(GC, [20 * j, 20 * j + 20])
                    mask2 = (GC > G1) * (GC < G2)
                    addition = cur[np.ix_(mask1, mask2)]
                    addition = np.reshape(addition, (-1))
                    # Bootstrap: resample this cell's values 100 times
                    for k in range(100):
                        resampled = np.random.choice(addition, len(addition),
                                                     replace=True)
                        permuted[k][i, j] += resampled.mean()
                    saddle[i, j] += addition.mean()
    return saddle, permuted
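# Standalone sketch of the bootstrap used in doSaddleError: resample the
# values of one saddle cell with replacement and read an error bar off the
# spread of the resampled means (synthetic values, 100 resamples):
import numpy as np

rng = np.random.RandomState(1)
values = rng.lognormal(size=500)   # stand-in for one cell's O/E values
boot_means = [rng.choice(values, len(values), replace=True).mean()
              for _ in range(100)]
print(np.mean(boot_means), np.percentile(boot_means, [2.5, 97.5]))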
def get_by_chr_E1(genome_db, resolution):
    # `heatmap_filepath` is a script-level global set before this is called
    if heatmap_filepath.endswith(".IC"):
        raw = heatmap_filepath[:-3]
    else:
        raw = heatmap_filepath
    print("Using raw heatmap", raw)
    global BD_raw
    BD_raw = binnedData.binnedData(resolution, genome_db)
    BD_raw.simpleLoad(raw, 'heatmap')
    BD_raw.removeDiagonal()

    # Remove bins with less than half of a bin sequenced
    BD_raw.removeBySequencedCount(0.5)
    # We'll do iterative correction and eigenvector expansion on trans data only!
    # We want to remove cis, because later we want to remove poor regions in trans
    BD_raw.removeCis()
    # Truncate top 0.05% of interchromosomal counts (possibly, PCR blowouts).
    # Do this before removing poor regions, because single blowouts may give
    # lots of contacts to a region which does not have many contacts otherwise.
    BD_raw.truncTrans(high=0.0005)
    # Remove 1% of regions with low coverage
    BD_raw.removePoorRegions(cutoff=1)
    # Fake cis counts. Data gets iteratively corrected during this process...
    BD_raw.fakeCis()
    # Remove bins with zero counts for eigenvector analysis -- done per
    # chromosome in the loop below instead of BD.removeZeros()

    # Perform eigenvector expansion
    result = {"OE": {}, "Classic": {}, "genome_wide_Classic": {}}
    genom_wide_E1 = np.genfromtxt(raw + ".eig", dtype=None)['f2']
    for chrom in range(genome_db.chrmCount):
        st = genome_db.chrmStartsBinCont[chrom]
        end = genome_db.chrmEndsBinCont[chrom]
        cur = BD_raw.dataDict['heatmap'][st:end, st:end]
        mask = np.sum(cur, axis=0) > 0
        if sum(mask) > 5:
            cur = cur[mask]
            cur = cur[:, mask]
            currentEIG, eigenvalues = EIG(cur, numPCs=1)
            # Eigenvector sign is arbitrary: orient E1 to correlate
            # positively with GC content
            if spearmanr(currentEIG[0],
                         BD_raw.trackDict["GC"][st:end][mask])[0] < 0:
                currentEIG[0] = -currentEIG[0]
            E1 = np.empty(shape=(len(mask), )) * np.nan
            E1[mask] = currentEIG[0]
            result["Classic"][chrom] = E1

            # Observed/expected variant, recomputed on the masked matrix.
            # Note mask2 indexes the already-masked bins, so this E1 is
            # defined on the masked chromosome, as in the original script.
            cur = observedOverExpected(cur)
            mask2 = np.sum(cur, axis=0) > 0
            if sum(mask2) > 5:
                cur = cur[mask2]
                cur = cur[:, mask2]
                currentEIG, eigenvalues = EIG(cur, numPCs=1)
                gc_masked = BD_raw.trackDict["GC"][st:end][mask][mask2]
                if spearmanr(currentEIG[0], gc_masked)[0] < 0:
                    currentEIG[0] = -currentEIG[0]
                E1 = np.empty(shape=(len(mask2), )) * np.nan
                E1[mask2] = currentEIG[0]
                result["OE"][chrom] = E1
        result["genome_wide_Classic"][chrom] = genom_wide_E1[st:end]
    return result
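# Minimal illustration of the sign convention applied above: PCA eigenvector
# signs are arbitrary, so E1 is flipped to correlate positively with GC
# content (synthetic vectors; spearmanr as in get_by_chr_E1):
import numpy as np
from scipy.stats import spearmanr

rng = np.random.RandomState(2)
gc_demo = rng.rand(100)
E1_demo = -gc_demo + rng.randn(100) * 0.1   # an eigenvector that came out inverted

if spearmanr(E1_demo, gc_demo)[0] < 0:
    E1_demo = -E1_demo                      # orient the A compartment toward high GC
assert spearmanr(E1_demo, gc_demo)[0] > 0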
def snip(segmentation, map, output_prefix, format, balance, niter, window,
         enrichment_only, diagonals_to_remove):
    """
    Create snips for TADs and calculate enrichment with shuffled control.

    OUTPUT_PREFIX: The prefix for writing output files
    (pickle with snips and tsv file with TAD info).

    Output files to be created:
    {OUTPUT_PREFIX}.TADmetadata.tsv
    if not --enrichment-only:
        {OUTPUT_PREFIX}.TADsnips.pickle
        {OUTPUT_PREFIX}.TADsnips_shuf0.pickle
        etc.

    Example run:
    avTAD snip data/OSC_TADS.bed data/OSC_dm3.cool tmp_results --format cool --diagonals-to-remove 2 --balance --niter 2
    """
    logger = get_logger(__name__)
    logger.info(
        f"Running snipping for: segmentation file {segmentation}, heatmap {map} in {format} format ..."
    )

    logger.info(f"Reading {map} file with balance={balance} in {format} format ...")
    if format == 'cool':
        dataset, chrms, resolution = read_cooler(map, balance=balance)
    elif format == 'hiclib_heatmap':
        dataset, chrms, resolution = read_hiclib_heatmap(map, balance=balance)
    elif format == 'hiclib_bychr':
        dataset, chrms, resolution = read_hiclib_bychr(map, balance=balance)
    else:
        raise Exception(f'Map format {format} is not supported ...')

    logger.info(f"Reading segmentation file: {segmentation}")
    # r'\s+' matches runs of whitespace; the original sep='\s' matched
    # exactly one whitespace character
    df_segmentation = pd.read_csv(segmentation, sep=r'\s+', header=None,
                                  engine='python')
    add_columns = list(df_segmentation.columns[3:]) if len(
        df_segmentation.columns) > 3 else []
    df_segmentation.columns = ['ch', 'bgn', 'end'] + add_columns
    df_segmentation.loc[:, 'bgn_bin'] = df_segmentation.bgn // resolution
    df_segmentation.loc[:, 'end_bin'] = df_segmentation.end // resolution
    df_segmentation.loc[:, 'TAD_size'] = df_segmentation.end_bin - df_segmentation.bgn_bin
    df_segmentation = df_segmentation.drop_duplicates().sort_values(
        ['ch', 'bgn_bin']).reset_index(drop=True)

    chrms_used = np.unique(df_segmentation.loc[:, 'ch'].values)
    chrms = [ch for ch in chrms if ch in chrms_used]

    logger.info(f"Chromosomes in the dataset: {dataset.keys()}")
    logger.info(
        f"Lengths of chromosomes in bins of {resolution} bp: \n{[(ch, len(dataset[ch])) for ch in chrms]}"
    )
    logger.info(f"Selected chromosomes are: {chrms}")

    # Creating shuffled segmentations
    def shuffle_segmentation_dataframe(x):
        segmentation = x[['bgn_bin', 'end_bin']].values
        shuf, order = shuffle_segmentation(segmentation)
        ret = pd.DataFrame(shuf, columns=['bgn_bin', 'end_bin']).astype(int)
        ret.loc[:, 'index'] = order
        ret.loc[:, 'TAD_size'] = ret.end_bin - ret.bgn_bin
        ret = ret.sort_values('index').reset_index(drop=True)
        # Shuffling permutes TAD positions but must preserve each TAD's size
        assert np.all(x['TAD_size'].values == ret['TAD_size'].values)
        return ret

    for i in range(niter):
        df_segmentation_shuffled = df_segmentation.groupby('ch').apply(shuffle_segmentation_dataframe)\
            .reset_index().drop(['level_1', 'index', 'ch', 'TAD_size'], axis=1)
        df_segmentation_shuffled.columns = [
            f'bgn_bin_shuf{i}', f'end_bin_shuf{i}'
        ]
        df_segmentation = pd.merge(df_segmentation,
                                   df_segmentation_shuffled,
                                   left_index=True,
                                   right_index=True)

    # Computing observed over expected
    dataset_obsexp = {}
    for ch in chrms:
        mtx = numutils.observedOverExpected(dataset[ch])
        for i in range(1, diagonals_to_remove):
            np.fill_diagonal(mtx[i:, :-i], np.nan)
            np.fill_diagonal(mtx[:-i, i:], np.nan)
        if diagonals_to_remove:
            np.fill_diagonal(mtx, np.nan)
        dataset_obsexp.update({ch: mtx})

    for mod in [''] + [f'_shuf{i}' for i in range(niter)]:
        enrichments = []
        for i, r in df_segmentation.iterrows():
            mtx = np.log2(
                dataset_obsexp[r.ch][r[f'bgn_bin{mod}']:r[f'end_bin{mod}'],
                                     r[f'bgn_bin{mod}']:r[f'end_bin{mod}']])
            mtx[np.isinf(mtx)] = np.nan
            enrichment = (np.nansum(mtx), np.nanmean(mtx), np.nanmedian(mtx),
                          np.sum(np.isfinite(mtx)))
            enrichments.append(np.array(enrichment))
        enrichments = np.array(enrichments)
        df_segmentation.loc[:, f"sum{mod}"] = enrichments[:, 0]
        df_segmentation.loc[:, f"mean{mod}"] = enrichments[:, 1]
        df_segmentation.loc[:, f"median{mod}"] = enrichments[:, 2]
        df_segmentation.loc[:, f"nelements{mod}"] = enrichments[:, 3]

    # Save enrichment dataframe to a file:
    cols = ['bgn_bin', 'end_bin', 'sum', 'mean', 'median', 'nelements']
    columns = ['ch', 'bgn', 'end', 'TAD_size'] + add_columns + cols + [
        f'{x}_shuf{i}' for i in range(niter) for x in cols
    ]
    df_segmentation[columns].to_csv(f"{output_prefix}.TADmetadata.tsv",
                                    sep='\t', index=True, header=True)

    if not enrichment_only:
        # Retrieval of snippets, log2 and filling inf with nans included:
        snips = snipper(segmentations=df_segmentation,
                        dataset=dataset_obsexp,
                        window=window)
        # Save snippets to file:
        pickle.dump(snips, open(f"{output_prefix}.TADsnips.pickle", 'wb'))

        for i in range(niter):
            # Retrieval of snippets:
            snips = snipper(segmentations=df_segmentation,
                            dataset=dataset_obsexp,
                            window=window,
                            key_bgn=f'bgn_bin_shuf{i}',
                            key_end=f'end_bin_shuf{i}')
            # Save snippets to file:
            pickle.dump(snips,
                        open(f"{output_prefix}.TADsnips_shuf{i}.pickle", 'wb'))
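# Standalone illustration of the diagonal-masking trick used in `snip`:
# np.fill_diagonal on the offset views mtx[i:, :-i] and mtx[:-i, i:] blanks
# the i-th sub- and super-diagonals in place (toy matrix, 2 diagonals):
import numpy as np

mtx_demo = np.arange(36, dtype=float).reshape(6, 6)
diagonals_to_remove = 2
for i in range(1, diagonals_to_remove):
    np.fill_diagonal(mtx_demo[i:, :-i], np.nan)   # i-th diagonal below the main
    np.fill_diagonal(mtx_demo[:-i, i:], np.nan)   # i-th diagonal above the main
if diagonals_to_remove:
    np.fill_diagonal(mtx_demo, np.nan)            # the main diagonal itself
print(mtx_demo)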
def cis_eig(A, k=3, robust=True, gc=None, classic=False):
    """
    Compute compartment eigenvectors on a cis (single-chromosome) matrix.

    Parameters
    ----------
    A : 2D array
        Balanced whole-genome contact matrix.
    k : int
        Number of eigenvectors to compute; default = 3.
    robust : bool
        Clip top 0.1 percentile and smooth the first two diagonals.
    gc : 1D array, optional
        GC content per bin for choosing and orienting the primary compartment
        eigenvector; not performed if no array is provided.
    classic : bool
        Do it old-school: iterative correction and mean-centering instead of
        working with O/E - 1.

    Returns
    -------
    eigenvalues, eigenvectors
    """
    A = np.array(A)
    A[~np.isfinite(A)] = 0

    mask = A.sum(axis=0) > 0

    if A.shape[0] <= 5 or mask.sum() <= 5:
        return (np.array([np.nan for i in range(k)]),
                np.array([np.ones(A.shape[0]) * np.nan for i in range(k)]))

    if robust:
        A = np.clip(A, 0, np.percentile(A, 99.9))
        # Smooth the first two diagonals with twice the mean of diagonal 2
        fill_value = np.mean(np.diag(A, 2) * 2)
        for d in [-1, 0, 1]:
            numutils.fillDiagonal(A, fill_value, d)
        A[~mask, :] = 0
        A[:, ~mask] = 0

    OE = numutils.observedOverExpected(A[mask, :][:, mask])

    if robust:
        OE = np.clip(OE, 0, np.percentile(OE, 99.9))

    if classic:
        OE = numutils.iterativeCorrection(OE)[0]
        if (~np.isfinite(OE)).sum() > 0:
            # NaN eigenvalues first, NaN eigenvectors second, matching the
            # return order used everywhere else (the original had them swapped)
            return (
                np.array([np.nan for i in range(k)]),
                np.array([np.ones(A.shape[0]) * np.nan for i in range(k)]),
            )
        # mean-centered (subtract mean)
        eigvecs_compressed, eigvals = numutils.EIG(OE, k)
    else:
        eigvecs_compressed, eigvals = numutils.EIG((OE - 1.0), k,
                                                   subtractMean=False,
                                                   divideByMean=False)

    # Restore full eigs
    eigvecs = []
    for i in range(k):
        v = np.ones(mask.shape[0]) * np.nan
        v[mask] = eigvecs_compressed[i]
        eigvecs.append(v)
    eigvecs = np.array(eigvecs)

    # Orient and reorder
    eigvals, eigvecs = _orient_eigs(eigvals, eigvecs, gc)

    return eigvals, eigvecs
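# Small sketch of the mask bookkeeping in cis_eig: eigenvectors are computed
# on the nonzero submatrix and then re-expanded to full chromosome length
# with NaNs at the filtered-out bins (toy mask only):
import numpy as np

mask_demo = np.array([True, False, True, True, False])
compressed = np.array([0.5, -0.2, 0.7])   # eigenvector values for the kept bins

v = np.ones(mask_demo.shape[0]) * np.nan
v[mask_demo] = compressed
print(v)                                  # [ 0.5  nan -0.2  0.7  nan]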
sys.path.append("/mnt/storage/home/vsfishman/HiC/tutorial_Fishman/utils") import figPath import ntpath figure_path=figPath.figure_path+ntpath.basename(heatmap_filepath)+"_"+'Compartment_strength' genome_db = genome.Genome("/mnt/storage/home/vsfishman/HiC/fasta/GalGal5/GCF_000002315.4_Gallus_gallus-5.0_assembly_structure/Primary_Assembly/assembled_chromosomes/FASTA/", readChrms=[], chrmFileTemplate="%s.fna") resolution = int(heatmap_filepath.split("-")[-1].split("k")[0])*1000 print "Resolution determined: ",resolution print "Loading file "+heatmap_filepath BD = binnedData.binnedData(resolution, genome_db) BD.simpleLoad(heatmap_filepath, 'heatmap') q=BD.dataDict['heatmap'] obs_exp = observedOverExpected(q) E1_values = np.loadtxt(E1_file) assert E1_values.shape()[0] = genome_db.numBins bins = dict([(i,np.zeros(len(genome_db.chrmLenBins[i]))) for i in range(genome_db.chmCount)]) for i in E1_values: chr = genome_db.label2idx[i["f1"]] nt_start = genome_db.label2idx[i["f2"]] bin_start = nt_start/resolution assert bins[chr][bin_start] == 0 bins[chr][bin_start] = i
saddles = {}
for chrom in range(genome_db.chrmCount):
    saddle = np.ones((5, 5), dtype=float)
    st = genome_db.chrmStartsBinCont[chrom]
    end = genome_db.chrmEndsBinCont[chrom]
    cur = q[st:end, st:end]
    E1 = E1_values[st:end]
    mask = np.sum(cur, axis=0) > 0
    if sum(mask) > 5:
        cur = cur[mask]
        cur = cur[:, mask]
        cur = observedOverExpected(cur)
        E1 = E1[mask]
        assert cur.shape[0] == cur.shape[1] == len(E1)
        for i in range(5):
            for j in range(5):
                P1, P2 = np.percentile(E1, [20 * i, 20 * i + 20])
                mask1 = (E1 > P1) * (E1 < P2)
                P1, P2 = np.percentile(E1, [20 * j, 20 * j + 20])
                mask2 = (E1 > P1) * (E1 < P2)
                if sum(mask1) * sum(mask2) != 0:
                    saddle[i, j] = np.nanmean(cur[np.ix_(mask1, mask2)])
                else:
                    saddle[i, j] = np.nan
        saddles[genome_db.idx2label[chrom]] = saddle
    else:
        pass  # skip chromosomes with too few informative bins
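# A possible continuation (hedged sketch, mirroring the "all_average" block
# of doSaddles above): average the per-chromosome saddles cell-wise while
# ignoring NaN cells.
import numpy as np

if saddles:
    stacked = np.array(list(saddles.values()))   # shape (n_chrom, 5, 5)
    all_average = np.nanmean(stacked, axis=0)
    print(all_average)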