def get_chromosome_arms(genome, exclude=None):
    """Return chromosome-arm intervals for a genome assembly.

    Uses bioframe to fetch chromosome sizes and centromere midpoints and
    splits each annotated chromosome into two arms at the centromere.

    Parameters
    ----------
    genome : str
        Assembly name understood by bioframe (e.g. 'hg38', 'mm10').
    exclude : str or iterable of str, optional
        Chromosome name(s) to omit from the result.

    Returns
    -------
    list of (chrom, start, end) tuples, or None if the assembly metadata
    could not be fetched (original print-and-None contract preserved).
    """
    if exclude is not None:
        # Accept a single chromosome name as well as an iterable of names.
        if isinstance(exclude, str):
            exclude = [exclude]
        exclude = [str(item) for item in exclude]
    else:
        exclude = []
    try:
        chromlengths = bioframe.fetch_chromsizes(genome)
        centromeres = bioframe.fetch_centromeres(genome).set_index('chrom')
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # are no longer swallowed; failure behavior is unchanged.
        print(f'Information for genome {genome} could not be found.')
        return None
    arms = []
    # Series.iteritems() was removed in pandas 2.0; items() is equivalent.
    for chrom, length in chromlengths.items():
        if chrom in exclude:
            continue
        if chrom in centromeres.index:
            # Split at the centromere midpoint into p- and q-arms.
            mid = centromeres.loc[chrom, 'mid']
            arms.append((chrom, 0, mid))
            arms.append((chrom, mid, length))
        else:
            # No centromere annotation: whole chromosome is a single "arm".
            arms.append((chrom, 0, length))
    return arms
def get_chroms(genome, ignoreXYMT=True):
    """Return the list of chromosome names to analyze for `genome`.

    Parameters
    ----------
    genome : str
        Assembly name understood by bioframe.
    ignoreXYMT : bool, optional
        If True (default), drop the mitochondrial and sex chromosomes
        ('chrM', 'chrX', 'chrY').

    Returns
    -------
    list of str
    """
    print("Using chroms from " + genome)
    chromsizes = bioframe.fetch_chromsizes(genome)
    chr_list = list(chromsizes.index)
    # Idiomatic truthiness test instead of `== True`.
    if ignoreXYMT:
        chr_list = [i for i in chr_list if i not in ("chrM", "chrX", "chrY")]
    return chr_list
def cooler_global_scaling(cool, genome, trans=True, mapper=map, balance='weight', thres=None, ignore_diags=2):
    """Compute genome-wide expected/scaling tables for a cooler.

    Builds per-diagonal (cis) statistics over chromosome arms, and
    optionally per-chromosome-pair (trans) statistics.

    Parameters
    ----------
    cool : cooler.Cooler
        The contact matrix to analyze.
    genome : str
        Assembly name passed to DNA_info.get_chromosome_arms / bioframe.
    trans : bool, optional
        If True (default), also compute trans (inter-chromosomal) results.
    mapper : callable, optional
        map-like function used to parallelize binning (e.g. a pool's map).
    balance : str, optional
        Name of the balancing-weight column in the cooler bin table.
    thres : optional
        Threshold forwarded to cooler_mask (semantics defined there).
    ignore_diags : int, optional
        Number of diagonals near the main diagonal to skip in cis.

    Returns
    -------
    cis_results, or (cis_results, trans_results) when trans=True.
    Both are pandas objects indexed by ('region', 'diag') and
    ('chrom1', 'chrom2') respectively.
    """
    # Same mask is used for rows and columns (symmetric matrix).
    row_masker = col_masker = cooler_mask(cool, header=balance, thres=thres)
    matrix_fetcher = cooler_matrix_generator(cool, header=balance)
    resolution = cool.info['bin-size']
    chrom_arms = DNA_info.get_chromosome_arms(genome)
    # Cis: each arm paired with itself.
    cis_regions = [(arm, arm) for arm in chrom_arms]
    cis_results = cis_binning(cis_regions, matrix_fetcher, row_masker,
                              col_masker, resolution, ignore_diags,
                              mapper=mapper)
    cis_results = pd.concat(cis_results)
    # Collapse the symmetric (region1, region2) pair to a single 'region'.
    cis_results = cis_results.reset_index().rename(
        columns={'region1': 'region'})
    del cis_results['region2']
    cis_results.set_index(['region', 'diag'], inplace=True, drop=True)
    if trans:
        print('Computing trans expected')
        chromsizes = bioframe.fetch_chromsizes(genome)
        # All unordered chromosome pairs (i < j), as whole-chromosome regions.
        trans_regions = [(bioframe.parse_region(cool.chromnames[i],
                                                chromsizes=chromsizes),
                          bioframe.parse_region(cool.chromnames[j],
                                                chromsizes=chromsizes))
                         for i in range(len(cool.chromnames))
                         for j in range(i + 1, len(cool.chromnames))]
        trans_results = trans_binning(trans_regions, matrix_fetcher,
                                      row_masker, col_masker, resolution,
                                      mapper=mapper)
        # Drop pairs that produced no data (trans_binning may return None).
        trans_results = [
            result for result in trans_results if result is not None
        ]
        trans_results = pd.concat(trans_results)
        # Index entries are ((chrom1, start1, end1), (chrom2, start2, end2));
        # extract the chromosome names for a flat (chrom1, chrom2) index.
        trans_results['chrom1'] = trans_results.index.map(
            lambda x: x[0][0]).values
        trans_results['chrom2'] = trans_results.index.map(
            lambda x: x[1][0]).values
        trans_results.set_index(['chrom1', 'chrom2'], inplace=True)
        return cis_results, trans_results
    return cis_results
def gene_content(genome, binsize, gc=True, fasta_path=None):
    """Bin the genome and compute per-bin gene coverage (and GC fraction).

    Parameters
    ----------
    genome : str
        Assembly name understood by bioframe.
    binsize : int
        Bin width in base pairs.
    gc : bool, optional
        If True (default), also compute the per-bin GC fraction.
    fasta_path : str, optional
        Path to the genome FASTA. Defaults to the historical lab-local
        layout '/net/levsha/share/lab/genomes/{genome}/{genome}.fa' for
        backward compatibility.

    Returns
    -------
    pandas.DataFrame with gene coverage (and 'frac_gc' when gc=True).
    """
    chrom_sizes = bioframe.fetch_chromsizes(genome)
    chrom_table = binnify(chrom_sizes, binsize)
    gene_count = frac_gene_coverage(chrom_table, genome)
    if gc:
        if fasta_path is None:
            # Backward-compatible default: previously hard-coded path.
            fasta_path = f'/net/levsha/share/lab/genomes/{genome}/{genome}.fa'
        fasta_records = load_fasta(fasta_path)
        gene_count['frac_gc'] = frac_gc(chrom_table, fasta_records)
    return gene_count
def gene_content(genome, binsize, gc=True, fasta_path=None):
    """Bin the genome and compute per-bin gene coverage (and GC fraction).

    Parameters
    ----------
    genome : str
        Assembly name understood by bioframe.
    binsize : int
        Bin width in base pairs.
    gc : bool, optional
        If True (default), also compute the per-bin GC fraction; requires
        `fasta_path`.
    fasta_path : str, optional
        Path to the genome FASTA file. Required when gc=True.

    Returns
    -------
    pandas.DataFrame with gene coverage (and 'frac_gc' when gc=True).

    Raises
    ------
    ValueError
        If gc=True and no fasta_path is given. (Was an `assert`, which
        is silently stripped under `python -O`.)
    """
    chrom_sizes = bioframe.fetch_chromsizes(genome)
    chrom_table = binnify(chrom_sizes, binsize)
    gene_count = frac_gene_coverage(chrom_table, genome)
    if gc:
        if fasta_path is None:
            raise ValueError(
                'Please provide valid fasta file path if you want GC content')
        fasta_records = load_fasta(fasta_path)
        gene_count['frac_gc'] = frac_gc(chrom_table, fasta_records)
    return gene_count
def compute_trans_scaling(cooler_path, out_path, resolution, regions1,
                          regions2, labels, title):
    """Compute asymmetric diagonal sums between two region sets of an mcool.

    Parameters
    ----------
    cooler_path : str
        Path to a multi-resolution cooler (.mcool) file.
    out_path, labels, title :
        Currently unused in the visible body (reserved for plotting).
    resolution : int
        Resolution (in bp) to open inside the mcool.
    regions1, regions2 : iterable
        Two sets of support regions passed to diagsum_asymm.
    """
    chromsizes = bioframe.fetch_chromsizes('sacCer3',
                                           filter_chroms=False,
                                           as_bed=True)
    avg_contacts = cooltools.expected.diagsum_asymm(
        clr=cooler.Cooler('::/resolutions/'.join(
            (cooler_path, str(resolution)))),
        supports1=list(regions1),
        supports2=list(regions2),
        transforms={
            'balanced': lambda p: p['count'] * p['weight1'] * p['weight2']
        })
    # BUG FIX: the original used avg_contacts('n_valid') — a DataFrame is
    # not callable; column access needs square brackets.
    avg_contacts['balanced.avg'] = (avg_contacts['balanced.sum'] /
                                    avg_contacts['n_valid'])
    print('...')
def plot_insulation(clr, insulation, windows, resolution, out_path,
                    exclude_chroms, title):
    """Plot a 45-degree-rotated contact map with insulation tracks per
    chromosome and save one PNG per chromosome.

    Parameters
    ----------
    clr : cooler.Cooler
        Balanced contact matrix.
    insulation : pandas.DataFrame
        Insulation table with 'log2_insulation_score_{window}' columns.
    windows : iterable of int
        Window sizes (bp) whose insulation tracks are drawn.
    resolution : int
        Matrix resolution in bp.
    out_path : str
        Output file path; its basename is reused per chromosome inside a
        subdirectory named after `title`.
    exclude_chroms :
        NOTE(review): accepted but unused in this body — only 'chrM' is
        dropped, hard-coded below. Confirm intent with the caller.
    title : str
        Figure title prefix and output subdirectory name.
    """
    dir_path = os.path.join(os.path.dirname(out_path), title)
    if not os.path.exists(dir_path):
        os.mkdir(dir_path)
    chromsizes = bioframe.fetch_chromsizes('sacCer3', filter_chroms=False)
    # Series.iteritems() was removed in pandas 2.0; items() is equivalent.
    regions = [(k, 0, v) for k, v in chromsizes.drop('chrM').items()]
    for region in regions:
        norm = LogNorm(vmax=0.1, vmin=0.001)
        data = clr.matrix(balance=True).fetch(region)
        fig, ax = plt.subplots(figsize=(20, 4))
        # 45-degree rotated heatmap of the contact matrix.
        img = plot_45_mat(ax, data, start=0, resolution=resolution,
                          norm=norm, cmap='fall')
        ax.set_aspect(0.5)
        ax.set_ylim(0, 30000)
        format_ticks(ax, rotate=False)
        ax.xaxis.set_visible(False)
        divider = make_axes_locatable(ax)
        cax = divider.append_axes('right', size='1%', pad=0.1, aspect=6)
        plt.colorbar(img, cax=cax)
        insul_region = bioframe.select(insulation, region)
        # Insulation tracks share the x-axis with the heatmap.
        ins_ax = divider.append_axes('bottom', size='50%', pad=0.0, sharex=ax)
        ins_ax.set_prop_cycle(
            plt.cycler('color', plt.cm.plasma(np.linspace(0, 1, 5))))
        for window in windows:
            ins_ax.plot(insul_region[['start', 'end']].mean(axis=1),
                        insul_region[f'log2_insulation_score_{window}'],
                        label=f'{window} bp window',
                        lw=1)
        ins_ax.legend(bbox_to_anchor=(1.125, 1.05), loc='upper right')
        fig.suptitle(f'{title}: {region[0]}')
        path = os.path.join(dir_path,
                            '_'.join((region[0], os.path.basename(out_path))))
        plt.savefig(path, dpi=300)
def get_arms_hg19() -> pd.DataFrame:
    """Download hg19 chromosome sizes and centromere midpoints and return
    a dataframe of chromosomal arm intervals (chrom, start, end)."""
    # chromosome sizes and centromere midpoints from UCSC via bioframe
    sizes = bioframe.fetch_chromsizes("hg19")
    cens = bioframe.fetch_centromeres("hg19")
    cens.set_index("chrom", inplace=True)
    cens = cens.mid
    # restrict to the well-defined chromosomes (drop unassigned contigs)
    good_chroms = list(sizes.index[:23])
    # two arms per chromosome: 0..centromere and centromere..end;
    # chromosomes missing from either table fall back to 0
    records = []
    for name in good_chroms:
        split_at = cens.get(name, 0)
        records.append((name, 0, split_at))
        records.append((name, split_at, sizes.get(name, 0)))
    return pd.DataFrame(records, columns=["chrom", "start", "end"])
def exclude_regions(df, regions_to_keep=None, genome=None, print_final=False):
    """Restrict a dataframe with a 'region' column to the given regions.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'region' column of bioframe-parsable region strings.
        NOTE: temporary 'chrom'/'start'/'end' columns are added to `df`
        in place while slicing (original behavior preserved).
    regions_to_keep : list of str, optional
        Region strings to retain. If None/empty, `df` is returned as-is.
        (Was a mutable default `[]` — replaced with the None sentinel.)
    genome : str, optional
        Assembly name; required when `regions_to_keep` is non-empty.
    print_final : bool, optional
        If True, print the unique regions of the returned frame.

    Returns
    -------
    pandas.DataFrame restricted to the requested regions.

    Raises
    ------
    ValueError
        If regions are requested but no genome is given. (Was an `assert`,
        which is stripped under `python -O`.)
    """
    if regions_to_keep is None:
        regions_to_keep = []
    if len(regions_to_keep):
        if genome is None:
            raise ValueError('Please provide valid genome')
        chromsizes = bioframe.fetch_chromsizes(genome)
    else:
        # Nothing to filter: return the input unchanged.
        if print_final:
            print(np.asarray(df.region.unique()))
        return df
    regions_to_keep = [
        bioframe.parse_region(reg, chromsizes) for reg in regions_to_keep
    ]
    assert 'region' in df.columns
    # Expand the region strings into chrom/start/end columns for bedslice.
    regions = df['region'].apply(
        lambda x: bioframe.parse_region(x, chromsizes)).values
    chrom, start, end = list(zip(*regions))
    df['chrom'] = chrom
    df['start'] = start
    df['end'] = end
    new_df = []
    for chrom, start, end in regions_to_keep:
        sub_df = bioframe.bedslice(df, (chrom, start, end))
        new_df.append(sub_df)
    new_df = pd.concat(new_df)
    if print_final:
        print(np.asarray(new_df.region.unique()))
    # Drop the helper columns from the result before returning.
    del new_df['chrom']
    del new_df['start']
    del new_df['end']
    return new_df
def compute_scaling(pairs_paths, out_path, region, exclude_chroms, assembly, centromeres_path, split_arms, normalized, plot_slope, show_average_trans, labels, title, no_cache):
    """
    Compute and plot contact frequency vs genomic separation curves for one or more pairs files.
    """
    labels = list(labels)
    # parse left/right arm parameter of chromosomes to exclude
    exclude_chroms = [chrom.split(':') for chrom in exclude_chroms]
    chromsizes = bioframe.fetch_chromsizes(assembly, filter_chroms=False, as_bed=True)
    # NOTE(review): exclude_chroms is a list of lists here (after split(':')),
    # so isin() likely never matches a plain chromosome name — verify whether
    # this pre-filter is effective; the arm-level filter below does the real
    # exclusion work.
    chromsizes = chromsizes[~chromsizes.chrom.isin(exclude_chroms)]
    if centromeres_path:
        # Custom centromere file: space-separated columns chrom start end;
        # the midpoint of start/end is used as the centromere position.
        centromeres = {}
        with open(centromeres_path) as file:
            for line in file:
                cols = line.split(' ')
                centromeres[cols[0]] = (int(cols[1]) + int(cols[2])) // 2
    else:
        # Fall back to UCSC centromere annotations via bioframe.
        centromeres = bioframe.fetch_centromeres(assembly)
        centromeres.set_index('chrom', inplace=True)
        centromeres = centromeres.mid.to_dict()
    if len(labels) != 0 and len(pairs_paths) != len(labels) and not split_arms:
        sys.exit('Please provide as many labels as pairs paths.')
    if region:
        regions = bioframe.select(chromsizes, region).reset_index()
    else:
        # use chromosomal arms as separate regions if no regions are specified
        arms = bioframe.split(chromsizes, centromeres)
        # remove user-excluded chromosomes/arms
        for chrom in exclude_chroms:
            if len(chrom) == 1:
                # no arm specified, remove entire chromosome
                arms = arms[arms.chrom != chrom[0]]
            elif chrom[1] == 'left':
                # remove specified chromosome with start == 0 (left arm)
                arms = arms[~((arms.chrom == chrom[0]) & (arms.start == 0))]
            elif chrom[1] == 'right':
                # remove specified chromosome with start != 0 (right arm)
                arms = arms[~((arms.chrom == chrom[0]) & (arms.start != 0))]
        # remove 40kb from each side (80kb total) of an arm to remove centromere and telomere regions
        arms = bioframe.ops.expand(arms, -ARM_PADDING)
        # remove arms arms with a length of < 0 after removing side regions
        regions = arms[arms.start < arms.end].reset_index()
    all_scalings = []
    all_avg_trans_levels = []
    for idx, path in enumerate(pairs_paths):
        cis_scalings, avg_trans = None, None
        if split_arms:
            # calculate scalings per arm per chromosome
            cis_scalings, trans_levels = pairlib.scalings.compute_scaling(
                path,
                regions,
                chromsizes,
                dist_range=(int(1e1), int(1e9)),
                n_dist_bins=128,
                chunksize=int(1e7))
            # remove unassigned pairs with start/end positions < 0
            cis_scalings = cis_scalings[(cis_scalings.start1 > 0)
                                        & (cis_scalings.end1 > 0)
                                        & (cis_scalings.start2 > 0)
                                        & (cis_scalings.end2 > 0)]
            # Aggregate pair counts per arm (chrom1/start1) per distance bin.
            sc_agg = (cis_scalings.groupby(
                ['chrom1', 'start1', 'min_dist', 'max_dist']).agg({
                    'n_pairs': 'sum',
                    'n_bp2': 'sum'
                }).reset_index())
            avail_chroms = set(sc_agg.chrom1)
            for chrom in avail_chroms:
                # calculate scalings for left/right arms (left arms start at position 0 + ARM_PADDING)
                sc_left, avg_trans_left = (calc_pair_freqs(
                    sc_agg[(sc_agg.chrom1 == chrom)
                           & (sc_agg.start1 == ARM_PADDING)], trans_levels,
                    show_average_trans, normalized))
                sc_right, avg_trans_right = (calc_pair_freqs(
                    sc_agg[(sc_agg.chrom1 == chrom)
                           & (sc_agg.start1 != ARM_PADDING)], trans_levels,
                    show_average_trans, normalized))
                # One output subdirectory per pairs file, one plot per chromosome.
                dir_path = os.path.join(os.path.dirname(out_path),
                                        os.path.basename(path))
                if not os.path.exists(dir_path):
                    os.mkdir(dir_path)
                chrom_path = os.path.join(
                    dir_path, '_'.join((chrom, os.path.basename(out_path))))
                (plot_scalings(
                    scalings=[sc_left, sc_right],
                    avg_trans_levels=[avg_trans_left, avg_trans_right],
                    plot_slope=plot_slope,
                    labels=['left', 'right'],
                    title=chrom,
                    out_path=chrom_path))
        else:
            if not no_cache:
                # get cached values
                cached = cache.get(path)
                if cached is not None:
                    # Only reuse cached scalings computed with the same
                    # normalization setting.
                    cis_scalings = cached['cis_scalings'] if cached[
                        'normalized'] == normalized else None
                    avg_trans = cached['avg_trans']
            if no_cache or cis_scalings is None or (avg_trans is None
                                                   and show_average_trans):
                print(
                    f'Computing scalings for file {idx + 1}/{len(pairs_paths)} ...',
                    end='\r')
                # caching disabled or no cached values found
                cis_scalings, trans_levels = pairlib.scalings.compute_scaling(
                    path,
                    regions,
                    chromsizes,
                    dist_range=(int(1e1), int(1e9)),
                    n_dist_bins=128,
                    chunksize=int(1e7))
                # remove unassigned pairs with start/end positions < 0
                cis_scalings = cis_scalings[(cis_scalings.start1 >= 0)
                                            & (cis_scalings.end1 >= 0)
                                            & (cis_scalings.start2 >= 0)
                                            & (cis_scalings.end2 >= 0)]
                sc_agg = (cis_scalings.groupby(['min_dist', 'max_dist']).agg({
                    'n_pairs': 'sum',
                    'n_bp2': 'sum'
                }).reset_index())
                cis_scalings, avg_trans = calc_pair_freqs(
                    sc_agg, trans_levels, show_average_trans, normalized)
                if not no_cache:
                    cache.set(
                        path, {
                            'cis_scalings': cis_scalings,
                            'avg_trans': avg_trans,
                            'normalized': normalized
                        })
            else:
                print(
                    f'Retrieved cached values for file {idx + 1}/{len(pairs_paths)}.',
                    end='\r')
        # use file names as labels if labels have not been provided
        # NOTE(review): this appends the os.path.basename FUNCTION itself,
        # not basename(path) — looks like a missing call; confirm and fix.
        labels.append(
            os.path.basename) if len(labels) < len(pairs_paths) else None
        all_scalings.append(cis_scalings)
        all_avg_trans_levels.append(
            avg_trans) if avg_trans is not None else None
    if len(all_scalings) > 0 and not split_arms:
        plot_scalings(all_scalings, all_avg_trans_levels, plot_slope, labels,
                      title, out_path)
import os.path as op
import pandas as pd
import bioframe
import cooler
import cooltools.expected

# Module-level fixtures: mm9 chromosome sizes and whole-chromosome support
# regions (chrom, start, end) shared by the tests below.
chromsizes = bioframe.fetch_chromsizes("mm9")
chromosomes = list(chromsizes.index)
supports = [(chrom, 0, chromsizes[chrom]) for chrom in chromosomes]


def test_diagsum(request):
    # Smoke test: run cooltools diagonal summation on a bundled 1 Mb cooler
    # with a balancing transform; no assertions are made on the result.
    clr = cooler.Cooler(
        op.join(request.fspath.dirname, "data/CN.mm9.1000kb.cool"))
    tables = cooltools.expected.diagsum(
        clr,
        supports,
        transforms={
            "balanced": lambda p: p["count"] * p["weight1"] * p["weight2"]
        },
        chunksize=10000000,
    )
    # Concatenate per-support tables keyed by chromosome name; the result
    # is discarded — the test only verifies the pipeline runs end to end.
    pd.concat(
        [tables[support] for support in supports],
        keys=[support[0] for support in supports],
        names=["chrom"],
    )


# NOTE(review): the body of test_blocksum is not present in this chunk.
def test_blocksum(request):
def pileup_multiple_dot_lists(cool_file, dot_file_list, exp_cool, resolution,
                              flank, anchor_dist, anchor_flank, plot_name):
    """Pile up snippets around dot calls for three Hi-C/Micro-C datasets and
    plot KDE distributions of background-normalized center-pixel enrichment.

    For each dot list, snips obs/exp windows around every dot in each of the
    three coolers, normalizes the center pixel by surrounding stripe/corner
    background means, draws one KDE panel per condition, and finally writes
    the per-condition means to '<plot_name>.csv'.

    Parameters
    ----------
    cool_file : sequence of 3 str
        Cooler paths; condition names are hard-coded to
        'HiC-FA-DpnII', 'HiC-DSG-DpnII', 'MicroC-DSG-MNase'.
    dot_file_list : sequence of str
        Tab-separated dot-call files with chrom1/start1/end1/chrom2/start2/end2.
    exp_cool : sequence of 3 str
        Expected-table paths, one per condition.
    resolution : int
        Bin size in bp.
    flank : int
        Snippet half-width in bp.
    anchor_dist, anchor_flank :
        Accepted for interface compatibility; not used in the computation.
    plot_name : str
        Title of the figure and basename of the output CSV.
    """
    i = 0
    # Derive short dataset names from the directory naming convention.
    filename1 = cool_file[0].split("/")[-2].split("_hg38")[0]
    filename2 = cool_file[1].split("/")[-2].split("_hg38")[0]
    filename3 = cool_file[2].split("/")[-2].split("_hg38")[0]
    cool = [filename1, filename2, filename3]
    exp_cool = [exp_cool[0], exp_cool[1], exp_cool[2]]
    conditions = ['HiC-FA-DpnII', 'HiC-DSG-DpnII', 'MicroC-DSG-MNase']
    print(filename1)
    print(filename2)
    print(filename3)
    hg38 = bioframe.fetch_chromsizes('hg38')
    chromsizes = bioframe.fetch_chromsizes('hg38')
    chromosomes = list(chromsizes.index)
    binsize = resolution
    cooler_paths = {
        'HiC-FA-DpnII': cool_file[0],
        'HiC-DSG-DpnII': cool_file[1],
        'MicroC-DSG-MNase': cool_file[2],
    }
    exp_paths = {
        'HiC-FA-DpnII': exp_cool[0],
        'HiC-DSG-DpnII': exp_cool[1],
        'MicroC-DSG-MNase': exp_cool[2],
    }
    long_names = {
        'HiC-FA-DpnII': 'HiC-FA-DpnII',
        'HiC-DSG-DpnII': 'HiC-DSG-DpnII',
        'MicroC-DSG-MNase': 'MicroC-DSG-MNase',
    }
    pal = sns.color_palette('colorblind')
    colors = {
        filename1: pal[0],
        filename2: '#333333',
        filename3: pal[2],
    }
    clrs = {cond: cooler.Cooler(cooler_paths[cond]) for cond in conditions}
    anchor_flank = flank
    # Grid: one row per condition, one column per dot list (+1 spare).
    gs = plt.GridSpec(nrows=len(conditions), ncols=len(dot_file_list) + 1)
    plt.figure(figsize=(6 * len(conditions) + 1, 7))
    mean_list = {}
    for dot_file in dot_file_list:
        print(dot_file)
        sites = pd.read_table(dot_file)
        # Use the midpoints of both anchors of each dot.
        mid1 = (sites['start1'] + sites['end1']) / 2
        mid2 = (sites['start2'] + sites['end2']) / 2
        new_file = pd.concat([sites['chrom1'], mid1, sites['chrom2'], mid2],
                             axis=1)
        # "convergent" orientation of paired CTCF motifs
        # sites = sites[(sites['strand1'] == '+') & (sites['strand2'] == '-')] ## not working
        new_file.columns = ['chrom1', 'mid1', 'chrom2', 'mid2']
        supports = [(chrom, 0, chromsizes[chrom]) for chrom in chromosomes]
        snippet_flank = flank
        windows1 = snipping.make_bin_aligned_windows(binsize,
                                                     new_file['chrom1'],
                                                     new_file['mid1'],
                                                     flank_bp=snippet_flank)
        windows2 = snipping.make_bin_aligned_windows(binsize,
                                                     new_file['chrom2'],
                                                     new_file['mid2'],
                                                     flank_bp=snippet_flank)
        windows = pd.merge(windows1,
                           windows2,
                           left_index=True,
                           right_index=True,
                           suffixes=('1', '2'))
        windows = snipping.assign_regions(windows, supports)
        windows = windows.dropna()
        stacks = {}
        piles = {}
        k = 0
        mean_1 = []
        for cond in conditions:
            expected = pd.read_table(exp_paths[cond])
            snipper = snipping.ObsExpSnipper(clrs[cond], expected)
            stack = snipping.pileup(windows, snipper.select, snipper.snip)
            stacks[cond] = stack
            # Average obs/exp snippet over all dots.
            piles[cond] = np.nanmean(stack, axis=2)
            mid_pixel_norm = []
            sq_size = piles[cond].shape[0]
            # BUG FIX: np.int was removed in NumPy 1.24; use builtin int.
            midpoint = int(np.floor(sq_size / 2))
            background_size_start = int(np.ceil(sq_size * 40 / 100))
            background_size_end = int(np.floor(sq_size * 60 / 100))
            print(midpoint)
            print(background_size_start)
            print(background_size_end)
            slice_ = piles[cond]
            # Center pixel of the averaged pileup (the dot itself).
            mid_pixel = slice_[midpoint, midpoint]
            # Background estimates: four corner blocks ...
            up_left = np.nanmean(
                slice_[:background_size_start, :background_size_start])
            up_right = np.nanmean(slice_[:background_size_start,
                                         background_size_end:])
            lower_left = np.nanmean(
                slice_[background_size_end:, :background_size_start])
            lower_right = np.nanmean(slice_[background_size_end:,
                                            background_size_end:])
            # ... and four stripe blocks around the center.
            stripe_up = np.nanmean(
                slice_[:background_size_start,
                       background_size_start:background_size_end])
            stripe_down = np.nanmean(
                slice_[background_size_end:,
                       background_size_start:background_size_end])
            stripe_left = np.nanmean(
                slice_[background_size_start:background_size_end, :
                       background_size_start])
            stripe_right = np.nanmean(
                slice_[background_size_start:background_size_end,
                       background_size_end:])
            stripes_mean = (stripe_up + stripe_right) / 2
            # NOTE(review): lower_left is computed but excluded from the
            # corner average (3 of 4 corners) — confirm this is intentional.
            corners_mean = (up_left + up_right + lower_right) / 3
            mid_pixel_norm.append(mid_pixel /
                                  ((stripes_mean + corners_mean) / 2))
            ax = plt.subplot(gs[k, i])
            new_list = mid_pixel_norm
            m = np.mean(new_list)
            mean_1.append(np.mean(new_list))
            mean_list[dot_file] = mean_1
            ax = sns.kdeplot(new_list, shade=True)
            plt.axvline(m, color='k', linestyle='dashed', linewidth=2)
            min_ylim, max_ylim = plt.ylim()
            k = k + 1
            ax.yaxis.set_visible(True)
            ax.xaxis.set_visible(True)
            if k > 0:
                ax.yaxis.set_visible(True)
                ax.xaxis.set_visible(True)
        ax = plt.subplot(gs[len(conditions)])
        i = i + 1
    plt.title(plot_name)
    # BUG FIX: the CSV handle was never closed; use a context manager.
    with open(plot_name + ".csv", "w") as csv_file:
        w = csv.writer(csv_file)
        for key, val in mean_list.items():
            w.writerow([key, val])
def save_bigwig(vectors, savepath, genome, columns=('E1', 'E2', 'E3')):
    """Write one bigwig track per requested column of `vectors`.

    Parameters
    ----------
    vectors : pandas.DataFrame
        Binned table containing the value columns to export.
    savepath : str
        Output path prefix; each track is written to '<savepath>.<col>.bw'.
    genome : str
        Assembly name used to fetch chromosome sizes.
    columns : iterable of str, optional
        Column names to export. (Default changed from a mutable list to an
        equivalent tuple to avoid the shared-mutable-default pitfall.)
    """
    chroms = fetch_chromsizes(genome)
    for item in columns:
        save = savepath + '.{}.bw'.format(item)
        # Ensure the output directory exists before writing.
        create_dir(save)
        to_bigwig(vectors, chroms, save, value_field=item)
def pileup_multiple_dot_lists(cool_file,dot_file_list, exp_cool,resolution,flank,anchor_dist,anchor_flank,pileup_name):
    """Pile up obs/exp snippets around dot calls for three datasets and plot
    the log10 average pileup matrices in a grid (one row per condition, one
    column per dot list), saving the figure to `pileup_name`.

    Parameters
    ----------
    cool_file : sequence of 3 str
        Cooler paths; condition names are hard-coded below.
    dot_file_list : sequence of str
        Tab-separated dot-call tables (chrom1/start1/end1/chrom2/start2/end2).
    exp_cool : sequence of 3 str
        Expected-table paths, one per condition.
    resolution : int
        Bin size in bp.
    flank : int
        Snippet half-width in bp.
    anchor_dist, anchor_flank :
        Accepted but not used in the visible computation.
    pileup_name : str
        Output figure path.
    """
    i=0
    # Short dataset names derived from the directory naming convention.
    filename1=cool_file[0].split("/")[-2].split("_hg38")[0]
    filename2=cool_file[1].split("/")[-2].split("_hg38")[0]
    filename3=cool_file[2].split("/")[-2].split("_hg38")[0]
    cool = [filename1,filename2,filename3]
    exp_cool = [exp_cool[0], exp_cool[1], exp_cool[2]]
    conditions = ['HiC-FA-DpnII', 'HiC-DSG-DpnII','MicroC-DSG-MNase']
    print(filename1)
    print(filename2)
    print(filename3)
    resolution=resolution
    flank = flank
    #resolution=sys.argv[4]
    hg38 = bioframe.fetch_chromsizes('hg38')
    chromsizes = bioframe.fetch_chromsizes('hg38')
    chromosomes = list(chromsizes.index)
    binsize = resolution
    # Map condition name -> cooler path / expected-table path.
    cooler_paths = {
        'HiC-FA-DpnII' : cool_file[0],
        'HiC-DSG-DpnII' : cool_file[1],
        'MicroC-DSG-MNase' : cool_file[2],
    }
    exp_paths = {
        'HiC-FA-DpnII' : exp_cool[0],
        'HiC-DSG-DpnII' : exp_cool[1],
        'MicroC-DSG-MNase' : exp_cool[2],
    }
    long_names = {
        'HiC-FA-DpnII': 'HiC-FA-DpnII',
        'HiC-DSG-DpnII': 'HiC-DSG-DpnII',
        'MicroC-DSG-MNase': 'MicroC-DSG-MNase',
    }
    pal = sns.color_palette('colorblind')
    colors = {
        filename1: pal[0],
        filename2 : '#333333',
        filename3: pal[2],
    }
    clrs = { cond: cooler.Cooler(cooler_paths[cond]) for cond in conditions }
    anchor_dist = anchor_dist
    anchor_flank = flank
    # dot file list
    # Grid: one row per condition, one extra column reserved (e.g. colorbar).
    gs = plt.GridSpec(nrows=len(conditions), ncols=len(dot_file_list) + 1)
    plt.figure(figsize=(6 * len(conditions)+1, 7))
    for dot_file in dot_file_list:
        print(dot_file)
        sites = pd.read_table(dot_file)
        # Midpoints of both anchors of each dot.
        mid1=(sites['start1']+sites['end1'])/2
        mid2=(sites['start2']+sites['end2'])/2
        new_file=pd.DataFrame()
        new_file = pd.concat([sites['chrom1'],mid1,sites['chrom2'],mid2],axis=1)
        # "convergent" orientation of paired CTCF motifs
        # sites = sites[(sites['strand1'] == '+') & (sites['strand2'] == '-')] ## not working
        new_file.columns=['chrom1','mid1','chrom2','mid2']
        print(len(new_file))
        new_file.head()
        supports = [(chrom, 0, chromsizes[chrom]) for chrom in chromosomes]
        snippet_flank = flank
        # Bin-aligned windows around each anchor midpoint.
        windows1 = snipping.make_bin_aligned_windows(
            binsize,
            new_file['chrom1'],
            new_file['mid1'],
            flank_bp=snippet_flank)
        # windows1['strand'] = sites['strand1']
        windows2 = snipping.make_bin_aligned_windows(
            binsize,
            new_file['chrom2'],
            new_file['mid2'],
            flank_bp=snippet_flank)
        windows = pd.merge(windows1,
                           windows2,
                           left_index=True,
                           right_index=True,
                           suffixes=('1', '2'))
        windows = snipping.assign_regions(windows, supports)
        windows = windows.dropna()
        windows.head()
        #stacks = {}
        piles = {}
        k=0
        for cond in conditions:
            expected = pd.read_table(exp_paths[cond])
            snipper = snipping.ObsExpSnipper(clrs[cond], expected)
            print(snipper)
            stack = snipping.pileup(windows, snipper.select, snipper.snip)
            #stacks[cond] = stack
            # Average obs/exp snippet across all dots.
            piles[cond] = np.nanmean(stack, axis=2)
            opts = dict(
                vmin=-0.25,
                vmax=0.25,
                extent=[-flank//1000, flank//1000, -flank//1000, flank//1000],
                cmap='coolwarm'
            )
            ax = plt.subplot(gs[k,i])
            img = ax.matshow(
                np.log10(piles[cond]),
                #piles[cond]),
                **opts)
            #plt.title(dot_name_list[i],fontsize=7)
            #ax.xaxis.tick_bottom()
            k=k+1
            ax.yaxis.set_visible(True)
            ax.xaxis.set_visible(False)
            if k > 0:
                ax.yaxis.set_visible(True)
                ax.xaxis.set_visible(False)
        ax = plt.subplot(gs[len(conditions)])
        #plt.suptitle(f'Dot calls ({anchor_dist//1000} +/- {anchor_flank//1000})kb apart\n'
        #             f'Hi-C resolution = {binsize//1000}kb; # of pairs = {len(windows)}')
        #plt.title(dot_name_list[i])
        i=i+1
    #plt.colorbar(img, cax=ax)
    plt.savefig(pileup_name)
def plot_pileup(cooler_paths, out_path, resolution, region, size, assembly,
                exclude_chroms, centromeres_path, title):
    """
    Plots pileups of a specified size around centromeres for an input cooler
    file. Input two file paths as arguments to create a ratio of pileups
    instead.

    Parameters
    ----------
    cooler_paths : sequence of str (1 or 2 entries)
        .cool or .mcool paths; with two, the plot shows their log-ratio.
    out_path : str
        Output image path (saved at 300 dpi).
    resolution : int
        Resolution in bp (also used to open .mcool files).
    region :
        Accepted but unused in this body.
    size : int
        Total pileup window size in bp (flank = size // 2 on each side).
    assembly : str
        Assembly name for chromosome sizes.
    exclude_chroms : iterable of str
        Chromosome names to drop.
    centromeres_path : str
        Whitespace-separated file with chrom/start/end columns; required
        (automatic centromere lookup is not implemented).
    title : str
        Plot title.
    """
    if len(cooler_paths) > 2:
        sys.exit('Please provide up to 2 cooler files max.')
    clrs = []
    for path in cooler_paths:
        clr_ext = os.path.splitext(path)[1]
        if clr_ext == '.cool':
            clr = cooler.Cooler(path)
        elif clr_ext == '.mcool':
            # Open the requested resolution inside the multi-res cooler.
            clr = cooler.Cooler('::/resolutions/'.join(
                (path, str(resolution))))
        else:
            sys.exit('Please provide a .cool or .mcool file.')
        clrs.append(clr)
    chromsizes = bioframe.fetch_chromsizes(assembly,
                                           filter_chroms=False,
                                           as_bed=True)
    chromsizes = chromsizes[~chromsizes.chrom.isin(exclude_chroms)]
    if centromeres_path:
        features = pd.read_csv(centromeres_path,
                               delim_whitespace=True,
                               header=None,
                               names=['chrom', 'start', 'end', 'mid'])
        # Vectorized midpoint (was a row-wise apply doing the same thing).
        features['mid'] = (features['start'] + features['end']) // 2
    else:
        # BUG FIX: the original `pass` left `features` unbound and crashed
        # later with a NameError; fail fast with a clear message instead.
        sys.exit('Automatic centromere lookup is not implemented; '
                 'please provide a centromeres file.')
    flank = size // 2
    stacks = [
        snip_pileup(clr, resolution, features, chromsizes, flank)
        for clr in clrs
    ]
    vmax = -3.75
    vmin = -1.75
    cmap = 'fall'
    if len(stacks) == 2:
        # Two inputs: plot the log-ratio of the pileups on a diverging map.
        stacks[0] = stacks[0] / stacks[1]
        vmax = 1
        vmin = -1
        cmap = 'RdBu'
    plt.imshow(np.log10(stacks[0]), vmax=vmax, vmin=vmin, cmap=cmap)
    plt.colorbar(label='log10 mean')
    # Label axes in kbp relative to the pileup center.
    ticks_px = np.linspace(0, flank * 2 // resolution, 5)
    ticks_kbp = ((ticks_px - ticks_px[-1] / 2) * resolution // 1000).astype(int)
    plt.xticks(ticks_px, ticks_kbp)
    plt.yticks(ticks_px, ticks_kbp)
    plt.xlabel('relative position, kbp')
    plt.ylabel('relative position, kbp')
    plt.title(title)
    plt.savefig(out_path, dpi=300)
import os.path as op
import numpy as np
import pandas as pd
import bioframe
import cooler
import cooltools.expected

# Module-level fixtures: mm9 chromosome sizes and whole-chromosome support
# regions (chrom, start, end) shared by the tests below.
chromsizes = bioframe.fetch_chromsizes('mm9')
chromosomes = list(chromsizes.index)
supports = [(chrom, 0, chromsizes[chrom]) for chrom in chromosomes]


def test_diagsum(request):
    # Smoke test: diagonal summation with a balancing transform on the
    # bundled 1 Mb cooler; the concatenated result is not asserted on.
    clr = cooler.Cooler(op.join(request.fspath.dirname,
                                'data/CN.mm9.1000kb.cool'))
    tables = cooltools.expected.diagsum(
        clr,
        supports,
        transforms={
            'balanced': lambda p: p['count'] * p['weight1'] * p['weight2']
        },
        chunksize=10000000)
    exc = pd.concat(
        [tables[support] for support in supports],
        keys=[support[0] for support in supports],
        names=['chrom'])


def test_blocksum(request):
    clr = cooler.Cooler(op.join(request.fspath.dirname,
                                'data/CN.mm9.1000kb.cool'))
    # NOTE(review): this call is truncated in this chunk — the argument list
    # of blocksum_pairwise continues beyond the visible source.
    records = cooltools.expected.blocksum_pairwise(
def fetch_chromsizes(db):
    """Fetch chromosome sizes for assembly `db` and print them to stdout
    as tab-separated text."""
    # Imported locally so the dependency is only required when called.
    import bioframe
    print(bioframe.fetch_chromsizes(db).to_csv(sep='\t'))
from matplotlib.gridspec import GridSpec
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
# NOTE(review): the 'seaborn-white' style name was removed in matplotlib
# >= 3.6 (renamed 'seaborn-v0_8-white'); this line fails on newer releases.
mpl.style.use('seaborn-white')
import multiprocess as mp
import numpy as np
import pandas as pd
import bioframe
import cooltools
import cooler
from cooltools.eigdecomp import cooler_cis_eig

# Build a 10 kb genome-wide bin table for mm10 and annotate each bin with
# its GC fraction from the local FASTA.
mm10 = bioframe.fetch_chromsizes('mm10')
chromsizes = bioframe.fetch_chromsizes('mm10')
chromosomes = list(chromsizes.index)
binsize = 10000
bins = cooler.binnify(mm10, binsize)
fasta_records = bioframe.load_fasta('/data05/genomes/mm10_20chr.fa')
bins['GC'] = bioframe.tools.frac_gc(bins, fasta_records)
bins.head()

import fnmatch
import os

# Scan the working directory for 10 kb coolers; the condition label is the
# filename up to the first dot. (Loop body may continue beyond this chunk.)
for file in os.listdir('.'):
    if fnmatch.fnmatch(file, '*_10kb.cool'):
        clr = cooler.Cooler(file)
        cond = file.split('.')[0]