def main(mcoolfile, outdir, filename, window, cutoff, binsize):
    """Compute insulation scores at one resolution of an mcool file and save a BigWig.

    Parameters
    ----------
    mcoolfile
        Path to the multi-resolution cooler file.
    outdir
        Directory that receives ``<filename>.bw``.
    filename
        Output file name without the ``.bw`` extension.
    window
        Insulation window, forwarded as ``window_bp``. When ``binsize`` is
        given explicitly, ``window`` must be a multiple of it.
    cutoff
        Not used by this function; kept so existing callers keep working.
    binsize
        Bin size to analyse, or ``-1`` to use the smallest bin size found in
        the file.

    Notes
    -----
    On invalid input the error is printed to stderr and the process exits
    with status 1, so wrappers can detect the failure.
    """
    # Group paths of every cooler stored in the file.
    cooler_list = cooler.io.ls(mcoolfile)

    # Files written by an older cooler version have no '/resolutions/<binsize>'
    # groups, so the bin size has to be read from each cooler itself.
    old_version = not any('/resolutions/' in res for res in cooler_list)
    if old_version:
        binsize_list = [
            int(cooler.Cooler(str(mcoolfile) + '::' + res).binsize)
            for res in cooler_list
        ]
    else:
        binsize_list = [int(res.split('/')[-1]) for res in cooler_list]

    if not binsize_list:
        print("Error: No coolers were found in this mcool file.", file=sys.stderr)
        sys.exit(1)

    # Check the input parameters
    if binsize == -1:
        binsize = min(binsize_list)
    elif binsize not in binsize_list:
        print("Error: This binsize is not available in this mcool file. "
              "This is the list of binsizes availables:", file=sys.stderr)
        print(binsize_list, file=sys.stderr)
        sys.exit(1)
    elif window % binsize != 0:
        print("Error: Window size must be multiple of binsize", file=sys.stderr)
        sys.exit(1)

    # binsize_list is index-aligned with cooler_list for both file layouts, so
    # the same lookup selects the requested resolution. (Previously old-layout
    # files always opened the group with the highest index, whatever `binsize`
    # was requested.)
    cooler_path = str(mcoolfile) + '::' + cooler_list[binsize_list.index(binsize)]
    c = cooler.Cooler(cooler_path)
    print(c)

    # Chromosome sizes as a Series indexed by chromosome name.
    chroms = c.chroms()[:]
    chromsizes = pd.Series(chroms['length'].values, index=chroms['name'].values)

    # Getting insulating boundaries
    insul = find_insulating_boundaries(c, balance='weight', window_bp=window,
                                       min_dist_bad_bin=2)

    # Convert to BigWig. The path is relative to `outdir`; the former leading
    # '/' forced the output under the filesystem root.
    bioframe.to_bigwig(insul, chromsizes, f'{outdir}/{filename}.bw',
                       f'log2_insulation_score_{window}')
def eigs_cis(
    cool_path,
    phasing_track,
    view,
    n_eigs,
    clr_weight_name,
    ignore_diags,
    verbose,
    out_prefix,
    bigwig,
):
    """
    Perform eigen value decomposition on a cooler matrix to calculate
    compartment signal by finding the eigenvector that correlates best with the
    phasing track.

    COOL_PATH : the paths to a .cool file with a balanced Hi-C map. Use the
    '::' syntax to specify a group path in a multicooler file.

    TRACK_PATH : the path to a BedGraph-like file that stores phasing track as
    track-name named column.

    BedGraph-like format assumes tab-separated columns chrom, start, stop and
    track-name.

    """
    # NOTE(review): the docstring layout (COOL_PATH / TRACK_PATH) suggests it
    # doubles as CLI help text, so it is kept verbatim - confirm.
    clr = cooler.Cooler(cool_path)

    if phasing_track is not None:
        # TODO: This all needs to be refactored into a more generic tabular
        # file parser. Needs to handle stdin case too.
        track_path, col = phasing_track
        stream, header_names = sniff_for_header(track_path)

        if header_names is not None:
            # Header present: `col` is either a position in the header or a
            # column name that must appear in it.
            if not isinstance(col, int):
                if col not in header_names:
                    raise click.BadParameter(
                        'Column "{}" not found in header "{}"'.format(
                            col, ",".join(header_names)))
            else:
                try:
                    col = header_names[col]
                except IndexError:
                    raise click.BadParameter(
                        'Column #{} not compatible with header "{}".'.format(
                            col, ",".join(header_names)))
            track_name = col
            read_opts = {
                "header": "infer",
                "usecols": ["chrom", "start", "end", track_name],
            }
        else:
            # No header: only a positional column can be selected.
            if not isinstance(col, int):
                raise click.BadParameter(
                    "No header found. "
                    'Cannot find "{}" column without a header.'.format(col))
            track_name = "ref"
            read_opts = {
                "header": None,
                "usecols": [0, 1, 2, col],
                "names": ["chrom", "start", "end", track_name],
            }

        column_types = {
            "chrom": str,
            "start": np.int64,
            "end": np.int64,
            track_name: np.float64,
        }
        # From here on `phasing_track` holds the parsed table, not the
        # (path, column) pair.
        phasing_track = pd.read_table(
            stream,
            dtype=column_types,
            comment="#",
            verbose=verbose,
            **read_opts
        )

    # define view for cis compartment-calling:
    # the input "view" BED file if given, otherwise the view built from the cooler
    view_df = (
        make_cooler_view(clr)
        if view is None
        else read_viewframe_from_file(view, clr, check_sorting=True)
    )

    # TODO: Add check that view_df has the same bins as track

    eigvals, eigvec_table = eigdecomp.eigs_cis(
        clr=clr,
        phasing_track=phasing_track,
        view_df=view_df,
        n_eigs=n_eigs,
        clr_weight_name=clr_weight_name,
        ignore_diags=ignore_diags,
        clip_percentile=99.9,
        sort_metric=None,
    )

    # Output: every file shares the "<out_prefix>.cis" stem.
    cis_prefix = out_prefix + ".cis"
    eigvals.to_csv(cis_prefix + ".lam.txt", sep="\t", index=False)
    eigvec_table.to_csv(cis_prefix + ".vecs.tsv", sep="\t", index=False)
    if bigwig:
        bioframe.to_bigwig(
            eigvec_table,
            clr.chromsizes,
            cis_prefix + ".bw",
            value_field="E1",
        )
def insulation(
    in_path,
    window,
    output,
    view,
    ignore_diags,
    clr_weight_name,
    min_frac_valid_pixels,
    min_dist_bad_bin,
    threshold,
    window_pixels,
    append_raw_scores,
    chunksize,
    verbose,
    bigwig,
):
    """
    Calculate the diamond insulation scores and call insulating boundaries.

    IN_PATH : The paths to a .cool file with a balanced Hi-C map.

    WINDOW : The window size for the insulation score calculations.
             Multiple space-separated values can be provided.
             By default, the window size must be provided in units of bp.
             When the flag --window-pixels is set, the window sizes must
             be provided in units of pixels instead.

    """
    # NOTE(review): the docstring layout (IN_PATH / WINDOW) suggests it doubles
    # as CLI help text, so it is kept verbatim - confirm.

    # BigWig file names are derived from `output`, so the two options only work
    # together. Reject the combination before doing any work, instead of
    # failing on `None + "."` after the whole table has been computed.
    if bigwig and not output:
        raise ValueError(
            "Writing bigwig files requires an output path: "
            "bigwig file names are derived from it."
        )

    clr = cooler.Cooler(in_path)

    # Create view:
    if view is None:
        # full chromosomes:
        view_df = make_cooler_view(clr)
    else:
        # read view_df dataframe, and verify against cooler
        view_df = read_viewframe_from_file(view, clr, check_sorting=True)

    # Windows given in pixels are converted to bp using the cooler bin size:
    if window_pixels:
        window = [win * clr.info["bin-size"] for win in window]

    ins_table = api.insulation.insulation(
        clr,
        view_df=view_df,
        window_bp=window,
        ignore_diags=ignore_diags,
        # an empty weight name is forwarded as None
        clr_weight_name=clr_weight_name if clr_weight_name else None,
        min_frac_valid_pixels=min_frac_valid_pixels,
        min_dist_bad_bin=min_dist_bad_bin,
        threshold=threshold,
        append_raw_scores=append_raw_scores,
        chunksize=chunksize,
        verbose=verbose,
    )

    # output to file if specified:
    if output:
        ins_table.to_csv(output, sep="\t", index=False, na_rep="nan")
    # or print into stdout otherwise:
    else:
        print(ins_table.to_csv(sep="\t", index=False, na_rep="nan"))

    # Write the insulation track as a bigwig, one file per window size:
    if bigwig:
        for w in window:
            bioframe.to_bigwig(
                ins_table,
                clr.chromsizes,
                output + "." + str(w) + ".bw",
                value_field=f"log2_insulation_score_{w}",
            )
def call_compartments(cool_path, reference_track, contact_type, n_eigs,
                      verbose, out_prefix, bigwig):
    """
    Perform eigen value decomposition on a cooler matrix to calculate
    compartment signal by finding the eigenvector that correlates best with the
    phasing track.


    COOL_PATH : the paths to a .cool file with a balanced Hi-C map.

    TRACK_PATH : the path to a BedGraph-like file that stores phasing track as
    track-name named column.

    BedGraph-like format assumes tab-separated columns chrom, start, stop and
    track-name.

    """
    # NOTE(review): the docstring layout (COOL_PATH / TRACK_PATH) suggests it
    # doubles as CLI help text, so it is kept verbatim - confirm.

    # Only these two values are dispatched below; anything else used to end in
    # an UnboundLocalError when the results were written.
    if contact_type not in ("cis", "trans"):
        raise ValueError(
            'contact_type must be "cis" or "trans", got {!r}'.format(contact_type))

    clr = cooler.Cooler(cool_path)

    if reference_track is not None:
        # TODO: This all needs to be refactored into a more generic tabular
        # file parser. Needs to handle stdin case too.
        track_path, col = reference_track
        buf, names = sniff_for_header(track_path)

        if names is None:
            # No header: only a positional column can be selected.
            if not isinstance(col, int):
                raise click.BadParameter(
                    "No header found. "
                    'Cannot find "{}" column without a header.'.format(col))

            track_name = "ref"
            kwargs = dict(
                header=None,
                usecols=[0, 1, 2, col],
                names=["chrom", "start", "end", track_name],
            )
        else:
            # Header present: `col` is a position in the header or a name in it.
            if isinstance(col, int):
                try:
                    col = names[col]
                except IndexError as err:
                    raise click.BadParameter(
                        'Column #{} not compatible with header "{}".'.format(
                            col, ",".join(names))) from err
            elif col not in names:
                raise click.BadParameter(
                    'Column "{}" not found in header "{}"'.format(
                        col, ",".join(names)))

            track_name = col
            kwargs = dict(header="infer",
                          usecols=["chrom", "start", "end", track_name])

        track_df = pd.read_table(
            buf,
            dtype={
                "chrom": str,
                "start": np.int64,
                "end": np.int64,
                track_name: np.float64,
            },
            comment="#",
            verbose=verbose,
            **kwargs
        )

        # we need to merge phasing track DataFrame with the cooler bins to get
        # a DataFrame with phasing info aligned and validated against bins
        # inside of the cooler file.
        track = pd.merge(left=clr.bins()[:], right=track_df, how="left",
                         on=["chrom", "start", "end"])

        # Sanity check: a left merge that yields more rows than there are bins
        # means the track could not be matched one-to-one on
        # ["chrom", "start", "end"]. The exception used to be constructed
        # without `raise`, so the check never fired.
        if len(track) > len(clr.bins()):
            raise ValueError(
                "There is something in the {} that ".format(track_path) +
                "couldn't be merged with cooler-bins {}".format(cool_path))
    else:
        # No reference track: use the bin coordinates from the cooler.
        track = clr.bins()[["chrom", "start", "end"]][:]
        track_name = None

    # it's contact_type dependent:
    if contact_type == "cis":
        eigvals, eigvec_table = eigdecomp.cooler_cis_eig(
            clr=clr,
            bins=track,
            regions=None,
            n_eigs=n_eigs,
            phasing_track_col=track_name,
            clip_percentile=99.9,
            sort_metric=None,
        )
    else:  # "trans", guaranteed by the check at the top
        eigvals, eigvec_table = eigdecomp.cooler_trans_eig(
            clr=clr,
            bins=track,
            n_eigs=n_eigs,
            partition=None,
            phasing_track_col=track_name,
            sort_metric=None,
        )

    # Output
    eigvals.to_csv(out_prefix + "." + contact_type + ".lam.txt",
                   sep="\t", index=False)
    eigvec_table.to_csv(out_prefix + "." + contact_type + ".vecs.tsv",
                        sep="\t", index=False)
    if bigwig:
        bioframe.to_bigwig(
            eigvec_table,
            clr.chromsizes,
            out_prefix + "." + contact_type + ".bw",
            value_field="E1",
        )
def save_bigwig(vectors, savepath, genome, columns=('E1', 'E2', 'E3')):
    """Write one BigWig file per requested column of ``vectors``.

    For every name in ``columns`` the target path is
    ``savepath + '.<name>.bw'``. ``create_dir`` is called with that path and
    ``to_bigwig`` is then called with the name as ``value_field``.

    Parameters
    ----------
    vectors
        Table passed unchanged to ``to_bigwig``.
    savepath
        Path prefix (a string) of the output files.
    genome
        Passed to ``fetch_chromsizes`` to obtain the chromosome sizes.
    columns
        Iterable of column names to export. The default is an immutable tuple
        so that it cannot be modified between calls.
    """
    chroms = fetch_chromsizes(genome)
    for column in columns:
        bw_path = savepath + '.{}.bw'.format(column)
        # NOTE(review): presumably makes sure the directory for bw_path
        # exists - confirm against create_dir.
        create_dir(bw_path)
        to_bigwig(vectors, chroms, bw_path, value_field=column)
def main(mcoolfile, outdir, filename, binsize, window, bweak, bstrong, cutoff,
         pixels_frac):
    """Compute insulation scores and boundaries for one resolution of an mcool file.

    Writes two files into ``outdir``:

    * ``<filename>.bw`` - the ``log2_insulation_score_<window>`` column of the
      rows that pass the valid-pixel filter;
    * ``<filename>_boundaries.bed`` - headerless, tab-separated rows with
      chrom, start, end, the boundary classification ('Strong' or 'Weak') and
      the boundary strength.

    Parameters
    ----------
    mcoolfile
        Path to the multi-resolution cooler file.
    outdir
        Output directory.
    filename
        Base name of the output files (no extension).
    binsize
        Bin size to analyse, or ``-1`` to use the smallest bin size found in
        the file.
    window
        Insulation window, passed to ``calculate_insulation_score``. When
        ``binsize`` is given explicitly, ``window`` must be a multiple of it.
    bweak
        Minimum boundary strength for a row to be reported as a boundary.
    bstrong
        Boundary strength from which a boundary is classified as 'Strong'.
    cutoff
        Forwarded to ``find_boundaries``.
    pixels_frac
        Forwarded to ``find_boundaries``; also the fraction of
        ``(window / binsize) ** 2`` valid pixels a row needs to be kept.

    Notes
    -----
    On invalid input the error is printed to stderr and the process exits
    with status 1, so wrappers can detect the failure.
    """
    # Group paths of every cooler stored in the file.
    cooler_list = cooler.fileops.list_coolers(mcoolfile)

    # Files written by an older cooler version have no '/resolutions/<binsize>'
    # groups, so the bin size has to be read from each cooler itself.
    old_version = not any('/resolutions/' in res for res in cooler_list)
    if old_version:
        binsize_list = [
            int(cooler.Cooler(str(mcoolfile) + '::' + res).binsize)
            for res in cooler_list
        ]
    else:
        binsize_list = [int(res.split('/')[-1]) for res in cooler_list]

    if not binsize_list:
        print("Error: No coolers were found in this mcool file.", file=sys.stderr)
        sys.exit(1)

    # Check the input parameters
    if binsize == -1:
        binsize = min(binsize_list)
    elif binsize not in binsize_list:
        print("Error: This binsize is not available in this mcool file. "
              "This is the list of binsizes availables:", file=sys.stderr)
        print(binsize_list, file=sys.stderr)
        sys.exit(1)
    elif window % binsize != 0:
        print("Error: Window size must be multiple of binsize", file=sys.stderr)
        sys.exit(1)

    # binsize_list is index-aligned with cooler_list for both file layouts, so
    # the same lookup selects the requested resolution. (Previously old-layout
    # files always opened the group with the highest index, which could
    # disagree with the `binsize` used for the pixel cutoff below.)
    cooler_path = str(mcoolfile) + '::' + cooler_list[binsize_list.index(binsize)]
    c = cooler.Cooler(cooler_path)
    print(c)

    # Chromosome sizes as a Series indexed by chromosome name.
    chroms = c.chroms()[:]
    chromsizes = pd.Series(chroms['length'].values, index=chroms['name'].values)

    # calculate insulation score
    ins_table = calculate_insulation_score(c, window)

    # Find boundaries
    ins_table = find_boundaries(ins_table, pixels_frac, cutoff)

    # Keep only rows whose number of valid pixels reaches `pixels_frac` of the
    # (window / binsize)^2 pixels of a full window.
    icutoff = (window / binsize) ** 2 * pixels_frac
    ins_table_filtered = ins_table[
        ins_table[f'n_valid_pixels_{window}'] >= icutoff]

    # Convert to BigWig
    bioframe.to_bigwig(ins_table_filtered, chromsizes,
                       f'{outdir}/{filename}.bw',
                       f'log2_insulation_score_{window}')

    # Classify the boundaries as strong and weak:
    # strength >= bstrong -> 'Strong'; bweak <= strength < bstrong -> 'Weak';
    # rows below bweak are dropped.
    strength_col = f'boundary_strength_{window}'
    boun_table = ins_table_filtered[
        ins_table_filtered[strength_col] >= bweak].copy()
    boun_table['boundary_classification'] = np.where(
        boun_table[strength_col] >= bstrong, 'Strong', 'Weak')

    # Write the boundaries as a headerless, tab-separated BED-like file.
    columns = [
        'chrom', 'start', 'end', 'boundary_classification', strength_col
    ]
    boun_table.to_csv(f'{outdir}/{filename}_boundaries.bed',
                      sep='\t',
                      header=False,
                      index=False,
                      columns=columns)
for chrom in cool.chromnames[0:22]]} #supports # Computing eigenvealues and eigenvectors lams, vectors = eigdecomp.cooler_cis_eig(cool, genes, regions=supports['cis'], phasing_track_col='gene_count', sort_metric='spearmanr') # cooler_cis_eig sorts eigenvectors by decreasing Spearman correlation with the phasing track # (gene count in this case). In the past, people have used the eigenvector associated with the max # eigenvalue. It is worth considering situations where these two sorting process differ. # Output contact_type='cis' lams.to_csv(out_prefix + '.' + contact_type +'.'+ res+ '.lam.txt', sep='\t', index=False) vectors.to_csv(out_prefix + '.' + contact_type +'.'+ res+'.vecs.tsv', sep='\t', index=False) bioframe.to_bigwig(vectors,cool.chromsizes,out_prefix +'.' + contact_type + '.'+ res+ '.bw',value_field='E1') exp = {} #Cis Expected with mp.Pool(10) as p: result = expected.diagsum(cool, supports['cis'], transforms={'balanced': lambda p: p['count']*p['weight1']*p['weight2']}, ignore_diags=2, map=p.map) result = pd.concat([result[arm] for arm in supports['cis']], keys=[(arm[0], f'{arm[0]}:{arm[1]}-{arm[2]}') for arm in supports['cis']], names=['chrom', 'region']) result = result.groupby(['chrom','diag']).sum() result['balanced.avg'] = result['balanced.sum'] / result['n_valid'] result = result.reset_index() exp['cis'] = result
def call_compartments(
    cool_path,
    reference_track,
    regions,
    contact_type,
    n_eigs,
    verbose,
    out_prefix,
    bigwig,
):
    """
    Perform eigen value decomposition on a cooler matrix to calculate
    compartment signal by finding the eigenvector that correlates best with the
    phasing track.


    COOL_PATH : the paths to a .cool file with a balanced Hi-C map. Use the
    '::' syntax to specify a group path in a multicooler file.

    TRACK_PATH : the path to a BedGraph-like file that stores phasing track as
    track-name named column.

    BedGraph-like format assumes tab-separated columns chrom, start, stop and
    track-name.

    """
    # NOTE(review): the docstring layout (COOL_PATH / TRACK_PATH) suggests it
    # doubles as CLI help text, so it is kept verbatim - confirm.

    # Only these two values are dispatched below; anything else used to end in
    # an UnboundLocalError when the results were written.
    if contact_type not in ("cis", "trans"):
        raise ValueError(
            'contact_type must be "cis" or "trans", got {!r}'.format(contact_type))

    clr = cooler.Cooler(cool_path)

    if reference_track is not None:
        # TODO: This all needs to be refactored into a more generic tabular
        # file parser. Needs to handle stdin case too.
        track_path, col = reference_track
        buf, names = sniff_for_header(track_path)

        if names is None:
            # No header: only a positional column can be selected.
            if not isinstance(col, int):
                raise click.BadParameter(
                    "No header found. "
                    'Cannot find "{}" column without a header.'.format(col))

            track_name = "ref"
            kwargs = dict(
                header=None,
                usecols=[0, 1, 2, col],
                names=["chrom", "start", "end", track_name],
            )
        else:
            # Header present: `col` is a position in the header or a name in it.
            if isinstance(col, int):
                try:
                    col = names[col]
                except IndexError as err:
                    raise click.BadParameter(
                        'Column #{} not compatible with header "{}".'.format(
                            col, ",".join(names))) from err
            elif col not in names:
                raise click.BadParameter(
                    'Column "{}" not found in header "{}"'.format(
                        col, ",".join(names)))

            track_name = col
            kwargs = dict(header="infer",
                          usecols=["chrom", "start", "end", track_name])

        track_df = pd.read_table(
            buf,
            dtype={
                "chrom": str,
                "start": np.int64,
                "end": np.int64,
                track_name: np.float64,
            },
            comment="#",
            verbose=verbose,
            **kwargs
        )

        # we need to merge phasing track DataFrame with the cooler bins to get
        # a DataFrame with phasing info aligned and validated against bins
        # inside of the cooler file.
        track = pd.merge(left=clr.bins()[:], right=track_df, how="left",
                         on=["chrom", "start", "end"])

        # Sanity check: a left merge that yields more rows than there are bins
        # means the track could not be matched one-to-one on
        # ["chrom", "start", "end"]. The exception used to be constructed
        # without `raise`, so the check never fired.
        if len(track) > len(clr.bins()):
            raise ValueError(
                "There is something in the {} that ".format(track_path) +
                "couldn't be merged with cooler-bins {}".format(cool_path))
    else:
        # use entire bin-table from cooler, when reference-track is not provided:
        track = clr.bins()[["chrom", "start", "end"]][:]
        track_name = None

    # define regions for cis compartment-calling
    # use input "regions" BED file or all chromosomes mentioned in "track":
    if regions is None:
        # use full chromosomes referred to in the track :
        track_chroms = track["chrom"].unique()
        cis_regions_table = bioframe.parse_regions(track_chroms, clr.chromsizes)
        cis_regions_table["name"] = cis_regions_table["chrom"]
    else:
        if contact_type == "trans":
            raise NotImplementedError(
                "Regions not yet supported with trans contact type")

        # Flexible reading of the regions table; the sniffed header names are
        # not used, the file is always read as headerless.
        regions_buf, _ = sniff_for_header(regions)
        cis_regions_table = pd.read_csv(regions_buf, sep="\t", header=None)

        n_columns = cis_regions_table.shape[1]
        if n_columns not in (3, 4):
            raise ValueError(
                "The region file does not have three or four tab-delimited columns. "
                "We expect a bed file with columns chrom, start, end, and optional name"
            )
        # Name the positional columns: chrom, start, end and, if present, name.
        bed_columns = ["chrom", "start", "end", "name"][:n_columns]
        cis_regions_table = cis_regions_table.rename(
            columns=dict(enumerate(bed_columns)))
        cis_regions_table = bioframe.parse_regions(cis_regions_table)

        # make sure custom regions are compatible with the track:
        track_chroms = track["chrom"].unique()
        cis_regions_table = cis_regions_table[
            cis_regions_table["chrom"].isin(track_chroms)
        ].reset_index(drop=True)

    # it's contact_type dependent:
    if contact_type == "cis":
        eigvals, eigvec_table = eigdecomp.cooler_cis_eig(
            clr=clr,
            bins=track,
            regions=cis_regions_table,
            n_eigs=n_eigs,
            phasing_track_col=track_name,
            clip_percentile=99.9,
            sort_metric=None,
        )
    else:  # "trans", guaranteed by the check at the top
        eigvals, eigvec_table = eigdecomp.cooler_trans_eig(
            clr=clr,
            bins=track,
            n_eigs=n_eigs,
            partition=None,
            phasing_track_col=track_name,
            sort_metric=None,
        )

    # Output
    eigvals.to_csv(out_prefix + "." + contact_type + ".lam.txt",
                   sep="\t", index=False)
    eigvec_table.to_csv(out_prefix + "." + contact_type + ".vecs.tsv",
                        sep="\t", index=False)
    if bigwig:
        bioframe.to_bigwig(
            eigvec_table,
            clr.chromsizes,
            out_prefix + "." + contact_type + ".bw",
            value_field="E1",
        )