def test_bedslice():
    chromsizes = util.read_chromsizes(op.join(datadir, 'toy.chrom.sizes'))
    bins = util.binnify(chromsizes, 10)
    grouped = bins.groupby('chrom')
    df = util.bedslice(grouped, chromsizes, 'chr1:0-12')
    assert df['chrom'].tolist() == ['chr1', 'chr1']
    assert df['start'].tolist() == [0, 10]
def prepare_snakemake(allc_table_path, output_dir, chrom_sizes_path,
                      template_path, chroms=None, smoothing=True,
                      chunk_size=50000000):
    output_dir = pathlib.Path(output_dir).absolute()
    output_dir.mkdir(exist_ok=True)

    allc_table = pd.read_csv(allc_table_path, sep='\t')
    allc_table.columns = ['allc_path', 'sample', 'group']
    if allc_table['group'].unique().size != 2:
        raise ValueError(
            f"There must be two and only two different groups, "
            f"got {allc_table['group'].unique().size}.")
    group1, group2 = allc_table['group'].unique()
    group1_allc = allc_table.loc[allc_table['group'] == group1, 'allc_path'].tolist()
    group2_allc = allc_table.loc[allc_table['group'] == group2, 'allc_path'].tolist()
    group1_id = allc_table.loc[allc_table['group'] == group1, 'sample'].tolist()
    group2_id = allc_table.loc[allc_table['group'] == group2, 'sample'].tolist()

    # Read chromosome sizes first; only reindex once we know which chroms to keep.
    chrom_sizes = read_chromsizes(chrom_sizes_path)
    if chroms is None:
        chroms = chrom_sizes.index.tolist()
    chrom_sizes = chrom_sizes.reindex(chroms)

    bins = binnify(chrom_sizes.loc[chroms], binsize=chunk_size)
    regions = []
    for _, (chrom, start, end) in bins.iterrows():
        region = f'{chrom}:{start}-{end}'
        regions.append(region)

    for region in regions:
        config_path = f'{output_dir}/{region}.yaml'
        parameters = dict(region=region,
                          allc_paths=group1_allc + group2_allc,
                          group1=group1_id,
                          group2=group2_id,
                          smoothing=smoothing)
        with open(config_path, 'w') as f:
            f.write(yaml.dump(parameters))

    snakefile = f"""
regions = {regions}

rule main:
    input:
        expand('{{region}}.DSS.DML.hdf', region=regions)

rule papermill:
    input:
        nb='{template_path}',
        config='{{region}}.yaml'
    output:
        nb='{{region}}.ipynb',
        data='{{region}}.DSS.DML.hdf'
    shell:
        'papermill {{input.nb}} {{output.nb}} -f {{input.config}} && sleep 10'
"""
    snakefile_path = f'{output_dir}/Snakefile'
    with open(snakefile_path, 'w') as f:
        f.write(snakefile)
    return snakefile_path
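# A minimal usage sketch for prepare_snakemake above. The file names are
# hypothetical; the three-column (allc_path, sample, group) table layout is
# taken from the function body.
#
#   snakefile_path = prepare_snakemake(
#       allc_table_path='allc_table.tsv',
#       output_dir='dss_output',
#       chrom_sizes_path='mm10.chrom.sizes',
#       template_path='dss_template.ipynb',
#       chroms=['chr1', 'chr2'],
#   )
#   # then run: snakemake -d dss_output -s dss_output/Snakefile -j 8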
def test_genome_segmentation():
    chromsizes = util.read_chromsizes(op.join(datadir, 'toy.chrom.sizes'))
    bins = util.binnify(chromsizes, 10)
    gs = util.GenomeSegmentation(chromsizes, bins)
    df = gs.fetch('chr1')
    assert len(df) == 4
    df = gs.fetch('chr1:2-30')
    assert len(df) == 3
    util.balanced_partition(gs, 2, ['chr1'])
def test_get_binsize():
    chromsizes = util.read_chromsizes(op.join(datadir, 'toy.chrom.sizes'))
    bins = util.binnify(chromsizes, 10)
    assert util.get_binsize(bins) == 10

    # variable-sized bins
    bins = pd.read_csv(op.join(datadir, 'toy.bins.var.bed'),
                       names=['chrom', 'start', 'end'], sep='\t')
    assert util.get_binsize(bins) is None

    # ambiguous case: one bin per chromosome with different lengths
    bins = pd.DataFrame({
        'chrom': ['chr1', 'chr2', 'chr3'],
        'start': [0, 0, 0],
        'end': [100, 200, 300]
    })
    assert util.get_binsize(bins) is None
def __init__(self, datasets, outfil, assembly='hg38', chromsizes_file=None,
             chroms=['#', 'X'], onlyIntra=True):
    self.outfil = os.path.abspath(os.path.expanduser(outfil))
    if os.path.exists(self.outfil):
        log.error('Cooler file {} already exists, exit ...'.format(self.outfil))
        sys.exit(1)
    self.chroms = set(chroms)
    self.onlyIntra = onlyIntra
    data = datasets

    ## Ready for data loading
    if chromsizes_file is not None:
        chromsizes_path = os.path.abspath(os.path.expanduser(chromsizes_file))
        log.info('Read chromosome sizes from {}'.format(chromsizes_path))
        chromsizes = readChromSizes(chromsizes_path, self.chroms)
    else:
        log.info('Fetch chromosome sizes from UCSC ...')
        chromsizes = fetchChromSizes(assembly, self.chroms)
    chromlist = chromsizes.keys()
    # sort chromosome labels: numeric labels first (in numeric order),
    # then X, Y, M, then any remaining labels alphabetically
    tmp = list(map(str, sorted(map(int, [i for i in chromlist if i.isdigit()]))))
    nondigits = [i for i in chromlist if not i.isdigit()]
    for i in ['X', 'Y', 'M']:
        if i in nondigits:
            tmp.append(nondigits.pop(nondigits.index(i)))
    chromlist = tmp + sorted(nondigits)
    lengths = [chromsizes[i] for i in chromlist]
    self.chromsizes = pd.Series(data=lengths, index=chromlist)
    log.info('Done')

    ## We don't read data into memory at this point.
    ## Pending more robust handling, we assume no chromosome label contains '_'.
    self.Map = {}
    for res in data:
        if data[res].endswith('.npz'):
            self.Map[res] = {}
            lib = np.load(data[res])
            for i in lib.files:
                if (not '_' in i) and ((not self.chroms) or
                                       (i.isdigit() and '#' in self.chroms) or
                                       (i in self.chroms)):
                    # Compatible with TADLib and old versions of runHiC
                    c1 = c2 = i
                    self.Map[res][(c1, c2)] = lib
                else:
                    tmp = i.split('_')
                    if len(tmp) != 2:
                        continue
                    c1, c2 = tmp
                    check1 = ((not self.chroms) or
                              (c1.isdigit() and '#' in self.chroms) or
                              (c1 in self.chroms))
                    check2 = ((not self.chroms) or
                              (c2.isdigit() and '#' in self.chroms) or
                              (c2 in self.chroms))
                    if check1 and check2:
                        self.Map[res][(c1, c2)] = lib
        else:
            self.Map[res] = self._scanFolder(data[res])

    # np.int and np.float were removed from NumPy; use explicit dtypes
    self._intertype = np.dtype({'names': ['bin1', 'bin2', 'IF'],
                                'formats': [np.int64, np.int64, np.float64]})

    log.info('Extract and save data into cooler format for each resolution ...')
    for res in self.Map:
        log.info('Current resolution: {}bp'.format(res))
        byres = self.Map[res]
        # Extract the subset of chromsizes seen at this resolution
        subset = []
        for c1, c2 in byres:
            subset.extend([c1, c2])
        subset = set(subset)
        Bool = [(i in subset) for i in self.chromsizes.index]
        chromsizes = self.chromsizes[Bool]
        bin_cumnums = self.binCount(chromsizes, res)
        log.info('Generate bin table ...')
        bintable = binnify(chromsizes, res)
        pixels = self._generator(byres, chromsizes, bin_cumnums)
        append = os.path.exists(self.outfil)
        cooler_uri = '{}::{}'.format(self.outfil, res)
        if self.onlyIntra:
            create(cooler_uri, bintable, pixels, assembly=assembly,
                   append=append, boundscheck=False, triucheck=False,
                   dupcheck=False, ensure_sorted=False,
                   metadata={'onlyIntra': str(self.onlyIntra)})
        else:
            create_from_unordered(cooler_uri, bintable, pixels,
                                  assembly=assembly, append=append,
                                  metadata={'onlyIntra': str(self.onlyIntra)},
                                  delete_temp=True, boundscheck=False,
                                  triucheck=False, dupcheck=False,
                                  ensure_sorted=False)
def toCooler(outfil, data_path, res, assembly, chroms=['#', 'X'],
             symmetric=True, count_type=float, cache_dir=None,
             delete_cache=True):
    """
    Create a Cooler from a TXT contact matrix.

    Parameters
    ----------
    outfil : str
        Path of the output Cooler file.
    data_path : str
        Path of the original contact matrix file (in TXT format).
    res : int
        Resolution / bin size of the matrix in base pairs.
    assembly : str
        Genome assembly name.
    chroms : list
        List of chromosome labels. Only Hi-C data within the specified
        chromosomes will be included. Specially, '#' stands for
        chromosomes with numerical labels. If an empty list is provided,
        all chromosome data will be loaded. (Default: ['#', 'X'])
    symmetric : bool
        If True, lower-triangle records are dropped; otherwise they are
        reflected into the upper triangle. (Default: True)
    count_type : type
        Data type of the count field. (Default: float)
    cache_dir : str or None
        All intermediate or temporary files will be generated under this
        folder. If None, the folder returned by
        :py:func:`tempfile.gettempdir` will be used. (Default: None)
    delete_cache : bool
        Whether to delete temporary files when finished. (Default: True)
    """
    outfil = os.path.abspath(os.path.expanduser(outfil))
    if os.path.exists(outfil):
        log.error('Cooler file {} already exists, exit ...'.format(outfil))
        sys.exit(1)

    ## ready for data loading
    tl = time.strftime('%Y%m%d%H%M%S', time.localtime(time.time()))
    kw = {'prefix': 'pixels', 'suffix': tl, 'dir': cache_dir}
    pixel_fil = tempfile.mktemp(**kw)

    # write the pixel file (text mode; binary mode would yield bytes in Python 3)
    chromsizes = {}
    with open(pixel_fil, 'w') as out:
        with open(data_path, 'r') as source:
            for line in source:
                c1, p1, c2, p2, count = line.rstrip().split()
                # strip a literal 'chr' prefix; str.lstrip('chr') would also
                # eat leading c/h/r characters of other labels
                c1 = c1[3:] if c1.startswith('chr') else c1
                c2 = c2[3:] if c2.startswith('chr') else c2
                check1 = ((not chroms) or (c1.isdigit() and ('#' in chroms)) or (c1 in chroms))
                check2 = ((not chroms) or (c2.isdigit() and ('#' in chroms)) or (c2 in chroms))
                if (not check1) or (not check2):
                    continue
                ip_1, ip_2 = int(p1) + res, int(p2) + res
                if c1 in chromsizes:
                    if ip_1 > chromsizes[c1]:
                        chromsizes[c1] = ip_1
                else:
                    chromsizes[c1] = ip_1
                if c2 in chromsizes:
                    if ip_2 > chromsizes[c2]:
                        chromsizes[c2] = ip_2
                else:
                    chromsizes[c2] = ip_2
                newline = [c1, p1, str(ip_1), c2, p2, str(ip_2), count]
                out.write('\t'.join(newline) + '\n')

    # generate bin table
    chromlist = chromsizes.keys()
    # sort chromosome labels (map returns an iterator in Python 3, so
    # materialize it before appending)
    tmp = list(map(str, sorted(map(int, [i for i in chromlist if i.isdigit()]))))
    nondigits = [i for i in chromlist if not i.isdigit()]
    for i in ['X', 'Y', 'M']:
        if i in nondigits:
            tmp.append(nondigits.pop(nondigits.index(i)))
    chromlist = tmp + sorted(nondigits)
    lengths = [chromsizes[i] for i in chromlist]
    chromsizes = pd.Series(data=lengths, index=chromlist)
    bins = binnify(chromsizes, res)

    # output fields
    output_field_names = ['bin1_id', 'bin2_id', 'count']
    output_field_dtypes = {'bin1_id': int, 'bin2_id': int, 'count': count_type}
    # input fields
    input_field_names = ['chrom1', 'start1', 'end1',
                         'chrom2', 'start2', 'end2', 'count']
    input_field_dtypes = {
        'chrom1': str, 'start1': int, 'end1': int,
        'chrom2': str, 'start2': int, 'end2': int,
        'count': count_type,
    }
    input_field_numbers = {
        'chrom1': 0, 'start1': 1, 'end1': 2,
        'chrom2': 3, 'start2': 4, 'end2': 5,
        'count': 6,
    }
    tril_action = 'drop' if symmetric else 'reflect'
    pipeline = sanitize_records(bins, schema='bg2', is_one_based=False,
                                tril_action=tril_action, sort=True)
    reader = pd.read_table(
        pixel_fil,
        usecols=[input_field_numbers[name] for name in input_field_names],
        names=input_field_names,
        dtype=input_field_dtypes,
        iterator=True,
        chunksize=int(40e6))

    create_from_unordered(outfil, bins, map(pipeline, reader),
                          columns=output_field_names,
                          dtypes=output_field_dtypes,
                          assembly=assembly,
                          mergebuf=int(40e6),
                          ensure_sorted=False)

    # honor the documented delete_cache flag
    if delete_cache:
        os.remove(pixel_fil)
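# A minimal usage sketch for toCooler. File names are hypothetical; it assumes
# the whitespace-delimited "chrom1 pos1 chrom2 pos2 count" layout parsed above.
#
#   toCooler('sample.10kb.cool', 'sample.10kb.txt', res=10000,
#            assembly='hg38', chroms=['#', 'X'])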
def __init__(self, datasets, outfil, assembly='hg38', chromsizes_file=None,
             chroms=['#', 'X'], onlyIntra=True, dtype='int'):
    self.outfil = os.path.abspath(os.path.expanduser(outfil))
    if os.path.exists(self.outfil):
        log.error('Cooler file {} already exists, exit ...'.format(self.outfil))
        sys.exit(1)
    self.chroms = set(chroms)
    self.onlyIntra = onlyIntra
    data = datasets

    ## Ready for data loading
    if chromsizes_file is not None:
        chromsizes_path = os.path.abspath(os.path.expanduser(chromsizes_file))
        log.info('Read chromosome sizes from {}'.format(chromsizes_path))
        chromsizes = readChromSizes(chromsizes_path, self.chroms)
    else:
        log.info('Fetch chromosome sizes from UCSC ...')
        chromsizes = fetchChromSizes(assembly, self.chroms)
    chromlist = chromsizes.keys()
    # sort chromosome labels: numeric labels first, then X, Y, M, then the rest
    tmp = list(map(str, sorted(map(int, [i for i in chromlist if i.isdigit()]))))
    nondigits = [i for i in chromlist if not i.isdigit()]
    for i in ['X', 'Y', 'M']:
        if i in nondigits:
            tmp.append(nondigits.pop(nondigits.index(i)))
    chromlist = tmp + sorted(nondigits)
    lengths = [chromsizes[i] for i in chromlist]
    self.chromsizes = pd.Series(data=lengths, index=chromlist)
    log.info('Done')

    ## We don't read data into memory at this point.
    ## Pending more robust handling, we assume no chromosome label contains '_'.
    self.Map = {}
    for res in data:
        if data[res].endswith('.npz'):
            self.Map[res] = {}
            lib = np.load(data[res])
            for i in lib.files:
                if (not '_' in i) and ((not self.chroms) or
                                       (i.isdigit() and '#' in self.chroms) or
                                       (i in self.chroms)):
                    # Compatible with TADLib and old versions of runHiC
                    c1 = c2 = i
                    self.Map[res][(c1, c2)] = lib
                else:
                    tmp = i.split('_')
                    if len(tmp) != 2:
                        continue
                    c1, c2 = tmp
                    check1 = ((not self.chroms) or
                              (c1.isdigit() and '#' in self.chroms) or
                              (c1 in self.chroms))
                    check2 = ((not self.chroms) or
                              (c2.isdigit() and '#' in self.chroms) or
                              (c2 in self.chroms))
                    if check1 and check2:
                        self.Map[res][(c1, c2)] = lib
        else:
            self.Map[res] = self._scanFolder(data[res])

    # np.int and np.float were removed from NumPy; use explicit dtypes
    self._intertype = np.dtype({'names': ['bin1', 'bin2', 'IF'],
                                'formats': [np.int64, np.int64, np.float64]})

    log.info('Extract and save data into cooler format for each resolution ...')
    for res in self.Map:
        log.info('Current resolution: {}bp'.format(res))
        byres = self.Map[res]
        # Extract the subset of chromsizes seen at this resolution
        subset = []
        for c1, c2 in byres:
            subset.extend([c1, c2])
        subset = set(subset)
        Bool = [(i in subset) for i in self.chromsizes.index]
        chromsizes = self.chromsizes[Bool]
        bin_cumnums = self.binCount(chromsizes, res)
        log.info('Generate bin table ...')
        bintable = binnify(chromsizes, res)
        pixels = self._generator(byres, chromsizes, bin_cumnums)
        mode = 'a' if os.path.exists(self.outfil) else 'w'
        if dtype == 'int':
            dtypes = {'count': np.int32}
        else:
            dtypes = {'count': np.float64}
        cooler_uri = '{}::{}'.format(self.outfil, res)
        if self.onlyIntra:
            create_cooler(cooler_uri, bintable, pixels, assembly=assembly,
                          mode=mode, boundscheck=False, triucheck=False,
                          dupcheck=False, ensure_sorted=False, ordered=True,
                          metadata={'onlyIntra': str(self.onlyIntra)},
                          dtypes=dtypes)
        else:
            create_from_unordered(cooler_uri, bintable, pixels,
                                  assembly=assembly, mode=mode,
                                  metadata={'onlyIntra': str(self.onlyIntra)},
                                  delete_temp=True, boundscheck=False,
                                  triucheck=False, dupcheck=False,
                                  ensure_sorted=False, dtypes=dtypes)
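# A hypothetical instantiation of the loader above (the enclosing class name
# isn't shown in this snippet, so 'Loader' is a placeholder; `datasets` maps
# resolution in bp to an .npz file or a folder of matrices):
#
#   loader = Loader(datasets={10000: 'raw/intra_10k.npz'},
#                   outfil='sample.cool', assembly='hg38',
#                   chroms=['#', 'X'], onlyIntra=True, dtype='int')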
def export_to_cooler(
    contact_table,
    output_prefix,
    cooler_resolution,
    fragment_table,
    chromsizes,
    query,
    query_columns=None,
    by_haplotype=False,
):
    results = []
    if query_columns:
        columns = query_columns[:]
    else:
        columns = []
    columns.extend(["align1_fragment_id", "align2_fragment_id"])
    if by_haplotype:
        columns.extend(["align1_haplotype", "align2_haplotype"])
    contact_df = dd.read_parquet(contact_table, engine=PQ_ENGINE,
                                 version=PQ_VERSION, columns=columns,
                                 index=False)
    if query:
        contact_df = contact_df.query(query)

    # squeeze the one-column frame to a Series (the read_csv squeeze keyword
    # was removed from pandas)
    chrom_dict = pd.read_csv(chromsizes, sep="\t", header=None,
                             names=["chrom", "size"],
                             index_col=["chrom"]).squeeze(axis=1)
    # create even-width bins using cooler
    bins_df = binnify(chrom_dict, cooler_resolution)
    bins_df.index.name = "bin_id"
    # convert to ranges for overlap
    bins = pr.PyRanges(bins_df.reset_index().rename(columns={
        "start": "Start",
        "end": "End",
        "chrom": "Chromosome"
    }))

    fragment_df = dd.read_parquet(fragment_table, engine=PQ_ENGINE,
                                  version=PQ_VERSION).compute()
    midpoint_df = pr.PyRanges(
        fragment_df.reset_index()[["chrom", "start", "end", "fragment_id"]]
        .assign(start=lambda x: ((x.start + x.end) * 0.5).round(0).astype(int))
        .eval("end = start + 1")
        .rename(columns={"chrom": "Chromosome", "start": "Start", "end": "End"}))
    # use a pyranges join to assign fragments to bins
    fragment_to_bin = midpoint_df.join(bins, how="left").df[["fragment_id", "bin_id"]]
    fragment_to_bin = fragment_to_bin.set_index("fragment_id").sort_index()  # .astype(np.uint32)
    nulls = fragment_to_bin["bin_id"] == -1
    if nulls.any():
        logger.warning(
            "Some fragments did not overlap bins, removing from analysis:\n{}"
            .format(fragment_to_bin[nulls].join(fragment_df)))
        fragment_to_bin = fragment_to_bin[~nulls]

    # use a join to assign each end of a contact to a bin
    binned_contacts = (contact_df.merge(fragment_to_bin, how="inner",
                                        right_index=True,
                                        left_on="align1_fragment_id")
                       .merge(fragment_to_bin, how="inner",
                              right_index=True,
                              left_on="align2_fragment_id",
                              suffixes=[None, "_2"])
                       .rename(columns={"bin_id": "bin1_id",
                                        "bin_id_2": "bin2_id"}))

    if not by_haplotype:
        cooler_path = output_prefix + ".cool"
        # group size == number of contacts per bin pair
        pixels = (binned_contacts.groupby(["bin1_id", "bin2_id"])
                  .size().rename("count").astype(np.int32).reset_index())
        create_cooler(cooler_path, bins_df, pixels, ordered=True,
                      symmetric_upper=True, ensure_sorted=True)
        c = Cooler(cooler_path)
        logger.info(f"Created cooler: {c.info}")
        results.append(cooler_path)
    else:
        tmp_parquet = output_prefix + ".tmp.pq"
        pixels = (
            # create a key to group by haplotype pair; the order of
            # haplotypes doesn't matter
            binned_contacts.assign(
                hap_key=lambda x: x[["align1_haplotype", "align2_haplotype"]]
                .apply(lambda y: "{}_{}".format(*sorted(y)).replace("-1", "nohap"),
                       axis=1, meta="object"))
            .groupby(["hap_key", "bin1_id", "bin2_id"])
            .size().rename("count").astype(np.int32)
            .reset_index().astype({"hap_key": "category"}))

        # save to a temporary parquet file; this might not be necessary,
        # but it avoids the whole contact matrix hitting memory
        pixels.to_parquet(
            tmp_parquet,
            write_metadata_file=True,
            partition_on=["hap_key"],
            write_index=False,
            engine=PQ_ENGINE,
            version=PQ_VERSION,
        )
        pixels = dd.read_parquet(tmp_parquet, engine=PQ_ENGINE,
                                 version=PQ_VERSION, columns=["hap_key"],
                                 index=False)
        hap_keys = pixels["hap_key"].unique().compute()
        # create a cooler for each haplotype pair
        for hap_key in hap_keys:
            cooler_path = f"{output_prefix}.{hap_key}.cool"
            pixels = dd.read_parquet(
                tmp_parquet,
                filters=[("hap_key", "==", hap_key)],
                index=False,
                engine=PQ_ENGINE,
                version=PQ_VERSION,
                columns=["bin1_id", "bin2_id", "count"],
            )
            create_cooler(cooler_path, bins_df, pixels, ordered=True,
                          symmetric_upper=True, ensure_sorted=True)
            c = Cooler(cooler_path)
            logger.info(f"Created cooler: {c.info}")
            results.append(cooler_path)
        shutil.rmtree(tmp_parquet)
    return results
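# A hypothetical invocation of export_to_cooler (the paths and the 1 kb
# resolution are illustrative only):
#
#   paths = export_to_cooler(
#       contact_table='contacts.parquet',
#       output_prefix='sample',
#       cooler_resolution=1000,
#       fragment_table='fragments.parquet',
#       chromsizes='hg38.chrom.sizes',
#       query=None,
#       by_haplotype=False,
#   )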
def prepare_snakemake(allc_table_path, output_dir, chrom_sizes_path,
                      template_path, chroms=None, test_covariate='group',
                      match_covariate=None, adjust_covariate=None, cutoff=0.1,
                      min_num_region=3, smooth=True, bp_span=1000,
                      min_in_span=30, max_gap_smooth=2500, max_gap=1000,
                      verbose=True, max_perms=10, stat="stat", block=False,
                      block_size=5000, chrs_per_chunk=1, cpu=40,
                      chunk_size=5000000000):
    output_dir = pathlib.Path(output_dir).absolute()
    output_dir.mkdir(exist_ok=True)

    # Read chromosome sizes first; only reindex once we know which chroms to keep.
    chrom_sizes = read_chromsizes(chrom_sizes_path)
    if chroms is None:
        chroms = chrom_sizes.index.tolist()
    chrom_sizes = chrom_sizes.reindex(chroms)

    bins = binnify(chrom_sizes.loc[chroms], binsize=chunk_size)
    regions = []
    for _, (chrom, start, end) in bins.iterrows():
        region = f'{chrom}:{start}-{end}'
        regions.append(region)

    for region in regions:
        config_path = f'{output_dir}/{region}.yaml'
        parameters = {
            'region': region,
            'allc_table_path': allc_table_path,
            'test_covariate': test_covariate,
            'match_covariate': match_covariate,
            'adjust_covariate': adjust_covariate,
            'cutoff': cutoff,
            'min_num_region': min_num_region,
            'smooth': smooth,
            'bp_span': bp_span,
            'min_in_span': min_in_span,
            'max_gap_smooth': max_gap_smooth,
            'max_gap': max_gap,
            'verbose': verbose,
            'max_perms': max_perms,
            'stat': stat,
            'block': block,
            'block_size': block_size,
            'chrs_per_chunk': chrs_per_chunk,
            'cpu': cpu
        }
        with open(config_path, 'w') as f:
            f.write(yaml.dump(parameters))

    snakefile = f"""
regions = {regions}

rule main:
    input:
        expand('{{region}}.DMR.hdf', region=regions)

rule papermill:
    input:
        nb='{template_path}',
        config='{{region}}.yaml'
    output:
        nb='{{region}}.ipynb',
        data='{{region}}.DMR.hdf'
    threads:
        1  # {cpu}
    shell:
        'papermill {{input.nb}} {{output.nb}} -f {{input.config}} && sleep 10'
"""
    snakefile_path = f'{output_dir}/Snakefile'
    with open(snakefile_path, 'w') as f:
        f.write(snakefile)
    return regions
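# Both prepare_snakemake variants chunk the genome with cooler's binnify and
# format each bin as a 'chrom:start-end' region string. A minimal sketch of
# just that step (the toy chromosome size is an assumption):
#
#   chrom_sizes = pd.Series({'chr1': 120_000_000})
#   bins = binnify(chrom_sizes, binsize=50_000_000)
#   regions = [f'{c}:{s}-{e}' for _, (c, s, e) in bins.iterrows()]
#   # ['chr1:0-50000000', 'chr1:50000000-100000000', 'chr1:100000000-120000000']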
lock.acquire()
try:
    print("right before collapse ...{} {}".format(i, spans[i:i + batchsize]))
    results = self._map(self.aggregate, spans[i:i + batchsize])
finally:
    lock.release()
for df in results:
    # yield {k: v.values for k, v in six.iteritems(df)}
    yield df

input_uri = ""
c = Cooler(input_uri)
new_bins = binnify(c.chromsizes, 2 * c.binsize)
iterator = CoolerAggregator(input_uri, new_bins, 1000000, batchsize=1, map=map)

# # last message before it fails ...
# # INFO:cooler:17868809 17872380
# for ii in iterator:
#     print(ii)

# from cooler.api import Cooler
lo, hi = 17869999, 17872300
# lo, hi = 17868809, 17872380
clr = Cooler(input_uri)
# convert_enum=False returns chroms as raw ints
table = clr.pixels(join=True, convert_enum=False)
def test_check_bins():
    chromsizes = util.read_chromsizes(op.join(datadir, 'toy.chrom.sizes'))
    bins = util.binnify(chromsizes, 10)
    bins['chrom'] = bins['chrom'].astype(str)
    bins = util.check_bins(bins, chromsizes)
    # pd.api.types.is_categorical was removed from pandas; check the dtype directly
    assert isinstance(bins["chrom"].dtype, pd.CategoricalDtype)
def test_get_chromsizes():
    chromsizes = util.read_chromsizes(op.join(datadir, 'toy.chrom.sizes'))
    bins = util.binnify(chromsizes, 10)
    assert np.allclose(util.get_chromsizes(bins), chromsizes)
def test_binnify():
    chromsizes = util.read_chromsizes(op.join(datadir, 'toy.chrom.sizes'))
    bins = util.binnify(chromsizes, 10)
    assert len(bins) == 8
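# An illustrative binnify call with made-up sizes (the real toy.chrom.sizes
# values aren't shown here; the tests above only imply 8 bins of width 10 in
# total, 4 of them on chr1). Note the last bin is clipped to the chromosome end:
#
#   chromsizes = pd.Series({'chr1': 32, 'chr2': 40})
#   util.binnify(chromsizes, 10)
#   #   chrom  start  end
#   # 0  chr1      0   10
#   # 1  chr1     10   20
#   # 2  chr1     20   30
#   # 3  chr1     30   32
#   # 4  chr2      0   10
#   # ...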