def from_bed(
    cls, bed, location, chrom_size_path, region_dim="region", sort_bed=True
):
    """
    Create empty RegionDS from a bed file.

    Parameters
    ----------
    bed
        Either a path (``str`` or ``pathlib.Path``) to a BED file, or an
        already loaded table exposing the pandas DataFrame API. It must have
        3 columns (chrom, start, end) or 4 columns (chrom, start, end, name).
        A table passed in by the caller is copied, never modified.
    location
        Directory of the new dataset. It is made absolute and created
        (including parents) when it does not exist.
    region_dim
        Name given to the region index. For 3-column input it is also the
        prefix of the generated region ids (``f"{region_dim}_{i}"``); every
        coordinate is stored as ``f"{region_dim}_{column}"``.
    chrom_size_path
        Passed as ``g`` to ``BedTool.sort`` when sorting, and forwarded
        unchanged to the ``cls`` constructor.
    sort_bed
        When ``bed`` is a path, sort it with ``chrom_size_path`` before
        loading. Ignored when ``bed`` is already a table.

    Returns
    -------
    The new ``cls`` instance, after its ``save()`` method has been called.

    Raises
    ------
    ValueError
        If the table has neither 3 nor 4 columns.
    """
    if isinstance(bed, (str, pathlib.Path)):
        bed_tool = BedTool(str(bed))
        if sort_bed:
            # sort bed based on chrom_size_path
            bed_tool = bed_tool.sort(g=chrom_size_path)
        # Always convert to a table: the code below needs .shape/.index,
        # which a raw BedTool does not provide.
        bed = bed_tool.to_dataframe()
    else:
        # Work on a copy so the caller's table keeps its index and columns.
        bed = bed.copy()

    n_cols = bed.shape[1]
    if n_cols == 3:
        # No name column: derive region ids from the existing index values.
        bed.index = bed.index.map(lambda i: f"{region_dim}_{i}")
    elif n_cols == 4:
        # The fourth column holds the region names and becomes the index.
        bed = bed.set_index(bed.columns[3])
    else:
        raise ValueError(
            "bed file need to be either 3 columns (chrom, start, end) "
            "or 4 columns (chrom, start, end, name)"
        )
    bed.index.name = region_dim
    bed.columns = ["chrom", "start", "end"]

    ds = xr.Dataset({})
    for column, values in bed.items():
        key = f"{region_dim}_{column}"
        ds.coords[key] = values
        # Object-dtype coordinates are stored as plain strings instead.
        if ds.coords[key].dtype == "object":
            ds.coords[key] = ds.coords[key].astype(str)

    location = pathlib.Path(location).absolute()
    location.mkdir(exist_ok=True, parents=True)
    region_ds = cls(
        ds,
        region_dim=region_dim,
        location=location,
        chrom_size_path=chrom_size_path,
    )
    region_ds.save()
    return region_ds
def preprocess(self, chrlenPath, genomePath, w=100, upStream=1000,
               downStream=1000, overlap=0.5, method='mean', col=4,
               type='bed', n_workers=4):
    """Build one gene-by-window table per input track and store it in ``self.raw``.

    The genome is tiled into windows of ``w`` bases. Each track in
    ``self.paths`` is summarised per window, and the windows intersecting the
    region ``[start - upStream, start + downStream)`` of every annotated gene
    are collected into one row per gene. Rows of genes whose strand is not
    ``'+'`` are reversed.

    Parameters
    ----------
    chrlenPath
        Genome file passed as ``g`` to ``BedTool.window_maker``.
    genomePath
        Annotation file loaded through ``BedTool(...).to_dataframe()``; it
        must yield ``feature``, ``seqname``, ``start``, ``strand`` and
        ``attributes`` columns. Only rows whose feature is ``'gene'`` are
        used. The gene id is the text between the first ``=`` and the first
        ``;`` of ``attributes`` (or the end of the string when there is no
        ``;``).
    w
        Window width handed to ``window_maker``.
    upStream, downStream
        Number of bases kept before / after the annotated gene start. Both
        must be multiples of ``w``.
    overlap
        Passed as ``F`` to ``BedTool.map``.
    method
        Passed as ``o`` to ``BedTool.map`` when ``type == 'sigbed'``.
    col
        Passed as ``c`` to ``BedTool.map`` when ``type == 'sigbed'``.
    type
        ``'bed'`` counts track records per window (``c=1, o='count'``);
        ``'sigbed'`` aggregates column ``col`` with ``method``.
    n_workers
        Size of the thread pool; one job is submitted per path.

    Returns
    -------
    None
        Results are stored on the instance: ``self.raw`` maps each entry of
        ``self.names`` to its table with rows containing missing values
        dropped, and ``self.genes`` is filled from the first finished table
        when it is still ``None``.

    Raises
    ------
    AssertionError
        If ``upStream`` or ``downStream`` is not a multiple of ``w``.
    ValueError
        If ``type`` is neither ``'bed'`` nor ``'sigbed'``.
    """
    assert upStream % w == 0 and downStream % w == 0, (
        'upStream and downStream must both be multiples of w')
    if type not in ('bed', 'sigbed'):
        # Fail before any work is done instead of handing None to intersect().
        raise ValueError(
            "type must be 'bed' or 'sigbed', got {!r}".format(type))

    def first_attribute_value(attributes):
        # Text after the first '=' up to the first ';'. A missing ';' means
        # the value runs to the end of the string (find() returns -1 there,
        # which as a slice bound would cut off the last character).
        value_start = attributes.find('=') + 1
        value_end = attributes.find(';')
        if value_end == -1:
            return attributes[value_start:]
        return attributes[value_start:value_end]

    window = BedTool().window_maker(g=chrlenPath, w=w)

    annotation = BedTool(genomePath).to_dataframe()
    # .copy() so the column assignments below act on an independent frame
    # rather than on a slice of `annotation`.
    genes = annotation.loc[
        annotation['feature'] == 'gene',
        ['seqname', 'start', 'strand', 'attributes'],
    ].copy()
    genes['attributes'] = genes['attributes'].apply(first_attribute_value)
    genes['start'] = genes['start'] - upStream
    genes['end'] = genes['start'] + upStream + downStream
    genes = genes[['seqname', 'start', 'end', 'attributes', 'strand']]
    genes.columns = ['chrom', 'start', 'end', 'ID', 'strand']
    # Drop genes whose upstream flank would start before position 0.
    genes = genes[genes.start >= 0].copy()
    # Lower-case only the first character of the chromosome name.
    genes['chrom'] = genes['chrom'].apply(
        lambda name: name[0].lower() + name[1:])
    atlas = BedTool.from_dataframe(
        genes[['chrom', 'start', 'end', 'ID']]).sort()
    genes = genes.set_index(['ID'])

    def worker(atlas, window, path, genes, col, method, overlap, track_type):
        # Summarise one track per window, then gather the windows of each gene.
        track = BedTool(path).sort()
        if track_type == 'bed':
            binned = window.map(track, c=1, o='count', F=overlap)
        else:  # 'sigbed' (validated by the caller)
            binned = window.map(track, o=method, c=col, F=overlap)
        hits = atlas.intersect(
            binned, loj=True, wa=True, wb=True).to_dataframe()
        # NOTE(review): presumably the per-window value ends up in the column
        # that to_dataframe() names 'thickEnd' (4 atlas columns followed by
        # 4 window columns) - confirm against the pybedtools default names.
        by_gene = hits.groupby('name')
        rows = []
        for gene_id, strand in zip(genes.index, genes['strand']):
            values = by_gene.get_group(gene_id)['thickEnd'].tolist()
            rows.append(values if strand == '+' else values[::-1])
        return pd.DataFrame(rows, index=genes.index.tolist())

    self.raw = {}
    with futures.ThreadPoolExecutor(max_workers=n_workers) as executor:
        jobs = {}
        for name, path in zip(self.names, self.paths):
            job = executor.submit(worker, atlas, window, path, genes, col,
                                  method, overlap, type)
            jobs[job] = name
        for job in futures.as_completed(jobs):
            table = job.result().dropna()
            self.raw[jobs[job]] = table
            if self.genes is None:
                self.genes = table.index.tolist()