def _create_query_dmr_ds(reptile, dmr_regions_bed_df):
    """Build the "query-dmr" RegionDS from a BED dataframe and save it.

    Parameters
    ----------
    reptile
        Object providing ``chrom_size_path`` and ``output_path``; both are
        forwarded to ``RegionDS.from_bed``.
    dmr_regions_bed_df
        BED-like table of DMR regions, passed to ``RegionDS.from_bed`` as is.

    Returns
    -------
    None
    """
    # Construct and persist in one chained expression; the dataset object
    # is not needed afterwards.
    RegionDS.from_bed(
        dmr_regions_bed_df,
        chrom_size_path=reptile.chrom_size_path,
        location=reptile.output_path,
        region_dim="query-dmr",
    ).save()
    return None
def _create_query_region_ds(reptile):
    """Build the "query-region" RegionDS from genome windows and save it.

    Windows are generated with ``pybedtools.BedTool().makewindows`` using
    ``reptile.chrom_size_path`` (``g``), ``reptile.step_size`` (``s``) and
    ``reptile.window_size`` (``w``). They are written to a temporary
    ``query_region.bed`` inside ``reptile.output_path``, loaded through
    ``RegionDS.from_bed``, and the temporary file is then removed.

    Parameters
    ----------
    reptile
        Object providing ``chrom_size_path``, ``step_size``, ``window_size``
        and ``output_path``.

    Returns
    -------
    None
    """
    import os

    bed_path = f"{reptile.output_path}/query_region.bed"
    try:
        pybedtools.BedTool().makewindows(
            g=reptile.chrom_size_path, s=reptile.step_size, w=reptile.window_size
        ).saveas(bed_path)
        query_region_ds = RegionDS.from_bed(
            bed_path,
            chrom_size_path=reptile.chrom_size_path,
            location=reptile.output_path,
            region_dim="query-region",
        )
    finally:
        # Delete the temporary BED file without going through a shell, so
        # paths with spaces or shell metacharacters are handled safely, and
        # do it even when window generation or loading fails.
        try:
            os.remove(bed_path)
        except FileNotFoundError:
            # Same tolerance as ``rm -f``: a missing file is not an error.
            pass
    # As before, the dataset is saved only after the temporary file is gone.
    query_region_ds.save()
    return
def _create_train_dmr_ds(reptile, train_regions_bed, train_label):
    """Build the "train-dmr" RegionDS with per-DMR labels and save it.

    DMRs from ``reptile.dmr_regions`` are mapped onto the training regions.
    Each DMR collects the label of every training region it is reported
    under, and only DMRs whose collected labels all agree are kept.

    Parameters
    ----------
    reptile
        Object providing ``dmr_regions``, ``chrom_size_path`` and
        ``output_path``.
    train_regions_bed
        BedTool-like object of training regions; its ``map`` method is
        called against the sorted DMR BedTool.
    train_label
        Mapping (indexed with ``[]``) from training-region name to label.

    Returns
    -------
    The dataframe of *all* sorted DMR regions (not only the training subset).
    """
    # Every DMR, sorted according to the chrom size file.
    all_dmr_bed = pybedtools.BedTool(reptile.dmr_regions).sort(
        g=reptile.chrom_size_path
    )
    dmr_regions_bed_df = all_dmr_bed.to_dataframe()

    # One row per training region; the last column holds the collapsed
    # (comma-joined) values of DMR column 4 mapped to that region.
    overlap_df = train_regions_bed.map(
        all_dmr_bed, c=4, o="collapse"
    ).to_dataframe()

    labels_by_dmr = defaultdict(list)
    for _, record in overlap_df.iterrows():
        region_name = record.iloc[-2]
        dmr_field = record.iloc[-1]
        # "." rows carry no DMR names (presumably bedtools' null value
        # for regions without any overlapping DMR).
        if dmr_field != ".":
            for dmr_name in dmr_field.split(","):
                labels_by_dmr[dmr_name].append(train_label[region_name])

    # A DMR may receive several labels; drop those with conflicting ones.
    consistent = {
        dmr_name: labels[0]
        for dmr_name, labels in labels_by_dmr.items()
        if len(labels) == 1 or len(set(labels)) == 1
    }
    dmr_label = pd.Series(consistent)
    dmr_label.index.name = "train-dmr"

    # Select the labelled DMRs by name, then move the name column from the
    # front to the fourth position.
    dmr_by_name = dmr_regions_bed_df.set_index("name")
    labelled_dmr = dmr_by_name.loc[dmr_label.index].reset_index()
    train_dmr_regions_bed_df = labelled_dmr.iloc[:, [1, 2, 3, 0]]

    # Training DMR RegionDS with the labels attached as a coordinate.
    train_dmr_ds = RegionDS.from_bed(
        train_dmr_regions_bed_df,
        chrom_size_path=reptile.chrom_size_path,
        location=reptile.output_path,
        region_dim="train-dmr",
    )
    train_dmr_ds.coords["train-dmr_label"] = dmr_label
    train_dmr_ds.save()
    return dmr_regions_bed_df
def _create_train_region_ds(reptile):
    """Build the "train-region" RegionDS with region labels and save it.

    Parameters
    ----------
    reptile
        Object providing ``train_regions``, ``train_region_labels``,
        ``chrom_size_path`` and ``output_path``.

    Returns
    -------
    tuple
        ``(train_regions_bed, train_label)``: the sorted training-region
        BedTool and the labels read from ``reptile.train_region_labels``.
    """
    # Training regions, sorted according to the chrom size file.
    train_regions_bed = pybedtools.BedTool(reptile.train_regions).sort(
        g=reptile.chrom_size_path
    )

    # Training region labels: tab-separated, first column used as index.
    # ``read_csv(..., squeeze=True)`` was removed in pandas 2.0; squeezing
    # the parsed frame along the columns axis is the documented equivalent
    # (a single data column becomes a Series) and works on old pandas too.
    train_label = pd.read_csv(
        reptile.train_region_labels, sep="\t", index_col=0
    ).squeeze("columns")
    train_label.index.name = "train-region"

    train_regions_bed_df = train_regions_bed.to_dataframe()

    # Training RegionDS with the labels attached as a coordinate.
    train_region_ds = RegionDS.from_bed(
        train_regions_bed_df,
        chrom_size_path=reptile.chrom_size_path,
        location=reptile.output_path,
        region_dim="train-region",
    )
    train_region_ds.coords["train-region_label"] = train_label
    train_region_ds.save()
    return train_regions_bed, train_label