Exemple #1
0
def _create_query_dmr_ds(reptile, dmr_regions_bed_df):
    """Build the "query-dmr" RegionDS from a DMR BED table and save it.

    Parameters
    ----------
    reptile
        Object providing ``chrom_size_path`` and ``output_path``.
    dmr_regions_bed_df
        BED-like table of DMR regions, passed straight to
        ``RegionDS.from_bed``.

    Returns
    -------
    None
    """
    build_kwargs = {
        "chrom_size_path": reptile.chrom_size_path,
        "location": reptile.output_path,
        "region_dim": "query-dmr",
    }
    RegionDS.from_bed(dmr_regions_bed_df, **build_kwargs).save()
Exemple #2
0
def _create_query_region_ds(reptile):
    """Create and save the "query-region" RegionDS of genome windows.

    Windows are generated with ``makewindows`` (via pybedtools) from
    ``reptile.chrom_size_path`` using ``reptile.window_size`` and
    ``reptile.step_size``, written to an intermediate
    ``query_region.bed`` inside ``reptile.output_path``, loaded with
    ``RegionDS.from_bed`` and finally saved. The intermediate BED file is
    removed before the dataset is saved.

    Parameters
    ----------
    reptile
        Object providing ``chrom_size_path``, ``step_size``,
        ``window_size`` and ``output_path``.

    Returns
    -------
    None
    """
    import os

    bed_path = f"{reptile.output_path}/query_region.bed"

    pybedtools.BedTool().makewindows(
        g=reptile.chrom_size_path, s=reptile.step_size, w=reptile.window_size
    ).saveas(bed_path)

    query_region_ds = RegionDS.from_bed(
        bed_path,
        chrom_size_path=reptile.chrom_size_path,
        location=reptile.output_path,
        region_dim="query-region",
    )

    # Delete the intermediate file directly instead of interpolating the
    # path into an `rm -f` shell command: a path containing spaces or shell
    # metacharacters would be mis-parsed (or executed) by the shell.
    try:
        os.remove(bed_path)
    except OSError:
        # Cleanup stays best-effort, as with the previous unchecked `rm -f`:
        # a leftover temporary file must not abort dataset creation.
        pass
    query_region_ds.save()
    return
Exemple #3
0
def _create_train_dmr_ds(reptile, train_regions_bed, train_label):
    """Build and save the "train-dmr" RegionDS with per-DMR labels.

    Each training region is mapped onto the DMRs it overlaps, and every
    overlapped DMR collects the labels of those training regions. Only DMRs
    whose collected labels all agree are kept; they are stored as a RegionDS
    with a ``train-dmr_label`` coordinate.

    Parameters
    ----------
    reptile
        Object providing ``dmr_regions``, ``chrom_size_path`` and
        ``output_path``.
    train_regions_bed
        BedTool of training regions; its ``map`` method is used against the
        sorted DMR BedTool.
    train_label
        Label lookup indexed by training-region name.

    Returns
    -------
    The dataframe of all (sorted) DMR regions, not only the training subset.
    """
    # every DMR, sorted according to the chrom size file
    all_dmr_bed = pybedtools.BedTool(reptile.dmr_regions).sort(
        g=reptile.chrom_size_path
    )
    dmr_regions_bed_df = all_dmr_bed.to_dataframe()

    # for each train region, collapse column 4 of the overlapping DMRs
    overlap_df = train_regions_bed.map(
        all_dmr_bed, c=4, o="collapse"
    ).to_dataframe()

    # gather the labels of all train regions touching each DMR
    labels_by_dmr = defaultdict(list)
    for _row_id, record in overlap_df.iterrows():
        *_ignored, region_name, overlapping = record
        if overlapping != ".":
            # "." marks a train region without any overlapping DMR
            for dmr_name in overlapping.split(","):
                labels_by_dmr[dmr_name].append(train_label[region_name])

    # a DMR may receive several labels; keep it only if they all agree
    unambiguous = {
        dmr_name: labels[0]
        for dmr_name, labels in labels_by_dmr.items()
        if len(labels) == 1 or len(set(labels)) == 1
    }
    dmr_label = pd.Series(unambiguous)
    dmr_label.index.name = "train-dmr"

    # select the labelled DMRs and move the name column to the last position
    by_name = dmr_regions_bed_df.set_index("name")
    labelled = by_name.loc[dmr_label.index].reset_index()
    train_dmr_regions_bed_df = labelled.iloc[:, [1, 2, 3, 0]]

    # train DMR RegionDS
    train_dmr_ds = RegionDS.from_bed(
        train_dmr_regions_bed_df,
        chrom_size_path=reptile.chrom_size_path,
        location=reptile.output_path,
        region_dim="train-dmr",
    )
    train_dmr_ds.coords["train-dmr_label"] = dmr_label
    train_dmr_ds.save()

    return dmr_regions_bed_df
Exemple #4
0
def _create_train_region_ds(reptile):
    """Build and save the "train-region" RegionDS with its labels.

    The training regions are sorted according to the chrom size file, the
    region labels are read from a tab-separated file whose first column is
    used as the index, and the labels are attached to the dataset as the
    ``train-region_label`` coordinate before saving.

    Parameters
    ----------
    reptile
        Object providing ``train_regions``, ``train_region_labels``,
        ``chrom_size_path`` and ``output_path``.

    Returns
    -------
    tuple
        ``(train_regions_bed, train_label)``: the sorted BedTool of training
        regions and the labels read from ``reptile.train_region_labels``.
    """
    # train regions
    train_regions_bed = pybedtools.BedTool(reptile.train_regions).sort(
        g=reptile.chrom_size_path
    )

    # train region labels
    # `read_csv(squeeze=True)` was deprecated in pandas 1.4 and removed in
    # 2.0; squeezing the column axis afterwards is the documented equivalent
    # and also works on older pandas versions.
    train_label = pd.read_csv(
        reptile.train_region_labels, sep="\t", index_col=0
    ).squeeze("columns")
    train_label.index.name = "train-region"
    train_regions_bed_df = train_regions_bed.to_dataframe()

    # train RegionDS
    train_region_ds = RegionDS.from_bed(
        train_regions_bed_df,
        chrom_size_path=reptile.chrom_size_path,
        location=reptile.output_path,
        region_dim="train-region",
    )

    train_region_ds.coords["train-region_label"] = train_label
    train_region_ds.save()
    return train_regions_bed, train_label