Esempio n. 1
0
def _nearest(scdf, ocdf, suffix="_b", how=None, overlap=True, **kwargs):
    """Join rows of ``scdf`` with rows of ``ocdf`` picked by the nearest helpers.

    Parameters
    ----------
    scdf, ocdf : pandas.DataFrame
        Frames with at least ``Start`` and ``End`` columns. The result is
        expected to contain a ``"Chromosome" + suffix`` column after the
        join, which is dropped before returning.
    suffix : str
        Suffix given to columns of ``ocdf`` that clash with ``scdf``.
    how : {"next", "previous", None}
        ``"next"`` uses only ``_next_nonoverlapping``, ``"previous"`` only
        ``_previous_nonoverlapping``; any other value runs both and lets
        ``nearest_nonoverlapping`` combine them.
    overlap : bool
        When true, ``_overlapping_for_nearest`` is called first and returns
        a pair: an already-resolved frame and the frame that still has to
        be searched (presumably overlapping vs. non-overlapping rows --
        confirm against that helper).
    **kwargs
        Forwarded unchanged to ``_overlapping_for_nearest``.

    Returns
    -------
    pandas.DataFrame
    """
    if overlap:
        resolved, queries = _overlapping_for_nearest(scdf, ocdf, suffix,
                                                     **kwargs)
    else:
        queries = scdf

    queries = sort_one_by_one(queries, "Start", "End")
    ocdf = sort_one_by_one(ocdf, "Start", "End")
    # Positional 0..n-1 labels; the matched rows of ocdf get the same labels
    # further down so the final join lines up row by row.
    queries.index = pd.Index(range(len(queries)))

    use_previous = how != "next"
    use_next = how != "previous"

    if use_previous:
        prev_idx, prev_dist = _previous_nonoverlapping(queries.Start,
                                                       ocdf.End)
    if use_next:
        next_idx, next_dist = _next_nonoverlapping(queries.End, ocdf.Start,
                                                   ocdf.index.values)

    if use_previous and use_next:
        r_idx, dist = nearest_nonoverlapping(prev_idx, prev_dist, next_idx,
                                             next_dist)
    elif use_next:
        r_idx, dist = next_idx, next_dist
    else:
        r_idx, dist = prev_idx, prev_dist

    # fill_value=-1 instead of np.nan, so ints are not promoted to float
    matched = ocdf.reindex(r_idx, fill_value=-1)
    matched.index = queries.index

    distance = pd.Series(dist, index=matched.index).fillna(-1).astype(int)
    matched.insert(len(matched.columns), "Distance", distance)

    # Rows whose selected index is -1 are removed from the query side.
    picks = pd.Series(r_idx, index=matched.index)
    unmatched_labels = picks[picks == -1].index
    queries = queries.drop(unmatched_labels)

    df = queries.join(matched, rsuffix=suffix)

    if overlap and not resolved.empty:
        df = resolved if df.empty else pd.concat([resolved, df])

    return df.drop("Chromosome" + suffix, axis=1)
Esempio n. 2
0
def _nearest(self, other, strandedness, suffix="_b", how=None, overlap=True):
    """Join each row of ``self.df`` with a row of ``other.df`` per group.

    Both frames are grouped by ``Chromosome`` (and by ``Strand`` as well when
    ``self.stranded`` and ``strandedness`` are both truthy). For every group
    of ``self`` that has a counterpart in ``other``, the nearest helpers pick
    one row index of the other group per row, and the two are joined.

    Parameters
    ----------
    self, other
        Objects exposing a ``df`` DataFrame; ``self`` also exposes
        ``stranded``.
    strandedness
        Falsy for no strand handling; ``"opposite"`` pairs ``+`` groups with
        ``-`` groups and vice versa; any other truthy value pairs equal
        strands.
    suffix : str
        Suffix given to columns of ``other`` that clash with ``self``.
    how : {"next", "previous", None}
        Restricts the search to ``_next_nonoverlapping`` or
        ``_previous_nonoverlapping``; any other value runs both and combines
        them with ``nearest_nonoverlapping``.
    overlap : bool
        When true, ``_overlapping_for_nearest`` is called first; it returns
        an already-resolved frame plus the frame still to be searched.

    Returns
    -------
    pandas.DataFrame
        Empty, with columns ``Chromosome Start End Strand``, when no group
        produced a result and there is nothing from the overlap step.
    """
    if overlap:
        nearest_df, df_to_find_nearest_in = _overlapping_for_nearest(
            self, other, strandedness, suffix)
    else:
        df_to_find_nearest_in = self.df

    other_strand = {"+": "-", "-": "+"}

    if self.stranded and strandedness:  # chromosome and strand
        grpby_key = "Chromosome Strand".split()
    else:
        grpby_key = "Chromosome"

    other_dfs = {k: d for k, d in other.df.groupby(grpby_key)}

    dfs = []

    for key, scdf in df_to_find_nearest_in.groupby(grpby_key):

        # Only a (chromosome, strand) tuple carries a strand to flip. A
        # length check would also match two-character chromosome names such
        # as "10" when grouping by chromosome alone.
        if isinstance(key, tuple) and strandedness == "opposite":
            chromosome, strand = key
            other_key = chromosome, other_strand[strand]
        else:
            other_key = key

        if other_key not in other_dfs:
            continue

        ocdf = other_dfs[other_key]

        # Positional 0..n-1 labels so the join below lines up row by row.
        scdf = scdf.reset_index(drop=True)

        if how == "next":
            r_idx, dist = _next_nonoverlapping(scdf.End, ocdf.Start,
                                               ocdf.index.values)
        elif how == "previous":
            r_idx, dist = _previous_nonoverlapping(scdf.Start, ocdf.End,
                                                   ocdf.index.values)
        else:
            previous_r_idx, previous_dist = _previous_nonoverlapping(
                scdf.Start, ocdf.End, ocdf.index.values)

            next_r_idx, next_dist = _next_nonoverlapping(
                scdf.End, ocdf.Start, ocdf.index.values)

            r_idx, dist = nearest_nonoverlapping(previous_r_idx,
                                                 previous_dist,
                                                 next_r_idx, next_dist)

        # fill_value=-1 instead of np.nan, so ints are not promoted to float
        ocdf = ocdf.reindex(r_idx, fill_value=-1)

        ocdf.index = scdf.index
        distance = pd.Series(dist, index=ocdf.index).fillna(-1).astype(int)
        ocdf.insert(ocdf.shape[1], "Distance", distance)
        ocdf = ocdf.drop("Chromosome", axis=1)

        # Rows whose selected index is -1 are removed from the query side.
        r_idx = pd.Series(r_idx, index=ocdf.index)
        scdf = scdf.drop(r_idx.loc[r_idx == -1].index)

        dfs.append(scdf.join(ocdf, rsuffix=suffix))

    if dfs:
        df = pd.concat(dfs)
    else:
        df = pd.DataFrame(columns="Chromosome Start End Strand".split())

    if overlap and not nearest_df.empty:
        df = nearest_df if df.empty else pd.concat([nearest_df, df])

    return df
Esempio n. 3
0
def _nearest(scdf, ocdf, kwargs):
    """Join rows of ``scdf`` with rows of ``ocdf`` picked by the nearest helpers.

    Parameters
    ----------
    scdf, ocdf : pandas.DataFrame
        Frames with at least ``Start`` and ``End`` columns; ``scdf`` must
        also have ``Strand`` when ``how`` is ``"upstream"``/``"downstream"``.
    kwargs : dict
        Must contain ``"overlap"``, ``"how"`` and ``"suffix"``.
        ``how`` may be ``"next"``, ``"previous"``, ``"upstream"``,
        ``"downstream"``; any other value searches both directions.

    Returns
    -------
    pandas.DataFrame or None
        ``None`` when either input is empty or when no result frame could be
        built; otherwise the joined frame without the
        ``"Chromosome" + suffix`` column.
    """
    if scdf.empty or ocdf.empty:
        return None

    overlap = kwargs["overlap"]
    how = kwargs["how"]
    suffix = kwargs["suffix"]

    # Translate strand-relative directions into coordinate directions using
    # the strand of the first row.
    if how == "upstream":
        strand = scdf.Strand.iloc[0]
        how = {"+": "previous", "-": "next"}[strand]
    elif how == "downstream":
        strand = scdf.Strand.iloc[0]
        how = {"+": "next", "-": "previous"}[strand]

    ocdf = ocdf.reset_index(drop=True)

    if overlap:
        nearest_df, df_to_find_nearest_in = _overlapping_for_nearest(
            scdf, ocdf, suffix)
    else:
        df_to_find_nearest_in = scdf

    # Bound up front so the combination step below never has to probe
    # locals() and can never hit an unbound name.
    df = None

    if not df_to_find_nearest_in.empty:
        df_to_find_nearest_in = sort_one_by_one(df_to_find_nearest_in, "Start",
                                                "End")
        ocdf = sort_one_by_one(ocdf, "Start", "End")
        # Positional 0..n-1 labels so the join below lines up row by row.
        df_to_find_nearest_in.index = pd.Index(
            range(len(df_to_find_nearest_in)))

        if how == "next":
            r_idx, dist = _next_nonoverlapping(df_to_find_nearest_in.End,
                                               ocdf.Start, ocdf.index.values)
        elif how == "previous":
            r_idx, dist = _previous_nonoverlapping(df_to_find_nearest_in.Start,
                                                   ocdf.End)
        else:
            previous_r_idx, previous_dist = _previous_nonoverlapping(
                df_to_find_nearest_in.Start, ocdf.End)

            next_r_idx, next_dist = _next_nonoverlapping(
                df_to_find_nearest_in.End, ocdf.Start, ocdf.index.values)

            r_idx, dist = nearest_nonoverlapping(previous_r_idx, previous_dist,
                                                 next_r_idx, next_dist)

        ocdf = ocdf.reindex(r_idx)

        ocdf.index = df_to_find_nearest_in.index

        ocdf = _insert_distance(ocdf, dist, suffix)

        # Rows whose selected index is -1 are removed from the query side.
        r_idx = pd.Series(r_idx, index=ocdf.index)
        df_to_find_nearest_in = df_to_find_nearest_in.drop(
            r_idx.loc[r_idx == -1].index)

        df = df_to_find_nearest_in.join(ocdf, rsuffix=suffix)

    if overlap and not nearest_df.empty:
        if df is not None and not df.empty:
            df = pd.concat([nearest_df, df], sort=False)
        else:
            df = nearest_df

    if df is None:
        # Nothing was searched and the overlap step contributed nothing.
        return None

    return df.drop("Chromosome" + suffix, axis=1)