# Example 1 (score: 0)
 def get_data(data: pd.Dataframe) -> pd.Dataframe:
     with zipfile.ZipFile(countpath) as z:
         data["text"] = data.apply(
             lambda row: [
                 s.strip().decode("utf-8").split("\t")
                 for s in z.open(row.path, "r").readlines()
             ],
             axis=1,
         )
     data = data.explode("text")
     data[["word", "count"]] = data["text"].tolist()
     data.drop(columns=["text", "path"], inplace=True)
     data["count"] = data["count"].astype(int)
     if filter is not None:
         data = data[data["word"].map(filter)]
     return data
# Example 2 (score: 0)
def _adjust_tstamp_drift_of_triplet(df: pd.Dataframe) -> List[pd.DataFrame]:
    """Return list of pandas DataFrames where timestamp offsets has been adjusted.

    Sorts dataframe based on timestamp, finds triplets where timestamp is equal +-2, and
    adjusts any timestamps +-2 from 2nd timestamp to be equal to 2nd timestamp. Returns
    a list of all valid triplets.

    Args:
        df: pd.DataFrame where columns "timestamp" and "millisecond" are used to adjust.

    Returns:
        Returns list of pd.DataFrame where timestamps offset +-2 from middle timestamp
        is adjusted. For example:

        | timestamp  | millisecond | frequency | tagID | tagData |
        | 1556555369 |     995     |     69    |   12  |   3.5   |
        | 1556555370 |     005     |     69    |   12  |   3.5   |
        | 1556555371 |     010     |     69    |   12  |   3.5   |

        becomes -->

        | timestamp  | millisecond | frequency | tagID | tagData |
        | 1556555370 |     995     |     69    |   12  |   3.5   |
        | 1556555370 |     005     |     69    |   12  |   3.5   |
        | 1556555370 |     010     |     69    |   12  |   3.5   |
    """
    ts_drift_threshold = 2
    ms_1km = 0.667

    # Sort dataframe by timestamps in case some timestamps are in the wrong order
    df = df.sort_values("timestamp")
    df = df.reset_index(drop=True)

    # Extract timestamps and find all triplets within dataframe
    ts = df["timestamp"]
    last_indices = ts.index[ts.diff(periods=2) <= ts_drift_threshold]
    all_indices = last_indices.append([last_indices - 1,
                                       last_indices - 2]).sort_values()

    # Mask out all detections that aren't triplets
    mask_values = [i for i in range(len(last_indices)) for _ in range(3)]
    df.loc[all_indices, "mask"] = mask_values
    df = df[df["mask"].notnull()]
    if df.empty:
        return []

    # Adjust timestamps that have drifted
    # | if 2nd timestamp in triplet is much larger than the 1st, add 2nd index to list
    # | if 3rd timestamp in triplet is much larger than the 2nd, add 2nd index to list
    df["drift"] = df.apply(lambda x: x["timestamp"] + x["millisecond"] / 1000,
                           axis=1)
    drift = df["drift"].diff()
    drift_3rd = drift[last_indices].where(abs(drift[last_indices]) >= ms_1km)
    drift_1st = drift[last_indices -
                      1].where(abs(drift[last_indices - 1]) >= ms_1km)
    drift_indices = drift_3rd.dropna(
    ).index - 1  # -1 to get index of 2nd timestamp
    drift_indices = drift_indices.append(drift_1st.dropna().index)

    # Set timestamp 1 and 3 of each triplet with drift has equal to 2nd timestamp
    df.loc[drift_indices - 1, "timestamp"] = ts[drift_indices].values
    df.loc[drift_indices + 1, "timestamp"] = ts[drift_indices].values

    # get and return triplets as list of dataframes
    triplets = [
        v.drop(["mask", "drift"], axis=1) for _, v in df.groupby("mask")
    ]
    # triplets = [v.drop(["mask"], axis=1) for _, v in df.groupby("mask")]
    return triplets