def get_data(data: pd.DataFrame) -> pd.DataFrame:
    """Expand each row of ``data`` into one row per word/count pair.

    For every row, the zip member named by the row's "path" column is read
    from the archive at ``countpath``. Each line of the member is stripped,
    decoded as UTF-8 and split on tabs into a ``[word, count]`` pair.

    NOTE(review): ``countpath`` and ``filter`` are not parameters; they are
    resolved from a scope that is not visible in this chunk (presumably an
    enclosing function). If no such ``filter`` exists, the name resolves to
    the builtin ``filter``, which is never ``None`` -- confirm with the
    caller.

    Args:
        data: pd.DataFrame with a "path" column naming members of the zip
            archive. The caller's frame is left unmodified.

    Returns:
        A new pd.DataFrame with one row per line read. The "path" column is
        replaced by "word" (str) and "count" (int) columns, and rows are
        restricted to those whose word satisfies ``filter`` when it is not
        None.
    """

    def _read_rows(member_path):
        # Close each archive member as soon as its lines have been parsed.
        with archive.open(member_path, "r") as member:
            return [
                line.strip().decode("utf-8").split("\t") for line in member
            ]

    with zipfile.ZipFile(countpath) as archive:
        # assign() builds a new frame, so the caller's DataFrame does not
        # silently gain a "text" column.
        data = data.assign(text=data["path"].map(_read_rows))

    # One row per [word, count] pair, then split the pair into two columns.
    data = data.explode("text")
    data[["word", "count"]] = data["text"].tolist()
    data = data.drop(columns=["text", "path"])
    data["count"] = data["count"].astype(int)

    if filter is not None:
        data = data[data["word"].map(filter)]
    return data
def _adjust_tstamp_drift_of_triplet(df: pd.DataFrame) -> List[pd.DataFrame]:
    """Return the triplets found in ``df`` with drifted timestamps aligned.

    Sorts the rows by timestamp and finds triplets: three consecutive rows
    whose first and last timestamps differ by at most 2. Within a triplet,
    when the time gap (timestamp + millisecond / 1000) between the 1st and
    2nd row, or between the 2nd and 3rd row, is at least 0.667 in absolute
    value, the 1st and 3rd timestamps are set equal to the 2nd timestamp.
    Rows that do not belong to a triplet are dropped.

    NOTE(review): when four or more consecutive rows fall within the
    threshold the detected triplets overlap and the resulting groups may
    hold fewer than three rows -- confirm the intended handling.

    Args:
        df: pd.DataFrame whose "timestamp" and "millisecond" columns are used
            to find and adjust triplets. The caller's frame is not modified.

    Returns:
        List of pd.DataFrame, one per triplet, with the input's columns and
        adjusted timestamps. An empty list when no triplet is found.

    For example:

        | timestamp  | millisecond | frequency | tagID | tagData |
        | 1556555369 | 995         | 69        | 12    | 3.5     |
        | 1556555370 | 005         | 69        | 12    | 3.5     |
        | 1556555371 | 010         | 69        | 12    | 3.5     |

    becomes -->

        | timestamp  | millisecond | frequency | tagID | tagData |
        | 1556555370 | 995         | 69        | 12    | 3.5     |
        | 1556555370 | 005         | 69        | 12    | 3.5     |
        | 1556555370 | 010         | 69        | 12    | 3.5     |
    """
    ts_drift_threshold = 2
    # Compared against gaps expressed in seconds. The name suggests a
    # per-kilometre travel time -- TODO confirm.
    ms_1km = 0.667

    # Sort by timestamp in case some rows are in the wrong order.
    df = df.sort_values("timestamp").reset_index(drop=True)

    # A row closes a triplet when it is within the threshold of the row two
    # positions before it.
    ts = df["timestamp"]
    last_indices = ts.index[ts.diff(periods=2) <= ts_drift_threshold]
    if last_indices.empty:
        return []
    all_indices = last_indices.append(
        [last_indices - 1, last_indices - 2]
    ).sort_values()

    # Tag every triplet member with its triplet number; untagged rows go.
    mask_values = [i for i in range(len(last_indices)) for _ in range(3)]
    df.loc[all_indices, "mask"] = mask_values
    # copy() so the column assignments below work on an independent frame
    # instead of a filtered view (avoids SettingWithCopyWarning).
    df = df[df["mask"].notnull()].copy()
    if df.empty:
        return []

    # Gap to the previous row, in seconds including the millisecond part.
    df["drift"] = df["timestamp"] + df["millisecond"] / 1000
    drift = df["drift"].diff()
    gap_3rd = drift.loc[last_indices]  # 3rd row minus 2nd row
    gap_1st = drift.loc[last_indices - 1]  # 2nd row minus 1st row

    # Collect the index of the 2nd (middle) row of every drifted triplet.
    drifted_3rd = gap_3rd[gap_3rd.abs() >= ms_1km].index - 1
    drifted_1st = gap_1st[gap_1st.abs() >= ms_1km].index
    drift_indices = drifted_3rd.union(drifted_1st)

    # Set timestamps 1 and 3 of each drifted triplet to the 2nd timestamp.
    if not drift_indices.empty:
        middle_ts = ts.loc[drift_indices].values
        df.loc[drift_indices - 1, "timestamp"] = middle_ts
        df.loc[drift_indices + 1, "timestamp"] = middle_ts

    return [v.drop(columns=["mask", "drift"]) for _, v in df.groupby("mask")]