def fill_data_missing_ts(data, tolerance=20):
    """Fill gaps in a GPS time series with synthetic rows.

    When two consecutive points are ``tolerance`` seconds or more apart,
    one synthetic row is inserted every TEN_SECONDS in between. Each
    synthetic row copies the values of the row that *ends* the gap, with
    ``db_key`` cleared so synthetic entries remain distinguishable.

    :param data: DataFrame with at least "local_time" and "db_key" columns.
    :param tolerance: minimum gap in seconds that triggers filling.
    :return: a new DataFrame sorted by "local_time" with gaps filled.
    """
    data = local_time(data)
    # Collect synthetic rows in a list and concatenate once at the end:
    # DataFrame.append was removed in pandas 2.x and per-row appends are
    # quadratic anyway.
    new_rows = []
    last_timestamp = data["local_time"].iloc[0]
    # BUG FIX: the original iterated data[1:len(data) - 1], which silently
    # skipped the last row, so a gap just before the final point was never
    # filled.
    for _, current_row in data.iloc[1:].iterrows():
        current_timestamp = current_row["local_time"]
        if current_timestamp - last_timestamp >= tolerance:
            # Number of TEN_SECONDS steps that fit strictly inside the gap.
            n_missing = math.trunc(
                (current_timestamp - last_timestamp) / TEN_SECONDS) - 1
            for n in range(n_missing):
                new_row = current_row.copy()
                new_row["local_time"] = last_timestamp + (n + 1) * TEN_SECONDS
                new_row["db_key"] = None  # mark the row as synthetic
                new_rows.append(new_row)
        last_timestamp = current_timestamp
    if new_rows:
        data = pd.concat([data, pd.DataFrame(new_rows)], ignore_index=True)
    return data.sort_values(by="local_time")
def load_user_gps_csv(userid, from_day_n=None, to_day_n=None, fill=False):
    """Load one user's GPS CSV, optionally restricted to a day window.

    :param userid: user id; resolved to outputs/user_gps/<userid>_gps.csv.
    :param from_day_n: first day (offset in days from the earliest point)
        to include; ``None`` means start from the earliest point.
    :param to_day_n: number of days to keep after the start; ``None`` means
        keep everything up to the latest point.
    :param fill: when True, fill timestamp gaps via fill_data_missing_ts.
    :return: DataFrame sorted by "local_time" (empty if the CSV is empty).
    """
    try:
        user_data = pd.read_csv("outputs/user_gps/" + str(userid) + '_gps.csv')
    except pd.errors.EmptyDataError:
        # Empty file on disk: behave as "no data" rather than crashing.
        return pd.DataFrame()
    user_data = local_time(user_data)
    if len(user_data) > 0:
        user_data = user_data.drop_duplicates().sort_values(by="local_time")
        min_time = user_data["local_time"].min()
        if from_day_n is None:
            use_data_from_time = min_time
        else:
            use_data_from_time = min_time + DAY_SECONDS * from_day_n
        if to_day_n is None:
            use_data_to_time = user_data["local_time"].max()
        else:
            # Window length is measured from the chosen start, not from min_time.
            use_data_to_time = use_data_from_time + to_day_n * DAY_SECONDS
        user_data = user_data[(user_data["local_time"] >= use_data_from_time)
                              & (user_data["local_time"] <= use_data_to_time)]
        if fill:
            # BUG FIX: the original accepted `fill` but ignored it
            # (`if fill: pass`); delegate to the module's gap filler.
            user_data = fill_data_missing_ts(user_data)
    return user_data
def places(place_label_visit_data, user_gps_data):
    """Match GPS points to labeled place visits by local_time interval.

    NOTE(review): a second, docstring-bearing definition of ``places``
    appears later in this file and shadows this one — consider removing
    one of the two.

    :param place_label_visit_data: visits with time_start/tz_start and
        time_end/tz_end columns.
    :param user_gps_data: GPS points with a "local_time" column.
    :return: GPS rows falling inside any visit's [start, end] window.
    """
    place_label_visit_data = time_utils.local_time(
        place_label_visit_data, time_col="time_start", tz_col="tz_start")
    place_label_visit_data = time_utils.local_time(
        place_label_visit_data, time_col="time_end", tz_col="tz_end")
    # BUG FIX: DataFrame.append was removed in pandas 2.x and appending in
    # a loop is quadratic; collect the per-visit slices and concat once.
    visit_slices = []
    for _, row in place_label_visit_data.iterrows():
        in_window = ((user_gps_data["local_time"] >= row["local_time_start"])
                     & (user_gps_data["local_time"] <= row["local_time_end"]))
        visit_slices.append(user_gps_data[in_window])
    if not visit_slices:
        # Preserve the original behavior: no visits -> empty DataFrame.
        return pd.DataFrame()
    return pd.concat(visit_slices)
def load_user_gps_time_window(userid, from_local_time, to_local_time):
    """Load one user's GPS points restricted to a local-time interval.

    :param userid: user id, also stamped into a "userid" column.
    :param from_local_time: inclusive lower bound on "local_time".
    :param to_local_time: inclusive upper bound on "local_time".
    :return: DataFrame with the standard GPS columns, sorted by "local_time".
    """
    gps = load_user_gps_csv(userid)
    gps["userid"] = [userid] * len(gps)
    gps = local_time(gps)
    keep_cols = ["userid", "latitude", "longitude", "tz", "time", "local_time"]
    gps = gps[keep_cols].sort_values("local_time")
    in_window = ((gps["local_time"] >= from_local_time)
                 & (gps["local_time"] <= to_local_time))
    return gps[in_window]
def places(place_label_visit_data, user_gps_data):
    """Match the time of the informed places the user has been with GPS
    points, based on local_time.

    NOTE(review): this duplicates an earlier ``places`` definition in the
    same file (this later one wins at import time) — consider removing
    the duplicate.

    :param place_label_visit_data: visits with time_start/tz_start and
        time_end/tz_end columns.
    :param user_gps_data: GPS points with a "local_time" column.
    :return: GPS rows falling inside any visit's [start, end] window.
    """
    place_label_visit_data = time_utils.local_time(
        place_label_visit_data, time_col="time_start", tz_col="tz_start")
    place_label_visit_data = time_utils.local_time(
        place_label_visit_data, time_col="time_end", tz_col="tz_end")
    # BUG FIX: DataFrame.append was removed in pandas 2.x and appending in
    # a loop is quadratic; gather slices and concatenate once.
    visit_slices = []
    for _, row in place_label_visit_data.iterrows():
        in_window = ((user_gps_data["local_time"] >= row["local_time_start"])
                     & (user_gps_data["local_time"] <= row["local_time_end"]))
        visit_slices.append(user_gps_data[in_window])
    if not visit_slices:
        # Preserve the original behavior: no visits -> empty DataFrame.
        return pd.DataFrame()
    return pd.concat(visit_slices)
def load_users_gps_data(userids, cols=None):
    """Load and concatenate GPS data for several users.

    :param userids: iterable of user ids, each loaded via
        load_user_gps_data.
    :param cols: columns to keep, or the string "*" to keep all columns.
        Defaults to the standard GPS column set.
    :return: one DataFrame sorted by "local_time".
    """
    # BUG FIX: the original used a mutable list as the default argument
    # (shared across calls — a caller mutating it would corrupt later
    # calls); use a None sentinel instead.
    if cols is None:
        cols = ["userid", "latitude", "longitude", "tz", "time",
                "local_time", "horizontal_accuracy", "horizontal_dop",
                "speed"]
    # BUG FIX: DataFrame.append was removed in pandas 2.x and per-user
    # appends are quadratic; build all frames, then concat once.
    frames = [load_user_gps_data(userid) for userid in userids]
    df = pd.concat(frames) if frames else pd.DataFrame()
    df = time_utils.local_time(df)
    if cols != "*":
        df = df[cols]
    df = df.sort_values("local_time")
    return df
def load_user_gps_csv_by_timestamp_interval(userid, from_ts=None, to_ts=None,
                                            fill=False):
    """Load one user's GPS CSV restricted to a local_time interval.

    :param userid: user id; resolved to outputs/user_gps/<userid>_gps.csv.
    :param from_ts: inclusive lower bound on "local_time"; ``None`` means
        the earliest available point.
    :param to_ts: inclusive upper bound on "local_time"; ``None`` means
        the latest available point.
    :param fill: when True, fill timestamp gaps via fill_data_missing_ts.
    :return: DataFrame sorted by "local_time" (empty if the CSV is empty).
    """
    try:
        user_data = pd.read_csv("outputs/user_gps/" + str(userid) + '_gps.csv')
    except pd.errors.EmptyDataError:
        # Empty file on disk: behave as "no data" rather than crashing.
        return pd.DataFrame()
    user_data = local_time(user_data)
    if len(user_data) > 0:
        user_data = user_data.drop_duplicates().sort_values(by="local_time")
        if from_ts is None:
            from_ts = user_data["local_time"].min()
        if to_ts is None:
            to_ts = user_data["local_time"].max()
        user_data = user_data[(user_data["local_time"] >= from_ts)
                              & (user_data["local_time"] <= to_ts)]
        if fill:
            # BUG FIX: the original accepted `fill` but ignored it
            # (`if fill: pass`); delegate to the module's gap filler.
            user_data = fill_data_missing_ts(user_data)
    return user_data
# NOTE(review): this fragment is the body of a per-user loop whose header
# is outside this view (the `continue` statements target that loop), and
# the `except` matching the `try:` below also lies outside this view.
user_clusters_dir = "outputs/stop_regions/" + str(userid)
if os.path.exists(user_clusters_dir):
    # Skip users whose stop regions were already computed by a prior run.
    print("User data already processed")
    print()
    continue
try:
    print("LOADING USER DATA")
    user_data = load_user_gps_csv(userid)
    if len(user_data) == 0:
        # CSV existed but held no rows — nothing to cluster.
        print("Empty csv\n")
        continue
    user_data = local_time(user_data)
    if len(user_data) == 0:
        continue
    print("user_data head")
    print(user_data.head())
    print("FINDING STOP REGIONS")
    # `r` and `delta_t` are defined earlier in the script (outside this
    # view); presumably the clustering radius and time threshold — confirm.
    clusters = MovingCentroidStopRegionFinder(
        region_radius=r,
        delta_time=delta_t).find_clusters(user_data, verbose=False)
    print(len(clusters), "found")
    # Create the per-user output directory only if the parent exists and
    # the directory was not created concurrently since the check above.
    if os.path.isdir("outputs/stop_regions/"
                     ) and not os.path.exists(user_clusters_dir):
        os.mkdir(user_clusters_dir)