Example #1
def fill_data_missing_ts(data, tolerance=20):
    """
    Fills in missing rows. If the time difference between two consecutive points is at least
    the tolerance (e.g. a gap of 100 seconds with tolerance=20), it inserts one new row per
    missing 10-second slot inside the gap (a 100-second gap yields 9 filler rows). The filler
    rows copy the values of the later of the two rows, with "db_key" cleared.
    :param data: GPS DataFrame; a "local_time" column is derived via local_time().
    :param tolerance: minimum gap (in seconds) that triggers filling.
    :return: the DataFrame with the filler rows appended, sorted by "local_time".
    """
    # local_time() and TEN_SECONDS are module-level helpers/constants of the source module.
    data = local_time(data)

    last_timestamp = data["local_time"].iloc[0]

    # Iterate from the second row onwards, comparing each row against its predecessor.
    for index, current_row in data[1 : len(data) - 1].iterrows():
        current_timestamp = current_row["local_time"]

        if current_timestamp - last_timestamp >= tolerance:
            # One new entry per missing TEN_SECONDS slot inside the gap.
            for n in range(math.trunc((current_timestamp - last_timestamp) / TEN_SECONDS) - 1):
                new_entry_timestamp = last_timestamp + (n + 1) * TEN_SECONDS
                new_row = current_row.copy()
                new_row["local_time"] = new_entry_timestamp
                new_row["db_key"] = None
                data = data.append(new_row)  # DataFrame.append was removed in pandas 2.0

        last_timestamp = current_timestamp

    return data.sort_values(by="local_time")
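A minimal toy run, assuming the function above is pasted into the same module and that TEN_SECONDS is 10; the local_time stub below only stands in for the real helper (the toy frame already carries "local_time"), and it requires pandas < 2.0 because the function uses DataFrame.append:

# Hypothetical illustration only: TEN_SECONDS and local_time are stand-ins for the
# module-level constant and helper used by fill_data_missing_ts above.
import math
import pandas as pd

TEN_SECONDS = 10

def local_time(df):
    return df  # the toy frame already has a "local_time" column

gps = pd.DataFrame({
    "db_key": [1, 2, 3],
    "local_time": [0, 100, 110],   # 100-second gap between the first two points
    "latitude": [1.0, 1.0, 1.0],
    "longitude": [2.0, 2.0, 2.0],
})

filled = fill_data_missing_ts(gps, tolerance=20)
print(len(filled))  # 12: the 3 original rows plus 9 filler rows for the 100-second gap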
Example #2
def load_user_gps_csv(userid, from_day_n=None, to_day_n=None, fill=False):
    try:
        user_data = pd.read_csv("outputs/user_gps/" + str(userid) + '_gps.csv')
    except pd.errors.EmptyDataError:
        # The user's file exists but is empty: return an empty frame instead of failing.
        return pd.DataFrame()

    user_data = local_time(user_data)
    if len(user_data) > 0:
        user_data = user_data.drop_duplicates().sort_values(by="local_time")

    min_time = user_data["local_time"].min()

    # Window start: the user's first point, optionally shifted forward by from_day_n days.
    if from_day_n is None:
        use_data_from_time = min_time
    else:
        use_data_from_time = min_time + DAY_SECONDS * from_day_n

    # Window end: the user's last point, or to_day_n days after the window start.
    if to_day_n is None:
        use_data_to_time = user_data["local_time"].max()
    else:
        use_data_to_time = use_data_from_time + to_day_n * DAY_SECONDS

    user_data = user_data[(user_data["local_time"] >= use_data_from_time) & (user_data["local_time"] <= use_data_to_time)]

    if fill:
        pass  # gap filling is not implemented in this version

    return user_data
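A hedged usage sketch: it assumes DAY_SECONDS is 86400, that outputs/user_gps/42_gps.csv exists on disk, and that the user id 42 is a placeholder. With from_day_n=1 and to_day_n=2 the window starts one day after the user's first GPS point and spans the following two days:

# Hypothetical call; the user id and the on-disk CSV are assumptions for illustration.
window = load_user_gps_csv(42, from_day_n=1, to_day_n=2)
print(window["local_time"].min(), window["local_time"].max())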
Example #3
def places(place_label_visit_data, user_gps_data):
    # Derive local start and end times for each labelled visit from its time/timezone columns.
    place_label_visit_data = time_utils.local_time(place_label_visit_data,
                                                   time_col="time_start",
                                                   tz_col="tz_start")
    place_label_visit_data = time_utils.local_time(place_label_visit_data,
                                                   time_col="time_end",
                                                   tz_col="tz_end")

    user_visit_locations = pd.DataFrame()

    # Collect the GPS points whose local_time falls inside each visit interval.
    for index, row in place_label_visit_data.iterrows():
        user_visit_locations = user_visit_locations.append(user_gps_data[
            (user_gps_data["local_time"] >= row["local_time_start"])
            & (user_gps_data["local_time"] <= row["local_time_end"])])

    return user_visit_locations
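DataFrame.append was removed in pandas 2.0, so on a current pandas this loop raises AttributeError. Below is a minimal sketch of the same selection built with pd.concat; the name places_concat is mine, and it is an equivalent rewrite under the same assumptions (time_utils and pd in scope), not the original author's code:

def places_concat(place_label_visit_data, user_gps_data):
    # Same matching as above, expressed with pd.concat so it runs on pandas >= 2.0.
    place_label_visit_data = time_utils.local_time(place_label_visit_data,
                                                   time_col="time_start", tz_col="tz_start")
    place_label_visit_data = time_utils.local_time(place_label_visit_data,
                                                   time_col="time_end", tz_col="tz_end")

    chunks = [
        user_gps_data[(user_gps_data["local_time"] >= row["local_time_start"])
                      & (user_gps_data["local_time"] <= row["local_time_end"])]
        for _, row in place_label_visit_data.iterrows()
    ]
    return pd.concat(chunks) if chunks else pd.DataFrame()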
Example #4
def load_user_gps_time_window(userid, from_local_time, to_local_time):
    user_gps_data = load_user_gps_csv(userid)

    # Tag every row with the user id, recompute local_time and keep only the relevant columns.
    user_gps_data["userid"] = [userid] * len(user_gps_data)
    user_gps_data = local_time(user_gps_data)
    user_gps_data = user_gps_data[["userid", "latitude", "longitude", "tz", "time", "local_time"]].sort_values("local_time")

    # Restrict to the requested window (inclusive on both ends).
    user_gps_data = user_gps_data[(user_gps_data["local_time"] >= from_local_time) & (user_gps_data["local_time"] <= to_local_time)]
    return user_gps_data
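A hedged usage sketch, assuming local_time values are epoch seconds (as the DAY_SECONDS arithmetic elsewhere suggests); the user id and timestamps are placeholders:

# Fetch one hour of points for a hypothetical user 42 around a chosen local timestamp.
window_start = 1_600_000_000
one_hour = load_user_gps_time_window(42, window_start, window_start + 3600)
print(one_hour[["local_time", "latitude", "longitude"]].head())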
Example #5
def places(place_label_visit_data, user_gps_data):
    '''
    Returns a pandas.DataFrame with the GPS points recorded while the user was at each of the
    reported (labelled) places. The matching is based on local_time.
    :param place_label_visit_data: labelled visits with time_start/tz_start and time_end/tz_end columns.
    :param user_gps_data: GPS points with a "local_time" column.
    :return: the GPS points falling inside any visit interval.
    '''
    place_label_visit_data = time_utils.local_time(place_label_visit_data,
                                                   time_col="time_start",
                                                   tz_col="tz_start")
    place_label_visit_data = time_utils.local_time(place_label_visit_data,
                                                   time_col="time_end",
                                                   tz_col="tz_end")

    user_visit_locations = pd.DataFrame()

    for index, row in place_label_visit_data.iterrows():
        user_visit_locations = user_visit_locations.append(user_gps_data[
            (user_gps_data["local_time"] >= row["local_time_start"])
            & (user_gps_data["local_time"] <= row["local_time_end"])])

    return user_visit_locations
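The core of the function is the interval match inside the loop. A self-contained sketch of just that step on toy data (plain integer local_time values, invented purely to show the shapes involved):

import pandas as pd

gps = pd.DataFrame({"local_time": [10, 20, 30, 40], "latitude": [0.0, 0.1, 0.2, 0.3]})
visits = pd.DataFrame({"local_time_start": [15], "local_time_end": [35]})

matched = pd.concat([
    gps[(gps["local_time"] >= row["local_time_start"]) & (gps["local_time"] <= row["local_time_end"])]
    for _, row in visits.iterrows()
])
print(matched)  # the rows with local_time 20 and 30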
Example #6
def load_users_gps_data(userids,
                        cols=[
                            "userid", "latitude", "longitude", "tz", "time",
                            "local_time", "horizontal_accuracy",
                            "horizontal_dop", "speed"
                        ]):
    # Note: the mutable default argument is safe here because cols is never modified in place.
    df = pd.DataFrame()

    # Concatenate every user's GPS data into a single frame.
    for userid in userids:
        df = df.append(load_user_gps_data(userid))

    df = time_utils.local_time(df)

    # Pass cols="*" to keep every column; otherwise project onto the requested ones.
    if cols != "*":
        df = df[cols]

    df = df.sort_values("local_time")

    return df
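A hedged usage sketch; the user ids are placeholders and load_user_gps_data is assumed to return per-user frames carrying the columns listed in the default cols:

# All columns for two hypothetical users, oldest point first.
all_points = load_users_gps_data([42, 43], cols="*")

# Only the columns needed to plot a track.
track = load_users_gps_data([42], cols=["userid", "local_time", "latitude", "longitude"])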
Example #7
def load_user_gps_csv_by_timestamp_interval(userid, from_ts=None, to_ts=None, fill=False):
    try:
        user_data = pd.read_csv("outputs/user_gps/" + str(userid) + '_gps.csv')
    except pd.errors.EmptyDataError:
        # The user's file exists but is empty: return an empty frame instead of failing.
        return pd.DataFrame()

    user_data = local_time(user_data)
    if len(user_data) > 0:
        user_data = user_data.drop_duplicates().sort_values(by="local_time")

    # Default to the full span of the user's trace when no bounds are given.
    if from_ts is None:
        from_ts = user_data["local_time"].min()

    if to_ts is None:
        to_ts = user_data["local_time"].max()

    user_data = user_data[(user_data["local_time"] >= from_ts) & (user_data["local_time"] <= to_ts)]

    if fill:
        pass  # gap filling is not implemented in this version

    return user_data
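A hedged usage sketch; the user id and timestamps are placeholders, and local_time is assumed to be in epoch seconds:

# Whole trace, deduplicated and sorted by local_time.
full_trace = load_user_gps_csv_by_timestamp_interval(42)

# Only the points between two local timestamps (both ends inclusive).
one_hour = load_user_gps_csv_by_timestamp_interval(42, from_ts=1_600_000_000, to_ts=1_600_003_600)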
Example #8
    # Excerpt from inside a loop over user ids; the enclosing for statement and the
    # except clause of the try block below are not shown in this snippet.
    user_clusters_dir = "outputs/stop_regions/" + str(userid)

    # Skip users whose stop regions were already written out.
    if os.path.exists(user_clusters_dir):
        print("User data already processed")
        print()
        continue

    try:
        print("LOADING USER DATA")
        user_data = load_user_gps_csv(userid)
        if len(user_data) == 0:
            print("Empty csv\n")
            continue

        user_data = local_time(user_data)

        # Skip this user if no rows remain after local_time().
        if len(user_data) == 0:
            continue

        print("user_data head")
        print(user_data.head())
        print("FINDING STOP REGIONS")
        clusters = MovingCentroidStopRegionFinder(
            region_radius=r, delta_time=delta_t).find_clusters(user_data,
                                                               verbose=False)
        print(len(clusters), "found")

        # Create the per-user output directory only if the parent directory exists.
        if os.path.isdir("outputs/stop_regions/"
                         ) and not os.path.exists(user_clusters_dir):
            os.mkdir(user_clusters_dir)
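A hedged sketch of the scaffolding this excerpt appears to sit in. The user ids, the radius r, the dwell time delta_t, and the broad except clause are all placeholders invented for illustration; they are not the original script:

import os

# Hypothetical values; in the original script these come from upstream configuration.
userids = [42, 43]   # placeholder user ids
r = 50               # assumed stop-region radius
delta_t = 300        # assumed minimum dwell time

for userid in userids:
    user_clusters_dir = "outputs/stop_regions/" + str(userid)

    if os.path.exists(user_clusters_dir):
        print("User data already processed\n")
        continue

    try:
        # ... per-user body shown in Example #8 above ...
        pass
    except Exception as e:
        # The original error handling is not shown in the excerpt; this print is a placeholder.
        print("Failed to process user", userid, ":", e)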