Beispiel #1
0
def update_estimate_posts(new_derived_posts):
    """Estimate route, date, time and post type for each post and persist them.

    For every row, the comma-separated ``stage_3`` tokens are passed to
    ``route_detection_2``, ``find_dates`` and ``find_times`` (the latter two
    also receive ``posttime``), and the raw ``stage_3`` string is passed to
    ``predict`` together with ``dos_model`` / ``dos_vectorizer``.  The
    results are written through ``batch_update(UPDATE_STATEMENT, ...)``.

    Args:
        new_derived_posts: DataFrame with at least the columns ``post_id``,
            ``stage_3`` and ``posttime``.

    Returns:
        DataFrame with the columns ``post_id``, ``from_loc``, ``to_loc``,
        ``date``, ``all_dates``, ``time``, ``all_times``, ``post_type`` and
        ``posttime``.  ``time`` is normalised with
        ``pd.to_datetime(...).dt.time`` after the database write.  For an
        empty input an empty frame with those columns is returned and
        nothing is written.
    """
    estimate_columns = [
        "post_id", "from_loc", "to_loc", "date", "all_dates", "time",
        "all_times", "post_type", "posttime",
    ]

    if new_derived_posts.empty:
        # Row-wise ``apply`` on an empty frame does not yield one result per
        # row, so the list building below would not line up with ``post_id``.
        # There is nothing to estimate or persist anyway.
        return pd.DataFrame(columns=estimate_columns)

    locs = new_derived_posts.apply(
        lambda x: route_detection_2(x["stage_3"].split(',')), axis=1)

    # Each route result is a (from_locations, to_locations) pair; both sides
    # are stored as comma-joined strings.
    from_locs = [','.join(x[0]) for x in locs]
    to_locs = [','.join(x[1]) for x in locs]

    print('done routes')
    dates = new_derived_posts.apply(
        lambda x: find_dates(x["stage_3"].split(','), x["posttime"]), axis=1)
    best_dates = [x[0] for x in dates]
    all_dates = [x[1] for x in dates]

    times = new_derived_posts.apply(
        lambda x: find_times(x["stage_3"].split(','), x["posttime"]), axis=1)
    best_times = [x[0] for x in times]
    all_times = [x[1] for x in times]

    print('done time/date')
    posts_type = new_derived_posts.apply(
        lambda x: predict(dos_model, dos_vectorizer, x["stage_3"]), axis=1)

    print('done post_type')
    new_estimate_posts = pd.DataFrame({
        "post_id": new_derived_posts["post_id"],
        "from_loc": from_locs,
        "to_loc": to_locs,
        "date": best_dates,
        "all_dates": all_dates,
        "time": best_times,
        "all_times": all_times,
        "post_type": posts_type,
        "posttime": new_derived_posts["posttime"]
    })
    # ``DataFrame.astype`` returns a new frame instead of casting in place,
    # so the result has to be rebound for the cast to take effect.
    new_estimate_posts = new_estimate_posts.astype({
        "date": "object",
        "time": "object",
    })

    persisted_columns = [
        'post_id', 'post_type', 'date', 'all_dates', 'time', 'all_times',
        'from_loc', 'to_loc'
    ]
    # Row-local names are kept distinct from the per-column lists above.
    values = [
        (post_id, post_type, pandas_nat_to_none(best_date),
         format_all_date(row_dates), pandas_nat_to_none(best_time),
         format_all_time(row_times), from_loc, to_loc)
        for (post_id, post_type, best_date, row_dates, best_time, row_times,
             from_loc, to_loc) in new_estimate_posts[persisted_columns].values
    ]

    batch_update(UPDATE_STATEMENT, values, 1000, 10000)

    # Only after persisting: reduce the best time to a plain time-of-day.
    new_estimate_posts["time"] = pd.to_datetime(
        new_estimate_posts["time"]).dt.time

    return new_estimate_posts
Beispiel #2
0
def update_groups(new_posts, all_posts=False):
    """Assign a group id to every post and persist the post -> group mapping.

    Posts carrying the same ``message`` share one group.

    Args:
        new_posts: DataFrame with at least the columns ``post_id`` and
            ``message``.
        all_posts: When true, grouping is delegated to ``group_posts`` and
            every resulting group is reported as new.  Otherwise the posts
            are matched against the groups loaded via ``old_groups_sql()``
            and unseen messages get fresh ids above ``get_max_group_id()``.

    Returns:
        Tuple ``(new_posts, new_groups)``: ``new_posts`` with a ``group_id``
        column, and ``new_groups`` with one row per newly created group
        (columns ``group_id``, ``post_id``, ``message``).
    """
    if all_posts:
        new_posts = group_posts(new_posts)
        # ``DataFrame.astype`` returns a new frame instead of casting in
        # place, so the result has to be rebound for the cast to take effect.
        new_posts = new_posts.astype({
            'group_id': int,
            'post_id': int,
        })
        new_groups = new_posts.drop_duplicates(subset="group_id")
    else:
        old_groups = pd.read_sql_query(old_groups_sql(), con=engine)
        max_group_id = get_max_group_id()

        # First group id recorded for each known message.  Null messages are
        # left out: a pandas equality test never matches them, so each one
        # always starts its own group.
        group_of_message = {}
        for known_id, known_message in zip(old_groups["group_id"].tolist(),
                                           old_groups["message"].tolist()):
            if not pd.isnull(known_message):
                group_of_message.setdefault(known_message, known_id)

        next_group_id = max_group_id + 1
        assigned_ids = []
        for message in new_posts["message"].tolist():
            is_missing = pd.isnull(message)
            if not is_missing and message in group_of_message:
                group_id = group_of_message[message]
            else:
                group_id = next_group_id
                next_group_id += 1
                if not is_missing:
                    # Later posts with the same text join this new group.
                    group_of_message[message] = group_id
            assigned_ids.append(group_id)

        # Assign by position (one id per row) so a non-unique index cannot
        # make one write land on several rows.  Object dtype keeps the ids
        # as plain Python ints.
        new_posts = new_posts.assign(group_id=pd.Series(
            assigned_ids, index=new_posts.index, dtype=object))

        new_groups = new_posts[
            new_posts["group_id"] > max_group_id].drop_duplicates(
                subset="group_id")

    new_groups = new_groups[['group_id', 'post_id', 'message']]
    values = [
        (group_id, post_id)
        for group_id, post_id in new_posts[['group_id', 'post_id']].values
    ]

    batch_update(UPDATE_STATEMENT, values, 1000, 10000)

    return new_posts, new_groups
def update_derived_posts(new_posts):
    """Build the derived text columns for each post and persist them.

    Three comma-joined token strings are produced per post by calling
    ``process`` on ``message`` with different flag combinations, plus a
    ``clean_message`` produced by ``clean_message(message, "**********")``.
    The derived columns are written through
    ``batch_update(UPDATE_STATEMENT, ...)``.

    Args:
        new_posts: DataFrame with at least the columns ``post_id``,
            ``message`` and ``posttime``.

    Returns:
        DataFrame with the columns ``post_id``, ``stage_1``, ``stage_2``,
        ``stage_3``, ``clean_message`` and ``posttime``.
    """

    def _joined_tokens(*flags):
        # Row function that runs ``process`` with the given extra positional
        # flags and joins the resulting tokens with commas.
        def _for_row(row):
            return ','.join(process(row["message"], *flags))

        return _for_row

    # Extra positional arguments handed to ``process`` for each stage.
    stage_flags = {
        "stage_1": (False, False),
        "stage_2": (True, False),
        "stage_3": (),
    }
    stage_columns = {
        stage: new_posts.apply(_joined_tokens(*flags), axis=1)
        for stage, flags in stage_flags.items()
    }
    masked_messages = new_posts.apply(
        lambda row: clean_message(row["message"], "**********"), axis=1)

    new_derived_posts = pd.DataFrame({
        "post_id": new_posts["post_id"],
        **stage_columns,
        "clean_message": masked_messages,
        "posttime": new_posts["posttime"],
    })

    persisted_columns = [
        'post_id', 'stage_1', 'stage_2', 'stage_3', 'clean_message'
    ]
    values = [
        tuple(record)
        for record in new_derived_posts[persisted_columns].values
    ]
    batch_update(UPDATE_STATEMENT, values)

    return new_derived_posts
Beispiel #4
0
def update_trips(new_estimate_posts, new_groups):
    """Combine new groups with their estimates into trip rows and insert them.

    The two frames are joined on ``post_id``, rows whose ``post_type`` equals
    ``"o"`` are dropped, and ``from_loc`` / ``to_loc`` are expanded on ``","``
    via ``splitDataFrameList``.  Non-empty results are written through
    ``batch_update(INSERT_STATEMENT, ...)``.

    Args:
        new_estimate_posts: DataFrame with at least the columns ``post_id``,
            ``post_type``, ``date``, ``time``, ``from_loc`` and ``to_loc``.
        new_groups: DataFrame with at least the columns ``post_id`` and
            ``group_id``.

    Returns:
        DataFrame of the resulting trip rows (possibly empty).
    """
    # ``set_index`` returns new frames, so the callers' DataFrames keep their
    # own index instead of being re-indexed as a side effect.
    groups_by_post = new_groups.set_index("post_id")
    # ``drop=False`` keeps ``post_id`` as a column of the joined result.
    estimates_by_post = new_estimate_posts.set_index("post_id", drop=False)

    new_trips = groups_by_post.join(estimates_by_post)
    new_trips = new_trips[new_trips["post_type"] != "o"]
    new_trips = splitDataFrameList(
        splitDataFrameList(new_trips, "from_loc", ","), "to_loc", ",")

    if not new_trips.empty:
        trip_columns = [
            'group_id', 'post_type', 'date', 'time', 'from_loc', 'to_loc'
        ]
        values = [
            (group_id, post_type, pandas_nat_to_none(best_date),
             pandas_nat_to_none(best_time), from_loc, to_loc)
            for (group_id, post_type, best_date, best_time, from_loc,
                 to_loc) in new_trips[trip_columns].values
        ]

        batch_update(INSERT_STATEMENT, values, 1000, 10000)

    return new_trips