def update_estimate_posts(new_derived_posts):
    """Estimate route, date, time and post type for each derived post.

    For every row, the comma-separated ``stage_3`` tokens are passed to
    ``route_detection_2``, ``find_dates`` and ``find_times`` (the latter two
    also receive the row's ``posttime``), and the raw ``stage_3`` string is
    classified with ``predict``.  The estimates are written through
    ``batch_update`` with ``UPDATE_STATEMENT`` and returned.

    Args:
        new_derived_posts: DataFrame with at least the columns ``post_id``,
            ``stage_3`` (comma-separated string) and ``posttime``.

    Returns:
        DataFrame with columns ``post_id``, ``from_loc``, ``to_loc``,
        ``date``, ``all_dates``, ``time``, ``all_times``, ``post_type`` and
        ``posttime``.  ``time`` is converted with ``.dt.time`` only after
        the database write.
    """
    # Iterate the two needed columns directly instead of using
    # DataFrame.apply(axis=1): on an empty frame apply() does not return a
    # Series of results, which made the comprehensions below iterate over
    # column names.
    stage_3_texts = list(new_derived_posts["stage_3"])
    post_times = list(new_derived_posts["posttime"])

    # A fresh token list is built for every helper call so that a helper
    # mutating its argument cannot affect the next one.
    routes = [route_detection_2(text.split(',')) for text in stage_3_texts]
    from_locs = [','.join(route[0]) for route in routes]
    to_locs = [','.join(route[1]) for route in routes]
    print('done routes')

    dates = [
        find_dates(text.split(','), posted)
        for text, posted in zip(stage_3_texts, post_times)
    ]
    best_dates = [found[0] for found in dates]
    all_dates = [found[1] for found in dates]

    times = [
        find_times(text.split(','), posted)
        for text, posted in zip(stage_3_texts, post_times)
    ]
    best_times = [found[0] for found in times]
    all_times = [found[1] for found in times]
    print('done time/date')

    posts_type = [
        predict(dos_model, dos_vectorizer, text) for text in stage_3_texts
    ]
    print('done post_type')

    # The "post_id" and "posttime" Series supply the index; the plain lists
    # are placed positionally against it.
    new_estimate_posts = pd.DataFrame({
        "post_id": new_derived_posts["post_id"],
        "from_loc": from_locs,
        "to_loc": to_locs,
        "date": best_dates,
        "all_dates": all_dates,
        "time": best_times,
        "all_times": all_times,
        "post_type": posts_type,
        "posttime": new_derived_posts["posttime"],
    })

    # No explicit cast to object is needed before extracting rows: the
    # selection mixes string columns (from_loc/to_loc) with the others, so
    # `.values` already yields an object array of per-cell Python objects.
    update_columns = [
        'post_id', 'post_type', 'date', 'all_dates', 'time', 'all_times',
        'from_loc', 'to_loc',
    ]
    values = [
        (post_id, post_type,
         pandas_nat_to_none(best_date), format_all_date(row_dates),
         pandas_nat_to_none(best_time), format_all_time(row_times),
         from_loc, to_loc)
        for (post_id, post_type, best_date, row_dates, best_time, row_times,
             from_loc, to_loc) in new_estimate_posts[update_columns].values
    ]
    batch_update(UPDATE_STATEMENT, values, 1000, 10000)

    # Converted after the write so the database receives the unconverted
    # best-time values.
    new_estimate_posts["time"] = pd.to_datetime(
        new_estimate_posts["time"]).dt.time
    return new_estimate_posts
def _assign_group_ids(messages, known_groups, next_group_id):
    """Map each message to a group id, allocating ids for unseen messages.

    Args:
        messages: iterable of message values, in row order.
        known_groups: dict mapping message -> group id.  Updated in place
            with every id allocated here, so a message repeated later in
            ``messages`` reuses the id given to its first occurrence.
        next_group_id: first id to hand out to an unseen message.

    Returns:
        List with one group id per message, in input order.
    """
    group_ids = []
    for message in messages:
        if pd.isnull(message):
            # A null message never compares equal to anything, so every
            # such row gets a group of its own and is not remembered.
            group_id = next_group_id
            next_group_id += 1
        elif message in known_groups:
            group_id = known_groups[message]
        else:
            group_id = next_group_id
            known_groups[message] = group_id
            next_group_id += 1
        group_ids.append(group_id)
    return group_ids


def update_groups(new_posts, all_posts=False):
    """Assign a ``group_id`` to each post and persist the assignment.

    Args:
        new_posts: DataFrame with at least ``post_id`` and ``message``.
        all_posts: when true, grouping is delegated to ``group_posts`` and
            every distinct group is reported as new.  Otherwise posts are
            matched by exact ``message`` equality against the groups read
            with ``old_groups_sql()``, and unmatched messages get ids above
            ``get_max_group_id()``.

    Returns:
        Tuple ``(new_posts, new_groups)``: the posts with their
        ``group_id`` column, and one row per newly created group restricted
        to the columns ``group_id``, ``post_id`` and ``message``.
    """
    new_groups = None
    if all_posts:
        new_posts = group_posts(new_posts)
        # NOTE(review): DataFrame.astype returns a new frame, so this call
        # has never changed new_posts.  It is left untouched because
        # activating the int cast could raise on missing ids and would
        # change the scalar types handed to batch_update -- confirm the
        # intent before assigning the result.
        new_posts.astype({
            'group_id': int,
            'post_id': int,
        }, copy=False)
        new_groups = new_posts.drop_duplicates(subset="group_id")
    else:
        old_groups = pd.read_sql_query(old_groups_sql(), con=engine)

        # message -> group id, keeping the first group listed for a message.
        # A dict lookup replaces re-scanning (and re-concatenating) the
        # whole old_groups frame for every new post.
        known_groups = {}
        for message, group_id in zip(old_groups["message"],
                                     old_groups["group_id"].tolist()):
            known_groups.setdefault(message, group_id)

        # NOTE(review): assumes get_max_group_id() returns an int -- confirm
        # what it yields when no groups exist yet.
        max_group_id = get_max_group_id()
        group_ids = _assign_group_ids(
            new_posts["message"], known_groups, max_group_id + 1)

        # Assigned positionally (the Series carries new_posts' own index),
        # so repeated index labels cannot cross-assign ids.  dtype=object
        # keeps the ids as plain Python ints in the rows sent to the
        # database.
        new_posts = new_posts.assign(
            group_id=pd.Series(
                group_ids, index=new_posts.index, dtype=object))

        created = new_posts[new_posts["group_id"] > max_group_id]
        new_groups = created.drop_duplicates(subset="group_id")

    new_groups = new_groups[['group_id', 'post_id', 'message']]

    values = [
        (group_id, post_id)
        for group_id, post_id in new_posts[['group_id', 'post_id']].values
    ]
    batch_update(UPDATE_STATEMENT, values, 1000, 10000)
    return new_posts, new_groups
def update_derived_posts(new_posts):
    """Compute the derived text columns for each post and persist them.

    Each ``message`` is run through ``process`` three times (with the flag
    pairs ``(False, False)``, ``(True, False)`` and the defaults) to produce
    the comma-joined ``stage_1``/``stage_2``/``stage_3`` strings, and through
    ``clean_message`` with the mask ``"**********"``.  The result is written
    through ``batch_update`` with ``UPDATE_STATEMENT``.

    Args:
        new_posts: DataFrame with at least the columns ``post_id``,
            ``message`` and ``posttime``.

    Returns:
        DataFrame with columns ``post_id``, ``stage_1``, ``stage_2``,
        ``stage_3``, ``clean_message`` and ``posttime``.
    """
    # Only the message column is needed, so map over it directly instead of
    # building a row object per post with DataFrame.apply(axis=1).  This
    # also returns an empty Series for an empty frame.
    messages = new_posts["message"]
    stage_1 = messages.map(
        lambda text: ','.join(process(text, False, False)))
    stage_2 = messages.map(
        lambda text: ','.join(process(text, True, False)))
    stage_3 = messages.map(lambda text: ','.join(process(text)))
    cleaned = messages.map(
        lambda text: clean_message(text, "**********"))

    new_derived_posts = pd.DataFrame({
        "post_id": new_posts["post_id"],
        "stage_1": stage_1,
        "stage_2": stage_2,
        "stage_3": stage_3,
        "clean_message": cleaned,
        "posttime": new_posts["posttime"],
    })

    update_columns = [
        'post_id', 'stage_1', 'stage_2', 'stage_3', 'clean_message',
    ]
    values = [
        (post_id, first, second, third, clean_text)
        for post_id, first, second, third, clean_text
        in new_derived_posts[update_columns].values
    ]
    batch_update(UPDATE_STATEMENT, values)
    return new_derived_posts
def update_trips(new_estimate_posts, new_groups):
    """Build one trip row per (group, from_loc, to_loc) and insert them.

    The new groups are joined to their post's estimates on ``post_id``,
    rows whose ``post_type`` equals ``"o"`` are dropped, and the
    comma-separated ``from_loc`` and ``to_loc`` columns are expanded with
    ``splitDataFrameList``.  Non-empty results are written through
    ``batch_update`` with ``INSERT_STATEMENT``.

    Args:
        new_estimate_posts: DataFrame with ``post_id``, ``post_type``,
            ``date``, ``time``, ``from_loc`` and ``to_loc`` columns.
        new_groups: DataFrame with ``post_id`` and ``group_id`` columns.

    Returns:
        The expanded trips DataFrame (possibly empty).  Neither argument is
        modified.
    """
    # set_index returns new frames, so the caller's DataFrames keep their
    # original index.  The estimates keep their post_id column (drop=False)
    # so it is still present in the joined result.
    groups_by_post = new_groups.set_index("post_id")
    estimates_by_post = new_estimate_posts.set_index("post_id", drop=False)

    new_trips = groups_by_post.join(estimates_by_post)
    new_trips = new_trips[new_trips["post_type"] != "o"]
    new_trips = splitDataFrameList(
        splitDataFrameList(new_trips, "from_loc", ","), "to_loc", ",")

    if len(new_trips) != 0:
        insert_columns = [
            'group_id', 'post_type', 'date', 'time', 'from_loc', 'to_loc',
        ]
        values = [
            (group_id, post_type, pandas_nat_to_none(best_date),
             pandas_nat_to_none(best_time), from_loc, to_loc)
            for group_id, post_type, best_date, best_time, from_loc, to_loc
            in new_trips[insert_columns].values
        ]
        batch_update(INSERT_STATEMENT, values, 1000, 10000)
    return new_trips