def prediction(LGBM, dataset_id, df, label):
    """Run the trained ``LGBM`` model on ``df`` and write a submission file.

    Args:
        LGBM: trained LightGBM wrapper exposing ``get_prediction``.
        dataset_id: dataset identifier used to fetch tweet/user ids and to
            name the output file.
        df: feature dataframe to predict on.
        label: engagement label name, used only in the output filename.
    """
    # Ids paired row-by-row with the predictions in the submission file.
    tweet_ids = Data.get_feature("raw_feature_tweet_id", dataset_id)["raw_feature_tweet_id"].array
    user_ids = Data.get_feature("raw_feature_engager_id", dataset_id)["raw_feature_engager_id"].array

    # LGBM Prediction
    t0 = time.time()
    preds = LGBM.get_prediction(df.to_numpy())
    print(f"Prediction time: {time.time() - t0} seconds")

    # Uncomment to plot feature importance at the end of training
    # LGBM.plot_fimportance()

    create_submission_file(tweet_ids, user_ids, preds,
                           f"{dataset_id}_{label}_lgbm_blending_submission_2.csv")
def main():
    """End-to-end LGBM blending pipeline for a single engagement label.

    Reads the target label from the command line, builds blending features
    (LGBM base models, XGB fold-ensembling features, NN predictions) over the
    stacked validation+test rows, trains a LightGBM meta-model on the enriched
    validation set, evaluates it, and writes a submission file for the test set.

    NOTE(review): this file defines ``main`` more than once; later definitions
    shadow this one at import time.
    """
    # Instantiate the parser
    parser = argparse.ArgumentParser()
    parser.add_argument('label', type=str, help='required argument: label')
    args = parser.parse_args()

    nn_labels = ["like", "reply", "retweet", "comment"]

    LABEL = args.label
    assert LABEL in ["like", "reply", "retweet", "comment"], "LABEL not valid."
    print(f"label is {LABEL}")

    # Base feature set fed to both the blending base models and the meta-model.
    features = [
        "raw_feature_creator_follower_count",
        "raw_feature_creator_following_count",
        "raw_feature_engager_follower_count",
        "raw_feature_engager_following_count",
        "raw_feature_creator_is_verified",
        "raw_feature_engager_is_verified",
        "raw_feature_engagement_creator_follows_engager",
        "tweet_feature_number_of_photo",
        "tweet_feature_number_of_video",
        "tweet_feature_number_of_gif",
        "tweet_feature_number_of_media",
        "tweet_feature_is_retweet",
        "tweet_feature_is_quote",
        "tweet_feature_is_top_level",
        "tweet_feature_number_of_hashtags",
        "tweet_feature_creation_timestamp_hour",
        "tweet_feature_creation_timestamp_week_day",
        # "tweet_feature_number_of_mentions",
        "tweet_feature_token_length",
        "tweet_feature_token_length_unique",
        "tweet_feature_text_topic_word_count_adult_content",
        "tweet_feature_text_topic_word_count_kpop",
        "tweet_feature_text_topic_word_count_covid",
        "tweet_feature_text_topic_word_count_sport",
        "number_of_engagements_with_language_like",
        "number_of_engagements_with_language_retweet",
        "number_of_engagements_with_language_reply",
        "number_of_engagements_with_language_comment",
        "number_of_engagements_with_language_negative",
        "number_of_engagements_with_language_positive",
        "number_of_engagements_ratio_like",
        "number_of_engagements_ratio_retweet",
        "number_of_engagements_ratio_reply",
        "number_of_engagements_ratio_comment",
        "number_of_engagements_ratio_negative",
        "number_of_engagements_ratio_positive",
        "number_of_engagements_between_creator_and_engager_like",
        "number_of_engagements_between_creator_and_engager_retweet",
        "number_of_engagements_between_creator_and_engager_reply",
        "number_of_engagements_between_creator_and_engager_comment",
        "number_of_engagements_between_creator_and_engager_negative",
        "number_of_engagements_between_creator_and_engager_positive",
        "creator_feature_number_of_like_engagements_received",
        "creator_feature_number_of_retweet_engagements_received",
        "creator_feature_number_of_reply_engagements_received",
        "creator_feature_number_of_comment_engagements_received",
        "creator_feature_number_of_negative_engagements_received",
        "creator_feature_number_of_positive_engagements_received",
        "creator_feature_number_of_like_engagements_given",
        "creator_feature_number_of_retweet_engagements_given",
        "creator_feature_number_of_reply_engagements_given",
        "creator_feature_number_of_comment_engagements_given",
        "creator_feature_number_of_negative_engagements_given",
        "creator_feature_number_of_positive_engagements_given",
        "engager_feature_number_of_like_engagements_received",
        "engager_feature_number_of_retweet_engagements_received",
        "engager_feature_number_of_reply_engagements_received",
        "engager_feature_number_of_comment_engagements_received",
        "engager_feature_number_of_negative_engagements_received",
        "engager_feature_number_of_positive_engagements_received",
        "number_of_engagements_like",
        "number_of_engagements_retweet",
        "number_of_engagements_reply",
        "number_of_engagements_comment",
        "number_of_engagements_negative",
        "number_of_engagements_positive",
        "engager_feature_number_of_previous_like_engagement",
        "engager_feature_number_of_previous_reply_engagement",
        "engager_feature_number_of_previous_retweet_engagement",
        "engager_feature_number_of_previous_comment_engagement",
        "engager_feature_number_of_previous_positive_engagement",
        "engager_feature_number_of_previous_negative_engagement",
        "engager_feature_number_of_previous_engagement",
        "engager_feature_number_of_previous_like_engagement_ratio_1",
        "engager_feature_number_of_previous_reply_engagement_ratio_1",
        "engager_feature_number_of_previous_retweet_engagement_ratio_1",
        "engager_feature_number_of_previous_comment_engagement_ratio_1",
        "engager_feature_number_of_previous_positive_engagement_ratio_1",
        "engager_feature_number_of_previous_negative_engagement_ratio_1",
        "engager_feature_number_of_previous_like_engagement_ratio",
        "engager_feature_number_of_previous_reply_engagement_ratio",
        "engager_feature_number_of_previous_retweet_engagement_ratio",
        "engager_feature_number_of_previous_comment_engagement_ratio",
        "engager_feature_number_of_previous_positive_engagement_ratio",
        "engager_feature_number_of_previous_negative_engagement_ratio",
        "engager_feature_number_of_previous_like_engagement_between_creator_and_engager_by_creator",
        "engager_feature_number_of_previous_reply_engagement_between_creator_and_engager_by_creator",
        "engager_feature_number_of_previous_retweet_engagement_between_creator_and_engager_by_creator",
        "engager_feature_number_of_previous_comment_engagement_between_creator_and_engager_by_creator",
        "engager_feature_number_of_previous_negative_engagement_between_creator_and_engager_by_creator",
        "engager_feature_number_of_previous_positive_engagement_between_creator_and_engager_by_creator",
        "engager_feature_number_of_previous_like_engagement_between_creator_and_engager_by_engager",
        "engager_feature_number_of_previous_reply_engagement_between_creator_and_engager_by_engager",
        "engager_feature_number_of_previous_retweet_engagement_between_creator_and_engager_by_engager",
        "engager_feature_number_of_previous_comment_engagement_between_creator_and_engager_by_engager",
        "engager_feature_number_of_previous_negative_engagement_between_creator_and_engager_by_engager",
        "engager_feature_number_of_previous_positive_engagement_between_creator_and_engager_by_engager",
        # "tweet_feature_number_of_previous_like_engagements",
        # "tweet_feature_number_of_previous_reply_engagements",
        # "tweet_feature_number_of_previous_retweet_engagements",
        # "tweet_feature_number_of_previous_comment_engagements",
        # "tweet_feature_number_of_previous_positive_engagements",
        # "tweet_feature_number_of_previous_negative_engagements",
        "creator_feature_number_of_previous_like_engagements_given",
        "creator_feature_number_of_previous_reply_engagements_given",
        "creator_feature_number_of_previous_retweet_engagements_given",
        "creator_feature_number_of_previous_comment_engagements_given",
        "creator_feature_number_of_previous_positive_engagements_given",
        "creator_feature_number_of_previous_negative_engagements_given",
        "creator_feature_number_of_previous_like_engagements_received",
        "creator_feature_number_of_previous_reply_engagements_received",
        "creator_feature_number_of_previous_retweet_engagements_received",
        "creator_feature_number_of_previous_comment_engagements_received",
        "creator_feature_number_of_previous_positive_engagements_received",
        "creator_feature_number_of_previous_negative_engagements_received",
        "engager_feature_number_of_previous_like_engagement_with_language",
        "engager_feature_number_of_previous_reply_engagement_with_language",
        "engager_feature_number_of_previous_retweet_engagement_with_language",
        "engager_feature_number_of_previous_comment_engagement_with_language",
        "engager_feature_number_of_previous_positive_engagement_with_language",
        "engager_feature_number_of_previous_negative_engagement_with_language",
        "engager_feature_knows_hashtag_positive",
        "engager_feature_knows_hashtag_negative",
        "engager_feature_knows_hashtag_like",
        "engager_feature_knows_hashtag_reply",
        "engager_feature_knows_hashtag_rt",
        "engager_feature_knows_hashtag_comment",
        "creator_and_engager_have_same_main_language",
        "is_tweet_in_creator_main_language",
        "is_tweet_in_engager_main_language",
        # "statistical_probability_main_language_of_engager_engage_tweet_language_1",
        # "statistical_probability_main_language_of_engager_engage_tweet_language_2",
        "creator_and_engager_have_same_main_grouped_language",
        "is_tweet_in_creator_main_grouped_language",
        "is_tweet_in_engager_main_grouped_language",
        # # "hashtag_similarity_fold_ensembling_positive",
        # # "link_similarity_fold_ensembling_positive",
        # # "domain_similarity_fold_ensembling_positive"
        "tweet_feature_creation_timestamp_hour_shifted",
        "tweet_feature_creation_timestamp_day_phase",
        "tweet_feature_creation_timestamp_day_phase_shifted"
    ]

    # Target column for the chosen label.
    label = [f"tweet_feature_engagement_is_{LABEL}"]

    train_dataset = "cherry_train"
    val_dataset = "cherry_val"
    test_dataset = "new_test"

    # Which base-model labels are blended for each target label.
    ensembling_list_dict = {
        'like': ['reply', 'retweet', 'comment'],
        'reply': ['reply', 'retweet', 'comment'],
        'retweet': ['reply', 'retweet', 'comment'],
        'comment': ['reply', 'retweet', 'comment'],
    }
    ensembling_list = ensembling_list_dict[LABEL]

    # Per-label hyperparameter sets for the LGBM / XGB base models.
    ensembling_lgbm_params = {}
    ensembling_xgb_params = {}
    for ens_label in ensembling_list:
        ensembling_lgbm_params[ens_label], ensembling_xgb_params[ens_label] \
            = params_by_label(ens_label)

    categorical_features_set = set([])

    # Load train data
    # loading_data_start_time = time.time()
    # df_train, df_train_label = Data.get_dataset_xgb(train_dataset, features, label)
    # print(f"Loading train data time: {loading_data_start_time - time.time()} seconds")

    # Load val data
    df_val, df_val_label = Data.get_dataset_xgb(val_dataset, features, label)

    # Load test data
    df_test = Data.get_dataset(features, test_dataset)

    # Shift the test index past the validation rows so the two frames can be
    # stacked without index collisions; `new_index` is reused further below.
    new_index = pd.Series(df_test.index).map(lambda x: x + len(df_val))
    df_test.set_index(new_index, inplace=True)

    # df to be predicted by the lgbm blending feature
    df_to_predict = pd.concat([df_val, df_test])

    # BLENDING FEATURE DECLARATION
    feature_list = []

    # NEW CODE ADDED
    # Empty placeholder frames: the real train set is loaded below only if at
    # least one blending feature is missing from the cache.
    df_train = pd.DataFrame(columns=features)
    df_train_label = pd.DataFrame(columns=label)
    need_to_load_train_set = False

    # First pass: probe the feature cache for every LGBM blending feature.
    for ens_label in ensembling_list:
        lgbm_params = ensembling_lgbm_params[ens_label]
        for lgbm_param_dict in lgbm_params:
            start_time = time.time()
            if not LGBMEnsemblingFeature(dataset_id=train_dataset,
                                         df_train=df_train,
                                         df_train_label=get_ensembling_label(ens_label, train_dataset),
                                         df_to_predict=df_to_predict,
                                         param_dict=lgbm_param_dict,
                                         categorical_features_set=categorical_features_set).has_feature():
                print(f"{ens_label} {lgbm_param_dict}")
                need_to_load_train_set = True

    if need_to_load_train_set:
        # Sampled (30%) train split used to fit the missing blending features.
        df_train, df_train_label = get_dataset_xgb_batch(total_n_split=1,
                                                         split_n=0,
                                                         dataset_id=train_dataset,
                                                         X_label=features,
                                                         Y_label=label,
                                                         sample=0.3)

    # Second pass: declare all blending features (cached ones load instantly).
    for ens_label in ensembling_list:
        lgbm_params = ensembling_lgbm_params[ens_label]
        for lgbm_param_dict in lgbm_params:
            start_time = time.time()
            feature_list.append(LGBMEnsemblingFeature(dataset_id=train_dataset,
                                                      df_train=df_train,
                                                      df_train_label=get_ensembling_label(ens_label, train_dataset),
                                                      df_to_predict=df_to_predict,
                                                      param_dict=lgbm_param_dict,
                                                      categorical_features_set=categorical_features_set))

    # NEW PARTll
    # ONLY THIS PART IS NEW
    # LOAD THIS PART FIRST
    del df_train, df_train_label

    # Materialize each blending feature over the stacked val+test rows.
    df_feature_list = [x.load_or_create() for x in feature_list]

    # Add the XGB fold-ensembling features: load val and test parts, reindex
    # the test part past the validation rows, and stack them.
    for ens_label in ensembling_list:
        start_time = time.time()
        if ens_label == "like":
            val_features_df = XGBFoldEnsemblingLike2(val_dataset).load_or_create()
            test_features_df = XGBFoldEnsemblingLike2(test_dataset).load_or_create()
        elif ens_label == "retweet":
            val_features_df = XGBFoldEnsemblingRetweet2(val_dataset).load_or_create()
            test_features_df = XGBFoldEnsemblingRetweet2(test_dataset).load_or_create()
        elif ens_label == "reply":
            val_features_df = XGBFoldEnsemblingReply2(val_dataset).load_or_create()
            test_features_df = XGBFoldEnsemblingReply2(test_dataset).load_or_create()
        elif ens_label == "comment":
            val_features_df = XGBFoldEnsemblingComment2(val_dataset).load_or_create()
            test_features_df = XGBFoldEnsemblingComment2(test_dataset).load_or_create()
        else:
            assert False, "oh oh something went wrong. label not found"
        test_features_df.set_index(new_index, inplace=True)
        xgb_feature_df = pd.concat([val_features_df, test_features_df])
        df_feature_list.append(xgb_feature_df)
        print(f"time: {time.time() - start_time}")
        del val_features_df, test_features_df

    # check dimensions
    len_val = len(df_val)
    for df_feat in df_feature_list:
        assert len(df_feat) == (len_val + len(df_test)), \
            f"Blending features are not of dimension expected, len val: {len_val} len test: {len(df_test)}\n " \
            f"obtained len: {len(df_feat)} of {df_feat.columns[0]}\n"

    # split feature dataframe in validation and testing
    df_feat_val_list = [df_feat.iloc[:len_val] for df_feat in df_feature_list]
    df_feat_test_list = [df_feat.iloc[len_val:] for df_feat in df_feature_list]

    # Neural-network predictions as additional blending features; the test
    # parts get the same index shift as df_test so the concat aligns.
    df_feat_nn_val_list = [get_nn_prediction(l, val_dataset) for l in nn_labels]
    df_feat_nn_test_list = [get_nn_prediction(l, test_dataset) for l in nn_labels]
    for df_feat_nn_test in df_feat_nn_test_list:
        new_index = pd.Series(df_feat_nn_test.index).map(lambda x: x + len(df_val))
        df_feat_nn_test.set_index(new_index, inplace=True)
    df_feat_val_list += df_feat_nn_val_list
    df_feat_test_list += df_feat_nn_test_list

    df_val_to_be_concatenated_list = [df_val] + df_feat_val_list + [df_val_label]
    df_test_to_be_concatenated_list = [df_test] + df_feat_test_list

    # creating the new validation set on which we will do meta optimization
    df_val = pd.concat(df_val_to_be_concatenated_list, axis=1)
    df_test = pd.concat(df_test_to_be_concatenated_list, axis=1)

    # now we are in full meta-model mode
    # watchout! they are unsorted now, you got to re-sort the dfs
    df_metatrain, df_metaval = train_test_split(df_val, test_size=0.3)
    df_metatrain.sort_index(inplace=True)
    df_metaval.sort_index(inplace=True)

    # split dataframe columns in train and label
    col_names_list = [df_feat.columns[0] for df_feat in df_feature_list]
    extended_features = df_test.columns
    df_metatrain_label = df_metatrain[label]
    df_metatrain = df_metatrain[extended_features]
    df_metaval_label = df_metaval[label]
    df_metaval = df_metaval[extended_features]

    # LightGBM matches columns positionally: train and test column order must
    # agree exactly.
    for i in range(len(df_metatrain.columns)):
        assert df_metatrain.columns[i] == df_test.columns[i], f'You f****d yourself. metatrain col {i}: {df_metatrain.columns[i]}' \
            f' test col {i}: {df_test.columns[i]}'

    model_name = "lightgbm_classifier"
    kind = LABEL

    # Meta-model hyperparameters.
    params = {
        'num_leaves': 544,
        'max_depth': 7,
        'lambda_l1': 50.0,
        'lambda_l2': 2.841130937148593,
        'colsample_bynode': 0.4,
        'colsample_bytree': 1.0,
        'bagging_fraction': 1.0,
        'bagging_freq': 8,
        'min_data_in_leaf': 611,
    }

    LGBM = LightGBM(
        objective='binary',
        num_threads=-1,
        num_iterations=1000,
        early_stopping_rounds=15,
        **params,
    )

    # LGBM Training
    training_start_time = time.time()
    LGBM.fit(X=df_metatrain,
             Y=df_metatrain_label,
             X_val=df_metaval,
             Y_val=df_metaval_label,
             categorical_feature=set([]))
    print(f"Training time: {time.time() - training_start_time} seconds")

    # LGBM Evaluation
    evaluation_start_time = time.time()
    prauc, rce, conf, max_pred, min_pred, avg = LGBM.evaluate(
        df_metaval.to_numpy(), df_metaval_label.to_numpy())
    print(
        "since I'm lazy I did the local test on the same test on which I did EarlyStopping"
    )
    print(f"PRAUC:\t{prauc}")
    print(f"RCE:\t{rce}")
    print(f"TN:\t{conf[0, 0]}")
    print(f"FP:\t{conf[0, 1]}")
    print(f"FN:\t{conf[1, 0]}")
    print(f"TP:\t{conf[1, 1]}")
    print(f"MAX_PRED:\t{max_pred}")
    print(f"MIN_PRED:\t{min_pred}")
    print(f"AVG:\t{avg}")
    print(f"Evaluation time: {time.time() - evaluation_start_time} seconds")

    # Ids paired row-by-row with the test predictions in the submission file.
    tweets = Data.get_feature("raw_feature_tweet_id", test_dataset)["raw_feature_tweet_id"].array
    users = Data.get_feature("raw_feature_engager_id", test_dataset)["raw_feature_engager_id"].array

    # LGBM Prediction
    prediction_start_time = time.time()
    predictions = LGBM.get_prediction(df_test.to_numpy())
    print(f"Prediction time: {time.time() - prediction_start_time} seconds")

    # Uncomment to plot feature importance at the end of training
    # LGBM.plot_fimportance()

    create_submission_file(tweets, users, predictions,
                           f"{LABEL}_lgbm_blending_submission.csv")
def main():
    """Predict all four engagement labels with a saved multi-label DistilBERT
    model and write one submission file per label.

    No command-line arguments; dataset ids and the model path are hard-coded.
    NOTE(review): this file defines ``main`` more than once; later definitions
    shadow earlier ones. Two large dead (triple-quoted) feature lists that
    previously sat at the top of this function were removed.
    """
    feature_list = [
        "raw_feature_creator_follower_count",  # 0
        "raw_feature_creator_following_count",  # 1
    ]

    print("Running on labels : like - retweet - reply - comment")

    # Base path for the per-label submission files; the label suffix is
    # appended below.
    submission_filename = "Dataset/Features/cherry_val/ensembling/nn_predictions"
    chunksize = 2048

    train_dataset = "cherry_train"
    test_dataset = "new_test"

    ffnn_params = {
        'hidden_size_1': 128,
        'hidden_size_2': 64,
        'hidden_dropout_prob_1': 0.5,
        'hidden_dropout_prob_2': 0.5
    }
    rec_params = {
        'epochs': 5,
        'weight_decay': 1e-5,
        'lr': 2e-5,
        'cap_length': 128,
        'ffnn_params': ffnn_params
    }
    saved_model_path = "./saved_models/saved_model_multi_label"

    rec = MultiDistilBertRec(**rec_params)

    # The train slice is loaded only so that _normalize_features(is_train=True)
    # can fit the normalization statistics; weights come from saved_model_path.
    train_df = get_dataset(features=feature_list, dataset_id=train_dataset)
    train_df = train_df.head(3840000)
    train_df = rec._normalize_features(train_df, is_train=True)

    ### PREDICTION
    test_df = get_dataset(features=feature_list, dataset_id=test_dataset)
    #test_df = test_df.head(2500)

    prediction_start_time = time.time()
    text_test_reader_df = get_feature_reader(
        feature_name="raw_feature_tweet_text_token",
        dataset_id=test_dataset,
        chunksize=chunksize)
    predictions = rec.get_prediction(
        df_test_features=test_df,
        df_test_tokens_reader=text_test_reader_df,
        pretrained_model_dict_path=saved_model_path)
    print(f"Prediction time: {time.time() - prediction_start_time} seconds")
    print(predictions)
    print(predictions.shape)

    # One column per label, in the model's fixed output order.
    predictions_like = predictions[:, 0]
    predictions_retweet = predictions[:, 1]
    predictions_reply = predictions[:, 2]
    predictions_comment = predictions[:, 3]
    #print(predictions_like)
    #print(predictions_like.shape)

    tweets = get_feature("raw_feature_tweet_id", test_dataset)["raw_feature_tweet_id"].array
    users = get_feature("raw_feature_engager_id", test_dataset)["raw_feature_engager_id"].array
    #tweets = tweets.head(2500).array
    #users = users.head(2500).array

    # BUG FIX: the retweet/reply/comment submissions previously all re-used
    # predictions_like; each file now gets its own label's prediction column.
    create_submission_file(tweets, users, predictions_like,
                           submission_filename + "_like.csv")
    create_submission_file(tweets, users, predictions_retweet,
                           submission_filename + "_retweet.csv")
    create_submission_file(tweets, users, predictions_reply,
                           submission_filename + "_reply.csv")
    create_submission_file(tweets, users, predictions_comment,
                           submission_filename + "_comment.csv")
def main(label_1, label_2, test_dataset, model_id):
    """Load a saved dual-head DistilBERT model and write one submission file
    for each of the two labels it predicts.

    Args:
        label_1: name of the label produced by the model's first output head.
        label_2: name of the label produced by the second output head.
        test_dataset: dataset id to predict on.
        model_id: selects which slice of the train set the saved model was
            fitted on (1 -> first slice, 2 -> second slice); also part of the
            saved-model path and the output filenames.
    """
    feature_list = [
        "raw_feature_creator_follower_count",
        "raw_feature_creator_following_count",
        "raw_feature_engager_follower_count",
        "raw_feature_engager_following_count",
        "raw_feature_creator_is_verified",
        "raw_feature_engager_is_verified",
        "raw_feature_engagement_creator_follows_engager",
        "tweet_feature_number_of_photo",
        "tweet_feature_number_of_video",
        "tweet_feature_number_of_gif",
        "tweet_feature_number_of_media",
        "tweet_feature_is_retweet",
        "tweet_feature_is_quote",
        "tweet_feature_is_top_level",
        "tweet_feature_number_of_hashtags",
        "tweet_feature_creation_timestamp_hour",
        "tweet_feature_creation_timestamp_week_day",
        "tweet_feature_token_length",
        "tweet_feature_token_length_unique",
        "tweet_feature_text_topic_word_count_adult_content",
        "tweet_feature_text_topic_word_count_kpop",
        "tweet_feature_text_topic_word_count_covid",
        "tweet_feature_text_topic_word_count_sport",
        "number_of_engagements_with_language_like",
        "number_of_engagements_with_language_retweet",
        "number_of_engagements_with_language_reply",
        "number_of_engagements_with_language_comment",
        "number_of_engagements_with_language_negative",
        "number_of_engagements_with_language_positive",
        "number_of_engagements_ratio_like",
        "number_of_engagements_ratio_retweet",
        "number_of_engagements_ratio_reply",
        "number_of_engagements_ratio_comment",
        "number_of_engagements_ratio_negative",
        "number_of_engagements_ratio_positive",
        "number_of_engagements_between_creator_and_engager_like",
        "number_of_engagements_between_creator_and_engager_retweet",
        "number_of_engagements_between_creator_and_engager_reply",
        "number_of_engagements_between_creator_and_engager_comment",
        "number_of_engagements_between_creator_and_engager_negative",
        "number_of_engagements_between_creator_and_engager_positive",
        "number_of_engagements_like",
        "number_of_engagements_retweet",
        "number_of_engagements_reply",
        "number_of_engagements_comment",
        "number_of_engagements_negative",
        "number_of_engagements_positive",
        "tweet_feature_creation_timestamp_hour_shifted",
        "tweet_feature_creation_timestamp_day_phase",
        "tweet_feature_creation_timestamp_day_phase_shifted",
        "engager_feature_number_of_previous_like_engagement_ratio",
        "engager_feature_number_of_previous_reply_engagement_ratio",
        "engager_feature_number_of_previous_retweet_engagement_ratio",
        "engager_feature_number_of_previous_comment_engagement_ratio",
        "engager_feature_number_of_previous_positive_engagement_ratio",
        "engager_feature_number_of_previous_negative_engagement_ratio",
        "adjacency_between_creator_and_engager_retweet",
        "adjacency_between_creator_and_engager_reply",
        "adjacency_between_creator_and_engager_comment",
        "adjacency_between_creator_and_engager_like",
        "adjacency_between_creator_and_engager_positive",
        "adjacency_between_creator_and_engager_negative",
        "graph_two_steps_adjacency_positive",
        "graph_two_steps_adjacency_negative",
        "graph_two_steps_adjacency_like",
        "graph_two_steps_adjacency_reply",
        "graph_two_steps_adjacency_retweet",
        "graph_two_steps_adjacency_comment",
        "graph_two_steps_positive",
        "graph_two_steps_negative",
        "graph_two_steps_like",
        "graph_two_steps_reply",
        "graph_two_steps_retweet",
        "graph_two_steps_comment"
    ]

    print(f"Model : {model_id}")
    print(f"Running on labels : {label_1} - {label_2}")

    ip = '34.242.41.76'  # NOTE(review): unused here; kept for parity with history

    submission_dir = f"Dataset/Features/{test_dataset}/ensembling"
    submission_filename = f"{submission_dir}/nn_predictions"

    # Sizing of the train slice the saved model was fitted on.
    training_chunksize = 192
    training_batches_number = 20000
    n_data_train = training_chunksize * training_batches_number
    test_chunksize = 2048

    train_dataset = "cherry_train"
    print(f"Test dataset : {test_dataset}")

    ffnn_params = {
        'hidden_size_1': 128,
        'hidden_size_2': 64,
        'hidden_dropout_prob_1': 0.5,
        'hidden_dropout_prob_2': 0.5
    }
    rec_params = {
        'epochs': 2,
        'weight_decay': 1e-5,
        'lr': 2e-5,
        'cap_length': 128,
        'ffnn_params': ffnn_params
    }
    saved_model_path = f"./saved_models/saved_model_{label_1}_{label_2}_{model_id}"

    rec = DualDistilBertRec(**rec_params)

    # Re-fit the feature normalization statistics on the same train slice the
    # saved model was trained on (model_id picks the slice).
    train_df = get_dataset(features=feature_list, dataset_id=train_dataset)
    if model_id == 1:
        train_df = train_df.head(n_data_train)
    elif model_id == 2:
        train_df = train_df.iloc[n_data_train:2 * n_data_train]
    train_df = rec._normalize_features(train_df, is_train=True)

    ### PREDICTION
    test_df = get_dataset(features=feature_list, dataset_id=test_dataset)
    #test_df = test_df.head(2500)

    t0 = time.time()
    token_reader = get_feature_reader(feature_name="raw_feature_tweet_text_token",
                                      dataset_id=test_dataset,
                                      chunksize=test_chunksize)
    predictions = rec.get_prediction(df_test_features=test_df,
                                     df_test_tokens_reader=token_reader,
                                     pretrained_model_dict_path=saved_model_path)
    print(f"Prediction time: {time.time() - t0} seconds")
    print(predictions)
    print(predictions.shape)

    # Column 0 scores label_1, column 1 scores label_2.
    first_scores = predictions[:, 0]
    second_scores = predictions[:, 1]

    tweet_ids = get_feature("raw_feature_tweet_id", test_dataset)["raw_feature_tweet_id"].array
    user_ids = get_feature("raw_feature_engager_id", test_dataset)["raw_feature_engager_id"].array
    #tweets = tweets.head(2500).array
    #users = users.head(2500).array

    # Make sure the output directory exists before writing the submissions.
    pathlib.Path(submission_dir).mkdir(parents=True, exist_ok=True)
    create_submission_file(tweet_ids, user_ids, first_scores,
                           submission_filename + f"_{label_1}_{model_id}.csv")
    create_submission_file(tweet_ids, user_ids, second_scores,
                           submission_filename + f"_{label_2}_{model_id}.csv")