def make_feature_factory_manager(split_num, model_id=None):
    logger = get_logger()
    feature_factory_dict = {}
    for column in ["user_id", "content_id"]:
        is_partial_fit = (column == "content_id" or column == "user_id")
        if type(column) == str:
            feature_factory_dict[column] = {
                "TargetEncoder": TargetEncoder(column=column, is_partial_fit=is_partial_fit)
            }
        else:
            feature_factory_dict[column] = {
                "TargetEncoder": TargetEncoder(column=list(column), is_partial_fit=is_partial_fit)
            }
    feature_factory_dict["user_id"]["ShiftDiffEncoderTimestamp"] = ShiftDiffEncoder(groupby="user_id", column="timestamp", is_partial_fit=True)
    feature_factory_dict["user_id"]["PastNTimestampEncoder"] = PastNFeatureEncoder(column="timestamp", past_ns=[5, 20], agg_funcs=["vslast"], remove_now=False)
    feature_factory_dict["user_id"]["Past1ContentTypeId"] = PastNFeatureEncoder(column="content_type_id", past_ns=[5, 15], agg_funcs=["mean"], remove_now=False)
    feature_factory_dict["user_id"]["StudyTermEncoder"] = StudyTermEncoder(is_partial_fit=True)
    feature_factory_dict["user_id"]["ElapsedTimeVsShiftDiffEncoder"] = ElapsedTimeVsShiftDiffEncoder()
    feature_factory_dict["user_id"]["CountEncoder"] = CountEncoder(column="user_id", is_partial_fit=True)
    feature_factory_dict[("user_id", "part")] = {
        "UserContentRateEncoder": UserContentRateEncoder(column=["user_id", "part"], rate_func="elo")
    }
    for column in ["user_id", "content_id", "part", ("user_id", "part")]:
        if column not in feature_factory_dict:
            feature_factory_dict[column] = {}
        if type(column) == str:
            feature_factory_dict[column][f"MeanAggregatorShiftDiffTimeElapsedTimeby{column}"] = MeanAggregator(column=column, agg_column="shiftdiff_timestamp_by_user_id_cap200k", remove_now=False)
            feature_factory_dict[column][f"MeanAggregatorStudyTimeby{column}"] = MeanAggregator(column=column, agg_column="study_time", remove_now=False)
        else:
            feature_factory_dict[column][f"MeanAggregatorShiftDiffTimeElapsedTimeby{column}"] = MeanAggregator(column=list(column), agg_column="shiftdiff_timestamp_by_user_id_cap200k", remove_now=False)
            feature_factory_dict[column][f"MeanAggregatorStudyTimeby{column}"] = MeanAggregator(column=list(column), agg_column="study_time", remove_now=False)
    feature_factory_dict["user_id"]["CategoryLevelEncoderPart"] = CategoryLevelEncoder(groupby_column="user_id", agg_column="part", categories=[2, 5])
    feature_factory_dict["user_id"]["PreviousAnswer2"] = PreviousAnswer2(groupby="user_id", column="content_id", is_debug=is_debug, model_id=model_id, n=300)
    feature_factory_dict["user_id"]["PreviousNAnsweredCorrectly"] = PreviousNAnsweredCorrectly(n=5, is_partial_fit=True)
    feature_factory_dict["previous_5_ans"] = {
        "TargetEncoder": TargetEncoder(column="previous_5_ans")
    }
    feature_factory_dict["user_id"]["QuestionLectureTableEncoder2"] = QuestionLectureTableEncoder2(model_id=model_id, is_debug=is_debug, past_n=100, min_size=100)
    feature_factory_dict["user_id"]["QuestionQuestionTableEncoder2"] = QuestionQuestionTableEncoder2(model_id=model_id, is_debug=is_debug, past_n=100, min_size=300)
    feature_factory_dict["user_id"]["UserContentRateEncoder"] = UserContentRateEncoder(column="user_id", rate_func="elo")
    feature_factory_dict["post"] = {
        "ContentIdTargetEncoderAggregator": TargetEncoderAggregator()
    }
    feature_factory_manager = FeatureFactoryManager(feature_factory_dict=feature_factory_dict,
                                                    logger=logger,
                                                    split_num=split_num,
                                                    model_id=model_id,
                                                    load_feature=not is_debug,
                                                    save_feature=not is_debug)
    return feature_factory_manager

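# --- usage sketch (not part of the original script) ---
# A minimal, hypothetical example of how the manager built above is typically driven,
# assuming the split10 train pickle used elsewhere in this repo and the module-level
# `is_debug` flag. Only functions already used in this repo (all_predict) are called.
def _example_feature_run():
    import pandas as pd
    df = pd.read_pickle("../input/riiid-test-answer-prediction/split10/train_0.pickle")
    df = df.sort_values(["user_id", "timestamp"]).reset_index(drop=True)
    manager = make_feature_factory_manager(split_num=10, model_id="train_0")
    df_feat = manager.all_predict(df)  # batch feature generation over the full history
    print(df_feat.head())
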
def main(params: dict, output_dir: str):
    import mlflow
    print("start params={}".format(params))
    model_id = "train_0"
    logger = get_logger()
    # df = pd.read_pickle("../input/riiid-test-answer-prediction/train_merged.pickle")
    df = pd.read_pickle("../input/riiid-test-answer-prediction/split10/train_0.pickle").sort_values(["user_id", "timestamp"]).reset_index(drop=True)
    if is_debug:
        df = df.head(30000)
    df["prior_question_had_explanation"] = df["prior_question_had_explanation"].fillna(-1)
    df["answered_correctly"] = df["answered_correctly"].replace(-1, np.nan)
    column_config = {
        ("content_id", "content_type_id"): {"type": "category"},
        "user_answer": {"type": "leakage_feature"},
        "answered_correctly": {"type": "leakage_feature"},
        "part": {"type": "category"},
        "prior_question_elapsed_time_bin300": {"type": "category"},
        "duration_previous_content_bin300": {"type": "category"},
        "prior_question_had_explanation": {"type": "category"},
        "rating_diff_content_user_id": {"type": "numeric"},
        "task_container_id_bin300": {"type": "category"},
        "previous_answer_index_question_id": {"type": "category"},
        "previous_answer_question_id": {"type": "category"},
        "timediff-elapsedtime_bin500": {"type": "category"},
        "duration_previous_content": {"type": "numeric"}
    }

    if not load_pickle or is_debug:
        feature_factory_dict = {"user_id": {}}
        feature_factory_dict["user_id"]["DurationPreviousContent"] = DurationPreviousContent(is_partial_fit=True)
        feature_factory_dict["user_id"]["ElapsedTimeBinningEncoder"] = ElapsedTimeBinningEncoder()
        feature_factory_dict["user_id"]["UserContentRateEncoder"] = UserContentRateEncoder(rate_func="elo", column="user_id")
        feature_factory_dict["user_id"]["PreviousAnswer2"] = PreviousAnswer2(groupby="user_id", column="question_id", is_debug=is_debug, model_id=model_id, n=300)
        feature_factory_dict["user_id"]["StudyTermEncoder2"] = StudyTermEncoder2(is_partial_fit=True)
        feature_factory_dict["user_id"]["MeanAggregatorStudyTimebyUserId"] = MeanAggregator(column="user_id", agg_column="study_time", remove_now=False)
        feature_factory_dict["user_id"]["ElapsedTimeMeanByContentIdEncoder"] = ElapsedTimeMeanByContentIdEncoder()
        feature_factory_dict["post"] = {
            "DurationFeaturePostProcess": DurationFeaturePostProcess()
        }

        feature_factory_manager = FeatureFactoryManager(feature_factory_dict=feature_factory_dict,
                                                        logger=logger,
                                                        split_num=1,
                                                        model_id=model_id,
                                                        load_feature=not is_debug,
                                                        save_feature=not is_debug)
        print("all_predict")
        df = feature_factory_manager.all_predict(df)

        def f(x):
            x = x // 1000
            if x < -100:
                return -100
            if x > 400:
                return 400
            return x

        df["task_container_id_bin300"] = [x if x < 300 else 300 for x in df["task_container_id"]]
        df["timediff-elapsedtime_bin500"] = [f(x) for x in df["timediff-elapsedtime"].values]
        df = df[["user_id", "content_id", "content_type_id", "part", "user_answer",
                 "answered_correctly", "prior_question_elapsed_time_bin300",
                 "duration_previous_content_bin300", "prior_question_had_explanation",
                 "rating_diff_content_user_id", "task_container_id_bin300",
                 "previous_answer_index_question_id", "previous_answer_question_id",
                 "row_id", "timediff-elapsedtime_bin500", "duration_previous_content"]]
        print(df.head(10))

    print("data preprocess")
    ff_for_transformer = FeatureFactoryForTransformer(column_config=column_config,
                                                      dict_path="../feature_engineering/",
                                                      sequence_length=params["max_seq"],
                                                      logger=logger)
    ff_for_transformer.make_dict(df=df)
    n_skill = len(ff_for_transformer.embbed_dict[("content_id", "content_type_id")])

    if not load_pickle or is_debug:
        df_val_row = pd.read_feather("../../riiid_takoi/notebook/fe/validation_row_id.feather").head(len(df))
        if is_debug:
            df_val_row = df_val_row.head(3000)
        df_val_row["is_val"] = 1
        df = pd.merge(df, df_val_row, how="left", on="row_id")
        df["is_val"] = df["is_val"].fillna(0)
        print(df["is_val"].value_counts())

        w_df = df[df["is_val"] == 0]
        w_df["group"] = (w_df.groupby("user_id")["user_id"].transform("count") - w_df.groupby("user_id").cumcount()) // params["max_seq"]
        w_df["user_id"] = w_df["user_id"].astype(str) + "_" + w_df["group"].astype(str)

        group = ff_for_transformer.all_predict(w_df)
        dataset_train = SAKTDataset(group, n_skill=n_skill, max_seq=params["max_seq"])
        del w_df
        gc.collect()

    ff_for_transformer = FeatureFactoryForTransformer(column_config=column_config,
                                                      dict_path="../feature_engineering/",
                                                      sequence_length=params["max_seq"],
                                                      logger=logger)
    if not load_pickle or is_debug:
        group = ff_for_transformer.all_predict(df[df["content_type_id"] == 0])
        dataset_val = SAKTDataset(group, is_test=True, n_skill=n_skill, max_seq=params["max_seq"])

    os.makedirs("../input/feature_engineering/model261", exist_ok=True)
    if not is_debug and not load_pickle:
        with open("../input/feature_engineering/model261/train.pickle", "wb") as f:
            pickle.dump(dataset_train, f)
        with open("../input/feature_engineering/model261/val.pickle", "wb") as f:
            pickle.dump(dataset_val, f)
    if not is_debug and load_pickle:
        with open("../input/feature_engineering/model261/train.pickle", "rb") as f:
            dataset_train = pickle.load(f)
        with open("../input/feature_engineering/model261/val.pickle", "rb") as f:
            dataset_val = pickle.load(f)
        print("loaded!")

    dataloader_train = DataLoader(dataset_train, batch_size=params["batch_size"], shuffle=True)
    dataloader_val = DataLoader(dataset_val, batch_size=params["batch_size"], shuffle=False)

    model = SAKTModel(n_skill, embed_dim=params["embed_dim"], max_seq=params["max_seq"],
                      dropout=dropout, cont_emb=params["cont_emb"])

    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]

    optimizer = AdamW(optimizer_grouped_parameters, lr=params["lr"], weight_decay=0.01)
    num_train_optimization_steps = int(len(dataloader_train) * 20)
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=params["num_warmup_steps"],
                                                num_training_steps=num_train_optimization_steps)
    criterion = nn.BCEWithLogitsLoss()

    model.to(device)
    criterion.to(device)

    for epoch in range(epochs):
        loss, acc, auc, auc_val = train_epoch(model, dataloader_train, dataloader_val,
                                              optimizer, criterion, scheduler, epoch, device)
        print("epoch - {} train_loss - {:.3f} auc - {:.4f} auc-val: {:.4f}".format(epoch, loss, auc, auc_val))

    preds = []
    labels = []
    with torch.no_grad():
        for item in tqdm(dataloader_val):
            label = item["label"].to(device).float()
            output = model(item, device)
            preds.extend(torch.nn.Sigmoid()(output[:, -1]).view(-1).data.cpu().numpy().tolist())
            labels.extend(label[:, -1].view(-1).data.cpu().numpy().tolist())

    auc_transformer = roc_auc_score(labels, preds)
    print("single transformer: {:.4f}".format(auc_transformer))

    df_oof = pd.DataFrame()
    # df_oof["row_id"] = df.loc[val_idx].index
    print(len(dataloader_val))
    print(len(preds))
    df_oof["predict"] = preds
    df_oof["target"] = labels
    df_oof.to_csv(f"{output_dir}/transformers1.csv", index=False)
    """
    df_oof2 = pd.read_csv("../output/ex_237/20201213110353/oof_train_0_lgbm.csv")
    df_oof2.columns = ["row_id", "predict_lgbm", "target"]
    df_oof2 = pd.merge(df_oof, df_oof2, how="inner")

    auc_lgbm = roc_auc_score(df_oof2["target"].values, df_oof2["predict_lgbm"].values)
    print("lgbm: {:.4f}".format(auc_lgbm))

    print("ensemble")
    max_auc = 0
    max_nn_ratio = 0
    for r in np.arange(0, 1.05, 0.05):
        auc = roc_auc_score(df_oof2["target"].values,
                            df_oof2["predict_lgbm"].values*(1-r) + df_oof2["predict"].values*r)
        print("[nn_ratio: {:.2f}] AUC: {:.4f}".format(r, auc))
        if max_auc < auc:
            max_auc = auc
            max_nn_ratio = r
    print(len(df_oof2))
    """

    if not is_debug:
        mlflow.start_run(experiment_id=10, run_name=os.path.basename(__file__))
        for key, value in params.items():
            mlflow.log_param(key, value)
        mlflow.log_metric("auc_val", auc_transformer)
        mlflow.end_run()

    torch.save(model.state_dict(), f"{output_dir}/transformers.pth")
    del model
    torch.cuda.empty_cache()
    with open(f"{output_dir}/transformer_param.json", "w") as f:
        json.dump(params, f)

    if is_make_feature_factory:
        # feature factory
        feature_factory_dict = {"user_id": {}}
        feature_factory_dict["user_id"]["DurationPreviousContent"] = DurationPreviousContent(is_partial_fit=True)
        feature_factory_dict["user_id"]["ElapsedTimeBinningEncoder"] = ElapsedTimeBinningEncoder()
        feature_factory_manager = FeatureFactoryManager(feature_factory_dict=feature_factory_dict,
                                                        logger=logger,
                                                        split_num=1,
                                                        model_id="all",
                                                        load_feature=not is_debug,
                                                        save_feature=not is_debug)

        ff_for_transformer = FeatureFactoryForTransformer(column_config=column_config,
                                                          dict_path="../feature_engineering/",
                                                          sequence_length=params["max_seq"],
                                                          logger=logger)
        df = pd.read_pickle("../input/riiid-test-answer-prediction/train_merged.pickle")
        if is_debug:
            df = df.head(10000)
        df = df.sort_values(["user_id", "timestamp"]).reset_index(drop=True)
        feature_factory_manager.fit(df)
        df = feature_factory_manager.all_predict(df)
        for dicts in feature_factory_manager.feature_factory_dict.values():
            for factory in dicts.values():
                factory.logger = None
        feature_factory_manager.logger = None
        with open(f"{output_dir}/feature_factory_manager.pickle", "wb") as f:
            pickle.dump(feature_factory_manager, f)

        ff_for_transformer.fit(df)
        ff_for_transformer.logger = None
        with open(f"{output_dir}/feature_factory_manager_for_transformer.pickle", "wb") as f:
            pickle.dump(ff_for_transformer, f)

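# --- illustration (not part of the original script) ---
# How the `group` trick above splits one user's history into max_seq-sized chunks:
# (transform("count") - cumcount()) counts down from N to 1 within each user, so integer
# division by max_seq assigns the oldest rows to higher group ids while the most recent
# rows stay in group 0; appending the group id to user_id then yields pseudo-users whose
# sequences never exceed max_seq. The helper below is a hypothetical, self-contained demo.
def _demo_group_split(max_seq: int = 3):
    import pandas as pd
    demo = pd.DataFrame({"user_id": [1] * 7, "timestamp": range(7)})
    demo["group"] = (demo.groupby("user_id")["user_id"].transform("count")
                     - demo.groupby("user_id").cumcount()) // max_seq
    # 7 rows with max_seq=3 get groups [2, 2, 1, 1, 1, 0, 0]; each "user_id_group"
    # pseudo-user therefore holds at most max_seq consecutive interactions.
    print(demo)
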
import glob
import time
import tqdm

output_dir = f"../output/{os.path.basename(__file__).replace('.py', '')}/{dt.now().strftime('%Y%m%d%H%M%S')}/"

for _ in tqdm.tqdm(range(60 * 60 * 5)):
    time.sleep(1)

for fname in glob.glob("../input/riiid-test-answer-prediction/split10/*"):
    print(fname)
    df = pd.read_pickle(fname).sort_values(["user_id", "timestamp"])
    # df = pd.concat([pd.read_pickle(fname).head(500), pd.read_pickle(fname).tail(500)])
    df["answered_correctly"] = df["answered_correctly"].replace(-1, np.nan)
    df["prior_question_had_explanation"] = df["prior_question_had_explanation"].fillna(-1).astype("int8")

    logger = get_logger()
    feature_factory_dict = {}
    feature_factory_dict["tags"] = {
        "TagsSeparator": TagsSeparator()
    }
    for column in ["content_id", "user_id", "part", "prior_question_had_explanation", "tags1", "tags2",
                   ("user_id", "prior_question_had_explanation"), ("user_id", "part"),
                   ("content_id", "prior_question_had_explanation")]:
        is_partial_fit = (column == "content_id" or column == "user_id")
        if type(column) == str:
            feature_factory_dict[column] = {
                "CountEncoder": CountEncoder(column=column, is_partial_fit=is_partial_fit),
                "TargetEncoder": TargetEncoder(column=column, is_partial_fit=is_partial_fit)
            }

def make_feature_factory_manager(split_num, model_id=None):
    logger = get_logger()
    feature_factory_dict = {}
    for column in ["user_id", "content_id"]:
        is_partial_fit = (column == "content_id" or column == "user_id")
        if type(column) == str:
            feature_factory_dict[column] = {
                "TargetEncoder": TargetEncoder(column=column, is_partial_fit=is_partial_fit)
            }
        else:
            feature_factory_dict[column] = {
                "TargetEncoder": TargetEncoder(column=list(column), is_partial_fit=is_partial_fit)
            }
    feature_factory_dict["user_id"]["DurationPreviousContent"] = DurationPreviousContent(is_partial_fit=True)
    feature_factory_dict["user_id"]["PastNTimestampEncoder"] = PastNFeatureEncoder(column="timestamp", past_ns=[2, 3, 4, 5, 6, 7, 8, 9, 10], agg_funcs=["vslast"], remove_now=False)
    feature_factory_dict["user_id"]["StudyTermEncoder2"] = StudyTermEncoder2(is_partial_fit=True)
    feature_factory_dict["user_id"]["ElapsedTimeMeanByContentIdEncoder"] = ElapsedTimeMeanByContentIdEncoder()
    feature_factory_dict["user_id"]["CountEncoder"] = CountEncoder(column="user_id", is_partial_fit=True)
    feature_factory_dict[("user_id", "part")] = {
        "UserContentRateEncoder": UserContentRateEncoder(column=["user_id", "part"], rate_func="elo")
    }
    for column in [("user_id", "prior_question_had_explanation"),
                   ("content_id", "prior_question_had_explanation"),
                   ("part", "prior_question_had_explanation"),
                   ("user_id", "part", "prior_question_had_explanation")]:
        if column not in feature_factory_dict:
            feature_factory_dict[column] = {}
        if type(column) == str:
            feature_factory_dict[column][f"MeanAggregatorShiftDiffTimeElapsedTimeby{column}"] = MeanAggregator(column=column, agg_column="duration_previous_content_cap100k", remove_now=False)
            feature_factory_dict[column][f"MeanAggregatorStudyTimeby{column}"] = MeanAggregator(column=column, agg_column="study_time", remove_now=False)
        else:
            feature_factory_dict[column][f"MeanAggregatorShiftDiffTimeElapsedTimeby{column}"] = MeanAggregator(column=list(column), agg_column="duration_previous_content_cap100k", remove_now=False)
            feature_factory_dict[column][f"MeanAggregatorStudyTimeby{column}"] = MeanAggregator(column=list(column), agg_column="study_time", remove_now=False)
    feature_factory_dict["user_id"]["CategoryLevelEncoderPart"] = CategoryLevelEncoder(groupby_column="user_id", agg_column="part", categories=[2, 5])
    feature_factory_dict["user_id"]["UserContentNowRateEncoder"] = UserContentNowRateEncoder(column="part", target=[1, 2, 3, 4, 5, 6, 7], rate_func="elo")
    feature_factory_dict["user_id"]["PreviousAnswer2"] = PreviousAnswer2(groupby="user_id", column="content_id", is_debug=is_debug, model_id=model_id, n=300)
    feature_factory_dict["user_id"]["PreviousNAnsweredCorrectly"] = PreviousNAnsweredCorrectly(n=5, is_partial_fit=True)
    feature_factory_dict["previous_5_ans"] = {
        "TargetEncoder": TargetEncoder(column="previous_5_ans")
    }
    feature_factory_dict["user_id"]["QuestionLectureTableEncoder2"] = QuestionLectureTableEncoder2(model_id=model_id, is_debug=is_debug, past_n=100, min_size=300)
    feature_factory_dict["user_id"]["QuestionQuestionTableEncoder2"] = QuestionQuestionTableEncoder2(model_id=model_id, is_debug=is_debug, past_n=100, min_size=300)
    feature_factory_dict["user_id"]["UserContentRateEncoder"] = UserContentRateEncoder(column="user_id", rate_func="elo")
    feature_factory_dict["user_id"]["PreviousContentAnswerTargetEncoder"] = PreviousContentAnswerTargetEncoder(min_size=300)
    feature_factory_manager = FeatureFactoryManager(feature_factory_dict=feature_factory_dict,
                                                    logger=logger,
                                                    split_num=split_num,
                                                    model_id=model_id,
                                                    load_feature=not is_debug,
                                                    save_feature=not is_debug)
    return feature_factory_manager

def make_feature_factory_manager(split_num, model_id=None):
    logger = get_logger()
    feature_factory_dict = {}
    feature_factory_dict["tags"] = {
        "TagsSeparator": TagsSeparator(is_partial_fit=True)
    }
    for column in ["user_id", "content_id", "tags1", "tags2"]:
        is_partial_fit = (column == "content_id" or column == "user_id")
        if type(column) == str:
            feature_factory_dict[column] = {
                "TargetEncoder": TargetEncoder(column=column, is_partial_fit=is_partial_fit)
            }
        else:
            feature_factory_dict[column] = {
                "TargetEncoder": TargetEncoder(column=list(column), is_partial_fit=is_partial_fit)
            }
    feature_factory_dict["user_id"]["ShiftDiffEncoderTimestamp"] = ShiftDiffEncoder(groupby="user_id", column="timestamp", is_partial_fit=True)
    feature_factory_dict["user_id"]["StudyTermEncoder"] = StudyTermEncoder(is_partial_fit=True)
    feature_factory_dict["user_id"]["ElapsedTimeVsShiftDiffEncoder"] = ElapsedTimeVsShiftDiffEncoder()
    # feature_factory_dict["user_id"]["UserLevelEncoder2ContentId"] = UserLevelEncoder2(vs_column="content_id")
    # feature_factory_dict["content_id"]["ContentLevelEncoder2UserId"] = ContentLevelEncoder(vs_column="user_id", is_partial_fit=True)
    # feature_factory_dict["user_id"]["MeanAggregatorContentLevel"] = MeanAggregator(column="user_id", agg_column="content_level_user_id", remove_now=False)
    feature_factory_dict["user_id"]["CountEncoder"] = CountEncoder(column="user_id", is_partial_fit=True)
    feature_factory_dict["user_id"]["UserCountBinningEncoder"] = UserCountBinningEncoder(is_partial_fit=True)
    feature_factory_dict["user_count_bin"] = {}
    feature_factory_dict["user_count_bin"]["TargetEncoder"] = TargetEncoder(column="user_count_bin")
    feature_factory_dict[("user_id", "user_count_bin")] = {
        "TargetEncoder": TargetEncoder(column=["user_id", "user_count_bin"])
    }
    feature_factory_dict[("content_id", "user_count_bin")] = {
        "TargetEncoder": TargetEncoder(column=["content_id", "user_count_bin"])
    }
    feature_factory_dict[("user_id", "part")] = {
        "UserContentRateEncoder": UserContentRateEncoder(column=["user_id", "part"], rate_func="elo")
    }
    for column in ["user_id", "content_id", "part", ("user_id", "part")]:
        if column not in feature_factory_dict:
            feature_factory_dict[column] = {}
        if type(column) == str:
            feature_factory_dict[column][f"MeanAggregatorPriorQuestionElapsedTimeby{column}"] = MeanAggregator(column=column, agg_column="prior_question_elapsed_time", remove_now=True)
            feature_factory_dict[column][f"MeanAggregatorShiftDiffTimeElapsedTimeby{column}"] = MeanAggregator(column=column, agg_column="shiftdiff_timestamp_by_user_id_cap200k", remove_now=True)
            feature_factory_dict[column][f"MeanAggregatorStudyTimeby{column}"] = MeanAggregator(column=column, agg_column="study_time", remove_now=True)
        else:
            feature_factory_dict[column][f"MeanAggregatorPriorQuestionElapsedTimeby{column}"] = MeanAggregator(column=list(column), agg_column="prior_question_elapsed_time", remove_now=True)
            feature_factory_dict[column][f"MeanAggregatorShiftDiffTimeElapsedTimeby{column}"] = MeanAggregator(column=list(column), agg_column="shiftdiff_timestamp_by_user_id_cap200k", remove_now=True)
            feature_factory_dict[column][f"MeanAggregatorStudyTimeby{column}"] = MeanAggregator(column=list(column), agg_column="study_time", remove_now=True)
    feature_factory_dict["user_id"]["CategoryLevelEncoderPart"] = CategoryLevelEncoder(groupby_column="user_id", agg_column="part", categories=[2, 5])
    feature_factory_dict["prior_question_elapsed_time"] = {
        "PriorQuestionElapsedTimeBinningEncoder": PriorQuestionElapsedTimeBinningEncoder(is_partial_fit=True)
    }
    feature_factory_dict[("part", "prior_question_elapsed_time_bin")] = {
        "TargetEncoder": TargetEncoder(column=["part", "prior_question_elapsed_time_bin"])
    }
    feature_factory_dict["user_id"]["PreviousAnswer2"] = PreviousAnswer2(groupby="user_id", column="content_id", is_debug=is_debug, model_id=model_id, n=500)
    feature_factory_dict["user_id"]["PreviousNAnsweredCorrectly"] = PreviousNAnsweredCorrectly(n=3, is_partial_fit=True)
    feature_factory_dict["user_id"]["Counter"] = Counter(groupby_column="user_id", agg_column="prior_question_had_explanation", categories=[0, 1])
    feature_factory_dict["previous_3_ans"] = {
        "TargetEncoder": TargetEncoder(column="previous_3_ans")
    }
    feature_factory_dict["user_id"]["QuestionLectureTableEncoder2"] = QuestionLectureTableEncoder2(model_id=model_id, is_debug=is_debug, past_n=100, min_size=100)
    feature_factory_dict["user_id"]["QuestionQuestionTableEncoder"] = QuestionQuestionTableEncoder(model_id=model_id, is_debug=is_debug, past_n=100, min_size=300)
    feature_factory_dict["user_id"]["UserContentRateEncoder"] = UserContentRateEncoder(column="user_id", rate_func="elo")
    feature_factory_dict["post"] = {
        "ContentIdTargetEncoderAggregator": TargetEncoderAggregator()
    }
    feature_factory_manager = FeatureFactoryManager(feature_factory_dict=feature_factory_dict,
                                                    logger=logger,
                                                    split_num=split_num,
                                                    model_id=model_id,
                                                    load_feature=not is_debug,
                                                    save_feature=not is_debug)
    return feature_factory_manager

def main(params: dict, output_dir: str):
    import mlflow
    print("start params={}".format(params))
    logger = get_logger()
    df = pd.read_pickle("../input/riiid-test-answer-prediction/train_merged.pickle")
    # df = pd.read_pickle("../input/riiid-test-answer-prediction/split10/train_0.pickle").sort_values(["user_id", "timestamp"]).reset_index(drop=True)
    if is_debug:
        df = df.head(30000)
    column_config = {
        ("content_id", "content_type_id"): {"type": "category"},
        "user_answer": {"type": "category"},
        "part": {"type": "category"},
        "prior_question_elapsed_time_bin300": {"type": "category"},
        "duration_previous_content_bin300": {"type": "category"}
    }

    if not load_pickle or is_debug:
        feature_factory_dict = {"user_id": {}}
        feature_factory_dict["user_id"]["DurationPreviousContent"] = DurationPreviousContent()
        feature_factory_dict["user_id"]["ElapsedTimeBinningEncoder"] = ElapsedTimeBinningEncoder()
        feature_factory_manager = FeatureFactoryManager(feature_factory_dict=feature_factory_dict,
                                                        logger=logger,
                                                        split_num=1,
                                                        model_id="all",
                                                        load_feature=not is_debug,
                                                        save_feature=not is_debug)
        print("all_predict")
        df = feature_factory_manager.all_predict(df)
        df = df[["user_id", "content_id", "content_type_id", "part", "user_answer",
                 "answered_correctly", "prior_question_elapsed_time_bin300",
                 "duration_previous_content_bin300"]]
        print(df.head(10))

    print("data preprocess")
    train_idx = []
    val_idx = []
    np.random.seed(0)
    for _, w_df in df[df["content_type_id"] == 0].groupby("user_id"):
        if np.random.random() < 0.01:
            # all val
            val_idx.extend(w_df.index.tolist())
        else:
            train_num = int(len(w_df) * 0.95)
            train_idx.extend(w_df[:train_num].index.tolist())
            val_idx.extend(w_df[train_num:].index.tolist())

    ff_for_transformer = FeatureFactoryForTransformer(column_config=column_config,
                                                      dict_path="../feature_engineering/",
                                                      sequence_length=params["max_seq"],
                                                      logger=logger)
    ff_for_transformer.make_dict(df=pd.DataFrame())
    n_skill = len(ff_for_transformer.embbed_dict[("content_id", "content_type_id")])

    if not load_pickle or is_debug:
        df["is_val"] = 0
        df["is_val"].loc[val_idx] = 1

        w_df = df[df["is_val"] == 0]
        w_df["group"] = (w_df.groupby("user_id")["user_id"].transform("count") - w_df.groupby("user_id").cumcount()) // params["max_seq"]
        w_df["user_id"] = w_df["user_id"].astype(str) + "_" + w_df["group"].astype(str)

        group = ff_for_transformer.all_predict(w_df)
        dataset_train = SAKTDataset(group, n_skill=n_skill, max_seq=params["max_seq"])
        del w_df
        gc.collect()

    ff_for_transformer = FeatureFactoryForTransformer(column_config=column_config,
                                                      dict_path="../feature_engineering/",
                                                      sequence_length=params["max_seq"],
                                                      logger=logger)
    if not load_pickle or is_debug:
        group = ff_for_transformer.all_predict(df[df["content_type_id"] == 0])
        dataset_val = SAKTDataset(group, is_test=True, n_skill=n_skill, max_seq=params["max_seq"])

    os.makedirs("../input/feature_engineering/model051", exist_ok=True)
    if not is_debug and not load_pickle:
        with open("../input/feature_engineering/model051/train.pickle", "wb") as f:
            pickle.dump(dataset_train, f)
        with open("../input/feature_engineering/model051/val.pickle", "wb") as f:
            pickle.dump(dataset_val, f)
    if not is_debug and load_pickle:
        with open("../input/feature_engineering/model051/train.pickle", "rb") as f:
            dataset_train = pickle.load(f)
        with open("../input/feature_engineering/model051/val.pickle", "rb") as f:
            dataset_val = pickle.load(f)
        print("loaded!")

    dataloader_train = DataLoader(dataset_train, batch_size=params["batch_size"], shuffle=True, num_workers=1)
    dataloader_val = DataLoader(dataset_val, batch_size=params["batch_size"], shuffle=False, num_workers=1)

    model = SAKTModel(n_skill, embed_dim=params["embed_dim"], max_seq=params["max_seq"], dropout=dropout)

    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]

    optimizer = AdamW(optimizer_grouped_parameters, lr=params["lr"], weight_decay=0.01)
    num_train_optimization_steps = int(len(dataloader_train) * epochs)
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=params["num_warmup_steps"],
                                                num_training_steps=num_train_optimization_steps)
    criterion = nn.BCEWithLogitsLoss()

    model.to(device)
    criterion.to(device)

    for epoch in range(epochs):
        loss, acc, auc, auc_val = train_epoch(model, dataloader_train, dataloader_val,
                                              optimizer, criterion, scheduler, device)
        print("epoch - {} train_loss - {:.3f} auc - {:.4f} auc-val: {:.4f}".format(epoch, loss, auc, auc_val))

    preds = []
    labels = []
    for item in tqdm(dataloader_val):
        x = item["x"].to(device).long()
        target_id = item["target_id"].to(device).long()
        part = item["part"].to(device).long()
        label = item["label"].to(device).float()
        elapsed_time = item["elapsed_time"].to(device).long()
        duration_previous_content = item["duration_previous_content"].to(device).long()
        output = model(x, target_id, part, elapsed_time, duration_previous_content)
        preds.extend(torch.nn.Sigmoid()(output[:, -1]).view(-1).data.cpu().numpy().tolist())
        labels.extend(label[:, -1].view(-1).data.cpu().numpy().tolist())

    auc_transformer = roc_auc_score(labels, preds)
    print("single transformer: {:.4f}".format(auc_transformer))

    df_oof = pd.DataFrame()
    # df_oof["row_id"] = df.loc[val_idx].index
    print(len(dataloader_val))
    print(len(preds))
    df_oof["predict"] = preds
    df_oof["target"] = labels
    df_oof.to_csv(f"{output_dir}/transformers1.csv", index=False)
    """
    df_oof2 = pd.read_csv("../output/ex_237/20201213110353/oof_train_0_lgbm.csv")
    df_oof2.columns = ["row_id", "predict_lgbm", "target"]
    df_oof2 = pd.merge(df_oof, df_oof2, how="inner")

    auc_lgbm = roc_auc_score(df_oof2["target"].values, df_oof2["predict_lgbm"].values)
    print("lgbm: {:.4f}".format(auc_lgbm))

    print("ensemble")
    max_auc = 0
    max_nn_ratio = 0
    for r in np.arange(0, 1.05, 0.05):
        auc = roc_auc_score(df_oof2["target"].values,
                            df_oof2["predict_lgbm"].values*(1-r) + df_oof2["predict"].values*r)
        print("[nn_ratio: {:.2f}] AUC: {:.4f}".format(r, auc))
        if max_auc < auc:
            max_auc = auc
            max_nn_ratio = r
    print(len(df_oof2))
    """

    if not is_debug:
        mlflow.start_run(experiment_id=10, run_name=os.path.basename(__file__))
        for key, value in params.items():
            mlflow.log_param(key, value)
        mlflow.log_metric("auc_val", auc_transformer)
        mlflow.end_run()

    torch.save(model.state_dict(), f"{output_dir}/transformers.pth")
    del model
    with open(f"{output_dir}/transformer_param.json", "w") as f:
        json.dump(params, f)

    if is_make_feature_factory:
        # feature factory
        feature_factory_dict = {"user_id": {}}
        feature_factory_dict["user_id"]["DurationPreviousContent"] = DurationPreviousContent(is_partial_fit=True)
        feature_factory_dict["user_id"]["ElapsedTimeBinningEncoder"] = ElapsedTimeBinningEncoder()
        feature_factory_manager = FeatureFactoryManager(feature_factory_dict=feature_factory_dict,
                                                        logger=logger,
                                                        split_num=1,
                                                        model_id="all",
                                                        load_feature=not is_debug,
                                                        save_feature=not is_debug)
        ff_for_transformer = FeatureFactoryForTransformer(column_config=column_config,
                                                          dict_path="../feature_engineering/",
                                                          sequence_length=params["max_seq"],
                                                          logger=logger)
        df = pd.read_pickle("../input/riiid-test-answer-prediction/train_merged.pickle")
        if is_debug:
            df = df.head(10000)
        df = df.sort_values(["user_id", "timestamp"]).reset_index(drop=True)
        feature_factory_manager.fit(df)
        df = feature_factory_manager.all_predict(df)
        for dicts in feature_factory_manager.feature_factory_dict.values():
            for factory in dicts.values():
                factory.logger = None
        feature_factory_manager.logger = None
        with open(f"{output_dir}/feature_factory_manager.pickle", "wb") as f:
            pickle.dump(feature_factory_manager, f)

        ff_for_transformer.fit(df)
        ff_for_transformer.logger = None
        with open(f"{output_dir}/feature_factory_manager_for_transformer.pickle", "wb") as f:
            pickle.dump(ff_for_transformer, f)

def make_feature_factory_manager(split_num, model_id=None):
    logger = get_logger()
    feature_factory_dict = {}
    feature_factory_dict["tags"] = {
        "TagsSeparator": TagsSeparator(is_partial_fit=True)
    }
    for column in ["user_id", "content_id", "part", "prior_question_had_explanation", "tags1", "tags2",
                   ("user_id", "prior_question_had_explanation"), ("user_id", "part"),
                   ("content_id", "prior_question_had_explanation")]:
        is_partial_fit = (column == "content_id" or column == "user_id")
        if type(column) == str:
            feature_factory_dict[column] = {
                "CountEncoder": CountEncoder(column=column, is_partial_fit=is_partial_fit),
                "TargetEncoder": TargetEncoder(column=column, is_partial_fit=is_partial_fit)
            }
        else:
            feature_factory_dict[column] = {
                "CountEncoder": CountEncoder(column=list(column), is_partial_fit=is_partial_fit),
                "TargetEncoder": TargetEncoder(column=list(column), is_partial_fit=is_partial_fit)
            }
    feature_factory_dict["user_id"]["ShiftDiffEncoderTimestamp"] = ShiftDiffEncoder(groupby="user_id", column="timestamp")
    feature_factory_dict["user_id"]["ShiftDiffEncoderContentId"] = ShiftDiffEncoder(groupby="user_id", column="content_id")
    for column in ["user_id", "content_id"]:
        feature_factory_dict[column][f"MeanAggregatorPriorQuestionElapsedTimeby{column}"] = MeanAggregator(column=column, agg_column="prior_question_elapsed_time", remove_now=True)
    feature_factory_dict["user_id"]["UserLevelEncoder2ContentId"] = UserLevelEncoder2(vs_column="content_id")
    feature_factory_dict["content_id"]["ContentLevelEncoder2UserId"] = ContentLevelEncoder(vs_column="user_id", is_partial_fit=True)
    feature_factory_dict["user_id"]["MeanAggregatorContentLevel"] = MeanAggregator(column="user_id", agg_column="content_level_user_id", remove_now=False)
    feature_factory_dict["user_id"]["UserCountBinningEncoder"] = UserCountBinningEncoder(is_partial_fit=True)
    feature_factory_dict["user_count_bin"] = {}
    feature_factory_dict["user_count_bin"]["CountEncoder"] = CountEncoder(column="user_count_bin")
    feature_factory_dict["user_count_bin"]["TargetEncoder"] = TargetEncoder(column="user_count_bin")
    feature_factory_dict[("user_id", "user_count_bin")] = {
        "CountEncoder": CountEncoder(column=["user_id", "user_count_bin"]),
        "TargetEncoder": TargetEncoder(column=["user_id", "user_count_bin"])
    }
    feature_factory_dict[("content_id", "user_count_bin")] = {
        "CountEncoder": CountEncoder(column=["content_id", "user_count_bin"]),
        "TargetEncoder": TargetEncoder(column=["content_id", "user_count_bin"])
    }
    feature_factory_dict[("prior_question_had_explanation", "user_count_bin")] = {
        "CountEncoder": CountEncoder(column=["prior_question_had_explanation", "user_count_bin"]),
        "TargetEncoder": TargetEncoder(column=["prior_question_had_explanation", "user_count_bin"])
    }
    feature_factory_dict["user_id"]["CategoryLevelEncoderPart"] = CategoryLevelEncoder(groupby_column="user_id", agg_column="part", categories=[2, 5])
    for column in ["user_id", "user_count_bin", ("user_id", "part")]:
        if type(column) == str:
            feature_factory_dict[column][f"MeanAggregatorTargetEncContentIdBy{column}"] = MeanAggregator(column=column, agg_column="target_enc_content_id", remove_now=False)
        else:
            feature_factory_dict[column][f"MeanAggregatorTargetEncContentIdBy{column}"] = MeanAggregator(column=list(column), agg_column="target_enc_content_id", remove_now=False)
    feature_factory_dict["user_count_bin"]["CategoryLevelEncoderUserCountBin"] = \
        CategoryLevelEncoder(groupby_column="user_id", agg_column="user_count_bin", categories=[0])
    feature_factory_dict["prior_question_elapsed_time"] = {
        "PriorQuestionElapsedTimeBinningEncoder": PriorQuestionElapsedTimeBinningEncoder(is_partial_fit=True)
    }
    feature_factory_dict[("part", "prior_question_elapsed_time_bin")] = {
        "CountEncoder": CountEncoder(column=["part", "prior_question_elapsed_time_bin"]),
        "TargetEncoder": TargetEncoder(column=["part", "prior_question_elapsed_time_bin"])
    }
    feature_factory_dict["user_id"]["PreviousAnswer2"] = PreviousAnswer2(groupby="user_id", column="content_id", is_debug=is_debug, model_id=model_id)
    feature_factory_manager = FeatureFactoryManager(feature_factory_dict=feature_factory_dict,
                                                    logger=logger,
                                                    split_num=split_num)
    return feature_factory_manager

def main(params: dict, output_dir: str):
    import mlflow
    print("start params={}".format(params))
    model_id = "all"
    logger = get_logger()
    column_config = {
        ("content_id", "content_type_id"): {"type": "category", "dtype": np.int16},
        "user_answer": {"type": "leakage_feature", "dtype": np.int8},
        "answered_correctly": {"type": "leakage_feature", "dtype": np.int8},
        "part": {"type": "category", "dtype": np.int8},
        "prior_question_elapsed_time_bin300": {"type": "category", "dtype": np.int16},
        "duration_previous_content_bin300": {"type": "category", "dtype": np.int16},
        "prior_question_had_explanation": {"type": "category", "dtype": np.int8},
        "rating_diff_content_user_id": {"type": "numeric", "dtype": np.float16},
        "task_container_id_bin300": {"type": "category", "dtype": np.int16},
        "previous_answer_index_content_id": {"type": "category", "dtype": np.int16},
        "previous_answer_content_id": {"type": "category", "dtype": np.int8},
        "timediff-elapsedtime_bin500": {"type": "category", "dtype": np.int16}
    }

    if is_make_feature_factory:
        # feature factory
        feature_factory_dict = {"user_id": {}}
        feature_factory_dict["user_id"]["DurationPreviousContent"] = DurationPreviousContent(is_partial_fit=True)
        feature_factory_dict["user_id"]["ElapsedTimeBinningEncoder"] = ElapsedTimeBinningEncoder()
        feature_factory_dict["user_id"]["UserContentRateEncoder"] = UserContentRateEncoder(rate_func="elo", column="user_id")
        feature_factory_dict["user_id"]["PreviousAnswer2"] = PreviousAnswer2(groupby="user_id", column="content_id", is_debug=is_debug, model_id=model_id, n=300)
        feature_factory_dict["user_id"]["StudyTermEncoder2"] = StudyTermEncoder2(is_partial_fit=True)
        feature_factory_dict["user_id"]["MeanAggregatorStudyTimebyUserId"] = MeanAggregator(column="user_id", agg_column="study_time", remove_now=False)
        feature_factory_dict["user_id"]["ElapsedTimeMeanByContentIdEncoder"] = ElapsedTimeMeanByContentIdEncoder()
        feature_factory_dict["post"] = {
            "DurationFeaturePostProcess": DurationFeaturePostProcess()
        }

        feature_factory_manager = FeatureFactoryManager(feature_factory_dict=feature_factory_dict,
                                                        logger=logger,
                                                        split_num=1,
                                                        model_id="all",
                                                        load_feature=not is_debug,
                                                        save_feature=not is_debug)
        ff_for_transformer = FeatureFactoryForTransformer(column_config=column_config,
                                                          dict_path="../feature_engineering/",
                                                          sequence_length=params["max_seq"],
                                                          logger=logger)
        df = pd.read_pickle("../input/riiid-test-answer-prediction/train_merged.pickle")
        df["prior_question_had_explanation"] = df["prior_question_had_explanation"].fillna(-1)
        df["answered_correctly"] = df["answered_correctly"].replace(-1, np.nan)
        if is_debug:
            df = df.head(10000)
        df = df.sort_values(["user_id", "timestamp"]).reset_index(drop=True)
        ff_for_transformer.make_dict(df=df)
        feature_factory_manager.fit(df)
        df = feature_factory_manager.all_predict(df)

        def f(x):
            x = x // 1000
            if x < -90:
                return -90
            if x > 90:
                return 90
            return x

        df["task_container_id_bin300"] = [x if x < 300 else 300 for x in df["task_container_id"].values]
        df["timediff-elapsedtime_bin500"] = [f(x) for x in df["timediff-elapsedtime"].values]
        df = df[["user_id", "content_id", "content_type_id", "part", "user_answer", "answered_correctly",
                 "prior_question_elapsed_time_bin300", "duration_previous_content_bin300",
                 "prior_question_had_explanation", "rating_diff_content_user_id", "task_container_id_bin300",
                 "previous_answer_index_content_id", "previous_answer_content_id", "row_id",
                 "timediff-elapsedtime_bin500"]]
        for dicts in feature_factory_manager.feature_factory_dict.values():
            for factory in dicts.values():
                factory.logger = None
        feature_factory_manager.logger = None
        with open(f"{output_dir}/feature_factory_manager.pickle", "wb") as f:
            pickle.dump(feature_factory_manager, f)

        ff_for_transformer.fit(df)
        ff_for_transformer.logger = None
        with open(f"{output_dir}/feature_factory_manager_for_transformer.pickle", "wb") as f:
            pickle.dump(ff_for_transformer, f)

def main(params: dict):
    import mlflow
    logger = get_logger()
    print("start params={}".format(params))
    # df = pd.read_pickle("../input/riiid-test-answer-prediction/train_merged.pickle").head(30_000_000)
    df = pd.read_pickle("../input/riiid-test-answer-prediction/split10/train_0.pickle").sort_values(["user_id", "timestamp"]).reset_index(drop=True)
    if is_debug:
        df = df.head(30000)
    df = df.sort_values(["user_id", "timestamp"]).reset_index(drop=True)
    df = df[["user_id", "content_id", "content_type_id", "part", "user_answer", "answered_correctly"]]

    train_idx = []
    val_idx = []
    np.random.seed(0)
    for _, w_df in df[df["content_type_id"] == 0].groupby("user_id"):
        if np.random.random() < 0.1:
            # all val
            val_idx.extend(w_df.index.tolist())
        else:
            train_num = int(len(w_df) * 0.9)
            train_idx.extend(w_df[:train_num].index.tolist())
            val_idx.extend(w_df[train_num:].index.tolist())
    df["is_val"] = 0
    df["is_val"].loc[val_idx] = 1

    w_df = df[df["is_val"] == 0]
    w_df["group"] = (w_df.groupby("user_id")["user_id"].transform("count") - w_df.groupby("user_id").cumcount()) // params["max_seq"]
    w_df["user_id"] = w_df["user_id"].astype(str) + "_" + w_df["group"].astype(str)

    ff_for_transformer = FeatureFactoryForTransformer(column_config={("content_id", "content_type_id"): {"type": "category"},
                                                                     "user_answer": {"type": "category"},
                                                                     "part": {"type": "category"}},
                                                      dict_path="../feature_engineering/",
                                                      sequence_length=params["max_seq"],
                                                      logger=logger)
    group = ff_for_transformer.all_predict(w_df)
    n_skill = len(ff_for_transformer.embbed_dict[("content_id", "content_type_id")])
    print(group)
    dataset_train = SAKTDataset(group, n_skill=n_skill, max_seq=params["max_seq"])

    ff_for_transformer = FeatureFactoryForTransformer(column_config={("content_id", "content_type_id"): {"type": "category"},
                                                                     "user_answer": {"type": "category"},
                                                                     "part": {"type": "category"}},
                                                      dict_path="../feature_engineering/",
                                                      sequence_length=params["max_seq"],
                                                      logger=logger)
    group = ff_for_transformer.all_predict(df[df["content_type_id"] == 0])
    dataset_val = SAKTDataset(group, is_test=True, n_skill=n_skill, max_seq=params["max_seq"])

    dataloader_train = DataLoader(dataset_train, batch_size=64, shuffle=True, num_workers=1)
    dataloader_val = DataLoader(dataset_val, batch_size=64, shuffle=False, num_workers=1)

    model = SAKTModel(n_skill, embed_dim=params["embed_dim"], max_seq=params["max_seq"])
    optimizer = torch.optim.Adam(model.parameters(), lr=params["lr"])
    criterion = nn.BCEWithLogitsLoss()

    model.to(device)
    criterion.to(device)

    for epoch in range(epochs):
        loss, acc, auc, auc_val = train_epoch(model, dataloader_train, dataloader_val, optimizer, criterion, device)
        print("epoch - {} train_loss - {:.3f} auc - {:.4f} auc-val: {:.4f}".format(epoch, loss, auc, auc_val))

    preds = []
    labels = []
    for d in tqdm(dataloader_val):
        x = d[0].to(device).long()
        target_id = d[1].to(device).long()
        part = d[2].to(device).long()
        label = d[3].to(device).long()
        output, atten_weight = model(x, target_id, part)
        preds.extend(torch.nn.Sigmoid()(output[:, -1]).view(-1).data.cpu().numpy().tolist())
        labels.extend(label[:, -1].view(-1).data.cpu().numpy().tolist())
    print(preds)

    df_oof = pd.DataFrame()
    df_oof["row_id"] = df.loc[val_idx].index
    df_oof["predict"] = preds
    df_oof["target"] = df.loc[val_idx]["answered_correctly"].values
    df_oof.to_csv(f"{output_dir}/transformers1.csv", index=False)

    df_oof2 = pd.read_csv("../output/ex_172/20201202080625/oof_train_0_lgbm.csv")
    df_oof2.columns = ["row_id", "predict_lgbm", "target"]
    df_oof2 = pd.merge(df_oof, df_oof2, how="inner")

    auc_transformer = roc_auc_score(df_oof2["target"].values, df_oof2["predict"].values)
    auc_lgbm = roc_auc_score(df_oof2["target"].values, df_oof2["predict_lgbm"].values)
    print("single transformer: {:.4f}".format(auc_transformer))
    print("lgbm: {:.4f}".format(auc_lgbm))

    print("ensemble")
    max_auc = 0
    max_nn_ratio = 0
    for r in np.arange(0, 1.05, 0.05):
        auc = roc_auc_score(df_oof2["target"].values,
                            df_oof2["predict_lgbm"].values * (1 - r) + df_oof2["predict"].values * r)
        print("[nn_ratio: {:.2f}] AUC: {:.4f}".format(r, auc))
        if max_auc < auc:
            max_auc = auc
            max_nn_ratio = r
    print(len(df_oof2))

    if not is_debug:
        mlflow.start_run(experiment_id=10, run_name=os.path.basename(__file__))
        mlflow.log_param("count_row", len(df))
        for key, value in params.items():
            mlflow.log_param(key, value)
        mlflow.log_metric("auc_val", auc_transformer)
        mlflow.log_metric("auc_lgbm", auc_lgbm)
        mlflow.log_metric("auc_ensemble", max_auc)
        mlflow.log_metric("ensemble_nn_ratio", max_nn_ratio)
        mlflow.end_run()

    torch.save(model.state_dict(), f"{output_dir}/transformers.pth")
    with open(f"{output_dir}/transformer_param.json", "w") as f:
        json.dump(params, f)

    if is_make_feature_factory:
        ff_for_transformer = FeatureFactoryForTransformer(column_config={("content_id", "content_type_id"): {"type": "category"},
                                                                         "part": {"type": "category"}},
                                                          dict_path="../feature_engineering/",
                                                          sequence_length=params["max_seq"],
                                                          logger=logger)
        df = pd.read_pickle("../input/riiid-test-answer-prediction/train_merged.pickle")
        if is_debug:
            df = df.head(10000)
        df = df.sort_values(["user_id", "timestamp"]).reset_index(drop=True)
        ff_for_transformer.fit(df)
        ff_for_transformer.logger = None
        with open(f"{output_dir}/feature_factory_manager_for_transformer.pickle", "wb") as f:
            pickle.dump(ff_for_transformer, f)

def main(params: dict, output_dir: str):
    import mlflow
    print("start params={}".format(params))
    logger = get_logger()
    df = pd.read_pickle("../input/riiid-test-answer-prediction/train_merged.pickle")
    # df = pd.read_pickle("../input/riiid-test-answer-prediction/split10/train_0.pickle").sort_values(["user_id", "timestamp"]).reset_index(drop=True)
    if is_debug:
        df = df.head(30000)
    df["prior_question_had_explanation"] = df["prior_question_had_explanation"].fillna(-1)
    column_config = {
        ("content_id", "content_type_id"): {"type": "category"},
        "user_answer": {"type": "leakage_feature"},
        "answered_correctly": {"type": "leakage_feature"},
        "part": {"type": "category"},
        "prior_question_elapsed_time_bin300": {"type": "category"},
        "duration_previous_content_bin300": {"type": "category"},
        "prior_question_had_explanation": {"type": "category"},
        "rating_diff_content_user_id": {"type": "numeric"},
        "qq_table2_mean": {"type": "numeric"},
        "qq_table2_min": {"type": "numeric"}
    }

    ff_for_transformer = FeatureFactoryForTransformer(column_config=column_config,
                                                      dict_path="../feature_engineering/",
                                                      sequence_length=params["max_seq"],
                                                      logger=logger)
    ff_for_transformer.make_dict(df=df)

    if is_make_feature_factory:
        # feature factory
        feature_factory_dict = {"user_id": {}}
        feature_factory_dict["user_id"]["DurationPreviousContent"] = DurationPreviousContent()
        feature_factory_dict["user_id"]["ElapsedTimeBinningEncoder"] = ElapsedTimeBinningEncoder()
        feature_factory_dict["user_id"]["UserContentRateEncoder"] = UserContentRateEncoder(rate_func="elo", column="user_id")
        feature_factory_dict["user_id"]["QuestionQuestionTableEncoder2"] = \
            QuestionQuestionTableEncoder2(model_id=model_id, is_debug=is_debug, past_n=100, min_size=300)
        feature_factory_manager = FeatureFactoryManager(feature_factory_dict=feature_factory_dict,
                                                        logger=logger,
                                                        split_num=1,
                                                        model_id=model_id,
                                                        load_feature=not is_debug,
                                                        save_feature=not is_debug)
        print("all_predict")
        df = pd.read_pickle("../input/riiid-test-answer-prediction/train_merged.pickle")
        if is_debug:
            df = df.head(10000)
        df = df.sort_values(["user_id", "timestamp"]).reset_index(drop=True)
        feature_factory_manager.fit(df)
        df = feature_factory_manager.all_predict(df)
        df = df[["user_id", "content_id", "content_type_id", "part", "user_answer", "answered_correctly",
                 "prior_question_elapsed_time_bin300", "duration_previous_content_bin300",
                 "prior_question_had_explanation", "rating_diff_content_user_id",
                 "qq_table2_mean", "qq_table2_min"]]
        df["qq_table2_mean"] = df["qq_table2_mean"].fillna(0.65)
        df["qq_table2_min"] = df["qq_table2_min"].fillna(0.6)
        df["prior_question_had_explanation"] = df["prior_question_had_explanation"].fillna(-1)
        for dicts in feature_factory_manager.feature_factory_dict.values():
            for factory in dicts.values():
                factory.logger = None
        feature_factory_manager.logger = None
        with open(f"{output_dir}/feature_factory_manager.pickle", "wb") as f:
            pickle.dump(feature_factory_manager, f)

        ff_for_transformer.fit(df)
        ff_for_transformer.logger = None
        with open(f"{output_dir}/feature_factory_manager_for_transformer.pickle", "wb") as f:
            pickle.dump(ff_for_transformer, f)

def make_feature_factory_manager(split_num, model_id=None):
    logger = get_logger()
    feature_factory_dict = {}
    for column in ["user_id", "content_id", "part",
                   ("user_id", "prior_question_had_explanation"), ("user_id", "part"),
                   ("content_id", "prior_question_had_explanation")]:
        is_partial_fit = (column == "content_id" or column == "user_id")
        if type(column) == str:
            feature_factory_dict[column] = {
                "CountEncoder": CountEncoder(column=column, is_partial_fit=is_partial_fit),
                "TargetEncoder": TargetEncoder(column=column, is_partial_fit=is_partial_fit)
            }
        else:
            feature_factory_dict[column] = {
                "CountEncoder": CountEncoder(column=list(column), is_partial_fit=is_partial_fit),
                "TargetEncoder": TargetEncoder(column=list(column), is_partial_fit=is_partial_fit)
            }
    feature_factory_dict["user_id"]["ShiftDiffEncoderTimestamp"] = ShiftDiffEncoder(groupby="user_id", column="timestamp", is_partial_fit=True)
    for column in ["user_id", "content_id"]:
        feature_factory_dict[column][f"MeanAggregatorPriorQuestionElapsedTimeby{column}"] = MeanAggregator(column=column, agg_column="prior_question_elapsed_time", remove_now=True)
    feature_factory_dict["content_id"]["MeanAggregatorShiftDiffTimestamp"] = MeanAggregator(column="content_id", agg_column="shiftdiff_timestamp_by_user_id", remove_now=False)
    feature_factory_dict["user_id"]["UserLevelEncoder2ContentId"] = UserLevelEncoder2(vs_column="content_id")
    feature_factory_dict["content_id"]["ContentLevelEncoder2UserId"] = ContentLevelEncoder(vs_column="user_id", is_partial_fit=True)
    feature_factory_dict["user_id"]["MeanAggregatorContentLevel"] = MeanAggregator(column="user_id", agg_column="content_level_user_id", remove_now=False)
    feature_factory_dict["user_id"]["CountEncoder"] = CountEncoder(column="user_id", is_partial_fit=True)
    feature_factory_dict["user_id"]["UserCountBinningEncoder"] = UserCountBinningEncoder(is_partial_fit=True)
    feature_factory_dict["user_count_bin"] = {}
    feature_factory_dict["user_count_bin"]["TargetEncoder"] = TargetEncoder(column="user_count_bin")
    feature_factory_dict[("user_id", "user_count_bin")] = {
        "TargetEncoder": TargetEncoder(column=["user_id", "user_count_bin"])
    }
    feature_factory_dict[("content_id", "user_count_bin")] = {
        "TargetEncoder": TargetEncoder(column=["content_id", "user_count_bin"])
    }
    feature_factory_dict[("prior_question_had_explanation", "user_count_bin")] = {
        "TargetEncoder": TargetEncoder(column=["prior_question_had_explanation", "user_count_bin"])
    }
    feature_factory_dict["user_id"]["CategoryLevelEncoderPart"] = CategoryLevelEncoder(groupby_column="user_id", agg_column="part", categories=[2, 5])
    feature_factory_dict["user_id"]["FirstColumnEncoderContentId"] = FirstColumnEncoder(agg_column="content_id", astype="int16", is_partial_fit=True)
    feature_factory_dict["user_id"]["FirstColumnEncoderPart"] = FirstColumnEncoder(agg_column="part", astype="int8", is_partial_fit=True)
    for column in ["user_id", "user_count_bin", "first_column_content_id", "first_column_part", ("user_id", "part")]:
        if column not in feature_factory_dict:
            feature_factory_dict[column] = {}
        if type(column) == str:
            feature_factory_dict[column][f"MeanAggregatorTargetEncContentIdBy{column}"] = MeanAggregator(column=column, agg_column="target_enc_content_id", remove_now=False)
        else:
            feature_factory_dict[column][f"MeanAggregatorTargetEncContentIdBy{column}"] = MeanAggregator(column=list(column), agg_column="target_enc_content_id", remove_now=False)
    for column in [("content_id", "prior_question_had_explanation")]:
        if column not in feature_factory_dict:
            feature_factory_dict[column] = {}
        if type(column) == str:
            feature_factory_dict[column][f"MeanAggregatorTargetEncContentIdBy{column}"] = MeanAggregator(column=column, agg_column="target_enc_user_id", remove_now=False)
        else:
            feature_factory_dict[column][f"MeanAggregatorTargetEncContentIdBy{column}"] = MeanAggregator(column=list(column), agg_column="target_enc_user_id", remove_now=False)
    feature_factory_dict["prior_question_elapsed_time"] = {
        "PriorQuestionElapsedTimeBinningEncoder": PriorQuestionElapsedTimeBinningEncoder(is_partial_fit=True)
    }
    feature_factory_dict[("part", "prior_question_elapsed_time_bin")] = {
        "TargetEncoder": TargetEncoder(column=["part", "prior_question_elapsed_time_bin"])
    }
    feature_factory_dict["user_id"]["PreviousAnswer2"] = PreviousAnswer2(groupby="user_id", column="content_id", is_debug=is_debug, model_id=model_id, n=1000)
    feature_factory_dict["user_id"]["PreviousNAnsweredCorrectly"] = PreviousNAnsweredCorrectly(n=3, is_partial_fit=True)
    feature_factory_dict["previous_3_ans"] = {
        "TargetEncoder": TargetEncoder(column="previous_3_ans")
    }
    feature_factory_dict["user_id"]["QuestionLectureTableEncoder2"] = QuestionLectureTableEncoder2(model_id=model_id, is_debug=is_debug, past_n=100)
    feature_factory_dict["post"] = {
        "ContentIdTargetEncoderAggregator": TargetEncoderAggregator()
    }
    feature_factory_manager = FeatureFactoryManager(feature_factory_dict=feature_factory_dict,
                                                    logger=logger,
                                                    split_num=split_num,
                                                    model_id=model_id,
                                                    load_feature=not is_debug,
                                                    save_feature=not is_debug)
    return feature_factory_manager

def main(params: dict, output_dir: str):
    import mlflow
    print("start params={}".format(params))
    logger = get_logger()
    df = pd.read_pickle("../input/riiid-test-answer-prediction/train_merged.pickle")
    # df = pd.read_pickle("../input/riiid-test-answer-prediction/split10/train_0.pickle").sort_values(["user_id", "timestamp"]).reset_index(drop=True)
    if is_debug:
        df = df.head(500000)
    df["prior_question_had_explanation"] = df["prior_question_had_explanation"].fillna(-1)
    column_config = {
        ("content_id", "content_type_id"): {"type": "category"},
        "user_answer": {"type": "leakage_feature"},
        "answered_correctly": {"type": "leakage_feature"},
        "part": {"type": "category"},
        "prior_question_elapsed_time_bin300": {"type": "category"},
        "duration_previous_content_bin300": {"type": "category"},
        "prior_question_had_explanation": {"type": "category"},
        "rating_diff_content_user_id": {"type": "numeric"}
    }

    with open(f"{output_dir}/transformer_param.json", "w") as f:
        json.dump(params, f)

    if is_make_feature_factory:
        # feature factory
        feature_factory_dict = {"user_id": {}}
        feature_factory_dict["user_id"]["DurationPreviousContent"] = DurationPreviousContent(is_partial_fit=True)
        feature_factory_dict["user_id"]["ElapsedTimeBinningEncoder"] = ElapsedTimeBinningEncoder()
        feature_factory_dict["user_id"]["UserContentRateEncoder"] = UserContentRateEncoder(rate_func="elo", column="user_id")
        feature_factory_manager = FeatureFactoryManager(feature_factory_dict=feature_factory_dict,
                                                        logger=logger,
                                                        split_num=1,
                                                        model_id=model_id,
                                                        load_feature=not is_debug,
                                                        save_feature=not is_debug)
        ff_for_transformer = FeatureFactoryForTransformer(column_config=column_config,
                                                          dict_path="../feature_engineering/",
                                                          sequence_length=params["max_seq"],
                                                          logger=logger)
        ff_for_transformer.make_dict(df=df)

        df = df.sort_values(["user_id", "timestamp"]).reset_index(drop=True)
        feature_factory_manager.fit(df.iloc[:95000].copy())
        w_df = feature_factory_manager.all_predict(df.iloc[:95000].copy())
        ff_for_transformer.fit(w_df)

        for _, w_df in tqdm(df.iloc[95000:].groupby(["user_id", "task_container_id"])):
            ww_df = feature_factory_manager.partial_predict(w_df.drop(["answered_correctly", "user_answer"], axis=1))
            group = ff_for_transformer.partial_predict(ww_df)
            ww_df["answered_correctly"] = w_df["answered_correctly"]
            ww_df["user_answer"] = w_df["user_answer"]
            feature_factory_manager.fit(ww_df)
            ff_for_transformer.fit(ww_df)

        for dicts in feature_factory_manager.feature_factory_dict.values():
            for factory in dicts.values():
                factory.logger = None
        feature_factory_manager.logger = None
        with open(f"{output_dir}/feature_factory_manager.pickle", "wb") as f:
            pickle.dump(feature_factory_manager, f)

        ff_for_transformer.logger = None
        with open(f"{output_dir}/feature_factory_manager_for_transformer.pickle", "wb") as f:
            pickle.dump(ff_for_transformer, f)

def test_interval1(self):
    logger = get_logger()
    df = pd.DataFrame({"row_id": [0, 1, 2, 3, 4, 5, 6, 7],
                       "user_id": ["a", "a", "a", "b", "a", "b", "a", "b"],
                       "timestamp": [0, 1, 2, 3, 4, 5, 6, 7],
                       "content_id": [0, 0, 1, 1, 0, 0, 1, 1],
                       "content_type_id": [0, 1, 0, 0, 0, 0, 0, 1],
                       "user_answer": [0, 1, 2, 3, 4, 5, 6, 7],
                       "answered_correctly": [0, -1, 1, -1, 0, 0, 1, -1],
                       "prior_question_had_explanation": [0, 0, 0, 0, 0, 0, 0, 0]}).sort_values(["user_id", "timestamp"])
    df_question = pd.DataFrame({"question_id": [0, 1],
                                "bundle_id": [0, 1],
                                "correct_answer": [0, 1],
                                "part": [0, 1],
                                "tags": ["0", "1"]})
    df_lecture = pd.DataFrame({"lecture_id": [0, 1],
                               "tag": [0, 1],
                               "part": [0, 1],
                               "type_of": ["0", "1"]})
    feature_factory_dict = {
        "user_id": {
            "CountEncoder": CountEncoder(column="user_id"),
            "TargetEncoder": TargetEncoder(column="user_id")
        }
    }
    print(df.iloc[4:])
    gen = MyEnvironment(df_test=df.iloc[4:], interval=1).iter_test()
    feature_factory_manager = FeatureFactoryManager(feature_factory_dict=feature_factory_dict, logger=logger)
    env_manager = EnvironmentManager(feature_factory_manager=feature_factory_manager,
                                     gen=gen,
                                     fit_interval=1,
                                     df_question=df_question,
                                     df_lecture=df_lecture)

    w_df1 = pd.merge(df[df["content_type_id"] == 0], df_question, how="left", left_on="content_id", right_on="question_id")
    w_df2 = pd.merge(df[df["content_type_id"] == 1], df_lecture, how="left", left_on="content_id", right_on="lecture_id")
    df2 = pd.concat([w_df1, w_df2]).sort_values(["user_id", "timestamp"])

    df_expect = feature_factory_manager.all_predict(df2).iloc[4:]
    df_expect["tag"] = df_expect["tag"].fillna(-1)
    df_expect["correct_answer"] = df_expect["correct_answer"].fillna(-1)
    df_expect["bundle_id"] = df_expect["bundle_id"].fillna(-1)
    df_expect["prior_question_had_explanation"] = df_expect["prior_question_had_explanation"].astype("float16").fillna(-1).astype("int8")
    df_expect.columns = [x.replace(" ", "_") for x in df_expect.columns]

    df_actual = pd.DataFrame()
    feature_factory_manager.fit(df2.iloc[:4])
    while True:
        x = env_manager.step()
        if x is None:
            break
        df_test = x[0]
        df_sub = x[1]
        df_actual = pd.concat([df_actual, df_test], axis=0)

    pd.testing.assert_frame_equal(df_expect.reset_index(drop=True),
                                  df_actual.reset_index(drop=True),
                                  check_dtype=False)

def main(params: dict):
    import mlflow
    logger = get_logger()
    print("start params={}".format(params))

    # df = pd.read_pickle("../input/riiid-test-answer-prediction/train_merged.pickle").head(30_000_000)
    df = pd.read_pickle(
        "../input/riiid-test-answer-prediction/split10/train_0.pickle"
    ).sort_values(["user_id", "timestamp"]).reset_index(drop=True)
    if is_debug:
        df = df.head(3000)
    df.to_csv("aaa.csv")

    df = df.sort_values(["user_id", "timestamp"]).reset_index(drop=True)
    df = df[["user_id", "content_id", "content_type_id", "part", "answered_correctly"]]

    # Per-user split: ~10% of users go entirely to validation; for the rest,
    # the last 10% of each user's rows are held out.
    train_idx = []
    val_idx = []
    np.random.seed(0)
    # for _, w_df in df[df["content_type_id"] == 0].groupby("user_id"):
    for _, w_df in df.groupby("user_id"):
        if np.random.random() < 0.1:
            # all val
            val_idx.extend(w_df.index.tolist())
        else:
            train_num = int(len(w_df) * 0.9)
            train_idx.extend(w_df[:train_num].index.tolist())
            val_idx.extend(w_df[train_num:].index.tolist())
    df["is_val"] = 0
    df.loc[val_idx, "is_val"] = 1

    ff_for_transformer = FeatureFactoryForTransformer(
        column_config={
            ("content_id", "content_type_id"): {"type": "category"},
            "part": {"type": "category"}
        },
        dict_path="../feature_engineering/",
        sequence_length=params["max_seq"],
        logger=logger)
    n_skill = len(ff_for_transformer.embbed_dict[("content_id", "content_type_id")])

    # all_predict: build sequences for the whole frame in one pass.
    group = ff_for_transformer.all_predict(df)
    dataset_val = SAKTDataset(group,
                              is_test=True,
                              n_skill=n_skill,
                              max_seq=params["max_seq"])
    dataloader_val = DataLoader(dataset_val, batch_size=1024, shuffle=False, num_workers=1)

    xs = []
    target_ids = []
    parts = []
    labels = []
    for d in tqdm(dataloader_val):
        xs.extend(d[0].to(device).long().data.cpu().numpy().tolist())
        target_ids.extend(d[1].to(device).long().data.cpu().numpy().tolist())
        parts.extend(d[2].to(device).long().data.cpu().numpy().tolist())
        labels.extend(d[3].to(device).long().data.cpu().numpy().tolist())
    print(xs[0])
    print(target_ids[0])
    print(parts[0])
    print(labels[0])

    # partial_predict: rebuild the encoder, fit it on the training rows only,
    # then feed the validation rows one at a time as the inference loop would.
    ff_for_transformer = FeatureFactoryForTransformer(
        column_config={
            ("content_id", "content_type_id"): {"type": "category"},
            "part": {"type": "category"}
        },
        dict_path="../feature_engineering/",
        sequence_length=params["max_seq"],
        logger=logger)
    ff_for_transformer.fit(df.loc[train_idx])

    xs_test = []
    target_ids_test = []
    parts_test = []
    labels_test = []
    for i in tqdm(range(len(df.loc[val_idx]))):
        w_df = df.loc[val_idx[i:i + 1]]
        group = ff_for_transformer.partial_predict(w_df.copy())
        dataset_val = SAKTDataset(group,
                                  predict_mode=True,
                                  n_skill=n_skill,
                                  max_seq=params["max_seq"])
        dataloader_val = DataLoader(dataset_val, batch_size=1024, shuffle=False, num_workers=1)
        for d in dataloader_val:
            xs_test.extend(d[0].to(device).long().data.cpu().numpy().tolist())
            target_ids_test.extend(d[1].to(device).long().data.cpu().numpy().tolist())
            parts_test.extend(d[2].to(device).long().data.cpu().numpy().tolist())
            labels_test.extend(d[3].to(device).long().data.cpu().numpy().tolist())
        ff_for_transformer.fit(w_df.copy())

    # Dump both paths for inspection, then assert they produced identical transformer inputs.
    pd.DataFrame(xs).to_csv("all_xs.csv")
    pd.DataFrame(xs_test).to_csv("partial_xs.csv")
    pd.DataFrame(labels).to_csv("all_labels.csv")
    pd.DataFrame(labels_test).to_csv("partial_labels.csv")
    df.loc[val_idx].to_csv("raw_data.csv")
    assert xs[0] == xs_test[0]
    assert xs == xs_test
    assert target_ids == target_ids_test
    assert parts == parts_test
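# Hypothetical debugging helper -- a sketch, not part of the experiment script. If the
# assertions above fail, the CSVs dumped by both paths can be diffed to locate the first
# mismatching sequence. File names match the ones written above; it assumes both frames
# have the same shape, which DataFrame.compare requires.
import pandas as pd

all_xs = pd.read_csv("all_xs.csv", index_col=0)
partial_xs = pd.read_csv("partial_xs.csv", index_col=0)

# compare() keeps only the cells that differ between the two frames.
diff = all_xs.compare(partial_xs)
print(diff.head())
print("first mismatching rows:", diff.index[:10].tolist())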
def main(params: dict, output_dir: str):
    import mlflow
    print("start params={}".format(params))
    model_id = "all"
    logger = get_logger()

    df = pd.read_pickle("../input/riiid-test-answer-prediction/train_merged.pickle")
    # df = pd.read_pickle("../input/riiid-test-answer-prediction/split10/train_0.pickle").sort_values(["user_id", "timestamp"]).reset_index(drop=True)
    if is_debug:
        df = df.head(30000)
    df["prior_question_had_explanation"] = df["prior_question_had_explanation"].fillna(-1)

    column_config = {
        ("content_id", "content_type_id"): {"type": "category", "dtype": np.int16},
        "user_answer": {"type": "leakage_feature", "dtype": np.int8},
        "answered_correctly": {"type": "leakage_feature", "dtype": np.int8},
        "part": {"type": "category", "dtype": np.int8},
        "prior_question_elapsed_time_bin300": {"type": "category", "dtype": np.int16},
        "duration_previous_content_bin300": {"type": "category", "dtype": np.int16},
        "prior_question_had_explanation": {"type": "category", "dtype": np.int8},
        "rating_diff_content_user_id": {"type": "numeric", "dtype": np.float16},
        "task_container_id_bin300": {"type": "category", "dtype": np.int16},
        "previous_answer_index_content_id": {"type": "category", "dtype": np.int16},
        "previous_answer_content_id": {"type": "category", "dtype": np.int8}
    }

    if is_make_feature_factory:
        # feature factory
        feature_factory_dict = {"user_id": {}}
        feature_factory_dict["user_id"]["DurationPreviousContent"] = DurationPreviousContent()
        feature_factory_dict["user_id"]["ElapsedTimeBinningEncoder"] = ElapsedTimeBinningEncoder()
        feature_factory_dict["user_id"]["UserContentRateEncoder"] = UserContentRateEncoder(rate_func="elo",
                                                                                           column="user_id")
        feature_factory_dict["user_id"]["PreviousAnswer2"] = PreviousAnswer2(groupby="user_id",
                                                                             column="content_id",
                                                                             is_debug=is_debug,
                                                                             model_id=model_id,
                                                                             n=300)
        feature_factory_manager = FeatureFactoryManager(feature_factory_dict=feature_factory_dict,
                                                        logger=logger,
                                                        split_num=1,
                                                        model_id="all",
                                                        load_feature=not is_debug,
                                                        save_feature=not is_debug)

        ff_for_transformer = FeatureFactoryForTransformer(column_config=column_config,
                                                          dict_path="../feature_engineering/",
                                                          sequence_length=params["max_seq"],
                                                          logger=logger)

        # Reload the full history for fitting the managers.
        df = pd.read_pickle("../input/riiid-test-answer-prediction/train_merged.pickle")
        df["prior_question_had_explanation"] = df["prior_question_had_explanation"].fillna(-1)
        df["answered_correctly"] = df["answered_correctly"].replace(-1, np.nan)
        df["task_container_id_bin300"] = [x if x < 300 else 300 for x in df["task_container_id"]]
        if is_debug:
            df = df.head(10000)
        df = df.sort_values(["user_id", "timestamp"]).reset_index(drop=True)

        ff_for_transformer.make_dict(df=df)
        feature_factory_manager.fit(df)
        df = feature_factory_manager.all_predict(df)
        df["answered_correctly"] = df["answered_correctly"].replace(np.nan, -1)
        print(df["previous_answer_index_content_id"].value_counts())

        # Drop logger references, then pickle the fitted managers for inference.
        for dicts in feature_factory_manager.feature_factory_dict.values():
            for factory in dicts.values():
                factory.logger = None
        feature_factory_manager.logger = None
        with open(f"{output_dir}/feature_factory_manager.pickle", "wb") as f:
            pickle.dump(feature_factory_manager, f)

        ff_for_transformer.fit(df)
        ff_for_transformer.logger = None
        with open(f"{output_dir}/feature_factory_manager_for_transformer.pickle", "wb") as f:
            pickle.dump(ff_for_transformer, f)
from pipeline.p_005_partialfit import Pipeline
from experiment.common import get_logger
import pandas as pd
from model.lgbm import train_lgbm_cv_newuser
from sklearn.model_selection import KFold
from datetime import datetime as dt
import os
import glob

output_dir = f"../output/ex_007/{dt.now().strftime('%Y%m%d%H%M%S')}/"

for model_id, fname in enumerate(glob.glob("../input/riiid-test-answer-prediction/split10/*")):
    print(fname)
    df = pd.read_pickle(fname)

    pipeline = Pipeline(logger=get_logger())
    df = pipeline.fit_transform(df)

    os.makedirs(output_dir, exist_ok=True)
    params = {
        'objective': 'binary',
        'num_leaves': 32,
        'min_data_in_leaf': 15,  # 42,
        'max_depth': -1,
        'learning_rate': 0.1,
        'boosting': 'gbdt',
        'bagging_fraction': 0.7,  # 0.5,
        'feature_fraction': 0.5,
        'bagging_seed': 0,
        'reg_alpha': 0.1,  # 1.728910519108444,
        'reg_lambda': 1,