Ejemplo n.º 1
0
def make_feature_factory_manager(split_num, model_id=None):
    """Assemble the ``FeatureFactoryManager`` for this feature set.

    The manager is configured through a nested mapping of the form
    ``{group_key: {feature_name: encoder}}``. A group key is a single column
    name, a tuple of column names, or the literal ``"post"``.

    Args:
        split_num: Passed straight through to ``FeatureFactoryManager``.
        model_id: Passed to the manager and to every encoder that takes a
            ``model_id`` argument.

    Returns:
        The configured ``FeatureFactoryManager``.

    Note:
        ``is_debug`` is not a parameter; it is looked up in the enclosing
        scope. When it is truthy the manager is created with
        ``load_feature`` and ``save_feature`` set to ``False``.
    """
    def column_arg(key):
        # A tuple is only the dictionary key; the encoders themselves are
        # handed a list of column names (or the bare string for one column).
        return key if isinstance(key, str) else list(key)

    log = get_logger()
    factories = {}

    # Target encoding; only the two single-column id keys are partially fit.
    for key in ("user_id", "content_id", ("last_lecture", "content_id")):
        partial = key in ("content_id", "user_id")
        factories[key] = {
            "TargetEncoder": TargetEncoder(column=column_arg(key),
                                           is_partial_fit=partial),
        }

    by_user = factories["user_id"]
    by_user.update({
        "ShiftDiffEncoderTimestamp": ShiftDiffEncoder(
            groupby="user_id", column="timestamp", is_partial_fit=True),
        "PastNTimestampEncoder": PastNFeatureEncoder(
            column="timestamp",
            past_ns=list(range(2, 11)),
            agg_funcs=["vslast"],
            remove_now=False),
        "Past1ContentTypeId": PastNFeatureEncoder(
            column="content_type_id",
            past_ns=[5, 15],
            agg_funcs=["mean"],
            remove_now=False),
        "StudyTermEncoder": StudyTermEncoder(is_partial_fit=True),
        "ElapsedTimeVsShiftDiffEncoder": ElapsedTimeVsShiftDiffEncoder(),
        "CountEncoder": CountEncoder(column="user_id", is_partial_fit=True),
    })

    factories[("user_id", "part")] = {
        "UserContentRateEncoder": UserContentRateEncoder(
            column=["user_id", "part"], rate_func="elo"),
    }

    # Two mean aggregations per group. The feature name embeds the key as
    # written (so a tuple key shows up with its tuple repr).
    for key in ("user_id", "content_id", "part", ("user_id", "part")):
        group = factories.setdefault(key, {})
        group[f"MeanAggregatorShiftDiffTimeElapsedTimeby{key}"] = MeanAggregator(
            column=column_arg(key),
            agg_column="shiftdiff_timestamp_by_user_id_cap200k",
            remove_now=False)
        group[f"MeanAggregatorStudyTimeby{key}"] = MeanAggregator(
            column=column_arg(key),
            agg_column="study_time",
            remove_now=False)

    by_user.update({
        "CategoryLevelEncoderPart": CategoryLevelEncoder(
            groupby_column="user_id", agg_column="part", categories=[2, 5]),
        "PreviousAnswer2": PreviousAnswer2(
            groupby="user_id",
            column="content_id",
            is_debug=is_debug,
            model_id=model_id,
            n=300),
        "PreviousNAnsweredCorrectly": PreviousNAnsweredCorrectly(
            n=5, is_partial_fit=True),
    })

    factories["previous_5_ans"] = {
        "TargetEncoder": TargetEncoder(column="previous_5_ans"),
    }

    by_user.update({
        "QuestionLectureTableEncoder2": QuestionLectureTableEncoder2(
            model_id=model_id, is_debug=is_debug, past_n=100, min_size=300),
        "QuestionQuestionTableEncoder2": QuestionQuestionTableEncoder2(
            model_id=model_id, is_debug=is_debug, past_n=100, min_size=300),
        "UserContentRateEncoder": UserContentRateEncoder(
            column="user_id", rate_func="elo"),
    })

    factories["post"] = {
        "ContentIdTargetEncoderAggregator": TargetEncoderAggregator(),
    }

    # Feature caching (load and save) is switched off whenever is_debug is set.
    use_cache = not is_debug
    return FeatureFactoryManager(feature_factory_dict=factories,
                                 logger=log,
                                 split_num=split_num,
                                 model_id=model_id,
                                 load_feature=use_cache,
                                 save_feature=use_cache)
Ejemplo n.º 2
0
def make_feature_factory_manager(split_num, model_id=None):
    """Assemble the ``FeatureFactoryManager`` for this feature set.

    The manager is configured through a nested mapping of the form
    ``{group_key: {feature_name: encoder}}``, where a group key is either a
    single column name or a tuple of column names.

    Args:
        split_num: Passed straight through to ``FeatureFactoryManager``.
        model_id: Only forwarded to the ``PreviousAnswer2`` encoder; the
            manager itself is built without it.

    Returns:
        The configured ``FeatureFactoryManager``.

    Note:
        ``is_debug`` is not a parameter; it is looked up in the enclosing
        scope and forwarded to ``PreviousAnswer2``.
    """
    def column_arg(key):
        # A tuple is only the dictionary key; the encoders themselves are
        # handed a list of column names (or the bare string for one column).
        return key if isinstance(key, str) else list(key)

    log = get_logger()
    factories = {
        "tags": {"TagsSeparator": TagsSeparator(is_partial_fit=True)},
    }

    # Target encoding; only the two single-column id keys are partially fit.
    target_encoded_keys = (
        "user_id",
        "content_id",
        "part",
        ("user_id", "prior_question_had_explanation"),
        ("user_id", "part"),
        ("content_id", "prior_question_had_explanation"),
    )
    for key in target_encoded_keys:
        partial = key in ("content_id", "user_id")
        factories[key] = {
            "TargetEncoder": TargetEncoder(column=column_arg(key),
                                           is_partial_fit=partial),
        }

    by_user = factories["user_id"]
    by_content = factories["content_id"]

    by_user.update({
        "ShiftDiffEncoderTimestamp": ShiftDiffEncoder(groupby="user_id",
                                                      column="timestamp"),
        "ShiftDiffEncoderContentId": ShiftDiffEncoder(groupby="user_id",
                                                      column="content_id"),
    })

    for key in ("user_id", "content_id"):
        name = f"MeanAggregatorPriorQuestionElapsedTimeby{key}"
        factories[key][name] = MeanAggregator(
            column=key,
            agg_column="prior_question_elapsed_time",
            remove_now=True)

    by_user["UserLevelEncoder2ContentId"] = UserLevelEncoder2(
        vs_column="content_id")
    by_content["ContentLevelEncoder2UserId"] = ContentLevelEncoder(
        vs_column="user_id", is_partial_fit=True)
    by_user.update({
        "MeanAggregatorContentLevel": MeanAggregator(
            column="user_id",
            agg_column="content_level_user_id",
            remove_now=False),
        "CountEncoder": CountEncoder(column="user_id", is_partial_fit=True),
        "UserCountBinningEncoder": UserCountBinningEncoder(
            is_partial_fit=True),
    })

    # Target encodings built on the binned user count.
    factories["user_count_bin"] = {
        "TargetEncoder": TargetEncoder(column="user_count_bin"),
    }
    for other in ("user_id", "content_id", "prior_question_had_explanation"):
        factories[(other, "user_count_bin")] = {
            "TargetEncoder": TargetEncoder(column=[other, "user_count_bin"]),
        }

    by_user.update({
        "CategoryLevelEncoderPart": CategoryLevelEncoder(
            groupby_column="user_id", agg_column="part", categories=[2, 5]),
        "FirstColumnEncoderContentId": FirstColumnEncoder(
            column="content_id", astype="int16", is_partial_fit=True),
        "FirstColumnEncoderPart": FirstColumnEncoder(
            column="part", astype="int8", is_partial_fit=True),
    })

    # Mean of one target-encoding column per group.
    # NOTE(review): both passes use the "TargetEncContentId" name prefix even
    # though the second aggregates target_enc_user_id. The names are kept
    # exactly as they were; confirm whether downstream code relies on them.
    aggregation_plan = (
        ("target_enc_content_id",
         ("user_id", "user_count_bin", "first_column_content_id",
          "first_column_part", ("user_id", "part"))),
        ("target_enc_user_id",
         ("content_id", "part", "tags1", "tags2",
          "prior_question_had_explanation",
          ("content_id", "prior_question_had_explanation"))),
    )
    for agg_column, keys in aggregation_plan:
        for key in keys:
            group = factories.setdefault(key, {})
            group[f"MeanAggregatorTargetEncContentIdBy{key}"] = MeanAggregator(
                column=column_arg(key),
                agg_column=agg_column,
                remove_now=False)

    factories["user_count_bin"][
        "CategoryLevelEncoderUserCountBin"] = CategoryLevelEncoder(
            groupby_column="user_id",
            agg_column="user_count_bin",
            categories=[0])

    factories["prior_question_elapsed_time"] = {
        "PriorQuestionElapsedTimeBinningEncoder":
        PriorQuestionElapsedTimeBinningEncoder(is_partial_fit=True),
    }
    factories[("part", "prior_question_elapsed_time_bin")] = {
        "TargetEncoder": TargetEncoder(
            column=["part", "prior_question_elapsed_time_bin"]),
    }
    by_user["PreviousAnswer2"] = PreviousAnswer2(groupby="user_id",
                                                 column="content_id",
                                                 is_debug=is_debug,
                                                 model_id=model_id)

    return FeatureFactoryManager(feature_factory_dict=factories,
                                 logger=log,
                                 split_num=split_num)
Ejemplo n.º 3
0
def _as_column_arg(column):
    """Return ``column`` itself for a single name, or a list for a tuple key."""
    return column if isinstance(column, str) else list(column)


def _parse_int_list(raw):
    """Parse a bracketed, comma-separated string such as ``"[1, 0, -1]"``.

    Brackets, single quotes and spaces are removed before splitting on
    commas. Empty tokens are skipped, so ``"[]"`` yields ``[]`` instead of
    failing on ``int("")``.
    """
    cleaned = raw.translate(str.maketrans("", "", "[]' "))
    return [int(token) for token in cleaned.split(",") if token]


def _merge_content_metadata(df, df_question, df_lecture):
    """Left-join question metadata onto question rows and lecture metadata
    onto lecture rows, then stack the two results (questions first)."""
    questions = pd.merge(df[df["content_type_id"] == 0],
                         df_question,
                         how="left",
                         left_on="content_id",
                         right_on="question_id")
    lectures = pd.merge(df[df["content_type_id"] == 1],
                        df_lecture,
                        how="left",
                        left_on="content_id",
                        right_on="lecture_id")
    return pd.concat([questions, lectures])


def run(debug, model_dir, kaggle=False):
    """Fit the feature pipeline on the pickled splits, then predict online.

    The function loads the question/lecture tables and the pickled models,
    fits a ``FeatureFactoryManager`` on every training split, and then walks
    the ``riiideducation`` test iterator: each batch is transformed,
    scored by every model (predictions are averaged) and submitted through
    ``env.predict``. Rows that have been predicted are buffered and, once
    their answers arrive with a later batch, fed back into the manager.

    Args:
        debug: When truthy, each training split is cut to its first 1000
            rows, the refit threshold drops from 150 to 1 buffered rows, and
            the buffer is written to ``{i}.csv`` after every iteration.
        model_dir: Directory searched with ``*model*.pickle`` for pickled
            models. Each model must offer ``feature_name()`` and
            ``predict()``.
        kaggle: Selects the Kaggle input path for the training splits
            instead of the local relative path.

    Raises:
        FileNotFoundError: If no model file matches in ``model_dir``.
    """
    if kaggle:
        files_dir = "/kaggle/input/riiid-split10/*.pickle"
    else:
        files_dir = "../input/riiid-test-answer-prediction/split10_base/*.pickle"

    logger = get_logger()
    # environment
    env = riiideducation.make_env()

    df_question = pd.read_csv(
        "../input/riiid-test-answer-prediction/questions.csv",
        dtype={
            "bundle_id": "int32",
            "question_id": "int32",
            "correct_answer": "int8",
            "part": "int8"
        })
    df_lecture = pd.read_csv(
        "../input/riiid-test-answer-prediction/lectures.csv",
        dtype={
            "lecture_id": "int32",
            "tag": "int16",
            "part": "int8"
        })

    # model loading
    # NOTE: unpickling executes arbitrary code; model_dir (and the training
    # splits read with pd.read_pickle below) must only contain trusted files.
    models = []
    for model_path in glob.glob(f"{model_dir}/*model*.pickle"):
        with open(model_path, "rb") as f:
            models.append(pickle.load(f))
    if not models:
        # Fail before the expensive fit instead of hitting an IndexError on
        # models[0] at the first test batch.
        raise FileNotFoundError(
            f"no model file matching '*model*.pickle' found in {model_dir!r}")
    # Every model is scored on the first model's feature list.
    cols = models[0].feature_name()

    # data preprocessing
    feature_factory_dict = {"tags": {"TagsSeparator": TagsSeparator()}}
    for column in [
            "content_id", "user_id", "content_type_id",
            "prior_question_had_explanation", "tags1", "tags2", "tags3",
            "tags4", "tags5", "tags6", ("user_id", "content_type_id"),
            ("user_id", "prior_question_had_explanation")
    ]:
        feature_factory_dict[column] = {
            "CountEncoder":
            CountEncoder(column=_as_column_arg(column)),
            "TargetEncoder":
            TargetEncoder(column=_as_column_arg(column),
                          is_partial_fit=(column == "content_id"))
        }

    for column in [
            "part", ("user_id", "tag"), ("user_id", "part"),
            ("content_type_id", "part"), ("user_id", "content_id")
    ]:
        feature_factory_dict[column] = {
            "CountEncoder": CountEncoder(column=_as_column_arg(column))
        }

    feature_factory_dict["user_id"][
        "MeanAggregatorTimestamp"] = MeanAggregator(column="user_id",
                                                    agg_column="timestamp",
                                                    remove_now=False)
    feature_factory_dict["user_id"][
        "MeanAggregatorPriorQuestionElapsedTime"] = MeanAggregator(
            column="user_id",
            agg_column="prior_question_elapsed_time",
            remove_now=True)
    feature_factory_dict["user_id"]["ShiftDiffEncoder"] = ShiftDiffEncoder(
        groupby="user_id", column="timestamp")
    feature_factory_dict["content_id"][
        "MeanAggregatorPriorQuestionElapsedTime"] = MeanAggregator(
            column="content_id",
            agg_column="prior_question_elapsed_time",
            remove_now=True)

    feature_factory_manager = FeatureFactoryManager(
        feature_factory_dict=feature_factory_dict, logger=logger)

    # Initial fit over every training split.
    for fname in glob.glob(files_dir):
        logger.info(f"loading... {fname}")
        df = pd.read_pickle(fname)
        df["answered_correctly"] = df["answered_correctly"].replace(-1, np.nan)
        df["prior_question_had_explanation"] = df[
            "prior_question_had_explanation"].fillna(-1).astype("int8")
        if debug:
            df = df.head(1000)
        df = _merge_content_metadata(df, df_question, df_lecture).sort_values(
            ["user_id", "timestamp"])
        feature_factory_manager.fit(df, is_first_fit=True)

    # Number of buffered rows that must be exceeded before the buffer is
    # folded back into the feature state.
    update_record = 1 if debug else 150

    df_test_prev = pd.DataFrame()
    answered_correctlies = []
    user_answers = []
    t = time.time()
    for i, (df_test, df_sample_prediction) in enumerate(env.iter_test(),
                                                        start=1):
        logger.info(
            f"[time: {int(time.time() - t)}iteration {i}: data_length: {len(df_test)}"
        )
        # Update with the previous batch: its answers are carried on the
        # first row of the current batch. Skipped while nothing is buffered
        # (i.e. on the very first iteration).
        if len(df_test_prev) > 0:
            first_row = df_test.iloc[0]
            answered_correctlies.extend(
                _parse_int_list(first_row["prior_group_answers_correct"]))
            user_answers.extend(
                _parse_int_list(first_row["prior_group_responses"]))

        if len(df_test_prev) > update_record:
            # NOTE(review): the buffer holds question rows before lecture
            # rows for each batch (see _merge_content_metadata), while the
            # answer lists keep the order delivered by the environment --
            # confirm the two line up when a batch mixes both row types.
            df_test_prev["answered_correctly"] = answered_correctlies
            df_test_prev["user_answer"] = user_answers
            # Drop rows answered with -1; .copy() makes the filtered frame
            # independent before it is modified below.
            df_test_prev = df_test_prev[
                df_test_prev["answered_correctly"] != -1].copy()
            df_test_prev["prior_question_had_explanation"] = df_test_prev[
                "prior_question_had_explanation"].fillna(-1).astype("int8")

            feature_factory_manager.fit(df_test_prev)

            df_test_prev = pd.DataFrame()
            answered_correctlies = []
            user_answers = []

        # Fetch and transform the current batch.
        logger.info("merge... ")
        df_test = _merge_content_metadata(df_test, df_question, df_lecture)
        df_test["tag"] = df_test["tag"].fillna(-1)
        df_test["correct_answer"] = df_test["correct_answer"].fillna(-1)
        df_test["bundle_id"] = df_test["bundle_id"].fillna(-1)

        logger.info("transform... ")
        df_test["prior_question_had_explanation"] = df_test[
            "prior_question_had_explanation"].astype("float16").fillna(
                -1).astype("int8")

        df = feature_factory_manager.partial_predict(df_test)
        df.columns = [x.replace(" ", "_") for x in df.columns]
        logger.info("other... ")

        # predict: average the per-row predictions of all models
        predicts = [model.predict(df[cols]) for model in models]
        df["answered_correctly"] = np.array(predicts).transpose().mean(axis=1)
        df_sample_prediction = pd.merge(df_sample_prediction[["row_id"]],
                                        df[["row_id", "answered_correctly"]],
                                        how="inner")
        env.predict(df_sample_prediction)

        # DataFrame.append was removed in pandas 2.0; pd.concat is the
        # equivalent that works on old and new versions alike.
        df_test_prev = pd.concat(
            [df_test_prev, df[cols + ["user_id", "tags"]]])
        if debug:
            df_test_prev.to_csv(f"{i}.csv")
Ejemplo n.º 4
0
def make_feature_factory_manager(split_num, size, window, model_id=None):
    """Assemble the ``FeatureFactoryManager`` for this feature set.

    The manager is configured through a nested mapping of the form
    ``{group_key: {feature_name: encoder}}``. A group key is a single column
    name, a tuple of column names, or the literal ``"post"``.

    Args:
        split_num: Passed straight through to ``FeatureFactoryManager``.
        size: Accepted but not used anywhere in this function.
        window: Accepted but not used anywhere in this function.
        model_id: Passed to the manager and to every encoder that takes a
            ``model_id`` argument.

    Returns:
        The configured ``FeatureFactoryManager``.

    Note:
        ``is_debug`` is not a parameter; it is looked up in the enclosing
        scope. When it is truthy the manager is created with
        ``load_feature`` and ``save_feature`` set to ``False``.
    """
    def column_arg(key):
        # A tuple is only the dictionary key; the encoders themselves are
        # handed a list of column names (or the bare string for one column).
        return key if isinstance(key, str) else list(key)

    log = get_logger()
    factories = {
        "tags": {"TagsSeparator": TagsSeparator(is_partial_fit=True)},
    }

    for key in ("user_id", "content_id"):
        partial = key in ("content_id", "user_id")
        factories[key] = {
            "TargetEncoder": TargetEncoder(column=column_arg(key),
                                           is_partial_fit=partial),
        }

    by_user = factories["user_id"]
    # The original source carried three commented-out entries at this point
    # (UserLevelEncoder2ContentId, ContentLevelEncoder2UserId and
    # MeanAggregatorContentLevel); they are not registered in this variant.
    by_user.update({
        "ShiftDiffEncoderTimestamp": ShiftDiffEncoder(
            groupby="user_id", column="timestamp", is_partial_fit=True),
        "StudyTermEncoder": StudyTermEncoder(is_partial_fit=True),
        "CountEncoder": CountEncoder(column="user_id", is_partial_fit=True),
        "UserCountBinningEncoder": UserCountBinningEncoder(
            is_partial_fit=True),
    })

    # Target encodings built on the binned user count.
    factories["user_count_bin"] = {
        "TargetEncoder": TargetEncoder(column="user_count_bin"),
    }
    for other in ("user_id", "content_id"):
        factories[(other, "user_count_bin")] = {
            "TargetEncoder": TargetEncoder(column=[other, "user_count_bin"]),
        }

    factories[("user_id", "part")] = {
        "UserContentRateEncoder": UserContentRateEncoder(
            column=["user_id", "part"], rate_func="elo"),
    }

    # Two mean aggregations per group. The feature name embeds the key as
    # written (so a tuple key shows up with its tuple repr).
    for key in ("user_id", "content_id", "part", ("user_id", "part")):
        group = factories.setdefault(key, {})
        group[f"MeanAggregatorPriorQuestionElapsedTimeby{key}"] = MeanAggregator(
            column=column_arg(key),
            agg_column="prior_question_elapsed_time",
            remove_now=True)
        group[f"MeanAggregatorStudyTimeby{key}"] = MeanAggregator(
            column=column_arg(key),
            agg_column="study_time",
            remove_now=True)

    by_user["CategoryLevelEncoderPart"] = CategoryLevelEncoder(
        groupby_column="user_id", agg_column="part", categories=[2, 5])

    factories["prior_question_elapsed_time"] = {
        "PriorQuestionElapsedTimeBinningEncoder":
        PriorQuestionElapsedTimeBinningEncoder(is_partial_fit=True),
    }
    factories[("part", "prior_question_elapsed_time_bin")] = {
        "TargetEncoder": TargetEncoder(
            column=["part", "prior_question_elapsed_time_bin"]),
    }

    by_user.update({
        "PreviousAnswer2": PreviousAnswer2(
            groupby="user_id",
            column="content_id",
            is_debug=is_debug,
            model_id=model_id,
            n=500),
        "PreviousNAnsweredCorrectly": PreviousNAnsweredCorrectly(
            n=3, is_partial_fit=True),
    })

    factories["previous_3_ans"] = {
        "TargetEncoder": TargetEncoder(column="previous_3_ans"),
    }

    by_user.update({
        "QuestionLectureTableEncoder2": QuestionLectureTableEncoder2(
            model_id=model_id, is_debug=is_debug, past_n=100, min_size=100),
        "QuestionQuestionTableEncoder": QuestionQuestionTableEncoder(
            model_id=model_id, is_debug=is_debug, past_n=100, min_size=300),
        "UserContentRateEncoder": UserContentRateEncoder(
            column="user_id", rate_func="elo"),
    })

    factories["post"] = {
        "ContentIdTargetEncoderAggregator": TargetEncoderAggregator(),
    }

    # Feature caching (load and save) is switched off whenever is_debug is set.
    use_cache = not is_debug
    return FeatureFactoryManager(feature_factory_dict=factories,
                                 logger=log,
                                 split_num=split_num,
                                 model_id=model_id,
                                 load_feature=use_cache,
                                 save_feature=use_cache)
Ejemplo n.º 5
0
def make_feature_factory_manager(split_num, model_id=None):
    """Assemble the ``FeatureFactoryManager`` for this feature set.

    The manager is configured through a nested mapping of the form
    ``{group_key: {feature_name: encoder}}``. A group key is a single column
    name, a tuple of column names, or the literal ``"post"``.

    Args:
        split_num: Passed straight through to ``FeatureFactoryManager``.
        model_id: Passed to the manager and to every encoder that takes a
            ``model_id`` argument.

    Returns:
        The configured ``FeatureFactoryManager``.

    Note:
        ``is_debug`` is not a parameter; it is looked up in the enclosing
        scope. When it is truthy the manager is created with
        ``load_feature`` and ``save_feature`` set to ``False``.
    """
    log = get_logger()
    factories = {}

    for key in ("user_id", "content_id"):
        partial = key in ("content_id", "user_id")
        factories[key] = {
            "TargetEncoder": TargetEncoder(column=key,
                                           is_partial_fit=partial),
        }

    by_user = factories["user_id"]
    by_content = factories["content_id"]

    by_user.update({
        "DurationPreviousContent": DurationPreviousContent(
            is_partial_fit=True),
        "PastNTimestampEncoder": PastNFeatureEncoder(
            column="timestamp",
            past_ns=list(range(2, 11)),
            agg_funcs=["vslast"],
            remove_now=False),
        "StudyTermEncoder2": StudyTermEncoder2(is_partial_fit=True),
        "ElapsedTimeMeanByContentIdEncoder":
        ElapsedTimeMeanByContentIdEncoder(),
        "CountEncoder": CountEncoder(column="user_id", is_partial_fit=True),
    })

    factories[("user_id", "part")] = {
        "UserContentRateEncoder": UserContentRateEncoder(
            column=["user_id", "part"], rate_func="elo"),
    }
    by_user["PastNUserAnswerHistory"] = PastNUserAnswerHistory(past_n=2,
                                                               min_size=300)

    # Two mean aggregations per multi-column group. Every key here is a
    # tuple, so the encoders always get a list of column names, while the
    # feature name embeds the tuple repr of the key.
    explanation_groups = (
        ("user_id", "prior_question_had_explanation"),
        ("content_id", "prior_question_had_explanation"),
        ("part", "prior_question_had_explanation"),
        ("user_id", "part", "prior_question_had_explanation"),
    )
    for key in explanation_groups:
        group = factories.setdefault(key, {})
        group[f"MeanAggregatorShiftDiffTimeElapsedTimeby{key}"] = MeanAggregator(
            column=list(key),
            agg_column="duration_previous_content_cap100k",
            remove_now=False)
        group[f"MeanAggregatorStudyTimeby{key}"] = MeanAggregator(
            column=list(key),
            agg_column="study_time",
            remove_now=False)

    by_user.update({
        "CategoryLevelEncoderPart": CategoryLevelEncoder(
            groupby_column="user_id", agg_column="part", categories=[2, 5]),
        "UserContentNowRateEncoder": UserContentNowRateEncoder(
            column="part", target=list(range(1, 8)), rate_func="elo"),
        "PreviousAnswer2": PreviousAnswer2(
            groupby="user_id",
            column="content_id",
            is_debug=is_debug,
            model_id=model_id,
            n=300),
        "PreviousNAnsweredCorrectly": PreviousNAnsweredCorrectly(
            n=5, is_partial_fit=True),
    })

    factories["previous_5_ans"] = {
        "TargetEncoder": TargetEncoder(column="previous_5_ans"),
    }

    by_user.update({
        "QuestionLectureTableEncoder2": QuestionLectureTableEncoder2(
            model_id=model_id, is_debug=is_debug, past_n=100, min_size=300),
        "QuestionQuestionTableEncoder2": QuestionQuestionTableEncoder2(
            model_id=model_id, is_debug=is_debug, past_n=100, min_size=300),
        "UserContentRateEncoder": UserContentRateEncoder(
            column="user_id", rate_func="elo"),
    })

    # NOTE(review): the second entry is registered under "content_id" but
    # groups by "part"; kept exactly as in the original -- confirm intent.
    by_content.update({
        "CorrectVsIncorrectMeanEncoderContent-Duration100k":
        CorrectVsIncorrectMeanEncoder(
            groupby="content_id",
            column="duration_previous_content_cap100k",
            min_size=300),
        "CorrectVsIncorrectMeanEncoderContent-UserIdTargetEnc":
        CorrectVsIncorrectMeanEncoder(
            groupby="part",
            column="target_enc_user_id",
            min_size=300),
    })

    by_user["PreviousContentAnswerTargetEncoder"] = (
        PreviousContentAnswerTargetEncoder(min_size=300))
    factories["post"] = {
        "DurationFeaturePostProcess": DurationFeaturePostProcess(),
    }

    # Feature caching (load and save) is switched off whenever is_debug is set.
    use_cache = not is_debug
    return FeatureFactoryManager(feature_factory_dict=factories,
                                 logger=log,
                                 split_num=split_num,
                                 model_id=model_id,
                                 load_feature=use_cache,
                                 save_feature=use_cache)
Ejemplo n.º 6
0
def make_feature_factory_manager(split_num):
    """Assemble the feature-factory registry and wrap it in a manager.

    The registry maps a column name (or a tuple of column names) to a dict of
    named encoder/aggregator instances. Entries are inserted in a fixed
    sequence. NOTE(review): the manager may depend on that insertion order
    (the ``user_count_bin`` entries are registered after
    ``UserCountBinningEncoder``), so keep it intact when editing.

    Args:
        split_num: Forwarded unchanged to ``FeatureFactoryManager``.

    Returns:
        The configured ``FeatureFactoryManager``.
    """
    logger = get_logger()

    def as_encoder_column(key):
        # Tuple keys describe column combinations; encoders get them as lists.
        return key if isinstance(key, str) else list(key)

    def count_and_target(key, **encoder_kwargs):
        # Every "plain" entry carries the same Count/Target encoder pair.
        return {
            "CountEncoder":
            CountEncoder(column=as_encoder_column(key), **encoder_kwargs),
            "TargetEncoder":
            TargetEncoder(column=as_encoder_column(key), **encoder_kwargs),
        }

    registry = {}
    base_keys = [
        "content_id",
        "user_id",
        "prior_question_had_explanation",
        ("user_id", "part"),
        ("content_id", "prior_question_had_explanation"),
    ]
    for key in base_keys:
        # Only the two plain id columns are built with is_partial_fit=True.
        registry[key] = count_and_target(
            key, is_partial_fit=key in ("content_id", "user_id"))

    per_user = registry["user_id"]
    per_user["ShiftDiffEncoderTimestamp"] = ShiftDiffEncoder(
        groupby="user_id", column="timestamp", is_partial_fit=True)
    per_user["ShiftDiffEncoderContentId"] = ShiftDiffEncoder(
        groupby="user_id", column="content_id")
    for key in ("user_id", "content_id"):
        aggregator_name = f"MeanAggregatorPriorQuestionElapsedTimeby{key}"
        registry[key][aggregator_name] = MeanAggregator(
            column=key,
            agg_column="prior_question_elapsed_time",
            remove_now=True)

    per_user["UserLevelEncoder2ContentId"] = UserLevelEncoder2(
        vs_column="content_id")
    per_user["UserCountBinningEncoder"] = UserCountBinningEncoder(
        is_partial_fit=True)

    # Entries keyed on the binned user count use the encoders' default
    # is_partial_fit (the argument is deliberately not passed here).
    for key in [
            "user_count_bin",
            ("user_id", "user_count_bin"),
            ("content_id", "user_count_bin"),
            ("prior_question_had_explanation", "user_count_bin"),
    ]:
        registry[key] = count_and_target(key)

    per_user["CategoryLevelEncoderPart"] = CategoryLevelEncoder(
        groupby_column="user_id", agg_column="part", categories=[2, 5])
    registry["user_count_bin"][
        "CategoryLevelEncoderUserCountBin"] = CategoryLevelEncoder(
            groupby_column="user_id",
            agg_column="user_count_bin",
            categories=[0])

    registry["prior_question_elapsed_time"] = {
        "PriorQuestionElapsedTimeBinningEncoder":
        PriorQuestionElapsedTimeBinningEncoder()
    }
    elapsed_bin_key = ("part", "prior_question_elapsed_time_bin")
    registry[elapsed_bin_key] = count_and_target(elapsed_bin_key)
    registry[("user_id", "content_id")] = {
        "PreviousAnswer2": PreviousAnswer2(column=["user_id", "content_id"])
    }

    return FeatureFactoryManager(feature_factory_dict=registry,
                                 logger=logger,
                                 split_num=split_num)
Ejemplo n.º 7
0
        "prior_question_had_explanation"].fillna(-1).astype("int8")
    logger = get_logger()
    feature_factory_dict = {}
    feature_factory_dict["tags"] = {"TagsSeparator": TagsSeparator()}
    for column in [
            "content_id", "user_id", "content_type_id",
            "prior_question_had_explanation", "tags1", "tags2", "tags3",
            "tags4", "tags5", "tags6", ("user_id", "content_type_id"),
        ("user_id", "prior_question_had_explanation")
    ]:
        is_partial_fit = column == "content_id"

        if type(column) == str:
            feature_factory_dict[column] = {
                "CountEncoder":
                CountEncoder(column=column),
                "TargetEncoder":
                TargetEncoder(column=column, is_partial_fit=is_partial_fit)
            }
        else:
            feature_factory_dict[column] = {
                "CountEncoder":
                CountEncoder(column=list(column)),
                "TargetEncoder":
                TargetEncoder(column=list(column),
                              is_partial_fit=is_partial_fit)
            }

    for column in [
            "part", ("user_id", "tag"), ("user_id", "part"),
        ("content_type_id", "part")
Ejemplo n.º 8
0
    df["answered_correctly"] = df["answered_correctly"].replace(-1, np.nan)
    df["prior_question_had_explanation"] = df["prior_question_had_explanation"].fillna(-1).astype("int8")
    logger = get_logger()
    feature_factory_dict = {}
    feature_factory_dict["tags"] = {
        "TagsSeparator": TagsSeparator()
    }
    for column in ["content_id", "user_id", "part", "prior_question_had_explanation",
                   "tags1", "tags2",
                   ("user_id", "prior_question_had_explanation"), ("user_id", "part"),
                   ("content_id", "prior_question_had_explanation")]:
        is_partial_fit = (column == "content_id" or column == "user_id")

        if type(column) == str:
            feature_factory_dict[column] = {
                "CountEncoder": CountEncoder(column=column, is_partial_fit=is_partial_fit),
                "TargetEncoder": TargetEncoder(column=column, is_partial_fit=is_partial_fit)
            }
        else:
            feature_factory_dict[column] = {
                "CountEncoder": CountEncoder(column=list(column), is_partial_fit=is_partial_fit),
                "TargetEncoder": TargetEncoder(column=list(column), is_partial_fit=is_partial_fit)
            }
    feature_factory_dict["user_id"]["ShiftDiffEncoderTimestamp"] = ShiftDiffEncoder(groupby="user_id",
                                                                                    column="timestamp",
                                                                                    is_partial_fit=True)
    feature_factory_dict["user_id"]["ShiftDiffEncoderContentId"] = ShiftDiffEncoder(groupby="user_id",
                                                                                    column="content_id")
    for column in ["user_id", "content_id"]:
        feature_factory_dict[column][f"MeanAggregatorPriorQuestionElapsedTimeby{column}"] = MeanAggregator(column=column,
                                                                                                           agg_column="prior_question_elapsed_time",
Ejemplo n.º 9
0
def make_feature_factory_manager(split_num, model_id=None):
    """Assemble the feature-factory registry and wrap it in a manager.

    The registry maps a column name (or a tuple of column names) to a dict of
    named encoder/aggregator instances. Entries are inserted in a fixed
    sequence. NOTE(review): the manager may depend on that insertion order,
    so keep it intact when editing.

    ``is_debug`` is not a parameter of this function; it is resolved from the
    enclosing scope. NOTE(review): confirm it is defined at module level.

    Args:
        split_num: Forwarded unchanged to ``FeatureFactoryManager``.
        model_id: Forwarded to ``PreviousAnswer2``,
            ``QuestionLectureTableEncoder2`` and ``FeatureFactoryManager``.

    Returns:
        The configured ``FeatureFactoryManager``.
    """
    logger = get_logger()

    def as_encoder_column(key):
        # Tuple keys describe column combinations; encoders get them as lists.
        return key if isinstance(key, str) else list(key)

    registry = {}
    for key in [
            "user_id",
            "content_id",
            "part",
            ("user_id", "prior_question_had_explanation"),
            ("user_id", "part"),
            ("content_id", "prior_question_had_explanation"),
    ]:
        # Only the two plain id columns are built with is_partial_fit=True.
        partial = key in ("content_id", "user_id")
        registry[key] = {
            "CountEncoder":
            CountEncoder(column=as_encoder_column(key),
                         is_partial_fit=partial),
            "TargetEncoder":
            TargetEncoder(column=as_encoder_column(key),
                          is_partial_fit=partial),
        }

    per_user = registry["user_id"]
    per_content = registry["content_id"]

    per_user["ShiftDiffEncoderTimestamp"] = ShiftDiffEncoder(
        groupby="user_id", column="timestamp", is_partial_fit=True)
    for key in ("user_id", "content_id"):
        aggregator_name = f"MeanAggregatorPriorQuestionElapsedTimeby{key}"
        registry[key][aggregator_name] = MeanAggregator(
            column=key,
            agg_column="prior_question_elapsed_time",
            remove_now=True)
    per_content["MeanAggregatorShiftDiffTimestamp"] = MeanAggregator(
        column="content_id",
        agg_column="shiftdiff_timestamp_by_user_id",
        remove_now=False)
    for agg_column in ("target_enc_user_id", "prior_question_elapsed_time"):
        per_user[
            f"MeanAggregatorContentIdUserAnswer{agg_column}"] = MeanAggregator2(
                column=["content_id", "user_answer"],
                agg_column=agg_column,
                remove_now=True)

    per_user["UserLevelEncoder2ContentId"] = UserLevelEncoder2(
        vs_column="content_id")
    per_content["ContentLevelEncoder2UserId"] = ContentLevelEncoder(
        vs_column="user_id", is_partial_fit=True)
    per_user["MeanAggregatorContentLevel"] = MeanAggregator(
        column="user_id",
        agg_column="content_level_user_id",
        remove_now=False)
    # Replaces the CountEncoder instance registered for user_id by the loop
    # above; the key keeps its original position in the dict.
    per_user["CountEncoder"] = CountEncoder(column="user_id",
                                            is_partial_fit=True)
    per_user["UserCountBinningEncoder"] = UserCountBinningEncoder(
        is_partial_fit=True)

    # Entries keyed on the binned user count only get a TargetEncoder.
    for key in [
            "user_count_bin",
            ("user_id", "user_count_bin"),
            ("content_id", "user_count_bin"),
            ("prior_question_had_explanation", "user_count_bin"),
    ]:
        registry[key] = {
            "TargetEncoder": TargetEncoder(column=as_encoder_column(key))
        }

    per_user["CategoryLevelEncoderPart"] = CategoryLevelEncoder(
        groupby_column="user_id", agg_column="part", categories=[2, 5])
    per_user["FirstColumnEncoderContentId"] = FirstColumnEncoder(
        agg_column="content_id", astype="int16", is_partial_fit=True)
    per_user["FirstColumnEncoderPart"] = FirstColumnEncoder(
        agg_column="part", astype="int8", is_partial_fit=True)

    # (registry key, aggregated column) pairs. The last pair aggregates
    # target_enc_user_id but is still registered under the
    # "...TargetEncContentIdBy..." name; the name is kept as-is.
    mean_aggregator_plan = [
        ("user_id", "target_enc_content_id"),
        ("user_count_bin", "target_enc_content_id"),
        ("first_column_content_id", "target_enc_content_id"),
        ("first_column_part", "target_enc_content_id"),
        (("user_id", "part"), "target_enc_content_id"),
        (("content_id", "prior_question_had_explanation"),
         "target_enc_user_id"),
    ]
    for key, agg_column in mean_aggregator_plan:
        group = registry.setdefault(key, {})
        group[f"MeanAggregatorTargetEncContentIdBy{key}"] = MeanAggregator(
            column=as_encoder_column(key),
            agg_column=agg_column,
            remove_now=False)

    registry["prior_question_elapsed_time"] = {
        "PriorQuestionElapsedTimeBinningEncoder":
        PriorQuestionElapsedTimeBinningEncoder(is_partial_fit=True)
    }
    registry[("part", "prior_question_elapsed_time_bin")] = {
        "TargetEncoder":
        TargetEncoder(column=["part", "prior_question_elapsed_time_bin"])
    }
    per_user["PreviousAnswer2"] = PreviousAnswer2(groupby="user_id",
                                                  column="content_id",
                                                  is_debug=is_debug,
                                                  model_id=model_id,
                                                  n=1000)
    per_user["PreviousNAnsweredCorrectly"] = PreviousNAnsweredCorrectly(
        n=3, is_partial_fit=True)

    registry["previous_3_ans"] = {
        "TargetEncoder": TargetEncoder(column="previous_3_ans")
    }
    per_user["QuestionLectureTableEncoder2"] = QuestionLectureTableEncoder2(
        model_id=model_id, is_debug=is_debug, past_n=100)
    registry["post"] = {
        "ContentIdTargetEncoderAggregator": TargetEncoderAggregator()
    }

    # Feature loading/saving is switched off whenever is_debug is truthy.
    return FeatureFactoryManager(feature_factory_dict=registry,
                                 logger=logger,
                                 split_num=split_num,
                                 model_id=model_id,
                                 load_feature=not is_debug,
                                 save_feature=not is_debug)
Ejemplo n.º 10
0
def run(debug, model_dir, kaggle=False):
    """Fit the feature factories on the split files, then serve predictions.

    Steps, as implemented below:
      1. load every pickled model matching ``{model_dir}/*model*.pickle``;
      2. build a ``FeatureFactoryManager`` and ``fit`` it on each split file;
      3. iterate over ``env.iter_test()``: feed the previous batch (completed
         with the answers revealed by the current batch) back into ``fit``,
         compute features with ``partial_predict``, average the predictions
         of all models and submit them through ``env.predict``.

    Args:
        debug: When truthy, only the first 1000 rows of each split file are
            used for fitting.
        model_dir: Directory searched for ``*model*.pickle`` files.
        kaggle: Selects the Kaggle input path for the split files instead of
            the local relative path.
    """
    if kaggle:
        files_dir = "/kaggle/input/riiid-split10/*.pickle"
    else:
        files_dir = "../input/riiid-test-answer-prediction/split10/*.pickle"

    # Characters stripped from the stringified answer lists before splitting.
    strip_table = str.maketrans("", "", "[]' ")

    def parse_int_list(text):
        # Turns a string such as "[1, 0, 1]" into [1, 0, 1].
        return [int(token) for token in text.translate(strip_table).split(",")]

    logger = get_logger()
    # environment
    env = riiideducation.make_env()

    # model loading
    # NOTE: pickle.load can execute arbitrary code; model_dir must only
    # contain trusted files.
    models = []
    for model_path in glob.glob(f"{model_dir}/*model*.pickle"):
        with open(model_path, "rb") as f:
            models.append(pickle.load(f))

    # data preprocessing
    feature_factory_dict = {}
    for column in [
            "user_id", "content_id", "content_type_id",
            "prior_question_had_explanation"
    ]:
        feature_factory_dict[column] = {
            "CountEncoder": CountEncoder(column=column),
            "TargetEncoder": TargetEncoder(column=column)
        }
    feature_factory_dict["user_id"][
        "MeanAggregatorTimestamp"] = MeanAggregator(column="user_id",
                                                    agg_column="timestamp",
                                                    remove_now=False)
    for column in ["user_id", "content_id"]:
        feature_factory_dict[column][
            "MeanAggregatorPriorQuestionElapsedTime"] = MeanAggregator(
                column=column,
                agg_column="prior_question_elapsed_time",
                remove_now=True)
    for column in [("user_id", "content_type_id"),
                   ("user_id", "prior_question_had_explanation")]:
        feature_factory_dict[column] = {
            "CountEncoder": CountEncoder(column=list(column)),
            "TargetEncoder": TargetEncoder(column=list(column))
        }
    feature_factory_manager = FeatureFactoryManager(
        feature_factory_dict=feature_factory_dict, logger=logger)

    for fname in glob.glob(files_dir):
        logger.info(f"loading... {fname}")
        df = pd.read_pickle(fname)
        df["prior_question_had_explanation"] = df[
            "prior_question_had_explanation"].astype("float16").fillna(
                -1).astype("int8")

        if debug:
            df = df.head(1000)
        feature_factory_manager.fit(df)

    iter_test = env.iter_test()
    df_test_prev = pd.DataFrame()
    i = 0
    t = time.time()
    for (df_test, df_sample_prediction) in iter_test:
        i += 1
        logger.info(
            f"[time: {int(time.time() - t)}iteration {i}: data_length: {len(df_test)}"
        )
        # Update the factories with the previous batch: its answers are
        # carried by the first row of the current batch.
        if len(df_test_prev) > 0:
            first_row = df_test.iloc[0]
            df_test_prev["answered_correctly"] = parse_int_list(
                first_row["prior_group_answers_correct"])
            df_test_prev["user_answer"] = parse_int_list(
                first_row["prior_group_responses"])

            feature_factory_manager.fit(df_test_prev)

        # Fetch the current batch and compute its features.
        logger.info("transform... ")
        df_test["prior_question_had_explanation"] = df_test[
            "prior_question_had_explanation"].astype("float16").fillna(
                -1).astype("int8")

        df = feature_factory_manager.partial_predict(df_test)
        logger.info("other... ")
        # Features the models expect but the factories did not produce are
        # filled with a sentinel value.
        cols = models[0].feature_name()
        for col in cols:
            if col not in df.columns:
                df[col] = -99999

        # predict
        predicts = [model.predict(df[cols]) for model in models]

        df["answered_correctly"] = np.array(predicts).transpose().mean(axis=1)
        df_sample_prediction = pd.merge(df_sample_prediction[["row_id"]],
                                        df[["row_id", "answered_correctly"]],
                                        how="inner")
        env.predict(df_sample_prediction)
        # Explicit copy: this frame gets new columns assigned on the next
        # iteration, which must not be a chained assignment on a selection
        # of ``df``.
        df_test_prev = df[cols + ["user_id"]].copy()

        df_test_prev.to_csv(f"{i}.csv")
Ejemplo n.º 11
0
    def test_interval1(self):
        """Step-wise features must equal the batch ``all_predict`` features.

        The manager is fitted on the first four rows, the remaining rows are
        served one step at a time through ``EnvironmentManager`` (environment
        ``interval=1``, ``fit_interval=1``), and the frames returned by the
        steps are compared with ``all_predict`` over the merged frame for the
        same rows.
        """
        logger = get_logger()

        interactions = {
            "row_id": [0, 1, 2, 3, 4, 5, 6, 7],
            "user_id": ["a", "a", "a", "b", "a", "b", "a", "b"],
            "timestamp": [0, 1, 2, 3, 4, 5, 6, 7],
            "content_id": [0, 0, 1, 1, 0, 0, 1, 1],
            "content_type_id": [0, 1, 0, 0, 0, 0, 0, 1],
            "user_answer": [0, 1, 2, 3, 4, 5, 6, 7],
            "answered_correctly": [0, -1, 1, -1, 0, 0, 1, -1],
            "prior_question_had_explanation": [0, 0, 0, 0, 0, 0, 0, 0],
        }
        df = pd.DataFrame(interactions).sort_values(["user_id", "timestamp"])
        df_question = pd.DataFrame({
            "question_id": [0, 1],
            "bundle_id": [0, 1],
            "correct_answer": [0, 1],
            "part": [0, 1],
            "tags": ["0", "1"],
        })
        df_lecture = pd.DataFrame({
            "lecture_id": [0, 1],
            "tag": [0, 1],
            "part": [0, 1],
            "type_of": ["0", "1"],
        })
        user_factories = {
            "CountEncoder": CountEncoder(column="user_id"),
            "TargetEncoder": TargetEncoder(column="user_id"),
        }
        feature_factory_dict = {"user_id": user_factories}

        # Rows from position 4 onward (after sorting) play the test stream.
        print(df.iloc[4:])
        gen = MyEnvironment(df_test=df.iloc[4:], interval=1).iter_test()
        feature_factory_manager = FeatureFactoryManager(
            feature_factory_dict=feature_factory_dict, logger=logger)

        env_manager = EnvironmentManager(
            feature_factory_manager=feature_factory_manager,
            gen=gen,
            fit_interval=1,
            df_question=df_question,
            df_lecture=df_lecture)

        # Expected frame: join the question/lecture metadata by hand and run
        # the batch prediction over everything.
        question_rows = pd.merge(df[df["content_type_id"] == 0],
                                 df_question,
                                 how="left",
                                 left_on="content_id",
                                 right_on="question_id")
        lecture_rows = pd.merge(df[df["content_type_id"] == 1],
                                df_lecture,
                                how="left",
                                left_on="content_id",
                                right_on="lecture_id")
        df2 = pd.concat([question_rows,
                         lecture_rows]).sort_values(["user_id", "timestamp"])
        df_expect = feature_factory_manager.all_predict(df2).iloc[4:]
        for metadata_column in ("tag", "correct_answer", "bundle_id"):
            df_expect[metadata_column] = df_expect[metadata_column].fillna(-1)
        explanation = df_expect["prior_question_had_explanation"]
        df_expect["prior_question_had_explanation"] = explanation.astype(
            "float16").fillna(-1).astype("int8")
        df_expect.columns = [
            name.replace(" ", "_") for name in df_expect.columns
        ]

        # Actual frame: fit on the first four rows, then step until the
        # environment manager signals exhaustion by returning None.
        df_actual = pd.DataFrame()
        feature_factory_manager.fit(df2.iloc[:4])
        step_result = env_manager.step()
        while step_result is not None:
            # The second element (submission frame) is not checked here.
            df_test, _df_sub = step_result[0], step_result[1]
            df_actual = pd.concat([df_actual, df_test], axis=0)
            step_result = env_manager.step()

        pd.testing.assert_frame_equal(df_expect.reset_index(drop=True),
                                      df_actual.reset_index(drop=True),
                                      check_dtype=False)
Ejemplo n.º 12
0
def run(debug, model_dir, kaggle=False):
    """Fit the feature factories on every split file and pickle the manager.

    The function creates the competition environment, loads the question and
    lecture metadata, loads the pickled models found in ``model_dir``, builds
    a ``FeatureFactoryManager``, calls ``fit(..., is_first_fit=True)`` on
    each split file (rows with ``answered_correctly == -1`` removed, metadata
    joined in) and finally writes the fitted manager to
    ``feature_factory_manager.pickle`` in the current directory.

    Args:
        debug: When truthy, only the first 1000 rows of each split file are
            used for fitting.
        model_dir: Directory searched for ``*model*.pickle`` files.
        kaggle: Selects the Kaggle input path for the split files instead of
            the local relative path.

    Returns:
        None.
    """
    if kaggle:
        files_dir = "/kaggle/input/riiid-split10/*.pickle"
    else:
        files_dir = "../input/riiid-test-answer-prediction/split10_base/*.pickle"

    def as_encoder_column(key):
        # Tuple keys describe column combinations; encoders get them as lists.
        return key if isinstance(key, str) else list(key)

    def count_and_target(key, **encoder_kwargs):
        # Every "plain" entry carries the same Count/Target encoder pair.
        return {
            "CountEncoder":
            CountEncoder(column=as_encoder_column(key), **encoder_kwargs),
            "TargetEncoder":
            TargetEncoder(column=as_encoder_column(key), **encoder_kwargs),
        }

    logger = get_logger()
    # environment (created but not used further in this function)
    env = riiideducation.make_env()

    df_question = pd.read_csv(
        "../input/riiid-test-answer-prediction/questions.csv",
        dtype={
            "bundle_id": "int32",
            "question_id": "int32",
            "correct_answer": "int8",
            "part": "int8"
        })
    df_lecture = pd.read_csv(
        "../input/riiid-test-answer-prediction/lectures.csv",
        dtype={
            "lecture_id": "int32",
            "tag": "int16",
            "part": "int8"
        })
    # model loading (the models are loaded but not used further here)
    # NOTE: pickle.load can execute arbitrary code; model_dir must only
    # contain trusted files.
    models = []
    for model_path in glob.glob(f"{model_dir}/*model*.pickle"):
        with open(model_path, "rb") as f:
            models.append(pickle.load(f))

    # data preprocessing
    feature_factory_dict = {}
    feature_factory_dict["tags"] = {"TagsSeparator": TagsSeparator()}
    for column in [
            "content_id", "user_id", "part", "prior_question_had_explanation",
            "tags1", "tags2", ("user_id", "prior_question_had_explanation"),
        ("user_id", "part")
    ]:
        # Only the two plain id columns are built with is_partial_fit=True.
        feature_factory_dict[column] = count_and_target(
            column, is_partial_fit=column in ("content_id", "user_id"))
    feature_factory_dict["user_id"][
        "ShiftDiffEncoderTimestamp"] = ShiftDiffEncoder(groupby="user_id",
                                                        column="timestamp",
                                                        is_partial_fit=True)
    feature_factory_dict["user_id"][
        "ShiftDiffEncoderContentId"] = ShiftDiffEncoder(groupby="user_id",
                                                        column="content_id")
    for column in ["user_id", "content_id"]:
        feature_factory_dict[column][
            f"MeanAggregatorPriorQuestionElapsedTimeby{column}"] = MeanAggregator(
                column=column,
                agg_column="prior_question_elapsed_time",
                remove_now=True)

    feature_factory_dict["user_id"][
        "UserLevelEncoder2ContentId"] = UserLevelEncoder2(
            vs_column="content_id")
    feature_factory_dict["user_id"][
        "UserCountBinningEncoder"] = UserCountBinningEncoder(
            is_partial_fit=True)
    # Entries keyed on the binned user count use the encoders' default
    # is_partial_fit (the argument is deliberately not passed here).
    for column in [
            "user_count_bin",
            ("user_id", "user_count_bin"),
            ("content_id", "user_count_bin"),
    ]:
        feature_factory_dict[column] = count_and_target(column)

    feature_factory_dict["user_id"][
        "CategoryLevelEncoderPart"] = CategoryLevelEncoder(
            groupby_column="user_id",
            agg_column="part",
            categories=[1, 2, 3, 4, 5, 6, 7])
    feature_factory_dict["user_count_bin"]["CategoryLevelEncoderUserCountBin"] = \
        CategoryLevelEncoder(groupby_column="user_id",
                             agg_column="user_count_bin",
                             categories=[0, 1, 2, 3, 4, 5])
    feature_factory_manager = FeatureFactoryManager(
        feature_factory_dict=feature_factory_dict, logger=logger)
    for fname in glob.glob(files_dir):
        logger.info(f"loading... {fname}")
        df = pd.read_pickle(fname)
        # Explicit copy: a column is assigned right below, which must not be
        # a chained assignment on a filtered selection of the loaded frame.
        df = df[df["answered_correctly"] != -1].copy()
        df["prior_question_had_explanation"] = df[
            "prior_question_had_explanation"].fillna(-1).astype("int8")
        if debug:
            df = df.head(1000)
        # Join question metadata onto content_type_id == 0 rows and lecture
        # metadata onto content_type_id == 1 rows, then restore the
        # per-user chronological order.
        df = pd.concat([
            pd.merge(df[df["content_type_id"] == 0],
                     df_question,
                     how="left",
                     left_on="content_id",
                     right_on="question_id"),
            pd.merge(df[df["content_type_id"] == 1],
                     df_lecture,
                     how="left",
                     left_on="content_id",
                     right_on="lecture_id")
        ]).sort_values(["user_id", "timestamp"])
        feature_factory_manager.fit(df, is_first_fit=True)

    # Detach every logger before serialising the manager.
    # NOTE(review): presumably done so the pickle does not carry logger
    # objects; confirm whether consumers re-attach a logger after loading.
    for dicts in feature_factory_manager.feature_factory_dict.values():
        for factory in dicts.values():
            factory.logger = None
    feature_factory_manager.logger = None
    with open("feature_factory_manager.pickle", "wb") as f:
        pickle.dump(feature_factory_manager, f)
    return
Ejemplo n.º 13
0
from sklearn.model_selection import KFold
from datetime import datetime as dt
import os
import glob

# Experiment ex_006: the output directory name carries the launch timestamp.
output_dir = f"../output/ex_006/{dt.now().strftime('%Y%m%d%H%M%S')}/"

# Every pickle under split10 is handled on its own: get_logger() is called
# again and a brand-new set of encoders/aggregators is built for each file.
for fname in glob.glob("../input/riiid-test-answer-prediction/split10/*"):
    print(fname)
    df = pd.read_pickle(fname)
    # Missing flags become -1 before the column is cast to int8.
    df["prior_question_had_explanation"] = (
        df["prior_question_had_explanation"].fillna(-1).astype("int8")
    )
    logger = get_logger()

    # One count encoder and one target encoder per key column.
    feature_factory_dict = {}
    for column in [
        "user_id",
        "content_id",
        "content_type_id",
        "prior_question_had_explanation",
    ]:
        feature_factory_dict[column] = {
            "CountEncoder": CountEncoder(column=column),
            "TargetEncoder": TargetEncoder(column=column),
        }

    # Mean aggregators registered under the user_id group ...
    feature_factory_dict["user_id"].update({
        "MeanAggregatorTimestamp": MeanAggregator(
            column="user_id",
            agg_column="timestamp",
            remove_now=False,
        ),
        "MeanAggregatorPriorQuestionElapsedTime": MeanAggregator(
            column="user_id",
            agg_column="prior_question_elapsed_time",
            remove_now=True,
        ),
    })
    # ... and under the content_id group.
    feature_factory_dict["content_id"].update({
        "MeanAggregatorPriorQuestionElapsedTime": MeanAggregator(
            column="content_id",
            agg_column="prior_question_elapsed_time",
            remove_now=True,
        ),
    })

    # Hand the whole registry to the manager and let it process this file.
    feature_factory_manager = FeatureFactoryManager(
        feature_factory_dict=feature_factory_dict, logger=logger
    )
    df = feature_factory_manager.all_predict(df)
Ejemplo n.º 14
0
 # Prepare `df` and build the feature-factory registry for this experiment.
 # NOTE(review): `df` and `fname` are bound outside this excerpt; the
 # enclosing loop/function header is not visible here.
 #
 # Disabled debug shortcut: would keep only the first and last 500 rows.
 # df = pd.concat([pd.read_pickle(fname).head(500), pd.read_pickle(fname).tail(500)])
 # Turn the -1 marker in the target column into NaN.
 df["answered_correctly"] = df["answered_correctly"].replace(-1, np.nan)
 # Missing flags become -1 before the column is cast to int8.
 df["prior_question_had_explanation"] = df[
     "prior_question_had_explanation"].fillna(-1).astype("int8")
 logger = get_logger()
 feature_factory_dict = {}
 # NOTE(review): presumably TagsSeparator yields the tags1..tags6 columns
 # encoded below -- confirm against its implementation.
 feature_factory_dict["tags"] = {"TagsSeparator": TagsSeparator()}
 # Each listed column gets a CountEncoder and a TargetEncoder.
 for column in [
         "user_id", "content_id", "content_type_id",
         "prior_question_had_explanation", "part", "tags1", "tags2",
         "tags3", "tags4", "tags5", "tags6"
 ]:
     # Only the content_id target encoder is created with is_partial_fit=True.
     is_partial_fit = column == "content_id"
     feature_factory_dict[column] = {
         "CountEncoder":
         CountEncoder(column=column),
         "TargetEncoder":
         TargetEncoder(column=column, is_partial_fit=is_partial_fit)
     }
 # Additional factories registered under the user_id group: two mean
 # aggregators (timestamp with remove_now=False, prior_question_elapsed_time
 # with remove_now=True) ...
 feature_factory_dict["user_id"][
     "MeanAggregatorTimestamp"] = MeanAggregator(column="user_id",
                                                 agg_column="timestamp",
                                                 remove_now=False)
 feature_factory_dict["user_id"][
     "MeanAggregatorPriorQuestionElapsedTime"] = MeanAggregator(
         column="user_id",
         agg_column="prior_question_elapsed_time",
         remove_now=True)
 # ... and a user-level encoder.
 # NOTE(review): meaning of initial_score / initial_weight is defined by
 # UserLevelEncoder, which is not visible here -- confirm before tuning.
 feature_factory_dict["user_id"]["UserLevelEncoder"] = UserLevelEncoder(
     initial_score=0.66, initial_weight=200)
 feature_factory_dict["content_id"][
Ejemplo n.º 15
0
    # df = pd.concat([pd.read_pickle(fname).head(500), pd.read_pickle(fname).tail(500)])
    df["answered_correctly"] = df["answered_correctly"].replace(-1, np.nan)
    df["prior_question_had_explanation"] = df["prior_question_had_explanation"].fillna(-1).astype("int8")
    logger = get_logger()
    feature_factory_dict = {}
    feature_factory_dict["tags"] = {
        "TagsSeparator": TagsSeparator()
    }
    for column in ["content_id", "user_id", "part", "prior_question_had_explanation",
                   "tags1", "tags2",
                   ("user_id", "prior_question_had_explanation"), ("user_id", "part")]:
        is_partial_fit = (column == "content_id" or column == "user_id")

        if type(column) == str:
            feature_factory_dict[column] = {
                "CountEncoder": CountEncoder(column=column, is_partial_fit=is_partial_fit),
                "TargetEncoder": TargetEncoder(column=column, is_partial_fit=is_partial_fit)
            }
        else:
            feature_factory_dict[column] = {
                "CountEncoder": CountEncoder(column=list(column), is_partial_fit=is_partial_fit),
                "TargetEncoder": TargetEncoder(column=list(column), is_partial_fit=is_partial_fit)
            }
    feature_factory_dict["user_id"]["ShiftDiffEncoderTimestamp"] = ShiftDiffEncoder(groupby="user_id",
                                                                                    column="timestamp",
                                                                                    is_partial_fit=True)
    feature_factory_dict["user_id"]["ShiftDiffEncoderContentId"] = ShiftDiffEncoder(groupby="user_id",
                                                                                    column="content_id")
    for column in ["user_id", "content_id"]:
        feature_factory_dict[column][f"MeanAggregatorPriorQuestionElapsedTimeby{column}"] = MeanAggregator(column=column,
                                                                                                           agg_column="prior_question_elapsed_time",