Exemple #1
0
def make_feature_factory_manager(split_num, model_id=None):
    """Assemble this model's feature factories and wrap them in a manager.

    Builds a nested dict ``{outer key: {factory name: factory instance}}`` and
    passes it to ``FeatureFactoryManager``. Outer keys are strings or tuples
    of strings; when a tuple key is used as a column spec it is handed to the
    factory as a list.

    Args:
        split_num: Forwarded unchanged to ``FeatureFactoryManager``.
        model_id: Forwarded to ``FeatureFactoryManager`` and to the factories
            that take a ``model_id`` (``PreviousAnswer2``,
            ``QuestionLectureTableEncoder2``, ``QuestionQuestionTableEncoder2``).

    Returns:
        The constructed ``FeatureFactoryManager``.

    Note:
        ``is_debug`` is not a parameter; it is read from the enclosing module
        scope. Feature loading/saving is enabled only when it is falsy.
    """
    logger = get_logger()

    # NOTE(review): registration order is kept deliberately. Some factories
    # aggregate columns that look like outputs of earlier ones (e.g.
    # "shiftdiff_timestamp_by_user_id_cap200k") - confirm whether
    # FeatureFactoryManager relies on dict insertion order.
    feature_factory_dict = {}

    for column in ["user_id", "content_id", ("last_lecture", "content_id")]:
        # Tuple keys never compare equal to the plain id strings, so only the
        # two single-column encoders are partially fitted.
        is_partial_fit = column in ("user_id", "content_id")
        column_arg = column if isinstance(column, str) else list(column)
        feature_factory_dict[column] = {
            "TargetEncoder":
            TargetEncoder(column=column_arg, is_partial_fit=is_partial_fit)
        }

    user_factories = feature_factory_dict["user_id"]
    user_factories["ShiftDiffEncoderTimestamp"] = ShiftDiffEncoder(
        groupby="user_id", column="timestamp", is_partial_fit=True)
    user_factories["PastNTimestampEncoder"] = PastNFeatureEncoder(
        column="timestamp",
        past_ns=[2, 3, 4, 5, 6, 7, 8, 9, 10],
        agg_funcs=["vslast"],
        remove_now=False)
    user_factories["Past1ContentTypeId"] = PastNFeatureEncoder(
        column="content_type_id",
        past_ns=[5, 15],
        agg_funcs=["mean"],
        remove_now=False)
    user_factories["StudyTermEncoder"] = StudyTermEncoder(is_partial_fit=True)
    user_factories[
        "ElapsedTimeVsShiftDiffEncoder"] = ElapsedTimeVsShiftDiffEncoder()
    user_factories["CountEncoder"] = CountEncoder(column="user_id",
                                                  is_partial_fit=True)
    feature_factory_dict[("user_id", "part")] = {
        "UserContentRateEncoder":
        UserContentRateEncoder(column=["user_id", "part"], rate_func="elo")
    }

    for column in ["user_id", "content_id", "part", ("user_id", "part")]:
        # Create the group on first use ("part" has no entry yet).
        factories = feature_factory_dict.setdefault(column, {})
        column_arg = column if isinstance(column, str) else list(column)
        # The key embeds the raw key object, so tuple keys render with their
        # tuple repr; this is kept as-is because the key is runtime data.
        factories[
            f"MeanAggregatorShiftDiffTimeElapsedTimeby{column}"] = MeanAggregator(
                column=column_arg,
                agg_column="shiftdiff_timestamp_by_user_id_cap200k",
                remove_now=False)
        factories[f"MeanAggregatorStudyTimeby{column}"] = MeanAggregator(
            column=column_arg, agg_column="study_time", remove_now=False)

    user_factories["CategoryLevelEncoderPart"] = CategoryLevelEncoder(
        groupby_column="user_id", agg_column="part", categories=[2, 5])

    user_factories["PreviousAnswer2"] = PreviousAnswer2(groupby="user_id",
                                                        column="content_id",
                                                        is_debug=is_debug,
                                                        model_id=model_id,
                                                        n=300)
    user_factories["PreviousNAnsweredCorrectly"] = PreviousNAnsweredCorrectly(
        n=5, is_partial_fit=True)

    feature_factory_dict["previous_5_ans"] = {
        "TargetEncoder": TargetEncoder(column="previous_5_ans")
    }
    user_factories[
        "QuestionLectureTableEncoder2"] = QuestionLectureTableEncoder2(
            model_id=model_id, is_debug=is_debug, past_n=100, min_size=300)
    user_factories[
        "QuestionQuestionTableEncoder2"] = QuestionQuestionTableEncoder2(
            model_id=model_id, is_debug=is_debug, past_n=100, min_size=300)
    user_factories["UserContentRateEncoder"] = UserContentRateEncoder(
        column="user_id", rate_func="elo")
    feature_factory_dict["post"] = {
        "ContentIdTargetEncoderAggregator": TargetEncoderAggregator()
    }

    return FeatureFactoryManager(feature_factory_dict=feature_factory_dict,
                                 logger=logger,
                                 split_num=split_num,
                                 model_id=model_id,
                                 load_feature=not is_debug,
                                 save_feature=not is_debug)
Exemple #2
0
def make_feature_factory_manager(split_num, model_id=None):
    """Assemble this model's feature factories and wrap them in a manager.

    Builds a nested dict ``{outer key: {factory name: factory instance}}`` and
    passes it to ``FeatureFactoryManager``. Outer keys are strings or tuples
    of strings; when a tuple key is used as a column spec it is handed to the
    factory as a list.

    Args:
        split_num: Forwarded unchanged to ``FeatureFactoryManager``.
        model_id: Forwarded to ``FeatureFactoryManager`` and to the factories
            that take a ``model_id`` (``PreviousAnswer2``,
            ``QuestionLectureTableEncoder2``).

    Returns:
        The constructed ``FeatureFactoryManager``.

    Note:
        ``is_debug`` is not a parameter; it is read from the enclosing module
        scope. Feature loading/saving is enabled only when it is falsy.
    """
    logger = get_logger()

    # NOTE(review): registration order is kept deliberately. Several factories
    # aggregate columns that look like outputs of earlier ones (e.g.
    # "target_enc_content_id", "user_count_bin") - confirm whether
    # FeatureFactoryManager relies on dict insertion order.
    feature_factory_dict = {}

    for column in [
            "user_id", "content_id", ("user_id", "part"),
        ("content_id", "prior_question_had_explanation")
    ]:
        # Tuple keys never compare equal to the plain id strings, so only the
        # two single-column encoders are partially fitted.
        is_partial_fit = column in ("user_id", "content_id")
        column_arg = column if isinstance(column, str) else list(column)
        feature_factory_dict[column] = {
            "TargetEncoder":
            TargetEncoder(column=column_arg, is_partial_fit=is_partial_fit)
        }

    user_factories = feature_factory_dict["user_id"]
    content_factories = feature_factory_dict["content_id"]

    user_factories["ShiftDiffEncoderTimestamp"] = ShiftDiffEncoder(
        groupby="user_id", column="timestamp")
    for column in ["user_id", "content_id"]:
        feature_factory_dict[column][
            f"MeanAggregatorPriorQuestionElapsedTimeby{column}"] = MeanAggregator(
                column=column,
                agg_column="prior_question_elapsed_time",
                remove_now=True)

    user_factories["UserLevelEncoder2ContentId"] = UserLevelEncoder2(
        vs_column="content_id")
    content_factories["ContentLevelEncoder2UserId"] = ContentLevelEncoder(
        vs_column="user_id", is_partial_fit=True)
    user_factories["MeanAggregatorContentLevel"] = MeanAggregator(
        column="user_id",
        agg_column="content_level_user_id",
        remove_now=False)
    user_factories["CountEncoder"] = CountEncoder(column="user_id",
                                                  is_partial_fit=True)
    user_factories["UserCountBinningEncoder"] = UserCountBinningEncoder(
        is_partial_fit=True)
    feature_factory_dict["user_count_bin"] = {
        "TargetEncoder": TargetEncoder(column="user_count_bin")
    }
    feature_factory_dict[("user_id", "user_count_bin")] = {
        "TargetEncoder": TargetEncoder(column=["user_id", "user_count_bin"])
    }
    feature_factory_dict[("content_id", "user_count_bin")] = {
        "TargetEncoder": TargetEncoder(column=["content_id", "user_count_bin"])
    }
    feature_factory_dict[("prior_question_had_explanation",
                          "user_count_bin")] = {
        "TargetEncoder":
        TargetEncoder(
            column=["prior_question_had_explanation", "user_count_bin"])
    }

    user_factories["CategoryLevelEncoderPart"] = CategoryLevelEncoder(
        groupby_column="user_id", agg_column="part", categories=[2, 5])
    user_factories["FirstColumnEncoderContentId"] = FirstColumnEncoder(
        agg_column="content_id", astype="int16", is_partial_fit=True)
    user_factories["FirstColumnEncoderPart"] = FirstColumnEncoder(
        agg_column="part", astype="int8", is_partial_fit=True)

    for column in [
            "user_id", "user_count_bin", "first_column_content_id",
            "first_column_part", ("user_id", "part")
    ]:
        # Create the group on first use (the first_column_* keys are new).
        factories = feature_factory_dict.setdefault(column, {})
        column_arg = column if isinstance(column, str) else list(column)
        # The key embeds the raw key object, so tuple keys render with their
        # tuple repr; this is kept as-is because the key is runtime data.
        factories[
            f"MeanAggregatorTargetEncContentIdBy{column}"] = MeanAggregator(
                column=column_arg,
                agg_column="target_enc_content_id",
                remove_now=False)

    # NOTE(review): the key below says "TargetEncContentId" while the
    # aggregated column is "target_enc_user_id". The key is left unchanged
    # because it is runtime data - confirm whether a rename is wanted.
    for column in [("content_id", "prior_question_had_explanation")]:
        factories = feature_factory_dict.setdefault(column, {})
        factories[
            f"MeanAggregatorTargetEncContentIdBy{column}"] = MeanAggregator(
                column=list(column),
                agg_column="target_enc_user_id",
                remove_now=False)

    feature_factory_dict["prior_question_elapsed_time"] = {
        "PriorQuestionElapsedTimeBinningEncoder":
        PriorQuestionElapsedTimeBinningEncoder(is_partial_fit=True)
    }
    feature_factory_dict[("part", "prior_question_elapsed_time_bin")] = {
        "TargetEncoder":
        TargetEncoder(column=["part", "prior_question_elapsed_time_bin"])
    }
    user_factories["PreviousAnswer2"] = PreviousAnswer2(groupby="user_id",
                                                        column="content_id",
                                                        is_debug=is_debug,
                                                        model_id=model_id,
                                                        n=1000)
    user_factories["PreviousNAnsweredCorrectly"] = PreviousNAnsweredCorrectly(
        n=3, is_partial_fit=True)

    feature_factory_dict["previous_3_ans"] = {
        "TargetEncoder": TargetEncoder(column="previous_3_ans")
    }
    user_factories[
        "QuestionLectureTableEncoder"] = QuestionLectureTableEncoder2(
            model_id=model_id, is_debug=is_debug, past_n=10)
    feature_factory_dict["post"] = {
        "ContentIdTargetEncoderAggregator": TargetEncoderAggregator()
    }

    return FeatureFactoryManager(feature_factory_dict=feature_factory_dict,
                                 logger=logger,
                                 split_num=split_num,
                                 model_id=model_id,
                                 load_feature=not is_debug,
                                 save_feature=not is_debug)
Exemple #3
0
def make_feature_factory_manager(split_num, size, window, model_id=None):
    """Assemble this model's feature factories and wrap them in a manager.

    Builds a nested dict ``{outer key: {factory name: factory instance}}`` and
    passes it to ``FeatureFactoryManager``. Outer keys are strings or tuples
    of strings; when a tuple key is used as a column spec it is handed to the
    factory as a list.

    Args:
        split_num: Forwarded unchanged to ``FeatureFactoryManager``.
        size: Not used inside this function; kept so existing callers still
            work.
        window: Not used inside this function; kept so existing callers still
            work.
        model_id: Forwarded to ``FeatureFactoryManager`` and to the factories
            that take a ``model_id`` (``PreviousAnswer2``,
            ``QuestionLectureTableEncoder2``, ``QuestionQuestionTableEncoder``).

    Returns:
        The constructed ``FeatureFactoryManager``.

    Note:
        ``is_debug`` is not a parameter; it is read from the enclosing module
        scope. Feature loading/saving is enabled only when it is falsy.
    """
    logger = get_logger()

    # NOTE(review): registration order is kept deliberately. Several factories
    # aggregate columns that look like outputs of earlier ones (e.g.
    # "study_time", "user_count_bin") - confirm whether FeatureFactoryManager
    # relies on dict insertion order.
    feature_factory_dict = {
        "tags": {
            "TagsSeparator": TagsSeparator(is_partial_fit=True)
        }
    }

    # Both id columns are plain strings and both are partially fitted.
    for column in ["user_id", "content_id"]:
        feature_factory_dict[column] = {
            "TargetEncoder": TargetEncoder(column=column, is_partial_fit=True)
        }

    user_factories = feature_factory_dict["user_id"]
    user_factories["ShiftDiffEncoderTimestamp"] = ShiftDiffEncoder(
        groupby="user_id", column="timestamp", is_partial_fit=True)
    user_factories["StudyTermEncoder"] = StudyTermEncoder(is_partial_fit=True)
    # UserLevelEncoder2ContentId, ContentLevelEncoder2UserId and
    # MeanAggregatorContentLevel are not registered in this variant (they
    # were previously present only as commented-out code).
    user_factories["CountEncoder"] = CountEncoder(column="user_id",
                                                  is_partial_fit=True)
    user_factories["UserCountBinningEncoder"] = UserCountBinningEncoder(
        is_partial_fit=True)
    feature_factory_dict["user_count_bin"] = {
        "TargetEncoder": TargetEncoder(column="user_count_bin")
    }
    feature_factory_dict[("user_id", "user_count_bin")] = {
        "TargetEncoder": TargetEncoder(column=["user_id", "user_count_bin"])
    }
    feature_factory_dict[("content_id", "user_count_bin")] = {
        "TargetEncoder": TargetEncoder(column=["content_id", "user_count_bin"])
    }
    feature_factory_dict[("user_id", "part")] = {
        "UserContentRateEncoder":
        UserContentRateEncoder(column=["user_id", "part"], rate_func="elo")
    }

    for column in ["user_id", "content_id", "part", ("user_id", "part")]:
        # Create the group on first use ("part" has no entry yet).
        factories = feature_factory_dict.setdefault(column, {})
        column_arg = column if isinstance(column, str) else list(column)
        # The key embeds the raw key object, so tuple keys render with their
        # tuple repr; this is kept as-is because the key is runtime data.
        factories[
            f"MeanAggregatorPriorQuestionElapsedTimeby{column}"] = MeanAggregator(
                column=column_arg,
                agg_column="prior_question_elapsed_time",
                remove_now=True)
        factories[f"MeanAggregatorStudyTimeby{column}"] = MeanAggregator(
            column=column_arg, agg_column="study_time", remove_now=True)

    user_factories["CategoryLevelEncoderPart"] = CategoryLevelEncoder(
        groupby_column="user_id", agg_column="part", categories=[2, 5])

    feature_factory_dict["prior_question_elapsed_time"] = {
        "PriorQuestionElapsedTimeBinningEncoder":
        PriorQuestionElapsedTimeBinningEncoder(is_partial_fit=True)
    }
    feature_factory_dict[("part", "prior_question_elapsed_time_bin")] = {
        "TargetEncoder":
        TargetEncoder(column=["part", "prior_question_elapsed_time_bin"])
    }
    user_factories["PreviousAnswer2"] = PreviousAnswer2(groupby="user_id",
                                                        column="content_id",
                                                        is_debug=is_debug,
                                                        model_id=model_id,
                                                        n=500)
    user_factories["PreviousNAnsweredCorrectly"] = PreviousNAnsweredCorrectly(
        n=3, is_partial_fit=True)

    feature_factory_dict["previous_3_ans"] = {
        "TargetEncoder": TargetEncoder(column="previous_3_ans")
    }
    user_factories[
        "QuestionLectureTableEncoder2"] = QuestionLectureTableEncoder2(
            model_id=model_id, is_debug=is_debug, past_n=100, min_size=100)
    user_factories[
        "QuestionQuestionTableEncoder"] = QuestionQuestionTableEncoder(
            model_id=model_id, is_debug=is_debug, past_n=100, min_size=300)
    user_factories["UserContentRateEncoder"] = UserContentRateEncoder(
        column="user_id", rate_func="elo")

    feature_factory_dict["post"] = {
        "ContentIdTargetEncoderAggregator": TargetEncoderAggregator()
    }

    return FeatureFactoryManager(feature_factory_dict=feature_factory_dict,
                                 logger=logger,
                                 split_num=split_num,
                                 model_id=model_id,
                                 load_feature=not is_debug,
                                 save_feature=not is_debug)
Exemple #4
0
def make_feature_factory_manager(split_num, model_id=None):
    """Assemble this model's feature factories and wrap them in a manager.

    Builds a nested dict ``{outer key: {factory name: factory instance}}`` and
    passes it to ``FeatureFactoryManager``. Outer keys are strings or tuples
    of strings; when a tuple key is used as a column spec it is handed to the
    factory as a list.

    Args:
        split_num: Forwarded unchanged to ``FeatureFactoryManager``.
        model_id: Forwarded to ``FeatureFactoryManager`` and to the factories
            that take a ``model_id`` (``PreviousAnswer2``,
            ``QuestionLectureTableEncoder2``, ``QuestionQuestionTableEncoder2``).

    Returns:
        The constructed ``FeatureFactoryManager``.

    Note:
        ``is_debug`` is not a parameter; it is read from the enclosing module
        scope. Feature loading/saving is enabled only when it is falsy.
    """
    logger = get_logger()

    # NOTE(review): registration order is kept deliberately. Several factories
    # consume columns that look like outputs of earlier ones (e.g.
    # "duration_previous_content_cap100k", "target_enc_user_id") - confirm
    # whether FeatureFactoryManager relies on dict insertion order.
    feature_factory_dict = {}

    # Both id columns are plain strings and both are partially fitted.
    for column in ["user_id", "content_id"]:
        feature_factory_dict[column] = {
            "TargetEncoder": TargetEncoder(column=column, is_partial_fit=True)
        }

    user_factories = feature_factory_dict["user_id"]
    content_factories = feature_factory_dict["content_id"]

    user_factories["DurationPreviousContent"] = DurationPreviousContent(
        is_partial_fit=True)
    user_factories["PastNTimestampEncoder"] = PastNFeatureEncoder(
        column="timestamp",
        past_ns=[2, 3, 4, 5, 6, 7, 8, 9, 10],
        agg_funcs=["vslast"],
        remove_now=False)
    user_factories["StudyTermEncoder2"] = StudyTermEncoder2(
        is_partial_fit=True)
    user_factories[
        "ElapsedTimeMeanByContentIdEncoder"] = ElapsedTimeMeanByContentIdEncoder(
        )
    user_factories["CountEncoder"] = CountEncoder(column="user_id",
                                                  is_partial_fit=True)
    feature_factory_dict[("user_id", "part")] = {
        "UserContentRateEncoder":
        UserContentRateEncoder(column=["user_id", "part"], rate_func="elo")
    }
    user_factories["PastNUserAnswerHistory"] = PastNUserAnswerHistory(
        past_n=2, min_size=300)

    # Every key in this loop is a tuple, so the column spec is always a list.
    for column in [("user_id", "prior_question_had_explanation"),
                   ("content_id", "prior_question_had_explanation"),
                   ("part", "prior_question_had_explanation"),
                   ("user_id", "part", "prior_question_had_explanation")]:
        factories = feature_factory_dict.setdefault(column, {})
        # The key embeds the tuple's repr; this is kept as-is because the key
        # is runtime data.
        factories[
            f"MeanAggregatorShiftDiffTimeElapsedTimeby{column}"] = MeanAggregator(
                column=list(column),
                agg_column="duration_previous_content_cap100k",
                remove_now=False)
        factories[f"MeanAggregatorStudyTimeby{column}"] = MeanAggregator(
            column=list(column), agg_column="study_time", remove_now=False)

    user_factories["CategoryLevelEncoderPart"] = CategoryLevelEncoder(
        groupby_column="user_id", agg_column="part", categories=[2, 5])
    user_factories["UserContentNowRateEncoder"] = UserContentNowRateEncoder(
        column="part", target=[1, 2, 3, 4, 5, 6, 7], rate_func="elo")
    user_factories["PreviousAnswer2"] = PreviousAnswer2(groupby="user_id",
                                                        column="content_id",
                                                        is_debug=is_debug,
                                                        model_id=model_id,
                                                        n=300)
    user_factories["PreviousNAnsweredCorrectly"] = PreviousNAnsweredCorrectly(
        n=5, is_partial_fit=True)

    feature_factory_dict["previous_5_ans"] = {
        "TargetEncoder": TargetEncoder(column="previous_5_ans")
    }
    user_factories[
        "QuestionLectureTableEncoder2"] = QuestionLectureTableEncoder2(
            model_id=model_id, is_debug=is_debug, past_n=100, min_size=300)
    user_factories[
        "QuestionQuestionTableEncoder2"] = QuestionQuestionTableEncoder2(
            model_id=model_id, is_debug=is_debug, past_n=100, min_size=300)
    user_factories["UserContentRateEncoder"] = UserContentRateEncoder(
        column="user_id", rate_func="elo")
    content_factories["CorrectVsIncorrectMeanEncoderContent-Duration100k"] = \
        CorrectVsIncorrectMeanEncoder(groupby="content_id",
                                      column="duration_previous_content_cap100k",
                                      min_size=300)
    # NOTE(review): this entry is registered under "content_id" and its key
    # says "Content", but it groups by "part" - confirm this is intended.
    content_factories["CorrectVsIncorrectMeanEncoderContent-UserIdTargetEnc"] = \
        CorrectVsIncorrectMeanEncoder(groupby="part",
                                      column="target_enc_user_id",
                                      min_size=300)

    user_factories[
        "PreviousContentAnswerTargetEncoder"] = PreviousContentAnswerTargetEncoder(
            min_size=300)
    feature_factory_dict["post"] = {
        "DurationFeaturePostProcess": DurationFeaturePostProcess()
    }
    return FeatureFactoryManager(feature_factory_dict=feature_factory_dict,
                                 logger=logger,
                                 split_num=split_num,
                                 model_id=model_id,
                                 load_feature=not is_debug,
                                 save_feature=not is_debug)