def make_feature_factory_manager(split_num, model_id=None):
    """Build the FeatureFactoryManager for the shift-diff / study-time feature set.

    The configuration is a two-level dict: the outer key is a column name (or
    a tuple of column names), the inner dict maps an encoder label to an
    encoder instance. Insertion order is kept exactly as written because the
    consumer may depend on it.

    Relies on a name ``is_debug`` that is not defined in this function; it
    must be resolvable at module scope when the function runs.

    Args:
        split_num: Forwarded unchanged to ``FeatureFactoryManager``.
        model_id: Forwarded to ``FeatureFactoryManager`` and to the encoders
            that accept a ``model_id`` argument.

    Returns:
        The configured ``FeatureFactoryManager``; feature loading and saving
        are both enabled only when ``is_debug`` is falsy.
    """

    def _column_arg(col):
        # Encoders receive a plain string for one column and a fresh list for
        # a multi-column (tuple) key.
        return col if isinstance(col, str) else list(col)

    logger = get_logger()

    feature_factory_dict = {}
    for column in ["user_id", "content_id", ("last_lecture", "content_id")]:
        feature_factory_dict[column] = {
            "TargetEncoder": TargetEncoder(
                column=_column_arg(column),
                # Only the single-column user_id / content_id encoders are
                # created with is_partial_fit=True.
                is_partial_fit=column in ("user_id", "content_id"),
            )
        }

    user_encoders = feature_factory_dict["user_id"]
    user_encoders["ShiftDiffEncoderTimestamp"] = ShiftDiffEncoder(
        groupby="user_id", column="timestamp", is_partial_fit=True)
    user_encoders["PastNTimestampEncoder"] = PastNFeatureEncoder(
        column="timestamp",
        past_ns=[2, 3, 4, 5, 6, 7, 8, 9, 10],
        agg_funcs=["vslast"],
        remove_now=False)
    user_encoders["Past1ContentTypeId"] = PastNFeatureEncoder(
        column="content_type_id",
        past_ns=[5, 15],
        agg_funcs=["mean"],
        remove_now=False)
    user_encoders["StudyTermEncoder"] = StudyTermEncoder(is_partial_fit=True)
    user_encoders["ElapsedTimeVsShiftDiffEncoder"] = ElapsedTimeVsShiftDiffEncoder()
    user_encoders["CountEncoder"] = CountEncoder(column="user_id",
                                                 is_partial_fit=True)

    feature_factory_dict[("user_id", "part")] = {
        "UserContentRateEncoder": UserContentRateEncoder(
            column=["user_id", "part"], rate_func="elo")
    }

    for column in ["user_id", "content_id", "part", ("user_id", "part")]:
        # "part" has no entry yet at this point; the other keys already exist.
        encoders = feature_factory_dict.setdefault(column, {})
        # The label interpolates the raw key, so a tuple key yields a label
        # containing the tuple's repr.
        encoders[f"MeanAggregatorShiftDiffTimeElapsedTimeby{column}"] = MeanAggregator(
            column=_column_arg(column),
            agg_column="shiftdiff_timestamp_by_user_id_cap200k",
            remove_now=False)
        encoders[f"MeanAggregatorStudyTimeby{column}"] = MeanAggregator(
            column=_column_arg(column),
            agg_column="study_time",
            remove_now=False)

    user_encoders["CategoryLevelEncoderPart"] = CategoryLevelEncoder(
        groupby_column="user_id", agg_column="part", categories=[2, 5])
    user_encoders["PreviousAnswer2"] = PreviousAnswer2(
        groupby="user_id",
        column="content_id",
        is_debug=is_debug,
        model_id=model_id,
        n=300)
    user_encoders["PreviousNAnsweredCorrectly"] = PreviousNAnsweredCorrectly(
        n=5, is_partial_fit=True)
    feature_factory_dict["previous_5_ans"] = {
        "TargetEncoder": TargetEncoder(column="previous_5_ans")
    }
    user_encoders["QuestionLectureTableEncoder2"] = QuestionLectureTableEncoder2(
        model_id=model_id, is_debug=is_debug, past_n=100, min_size=300)
    user_encoders["QuestionQuestionTableEncoder2"] = QuestionQuestionTableEncoder2(
        model_id=model_id, is_debug=is_debug, past_n=100, min_size=300)
    user_encoders["UserContentRateEncoder"] = UserContentRateEncoder(
        column="user_id", rate_func="elo")

    feature_factory_dict["post"] = {
        "ContentIdTargetEncoderAggregator": TargetEncoderAggregator()
    }

    return FeatureFactoryManager(
        feature_factory_dict=feature_factory_dict,
        logger=logger,
        split_num=split_num,
        model_id=model_id,
        load_feature=not is_debug,
        save_feature=not is_debug)
def make_feature_factory_manager(split_num, model_id=None):
    """Build the FeatureFactoryManager for the user-count-bin / level feature set.

    The configuration is a two-level dict: the outer key is a column name (or
    a tuple of column names), the inner dict maps an encoder label to an
    encoder instance. Insertion order is kept exactly as written because the
    consumer may depend on it.

    Relies on a name ``is_debug`` that is not defined in this function; it
    must be resolvable at module scope when the function runs.

    Args:
        split_num: Forwarded unchanged to ``FeatureFactoryManager``.
        model_id: Forwarded to ``FeatureFactoryManager`` and to the encoders
            that accept a ``model_id`` argument.

    Returns:
        The configured ``FeatureFactoryManager``; feature loading and saving
        are both enabled only when ``is_debug`` is falsy.
    """

    def _column_arg(col):
        # Encoders receive a plain string for one column and a fresh list for
        # a multi-column (tuple) key.
        return col if isinstance(col, str) else list(col)

    logger = get_logger()

    feature_factory_dict = {}
    for column in [
            "user_id",
            "content_id",
            ("user_id", "part"),
            ("content_id", "prior_question_had_explanation"),
    ]:
        feature_factory_dict[column] = {
            "TargetEncoder": TargetEncoder(
                column=_column_arg(column),
                # Only the single-column user_id / content_id encoders are
                # created with is_partial_fit=True.
                is_partial_fit=column in ("user_id", "content_id"),
            )
        }

    user_encoders = feature_factory_dict["user_id"]
    content_encoders = feature_factory_dict["content_id"]

    user_encoders["ShiftDiffEncoderTimestamp"] = ShiftDiffEncoder(
        groupby="user_id", column="timestamp")

    for column in ["user_id", "content_id"]:
        feature_factory_dict[column][
            f"MeanAggregatorPriorQuestionElapsedTimeby{column}"] = MeanAggregator(
                column=column,
                agg_column="prior_question_elapsed_time",
                remove_now=True)

    user_encoders["UserLevelEncoder2ContentId"] = UserLevelEncoder2(
        vs_column="content_id")
    content_encoders["ContentLevelEncoder2UserId"] = ContentLevelEncoder(
        vs_column="user_id", is_partial_fit=True)
    user_encoders["MeanAggregatorContentLevel"] = MeanAggregator(
        column="user_id",
        agg_column="content_level_user_id",
        remove_now=False)
    user_encoders["CountEncoder"] = CountEncoder(column="user_id",
                                                 is_partial_fit=True)
    user_encoders["UserCountBinningEncoder"] = UserCountBinningEncoder(
        is_partial_fit=True)

    feature_factory_dict["user_count_bin"] = {
        "TargetEncoder": TargetEncoder(column="user_count_bin")
    }
    feature_factory_dict[("user_id", "user_count_bin")] = {
        "TargetEncoder": TargetEncoder(column=["user_id", "user_count_bin"])
    }
    feature_factory_dict[("content_id", "user_count_bin")] = {
        "TargetEncoder": TargetEncoder(column=["content_id", "user_count_bin"])
    }
    feature_factory_dict[("prior_question_had_explanation", "user_count_bin")] = {
        "TargetEncoder": TargetEncoder(
            column=["prior_question_had_explanation", "user_count_bin"])
    }

    user_encoders["CategoryLevelEncoderPart"] = CategoryLevelEncoder(
        groupby_column="user_id", agg_column="part", categories=[2, 5])
    user_encoders["FirstColumnEncoderContentId"] = FirstColumnEncoder(
        agg_column="content_id", astype="int16", is_partial_fit=True)
    user_encoders["FirstColumnEncoderPart"] = FirstColumnEncoder(
        agg_column="part", astype="int8", is_partial_fit=True)

    for column in [
            "user_id",
            "user_count_bin",
            "first_column_content_id",
            "first_column_part",
            ("user_id", "part"),
    ]:
        # The two first_column_* keys have no entry yet at this point.
        encoders = feature_factory_dict.setdefault(column, {})
        # The label interpolates the raw key, so a tuple key yields a label
        # containing the tuple's repr.
        encoders[f"MeanAggregatorTargetEncContentIdBy{column}"] = MeanAggregator(
            column=_column_arg(column),
            agg_column="target_enc_content_id",
            remove_now=False)

    # NOTE: the label says "TargetEncContentId" although the aggregated column
    # is target_enc_user_id; the label is kept byte-for-byte so the registered
    # key does not change.
    explanation_pair = ("content_id", "prior_question_had_explanation")
    pair_encoders = feature_factory_dict.setdefault(explanation_pair, {})
    pair_encoders[
        f"MeanAggregatorTargetEncContentIdBy{explanation_pair}"] = MeanAggregator(
            column=list(explanation_pair),
            agg_column="target_enc_user_id",
            remove_now=False)

    feature_factory_dict["prior_question_elapsed_time"] = {
        "PriorQuestionElapsedTimeBinningEncoder":
        PriorQuestionElapsedTimeBinningEncoder(is_partial_fit=True)
    }
    feature_factory_dict[("part", "prior_question_elapsed_time_bin")] = {
        "TargetEncoder": TargetEncoder(
            column=["part", "prior_question_elapsed_time_bin"])
    }

    user_encoders["PreviousAnswer2"] = PreviousAnswer2(
        groupby="user_id",
        column="content_id",
        is_debug=is_debug,
        model_id=model_id,
        n=1000)
    user_encoders["PreviousNAnsweredCorrectly"] = PreviousNAnsweredCorrectly(
        n=3, is_partial_fit=True)
    feature_factory_dict["previous_3_ans"] = {
        "TargetEncoder": TargetEncoder(column="previous_3_ans")
    }
    # The label has no "2" suffix even though the class is
    # QuestionLectureTableEncoder2; kept as-is to preserve the registered key.
    user_encoders["QuestionLectureTableEncoder"] = QuestionLectureTableEncoder2(
        model_id=model_id, is_debug=is_debug, past_n=10)

    feature_factory_dict["post"] = {
        "ContentIdTargetEncoderAggregator": TargetEncoderAggregator()
    }

    return FeatureFactoryManager(
        feature_factory_dict=feature_factory_dict,
        logger=logger,
        split_num=split_num,
        model_id=model_id,
        load_feature=not is_debug,
        save_feature=not is_debug)
def make_feature_factory_manager(split_num, size, window, model_id=None):
    """Build the FeatureFactoryManager for the tags / user-count-bin feature set.

    The configuration is a two-level dict: the outer key is a column name (or
    a tuple of column names), the inner dict maps an encoder label to an
    encoder instance. Insertion order is kept exactly as written because the
    consumer may depend on it.

    Relies on a name ``is_debug`` that is not defined in this function; it
    must be resolvable at module scope when the function runs.

    Args:
        split_num: Forwarded unchanged to ``FeatureFactoryManager``.
        size: Not used by this function; kept so existing callers keep working.
        window: Not used by this function; kept so existing callers keep
            working.
        model_id: Forwarded to ``FeatureFactoryManager`` and to the encoders
            that accept a ``model_id`` argument.

    Returns:
        The configured ``FeatureFactoryManager``; feature loading and saving
        are both enabled only when ``is_debug`` is falsy.
    """

    def _column_arg(col):
        # Encoders receive a plain string for one column and a fresh list for
        # a multi-column (tuple) key.
        return col if isinstance(col, str) else list(col)

    logger = get_logger()

    feature_factory_dict = {}
    feature_factory_dict["tags"] = {
        "TagsSeparator": TagsSeparator(is_partial_fit=True)
    }

    # Both keys are plain strings, so every TargetEncoder here is partial-fit.
    for column in ["user_id", "content_id"]:
        feature_factory_dict[column] = {
            "TargetEncoder": TargetEncoder(column=column, is_partial_fit=True)
        }

    user_encoders = feature_factory_dict["user_id"]
    user_encoders["ShiftDiffEncoderTimestamp"] = ShiftDiffEncoder(
        groupby="user_id", column="timestamp", is_partial_fit=True)
    user_encoders["StudyTermEncoder"] = StudyTermEncoder(is_partial_fit=True)
    # NOTE: this variant does not register UserLevelEncoder2,
    # ContentLevelEncoder or the content-level MeanAggregator (they were
    # present only as commented-out code).
    user_encoders["CountEncoder"] = CountEncoder(column="user_id",
                                                 is_partial_fit=True)
    user_encoders["UserCountBinningEncoder"] = UserCountBinningEncoder(
        is_partial_fit=True)

    feature_factory_dict["user_count_bin"] = {
        "TargetEncoder": TargetEncoder(column="user_count_bin")
    }
    feature_factory_dict[("user_id", "user_count_bin")] = {
        "TargetEncoder": TargetEncoder(column=["user_id", "user_count_bin"])
    }
    feature_factory_dict[("content_id", "user_count_bin")] = {
        "TargetEncoder": TargetEncoder(column=["content_id", "user_count_bin"])
    }
    feature_factory_dict[("user_id", "part")] = {
        "UserContentRateEncoder": UserContentRateEncoder(
            column=["user_id", "part"], rate_func="elo")
    }

    for column in ["user_id", "content_id", "part", ("user_id", "part")]:
        # "part" has no entry yet at this point; the other keys already exist.
        encoders = feature_factory_dict.setdefault(column, {})
        # The label interpolates the raw key, so a tuple key yields a label
        # containing the tuple's repr.
        encoders[f"MeanAggregatorPriorQuestionElapsedTimeby{column}"] = MeanAggregator(
            column=_column_arg(column),
            agg_column="prior_question_elapsed_time",
            remove_now=True)
        encoders[f"MeanAggregatorStudyTimeby{column}"] = MeanAggregator(
            column=_column_arg(column),
            agg_column="study_time",
            remove_now=True)

    user_encoders["CategoryLevelEncoderPart"] = CategoryLevelEncoder(
        groupby_column="user_id", agg_column="part", categories=[2, 5])

    feature_factory_dict["prior_question_elapsed_time"] = {
        "PriorQuestionElapsedTimeBinningEncoder":
        PriorQuestionElapsedTimeBinningEncoder(is_partial_fit=True)
    }
    feature_factory_dict[("part", "prior_question_elapsed_time_bin")] = {
        "TargetEncoder": TargetEncoder(
            column=["part", "prior_question_elapsed_time_bin"])
    }

    user_encoders["PreviousAnswer2"] = PreviousAnswer2(
        groupby="user_id",
        column="content_id",
        is_debug=is_debug,
        model_id=model_id,
        n=500)
    user_encoders["PreviousNAnsweredCorrectly"] = PreviousNAnsweredCorrectly(
        n=3, is_partial_fit=True)
    feature_factory_dict["previous_3_ans"] = {
        "TargetEncoder": TargetEncoder(column="previous_3_ans")
    }
    user_encoders["QuestionLectureTableEncoder2"] = QuestionLectureTableEncoder2(
        model_id=model_id, is_debug=is_debug, past_n=100, min_size=100)
    user_encoders["QuestionQuestionTableEncoder"] = QuestionQuestionTableEncoder(
        model_id=model_id, is_debug=is_debug, past_n=100, min_size=300)
    user_encoders["UserContentRateEncoder"] = UserContentRateEncoder(
        column="user_id", rate_func="elo")

    feature_factory_dict["post"] = {
        "ContentIdTargetEncoderAggregator": TargetEncoderAggregator()
    }

    return FeatureFactoryManager(
        feature_factory_dict=feature_factory_dict,
        logger=logger,
        split_num=split_num,
        model_id=model_id,
        load_feature=not is_debug,
        save_feature=not is_debug)
def make_feature_factory_manager(split_num, model_id=None):
    """Build the FeatureFactoryManager for the duration-based feature set.

    The configuration is a two-level dict: the outer key is a column name (or
    a tuple of column names), the inner dict maps an encoder label to an
    encoder instance. Insertion order is kept exactly as written because the
    consumer may depend on it.

    Relies on a name ``is_debug`` that is not defined in this function; it
    must be resolvable at module scope when the function runs.

    Args:
        split_num: Forwarded unchanged to ``FeatureFactoryManager``.
        model_id: Forwarded to ``FeatureFactoryManager`` and to the encoders
            that accept a ``model_id`` argument.

    Returns:
        The configured ``FeatureFactoryManager``; feature loading and saving
        are both enabled only when ``is_debug`` is falsy.
    """
    logger = get_logger()

    feature_factory_dict = {}
    # Both keys are plain strings, so every TargetEncoder here is partial-fit.
    for column in ["user_id", "content_id"]:
        feature_factory_dict[column] = {
            "TargetEncoder": TargetEncoder(column=column, is_partial_fit=True)
        }

    user_encoders = feature_factory_dict["user_id"]
    content_encoders = feature_factory_dict["content_id"]

    user_encoders["DurationPreviousContent"] = DurationPreviousContent(
        is_partial_fit=True)
    user_encoders["PastNTimestampEncoder"] = PastNFeatureEncoder(
        column="timestamp",
        past_ns=[2, 3, 4, 5, 6, 7, 8, 9, 10],
        agg_funcs=["vslast"],
        remove_now=False)
    user_encoders["StudyTermEncoder2"] = StudyTermEncoder2(is_partial_fit=True)
    user_encoders["ElapsedTimeMeanByContentIdEncoder"] = ElapsedTimeMeanByContentIdEncoder()
    user_encoders["CountEncoder"] = CountEncoder(column="user_id",
                                                 is_partial_fit=True)

    feature_factory_dict[("user_id", "part")] = {
        "UserContentRateEncoder": UserContentRateEncoder(
            column=["user_id", "part"], rate_func="elo")
    }
    user_encoders["PastNUserAnswerHistory"] = PastNUserAnswerHistory(
        past_n=2, min_size=300)

    # Every key in this loop is a tuple, so the encoders always get a list and
    # the labels contain the tuple's repr.
    for column in [
            ("user_id", "prior_question_had_explanation"),
            ("content_id", "prior_question_had_explanation"),
            ("part", "prior_question_had_explanation"),
            ("user_id", "part", "prior_question_had_explanation"),
    ]:
        encoders = feature_factory_dict.setdefault(column, {})
        encoders[f"MeanAggregatorShiftDiffTimeElapsedTimeby{column}"] = MeanAggregator(
            column=list(column),
            agg_column="duration_previous_content_cap100k",
            remove_now=False)
        encoders[f"MeanAggregatorStudyTimeby{column}"] = MeanAggregator(
            column=list(column),
            agg_column="study_time",
            remove_now=False)

    user_encoders["CategoryLevelEncoderPart"] = CategoryLevelEncoder(
        groupby_column="user_id", agg_column="part", categories=[2, 5])
    user_encoders["UserContentNowRateEncoder"] = UserContentNowRateEncoder(
        column="part", target=[1, 2, 3, 4, 5, 6, 7], rate_func="elo")
    user_encoders["PreviousAnswer2"] = PreviousAnswer2(
        groupby="user_id",
        column="content_id",
        is_debug=is_debug,
        model_id=model_id,
        n=300)
    user_encoders["PreviousNAnsweredCorrectly"] = PreviousNAnsweredCorrectly(
        n=5, is_partial_fit=True)
    feature_factory_dict["previous_5_ans"] = {
        "TargetEncoder": TargetEncoder(column="previous_5_ans")
    }
    user_encoders["QuestionLectureTableEncoder2"] = QuestionLectureTableEncoder2(
        model_id=model_id, is_debug=is_debug, past_n=100, min_size=300)
    user_encoders["QuestionQuestionTableEncoder2"] = QuestionQuestionTableEncoder2(
        model_id=model_id, is_debug=is_debug, past_n=100, min_size=300)
    user_encoders["UserContentRateEncoder"] = UserContentRateEncoder(
        column="user_id", rate_func="elo")

    content_encoders["CorrectVsIncorrectMeanEncoderContent-Duration100k"] = CorrectVsIncorrectMeanEncoder(
        groupby="content_id",
        column="duration_previous_content_cap100k",
        min_size=300)
    # NOTE(review): registered under "content_id" with a "Content-" label but
    # grouped by "part" -- confirm whether groupby="part" is intended.
    content_encoders["CorrectVsIncorrectMeanEncoderContent-UserIdTargetEnc"] = CorrectVsIncorrectMeanEncoder(
        groupby="part",
        column="target_enc_user_id",
        min_size=300)
    user_encoders["PreviousContentAnswerTargetEncoder"] = PreviousContentAnswerTargetEncoder(
        min_size=300)

    feature_factory_dict["post"] = {
        "DurationFeaturePostProcess": DurationFeaturePostProcess()
    }

    return FeatureFactoryManager(
        feature_factory_dict=feature_factory_dict,
        logger=logger,
        split_num=split_num,
        model_id=model_id,
        load_feature=not is_debug,
        save_feature=not is_debug)