class ModelRepresentationBase(_AbstractModelRepresentation):
    """ class just to store the default HyperParameters """

    # Default search space per hyper-parameter name.  A given entry applies to a
    # model only when the model actually exposes a parameter with that name.
    default_hyper = {
        "n_components": hp.HyperRangeFloat(start=0.1, end=1, step=0.05),

        # Forest like estimators
        "n_estimators": hp.HyperComposition([
            (0.75, hp.HyperRangeInt(start=25, end=175, step=25)),
            (0.25, hp.HyperRangeInt(start=200, end=1000, step=100)),
        ]),
        "max_features": hp.HyperComposition([(0.25, ["sqrt", "auto"]),
                                             (0.75, hp.HyperRangeBetaFloat(start=0, end=1, alpha=3, beta=1))]),
        "max_depth": hp.HyperChoice([
            None, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 20, 25, 30, 50, 100
        ]),
        "min_samples_split": hp.HyperRangeBetaInt(start=2, end=100, alpha=1, beta=5),

        # Linear model
        "C": hp.HyperLogRangeFloat(start=0.00001, end=10, n=50),
        "alpha": hp.HyperLogRangeFloat(start=0.00001, end=10, n=50),

        # CV
        "analyzer": hp.HyperChoice(["word", "char", "char_wb"]),

        "penalty": ["l1", "l2"],

        "random_state": [
            123
        ],  # So that, for every model with a 'random_state' attribute, it will be passed and fixed

        "drop_used_columns": [True],
        "drop_unused_columns": [True]
    }
    # This dictionary is used to specify the default hyper-parameters that are used during the random search phase
    # They will be used if :
    # * the model has a parameter among that list
    # * the parameter is not specified within the class (within 'custom_hyper')

    default_default_hyper = {
        "random_state": 123,
        "drop_used_columns": True,
        "drop_unused_columns": True
    }
    # This dictionary is used to specify the default hyper-parameters that are used during the default model phase
    # They will be used if :
    # * the model has a parameter among that list
    # * the default parameter is not specified within the class (within 'default_parameters')

    # Other steps this representation depends on (none by default).
    depends_on = ()
def get_hyper_parameter(cls):
    """Build the hyper-parameter space for the text vectorizer.

    The space is split into two equally-weighted branches so that
    'ngram_range' is forced to 1 whenever 'analyzer' is 'word', while the
    character analyzers ('char', 'char_wb') may use ngram ranges up to 4.
    """
    # Choices shared by both branches.
    shared_choices = {
        "min_df": [1, 0.001, 0.01, 0.05],
        "max_df": [0.999, 0.99, 0.95],
        "tfidf": [True, False],
    }

    # Branch 1: word analyzer -> unigrams only.
    word_space = hp.HyperCrossProduct({
        "ngram_range": 1,
        "analyzer": "word",
        **shared_choices,
    })

    # Branch 2: character analyzers -> ngrams allowed.
    char_space = hp.HyperCrossProduct({
        "ngram_range": hp.HyperRangeInt(start=1, end=4),
        "analyzer": hp.HyperChoice(("char", "char_wb")),
        **shared_choices,
    })

    # Equally-weighted mixture of the two branches.
    return hp.HyperComposition([(0.5, word_space), (0.5, char_space)])
class Char2VecVectorizer_TextEncoder(ModelRepresentationBase):
    """Registration of the Char2Vec vectorizer as a text-encoder step."""

    klass = Char2VecVectorizer
    category = StepCategories.TextEncoder

    # Applies only to text variables; not itself a predictive model.
    type_of_variable = TypeOfVariables.TEXT
    type_of_model = None
    use_y = False  # the target is not needed to fit this encoder

    # Search space for the encoder-specific hyper-parameters.
    custom_hyper = {
        "size": hp.HyperRangeInt(50, 300, step=10),
        "window": [3, 5, 7],
        "ngram": hp.HyperRangeInt(2, 6),
        "same_embedding_all_columns": [True, False],
        "text_preprocess": [None, "default", "digit", "nltk"],
    }
class Text_TruncatedSVD_DimensionReduction(ModelRepresentationBase):
    """Registration of TruncatedSVD as a text dimension-reduction step."""

    klass = TruncatedSVDWrapper
    category = StepCategories.TextDimensionReduction

    # Applies only to text variables; not itself a predictive model.
    type_of_variable = TypeOfVariables.TEXT
    type_of_model = None
    use_y = False  # the target is not needed to fit this reducer

    # BUG FIX: 'custom_hyper' was previously assigned twice in this class body;
    # the second assignment silently discarded the 'n_components' search space.
    # Both entries are now merged into a single dictionary.
    custom_hyper = {
        "n_components": hp.HyperRangeInt(10, 500, step=5),
        "drop_used_columns": [True, False],
    }
class ModelRepresentationBase(_AbstractModelRepresentation):
    """ class just to store the default HyperParameters """
    # NOTE(review): a class with this exact name also appears earlier in this file;
    # at import time the later definition wins — confirm the duplication is intended.

    # Default search space per hyper-parameter name.  A given entry applies to a
    # model only when the model actually exposes a parameter with that name.
    default_hyper = {
        "n_components": hp.HyperRangeFloat(start=0.1, end=1, step=0.05),

        # Forest like estimators
        "n_estimators": hp.HyperComposition([
            (0.75, hp.HyperRangeInt(start=25, end=175, step=25)),
            (0.25, hp.HyperRangeInt(start=200, end=1000, step=100)),
        ]),
        "max_features": hp.HyperComposition([(0.25, ["sqrt", "auto"]),
                                             (0.75, hp.HyperRangeBetaFloat(start=0, end=1, alpha=3, beta=1))]),
        "max_depth": hp.HyperChoice([
            None, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 20, 25, 30, 50, 100
        ]),
        # BUG FIX: range previously started at 1, but scikit-learn tree-based
        # estimators require min_samples_split >= 2 (and the other definition of
        # this class in the file already uses start=2).
        "min_samples_split": hp.HyperRangeBetaInt(start=2, end=100, alpha=1, beta=5),

        # Linear model
        "C": hp.HyperLogRangeFloat(start=0.00001, end=10, n=50),
        "alpha": hp.HyperLogRangeFloat(start=0.00001, end=10, n=50),

        # CV
        "analyzer": hp.HyperChoice(["word", "char", "char_wb"]),

        "penalty": ["l1", "l2"],

        "random_state": [
            123
        ],  # So that, for every model with a 'random_state' attribute, it will be passed and fixed

        "columns_to_encode": ["--object--"]
    }
class NumericalEncoder_CatEncoder(ModelRepresentationBase):
    """Registration of the NumericalEncoder as a category-encoder step."""

    klass = NumericalEncoder
    category = StepCategories.CategoryEncoder

    # Applies to both categorical and numerical variables; not itself a model.
    type_of_variable = (TypeOfVariables.CAT, TypeOfVariables.NUM)
    type_of_model = None
    use_y = False  # the target is not needed to fit this encoder

    # Search space for the encoder-specific hyper-parameters.
    custom_hyper = {
        "encoding_type": ["dummy", "num"],
        "min_nb_observations": hp.HyperRangeInt(2, 20),
    }
def get_hyper_parameter(cls):
    """Build the LightGBM hyper-parameter space.

    Specific function to handle the dependency between the hyper-parameters
    'bagging_fraction' AND 'bagging_freq': either bagging is fully disabled
    (freq == 0, fraction == 1.0) or fully enabled (freq == 1, fraction < 1).
    """
    res = hp.HyperComposition([
        ##################
        ### No Bagging ###
        ##################
        # * bagging_freq == 0
        # * bagging_fraction == 1.0
        # * no random forest here : 'boosting_type' != 'rf'
        (
            0.5,
            hp.HyperCrossProduct({
                "boosting_type": ["gbdt", "dart"],
                "learning_rate": hp.HyperLogRangeFloat(0.0001, 0.1),
                "max_depth": hp.HyperChoice([
                    -1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 20, 25, 30, 50, 100
                ]),
                "n_estimators": hp.HyperComposition([
                    (0.50, hp.HyperRangeInt(start=25, end=175, step=25)),
                    (0.25, hp.HyperRangeInt(start=200, end=900, step=100)),
                    (0.25, hp.HyperRangeInt(start=1000, end=10000, step=100)),
                ]),
                "colsample_bytree": hp.HyperRangeBetaFloat(start=0.1, end=1, alpha=3, beta=1),  # Mean = 0.75
                "min_child_samples": hp.HyperRangeInt(2, 50),
                "num_leaves": hp.HyperRangeInt(10, 200),
                "bagging_fraction": [1.0],
                "bagging_freq": [0],
                "n_jobs": [1],
            }),
        ),
        ###############
        ### Bagging ###
        ###############
        # * bagging_freq = 1
        # * bagging_fraction < 1
        (
            0.5,
            hp.HyperCrossProduct({
                "boosting_type": ["rf", "gbdt", "dart"],
                "learning_rate": hp.HyperLogRangeFloat(0.0001, 0.1),
                "max_depth": hp.HyperChoice([
                    -1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 20, 25, 30, 50, 100
                ]),
                "n_estimators": hp.HyperComposition([
                    (0.50, hp.HyperRangeInt(start=25, end=175, step=25)),
                    (0.25, hp.HyperRangeInt(start=200, end=900, step=100)),
                    (0.25, hp.HyperRangeInt(start=1000, end=10000, step=100)),
                ]),
                "colsample_bytree": hp.HyperRangeBetaFloat(start=0.1, end=1, alpha=3, beta=1),  # Mean = 0.75
                "min_child_samples": hp.HyperRangeInt(2, 50),
                "num_leaves": hp.HyperRangeInt(10, 200),
                "bagging_fraction": hp.HyperRangeBetaFloat(start=0.1, end=1, alpha=3, beta=1),
                "bagging_freq": [1],
                "n_jobs": [1],
            }),
        ),
    ])
    return res