# ===== Ejemplo n.º 1 (Example 1) =====
class ModelRepresentationBase(_AbstractModelRepresentation):
    """ class just to store the default HyperParameters """

    default_hyper = {
        "n_components":
        hp.HyperRangeFloat(start=0.1, end=1, step=0.05),
        # Forest like estimators
        "n_estimators":
        hp.HyperComposition([
            (0.75, hp.HyperRangeInt(start=25, end=175, step=25)),
            (0.25, hp.HyperRangeInt(start=200, end=1000, step=100)),
        ]),
        # 25% chance of a named heuristic, 75% chance of a float ratio drawn
        # from a Beta(3, 1) distribution (skewed toward 1, mean = 0.75).
        "max_features":
        hp.HyperComposition([(0.25, ["sqrt", "auto"]),
                             (0.75,
                              hp.HyperRangeBetaFloat(start=0,
                                                     end=1,
                                                     alpha=3,
                                                     beta=1))]),
        "max_depth":
        hp.HyperChoice([
            None, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 20, 25,
            30, 50, 100
        ]),
        # Beta(1, 5) skews the draw toward small split sizes.
        "min_samples_split":
        hp.HyperRangeBetaInt(start=2, end=100, alpha=1, beta=5),
        # Linear model
        "C":
        hp.HyperLogRangeFloat(start=0.00001, end=10, n=50),
        "alpha":
        hp.HyperLogRangeFloat(start=0.00001, end=10, n=50),
        # CV
        "analyzer":
        hp.HyperChoice(["word", "char", "char_wb"]),
        "penalty": ["l1", "l2"],
        "random_state": [
            123
        ],  # so that every model with a 'random_state' attribute gets a fixed value
        "drop_used_columns": [True],
        "drop_unused_columns": [True]
    }
    # This dictionary specifies the default hyper-parameters used during the random-search phase.
    # They will be used if:
    # * the model has a parameter among that list
    # * the parameter is not specified within the class (within 'custom_hyper')

    default_default_hyper = {
        "random_state": 123,
        "drop_used_columns": True,
        "drop_unused_columns": True
    }
    # This dictionary specifies the default hyper-parameters used during the default-model phase.
    # They will be used if:
    # * the model has a parameter among that list
    # * the default parameter is not specified within the class (within 'default_parameters')

    depends_on = ()  # steps this representation depends on (none by default)
# ===== Ejemplo n.º 2 (Example 2) =====
    def get_hyper_parameter(cls):
        """Build the hyper-parameter space for the vectorizer.

        Handles a dependency between 'analyzer' and 'ngram_range':
        when analyzer == 'word', ngram_range is forced to 1; the character
        analyzers may use n-grams of length 1 to 4.
        """
        # Settings shared by both branches of the composition.
        common = {
            "min_df": [1, 0.001, 0.01, 0.05],
            "max_df": [0.999, 0.99, 0.95],
            "tfidf": [True, False],
        }

        # Branch 1 (p = 0.5): word analyzer => unigrams only.
        word_space = hp.HyperCrossProduct({
            "ngram_range": 1,
            "analyzer": "word",
            **common,
        })

        # Branch 2 (p = 0.5): character analyzers with variable n-gram size.
        char_space = hp.HyperCrossProduct({
            "ngram_range": hp.HyperRangeInt(start=1, end=4),
            "analyzer": hp.HyperChoice(("char", "char_wb")),
            **common,
        })

        return hp.HyperComposition([(0.5, word_space), (0.5, char_space)])
# ===== Ejemplo n.º 3 (Example 3) =====
    class Char2VecVectorizer_TextEncoder(ModelRepresentationBase):
        """Representation of the Char2VecVectorizer text-encoder step."""

        klass = Char2VecVectorizer
        category = StepCategories.TextEncoder
        type_of_variable = TypeOfVariables.TEXT
        type_of_model = None  # not tied to a particular prediction task
        use_y = False  # the target is not needed to fit this encoder

        # Search space for the encoder's own hyper-parameters.
        custom_hyper = {
            "size": hp.HyperRangeInt(50, 300, step=10),
            "window": [3, 5, 7],
            "ngram": hp.HyperRangeInt(2, 6),
            "same_embedding_all_columns": [True, False],
            "text_preprocess": [None, "default", "digit", "nltk"],
        }
# ===== Ejemplo n.º 4 (Example 4) =====
class Text_TruncatedSVD_DimensionReduction(ModelRepresentationBase):
    """Representation of the TruncatedSVD text dimension-reduction step."""

    klass = TruncatedSVDWrapper
    category = StepCategories.TextDimensionReduction

    type_of_variable = TypeOfVariables.TEXT
    type_of_model = None
    use_y = False  # unsupervised: the target is not needed to fit

    # BUG FIX: 'custom_hyper' was previously assigned twice; the second
    # assignment silently overwrote the first and discarded the
    # 'n_components' search space. Both entries are merged here.
    custom_hyper = {
        "n_components": hp.HyperRangeInt(10, 500, step=5),
        "drop_used_columns": [True, False],
    }
# ===== Ejemplo n.º 5 (Example 5) =====
class ModelRepresentationBase(_AbstractModelRepresentation):
    """ class just to store the default HyperParameters """

    # Default search space used during the random-search phase.
    # An entry is used when the model has a parameter with that name and the
    # parameter is not overridden in the subclass ('custom_hyper').
    default_hyper = {
        "n_components":
        hp.HyperRangeFloat(start=0.1, end=1, step=0.05),
        # Forest like estimators
        "n_estimators":
        hp.HyperComposition([
            (0.75, hp.HyperRangeInt(start=25, end=175, step=25)),
            (0.25, hp.HyperRangeInt(start=200, end=1000, step=100)),
        ]),
        # 25% chance of a named heuristic, 75% chance of a Beta(3, 1) draw
        # (skewed toward 1, mean = 0.75).
        "max_features":
        hp.HyperComposition([(0.25, ["sqrt", "auto"]),
                             (0.75,
                              hp.HyperRangeBetaFloat(start=0,
                                                     end=1,
                                                     alpha=3,
                                                     beta=1))]),
        "max_depth":
        hp.HyperChoice([
            None, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 20, 25,
            30, 50, 100
        ]),
        # BUG FIX: range previously started at 1, but scikit-learn requires
        # min_samples_split >= 2 (start=2 also matches the sibling definition
        # of this class elsewhere in the project).
        "min_samples_split":
        hp.HyperRangeBetaInt(start=2, end=100, alpha=1, beta=5),
        # Linear model
        "C":
        hp.HyperLogRangeFloat(start=0.00001, end=10, n=50),
        "alpha":
        hp.HyperLogRangeFloat(start=0.00001, end=10, n=50),
        # CV
        "analyzer":
        hp.HyperChoice(["word", "char", "char_wb"]),
        "penalty": ["l1", "l2"],
        "random_state": [
            123
        ],  # so that every model with a 'random_state' attribute gets a fixed value
        "columns_to_encode": ["--object--"]
    }
# ===== Ejemplo n.º 6 (Example 6) =====
class NumericalEncoder_CatEncoder(ModelRepresentationBase):
    """Representation of the NumericalEncoder category-encoder step."""

    klass = NumericalEncoder
    category = StepCategories.CategoryEncoder
    # Applies to both categorical and numerical columns.
    type_of_variable = (TypeOfVariables.CAT, TypeOfVariables.NUM)
    type_of_model = None  # not tied to a particular prediction task
    use_y = False  # the target is not needed to fit this encoder

    # Encoder-specific search space.
    custom_hyper = {
        "encoding_type": ["dummy", "num"],
        "min_nb_observations": hp.HyperRangeInt(2, 20),
    }
# ===== Ejemplo n.º 7 (Example 7) =====
 def get_hyper_parameter(cls):
     """Hyper-parameter space handling the dependency between
     'bagging_fraction' and 'bagging_freq'.

     Two equally-weighted configurations:
     * no bagging : bagging_freq == 0 and bagging_fraction == 1.0
       ('rf' boosting is excluded here, since random forest needs bagging)
     * bagging    : bagging_freq == 1 and bagging_fraction < 1
     """

     def _branch(boosting_types, bagging_fraction, bagging_freq):
         # Build one branch of the composition; everything except the
         # boosting type and the bagging parameters is identical in both.
         # Fresh hp objects are created on every call so the two branches
         # never share state.
         return hp.HyperCrossProduct({
             "boosting_type": boosting_types,
             "learning_rate": hp.HyperLogRangeFloat(0.0001, 0.1),
             "max_depth": hp.HyperChoice([
                 -1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
                 20, 25, 30, 50, 100
             ]),
             "n_estimators": hp.HyperComposition([
                 (0.50, hp.HyperRangeInt(start=25, end=175, step=25)),
                 (0.25, hp.HyperRangeInt(start=200, end=900, step=100)),
                 (0.25, hp.HyperRangeInt(start=1000, end=10000, step=100)),
             ]),
             # Beta(3, 1) skews toward 1 (mean = 0.75).
             "colsample_bytree": hp.HyperRangeBetaFloat(start=0.1, end=1,
                                                        alpha=3, beta=1),
             "min_child_samples": hp.HyperRangeInt(2, 50),
             "num_leaves": hp.HyperRangeInt(10, 200),
             "bagging_fraction": bagging_fraction,
             "bagging_freq": bagging_freq,
             "n_jobs": [1],
         })

     # 50/50 between the no-bagging and the bagging configurations.
     return hp.HyperComposition([
         (0.5, _branch(["gbdt", "dart"], [1.0], [0])),
         (0.5, _branch(["rf", "gbdt", "dart"],
                       hp.HyperRangeBetaFloat(start=0.1, end=1,
                                              alpha=3, beta=1),
                       [1])),
     ])