def build_model():
    """
    Create a grid-searched text-classification model.

    Pipeline: CountVectorizer -> TfidfTransformer ->
    MultiOutputClassifier(RandomForestClassifier).

    Returns:
        GridSearchCV: unfitted grid-search object over the pipeline.
    """
    steps = [
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', multioutput.MultiOutputClassifier(RandomForestClassifier())),
    ]
    text_clf = Pipeline(steps)

    # Hyper-parameter grid explored by GridSearchCV.
    param_grid = {
        'vect__max_df': (0.75, 1.0),
        'clf__estimator__n_estimators': [10, 20],
        'clf__estimator__min_samples_split': [2, 5],
    }

    return GridSearchCV(text_clf, param_grid=param_grid, verbose=2)
# Example #2
def build_model():
    """
    Build a machine-learning pipeline combining bag-of-words/TF-IDF text
    features with a starting-verb indicator, classified by a multi-output
    AdaBoost model and wrapped in a grid search.
    """
    text_pipe = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
    ])
    feature_union = FeatureUnion([
        ('text_pipeline', text_pipe),
        ('starting_verb_transformer', StartingVerbExtractor()),
    ])
    full_pipeline = Pipeline([
        ('features', feature_union),
        ('classifier', multioutput.MultiOutputClassifier(AdaBoostClassifier())),
    ])

    # Grid-search space; additional vectorizer options (ngram_range, max_df,
    # max_features) were deliberately left out to keep the search fast.
    search_space = {
        'features__text_pipeline__tfidf__use_idf': (True, False),
        'classifier__estimator__n_estimators': [50, 100, 200],
    }
    return GridSearchCV(full_pipeline, param_grid=search_space)
# Example #3
def build_model():
    """
    Build a text-classification pipeline (CountVectorizer -> TfidfTransformer
    -> multi-output RandomForest) wrapped in a small grid search.

    Returns:
        GridSearchCV: unfitted grid-search object.
    """
    pipeline = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', multioutput.MultiOutputClassifier(RandomForestClassifier())),
    ])
    # Minimal grid: a single IDF setting keeps the search cheap.
    parameters = {
        'tfidf__use_idf': [True],
    }
    cv = GridSearchCV(pipeline, parameters)
    return cv
def get_lr_model(
    num_features: List[str], cat_features: List[str], C: float = 1.0
) -> sklearn.base.BaseEstimator:
    """
    Build a preprocessing + multi-output L1 logistic-regression pipeline
    for the given numerical and categorical feature columns.
    """
    classifier = multioutput.MultiOutputClassifier(
        linear_model.LogisticRegression(penalty="l1", C=C, solver="saga")
    )
    steps = [
        ("pre", _get_preprocessor(num_features, cat_features)),
        ("model", classifier),
    ]
    return pipeline.Pipeline(steps)
# Example #5
def build_model():
    """Return a GridSearchCV over a bag-of-words/TF-IDF random-forest pipeline."""
    stages = [
        ("vect", CountVectorizer(tokenizer=tokenize)),
        ("tfidf", TfidfTransformer()),
        ("clf", multioutput.MultiOutputClassifier(RandomForestClassifier())),
    ]

    # Hyper-parameters explored by the grid search.
    grid = {
        'vect__max_df': (0.75, 1.0),
        'clf__estimator__n_estimators': [10, 20],
        'clf__estimator__min_samples_split': [2, 5],
    }

    return GridSearchCV(Pipeline(stages), param_grid=grid)
# Example #6
def get_lr_model(
    num_features: List[str], cat_features: List[str], C: float = 1.0
) -> pipeline.Pipeline:
    """
    Return the full pipeline for a logistic-regression model built on the
    specified numerical and categorical features.

    The regression uses L1 regularisation (strength C) with the saga solver,
    wrapped per-target in a MultiOutputClassifier.
    """
    base_lr = linear_model.LogisticRegression(penalty="l1", C=C, solver="saga")
    return pipeline.Pipeline([
        ("pre", _get_preprocessor(num_features, cat_features)),
        ("model", multioutput.MultiOutputClassifier(base_lr)),
    ])
# Example #7
def build_model():
    """
    Build an improved model using the best parameters previously found via
    grid search (cv.best_params_).

    NOTE(review): this function contained unresolved git merge-conflict
    markers. Resolved in favour of the LogisticRegression classifier, which
    matches the 'penalty' hyper-parameter grid below (RandomForestClassifier
    has no 'penalty' parameter, so that branch would fail at fit time).

    Returns:
        GridSearchCV: unfitted grid-search object scored by micro-averaged F1.
    """
    pipeline = Pipeline([
        ('features', FeatureUnion([

            ('text_pipeline', Pipeline([
                ('count_vectorizer', CountVectorizer(tokenizer=tokenize)),
                ('tfidf_transformer', TfidfTransformer())
            ])),

            ('starting_verb_transformer', StartingVerbExtractor())
        ])),

        ('classifier', multioutput.MultiOutputClassifier(LogisticRegression()))
    ])

    parameters_grid = {'classifier__estimator__penalty': ['l1', 'l2'],
                       # 'classifier__estimator__C': [0.001,0.01,0.1,1,10,100,1000]
                       }

    cv = GridSearchCV(pipeline, param_grid=parameters_grid, scoring='f1_micro')

    return cv
# Example #8
def build_model():
    """Improved ML pipeline (multi-output random forest) plus grid search."""
    text_stages = [
        ('vectorizer', CountVectorizer(tokenizer=tokenize)),
        ('transformer', TfidfTransformer()),
        ('clf', multioutput.MultiOutputClassifier(
            RandomForestClassifier(random_state=10),
            n_jobs=-1)),
    ]

    # Grid-search space: only min_samples_split is tuned.
    search_space = {'clf__estimator__min_samples_split': [3, 4]}

    return GridSearchCV(Pipeline(text_stages), param_grid=search_space,
                        n_jobs=-1)
# Example #9
def build_model(model_type, num_targets=1):
    """
    Construct a classifier of the requested type.

    Args:
        model_type (str): one of 'gradient_boosting', 'random_forest',
            'dummy_stratified', 'dummy_most_frequent'.
        num_targets (int): number of output columns; if > 1 the base
            classifier is wrapped in a MultiOutputClassifier.

    Returns:
        A scikit-learn classifier (possibly multi-output wrapped).

    Raises:
        ValueError: for an unrecognised model_type.
    """
    if model_type == 'gradient_boosting':
        base = ensemble.GradientBoostingClassifier(n_estimators=100,
                                                   verbose=True)
    elif model_type == 'random_forest':
        base = ensemble.RandomForestClassifier()
    elif model_type == 'dummy_stratified':
        # Pass strategy by keyword: positional use is deprecated/removed in
        # recent scikit-learn releases.
        base = dummy.DummyClassifier(strategy='stratified')
    elif model_type == 'dummy_most_frequent':
        base = dummy.DummyClassifier(strategy='most_frequent')
    else:
        raise ValueError('invalid model type: {}'.format(model_type))

    # Multiple outputs in the dataset => fit a separate classifier per target.
    if num_targets > 1:
        return multioutput.MultiOutputClassifier(base)
    return base
# Example #10
def multioutput_classification_example():
    """Demonstrate MultiOutputClassifier on a synthetic 3-target problem."""
    X, y1 = datasets.make_classification(n_samples=10,
                                         n_features=100,
                                         n_informative=30,
                                         n_classes=3,
                                         random_state=1)
    # Derive two additional targets by shuffling the first one.
    y2 = utils.shuffle(y1, random_state=1)
    y3 = utils.shuffle(y1, random_state=2)
    Y = np.vstack((y1, y2, y3)).T

    n_samples, n_features = X.shape  # 10, 100.
    n_outputs = Y.shape[1]  # 3.
    n_classes = 3
    forest = ensemble.RandomForestClassifier(n_estimators=100, random_state=1)

    wrapper = multioutput.MultiOutputClassifier(forest, n_jobs=-1)
    wrapper.fit(X, Y)
    pred = wrapper.predict(X)
    print('Prediction =\n', pred)
# Example #11
def build_model():
    """
    Build a grid-searched multi-output random-forest text classifier scored
    with a custom F1-based scorer.

    Returns:
        GridSearchCV: unfitted grid-search object.
    """
    # Set up the text-processing + classification pipeline.
    pipeline = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', multioutput.MultiOutputClassifier(RandomForestClassifier()))
    ])

    # Scoring object built from the project-level f1 evaluation helper.
    scorer = make_scorer(f1_scorer_eval)

    # Model parameters for GridSearchCV.
    parameters = {
        'vect__max_df': (0.75, 1.0),
        'clf__estimator__n_estimators': [10, 20],
        'clf__estimator__min_samples_split': [2, 5]
    }
    cv = GridSearchCV(pipeline, param_grid=parameters, scoring=scorer,
                      verbose=7)

    return cv
def build_model():
    """
    Build the pipeline model used for fitting and predicting.

    (The original docstring documented a 'word_dict' input, but this
    function takes no arguments.)

    Output:
        cv - GridSearchCV over the text-classification pipeline, ready to be
             fitted on the messages.
    """
    # Set up the text-processing + classification pipeline.
    pipeline = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', multioutput.MultiOutputClassifier(RandomForestClassifier()))
    ])

    # Model parameters for GridSearchCV: only tree depth is tuned.
    parameters = {'clf__estimator__max_depth': [1, 2, None]}
    cv = GridSearchCV(pipeline, param_grid=parameters, verbose=7)
    return cv
def build_model():
    """
    Build the model with the best parameters predetermined by grid search
    during the development phase.

    Returns:
        GridSearchCV: unfitted grid search over an AdaBoost multi-output
        text-classification pipeline.
    """
    pipeline_ada = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', multioutput.MultiOutputClassifier(AdaBoostClassifier()))
        ])

    parameters_ada = {'clf__estimator__learning_rate': [0.2, 0.5, 1],
                      'clf__estimator__n_estimators': [50, 100]}

    cv = GridSearchCV(pipeline_ada, param_grid=parameters_ada, verbose=3)

    return cv

# NOTE(review): a dangling "''' WITHOUT GRID SEARCH" marker followed this
# function; it opened an unterminated string literal and was removed.
def train(
    data: tuple[np.ndarray, np.ndarray],
    model="BayesianRidge",
    n_estimators=100,
    alpha=0.0001,
    alpha_1=1.0e-6,
    alpha_2=1.0e-6,
    lambda_1=1.0e-6,
    lambda_2=1.0e-6,
    n_iter=300,
    epsilon=1.35,
    alphas=None,
    gcv_mode="auto",
    solver="auto",
    n_hidden=20,
    rbf_width=0,
    activation_func="selu"
) -> Any:
    """Sklearn model. Models as input parameter. Can be linear, ridge, Huber or much more.
    It also contain extreme learning machine model from sklearn extensions.

    Note:
        There are many parameters in function, but all models use just a few of them.
        Usually default parameters are just enough.

        Some of models are regressors and some are classifiers. If it's classifier, it's optimal
        to have data sorted in limited number of bins.

    Args:
        data (tuple[np.ndarray, np.ndarray]) - Tuple (X, y) of input train vectors X and train outputs y.
            Insert input with no constant column - added by default in sklearn.
            Check `mydatapreprocessing` how to generate output.
        model ((str, object), optional): Model that will be used. You can insert model itself or
            just a name of used class. All possible options below in docs. Defaults to 'BayesianRidge'.
        n_estimators (100, optional):  Parameter of some model. Defaults to 100.
        alpha (float, optional): Parameter of some model. Defaults to 0.0001.
        alpha_1 (float, optional): Parameter of some model. Defaults to 1.e-6.
        alpha_2 (float, optional): Parameter of some model. Defaults to 1.e-6.
        lambda_1 (float, optional): Parameter of some model. Defaults to 1.e-6.
        lambda_2 (float, optional): Parameter of some model. Defaults to 1.e-6.
        n_iter (int, optional): Parameter of some model. Defaults to 300.
        epsilon (float, optional): Parameter of some model. Defaults to 1.35.
        alphas (list, optional): Parameter of some model. Defaults to None,
            which is interpreted as [0.1, 0.5, 1].
        gcv_mode (str, optional): Parameter of some model. Defaults to 'auto'.
        solver (str, optional): Parameter of some model. Defaults to 'auto'.
        n_hidden (int, optional): Parameter of some model. Defaults to 20.
        rbf_width (int, optional): Parameter of some model. Defaults to 0.
        activation_func (str, optional): Parameter of some model. Defaults to 'selu'.

    Returns:
        Any: Fitted sklearn model (multi-output wrapped if y has more than one column).

    Options if string::

        ['PLSRegression', 'RandomForestRegressor', 'ExtraTreesRegressor', 'BaggingRegressor',
        'GradientBoostingRegressor', 'AdaBoostRegressor', 'VotingRegressor', 'StackingRegressor',
        'RandomForestClassifier', 'ExtraTreesClassifier', 'BaggingClassifier', 'GradientBoostingClassifier',
        'AdaBoostClassifier', 'VotingClassifier', 'StackingClassifier', 'GaussianProcessRegressor',
        'GaussianProcessClassifier', 'IsotonicRegression', 'HuberRegressor', 'LinearRegression',
        'LogisticRegression', 'LogisticRegressionCV', 'PassiveAggressiveRegressor', 'SGDRegressor',
        'TheilSenRegressor', 'RANSACRegressor', 'PoissonRegressor', 'GammaRegressor', 'TweedieRegressor',
        'PassiveAggressiveClassifier', 'RidgeClassifier', 'RidgeClassifierCV', 'SGDClassifier', 'OneVsRestClassifier',
        'OneVsOneClassifier', 'OutputCodeClassifier', 'MultiOutputRegressor', 'RegressorChain',
        'MultiOutputClassifier', 'ClassifierChain', 'KNeighborsRegressor', 'RadiusNeighborsRegressor',
        'KNeighborsClassifier', 'RadiusNeighborsClassifier', 'MLPRegressor', 'MLPClassifier',
        'SelfTrainingClassifier', 'DecisionTreeRegressor', 'ExtraTreeRegressor', 'DecisionTreeClassifier',
        'ExtraTreeClassifier', 'TransformedTargetRegressor', 'BayesianRidge', 'ElasticNet', 'Hinge', 'Lars', 'LarsCV',
        'Lasso', 'LassoCV', 'LassoLarsIC', 'Log', 'ModifiedHuber', 'MultiTaskElasticNet', 'MultiTaskLasso',
        'MultiTaskLassoCV', 'OrthogonalMatchingPursuit', 'OrthogonalMatchingPursuitCV', 'Perceptron', 'Ridge',
        'RidgeCV', 'SquaredLoss', 'SVR',
        # Sklearn extensions
        'ELMClassifier', 'ELMRegressor', 'GenELMClassifier', 'GenELMRegressor']
    """
    from sklearn import (
        multioutput,
        linear_model,
        ensemble,
        tree,
        neighbors,
        gaussian_process,
    )

    # Avoid the mutable-default-argument pitfall: resolve alphas here.
    if alphas is None:
        alphas = [0.1, 0.5, 1]

    X, y = get_inputs(data)

    # If string like 'LinearRegression', find class with such a name.
    if isinstance(model, str):

        for i in [linear_model, ensemble, tree, neighbors, gaussian_process]:
            if model in i.__all__:
                model = getattr(i, model)
                break

        # If model is still a string it was not found in sklearn itself;
        # it may come from the sklearn-extensions library.
        if isinstance(model, str):

            import sklearn_extensions.extreme_learning_machines.elm as elm

            # Use a default so a missing name keeps `model` a string, making
            # the descriptive error below reachable (a bare getattr would
            # raise an uninformative AttributeError first).
            model = getattr(elm, model, model)

            # Model defined by string not found anywhere.
            if isinstance(model, str):

                raise AttributeError(
                    mylogging.return_str(
                        "You defined model that was not found in sklearn. You can use not only string, but also"
                        "object or class itself. You can use function `get_all_models` to get list of all"
                        "possible models and then use one of them."))

    # If class, but no object was configured, create instance.
    if callable(model):
        model = model()

    params = {
        "n_estimators": n_estimators,
        "alpha": alpha,
        "alpha_1": alpha_1,
        "alpha_2": alpha_2,
        "lambda_1": lambda_1,
        "lambda_2": lambda_2,
        "n_iter": n_iter,
        "epsilon": epsilon,
        "alphas": alphas,
        "gcv_mode": gcv_mode,
        "solver": solver,
        "n_hidden": n_hidden,
        "rbf_width": rbf_width,
        "activation_func": activation_func,
    }

    # Keep only the params that this particular model actually accepts.
    used_params = {
        i: j
        for (i, j) in params.items() if i in model.get_params()
    }

    model.set_params(**used_params)

    if y.shape[1] == 1:
        # Single target column: flatten to the 1-D shape sklearn expects.
        # (The original set output_shape twice - via assignment AND setattr;
        # one is enough.)
        setattr(model, "output_shape", "one_step")

        y = y.ravel()

    else:
        # Multiple target columns: wrap so one estimator is fitted per output.
        if model._estimator_type == "regressor":
            model = multioutput.MultiOutputRegressor(model)
        elif model._estimator_type == "classifier":
            model = multioutput.MultiOutputClassifier(model)

        setattr(model, "output_shape", "multi_step")

    model.fit(X, y)

    return model
 def model(self, **kwargs):
     # Wrap whatever estimator the parent class builds in a per-target
     # MultiOutputClassifier; kwargs are forwarded to the parent unchanged.
     return multioutput.MultiOutputClassifier(super().model(**kwargs))