def build_model():
    """Create a GridSearchCV over a text-classification pipeline.

    The pipeline chains CountVectorizer (with the project tokenizer),
    TfidfTransformer, and a multi-output random-forest classifier.

    Returns:
        GridSearchCV: unfitted grid-search wrapper around the pipeline.
    """
    pipeline = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', multioutput.MultiOutputClassifier(RandomForestClassifier())),
    ])

    # Hyper-parameter grid explored by the search.
    param_grid = {
        'vect__max_df': (0.75, 1.0),
        'clf__estimator__n_estimators': [10, 20],
        'clf__estimator__min_samples_split': [2, 5],
    }

    return GridSearchCV(pipeline, param_grid=param_grid, verbose=2)
def build_model():
    """Build a machine-learning pipeline wrapped in a grid search.

    Features combine a bag-of-words/tf-idf text pipeline with a
    StartingVerbExtractor; classification uses a multi-output AdaBoost.

    Returns:
        GridSearchCV: unfitted grid search over the pipeline.
    """
    text_pipeline = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
    ])

    pipeline = Pipeline([
        ('features', FeatureUnion([
            ('text_pipeline', text_pipeline),
            ('starting_verb_transformer', StartingVerbExtractor()),
        ])),
        ('classifier', multioutput.MultiOutputClassifier(AdaBoostClassifier())),
    ])

    # Grid kept deliberately small; vectorizer options (ngram_range,
    # max_df, max_features) were explored previously and left out.
    parameters = {
        'features__text_pipeline__tfidf__use_idf': (True, False),
        'classifier__estimator__n_estimators': [50, 100, 200],
    }

    return GridSearchCV(pipeline, param_grid=parameters)
def build_model():
    """Build a tf-idf text pipeline with a multi-output random forest,
    wrapped in a minimal grid search.

    Returns:
        GridSearchCV: unfitted grid search over the pipeline.
    """
    steps = [
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', multioutput.MultiOutputClassifier(RandomForestClassifier())),
    ]

    # Single-point grid: only the tf-idf idf toggle is searched.
    param_grid = {
        'tfidf__use_idf': [True],
    }

    return GridSearchCV(Pipeline(steps), param_grid)
def get_lr_model(
    num_features: List[str],
    cat_features: List[str],
    C: float = 1.0,
) -> sklearn.base.BaseEstimator:
    """Return a full pipeline for a multi-output L1 logistic regression.

    Adds the docstring missing from this variant (its sibling definition
    has one) and normalizes keyword-argument spacing.

    Args:
        num_features: Names of the numerical feature columns.
        cat_features: Names of the categorical feature columns.
        C: Inverse regularization strength for LogisticRegression.

    Returns:
        Unfitted sklearn Pipeline: preprocessor followed by a
        multi-output L1-penalized logistic-regression classifier.
    """
    model = pipeline.Pipeline([
        ("pre", _get_preprocessor(num_features, cat_features)),
        # saga supports the l1 penalty used here.
        ("model", multioutput.MultiOutputClassifier(
            linear_model.LogisticRegression(penalty="l1", C=C, solver="saga")
        )),
    ])
    return model
def build_model():
    """Grid search over a CountVectorizer / TfidfTransformer /
    random-forest multi-output classification pipeline.

    Returns:
        GridSearchCV: unfitted grid search.
    """
    pipeline = Pipeline([
        ("vect", CountVectorizer(tokenizer=tokenize)),
        ("tfidf", TfidfTransformer()),
        ("clf", multioutput.MultiOutputClassifier(RandomForestClassifier())),
    ])

    # Search space over vectorizer cutoff and forest size/splitting.
    search_space = {
        'vect__max_df': (0.75, 1.0),
        'clf__estimator__n_estimators': [10, 20],
        'clf__estimator__min_samples_split': [2, 5],
    }

    return GridSearchCV(pipeline, param_grid=search_space)
def get_lr_model(
    num_features: List[str],
    cat_features: List[str],
    C: float = 1.0,
) -> pipeline.Pipeline:
    """Return the full pipeline for a logistic-regression model with the
    specified numerical and categorical features.

    Args:
        num_features: Names of the numerical feature columns.
        cat_features: Names of the categorical feature columns.
        C: Inverse regularization strength for LogisticRegression.

    Returns:
        Unfitted sklearn Pipeline (preprocessor + classifier).
    """
    classifier = linear_model.LogisticRegression(
        penalty="l1", C=C, solver="saga"
    )
    steps = [
        ("pre", _get_preprocessor(num_features, cat_features)),
        ("model", multioutput.MultiOutputClassifier(classifier)),
    ]
    return pipeline.Pipeline(steps)
def build_model():
    """Build improved model with the best parameters from grid-search
    results (cv.best_params_).

    NOTE(review): this block contained unresolved git merge-conflict
    markers (<<<<<<< / ======= / >>>>>>>), which are a syntax error.
    Resolved in favor of the LogisticRegression branch because the
    parameter grid tunes `penalty`, a LogisticRegression argument;
    confirm the RandomForestClassifier branch was not the intended one.

    Returns:
        GridSearchCV: unfitted grid search over the pipeline,
        scored with micro-averaged F1.
    """
    pipeline = Pipeline([
        ('features', FeatureUnion([
            ('text_pipeline', Pipeline([
                ('count_vectorizer', CountVectorizer(tokenizer=tokenize)),
                ('tfidf_transformer', TfidfTransformer()),
            ])),
            ('starting_verb_transformer', StartingVerbExtractor()),
        ])),
        ('classifier', multioutput.MultiOutputClassifier(LogisticRegression())),
    ])

    parameters_grid = {
        'classifier__estimator__penalty': ['l1', 'l2'],
        # 'classifier__estimator__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
    }

    return GridSearchCV(pipeline, param_grid=parameters_grid,
                        scoring='f1_micro')
def build_model():
    """Improved ML pipeline combined with a grid search.

    Returns:
        GridSearchCV: unfitted parallel grid search over the pipeline.
    """
    pipeline = Pipeline([
        ('vectorizer', CountVectorizer(tokenizer=tokenize)),
        ('transformer', TfidfTransformer()),
        ('clf', multioutput.MultiOutputClassifier(
            RandomForestClassifier(random_state=10), n_jobs=-1)),
    ])

    # Small grid: only min_samples_split is tuned here.
    grid = {'clf__estimator__min_samples_split': [3, 4]}

    return GridSearchCV(pipeline, param_grid=grid, n_jobs=-1)
def build_model(model_type, num_targets=1):
    """Create an unfitted classifier for the requested model type.

    Args:
        model_type: One of 'gradient_boosting', 'random_forest',
            'dummy_stratified', 'dummy_most_frequent'.
        num_targets: Number of target columns in the dataset; when
            greater than 1 the base classifier is wrapped in a
            MultiOutputClassifier (one estimator per target).

    Returns:
        An unfitted sklearn classifier.

    Raises:
        ValueError: if model_type is not one of the recognized names.
    """
    if model_type == 'gradient_boosting':
        base = ensemble.GradientBoostingClassifier(n_estimators=100,
                                                   verbose=True)
    elif model_type == 'random_forest':
        base = ensemble.RandomForestClassifier()
    elif model_type == 'dummy_stratified':
        base = dummy.DummyClassifier('stratified')
    elif model_type == 'dummy_most_frequent':
        base = dummy.DummyClassifier('most_frequent')
    else:
        # Redundant parens around raise removed; message unchanged.
        raise ValueError('invalid model type: {}'.format(model_type))

    # Multiple outputs in the dataset => fit a separate classifier per
    # target column (original comment said "regressor" by mistake).
    if num_targets > 1:
        return multioutput.MultiOutputClassifier(base)
    return base
def multioutput_classification_example():
    """Demonstrate MultiOutputClassifier on a synthetic 3-output task.

    Builds three correlated label columns by shuffling one generated
    label vector, fits one random forest per output column, and prints
    the in-sample predictions.
    """
    X, y1 = datasets.make_classification(
        n_samples=10, n_features=100, n_informative=30,
        n_classes=3, random_state=1)

    # Derive two extra targets via deterministic shuffles of the first.
    y2 = utils.shuffle(y1, random_state=1)
    y3 = utils.shuffle(y1, random_state=2)
    Y = np.vstack((y1, y2, y3)).T

    n_samples, n_features = X.shape  # 10, 100.
    n_outputs = Y.shape[1]  # 3.
    n_classes = 3

    forest = ensemble.RandomForestClassifier(n_estimators=100, random_state=1)
    mo_clf = multioutput.MultiOutputClassifier(forest, n_jobs=-1)
    mo_clf.fit(X, Y)

    pred = mo_clf.predict(X)
    print('Prediction =\n', pred)
def build_model():
    """Grid search over a text pipeline scored with a custom f1 scorer.

    Returns:
        GridSearchCV: unfitted grid search using f1_scorer_eval as its
        scoring function, with very verbose progress output.
    """
    pipeline = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', multioutput.MultiOutputClassifier(RandomForestClassifier())),
    ])

    # Wrap the project's f1 evaluation function as a sklearn scorer.
    scorer = make_scorer(f1_scorer_eval)

    grid = {
        'vect__max_df': (0.75, 1.0),
        'clf__estimator__n_estimators': [10, 20],
        'clf__estimator__min_samples_split': [2, 5],
    }

    return GridSearchCV(pipeline, param_grid=grid, scoring=scorer, verbose=7)
def build_model():
    """Build the pipeline model used for fitting and predicting.

    Takes no arguments (the previous docstring documented a `word_dict`
    input that the function never had).

    Returns:
        cv: GridSearchCV over a CountVectorizer / tf-idf / random-forest
            multi-output pipeline.
    """
    # Text-feature extraction followed by a multi-output random forest.
    pipeline = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', multioutput.MultiOutputClassifier(RandomForestClassifier())),
    ])

    # Only tree depth is tuned; None lets trees grow fully.
    parameters = {'clf__estimator__max_depth': [1, 2, None]}

    cv = GridSearchCV(pipeline, param_grid=parameters, verbose=7)
    return cv
def build_model():
    """Build an AdaBoost pipeline tuned over a small grid.

    The grid values reflect the best parameters predetermined by grid
    search during the development phase.

    Returns:
        GridSearchCV: unfitted grid search over the AdaBoost pipeline.
    """
    pipeline_ada = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', multioutput.MultiOutputClassifier(AdaBoostClassifier())),
    ])

    parameters_ada = {
        'clf__estimator__learning_rate': [0.2, 0.5, 1],
        'clf__estimator__n_estimators': [50, 100],
    }

    cv = GridSearchCV(pipeline_ada, param_grid=parameters_ada, verbose=3)
    return cv
    # NOTE(review): a dangling ''' WITHOUT GRID SEARCH string-opener
    # followed the return in the original — an unterminated string
    # literal (syntax error) apparently left over from a commented-out
    # alternative implementation; removed.
def train(
    data: tuple[np.ndarray, np.ndarray],
    model="BayesianRidge",
    n_estimators=100,
    alpha=0.0001,
    alpha_1=1.0e-6,
    alpha_2=1.0e-6,
    lambda_1=1.0e-6,
    lambda_2=1.0e-6,
    n_iter=300,
    epsilon=1.35,
    alphas=[0.1, 0.5, 1],  # NOTE(review): mutable default — shared across calls; confirm this is intended
    gcv_mode="auto",
    solver="auto",
    n_hidden=20,
    rbf_width=0,
    activation_func="selu"
    # load_trained_model=0, update_trained_model=1, save_model=1, saved_model_path_string='stored_models',
) -> Any:
    """Resolve, configure, and fit a scikit-learn (or sklearn-extensions) model.

    The model is selected by name or passed directly. It can be linear,
    ridge, Huber and many more, including extreme-learning-machine models
    from sklearn-extensions.

    Note:
        There are many parameters in this function, but each model uses
        just a few of them; the defaults are usually enough. Some models
        are regressors and some are classifiers. For a classifier it is
        optimal to have data sorted into a limited number of bins.

    Args:
        data (tuple[np.ndarray, np.ndarray]): Tuple (X, y) of input train
            vectors X and train outputs y. Insert input with no constant
            column - added by default in sklearn. Check
            `mydatapreprocessing` how to generate output.
        model ((str, object), optional): Model that will be used. You can
            insert the model object/class itself or just the name of the
            class. All possible string options are listed below.
            Defaults to 'BayesianRidge'.
        n_estimators (int, optional): Parameter of some models. Defaults to 100.
        alpha (float, optional): Parameter of some models. Defaults to 0.0001.
        alpha_1 (float, optional): Parameter of some models. Defaults to 1.e-6.
        alpha_2 (float, optional): Parameter of some models. Defaults to 1.e-6.
        lambda_1 (float, optional): Parameter of some models. Defaults to 1.e-6.
        lambda_2 (float, optional): Parameter of some models. Defaults to 1.e-6.
        n_iter (int, optional): Parameter of some models. Defaults to 300.
        epsilon (float, optional): Parameter of some models. Defaults to 1.35.
        alphas (list, optional): Parameter of some models. Defaults to [0.1, 0.5, 1].
        gcv_mode (str, optional): Parameter of some models. Defaults to 'auto'.
        solver (str, optional): Parameter of some models. Defaults to 'auto'.
        n_hidden (int, optional): Parameter of some models. Defaults to 20.
        rbf_width (int, optional): Parameter of some models. Defaults to 0.
        activation_func (str, optional): Parameter of some models. Defaults to 'selu'.

    Returns:
        Any: The fitted model (the code fits and returns the estimator;
        the original docstring said "predictions", which does not match
        the `return model` below).

    Options if string::

        ['PLSRegression', 'RandomForestRegressor', 'ExtraTreesRegressor',
        'BaggingRegressor', 'GradientBoostingRegressor', 'AdaBoostRegressor',
        'VotingRegressor', 'StackingRegressor', 'RandomForestClassifier',
        'ExtraTreesClassifier', 'BaggingClassifier', 'GradientBoostingClassifier',
        'AdaBoostClassifier', 'VotingClassifier', 'StackingClassifier',
        'GaussianProcessRegressor', 'GaussianProcessClassifier',
        'IsotonicRegression',  # (one entry here was garbled in the original list)
        'HuberRegressor', 'LinearRegression', 'LogisticRegression',
        'LogisticRegressionCV', 'PassiveAggressiveRegressor', 'SGDRegressor',
        'TheilSenRegressor', 'RANSACRegressor', 'PoissonRegressor',
        'GammaRegressor', 'TweedieRegressor', 'PassiveAggressiveClassifier',
        'RidgeClassifier', 'RidgeClassifierCV', 'SGDClassifier',
        'OneVsRestClassifier', 'OneVsOneClassifier', 'OutputCodeClassifier',
        'MultiOutputRegressor', 'RegressorChain', 'MultiOutputClassifier',
        'ClassifierChain', 'KNeighborsRegressor', 'RadiusNeighborsRegressor',
        'KNeighborsClassifier', 'RadiusNeighborsClassifier', 'MLPRegressor',
        'MLPClassifier', 'SelfTrainingClassifier', 'DecisionTreeRegressor',
        'ExtraTreeRegressor', 'DecisionTreeClassifier', 'ExtraTreeClassifier',
        'TransformedTargetRegressor', 'BayesianRidge', 'ElasticNet', 'Hinge',
        'Lars', 'LarsCV', 'Lasso', 'LassoCV', 'LassoLarsIC', 'Log',
        'ModifiedHuber', 'MultiTaskElasticNet', 'MultiTaskLasso',
        'MultiTaskLassoCV', 'OrthogonalMatchingPursuit',
        'OrthogonalMatchingPursuitCV', 'Perceptron', 'Ridge', 'RidgeCV',
        'SquaredLoss', 'SVR',

        # Sklearn extensions
        'ELMClassifier', 'ELMRegressor', 'GenELMClassifier', 'GenELMRegressor']
    """
    # Imported lazily so that importing this module does not require sklearn.
    from sklearn import (
        multioutput,
        linear_model,
        ensemble,
        tree,
        neighbors,
        gaussian_process,
    )

    X, y = get_inputs(data)

    # If string like 'LinearRegression', find a class with that name in
    # the common sklearn submodules (search order matters only if a name
    # exists in more than one submodule).
    if isinstance(model, str):
        for i in [linear_model, ensemble, tree, neighbors, gaussian_process]:
            if model in i.__all__:
                model = getattr(i, model)
                break

    # If model is still a string (not an object from sklearn), it was not
    # found there; it may be from the sklearn-extensions library.
    if isinstance(model, str):
        import sklearn_extensions.extreme_learning_machines.elm as elm

        model = getattr(elm, model)

    # Model defined by string not found.
    # NOTE(review): getattr above raises AttributeError itself when the
    # name is missing, so this branch appears unreachable unless the elm
    # attribute resolves to a string — confirm intent.
    if isinstance(model, str):
        raise AttributeError(
            mylogging.return_str(
                "You defined model that was not found in sklearn. You can use not only string, but also"
                "object or class itself. You can use function `get_all_models` to get list of all"
                "possible models and then use one of them."))

    # If a class (not an instance) was passed or resolved, instantiate it
    # with default constructor arguments.
    if callable(model):
        model = model()

    # All tunable values this function accepts, keyed by sklearn param name.
    params = {
        "n_estimators": n_estimators,
        "alpha": alpha,
        "alpha_1": alpha_1,
        "alpha_2": alpha_2,
        "lambda_1": lambda_1,
        "lambda_2": lambda_2,
        "n_iter": n_iter,
        "epsilon": epsilon,
        "alphas": alphas,
        "gcv_mode": gcv_mode,
        "solver": solver,
        "n_hidden": n_hidden,
        "rbf_width": rbf_width,
        "activation_func": activation_func,
    }

    # Keep only the params this particular estimator actually exposes.
    used_params = {
        i: j for (i, j) in params.items() if i in model.get_params()
    }
    model.set_params(**used_params)

    if y.shape[1] == 1:
        # Single target column: flatten y and tag the model accordingly.
        # NOTE(review): the direct assignment and the setattr below are
        # redundant duplicates — both set the same attribute.
        model.output_shape = "one_step"
        setattr(model, "output_shape", "one_step")

        y = y.ravel()

    else:
        # Multiple target columns: wrap in the matching multi-output
        # meta-estimator (one fitted estimator per column).
        if model._estimator_type == "regressor":
            model = multioutput.MultiOutputRegressor(model)
        elif model._estimator_type == "classifier":
            model = multioutput.MultiOutputClassifier(model)

        setattr(model, "output_shape", "multi_step")

    model.fit(X, y)

    return model
def model(self, **kwargs):
    """Build the parent's model and wrap it for multi-output classification."""
    base_estimator = super().model(**kwargs)
    return multioutput.MultiOutputClassifier(base_estimator)