def test_multi_output_classification():
    """Fit a MultiOutputClassifier over RandomForest and verify that
    predict/predict_proba agree with per-column single-output fits.
    """
    # test if multi_target initializes correctly with base estimator and fit
    # assert predictions work as expected for predict, predict_proba and score
    forest = RandomForestClassifier(n_estimators=10, random_state=1)
    multi_target_forest = MultiOutputClassifier(forest)
    # train the multi_target_forest and also get the predictions.
    multi_target_forest.fit(X, y)
    predictions = multi_target_forest.predict(X)
    assert_equal((n_samples, n_outputs), predictions.shape)
    # predict_proba returns one array per output column
    predict_proba = multi_target_forest.predict_proba(X)
    assert len(predict_proba) == n_outputs
    for class_probabilities in predict_proba:
        assert_equal((n_samples, n_classes), class_probabilities.shape)
    # argmax over the stacked probability arrays must reproduce the
    # hard class predictions
    assert_array_equal(np.argmax(np.dstack(predict_proba), axis=1),
                       predictions)
    # train the forest with each column and assert that predictions are equal
    for i in range(3):
        forest_ = clone(forest)  # create a clone with the same state
        forest_.fit(X, y[:, i])
        assert_equal(list(forest_.predict(X)), list(predictions[:, i]))
        assert_array_equal(list(forest_.predict_proba(X)),
                           list(predict_proba[i]))
def test_multiclass_multioutput_estimator_predict_proba(): seed = 542 # make test deterministic rng = np.random.RandomState(seed) # random features X = rng.normal(size=(5, 5)) # random labels y1 = np.array(['b', 'a', 'a', 'b', 'a']).reshape(5, 1) # 2 classes y2 = np.array(['d', 'e', 'f', 'e', 'd']).reshape(5, 1) # 3 classes Y = np.concatenate([y1, y2], axis=1) clf = MultiOutputClassifier(LogisticRegression(random_state=seed)) clf.fit(X, Y) y_result = clf.predict_proba(X) y_actual = [np.array([[0.23481764, 0.76518236], [0.67196072, 0.32803928], [0.54681448, 0.45318552], [0.34883923, 0.65116077], [0.73687069, 0.26312931]]), np.array([[0.5171785, 0.23878628, 0.24403522], [0.22141451, 0.64102704, 0.13755846], [0.16751315, 0.18256843, 0.64991843], [0.27357372, 0.55201592, 0.17441036], [0.65745193, 0.26062899, 0.08191907]])] for i in range(len(y_actual)): assert_almost_equal(y_result[i], y_actual[i])
def test_multi_output_predict_proba():
    """predict_proba must work when the underlying (grid-searched) estimator
    exposes it, and raise ValueError when the base estimator does not.
    """
    sgd_linear_clf = SGDClassifier(random_state=1, max_iter=5, tol=1e-3)
    param = {'loss': ('hinge', 'log', 'modified_huber')}

    # inner function for custom scoring: rewards estimators that expose
    # predict_proba so the grid search selects a probabilistic loss
    def custom_scorer(estimator, X, y):
        if hasattr(estimator, "predict_proba"):
            return 1.0
        else:
            return 0.0

    grid_clf = GridSearchCV(sgd_linear_clf, param_grid=param,
                            scoring=custom_scorer, cv=3,
                            error_score=np.nan)
    multi_target_linear = MultiOutputClassifier(grid_clf)
    multi_target_linear.fit(X, y)
    multi_target_linear.predict_proba(X)

    # SGDClassifier defaults to loss='hinge' which is not a probabilistic
    # loss function; therefore it does not expose a predict_proba method
    sgd_linear_clf = SGDClassifier(random_state=1, max_iter=5, tol=1e-3)
    multi_target_linear = MultiOutputClassifier(sgd_linear_clf)
    multi_target_linear.fit(X, y)
    err_msg = "The base estimator should implement predict_proba method"
    with pytest.raises(ValueError, match=err_msg):
        multi_target_linear.predict_proba(X)
def test_multi_output_classification_partial_fit_parallelism():
    """partial_fit with n_jobs=-1 must not mutate fitted estimators in place.

    A parallel joblib backend has to round-trip (pickle/clone) the fitted
    estimators on every ``partial_fit`` call, so the estimator object seen
    after the second call must be a different instance from the first.
    """
    sgd_linear_clf = SGDClassifier(loss='log', random_state=1)
    mor = MultiOutputClassifier(sgd_linear_clf, n_jobs=-1)
    mor.partial_fit(X, y, classes)
    est1 = mor.estimators_[0]
    mor.partial_fit(X, y)
    est2 = mor.estimators_[0]
    # parallelism requires this to be the case for a sane implementation
    # Fix: plain ``assert`` replaces the deprecated (and since-removed)
    # sklearn.utils.testing.assert_false helper.
    assert est1 is not est2
def test_multi_output_classification_partial_fit_parallelism():
    """With a parallel backend, partial_fit must yield fresh estimator objects."""
    base = SGDClassifier(loss='log', random_state=1, max_iter=5)
    multi = MultiOutputClassifier(base, n_jobs=4)
    multi.partial_fit(X, y, classes)
    first = multi.estimators_[0]
    multi.partial_fit(X, y)
    second = multi.estimators_[0]
    if cpu_count() > 1:
        # On a multi-core machine the joblib round-trip must replace the
        # fitted estimator rather than mutate it in place.
        assert first is not second
def test_multi_output_exceptions():
    """Unfitted calls raise NotFittedError; mismatched output counts raise
    ValueError on score.
    """
    # NotFittedError when fit is not done but score, predict and
    # predict_proba are called
    moc = MultiOutputClassifier(LinearSVC(random_state=0))
    assert_raises(NotFittedError, moc.predict, y)
    assert_raises(NotFittedError, moc.predict_proba, y)
    assert_raises(NotFittedError, moc.score, X, y)
    # ValueError when number of outputs is different
    # for fit and score
    # NOTE(review): y1/y2 come from module scope — presumably single-column
    # label arrays; verify against the fixture definitions.
    y_new = np.column_stack((y1, y2))
    moc.fit(X, y)
    assert_raises(ValueError, moc.score, X, y_new)
def test_multiclass_multioutput_estimator():
    """MultiOutputClassifier wrapping OneVsRestClassifier (meta-of-meta)."""
    base_svc = LinearSVC(random_state=0)
    ovr = OneVsRestClassifier(base_svc)
    target = MultiOutputClassifier(ovr)
    target.fit(X, y)
    preds = target.predict(X)
    assert_equal((n_samples, n_outputs), preds.shape)
    # A clone fitted on each label column must match the joint predictions.
    for col in range(3):
        single = clone(ovr)
        single.fit(X, y[:, col])
        assert_equal(list(single.predict(X)), list(preds[:, col]))
def test_multi_output_classification_partial_fit():
    """Two-stage partial_fit on MultiOutputClassifier must match per-column
    partial_fit of the base estimator at both stages.
    """
    # test if multi_target initializes correctly with base estimator and fit
    # assert predictions work as expected for predict
    sgd_linear_clf = SGDClassifier(loss='log', random_state=1)
    multi_target_linear = MultiOutputClassifier(sgd_linear_clf)
    # train the multi_target_linear and also get the predictions.
    half_index = X.shape[0] // 2
    # first half: classes must be supplied on the first partial_fit call
    multi_target_linear.partial_fit(
        X[:half_index], y[:half_index], classes=classes)
    first_predictions = multi_target_linear.predict(X)
    assert_equal((n_samples, n_outputs), first_predictions.shape)
    # second half: incremental update without classes
    multi_target_linear.partial_fit(X[half_index:], y[half_index:])
    second_predictions = multi_target_linear.predict(X)
    assert_equal((n_samples, n_outputs), second_predictions.shape)
    # train the linear classification with each column and assert that
    # predictions are equal after first partial_fit and second partial_fit
    for i in range(3):
        # create a clone with the same state
        sgd_linear_clf = clone(sgd_linear_clf)
        sgd_linear_clf.partial_fit(
            X[:half_index], y[:half_index, i], classes=classes[i])
        assert_array_equal(sgd_linear_clf.predict(X), first_predictions[:, i])
        sgd_linear_clf.partial_fit(X[half_index:], y[half_index:, i])
        assert_array_equal(sgd_linear_clf.predict(X), second_predictions[:, i])
def test_multi_output_classification_partial_fit_sample_weights():
    """A weight-2 sample must be equivalent to duplicating that sample."""
    # Fit with explicit sample weights (first sample counted twice).
    Xw = [[1, 2, 3], [4, 5, 6], [1.5, 2.5, 3.5]]
    yw = [[3, 2], [2, 3], [3, 2]]
    weights = np.asarray([2., 1., 1.])
    clf_weighted = MultiOutputClassifier(SGDClassifier(random_state=1))
    clf_weighted.fit(Xw, yw, weights)
    # Equivalent unweighted data: physically repeat the weighted sample.
    X_rep = [[1, 2, 3], [1, 2, 3], [4, 5, 6], [1.5, 2.5, 3.5]]
    y_rep = [[3, 2], [3, 2], [2, 3], [3, 2]]
    clf_repeated = MultiOutputClassifier(SGDClassifier(random_state=1))
    clf_repeated.fit(X_rep, y_rep)
    X_test = [[1.5, 2.5, 3.5]]
    assert_array_almost_equal(clf_repeated.predict(X_test),
                              clf_weighted.predict(X_test))
def test_multi_output_classification_sample_weights():
    """Weighted forest fit must match a fit on correspondingly repeated rows."""
    # Weighted variant: first row carries weight 2.
    Xw = [[1, 2, 3], [4, 5, 6]]
    yw = [[3, 2], [2, 3]]
    weights = np.asarray([2., 1.])
    clf_weighted = MultiOutputClassifier(
        RandomForestClassifier(n_estimators=10, random_state=1))
    clf_weighted.fit(Xw, yw, weights)
    # Unweighted variant with the heavy row duplicated.
    X_rep = [[1, 2, 3], [1, 2, 3], [4, 5, 6]]
    y_rep = [[3, 2], [3, 2], [2, 3]]
    clf_repeated = MultiOutputClassifier(
        RandomForestClassifier(n_estimators=10, random_state=1))
    clf_repeated.fit(X_rep, y_rep)
    X_test = [[1.5, 2.5, 3.5], [3.5, 4.5, 5.5]]
    assert_almost_equal(clf_repeated.predict(X_test),
                        clf_weighted.predict(X_test))
def build_model():
    """Create the grid-searched text-classification model.

    Returns:
        GridSearchCV over a CountVectorizer -> TfidfTransformer ->
        MultiOutputClassifier(RandomForestClassifier) pipeline.
    """
    search_space = {
        'clf__estimator__n_estimators': [50],
        'clf__estimator__min_samples_split': [2],
        'clf__estimator__n_jobs': [1]
    }
    steps = [
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', MultiOutputClassifier(
            RandomForestClassifier(n_estimators=50, min_samples_split=2,
                                   n_jobs=1)))
    ]
    return GridSearchCV(Pipeline(steps), param_grid=search_space)
def build_model():
    """Build a tf-idf + random-forest model tuned with GridSearchCV.

    Returns:
        GridSearchCV wrapping the text-classification pipeline.
    """
    steps = [
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', MultiOutputClassifier(RandomForestClassifier()))
    ]
    # Only vocabulary size is searched; other candidates (max_depth,
    # n_estimators, max_df) were left out for runtime reasons.
    grid = {
        'vect__max_features': (5000, 10000, 50000)
    }
    return GridSearchCV(Pipeline(steps), param_grid=grid, verbose=1)
def build_model(grid_search=False):
    """Build the CountVectorizer/Tfidf/MultiOutput classification pipeline.

    Args:
        grid_search: when True, wrap the pipeline in a GridSearchCV that
            tunes the random-forest hyperparameters.

    Returns:
        A Pipeline, or a GridSearchCV when ``grid_search`` is True.
    """
    estimator = RandomForestClassifier(n_jobs=-1, min_samples_split=2,
                                       bootstrap=True, n_estimators=100)
    pipe = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', MultiOutputClassifier(estimator))
    ])
    if not grid_search:
        return pipe
    tuning = {
        'clf__estimator__min_samples_split': [2, 3, 4],
        'clf__estimator__bootstrap': [True],
        'clf__estimator__n_estimators': [100, 200, 300]
    }
    return GridSearchCV(pipe, tuning, n_jobs=-1)
def split_n_train(X, Y):
    """Split the data 60/40 and train an AdaBoost multi-output pipeline.

    Args:
        X: array of message texts to classify.
        Y: dataframe of multi-label category targets.

    Returns:
        (fitted pipeline, X_train, X_test, y_train, y_test)
    """
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.4)
    model = Pipeline([
        ('features', FeatureUnion([
            ('text_pipeline', Pipeline([
                ('count_vectorizer', CountVectorizer(tokenizer=tokenize)),
                ('tfidf_transformer', TfidfTransformer())
            ]))
        ])),
        ('classifier', MultiOutputClassifier(AdaBoostClassifier()))
    ])
    # Time the fit in minutes for a quick progress report.
    start = time.time() / 60
    fitted = model.fit(X_train, y_train)
    stop = time.time() / 60
    print(f"Model calculation time: {round(stop - start)} minutes")
    return fitted, X_train, X_test, y_train, y_test
def build_model():
    """Build a grid-searched tf-idf + one-vs-rest LinearSVC classifier.

    Returns:
        GridSearchCV over the text-classification pipeline.
    """
    pipe = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', MultiOutputClassifier(
            OneVsRestClassifier(LinearSVC(random_state=0))))
    ])
    # Nested estimator => double 'estimator__' prefix to reach LinearSVC.C
    grid = {
        'tfidf__smooth_idf': [True, False],
        'clf__estimator__estimator__C': [1, 2, 5]
    }
    return GridSearchCV(pipe, param_grid=grid, cv=5)
def build_model():
    """Build a grid-searched AdaBoost multi-output pipeline.

    The feature union combines a bag-of-words/tf-idf branch with a
    starting-verb indicator feature.

    Returns:
        GridSearchCV scored with an f1-based scorer.
    """
    text_branch = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer())
    ])
    pipe = Pipeline([
        ('features', FeatureUnion([
            ('text_pipeline', text_branch),
            ('starting_verb', StartingVerbExtractor())
        ])),
        ('clf', MultiOutputClassifier(AdaBoostClassifier()))
    ])
    grid = {
        'features__text_pipeline__vect__ngram_range': ((1, 1), (1, 2)),
        'features__text_pipeline__vect__max_df': (0.5, 0.75),
        'features__text_pipeline__vect__max_features': (None, 1000),
        'features__text_pipeline__tfidf__use_idf': (True, False)
    }
    f1_scorer = make_scorer(f1_score, greater_is_better=True)
    return GridSearchCV(pipe, grid, scoring=f1_scorer, verbose=2, n_jobs=-1)
def build_model():
    """Build a grid-searched random-forest model over text + verb features.

    Returns:
        GridSearchCV over the feature-union pipeline.
    """
    pipe = Pipeline([
        ('features', FeatureUnion([
            ('text_pipeline', Pipeline([
                ('vect', CountVectorizer(tokenizer=tokenize)),
                ('tfidf', TfidfTransformer())
            ])),
            ('starting_verb', StartingVerbExtractor())
        ])),
        ('clf', MultiOutputClassifier(RandomForestClassifier()))
    ])
    # Classifier hyperparameters are left at defaults; the search focuses on
    # the vectorizer and on how the two feature branches are weighted.
    grid = {
        'features__text_pipeline__vect__ngram_range': ((1, 1), (1, 2)),
        'features__text_pipeline__vect__max_df': (0.5, 1.0),
        'features__text_pipeline__vect__max_features': (None, 5000),
        'features__text_pipeline__tfidf__use_idf': (True, False),
        'features__transformer_weights': (
            {'text_pipeline': 1, 'starting_verb': 0.5},
            {'text_pipeline': 0.5, 'starting_verb': 1},
            {'text_pipeline': 0.8, 'starting_verb': 1},
        )
    }
    return GridSearchCV(pipe, param_grid=grid)
def build_model():
    """Build a grid-searched KNN multi-output text classifier.

    Returns:
        cv (GridSearchCV): grid-search wrapper around the
        CountVectorizer/Tfidf/KNeighbors pipeline, ready to be trained.
    """
    pipeline = Pipeline([('vectorizer', CountVectorizer(tokenizer=tokenize,
                                                        max_df=0.5,
                                                        max_features=10000)),
                         ('tfidf', TfidfTransformer()),
                         ('clf', MultiOutputClassifier(KNeighborsClassifier(),
                                                       n_jobs=-1))])
    # Fix: the original had an accidental double assignment
    # (`parameters = parameters = {...}`); reduced grid kept for speed.
    parameters = {
        'clf__estimator__leaf_size': [20, 30],
        'clf__estimator__n_neighbors': [4, 5]
    }
    cv = GridSearchCV(pipeline, param_grid=parameters)
    return cv
def build_model():
    """Build a grid-searched dual-tf-idf + LinearSVC pipeline.

    Returns:
        GridSearchCV scored on macro-averaged recall.
    """
    pipe = Pipeline([
        ('features', FeatureUnion([
            ('tfidf', TfidfVectorizer(tokenizer=tokenize)),
            ('pos_tfidf', TfidfVectorizer(tokenizer=pos_tokenizer))
        ])),
        ('mult_clf', MultiOutputClassifier(
            LinearSVC(loss='hinge', class_weight='balanced', max_iter=10000)))
    ])
    # A single parameter keeps the demonstration search quick; the wider
    # grid (max_features, ngram ranges, loss) took about 3 hours.
    grid = {'mult_clf__estimator__C': [0.1, 1.0, 2.0]}
    recall_scorer = make_scorer(recall_score, average='macro')
    return GridSearchCV(pipe, grid, cv=5, scoring=recall_scorer)
def build_model():
    """Build a grid-searched bag-of-words/tf-idf/random-forest classifier.

    Returns:
        GridSearchCV over the pipeline hyperparameters.
    """
    pipe = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer(smooth_idf=True)),
        ('clf', MultiOutputClassifier(RandomForestClassifier(random_state=42)))
    ])
    grid = {
        'clf__estimator__criterion': ['gini', 'entropy'],
        'clf__estimator__warm_start': [True, False],
        'clf__estimator__n_estimators': [50, 100, 200],
        'clf__estimator__min_samples_split': [0.2, 0.3],
        'vect__ngram_range': [(1, 1), (1, 2)],
    }
    return GridSearchCV(pipe, param_grid=grid, verbose=2)
def build_model():
    """Build a grid-searched one-vs-rest LinearSVC text classifier.

    Returns:
        GridSearchCV over vectorizer and SVC hyperparameters.
    """
    pipe = Pipeline([
        ('features', FeatureUnion([
            ('text_pipeline', Pipeline([
                ('vect', CountVectorizer(tokenizer=tokenize)),
                ('tfidf', TfidfTransformer())
            ]))
        ])),
        ('clf', MultiOutputClassifier(
            OneVsRestClassifier(LinearSVC(random_state=0))))
    ])
    # Triple 'estimator' prefix reaches the LinearSVC through both wrappers.
    grid = {
        'features__text_pipeline__vect__max_df': (0.5, 0.75, 1.0),
        'features__text_pipeline__tfidf__use_idf': (True, False),
        'clf__estimator__estimator__loss': ['hinge', 'squared_hinge'],
        'clf__estimator__estimator__multi_class': ['ovr', 'crammer_singer'],
        'clf__estimator__estimator__max_iter': [1000, 2000, 5000]
    }
    return GridSearchCV(pipe, param_grid=grid, cv=3, verbose=1, n_jobs=-1)
def build_model():
    """Build a grid-searched logistic-regression text classifier.

    Returns:
        GridSearchCV that transforms the data and tunes the model with a
        custom performance-metric scorer.
    """
    pipe = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize, min_df=1)),
        ('tfidf', TfidfTransformer(use_idf=False)),
        ('clf', MultiOutputClassifier(LogisticRegression())),
    ])
    grid = {
        'vect__min_df': [1],
        'tfidf__use_idf': [False],
        'clf__estimator__random_state': [25],
        'clf__estimator__C': [1.0, 10],
        'clf__estimator__penalty': ["l1", "l2"],
        # liblinear supports both l1 and l2 penalties
        'clf__estimator__solver': ['liblinear'],
    }
    metric_scorer = make_scorer(performance_metric)
    return GridSearchCV(pipe, param_grid=grid, scoring=metric_scorer,
                        verbose=10, n_jobs=1)
def build_model():
    """Build a grid-searched random-forest text classifier.

    Returns:
        GridSearchCV over the leaf/split parameters of the forest.
    """
    base_forest = RandomForestClassifier(n_estimators=10)
    pipe = Pipeline([
        ("vect", CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ("clf", MultiOutputClassifier(base_forest)),
    ])
    # Wider vectorizer/forest grids were considered but trimmed for runtime.
    grid = {
        'clf__estimator__min_samples_leaf': [2, 4, 8],
        'clf__estimator__min_samples_split': [2, 4, 8],
    }
    return GridSearchCV(pipe, param_grid=grid, cv=5)
def build_model():
    """Build the optimized disaster-message classification model.

    A tokenized bag-of-words/tf-idf/random-forest pipeline is wrapped in a
    GridSearchCV; extra hyperparameters can be re-enabled in the grid below.

    Returns:
        model (GridSearchCV): optimized model ready to be fitted.
    """
    pipe = Pipeline([('vect', CountVectorizer(tokenizer=tokenize)),
                     ('tfidf', TfidfTransformer()),
                     ('clf', MultiOutputClassifier(
                         RandomForestClassifier(n_estimators=100,
                                                random_state=1)))])
    grid = {
        'vect__max_features': (None, 7500),
        'clf__estimator__n_estimators': [50, 150, 500],
    }
    return GridSearchCV(pipe, param_grid=grid, cv=5)
def build_model():
    """Build the classification model, optionally wrapped in GridSearchCV.

    The user is prompted on stdin whether to run a grid search; any other
    answer re-prompts until a valid choice is given.

    Returns:
        A GridSearchCV (answer yes) or the bare Pipeline (answer no).
    """
    # initial basic model
    forest = RandomForestClassifier(random_state=42, n_jobs=4)
    # create pipeline
    pipeline = Pipeline([("text_pipeline",
                          Pipeline([("vect", CountVectorizer(tokenizer=tokenize)),
                                    ("tfidf", TfidfTransformer())])),
                         ("clf", MultiOutputClassifier(forest, n_jobs=4))])
    # define parameters
    params = {
        "text_pipeline__vect__max_features": (5000, 10000),
        "clf__estimator__n_estimators": [50, 100, 150],
        "clf__estimator__criterion": ["gini", "entropy"],
        "clf__estimator__max_depth": [4, 6, 10],
    }
    print("Hint: GridSearchCV will takes more time!\n")
    # Fix: the original read the answer only once before the loop, so an
    # invalid answer looped forever; re-prompt inside the loop instead.
    while True:
        chose_option = input("Choose the GridSearchCV(Yes or No): ").lower()
        if chose_option in ["yes", "y"]:
            return GridSearchCV(pipeline, param_grid=params, cv=3)
        if chose_option in ["no", "n"]:
            return pipeline
        print("Choose a validate option!")
def build_model():
    """Build the ML model with GridSearchCV.

    The pipeline takes the message column as input and outputs
    classification results for the other 36 categories.

    Returns:
        GridSearchCV-wrapped model, ready to be trained.
    """
    pipe = Pipeline([('vect', CountVectorizer(tokenizer=tokenize)),
                     ('tfidf', TfidfTransformer()),
                     ('clf', MultiOutputClassifier(
                         OneVsRestClassifier(LinearSVC())))])
    grid = {'vect__ngram_range': ((1, 1), (1, 2)),
            'vect__max_df': (0.75, 1.0)}
    return GridSearchCV(estimator=pipe, param_grid=grid, verbose=3, cv=3)
def build_model():
    """Build a grid-searched NLP pipeline with a feature union.

    Combines a bag-of-words/tf-idf branch with a starting-verb feature,
    feeding a random-forest multi-output classifier.

    Returns:
        GridSearchCV over the forest depth/leaf parameters.
    """
    pipe = Pipeline([
        ('features', FeatureUnion([
            ('text_pipeline', Pipeline([
                ('vect', CountVectorizer(tokenizer=tokenize)),
                ('tfidf', TfidfTransformer())
            ])),
            ('starting_verb', StartingVerbExtractor())
        ])),
        ('clf', MultiOutputClassifier(RandomForestClassifier()))
    ])
    grid = {
        'clf__estimator__max_depth': [10, 50, None],
        'clf__estimator__min_samples_leaf': [2, 5, 10]
    }
    return GridSearchCV(pipe, grid)
def build_model():
    """Build the message-classification model.

    Returns:
        GridSearchCV over the vectorizer and tf-idf settings of a
        random-forest multi-output pipeline.
    """
    steps = [('vect', CountVectorizer(tokenizer=tokenize)),
             ('tfidf', TfidfTransformer()),
             ('clf', MultiOutputClassifier(RandomForestClassifier()))]
    grid = {
        'vect__ngram_range': ((1, 1), (1, 2)),
        'tfidf__use_idf': [True, False],
        'tfidf__norm': ['l1', 'l2']
    }
    return GridSearchCV(Pipeline(steps), param_grid=grid)
def build_model():
    """Build a grid-searched random-forest ML model.

    Returns:
        GridSearchCV over tf-idf and forest-size options.
    """
    pipe = Pipeline([
        ('count_vectorizer', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', MultiOutputClassifier(RandomForestClassifier(n_jobs=-1)))
    ])
    grid = {
        'tfidf__use_idf': [True, False],
        'clf__estimator__n_estimators': [10, 20]
    }
    return GridSearchCV(pipe, grid, verbose=True)
def build_model():
    """Build a machine-learning pipeline tuned with grid search.

    Returns:
        GridSearchCV over the random-forest hyperparameters.
    """
    steps = [('vect', CountVectorizer(tokenizer=tokenize)),
             ('tfidf', TfidfTransformer()),
             ('clf', MultiOutputClassifier(RandomForestClassifier()))]
    grid = {
        'clf__estimator__n_estimators': [50, 100],
        'clf__estimator__min_samples_split': [2, 3, 4],
    }
    return GridSearchCV(Pipeline(steps), param_grid=grid, n_jobs=-1)
def build_model():
    """Build and tune the classification pipeline with GridSearchCV.

    Returns:
        GridSearchCV over forest size and depth.
    """
    steps = [('vect', CountVectorizer(tokenizer=tokenize)),
             ('tfidf', TfidfTransformer()),
             ('clf', MultiOutputClassifier(RandomForestClassifier()))]
    grid = {
        'clf__estimator__n_estimators': [10, 50, 100],
        'clf__estimator__max_depth': [None, 10, 50]
    }
    return GridSearchCV(Pipeline(steps), param_grid=grid)
def build_model():
    """Build the random-forest ML pipeline wrapped in a grid search.

    Only a few parameters are searched here; the full grid (see the
    initial-preps notebook in the repo) was too slow and produced a huge
    pickle.

    Returns:
        GridSearchCV over the pipeline.
    """
    base_pipe = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', MultiOutputClassifier(RandomForestClassifier()))
    ])
    grid = {
        'clf__estimator__n_estimators': [100],
        'clf__estimator__min_samples_split': [2],
        'tfidf__smooth_idf': [True, False]
    }
    return GridSearchCV(base_pipe, param_grid=grid, verbose=2, n_jobs=3)
def build_model():
    """Build a grid-searched one-vs-rest LinearSVC model.

    Returns:
        GridSearchCV over the vectorizer hyperparameters.
    """
    pipe = Pipeline([('vect', CountVectorizer(tokenizer=tokenize)),
                     ('tfidf', TfidfTransformer()),
                     ('clf', MultiOutputClassifier(
                         OneVsRestClassifier(LinearSVC())))])
    grid = {'vect__ngram_range': ((1, 1), (1, 2)),
            'vect__max_df': (0.75, 1.0)}
    return GridSearchCV(estimator=pipe, param_grid=grid, verbose=3, cv=3)
def build_model():
    """Define sklearn pipeline and parameters.

    The param grid is a list of three dicts, each replacing the 'clf' step
    with a different classifier family (decision tree, linear SVC, MLP)
    while sharing the same vectorizer search space.
    """
    pipeline = Pipeline([('features', FeatureUnion([
        ('text_pipeline', Pipeline([
            ('vect', CountVectorizer(tokenizer=tokenize)),
            ('tfidf', TfidfTransformer())])),
        ('starting_verb', StartingVerbExtractor())
    ])),
        ('clf', DecisionTreeClassifier())])
    parameters = [{
        'features__text_pipeline__vect__max_df': (0.5, 1.0),
        'features__text_pipeline__vect__min_df': (1, 0.01),
        'features__text_pipeline__vect__max_features': (None, 5000),
        'features__text_pipeline__tfidf__use_idf': (True, False),
        # candidate 1: decision tree with tuned depth
        'clf': (DecisionTreeClassifier(min_samples_split=3), ),
        'clf__max_depth': (None, 4)
    }, {
        'features__text_pipeline__vect__max_df': (0.5, 1.0),
        'features__text_pipeline__vect__min_df': (1, 0.01),
        'features__text_pipeline__vect__max_features': (None, 5000),
        'features__text_pipeline__tfidf__use_idf': (True, False),
        # candidate 2: one-vs-rest linear SVC per output
        'clf': (MultiOutputClassifier(LinearSVC(multi_class='ovr')), )
    }, {
        'features__text_pipeline__vect__max_df': (0.5, 1.0),
        'features__text_pipeline__vect__min_df': (1, 0.01),
        'features__text_pipeline__vect__max_features': (None, 5000),
        'features__text_pipeline__tfidf__use_idf': (True, False),
        # candidate 3: MLP with tuned hidden-layer shapes
        'clf': (MLPClassifier(), ),
        'clf__hidden_layer_sizes': ((100, 10), (50, ), (50, 10))
    }]
    cv = GridSearchCV(pipeline, parameters, cv=3, n_jobs=4, verbose=10)
    return cv
def build_model():
    """Build the GridSearchCV model via a pipeline.

    Returns:
        cv (GridSearchCV): grid-search wrapper around the
        CountVectorizer/Tfidf/MultiOutput random-forest pipeline.
    """
    # define Pipeline
    pipeline = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', MultiOutputClassifier(RandomForestClassifier()))])
    # Fix: the original dict listed 'clf__estimator__min_samples_leaf' twice
    # ([50, 100, 200] then [2, 3, 4]); Python keeps only the last duplicate
    # key, so the first candidate list was silently discarded. Keep the
    # effective entry and drop the dead one.
    parameters = {
        'clf__estimator__min_samples_leaf': [2, 3, 4],
    }
    cv = GridSearchCV(pipeline, param_grid=parameters, n_jobs=4)
    return cv
def build_model():
    """Build the pipeline and wrap it in a grid search.

    Returns:
        cv: GridSearchCV over the random-forest hyperparameters.
    """
    pipe = Pipeline([
        ('features', FeatureUnion([
            ('text_pipeline', Pipeline([
                ('vect', CountVectorizer(tokenizer=tokenize)),
                ('tfidf', TfidfTransformer(smooth_idf=False)),
            ])),
            ('Length', Length())
        ])),
        ('clf', MultiOutputClassifier(RandomForestClassifier()))
    ])
    grid = {
        'clf__estimator__n_estimators': [50, 100],
        'clf__estimator__min_samples_split': [2, 3],
    }
    return GridSearchCV(pipe, param_grid=grid, cv=2)
def build_model():
    """Build a grid-searched LightGBM multi-output text classifier.

    Returns:
        GridSearchCV over vectorizer and bagging options.
    """
    steps = [('vect', CountVectorizer(tokenizer=tokenize)),
             ('tfidf', TfidfTransformer()),
             ('clf', MultiOutputClassifier(lgb.LGBMClassifier()))]
    grid = {
        'vect__ngram_range': [(1, 1), (1, 2)],
        'vect__max_df': [1.0, 0.7],
        'clf__estimator__bagging_fraction': [1.0, 0.8]
    }
    return GridSearchCV(Pipeline(steps), param_grid=grid, verbose=3)
def build_model():
    """Build a grid-searched AdaBoost pipeline with text + verb features.

    Returns:
        GridSearchCV over the vectorizer/tf-idf hyperparameters.
    """
    pipe = Pipeline([
        ('features', FeatureUnion([
            ('text_pipeline', Pipeline([
                ('vect', CountVectorizer(tokenizer=tokenize)),
                ('tfidf', TfidfTransformer())
            ])),
            ('verb_extractor', VerbExtractor())
        ])),
        ('clf', MultiOutputClassifier(estimator=AdaBoostClassifier(),
                                      n_jobs=-1))
    ])
    grid = {
        'features__text_pipeline__vect__ngram_range': ((1, 1), (1, 2)),
        'features__text_pipeline__vect__max_df': (0.75, 1.0),
        'features__text_pipeline__vect__max_features': (None, 5000),
        'features__text_pipeline__tfidf__use_idf': (True, False),
    }
    return GridSearchCV(pipe, param_grid=grid, n_jobs=-1)
def build_model(X_train, y_train):
    '''
    INPUT
    X_train: Training features for use in GridSearchCV
    y_train: Training labels for use in GridSearchCV
    OUTPUT
    Returns a fitted GridSearchCV over a tokenize/CountVectorizer/
    TfidfTransformer/random-forest pipeline
    '''
    # create pipeline
    pipeline = Pipeline([('vect', CountVectorizer(tokenizer=tokenize)),
                         ('tfidf', TfidfTransformer()),
                         ('clf', MultiOutputClassifier(RandomForestClassifier()))])
    # set parameters
    parameters = {
        'clf__estimator__min_samples_split': [2, 4],
        # Fix: the pipeline steps are named 'vect' and 'tfidf' — the original
        # key 'tfidf_vect__max_df' matches no step and would make
        # GridSearchCV raise "Invalid parameter" at fit time.
        'vect__max_df': (0.75, 1.0),
        #'clf__estimator__n_estimators': [10, 25],
    }
    # create GridSearchCV and fit it
    cv = GridSearchCV(estimator=pipeline, param_grid=parameters)
    cv.fit(X_train, y_train)
    return cv
def build_model():
    """Build a grid-search object over a feature-union pipeline.

    Returns:
        cv: GridSearchCV ready to be fitted.
    """
    pipe = Pipeline([
        ('features', FeatureUnion([
            ('text_pipeline', Pipeline([
                ('vect', CountVectorizer(tokenizer=tokenize)),
                ('tfidf', TfidfTransformer())
            ])),
            ('starting_verb', StartingVerbExtractor())
        ])),
        ('clf', MultiOutputClassifier(RandomForestClassifier()))
    ])
    # only the vectorizer document-frequency cutoff is searched
    grid = {'features__text_pipeline__vect__max_df': (0.5, 0.75)}
    return GridSearchCV(pipe, param_grid=grid)
mfcc_audio[columns] = (mfcc_audio[columns] >= 0.0).astype(np.int32) audios = np.unique(mfcc_audio["Audio"]) train_audio, test_audio = train_test_split( audios, train_size=0.7, test_size=0.3, random_state=0) X_train = mfcc_audio[mfcc_audio["Audio"].isin(train_audio)] X_test = mfcc_audio[mfcc_audio["Audio"].isin(test_audio)] y_train = X_train[columns] y_test = X_test[columns] X_train.drop(columns + ["Audio"], inplace=True, axis=1) X_test.drop(columns + ["Audio"], inplace=True, axis=1) mor = MultiOutputClassifier( RandomForestClassifier(random_state=0, n_estimators=1000), n_jobs=-1) mor.fit(X_train, y_train) mor_pred = mor.predict(X_test) dummy = DummyClassifier() dummy.fit(X_train, y_train) dummy_pred = dummy.predict(X_test) estimators = mor.estimators_ for i, col in enumerate(columns): true = y_test[col] pred = mor_pred[:, i] d_p = dummy_pred[:, i]