def test_multi_output_classification():
    # test if multi_target initializes correctly with base estimator and fit
    # assert predictions work as expected for predict, predict_proba and score
    # NOTE(review): relies on module-level fixtures X, y, n_samples,
    # n_outputs and n_classes defined elsewhere in this file.

    forest = RandomForestClassifier(n_estimators=10, random_state=1)
    multi_target_forest = MultiOutputClassifier(forest)

    # train the multi_target_forest and also get the predictions.
    multi_target_forest.fit(X, y)

    predictions = multi_target_forest.predict(X)
    assert_equal((n_samples, n_outputs), predictions.shape)

    predict_proba = multi_target_forest.predict_proba(X)

    # predict_proba returns one (n_samples, n_classes) array per output
    assert len(predict_proba) == n_outputs
    for class_probabilities in predict_proba:
        assert_equal((n_samples, n_classes), class_probabilities.shape)

    # argmax over the stacked per-output probabilities must agree with
    # predict(); presumably class labels are 0..n_classes-1 — confirm fixture
    assert_array_equal(np.argmax(np.dstack(predict_proba), axis=1),
                       predictions)

    # train the forest with each column and assert that predictions are equal
    for i in range(3):
        forest_ = clone(forest)  # create a clone with the same state
        forest_.fit(X, y[:, i])
        assert_equal(list(forest_.predict(X)), list(predictions[:, i]))
        assert_array_equal(list(forest_.predict_proba(X)),
                           list(predict_proba[i]))
def test_multiclass_multioutput_estimator_predict_proba():
    """Check MultiOutputClassifier.predict_proba against hard-coded
    reference probabilities for one binary and one 3-class output."""
    seed = 542

    # make test deterministic
    rng = np.random.RandomState(seed)

    # random features
    X = rng.normal(size=(5, 5))

    # random labels
    y1 = np.array(['b', 'a', 'a', 'b', 'a']).reshape(5, 1)  # 2 classes
    y2 = np.array(['d', 'e', 'f', 'e', 'd']).reshape(5, 1)  # 3 classes

    Y = np.concatenate([y1, y2], axis=1)

    clf = MultiOutputClassifier(LogisticRegression(random_state=seed))

    clf.fit(X, Y)

    y_result = clf.predict_proba(X)
    # Reference values; presumably generated with a specific sklearn
    # version's LogisticRegression defaults — verify if defaults change.
    y_actual = [np.array([[0.23481764, 0.76518236],
                          [0.67196072, 0.32803928],
                          [0.54681448, 0.45318552],
                          [0.34883923, 0.65116077],
                          [0.73687069, 0.26312931]]),
                np.array([[0.5171785, 0.23878628, 0.24403522],
                          [0.22141451, 0.64102704, 0.13755846],
                          [0.16751315, 0.18256843, 0.64991843],
                          [0.27357372, 0.55201592, 0.17441036],
                          [0.65745193, 0.26062899, 0.08191907]])]

    # one probability array per output column
    for i in range(len(y_actual)):
        assert_almost_equal(y_result[i], y_actual[i])
def test_multi_output_predict_proba():
    # GridSearchCV-wrapped SGD: predict_proba must be delegated when the
    # selected loss supports it, and raise ValueError otherwise.
    sgd_linear_clf = SGDClassifier(random_state=1, max_iter=5, tol=1e-3)
    param = {'loss': ('hinge', 'log', 'modified_huber')}

    # inner function for custom scoring
    def custom_scorer(estimator, X, y):
        # score 1.0 iff the fitted estimator exposes predict_proba, so the
        # grid search is forced to pick a probabilistic loss
        if hasattr(estimator, "predict_proba"):
            return 1.0
        else:
            return 0.0
    grid_clf = GridSearchCV(sgd_linear_clf, param_grid=param,
                            scoring=custom_scorer, cv=3, error_score=np.nan)
    multi_target_linear = MultiOutputClassifier(grid_clf)
    multi_target_linear.fit(X, y)

    # must not raise: the best estimator supports predict_proba
    multi_target_linear.predict_proba(X)

    # SGDClassifier defaults to loss='hinge' which is not a probabilistic
    # loss function; therefore it does not expose a predict_proba method
    sgd_linear_clf = SGDClassifier(random_state=1, max_iter=5, tol=1e-3)
    multi_target_linear = MultiOutputClassifier(sgd_linear_clf)
    multi_target_linear.fit(X, y)
    err_msg = "The base estimator should implement predict_proba method"
    with pytest.raises(ValueError, match=err_msg):
        multi_target_linear.predict_proba(X)
def test_multi_output_classification_partial_fit_parallelism():
    """partial_fit with n_jobs=-1 must not reuse estimator objects across
    calls (the parallel backend returns fresh clones from the workers).

    NOTE(review): this definition is shadowed by the function of the same
    name defined immediately below, so only the later one is collected.
    """
    sgd_linear_clf = SGDClassifier(loss='log', random_state=1)
    mor = MultiOutputClassifier(sgd_linear_clf, n_jobs=-1)
    mor.partial_fit(X, y, classes)
    est1 = mor.estimators_[0]
    mor.partial_fit(X, y)
    est2 = mor.estimators_[0]
    # parallelism requires this to be the case for a sane implementation
    # FIX: `assert_false` was deprecated and removed from
    # sklearn.utils.testing; use a plain assert, matching the sibling test.
    assert est1 is not est2
def test_multi_output_classification_partial_fit_parallelism():
    # NOTE(review): duplicate of the test above with the same name; this
    # later definition shadows the earlier one at import time, so pytest
    # only runs this version.
    sgd_linear_clf = SGDClassifier(loss='log', random_state=1, max_iter=5)
    mor = MultiOutputClassifier(sgd_linear_clf, n_jobs=4)
    mor.partial_fit(X, y, classes)
    est1 = mor.estimators_[0]
    mor.partial_fit(X, y)
    est2 = mor.estimators_[0]
    # only meaningful when joblib actually parallelizes (multiple cores)
    if cpu_count() > 1:
        # parallelism requires this to be the case for a sane implementation
        assert est1 is not est2
def test_multi_output_exceptions():
    """Unfitted MultiOutputClassifier raises NotFittedError; scoring with a
    mismatched number of outputs raises ValueError."""
    moc = MultiOutputClassifier(LinearSVC(random_state=0))

    # Every prediction API must refuse to run before fit().
    for unfitted_call, args in ((moc.predict, (y,)),
                                (moc.predict_proba, (y,)),
                                (moc.score, (X, y))):
        assert_raises(NotFittedError, unfitted_call, *args)

    # After fitting, scoring against a target with a different number of
    # output columns must fail.
    y_new = np.column_stack((y1, y2))
    moc.fit(X, y)
    assert_raises(ValueError, moc.score, X, y_new)
def test_multiclass_multioutput_estimator():
    # test to check meta of meta estimators: MultiOutputClassifier wrapping
    # a OneVsRestClassifier(LinearSVC)
    svc = LinearSVC(random_state=0)
    multi_class_svc = OneVsRestClassifier(svc)
    multi_target_svc = MultiOutputClassifier(multi_class_svc)

    multi_target_svc.fit(X, y)

    predictions = multi_target_svc.predict(X)
    assert_equal((n_samples, n_outputs), predictions.shape)

    # train the forest with each column and assert that predictions are equal
    for i in range(3):
        multi_class_svc_ = clone(multi_class_svc)  # create a clone
        multi_class_svc_.fit(X, y[:, i])
        assert_equal(list(multi_class_svc_.predict(X)), list(predictions[:, i]))
def test_multi_output_classification_partial_fit():
    # test if multi_target initializes correctly with base estimator and fit
    # assert predictions work as expected for predict

    sgd_linear_clf = SGDClassifier(loss='log', random_state=1)
    multi_target_linear = MultiOutputClassifier(sgd_linear_clf)

    # train the multi_target_linear and also get the predictions.
    # First partial_fit on the first half must pass `classes` explicitly.
    half_index = X.shape[0] // 2
    multi_target_linear.partial_fit(
        X[:half_index], y[:half_index], classes=classes)

    first_predictions = multi_target_linear.predict(X)
    assert_equal((n_samples, n_outputs), first_predictions.shape)

    # second partial_fit on the remaining half; classes already known
    multi_target_linear.partial_fit(X[half_index:], y[half_index:])
    second_predictions = multi_target_linear.predict(X)
    assert_equal((n_samples, n_outputs), second_predictions.shape)

    # train the linear classification with each column and assert that
    # predictions are equal after first partial_fit and second partial_fit
    for i in range(3):
        # create a clone with the same state
        sgd_linear_clf = clone(sgd_linear_clf)
        sgd_linear_clf.partial_fit(
            X[:half_index], y[:half_index, i], classes=classes[i])
        assert_array_equal(sgd_linear_clf.predict(X), first_predictions[:, i])
        sgd_linear_clf.partial_fit(X[half_index:], y[half_index:, i])
        assert_array_equal(sgd_linear_clf.predict(X), second_predictions[:, i])
def test_multi_output_classification_partial_fit_sample_weights():
    """A sample weight of 2 must be equivalent to duplicating the sample.

    NOTE(review): despite the name, this test exercises ``fit`` (which
    forwards sample_weight), not ``partial_fit`` — confirm intent.
    """
    # weighted classifier
    Xw = [[1, 2, 3], [4, 5, 6], [1.5, 2.5, 3.5]]
    yw = [[3, 2], [2, 3], [3, 2]]
    w = np.asarray([2., 1., 1.])
    sgd_linear_clf = SGDClassifier(random_state=1)
    clf_w = MultiOutputClassifier(sgd_linear_clf)
    clf_w.fit(Xw, yw, w)

    # unweighted, but with repeated samples
    X = [[1, 2, 3], [1, 2, 3], [4, 5, 6], [1.5, 2.5, 3.5]]
    y = [[3, 2], [3, 2], [2, 3], [3, 2]]
    sgd_linear_clf = SGDClassifier(random_state=1)
    clf = MultiOutputClassifier(sgd_linear_clf)
    clf.fit(X, y)
    X_test = [[1.5, 2.5, 3.5]]
    assert_array_almost_equal(clf.predict(X_test), clf_w.predict(X_test))
def test_multi_output_classification_sample_weights():
    """Fitting with sample_weight=[2, 1] must match fitting with the first
    sample duplicated."""
    # classifier trained with explicit sample weights
    weighted_X = [[1, 2, 3], [4, 5, 6]]
    weighted_y = [[3, 2], [2, 3]]
    weights = np.asarray([2., 1.])
    clf_w = MultiOutputClassifier(
        RandomForestClassifier(n_estimators=10, random_state=1))
    clf_w.fit(weighted_X, weighted_y, weights)

    # equivalent unweighted data: first sample repeated twice
    repeated_X = [[1, 2, 3], [1, 2, 3], [4, 5, 6]]
    repeated_y = [[3, 2], [3, 2], [2, 3]]
    clf = MultiOutputClassifier(
        RandomForestClassifier(n_estimators=10, random_state=1))
    clf.fit(repeated_X, repeated_y)

    X_test = [[1.5, 2.5, 3.5], [3.5, 4.5, 5.5]]
    assert_almost_equal(clf.predict(X_test), clf_w.predict(X_test))
def build_model():
    """Build the ML model used to train on and predict the data.

    Returns:
        cv (GridSearchCV): grid search over a CountVectorizer ->
            TfidfTransformer -> MultiOutputClassifier(RandomForest) pipeline.
    """
    classifier = RandomForestClassifier(n_estimators=50,
                                        min_samples_split=2,
                                        n_jobs=1)
    steps = [
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', MultiOutputClassifier(classifier)),
    ]
    pipeline = Pipeline(steps)

    # single-point grid: keeps the GridSearchCV interface without widening
    # the search
    param_grid = {
        'clf__estimator__n_estimators': [50],
        'clf__estimator__min_samples_split': [2],
        'clf__estimator__n_jobs': [1]
    }

    return GridSearchCV(pipeline, param_grid=param_grid)
def build_model():
    """
        Build a tf-idf / random forest model and tune hyperparameters
        using GridSearchCV.

        Returns:
            cv (GridSearchCV): unfitted grid-search wrapper around the
            pipeline, ready to be fitted on messages.
    """
    pipeline = Pipeline([('vect', CountVectorizer(tokenizer=tokenize)),
                         ('tfidf', TfidfTransformer()),
                         ('clf',
                          MultiOutputClassifier(RandomForestClassifier()))])

    # params dict to tune a model
    # Some parameters are commented because it takes too much time.
    parameters = {
        #     'clf__estimator__max_depth': [10, 25, 50],
        #     'clf__estimator__n_estimators': [10, 50, 100],
        #     'vect__max_df': (0.5, 0.75, 1.0),
        'vect__max_features': (5000, 10000, 50000)
    }

    # instantiate a gridsearchcv object with the params defined
    cv = GridSearchCV(pipeline, param_grid=parameters, verbose=1)

    return cv
def build_model(grid_search=False):
    '''
    Build a Machine Learning pipeline using CountVectorizer,
    TfidfTransformer and MultiOutputClassifier.

    Parameters
    ----------
    grid_search: bool, default False
        If True, a GridSearchCV wrapper is returned for hyperparameter
        tuning; otherwise the bare pipeline is returned.

    Returns
    -------
    Pipeline or GridSearchCV: an sklearn pipeline, optionally wrapped in
    a grid search.
    '''
    # build the Machine Learning pipeline using sklearn
    
    pipeline=Pipeline([
    ('vect',CountVectorizer(tokenizer=tokenize)),
    ('tfidf',TfidfTransformer()),
    ('clf',MultiOutputClassifier(RandomForestClassifier(n_jobs=-1,
                                                       min_samples_split = 2,
                                                       bootstrap=True,
                                                       n_estimators=100,)))
    ])
    
    if grid_search:
        # grid centered on the pipeline's own defaults above
        parameters={
        'clf__estimator__min_samples_split': [2, 3, 4],
        'clf__estimator__bootstrap': [True],
        'clf__estimator__n_estimators': [100, 200, 300]
        }
        
        cv =GridSearchCV(pipeline,parameters,n_jobs = -1)
        return cv
    else:
        return pipeline
# Beispiel #14 | 0 — scrape-artifact sample separator, commented out so the
# file remains valid Python.
def split_n_train(X, Y):
    """
    Split the data and train an AdaBoost-based multi-output classifier.

    Input:
    X: numpy array of text to be classified
    Y: multi-label dataframe holding the category columns to classify

    Output:
    fitted: the trained classification pipeline
    X_train: 60% of the X array, for training
    X_test: remaining 40% of the X array, for testing
    y_train: 60% of the rows of Y, to train the classifier
    y_test: remaining 40% of the rows of Y

    Description
    Splits the data into train/test sets and fits a
    CountVectorizer -> TfidfTransformer -> MultiOutputClassifier(AdaBoost)
    pipeline on the training portion.
    """
    # split data into train and test sets (no fixed random_state: the split
    # differs between runs)
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.4)

    pipeline = Pipeline([
        ('features',
         FeatureUnion([('text_pipeline',
                        Pipeline([('count_vectorizer',
                                   CountVectorizer(tokenizer=tokenize)),
                                  ('tfidf_transformer', TfidfTransformer())]))
                       ])),
        ('classifier', MultiOutputClassifier(AdaBoostClassifier()))
    ])
    # wall-clock timing in minutes
    start = time.time() / 60
    # train classifier
    fitted = pipeline.fit(X_train, y_train)
    stop = time.time() / 60
    print(f"Model calculation time: {round(stop - start)} minutes")

    return fitted, X_train, X_test, y_train, y_test
def build_model():
    '''
    Builds a model, create pipeline, hypertuning as well as gridsearchcv.
    input:
        None
    output:
        cv: GridSearch model result.
    '''

    # Earlier RandomForest-based pipeline kept for reference:
    #pipeline = Pipeline([
    #    ('vect', CountVectorizer(tokenizer=tokenize)),
    #    ('tfidf', TfidfTransformer()),
    #    ('clf', MultiOutputClassifier(
    #        RandomForestClassifier(class_weight='balanced', random_state=0)
    #        ))
    #])
    pipeline = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf',
         MultiOutputClassifier(OneVsRestClassifier(LinearSVC(random_state=0))))
    ])

    # Find the optimal model using GridSearchCV
    #parameters = {
    #            'tfidf__smooth_idf': [True, False],
    #            'clf__estimator__n_estimators': [20, 100],
    #            'clf__estimator__max_depth': [2, 10]
    #         }
    # double `estimator__estimator__` reaches through MultiOutputClassifier
    # into the OneVsRestClassifier's inner LinearSVC
    parameters = {
        'tfidf__smooth_idf': [True, False],
        'clf__estimator__estimator__C': [1, 2, 5]
    }

    cv = GridSearchCV(pipeline, param_grid=parameters, cv=5)
    return cv
def build_model():    
    """
    Build Model function
    
    This function output is a machine learning pipeline. 
    This machine learning pipeline takes in the message column as input and output classification results on the other 36 categories in the dataset. 
    A grid search is used to find better parameters.

    Parameters:
        None
    Output:
        cv -> a GridSearchCV wrapper around the ML pipeline, ready to train
    """
    
    pipeline = Pipeline([
        ('features', FeatureUnion([

            ('text_pipeline', Pipeline([
                ('vect', CountVectorizer(tokenizer=tokenize)),
                ('tfidf', TfidfTransformer())
            ])),

            ('starting_verb', StartingVerbExtractor())
        ])),

        ('clf', MultiOutputClassifier(AdaBoostClassifier()))
    ])
    
    parameters = {
        'features__text_pipeline__vect__ngram_range': ((1, 1), (1, 2)),
        'features__text_pipeline__vect__max_df': (0.5,0.75),
        'features__text_pipeline__vect__max_features': (None, 1000),
        'features__text_pipeline__tfidf__use_idf': (True, False)
    }
    # NOTE(review): f1_score on multi-output targets normally needs an
    # `average` argument; presumably this relies on the default — confirm.
    scorer = make_scorer(f1_score,greater_is_better = True)
    cv = GridSearchCV(pipeline, parameters,scoring = scorer,verbose=2,n_jobs=-1)
    return cv
# Beispiel #17 | 0 — scrape-artifact sample separator, commented out so the
# file remains valid Python.
def build_model():
    """
    Creates a model object that can be trained and used for predictions.
    It uses an ML pipeline and grid search for optimization.

    Args:
        None

    Returns:
        cv: the model to use (a GridSearchCV over the pipeline)
    """
    # FIX: the docstring above was flush-left at column 0 inside the
    # function body, which is an IndentationError; it is now indented.
    pipeline = Pipeline([
        ('features', FeatureUnion([
            ('text_pipeline', Pipeline([
                ('vect', CountVectorizer(tokenizer=tokenize)),
                ('tfidf', TfidfTransformer())
            ])),

            ('starting_verb', StartingVerbExtractor())
        ])),
        ('clf', MultiOutputClassifier(RandomForestClassifier()))  # ArmClassifier()
    ])
    parameters = {
        'features__text_pipeline__vect__ngram_range': ((1, 1), (1, 2)),
        'features__text_pipeline__vect__max_df': (0.5, 1.0),
        'features__text_pipeline__vect__max_features': (None, 5000),
        'features__text_pipeline__tfidf__use_idf': (True, False),
        #'clf__estimator__n_estimators': [100, 200],
        #'clf__estimator__min_samples_split': [2, 4],
        'features__transformer_weights': (
            {'text_pipeline': 1, 'starting_verb': 0.5},
            {'text_pipeline': 0.5, 'starting_verb': 1},
            {'text_pipeline': 0.8, 'starting_verb': 1},
        )
    }
    cv = GridSearchCV(pipeline, param_grid=parameters)
    return cv
def build_model():
    """
    MultiOutput Text Classifier model building with pipeline technique and
    grid search over multiple parameters.

    Returns
    --------
        cv (GridSearchCV object):
            Grid search pipeline MultiOutput Text Classifier model to be
            trained.
    """
    # Pipeline:
    pipeline = Pipeline([('vectorizer',
                          CountVectorizer(tokenizer=tokenize,
                                          max_df=0.5,
                                          max_features=10000)),
                         ('tfidf', TfidfTransformer()),
                         ('clf',
                          MultiOutputClassifier(KNeighborsClassifier(),
                                                n_jobs=-1))])

    # Parameters for grid searching (Reduced version for speed purposes, see README)
    # FIX: removed the redundant `parameters = parameters = {` double
    # assignment (a typo; the second target was pointless).
    parameters = {
        #'vectorizer__ngram_range': ((1, 1), (1, 2)),
        #'vectorizer__max_df': (0.5, 0.75, 1.0),
        #'vectorizer__max_features': (None, 5000, 10000),
        #'tfidf__use_idf': (True, False),
        'clf__estimator__leaf_size': [20, 30],
        #'clf__estimator__metric': ('minkowski', 'chebyshev'),
        'clf__estimator__n_neighbors': [4, 5],
        #'clf__estimator__weights': ('uniform', 'distance')
    }

    cv = GridSearchCV(pipeline, param_grid=parameters)
    return cv
# Beispiel #19 | 0 — scrape-artifact sample separator, commented out so the
# file remains valid Python.
def build_model():
    '''
    Builds a model pipeline and a grid search object.

    Returns: model, a GridSearchCV object that refits with the best
    parameters found (recall-scored).
    '''
    # Two TF-IDF views of the text: raw tokens and POS-tagged tokens.
    pipeline = Pipeline([('features',
                          FeatureUnion([
                              ('tfidf', TfidfVectorizer(tokenizer=tokenize)),
                              ('pos_tfidf',
                               TfidfVectorizer(tokenizer=pos_tokenizer))
                          ])),
                         ('mult_clf',
                          MultiOutputClassifier(
                              LinearSVC(loss='hinge',
                                        class_weight='balanced',
                                        max_iter=10000)))])

    # specify parameters for grid search
    # # These were the initial grid search parameters - takes about 3 hrs to run
    # parameters = {
    # 'features__tfidf__max_features': [10000, None],
    # 'features__tfidf__ngram_range': [(1,1),(1,2)],
    # 'features__pos_tfidf__ngram_range': [(1,1),(1,2)],
    # 'mult_clf__estimator__loss': ['hinge', 'squared_hinge']
    # }

    # specifying a single parameter to demonstrate a quick grid search
    parameters = {'mult_clf__estimator__C': [0.1, 1.0, 2.0]}

    # Create a scorer to evaluate model performance on average recall score during grid search
    scorer = make_scorer(recall_score, average='macro')

    # create grid search object
    model = GridSearchCV(pipeline, parameters, cv=5, scoring=scorer)

    return model
def build_model():
    '''
    Builds MultiOutputClassifier with RandomForestClassifier as estimator.
    A pipeline is used to streamline two feature extraction processes:
        
        1. Bag Of Words (CountVectorizer)
        2. Tfidf Transformation
        
    The method creates a GridSearchCV object to train multiple parameters of
    the pipeline, and returns this object.
        
    Args: (None)
        
    Returns:
        - cv: GridSearchCV object of the pipeline
    '''

    # Make pipeline with feature transformations and estimator
    pipeline = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer(smooth_idf=True)),
        ('clf', MultiOutputClassifier(RandomForestClassifier(random_state=42)))
    ])

    # Set parameters to tune
    # NOTE(review): float min_samples_split values (0.2, 0.3) are treated
    # by sklearn as fractions of n_samples, not counts — confirm intent.
    params = {
        'clf__estimator__criterion': ['gini', 'entropy'],
        'clf__estimator__warm_start': [True, False],
        'clf__estimator__n_estimators': [50, 100, 200],
        'clf__estimator__min_samples_split': [0.2, 0.3],
        'vect__ngram_range': [(1, 1), (1, 2)],
    }

    # Create GridSearchCV object
    cv = GridSearchCV(pipeline, param_grid=params, verbose=2)

    return cv
# Beispiel #21 | 0 — scrape-artifact sample separator, commented out so the
# file remains valid Python.
def build_model():
    """
    Function Description:
        Build LinearSVC model using pipeline and then 
        use Gridsearch to search for the best paramaters
    
    Input:
        None
        
    Output:
        Cross-Validated classifier 
    """
    pipeline = Pipeline([
        ('features',
         FeatureUnion([('text_pipeline',
                        Pipeline([('vect',
                                   CountVectorizer(tokenizer=tokenize)),
                                  ('tfidf', TfidfTransformer())]))])),
        ('clf',
         MultiOutputClassifier(OneVsRestClassifier(LinearSVC(random_state=0))))
    ])

    # triple-nested `estimator__estimator__` keys reach through
    # MultiOutputClassifier -> OneVsRestClassifier into LinearSVC
    parameters = {
        'features__text_pipeline__vect__max_df': (0.5, 0.75, 1.0),
        'features__text_pipeline__tfidf__use_idf': (True, False),
        'clf__estimator__estimator__loss': ['hinge', 'squared_hinge'],
        'clf__estimator__estimator__multi_class': ['ovr', 'crammer_singer'],
        'clf__estimator__estimator__max_iter': [1000, 2000, 5000]
    }

    cv = GridSearchCV(pipeline,
                      param_grid=parameters,
                      cv=3,
                      verbose=1,
                      n_jobs=-1)

    return cv
def build_model():
    """Build a machine learning pipeline
    
    Args:
    None
       
    Returns:
    cv: gridsearchcv object. Gridsearchcv object that transforms the data, creates the 
    model object and finds the optimal model parameters.
    """
    pipeline = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize, min_df=1)),
        ('tfidf', TfidfTransformer(use_idf=False)),
        ('clf', MultiOutputClassifier(LogisticRegression())),
    ])

    # grid mostly pins single values; only C and penalty are searched
    parameters = {
        'vect__min_df': [1],
        'tfidf__use_idf': [False],
        #   'clf__estimator__multi_class': ['ovr']
        'clf__estimator__random_state': [25],
        'clf__estimator__C': [1.0, 10],
        'clf__estimator__penalty': ["l1", "l2"],
        #'clf__estimator__solver':['lbfgs','liblinear']
        # liblinear chosen because it supports both l1 and l2 penalties
        'clf__estimator__solver': ['liblinear'],
    }

    # Create scorer; `performance_metric` is defined elsewhere in this file
    scorer = make_scorer(performance_metric)

    # Create grid search object
    cv = GridSearchCV(pipeline,
                      param_grid=parameters,
                      scoring=scorer,
                      verbose=10,
                      n_jobs=1)
    return cv
# Beispiel #23 | 0 — scrape-artifact sample separator, commented out so the
# file remains valid Python.
def build_model():
    """
    Returns a model: a GridSearchCV over a CountVectorizer ->
    TfidfTransformer -> MultiOutputClassifier(RandomForest) pipeline.
    """

    # Build a machine learning pipeline

    lm = RandomForestClassifier(n_estimators=10)

    pipeline = Pipeline([
        ("vect", CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ("clf", MultiOutputClassifier(lm)),
    ])

    # specify parameters for grid search
    #     parameters = {
    #     'clf__estimator__min_samples_split': [2, 3, 4]
    #     }

    parameters = {
        # 'vect__ngram_range': ((1, 1), (1, 2)),
        # 'vect__max_df': (0.5, 0.75, 1.0),
        # 'vect__min_df': (0.1, 0.05),
        # 'vect__max_features': (None, 100, 500, 1000, 5000),

        # 'clf__estimator__n_estimators': [50, 100, 200],
        # 'clf__estimator__max_depth': [4, 8, 16],
        'clf__estimator__min_samples_leaf': [2, 4, 8],
        'clf__estimator__min_samples_split': [2, 4, 8],
    }

    # create grid search object
    #par = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42)
    cv = GridSearchCV(pipeline, param_grid=parameters, cv=5)

    return cv
def build_model():
    """ builds the model to be fitted on the data.

    Model used to train on the labeled disaster messages. Uses GridSearchCV
    to optimize the model in terms of parameters. In order to include more
    parameters, uncomment the lines in "parameters" below.

    Args:
        None

    Returns:
        model (GridSearchCV): optimized model to be fitted on the dataset

    """
    # build pipeline using tokenize function as defined above
    model = Pipeline([('vect', CountVectorizer(tokenizer=tokenize)),
                      ('tfidf', TfidfTransformer()),
                      ('clf',
                       MultiOutputClassifier(
                           RandomForestClassifier(n_estimators=100,
                                                  random_state=1)))])
    # define parameters to be used for optimizing model using GridSearchCV
    # Uncomment lines below to include more parameters
    parameters = {
        #'vect__ngram_range': ((1, 1), (1, 2)),
        #'vect__max_df': (0.5, 1.0),
        'vect__max_features': (None, 7500),
        'clf__estimator__n_estimators': [50, 150, 500],
        #'clf__estimator__max_features': ['auto', 'sqrt']
        #'clf__estimator__max_depth': max_depth,
        #'clf__estimator__min_samples_split': min_samples_split,
        #'clf__estimator__min_samples_leaf':min_samples_leaf,
        #'clf__estimator__bootstrap': bootstrap
    }
    # Use GridSearchCV to find optimized model when fitted to data.
    # Note: `model` is rebound from the bare Pipeline to the grid search.
    model = GridSearchCV(model, param_grid=parameters, cv=5)
    return model
def build_model():
    """Build model

    Create the model by using pipeline. Interactively asks whether to wrap
    the pipeline in a GridSearchCV; re-prompts until a valid answer is
    given.

    Returns:
        GridSearchCV or Pipeline: the model to fit.
    """
    # initial basic model
    forest = RandomForestClassifier(random_state=42, n_jobs=4)

    # create pipeline
    pipeline = Pipeline([("text_pipeline",
                          Pipeline([("vect",
                                     CountVectorizer(tokenizer=tokenize)),
                                    ("tfidf", TfidfTransformer())])),
                         ("clf", MultiOutputClassifier(forest, n_jobs=4))])

    # define parameters
    params = {
        "text_pipeline__vect__max_features": (5000, 10000),
        "clf__estimator__n_estimators": [50, 100, 150],
        "clf__estimator__criterion": ["gini", "entropy"],
        "clf__estimator__max_depth": [4, 6, 10],
    }

    # choose a method to build model
    print("Hint: GridSearchCV will takes more time!\n")
    while True:
        # BUG FIX: the original read input() once *before* the loop, so an
        # invalid answer printed the error message forever without ever
        # re-prompting. The prompt now lives inside the loop.
        chose_option = input("Choose the GridSearchCV(Yes or No): ").lower()
        if chose_option in ["yes", "y"]:
            return GridSearchCV(pipeline, param_grid=params, cv=3)
        if chose_option in ["no", "n"]:
            return pipeline
        print("Choose a validate option!")
def build_model():
    
    # ML pipeline should take in the message column as input and output classification results on the other 36 categories
    
    # Build ML model with GridSearchCV
    # https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html
    
    # Tuning the hyper-parameters of an estimator
    # https://scikit-learn.org/stable/modules/grid_search.html
    # https://www.vebuso.com/2020/03/svm-hyperparameter-tuning-using-gridsearchcv/
    
    """
    - Build ML model with GridSearchCV

    Parameters:
    None

    Returns:
    GridSearchCV model (unfitted) wrapping the LinearSVC pipeline
    """
    
    # ML model pipeline
    pipeline = Pipeline([('vect', CountVectorizer(tokenizer=tokenize)),
                         ('tfidf', TfidfTransformer()),
                         ('clf', MultiOutputClassifier(OneVsRestClassifier(LinearSVC())))])

    # parameters (vectorizer-level only; the classifier is left at defaults)
    parameters = {'vect__ngram_range': ((1, 1), (1, 2)),
                  'vect__max_df': (0.75, 1.0)
                  }

    # create ML model
    model = GridSearchCV(estimator=pipeline,
            param_grid=parameters,
            verbose=3,
            cv=3)
    return model
# Beispiel #27 | 0 — scrape-artifact sample separator, commented out so the
# file remains valid Python.
def build_model():
    """
    Return a GridSearchCV over an NLP pipeline: a text branch
    (CountVectorizer -> TfidfTransformer) unioned with a StartingVerbExtractor
    feature, feeding a MultiOutputClassifier(RandomForestClassifier).
    """
    text_branch = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
    ])
    feature_union = FeatureUnion([
        ('text_pipeline', text_branch),
        ('starting_verb', StartingVerbExtractor()),
    ])
    pipeline = Pipeline([
        ('features', feature_union),
        ('clf', MultiOutputClassifier(RandomForestClassifier())),
    ])

    # search only over the forest's depth / leaf-size regularizers
    search_space = {
        'clf__estimator__max_depth': [10, 50, None],
        'clf__estimator__min_samples_leaf': [2, 5, 10]
    }

    return GridSearchCV(pipeline, search_space)
def build_model():
    """
    Build the message-classification model.

    Takes no parameters.

    Returns:
        GridSearchCV over a CountVectorizer -> TfidfTransformer ->
        MultiOutputClassifier(RandomForestClassifier) pipeline, searching
        n-gram range and tf-idf settings.
    """
    steps = [
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', MultiOutputClassifier(RandomForestClassifier())),
    ]
    grid = {
        'vect__ngram_range': ((1, 1), (1, 2)),
        'tfidf__use_idf': [True, False],
        'tfidf__norm': ['l1', 'l2']
    }
    return GridSearchCV(Pipeline(steps), param_grid=grid)
def build_model():
    """
        Function to build an ML model.

        Inputs:
            None
        Returns:
            GridSearchCV object over the text-classification pipeline
    """
    base_estimator = RandomForestClassifier(n_jobs=-1)
    pipeline = Pipeline([
        ('count_vectorizer', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', MultiOutputClassifier(base_estimator)),
    ])

    # small grid: idf on/off and forest size
    grid = {
        'tfidf__use_idf': [True, False],
        'clf__estimator__n_estimators': [10, 20]
    }

    return GridSearchCV(pipeline, grid, verbose=True)
# Beispiel #30 | 0 — scrape-artifact sample separator, commented out so the
# file remains valid Python.
def build_model():
    '''
    Build a machine-learning pipeline and wrap it in a grid search.

    Input:
        None
    Output:
        cv: an unfitted GridSearchCV over the pipeline
    '''
    # Build pipeline: bag-of-words -> tf-idf -> multi-output random forest
    steps = [('vect', CountVectorizer(tokenizer=tokenize)),
             ('tfidf', TfidfTransformer()),
             ('clf', MultiOutputClassifier(RandomForestClassifier()))]

    # Parameters for GridSearch
    search_params = {
        'clf__estimator__n_estimators': [50, 100],
        'clf__estimator__min_samples_split': [2, 3, 4],
        #'tfidf__use_idf': (True, False),
        #'clf__estimator__criterion': ['entropy', 'gini']
    }

    return GridSearchCV(Pipeline(steps), param_grid=search_params, n_jobs=-1)
# Beispiel #31 | 0 — scrape-artifact sample separator, commented out so the
# file remains valid Python.
def build_model():
    '''
    Build a machine-learning pipeline and tune it with GridSearchCV.

    OUTPUT -
        GridSearchCV object (unfitted)
    '''
    # bag-of-words -> tf-idf -> multi-output random forest
    stages = [('vect', CountVectorizer(tokenizer=tokenize)),
              ('tfidf', TfidfTransformer()),
              ('clf', MultiOutputClassifier(RandomForestClassifier()))]
    workflow = Pipeline(stages)

    # search over forest size and tree depth
    grid = {
        'clf__estimator__n_estimators': [10, 50, 100],
        'clf__estimator__max_depth': [None, 10, 50]
    }

    return GridSearchCV(workflow, param_grid=grid)
def build_model():
    """
    Build the ML pipeline using a random forest classifier.

    Please note that for the gridsearch only a few parameters are included
    because the time taken was very long and the pkl model was very huge.
    Please refer to the intital preps folder in the repo and the ML pipeline
    notebook to view the complete Gridsearch.

    Input: None
    Output: GridSearchCV wrapping the pipeline
    """
    base_pipeline = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', MultiOutputClassifier(RandomForestClassifier())),
    ])
    # Deliberately small grid (see note in docstring).
    search_params = {
        'clf__estimator__n_estimators': [100],
        'clf__estimator__min_samples_split': [2],
        'tfidf__smooth_idf': [True, False],
    }
    return GridSearchCV(base_pipeline, param_grid=search_params,
                        verbose=2, n_jobs=3)
def build_model():
    """
    Build model with GridSearchCV.

    Returns:
    GridSearchCV object to be fitted by the caller
    """
    # vectorize -> tf-idf -> one-vs-rest linear SVC per output column
    steps = [
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', MultiOutputClassifier(OneVsRestClassifier(LinearSVC()))),
    ]
    pipeline = Pipeline(steps)

    # hyper-parameter grid over the vectorizer only
    grid = {
        'vect__ngram_range': ((1, 1), (1, 2)),
        'vect__max_df': (0.75, 1.0),
    }

    return GridSearchCV(estimator=pipeline, param_grid=grid,
                        verbose=3, cv=3)
# Beispiel #34 (score: 0)
def build_model():
    """Define the sklearn pipeline and a multi-classifier grid search.

    The grid search tries three alternative classifiers (decision tree,
    linear SVC, MLP) on top of one shared feature-extraction pipeline.
    """
    text_features = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
    ])
    pipeline = Pipeline([
        ('features', FeatureUnion([
            ('text_pipeline', text_features),
            ('starting_verb', StartingVerbExtractor()),
        ])),
        ('clf', DecisionTreeClassifier()),
    ])

    # Vectorizer/tf-idf options shared by every candidate classifier.
    shared = {
        'features__text_pipeline__vect__max_df': (0.5, 1.0),
        'features__text_pipeline__vect__min_df': (1, 0.01),
        'features__text_pipeline__vect__max_features': (None, 5000),
        'features__text_pipeline__tfidf__use_idf': (True, False),
    }
    parameters = [
        dict(shared,
             clf=(DecisionTreeClassifier(min_samples_split=3), ),
             clf__max_depth=(None, 4)),
        dict(shared,
             clf=(MultiOutputClassifier(LinearSVC(multi_class='ovr')), )),
        dict(shared,
             clf=(MLPClassifier(), ),
             clf__hidden_layer_sizes=((100, 10), (50, ), (50, 10))),
    ]

    cv = GridSearchCV(pipeline, parameters, cv=3, n_jobs=4, verbose=10)

    return cv
# Beispiel #35 (score: 0)
def build_model():
    '''
    Build the GridSearchCV model via a pipeline and define its parameters.

            Parameters:
                    None
            Returns:
                    cv(obj): GridSearchCV model
    '''

    # define Pipeline: vectorize -> tf-idf -> multi-output random forest
    pipeline = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', MultiOutputClassifier(RandomForestClassifier()))])

    # define parameters
    # BUG FIX: the original dict listed 'clf__estimator__min_samples_leaf'
    # twice, so the first grid ([50, 100, 200]) was silently overwritten by
    # the second. The [2, 3, 4] values match 'min_samples_split' as used by
    # the sibling build_model variants in this file, so the second entry is
    # assumed to have been meant as min_samples_split — TODO confirm intent.
    parameters = {
        'clf__estimator__min_samples_leaf': [50, 100, 200],
        'clf__estimator__min_samples_split': [2, 3, 4],
    }

    cv = GridSearchCV(pipeline, param_grid=parameters, n_jobs=4)

    return cv
def build_model():
    """
    Build a grid-searched pipeline combining text features and message length.

    Returns:
    cv - GridSearchCV over the pipeline
    """
    # Text branch: vectorize then tf-idf (without idf smoothing).
    text_branch = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer(smooth_idf=False)),
    ])
    model = Pipeline([
        ('features', FeatureUnion([
            ('text_pipeline', text_branch),
            ('Length', Length()),
        ])),
        ('clf', MultiOutputClassifier(RandomForestClassifier())),
    ])

    # Forest hyper-parameters searched over.
    grid = {
        'clf__estimator__n_estimators': [50, 100],
        'clf__estimator__min_samples_split': [2, 3],
    }

    cv = GridSearchCV(model, param_grid=grid, cv=2)
    return cv
# Beispiel #37 (score: 0)
def build_model():
    """ Pipeline and GridSearch model builder using LightGBM.

    Returns:
        GridSearchCV model object with parameters set

    """
    # vectorize -> tf-idf -> multi-output LightGBM classifier
    model = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', MultiOutputClassifier(lgb.LGBMClassifier())),
    ])

    # vectorizer + LightGBM bagging grid
    search_grid = {
        'vect__ngram_range': [(1, 1), (1, 2)],
        'vect__max_df': [1.0, 0.7],
        'clf__estimator__bagging_fraction': [1.0, 0.8],
    }

    return GridSearchCV(model, param_grid=search_grid, verbose=3)
# Beispiel #38 (score: 0)
def build_model():
    """Create the grid-searched AdaBoost text-classification pipeline.

    Returns:
        GridSearchCV over vectorizer/tf-idf hyper-parameters.
    """
    # Grid of text-feature hyper-parameters for GridSearchCV.
    param_grid = {
        'features__text_pipeline__vect__ngram_range': ((1, 1), (1, 2)),
        'features__text_pipeline__vect__max_df': (0.75, 1.0),
        'features__text_pipeline__vect__max_features': (None, 5000),
        'features__text_pipeline__tfidf__use_idf': (True, False),
    }

    text_branch = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
    ])
    pipeline = Pipeline([
        ('features', FeatureUnion([
            ('text_pipeline', text_branch),
            ('verb_extractor', VerbExtractor()),
        ])),
        ('clf', MultiOutputClassifier(estimator=AdaBoostClassifier(),
                                      n_jobs=-1)),
    ])

    model_pipeline = GridSearchCV(pipeline, param_grid=param_grid, n_jobs=-1)

    return model_pipeline
# Beispiel #39 (score: 0)
def build_model(X_train, y_train):
    '''
    INPUT
        X_train: Training features for use in GridSearchCV
        y_train: Training labels for use in GridSearchCV
    OUTPUT
        Returns a fitted GridSearchCV over a pipeline of tokenization,
        count vectorization, TF-IDF transformation and a multi-output
        random-forest classifier
    '''
    # create pipeline: vectorize -> tf-idf -> multi-output random forest
    pipeline = Pipeline([('vect', CountVectorizer(tokenizer=tokenize)),
                         ('tfidf', TfidfTransformer()),
                         ('clf',
                          MultiOutputClassifier(RandomForestClassifier()))])
    # set parameters
    # BUG FIX: the original key 'tfidf_vect__max_df' matched no pipeline
    # step (the steps are named 'vect' and 'tfidf'), so GridSearchCV.fit
    # raised ValueError. max_df belongs to the CountVectorizer step 'vect'.
    parameters = {
        'clf__estimator__min_samples_split': [2, 4],
        'vect__max_df': (0.75, 1.0),
        #'clf__estimator__n_estimators': [10, 25],
    }
    # create GridSearchCV and fit it
    cv = GridSearchCV(estimator=pipeline, param_grid=parameters)
    cv.fit(X_train, y_train)
    return cv
def build_model():
    '''
    Input: None
    Output: cv - grid search object
    '''
    # Text branch of the feature union.
    text_branch = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
    ])

    # Full pipeline: text features + starting-verb feature -> classifier.
    pipeline = Pipeline([
        ('features', FeatureUnion([
            ('text_pipeline', text_branch),
            ('starting_verb', StartingVerbExtractor()),
        ])),
        ('clf', MultiOutputClassifier(RandomForestClassifier())),
    ])

    # Single hyper-parameter searched over.
    grid = {'features__text_pipeline__vect__max_df': (0.5, 0.75)}

    cv = GridSearchCV(pipeline, param_grid=grid)

    return cv
mfcc_audio[columns] = (mfcc_audio[columns] >= 0.0).astype(np.int32)

audios = np.unique(mfcc_audio["Audio"])
train_audio, test_audio = train_test_split(
    audios, train_size=0.7, test_size=0.3, random_state=0)

X_train = mfcc_audio[mfcc_audio["Audio"].isin(train_audio)]
X_test = mfcc_audio[mfcc_audio["Audio"].isin(test_audio)]
y_train = X_train[columns]
y_test = X_test[columns]

X_train.drop(columns + ["Audio"], inplace=True, axis=1)
X_test.drop(columns + ["Audio"], inplace=True, axis=1)

mor = MultiOutputClassifier(
    RandomForestClassifier(random_state=0, n_estimators=1000), n_jobs=-1)
mor.fit(X_train, y_train)
mor_pred = mor.predict(X_test)

dummy = DummyClassifier()
dummy.fit(X_train, y_train)
dummy_pred = dummy.predict(X_test)

estimators = mor.estimators_

for i, col in enumerate(columns):

    true = y_test[col]
    pred = mor_pred[:, i]
    d_p = dummy_pred[:, i]