def build_model():
    '''
    Build a machine learning pipeline using CountVectorizer, TF-IDF, and a random forest classifier.

    Returns
      cv: GridSearchCV object wrapping the pipeline
    '''

    pipeline = Pipeline([('vect', CountVectorizer(tokenizer=tokenize)),
                         ('tfidf', TfidfTransformer()),
                         ('clf',
                          MultiOutputClassifier(RandomForestClassifier()))])

    parameters = {
        'clf__estimator__n_estimators': [50, 100],
        'clf__estimator__min_samples_split': [2, 3, 4],
        'clf__estimator__criterion': ['entropy', 'gini']
    }

    cv = GridSearchCV(pipeline, param_grid=parameters)

    return cv
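
All of these build_model() variants assume the same scikit-learn imports and a tokenize() helper defined elsewhere in the module. A minimal sketch of that shared preamble follows; the NLTK-based tokenizer body is an assumption for illustration, not taken from any of the examples.

import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

# Requires the NLTK corpora once per environment:
# nltk.download('punkt'); nltk.download('wordnet')

def tokenize(text):
    # Illustrative tokenizer: lowercase, word-tokenize, lemmatize.
    # The real tokenize() used by the examples may differ.
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(tok.lower().strip()) for tok in word_tokenize(text)]
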
Example #2
def build_model():
    """
    Build a model to predict the classes of a message.
    The model is an NLP pipeline made of TF-IDF and a random forest classifier.
    """
    pipeline = Pipeline([('vect', CountVectorizer(tokenizer=tokenize)),
                         ('tfidf', TfidfTransformer()),
                         ('clf',
                          MultiOutputClassifier(
                              RandomForestClassifier(max_depth=100,
                                                     min_samples_split=10)))])
    parameters = {
        'vect__ngram_range': ((1, 1), (1, 2)),
        'vect__max_df': (0.5, 0.75, 1.0),
        'vect__max_features': (None, 5000, 10000),
        'tfidf__use_idf': (True, False),
        'clf__estimator__n_estimators': [1, 5, 10, 15],
        'clf__estimator__min_samples_split': [10, 20, 30, 40],
        'clf__estimator__max_depth': [50, 100, 200]
    }

    return GridSearchCV(pipeline, param_grid=parameters)
Example #3
def build_model():
    # build pipeline
    pipeline = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf',
         MultiOutputClassifier(RandomForestClassifier(random_state=42),
                               n_jobs=-1)),
    ])
    # set tuning parameters
    parameters = {
        'tfidf__norm': ['l1', 'l2'],
        'clf__estimator__criterion': ['gini', 'entropy']
    }
    # get optimised model with grid search
    model = GridSearchCV(pipeline,
                         param_grid=parameters,
                         cv=2,
                         n_jobs=-1,
                         verbose=1)

    return model
Example #4
def build_model():
    """ Note: """
    """ planned to run, but my mac crashes whenever running, so please refer to
    Jupyter notebook for complixity studies instead. """

    # pipeline = Pipeline([
    #     ('vect',TfidfVectorizer(tokenizer=tokenize)),
    #     ('clf',MultiOutputClassifier(RandomForestClassifier(n_estimators=100,random_state=20)))
    # ])
    # parameters = {
    #     'vect__norm': ['l1','l2'],
    #     'vect__min_df': [0, 0.25, 0.5]
    # }
    # cv = GridSearchCV(pipeline,param_grid=parameters, cv=5, n_jobs=-1)

    cv = Pipeline([('vect', TfidfVectorizer(tokenizer=tokenize, norm='l2')),
                   ('clf',
                    MultiOutputClassifier(
                        RandomForestClassifier(n_estimators=100,
                                               random_state=20)))])

    return cv
Example #5
def build_model():
    '''
    input:
        None
    output:
        cv: unfitted GridSearchCV object wrapping the pipeline.
    '''
    pipeline = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf',
         MultiOutputClassifier(OneVsRestClassifier(LinearSVC(random_state=0))))
    ])
    parameters = {
        'tfidf__smooth_idf': [True, False],
        'clf__estimator__estimator__C': [1, 2, 5]
    }
    cv = GridSearchCV(pipeline,
                      param_grid=parameters,
                      scoring='precision_samples',
                      cv=5)
    return cv
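
The doubled prefix in 'clf__estimator__estimator__C' above walks through the nesting: clf (MultiOutputClassifier) -> its estimator (OneVsRestClassifier) -> its estimator (LinearSVC) -> the C parameter. When the correct key is in doubt, the pipeline itself can list every tunable name; a minimal check, run inside build_model() where pipeline is in scope:

    # Prints every parameter key GridSearchCV will accept, nested ones included.
    print(sorted(pipeline.get_params().keys()))
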
def build_model(X_train, y_train):
    '''
    INPUT
        X_train: Training features
        y_train: Training labels
    OUTPUT
        Returns a trained model (fitted GridSearchCV)
    '''
    
    pipeline = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', MultiOutputClassifier(RandomForestClassifier()))
    ])
    
    parameters = {  
        'clf__estimator__min_samples_split': [2, 4],
    }
    
    cv = GridSearchCV(estimator=pipeline, param_grid=parameters)
    cv.fit(X_train, y_train)
    return cv
Example #7
def build_model():
    '''
    Model Pipeline with GridSearch optimization for parameters.

    Input: None.
    Output: classification model.
    '''

    pipeline = Pipeline([('vect', CountVectorizer(tokenizer=tokenize)),
                         ('tfidf', TfidfTransformer()),
                         ('rfc',
                          MultiOutputClassifier(RandomForestClassifier()))])

    parameters = {
        'tfidf__use_idf': (True, False),
        # 'clf__estimator__n_estimators': [50, 60, 70],
    }

    # Optimizes model parameters through GridSearchCV
    cv = GridSearchCV(pipeline, param_grid=parameters)

    return cv
Example #8
def build_model():
    '''Creates the model pipeline and finds the best parameters using GridSearchCV.'''

    # build the pipeline for the text transformation and then for the estimator instance
    pipeline = Pipeline([('vect', CountVectorizer(tokenizer=tokenize)),
                         ('tfidf', TfidfTransformer()),
                         ('clf',
                          MultiOutputClassifier(RandomForestClassifier()))])

    # parameters are set to reduce the size of the pickle file, since my first files were larger than 1GB.
    parameters = {
        'clf__estimator__n_estimators': [4, 6, 9],
        'clf__estimator__min_samples_split': [2, 3, 5],
    }

    model = GridSearchCV(pipeline,
                         param_grid=parameters,
                         cv=3,
                         verbose=2,
                         n_jobs=4)

    return model
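
Since the grid in the example above is sized to keep the saved model small, a typical follow-up step (names are illustrative, not from the example) is to pickle the fitted search object's best pipeline:

    import pickle

    # After model.fit(X_train, Y_train), persist the refitted best pipeline for later use.
    with open('classifier.pkl', 'wb') as f:
        pickle.dump(model.best_estimator_, f)
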
Example #9
def build_model():
    ''' Builds model as pipeline 

        Inputs: 
            None
        Output: 
            cv: GridSearchCV object that finds the best parameters for a
                pipeline consisting of NLP steps and a multioutput-wrapped final estimator
    '''

    model = Pipeline([('vect', CountVectorizer(tokenizer=tokenize)),
                      ('tfidf', TfidfTransformer()),
                      ('clf',
                       MultiOutputClassifier(SGDClassifier(random_state=42)))])
    parameters = {
        'vect__stop_words': (tokenized_stop_words, None),
        'vect__ngram_range': ((1, 1), (1, 2)),
        'vect__max_df': (0.5, 0.75, 1.0)
    }

    cv = GridSearchCV(model, param_grid=parameters, verbose=2)
    return cv
Example #10
def build_model(gridsearch=True):
    """
    Builds an NLP pipeline to do the following:
    1. Tokenize
    2. Vectorize (count then tfidf)
    3. other custom extractors
    4. finally, a classifier

    The pipeline will also support methods such as .fit and .predict

    Will also apply a grid search optionally.
    """
    pipeline = Pipeline([
        (
            'features',
            FeatureUnion([
                ('text_pipeline',
                 Pipeline([('vect', CountVectorizer(tokenizer=tokenize)),
                           ('tfidf', TfidfTransformer())])),

                # ('starting_verb', StartingVerbExtractor())
            ])),
        ('clf', MultiOutputClassifier(MultinomialNB()))
    ])

    if gridsearch:
        parameters = {
            'features__text_pipeline__vect__ngram_range':
            ((1, 1), (1, 2), (1, 3)),
            'features__text_pipeline__vect__max_df': (0.5, 0.75, 1.0),
            'features__text_pipeline__vect__max_features': (None, 5000, 10000),
            'features__text_pipeline__tfidf__use_idf': (True, False)
        }

        cv = GridSearchCV(pipeline, param_grid=parameters)

        return cv

    return pipeline
Example #11
def build_model():
    """
    Build machine learning model (KNeighborsClassifier)
    Input:
        None
    Returns:
        cv: sklearn.model_selection.GridSearchCV wrapping the pipeline.
    """
    pipeline = Pipeline([('vect', CountVectorizer(tokenizer=tokenize)),
                         ('tfidf', TfidfTransformer()),
                         ('clf', MultiOutputClassifier(KNeighborsClassifier()))
                         ])

    parameters = {
        'clf__estimator__n_neighbors': [5, 10],
        'clf__estimator__weights': ['uniform', 'distance']
    }

    cv = GridSearchCV(pipeline, param_grid=parameters)

    return cv
Example #12
def build_model():
    '''
    Function that builds the machine learning pipeline.
    '''
    pipeline = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('multioutput',
         MultiOutputClassifier(LinearSVC(class_weight='balanced'), n_jobs=-1))
    ])

    parameters = {
        'vect__ngram_range': ((1, 1), (1, 2)),
        'vect__max_features': (None, 5000, 10000),
        'tfidf__use_idf': (True, False),
        'multioutput__estimator__C': [100, 1000, 10000]
    }

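    # active_session() is assumed to be a workspace keep-alive helper defined elsewhere
    # (e.g. a workspace_utils module); it is not part of scikit-learn.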
    with active_session():
        cv = GridSearchCV(pipeline, param_grid=parameters, cv=3)

    return cv
def build_model():
    """Returns the GridSearchCV object to be used as the model
    Args:
        None
    Returns:
        cv (scikit-learn GridSearchCV): Grid search model object
    """
    pipeline = Pipeline([('vect', CountVectorizer(tokenizer=tokenize)),
                         ('tfidf', TfidfTransformer()),
                         ('clf',
                          MultiOutputClassifier(RandomForestClassifier()))])

    # specify parameters for grid search
    parameters = {
        'clf__estimator__n_estimators': [20],
        'clf__estimator__min_samples_split': [2]
    }

    # create grid search object
    cv = GridSearchCV(pipeline, param_grid=parameters)

    return cv
def build_model():
    ''' build model

    Returns
    -----------------------
    cv: GridSearchCV model used for prediction
    '''

    pipeline = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize, max_df=0.75)),
        ('tfidf', TfidfTransformer()),
        ('clf',
         MultiOutputClassifier(estimator=RandomForestClassifier(), n_jobs=-1))
    ])

    parameters = {
        'vect__max_df': (0.5, 0.75),
        'clf__estimator__n_estimators': [25, 50],
    }

    cv = GridSearchCV(pipeline, param_grid=parameters, n_jobs=-1, verbose=2)
    return cv
Example #15
def build_model():
    """
    Return a grid search model with pipeline and classifier

    Parameters:
        None

    Returns:
        cv: the GridSearchCV estimator
    """
    moc = MultiOutputClassifier(RandomForestClassifier())

    pipeline = Pipeline([('vect', CountVectorizer(tokenizer=tokenize)),
                         ('tfidf', TfidfTransformer()), ('clf', moc)])

    parameters = {
        'clf__estimator__max_depth': [10, 50, None],
        'clf__estimator__min_samples_leaf': [2, 5, 10]
    }

    cv = GridSearchCV(pipeline, parameters)
    return cv
def build_model():
    """
    Function to build the model by setting up a pipeline of steps to train an NLP model.
    Input: None
    Return: model: scikit-learn GridSearchCV that can be fit on training data and evaluated on the test set.
    """

    # Steps: tokenize, transform to get Tfidf vectors for data, classifier to train
    pipeline = Pipeline([('vect', CountVectorizer(tokenizer=tokenize)),
                         ('tfidf', TfidfTransformer()),
                         ('clf', MultiOutputClassifier(AdaBoostClassifier()))])

    # Parameter search on sklearn cross validation
    parameters = {
        'clf__estimator__n_estimators': [50, 100, 200],
        'clf__estimator__learning_rate': [0.1, 0.5, 1.0]
    }

    # grid search on data to obtain best parameters.
    model = GridSearchCV(pipeline, param_grid=parameters, n_jobs=-1)

    return model
def build_model():
    """
    Create model build pipeline
    Then specify parameters
    Run gridsearch to select optimal parameters
    input: nothing
    output: model
    """
    model = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', MultiOutputClassifier(RandomForestClassifier()))
    ])


    parameters = {
        'clf__estimator__n_estimators': [1, 200],
        'clf__estimator__min_samples_split': [2, 100], 
        'clf__estimator__min_samples_leaf': [5, 100]
    }
    cv = GridSearchCV(estimator=model, param_grid=parameters, cv=3)
    return cv
def build_model():
    '''
    Builds the machine learning pipeline and tunes it with grid search.
    :return cv: GridSearchCV model
    '''
    # model pipeline
    pipeline = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', MultiOutputClassifier(estimator=RandomForestClassifier()))
    ])

    parameters = {
        'vect__max_features': (None, 5000),
        'tfidf__use_idf': (True, False),
        'clf__estimator__n_estimators': [10, 20, 50],
        'clf__estimator__min_samples_split': [2, 3, 4]
    }

    cv = GridSearchCV(estimator=pipeline, param_grid=parameters, n_jobs=-1, verbose=2)

    return cv
def build_model():
    '''
    Function to build a model: creates the pipeline and tunes hyperparameters with GridSearchCV
    Input: N/A
    Output: Returns the model
    '''
    # Creating Machine Learning Pipeline
    pipeline = Pipeline([('vect', CountVectorizer(tokenizer=tokenize)),
                         ('tfidf', TfidfTransformer()),
                         ('clf',
                          MultiOutputClassifier(RandomForestClassifier()))])

    # choose parameters
    parameters = {'clf__estimator__n_estimators': [50]}

    # create grid search object
    model = GridSearchCV(pipeline,
                         param_grid=parameters,
                         scoring='recall_micro',
                         cv=4)

    return model
def build_model():
    """
    Build the machine learning model.

    """

    # Create pipeline
    pipeline = Pipeline([('vectorize', CountVectorizer(tokenizer=tokenize)),
                         ('tf-idf', TfidfTransformer()),
                         ('classifier',
                          MultiOutputClassifier(RandomForestClassifier()))])

    # Do parameter tuning
    parameters = {
        #'classifier__estimator__n_estimators': [100, 200],
        #'classifier__estimator__criterion': ['gini', 'entropy'],
        'classifier__estimator__max_depth': [6, 9]
    }
    # Create the grid search object; fitting happens later
    cv = GridSearchCV(pipeline, parameters, n_jobs=-1)

    return cv
Example #21
def build_model():
    '''
    Model pipeline to train a classifier that predicts outputs for the 36 categories.
    cv - grid search for tuning parameters
    
    '''

    rf = RandomForestClassifier()
    pipeline = Pipeline([('vect', CountVectorizer(tokenizer=tokenize)),
                         ('tfidf', TfidfTransformer()),
                         ('clf', MultiOutputClassifier(rf))])

    # define parameters for GridSearchCV
    parameters = {
        'clf__estimator__min_samples_split': [2, 3, 4],
        'vect__ngram_range': ((1, 2), (2, 2))
    }

    # create gridsearch object and return as final model pipeline
    cv = GridSearchCV(pipeline, param_grid=parameters)

    return cv
Example #22
def build_model():
    '''
    Args: None
    Returns: a GridSearchCV object wrapping the pipeline for model training
    '''

    # Build a machine learning pipeline
    pipeline = Pipeline([('vect', CountVectorizer(tokenizer=tokenize)),
                         ('tfidf', TfidfTransformer()),
                         ('clf',
                          MultiOutputClassifier(RandomForestClassifier()))])

    # set parameters for tuning

    parameters = {
        'vect__ngram_range': [(1, 1), (1, 2)],
        'clf__estimator__n_estimators': [10, 20, 50]
    }

    cv = GridSearchCV(pipeline, param_grid=parameters, cv=None)

    return cv
Example #23
def build_model():
    pipeline = Pipeline([('vect', CountVectorizer(tokenizer=tokenize)),
                         ('tfidf', TfidfTransformer()),
                         ('clf',
                          MultiOutputClassifier(RandomForestClassifier()))])
    parameters = {
        # RandomForestClassifier
        'clf__estimator__n_estimators': [50, 100],
        #'clf__estimator__min_samples_split': [2,5],
        #'clf__estimator__criterion': ['entropy', 'gini']

        # SVC
        #"clf__estimator__C": [0.001, 0.01, 0.1, 1, 10],
        #"clf__estimator__gamma":[0.001, 0.01, 0.1, 1]

        # DecisionTreeClassifier
        #"clf__estimator__criterion": ['entropy', 'gini'],
        #"clf__estimator__min_samples_split":[2,4]
    }

    cv = GridSearchCV(pipeline, param_grid=parameters)
    return cv
Example #24
def model_pipeline():
    """
    Set up model pipeline. Include custom transformers and optimise parameters using GridSearchCV
    :return: an instance of the model
    """
    pipeline = Pipeline([
        ('features', FeatureUnion([

            ('text_pipeline', Pipeline([
                ('vect', CountVectorizer(tokenizer=tokenize)),
                ('tfidf', TfidfTransformer())
            ])),

            ('starting_verb', StartingVerbExtractor()),
            ('text_length', TextLengthExtractor()),
            ('word_count', WordCountExtractor()),
            ('sentiment', SentimentExtractor())
        ])),

        ('clf', MultiOutputClassifier(RandomForestClassifier()))
    ])

    # parameters = {'features__text_pipeline__tfidf__norm': ['l1', 'l2'],
    #               'clf__estimator__criterion': ["gini", "entropy"],
    #               'clf__estimator__max_features': ['auto', 'sqrt', 'log2'],
    #               'clf__estimator__class_weight': ['balanced']}  # used to account for class imbalance

    # Best CV params
    parameters = {'features__text_pipeline__tfidf__norm': ['l2'],
                  'clf__estimator__criterion': ["gini"],
                  'clf__estimator__max_features': ['sqrt'],
                  'clf__estimator__class_weight': ['balanced']}  # used to account for class imbalance

    # Focus on the f1 score due to the unbalanced classes
    cv = GridSearchCV(pipeline, param_grid=parameters, verbose=3, n_jobs=-1)

    logging.debug('function:model_pipeline: model pipeline instantiated')

    return cv
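
The custom extractors wired into the FeatureUnion above (StartingVerbExtractor, TextLengthExtractor, WordCountExtractor, SentimentExtractor) are defined elsewhere and not shown. A minimal sketch of how one such transformer could look, assuming the usual scikit-learn BaseEstimator/TransformerMixin pattern; this TextLengthExtractor body is illustrative, not the author's implementation:

import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin

class TextLengthExtractor(BaseEstimator, TransformerMixin):
    """Illustrative transformer: emits the character length of each message."""

    def fit(self, X, y=None):
        # Stateless: nothing to learn from the training data.
        return self

    def transform(self, X):
        # Return an (n_samples, 1) array so FeatureUnion can stack it next to the TF-IDF features.
        return np.array([len(text) for text in X]).reshape(-1, 1)
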
def build_model():
    '''
    Build an ML pipeline using TF-IDF, random forest, and grid search
    Input: None
    Output:
        Results of GridSearchCV
    '''
    pipeline = Pipeline([('vect', CountVectorizer(tokenizer=tokenize)),
                         ('tfidf', TfidfTransformer()),
                         ('clf',
                          MultiOutputClassifier(RandomForestClassifier()))])

    parameters = {
        'vect__max_df': [0.8],
        'clf__estimator__max_depth': (25, 50, None),
        'clf__estimator__min_samples_split': (2, 10, 25, 50, 100),
        'clf__estimator__n_estimators': [500]
    }

    cv = GridSearchCV(pipeline, parameters, cv=5, n_jobs=-1, verbose=10)

    return cv
def build_model():
    """ Function to build the ML pipeline.

    Arguments: 
        None
    
    Returns: 
        model: Scikit Pipeline or GridSearchCV object
    """ 

    # LinearSVC Classifier was selected as it performed better than DecisionTree,
    # RandomForest and AdaBoost, all tried in the ML Pipeline Preparation notebook.

    pipeline = Pipeline([

        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', MultiOutputClassifier(LinearSVC()))
    ])

    model = pipeline

    ############ Use grid search to find better parameters ####################

    # check pipeline parameters
    # pipeline.get_params()

    parameters = {
        'clf__estimator__loss': ('hinge', 'squared_hinge'),
        'clf__estimator__C': (0.5, 1.0)
    } 

    cv = GridSearchCV(estimator=pipeline, n_jobs=-1, param_grid=parameters)
    
    model = cv

    ###########################################################################

    return model
def build_model():
    """ Build a TF-IDF pipeline that,
    processes text and then performs,
    multi-output classification on the 36 categories in the dataset.
    
    Returns:
       cv (scikit-learn GridSearchCV):  GridSearchCV model object. 
       
    """
    pipeline = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', MultiOutputClassifier(OneVsRestClassifier(LinearSVC())))
    ])

    parameters = {
        'tfidf__smooth_idf': [True, False],
        'clf__estimator__estimator__C': [1, 2, 5]
    }

    cv = GridSearchCV(pipeline, parameters)
    
    return cv
Example #28
def build_model():
    """Builds classification model
    Args:
        N/A

    Returns:
        cv: GridSearchCV model object (unfitted)
    
     """

    pipeline = Pipeline([('vect', CountVectorizer(tokenizer=tokenize)),
                         ('tfidf', TfidfTransformer()),
                         ('clf',
                          MultiOutputClassifier(RandomForestClassifier()))])

    parameters = {
        'vect__ngram_range': ((1, 1), (1, 2)),
        'clf__estimator__min_samples_split': [2, 4],
    }

    cv = GridSearchCV(pipeline, param_grid=parameters, verbose=2, n_jobs=4)
    return cv
def build_model():
    '''
    Function to build model pipeline with feature extraction and estimator.
    
    ARGS:
    None
    
    OUTPUT:
    cv: GridSearchCV object wrapping the built pipeline
    
    '''
    pipeline = Pipeline([('vect', CountVectorizer(tokenizer=tokenize)),
                         ('tfidf', TfidfTransformer()),
                         ('clf',
                          MultiOutputClassifier(RandomForestClassifier(),
                                                n_jobs=-1))])
    parameters = {
        'clf__estimator__criterion': ["gini", "entropy"],
        'clf__estimator__n_jobs': [-1]
    }
    cv = GridSearchCV(pipeline, parameters, n_jobs=-1)
    return cv
Example #30
def build_model():
    """
    Build Model pipeline
    
    Output is a tuned GridSearchCV model that processes text messages
    and applies a classifier for scoring.
    """

    modelp = Pipeline([('vect', CountVectorizer(tokenizer=tokenize)),
                       ('tfidf', TfidfTransformer()),
                       ('clf', MultiOutputClassifier(AdaBoostClassifier()))])

    # hyper-parameter grid
    parameters = {'clf__estimator__n_estimators': (50, 100)}

    # create model
    model = GridSearchCV(estimator=modelp,
                         param_grid=parameters,
                         verbose=3,
                         cv=2)

    return model
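
Whatever their differences, each build_model() above returns an unfitted estimator (a Pipeline or a GridSearchCV). A typical calling sequence, assuming X_train/X_test hold the message strings and Y_train/Y_test the 36 binary category columns (variable names are illustrative):

    model = build_model()
    model.fit(X_train, Y_train)      # runs the grid search where applicable
    Y_pred = model.predict(X_test)

    # When the return value is a GridSearchCV, the tuned settings are available:
    # print(model.best_params_)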