def ML_analysis(X_train, X_test, y_train, y_test, data):
    """Tune a bigram bag-of-words + logistic-regression model and report results.

    Vectorizes ``X_train`` with ``mo.vectorizer`` (bigrams only), runs a
    randomized search over penalty/C for a balanced multinomial
    LogisticRegression, then prints a summary and plots a confusion matrix
    for the test-set predictions.

    Parameters
    ----------
    X_train, X_test : text collections accepted by ``mo.vectorizer``/``transform``
    y_train, y_test : target labels
    data : DataFrame with a 'Cat_level' column (used for class names)
    """
    # Fit the count vectorizer on training text only (ngram_range=(2, 2)).
    X_train_counts, count_vectorizer = mo.vectorizer(X_train, (2, 2))

    # Transform the test text into the same document-term space.
    X_test_counts = count_vectorizer.transform(X_test)

    pipeline = Pipeline(
        [('clf', LogisticRegression(solver='saga', multi_class='multinomial',
                                    class_weight='balanced'))])

    parameters = {'clf__penalty': ['l1', 'l2'],
                  'clf__C': [0.001, .009, 0.01, .09, 1, 2, 5, 10, 25, 30, 40]}

    scoring = ['precision_macro', 'recall_macro', 'f1_macro', 'balanced_accuracy']

    # refit='recall_macro': the final refitted model maximizes macro recall,
    # so predict() below uses that model.
    grid_search = RandomizedSearchCV(pipeline, parameters, cv=5, n_iter=10,
                                     n_jobs=-1, verbose=1, scoring=scoring,
                                     refit='recall_macro')
    grid_search.fit(X_train_counts, y_train)

    print(grid_search.best_params_)
    print(grid_search.best_score_)
    predicted_level = grid_search.predict(X_test_counts)
    print(grid_search.best_estimator_)
    print_summary(y_test, predicted_level, data, "Cat_level")
    plot.plot_confusion_matrix(y_test, predicted_level,
                               classes=data.groupby('Cat_level').count().index,
                               title='Confusion matrix, without normalization')
def ML_analysis_split(cleaned_data, column_target, classifier, label=None):
    """Split ``cleaned_data``, fit a text-preprocessing pipeline topped with
    ``classifier``, and print a prediction summary on the held-out split."""
    # Keep the features as a DataFrame: the preprocessing steps select the
    # 'Description' column from a pandas frame by name.
    features = cleaned_data.drop(column_target, axis=1)
    targets = cleaned_data[column_target].values

    preprocessing = Pipeline(steps=[
        ('pre_regular_exp', CleaningTextRegularExp('Description')),
        ('pre_stop_words', removing_stop_words('Description', 'english')),
        ('Pre_selector', FeatureSelector('Description')),
        ('vectorized', CountVectorizer()),
    ])

    X_train, X_test, y_train, y_test = train_test_split(
        features, targets, test_size=0.2, random_state=42)

    # Nest the preprocessing pipeline as the first step of the model pipeline
    # so a single fit/predict drives the whole chain.
    model_pipeline = Pipeline(steps=[
        ('full_pipeline', preprocessing),
        ('model', classifier()),
    ])

    model_pipeline.fit(X_train, y_train)
    predictions = model_pipeline.predict(X_test)
    print_summary(y_test, predictions, cleaned_data, label)
def ML_analysis(X_train, X_test, y_train, y_test, data):
    """Randomized search over a CountVectorizer + multinomial logistic
    regression pipeline; prints a summary, plots a confusion matrix and the
    tuned hyperparameter values.

    Parameters
    ----------
    X_train, X_test : raw text collections (vectorization happens in-pipeline)
    y_train, y_test : target labels
    data : DataFrame with a 'Cat_level' column (used for class names)

    Returns
    -------
    tuple : (predicted_level, grid_search) — test predictions and the fitted
        search object.
    """
    # NOTE(review): this redefines ML_analysis declared earlier in the file;
    # only the last definition survives at import time — confirm intent.

    # Text feature extractor chained with a simple classifier.
    pipeline = Pipeline([('vectorized', CountVectorizer()),
                         ('clf',
                          LogisticRegression(solver='saga',
                                             multi_class='multinomial',
                                             class_weight='balanced'))])

    # Search jointly over vectorization and classifier hyperparameters.
    param_grid = {
        'vectorized__ngram_range': [(1, 1), (1, 2), (1, 3), (1, 4), (1, 5),
                                    (2, 3)],
        'vectorized__max_features': [300, 1000, 5000, 8000, 9000],
        'clf__penalty': ['l1', 'l2'],
        'clf__C': [1, 2, 5, 10, 25, 30, 40]
    }

    scoring = [
        'precision_macro', 'recall_macro', 'f1_macro', 'balanced_accuracy'
    ]

    # refit='recall_macro': the final refitted model maximizes macro recall.
    grid_search = RandomizedSearchCV(pipeline,
                                     param_grid,
                                     cv=10,
                                     n_iter=10,
                                     n_jobs=-1,
                                     verbose=1,
                                     scoring=scoring,
                                     refit='recall_macro')
    grid_search.fit(X_train, y_train)

    print(grid_search.best_params_)
    predicted_level = grid_search.predict(X_test)
    print_summary(y_test, predicted_level, data, "Cat_level")
    plot.plot_confusion_matrix(y_test,
                               predicted_level,
                               classes=data.groupby('Cat_level').count().index,
                               title='Confusion matrix, without normalization')

    # Report the tuned values for exactly the searched parameters
    # (Pipeline.get_params(deep=True) exposes the step__param keys).
    best_parameters = grid_search.best_estimator_.get_params()
    print('Best Parameters are')
    for param_name in sorted(param_grid):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))

    return predicted_level, grid_search
def ML_analysis(X_train, X_test, y_train, y_test, data):
    """Fit a plain logistic regression (C=1) on (1, 2)-gram counts and report
    test-set results: summary printout plus confusion-matrix plot."""
    # Vectorize the training text; reuse the fitted vectorizer on test text
    # so both share one document-term space.
    train_counts, vectorizer = mo.vectorizer(X_train, (1, 2))
    test_counts = vectorizer.transform(X_test)

    classifier = LogisticRegression(C=1)

    predictions = mo.train_model(classifier, train_counts, y_train,
                                 test_counts)

    print_summary(y_test, predictions, data, "Cat_level")

    plot.plot_confusion_matrix(
        y_test,
        predictions,
        classes=data.groupby('Cat_level').count().index,
        title='Confusion matrix, without normalization')
def ML_analysis_separated_data(training_cleaned_data, cleaned_test_data, column_target, classifier, label="Level"):
    """Train on one cleaned frame and evaluate on a second, pre-separated
    frame, using a full text-preprocessing + TF-IDF pipeline topped with
    ``classifier``."""
    # Features stay as DataFrames: the preprocessing steps pull the
    # 'Description' column from a pandas frame by name.
    X_train = training_cleaned_data.drop(column_target, axis=1)
    y_train = training_cleaned_data[column_target].values

    X_test = cleaned_test_data.drop(column_target, axis=1)
    y_test = cleaned_test_data[column_target].values

    preprocessing = Pipeline(steps=[
        ('pre_regular_exp', CleaningTextRegularExp('Description')),
        ('pre_stop_words', removing_stop_words('Description', 'english')),
        ('pre_lemmatize', LemmatizeWord('Description')),
        ('Pre_selector', FeatureSelector('Description')),
        ('vectorized', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
    ])

    # Nest the preprocessing pipeline under the final estimator so one
    # fit/predict drives the whole chain.
    model_pipeline = Pipeline(steps=[
        ('full_pipeline', preprocessing),
        ('model', classifier()),
    ])

    model_pipeline.fit(X_train, y_train)
    predictions = model_pipeline.predict(X_test)

    # NOTE(review): the summary frame and the confusion-matrix class labels
    # come from the training frame (and the hard-coded 'Cat_level' column),
    # not the test frame — confirm that is intended.
    print_summary(y_test, predictions, training_cleaned_data, label)
    plot.plot_confusion_matrix(
        y_test, predictions,
        classes=training_cleaned_data.groupby('Cat_level').count().index,
        title='Confusion matrix, without normalization')
def ML_analysis(X_train, X_test, y_train, y_test, data):
    """Fit a TF-IDF + default LogisticRegression model on encoded labels and
    report test-set results (summary, confusion matrix, precision plot).

    NOTE(review): the vectorizer is fitted on the module-level global
    ``training_data_clean`` rather than on the ``X_train``/``data``
    parameters — confirm they refer to the same training frame.
    """

    # Encoding data to apply machine learning tools
    y_train_encoder = md.data_encoder(y_train)
    y_test_encoder = md.data_encoder(y_test)

    # setting stop words (downloads the NLTK corpus on every call)
    nltk.download("stopwords")
    stop = stopwords.words('english')

    # vectorization: TF-IDF over the 'Description' column of the global
    # training frame, (1, 2)-grams, capped at 1000 features
    tfidf_vect = mo.vectorizer_Tfid(training_data_clean,
                                    "Description",
                                    stop_words=stop,
                                    token_pattern=r'\w{1,}',
                                    ngram_range=(1, 2),
                                    max_features=1000)
    X_train_counts = tfidf_vect.transform(X_train)
    X_test_counts = tfidf_vect.transform(X_test)

    # default hyperparameters; the commented alternatives were presumably
    # tried in earlier experiments
    logreg = LogisticRegression(
    )  # C=30.0, class_weight='balanced', solver='newton-cg',
    # multi_class='multinomial', n_jobs=-1, random_state=40)

    predicted_level = mo.train_model(logreg, X_train_counts, y_train_encoder,
                                     X_test_counts)

    print_summary(y_test_encoder, predicted_level, training_data_clean,
                  "Level")

    plot.plot_confusion_matrix(y_test_encoder,
                               predicted_level,
                               classes=data.groupby('Level').count().index,
                               title='Confusion matrix, without normalization')
    # per-class recall plotted against the amount of training data
    plot.precision_number_training_data(
        training_data_clean,
        recall_score(y_test_encoder, predicted_level, average=None), 'Level')
# --- Script: Naive Bayes baseline on lemmatized text -----------------------
# Fit a unigram count vectorizer on the lemmatized training text.
X_train_counts, count_vectorizer = mo.vectorizer(
    training_data_clean['lemmatizing'].tolist(), (1, 1))

# Transform the evaluation text into the same document-term space.
X_test_counts = count_vectorizer.transform(
    real_data_clean['lemmatizing'].tolist())

y_train = training_data_clean["Level"].tolist()
y_test = real_data_clean["Level"].tolist()

NB = MultinomialNB(alpha=0.5)

predicted_level = mo.train_model(NB, X_train_counts, y_train, X_test_counts)

print_summary(y_test, predicted_level, training_data_clean, "Cat_level")

# BUG FIX: the original scored `y_predicted_counts`, a name never defined in
# this script (NameError at runtime); score the predictions produced above.
accuracy, precision, recall, harmonic_mean = mo.get_metrics(
    y_test, predicted_level)

evaluation_list = {
    'Accuracy': accuracy,
    'Precision': precision,
    'Recall': recall,
    'Harmonic mean': harmonic_mean
}

print(evaluation_list)
def ML_analysis(X_train, X_test, y_train, y_test, data):
    """Grid-search a TF-IDF + multinomial Naive Bayes pipeline on encoded
    labels; prints a summary, plots a confusion matrix, and prints the
    aggregate metrics.

    Parameters
    ----------
    X_train, X_test : raw text collections (vectorization happens in-pipeline)
    y_train, y_test : target labels (string labels; encoded below)
    data : DataFrame with a 'Level' column (used for class names)

    Returns
    -------
    tuple : (predicted_level, grid_search) — test predictions and the fitted
        search object.
    """
    # Encode string labels to integers for the sklearn estimators.
    y_train_encoder = md.data_encoder(y_train)
    y_test_encoder = md.data_encoder(y_test)

    # English stop words for the vectorizer (downloads the corpus if needed).
    nltk.download("stopwords")
    stop = stopwords.words('english')

    pipeline = Pipeline([
        ('vectorizer', TfidfVectorizer(stop_words=stop,
                                       token_pattern=r'\w{1,}')),
        ('clf', MultinomialNB(class_prior=None, fit_prior=False))])

    # BUG FIX: np.linspace(0.5, 1.5, 6, 7) passed 7 positionally as the
    # boolean `endpoint` parameter; only the 6 sample points are wanted.
    param_grid = {'vectorizer__ngram_range': [(1, 1), (1, 2), (2, 2)],
                  'vectorizer__max_features': [9000, 10000],
                  'clf__alpha': np.linspace(0.5, 1.5, 6)}

    scoring = ['precision_macro', 'recall_macro', 'f1_macro',
               'balanced_accuracy']

    # Exhaustive search; refit on macro recall so predict() uses that model.
    grid_search = GridSearchCV(pipeline, param_grid, cv=5,
                               n_jobs=-1, verbose=1, scoring=scoring,
                               refit='recall_macro')
    grid_search.fit(X_train, y_train_encoder)

    print(grid_search.best_params_)
    predicted_level = grid_search.predict(X_test)
    print(grid_search.best_estimator_)

    # BUG FIX: the original printed the summary a second time against the
    # module-level global `training_data_clean`; report once, on `data`.
    print_summary(y_test_encoder, predicted_level, data, "Level")

    plot.plot_confusion_matrix(y_test_encoder, predicted_level,
                               classes=data.groupby('Level').count().index,
                               title='Confusion matrix, without normalization')

    accuracy, precision, recall, harmonic_mean = mo.get_metrics(
        y_test_encoder, predicted_level)

    # The original built this dict but left the print commented out; emit it
    # so the computed metrics are actually reported.
    evaluation_list = {'Accuracy': accuracy, 'Precision': precision,
                       'Recall': recall, 'Harmonic mean': harmonic_mean}
    print(evaluation_list)

    return predicted_level, grid_search