def ML_analysis(X_train, X_test, y_train, y_test, data):
    """Tune a multinomial logistic-regression pipeline and report results.

    Vectorizes the text with bigram (2, 2) counts, runs a randomized
    search over penalty/C (refit on macro recall), prints the selected
    parameters, best CV score and estimator, then prints a summary and
    confusion matrix for the held-out test split.

    Parameters
    ----------
    X_train, X_test : raw text collections accepted by mo.vectorizer.
    y_train, y_test : target labels for the two splits.
    data : frame holding the 'Cat_level' column used for class names.
    """
    X_train_counts, count_vectorizer = mo.vectorizer(
        X_train, (2, 2))

    # transforming testing data into document-term matrix
    X_test_counts = count_vectorizer.transform(
        X_test)

    # 'saga' supports both l1 and l2 penalties, matching the search space.
    pipeline = Pipeline(
        [('clf', LogisticRegression(solver='saga', multi_class='multinomial',
                                    class_weight='balanced'))])

    parameters = {'clf__penalty': ['l1', 'l2'],
                  'clf__C': [0.001, .009, 0.01, .09, 1, 2, 5, 10, 25, 30, 40]}

    scoring = ['precision_macro', 'recall_macro', 'f1_macro',
               'balanced_accuracy']

    # With multiple scorers, refit='recall_macro' makes best_score_ and
    # best_params_ refer to the macro-recall metric.
    grid_search = RandomizedSearchCV(pipeline, parameters, cv=5, n_iter=10,
                                     n_jobs=-1, verbose=1, scoring=scoring,
                                     refit='recall_macro')
    grid_search.fit(X_train_counts, y_train)

    # Fixed: the original evaluated grid_search.best_score_ as a bare
    # statement and discarded the value (dead code).
    print(grid_search.best_params_)
    print(grid_search.best_score_)
    predicted_level = grid_search.predict(X_test_counts)
    print(grid_search.best_estimator_)
    print_summary(y_test, predicted_level,
                  data, "Cat_level")
    plot.plot_confusion_matrix(y_test, predicted_level,
                               classes=data.groupby('Cat_level').count().index,
                               title='Confusion matrix, without normalization')
# Example #2 (score: 0) — separator left over from the scraped examples page
def ML_analysis(X_train, y_train, data, colum_name):
    """Cross-validated logistic-regression baseline on unigram counts.

    Vectorizes X_train with (1, 1) n-grams, obtains 3-fold
    cross-validated predictions from a default-configured
    LogisticRegression, and prints the accuracy score followed by the
    per-class classification report. The data and colum_name arguments
    are accepted for interface compatibility but are not used here.
    """
    counts_matrix, _vectorizer = mo.vectorizer(X_train, (1, 1))

    # Default hyper-parameters; an earlier experiment used C=30.0,
    # class_weight='balanced', solver='newton-cg', multi_class='multinomial'.
    classifier = LogisticRegression()

    cv_predictions = cross_val_predict(classifier, counts_matrix, y_train,
                                       cv=3)
    print(metrics.accuracy_score(y_train, cv_predictions))
    print(metrics.classification_report(y_train, cv_predictions))
def ML_analysis(X_train, X_test, y_train, y_test, data):
    """Train a logistic-regression model on (1, 2)-gram counts and report.

    Fits the count vectorizer on the training text, transforms the test
    text with the same vocabulary, trains LogisticRegression(C=1) via
    mo.train_model, then prints a summary and plots the (unnormalized)
    confusion matrix using the 'Cat_level' labels from data.
    """
    train_counts, fitted_vectorizer = mo.vectorizer(X_train, (1, 2))
    test_counts = fitted_vectorizer.transform(X_test)

    # An earlier experiment used C=30.0, class_weight='balanced',
    # solver='newton-cg', multi_class='multinomial', n_jobs=-1,
    # random_state=40.
    classifier = LogisticRegression(C=1)

    test_predictions = mo.train_model(classifier, train_counts, y_train,
                                      test_counts)

    print_summary(y_test, test_predictions, data, "Cat_level")

    class_labels = data.groupby('Cat_level').count().index
    plot.plot_confusion_matrix(y_test,
                               test_predictions,
                               classes=class_labels,
                               title='Confusion matrix, without normalization')
# Load the evaluation ("real") data set; real_data_name and training_data
# are defined elsewhere in the file.
Real_data = pd.read_csv('/Users/ruddirodriguez/Dropbox/Machine_Learning/NLP/' +
                        real_data_name)
real_data_clean = preparing_data(Real_data)
training_data_clean = preparing_data(training_data)

#real_data_clean = real_data_clean[real_data_clean['Level'] ==5]
#training_data_clean = training_data_clean[(
#    training_data_clean['Level'] == 5) | (training_data_clean['Level'] == 2)]

#fig = plt.figure(figsize=(8, 6))
#plot.number_of_levels(real_data_clean)
#
#fig = plt.figure(figsize=(8, 6))
#plot.number_of_levels(training_data_clean)

# Fit a unigram (1, 1) count vectorizer on the lemmatized training text.
X_train_counts, count_vectorizer = mo.vectorizer(
    training_data_clean['lemmatizing'].tolist(), (1, 1))

# transforming testing data into document-term matrix
X_test_counts = count_vectorizer.transform(
    real_data_clean['lemmatizing'].tolist())

y_train = training_data_clean["Level"].tolist()
y_test = real_data_clean["Level"].tolist()

# Multinomial Naive Bayes with smoothing alpha=0.5.
NB = MultinomialNB(alpha=0.5)

predicted_level = mo.train_model(NB, X_train_counts, y_train, X_test_counts)

# NOTE(review): the summary is given training_data_clean here while y_test
# comes from real_data_clean — confirm which frame is intended.
print_summary(y_test, predicted_level, training_data_clean, "Cat_level")
# NOTE(review): the following call was truncated in the source capture and
# is not valid Python as-is; left commented out:
# accuracy, precision, recall, harmonic_mean = mo.get_metrics(
# Example #5 (score: 0) — separator left over from the scraped examples page
#
#worcloud_generation (text)
## Create and generate a word cloud image:
#wordDict = {
#'data': '',
#'year': '',
#'science':''}
#text_copy = multipleReplace(text, wordDict)
#worcloud_generation (text_copy)

list_data_frames = []
cc = 0  # numeric Level counter, incremented once per category label
for i in training_data_clean['Cat_level'].value_counts().index:
    cc += 1
    # NOTE(review): value_counts() orders labels by frequency, not by level
    # number, so rows filtered with Level == cc may not correspond to the
    # category label i used below — confirm the intended pairing.
    data = training_data_clean[training_data_clean['Level'] == cc]
    # Count (1, 2)-gram frequencies within this subset.
    X_train_counts, count_vectorizer = mo.vectorizer(
        data['lemmatizing'].tolist(), (1, 2))
    # Per-word total frequency across the subset, tagged with label i,
    # sorted most-frequent first.
    freqss = pd.DataFrame(
        [(word, X_train_counts.getcol(idx).sum(), i)
         for word, idx in count_vectorizer.vocabulary_.items()],
        columns=['word', 'Freq', 'Level']).sort_values(by='Freq',
                                                       ascending=False)
    list_data_frames.append(freqss)
    # Build one text blob for the word cloud, blanking dominant terms first.
    text = " ".join(item for item in data['lemmatizing'].tolist())
    wordDict = {'data': '', 'year': '', 'science': ''}
    savepath = (
        "/Users/ruddirodriguez/Dropbox/Machine_Learning/Data_Science_day_UU/" +
        i + "." + "svg")
    text_copy = multipleReplace(text, wordDict)
    worcloud_generation(text_copy, savepath)