Example #1
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

from model import DataManager, Classifier, PlotGenerator

REPETITION = 50

probes, result = DataManager.load_data('data/diabetes.arff')
x = np.array(probes)
y = np.array(result)

bayes = Classifier(MultinomialNB(), REPETITION, x, y)
bayes.calculate_indicators()

logistic_regression = Classifier(LogisticRegression(), REPETITION, x, y)
logistic_regression.calculate_indicators()

kneighbours_classifier = Classifier(KNeighborsClassifier(10), REPETITION, x, y)
kneighbours_classifier.calculate_indicators()

mlp_classifier = Classifier(MLPClassifier(), REPETITION, x, y)
mlp_classifier.calculate_indicators()

classifiers_array = []
classifiers_array.append(bayes)
classifiers_array.append(logistic_regression)
classifiers_array.append(kneighbours_classifier)
classifiers_array.append(mlp_classifier)
Example #2
alphaList = [0.000001, 0.01, 0.1, 1, 5, 10, 20, 1000]
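# The loop below assumes term-frequency matrices tf_train / tf_test, labels trainY /
# testY and a topic count dim that are built earlier in the full script. A minimal,
# hedged sketch of one way they might be produced (train_docs / test_docs are
# placeholder names, not from the original):
from sklearn.feature_extraction.text import CountVectorizer
dim = 20  # number of LDA topics (assumed value)
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
tf_train = tf_vectorizer.fit_transform(train_docs)  # train_docs: raw training documents (assumed)
tf_test = tf_vectorizer.transform(test_docs)        # test_docs: held-out documents (assumed)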
for alpha in alphaList:
    print('ALPHA: {}'.format(alpha))
    from sklearn.decomposition import LatentDirichletAllocation
    lda = LatentDirichletAllocation(n_components=dim,
                                    max_iter=5,
                                    learning_method='online',
                                    learning_offset=50.,
                                    doc_topic_prior=alpha,
                                    random_state=0)
    lda.fit(tf_train)
    trainTopicDistArr = lda.transform(tf_train)
    testTopicDistArr = lda.transform(tf_test)

    #%% 5. Classification using Naive Bayes
    # Train Model
    from sklearn.naive_bayes import MultinomialNB
    nb_LDA = MultinomialNB().fit(trainTopicDistArr, trainY)

    # Print Training Accuracy
    from sklearn.model_selection import cross_val_score
    print(
        "(CV, LDA): ",
        cross_val_score(MultinomialNB(),
                        trainTopicDistArr,
                        trainY,
                        cv=3,
                        scoring="accuracy").mean())

    # Print Test Accuracy
    print('(TE, LDA): ' + str(nb_LDA.score(testTopicDistArr, testY)))
Example #3
    salary_train[i] = number.fit_transform(salary_train[i])
    salary_test[i] = number.fit_transform(salary_test[i])

colnames = salary_train.columns
len(colnames[0:13])

trainX = salary_train[colnames[0:13]]
trainY = salary_train[colnames[13]]
testX = salary_test[colnames[0:13]]
testY = salary_test[colnames[13]]

from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.metrics import confusion_matrix, accuracy_score

sgnb = GaussianNB()
smnb = MultinomialNB()
spred_gnb = sgnb.fit(trainX, trainY).predict(testX)
confusion_matrix(testY, spred_gnb)
print("Accuracy", accuracy_score(testY, spred_gnb))  # roughly 80%

spred_mnb = smnb.fit(trainX, trainY).predict(testX)
confusion_matrix(testY, spred_mnb)
print("Accuracy", accuracy_score(testY, spred_mnb))  # roughly 75%

# Stratified Method
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score
metric_names = [
    'f1', 'roc_auc', 'average_precision', 'accuracy', 'precision', 'recall'
]
scores_df = pd.DataFrame(index=metric_names,
                         columns=['Random-CV',
Example #4
              for i,d in enumerate(dictionary):
                if d[0] == word:
                  wordID = i
                  features_matrix[docID,wordID] = words.count(word)
        docID = docID + 1
    return features_matrix

train_dir = r'C:\Users\USERONE\Desktop\ling-spam\train-mails'
dictionary = make_Dictionary(train_dir)
print (dictionary)
train_labels = np.zeros(702)
train_labels[351:701] = 1 #spam emails
train_matrix = extract_features(train_dir)
print(train_matrix[1])
# Training SVM and Naive bayes classifier
NB_model = MultinomialNB()
#GNB_model = GaussianNB()
#BNB_model = BernoulliNB()
LinearSVM_model = LinearSVC()
SVM_model = SVC()
NB_model.fit(train_matrix,train_labels)
#GNB_model.fit(train_matrix,train_labels)
#BNB_model.fit(train_matrix,train_labels)
LinearSVM_model.fit(train_matrix,train_labels)
SVM_model.fit(train_matrix,train_labels)

#use GridSearch to better choose tuning parameters C and gamma
param_grid = {'C':[0.1,1,10,100,1000],'gamma':[1,0.1,0.01,0.001,0.0001]}
grid = GridSearchCV(SVC(), param_grid, verbose=3)
grid.fit(train_matrix,train_labels)
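# After the search finishes, GridSearchCV exposes the winning configuration; a hedged
# follow-up that reports it and evaluates on a held-out test set. The test directory,
# its size (260 mails, second half spam) and label layout are assumptions here, reusing
# the extract_features helper defined above:
print(grid.best_params_, grid.best_score_)
test_dir = r'C:\Users\USERONE\Desktop\ling-spam\test-mails'  # hypothetical path
test_matrix = extract_features(test_dir)
test_labels = np.zeros(260)
test_labels[130:260] = 1
print("NB  accuracy:", NB_model.score(test_matrix, test_labels))
print("SVM accuracy:", grid.best_estimator_.score(test_matrix, test_labels))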
Example #5
def predictEmotion(data, newRec):
    res = {}
    perf_score = {}
    
    df = prepData(data)
    text_hl_sum = df['headline'] + ' ' + df['summary']
    
    processedRec = prepNewRec(newRec)
    
    # create transformer 
    vectorizer = CountVectorizer()
    encoder = LabelEncoder()
    
    # tokenize and build vocabulary_
    vectorizer.fit(text_hl_sum)
    
    # encode document
    X = vectorizer.transform(text_hl_sum)
    #print('training data - transformed matrix shape: ', X.shape)
    
    vect_new = vectorizer.transform(processedRec)
    #print('new records - transformed matrix shape: ', vect_new.shape)
    #print()

    
    for i in range(10):
        emotion = 'emotion_'+str(i)
        
        y = df[emotion]
        #print (X.shape[0], len(y))
        
        # split into train and test sets
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    
        # resample the data set to increase the minority class
        #resample = SMOTE()
        resample = RandomOverSampler()
        X_train_new, y_train_new = resample.fit_resample(X_train, y_train)
        
        clf_pipe = Pipeline([('tfidf', TfidfTransformer()),
                             ('mnb', MultinomialNB())])

        tuned_parameters = {
            'tfidf__norm': ('l1', 'l2'),
            'tfidf__use_idf': (False, True),
            'mnb__alpha': [1, 0.1, 0.01]
        }
        
        with np.errstate(divide='ignore'):  # errstate only applies when used as a context manager
            clf = GridSearchCV(clf_pipe, tuned_parameters, cv=10, scoring=score)
            clf.fit(X_train_new, y_train_new)
        
        perf_score[emotion] = clf.best_score_
        
        print()
        print('~~~~~~~~~~~~~~~~~ %s ~~~~~~~~~~~~~~~' % emotion)
        print()
        
        print('Best score: %0.4f with parameters %r' % (clf.best_score_, clf.best_params_))
        print()
        
        print('Detailed model performance score with parameters to correctly predict the results:')
        for mean, std, params in zip(clf.cv_results_['mean_test_score'], 
                                     clf.cv_results_['std_test_score'], 
                                     clf.cv_results_['params']):
            print('%0.4f +/-%0.04f with parameters %r' % (mean, std * 2, params))
        print()
        
        
        print("Detailed classification report (scores were computed on evaluation data set):")
        print()
        print(classification_report(y_test, clf.predict(X_test), digits=4))
        print()
        
        
        ####### predict the emotion for new headline and summary 
        pred = clf.predict(vect_new)
        #print (emotion, pred)
        res[emotion] = int(pred[0])
        
    return res
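# A hedged usage sketch: predictEmotion expects a DataFrame with 'headline', 'summary'
# and 'emotion_0'..'emotion_9' columns plus a new record to classify; the names below
# are illustrative, not taken from the original script.
# res = predictEmotion(data=news_df, newRec=new_article)
# print(res)   # e.g. {'emotion_0': 0, 'emotion_1': 1, ...}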
Example #6
# In[7]:

# TF-IDF
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

print("\n\nTarget: >> ", twenty_train.target)

# In[9]:

# Machine Learning
# Training Naive Bayes (NB) classifier on training data.
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(X_train_tfidf, twenty_train.target)

# In[14]:

# Building a pipeline: we can write less code and do all of the above by building a pipeline as follows.
# The names 'vect', 'tfidf' and 'clf' are arbitrary but will be used later.
# We will be using 'text_clf' going forward.
from sklearn.pipeline import Pipeline

text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()), ('clf', MultinomialNB())])

text_clf = text_clf.fit(twenty_train.data, twenty_train.target)
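# With the pipeline fitted, new documents can be classified directly; a short usage
# sketch (the two sample sentences are illustrative, not from the original notebook):
docs_new = ['God is love', 'OpenGL on the GPU is fast']
predicted = text_clf.predict(docs_new)
for doc, category in zip(docs_new, predicted):
    print('%r => %s' % (doc, twenty_train.target_names[category]))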

# In[15]:
	print('A dummy classifier that assigns the ham class to every new observation would get 75% precision, 87% recall and an 80% f-score')

	# print('\nNaive Bayes 1')
	# naive_model = MultinomialNB()
	# naive_model.fit(bowed_messages, messages['label'])
	# # print(len(msg_train), len(msg_test))
	# cv_results = cross_val_score(naive_model, bowed_messages, messages['label'], cv=10, scoring='accuracy')
	# print(cv_results.mean(), cv_results.std())
	# print(classification_report(messages['label'], naive_model.predict(bowed_messages)))

	msg_train, msg_test, label_train, label_test = train_test_split(messages['message'], messages['label'], test_size=0.2)  # split the data 80:20

	# First tokenization, Naive Bayes
	print('\n1) Naive Bayes, tokenize 1')
	pipeline, label_predicted = do_smth_with_model(steps=[('bow', CountVectorizer(analyzer=tokenize)),
							  ('classifier', MultinomialNB())])

	draw_learning_curve(pipeline)
	draw_roc_curve(label_predicted)
	print('Judging by the ROC curve the classifier performs very well: the AUC value is very high and the curve runs almost parallel to the x axis')
	print('The learning curve shows that with more training data the cross-validation score may improve slightly, '
		  'while the training score stays flat')


	# Second tokenization, Naive Bayes
	print('\n2) Naive Bayes, tokenize 2')
	do_smth_with_model(steps=[('bow', CountVectorizer(analyzer=tokenize2)),
							  ('classifier', MultinomialNB())])

	# First tokenization, Naive Bayes, removing stop words
	print('\n3) Naive Bayes, removing stop words')
Example #8
dictionary = make_Dictionary(root_dir)

# Prepare feature vectors per training mail and its labels

features_matrix, labels = extract_features(root_dir)
np.save('enron_features_matrix.npy', features_matrix)
np.save('enron_labels.npy', labels)

# train_matrix = np.load('enron_features_matrix.npy');
# labels = np.load('enron_labels.npy');
print(features_matrix.shape)
print(labels.shape)
print(sum(labels == 0), sum(labels == 1))
X_train, X_test, y_train, y_test = train_test_split(features_matrix,
                                                    labels,
                                                    test_size=0.40)

## Training models and its variants

model1 = LinearSVC()
model2 = MultinomialNB()

model1.fit(X_train, y_train)
model2.fit(X_train, y_train)

result1 = model1.predict(X_test)
result2 = model2.predict(X_test)

print(confusion_matrix(y_test, result1))
print(confusion_matrix(y_test, result2))
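# Beyond the confusion matrices, overall accuracy is a one-line addition with the
# standard sklearn.metrics helper (a small assumed extension of the example):
from sklearn.metrics import accuracy_score
print("LinearSVC accuracy:    ", accuracy_score(y_test, result1))
print("MultinomialNB accuracy:", accuracy_score(y_test, result2))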
Example #9
    # Thanks to http://scikit-learn.org/stable/auto_examples/text/document_classification_20newsgroups.html
    print('_' * 80)
    print("Training: ")
    print(model)
    t0 = time()
    model.fit(dTr['dffi'], dTr['target'])
    train_time = time() - t0
    print("train time: %0.3fs" % train_time)

    t0 = time()
    pred = model.predict(dTe['dffi'])
    test_time = time() - t0
    print("test time:  %0.3fs" % test_time)

    score = metrics.accuracy_score(dTe['target'], pred)
    print("accuracy:   %0.3f" % score)
    
    model_desc = str(model).split('(')[0]
    print("confusion matrix:")
    print(metrics.confusion_matrix(dTe['target'], pred))
    return model_desc, score, train_time, test_time


results = []
# Train sparse Naive Bayes classifiers
print('=' * 80)
print("Naive Bayes")
results.append(benchmark(MultinomialNB(alpha=.01),dTest,dTrain))
results.append(benchmark(BernoulliNB(alpha=.01),dTest,dTrain))
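# Each benchmark() call returns (model_desc, score, train_time, test_time); a small
# assumed summary table built from the collected results:
import pandas as pd
summary = pd.DataFrame(results, columns=['model', 'accuracy', 'train_time', 'test_time'])
print(summary.sort_values('accuracy', ascending=False))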

Example #10
count_v0 = CountVectorizer()
counts_all = count_v0.fit_transform(all_text)
count_v1 = CountVectorizer(vocabulary=count_v0.vocabulary_)
counts_train = count_v1.fit_transform(train_texts)
print("the shape of train is " + repr(counts_train.shape))
count_v2 = CountVectorizer(vocabulary=count_v0.vocabulary_)
counts_test = count_v2.fit_transform(test_texts)
print("the shape of test is " + repr(counts_test.shape))

tfidftransformer = TfidfTransformer()
train_data = tfidftransformer.fit(counts_train).transform(counts_train)
test_data = tfidftransformer.fit(counts_test).transform(counts_test)

x_train = train_data
y_train = train_labels
x_test = test_data
y_test = test_labels

print('(3) Naive Bayes...')
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
clf = MultinomialNB(alpha=0.01)
clf.fit(x_train, y_train)
preds = clf.predict(x_test)
num = 0
preds = preds.tolist()
for i, pred in enumerate(preds):
    if int(pred) == int(y_test[i]):
        num += 1
print('accuracy: ' + str(float(num) / len(preds)))
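# The manual loop above computes plain accuracy; the already-imported metrics module
# computes it directly (plus a fuller per-class report), a hedged equivalent:
print('accuracy_score:', metrics.accuracy_score(y_test, preds))
print(metrics.classification_report(y_test, preds))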
Example #11
def main():
    # The file path where to find the data
    path_folder = "data/"

    # Opening metadata
    meta_data = pd.read_csv(path_folder + "Tobacco3482.csv")

    # Here I'm extracting the labels
    labels = np.unique(meta_data["label"])

    # Opening the data
    x = []
    y = []
    label_classes = {}
    i = 0
    for label in labels:
        path = path_folder + label + "/*.txt"
        print("Opening " + label + " data")
        files = glob.glob(path)
        for file in files:
            with open(file, 'r') as file_tmp:
                x.append(file_tmp.read())
            y.append(label)
        label_classes[i] = label
        i += 1
    print("Opened " + str(len(x)) + " documents, " + str(len(np.unique(y))) +
          " different classes")

    # Here I'm extracting the label
    labels = np.unique(meta_data["label"])

    # Treating the labels
    label_encoder = preprocessing.LabelEncoder()
    y = label_encoder.fit_transform(y)

    # Splitting the data into train and test
    x_train, x_test, y_train, y_test = train_test_split(x,
                                                        y,
                                                        test_size=0.1,
                                                        random_state=42)

    # Transforming the data into token representation
    vectorizer = CountVectorizer()
    vectorizer.fit(x_train)

    x_train_counts = vectorizer.transform(x_train)
    x_test_counts = vectorizer.transform(x_test)

    # Bayesian part

    # Creation of the model
    clf = MultinomialNB()
    print("Training Bayesian for baseline")
    # Training
    clf.fit(x_train_counts, y_train)

    print("Printing results for Bayesian")
    # Printing of the results
    print("Accuracy score : ")
    print(clf.score(x_test_counts, y_test))
    y_pred = clf.predict(x_test_counts)
    print("Confusion matrix :")
    print(confusion_matrix(y_test, y_pred))
    print("Classification report :")
    print(classification_report(y_test, y_pred))
    print("Where classes are :")
    for label in label_classes:
        print(str(label) + " : " + label_classes[label])

    # Neural Network part
    # creation of the callbacks to save the best model

    checkpointer = ModelCheckpoint(filepath="weights.hdf5",
                                   verbose=1,
                                   save_best_only=True)
    callbacks = [checkpointer]

    # Extracting the size of the data
    dimension_data = x_train_counts.shape[1]

    # Creation of the model
    NN = model_creation(dimension_data)

    print("Training neural network, this may take a while")
    # Training of the data
    NN.fit(x_train_counts.toarray(),
           to_categorical(y_train),
           epochs=10,
           validation_split=0.1,
           batch_size=128,
           callbacks=callbacks)

    # Loading the best model
    NN.load_weights('weights.hdf5')

    print("Printing neural network results")
    # Printing the results
    print("Accuracy score :")
    print(NN.evaluate(x_test_counts.toarray(), to_categorical(y_test))[1])

    print("Confusion matrix :")
    confusion_matrix_NN(NN, x_test_counts.toarray(), to_categorical(y_test))

    print("Classification report :")
    y_pred = NN.predict(np.array(x_test_counts.toarray()))
    y_test_class = np.argmax(to_categorical(y_test), axis=1)
    y_pred_class = np.argmax(y_pred, axis=1)
    print(classification_report(y_test_class, y_pred_class))

    print("Where classes are :")
    for label in label_classes:
        print(str(label) + " : " + label_classes[label])

    print(
        "The model is trained and the weights are saved at weights.hdf5, closing script"
    )
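# The excerpt ends inside main(); presumably the script is launched with the usual
# entry-point guard (not shown in the original):
if __name__ == "__main__":
    main()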
Example #12
print(dane['text'].head().apply(process_text))

from sklearn.feature_extraction.text import CountVectorizer
messages_bow = CountVectorizer(analyzer=process_text).fit_transform(
    dane['text'])
messages_bow.shape

print(messages_bow.shape)

# target variable

y = df1['Class'].values  #target
X = df1.drop(['Class'], axis=1).values  #features
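# X_train / X_test / y_train / y_test are used below but never defined in this excerpt;
# a plausible split with the standard sklearn helper, keeping pandas objects so that
# y_train.values further down still works (parameter values are assumptions):
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    df1.drop(['Class'], axis=1), df1['Class'], test_size=0.2, random_state=0)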

from sklearn.naive_bayes import MultinomialNB
classifier = MultinomialNB()

classifier.fit(X_train, y_train)
print(classifier.predict(X_train))
print(y_train.values)

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
pred = classifier.predict(X_train)
print(classification_report(y_train, pred))
print('Confusion Matrix: \n', confusion_matrix(y_train, pred))
print()
print('Accuracy: ', accuracy_score(y_train, pred))

print('Predicted value: ', classifier.predict(X_test))
print('Actual value: ', y_test.values)
Example #13
                                 shuffle=True)
print(len(twenty_train.data))
print(len(twenty_test.data))
print(twenty_train.target_names)
print("\n".join(twenty_train.data[0].split("\n")))
print(twenty_train.target[0])
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
X_train_tf = count_vect.fit_transform(twenty_train.data)
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_tf)
X_train_tfidf.shape
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
mod = MultinomialNB()
mod.fit(X_train_tfidf, twenty_train.target)
X_test_tf = count_vect.transform(twenty_test.data)
X_test_tfidf = tfidf_transformer.transform(X_test_tf)
predicted = mod.predict(X_test_tfidf)
print("Accuracy:", accuracy_score(twenty_test.target, predicted))
print(
    classification_report(twenty_test.target,
                          predicted,
                          target_names=twenty_test.target_names))
print("confusion matrix is \n", confusion_matrix(twenty_test.target,
                                                 predicted))
"""
Expected output (the first run downloads the 20 newsgroups dataset):
Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)
2257
"""
def test_calibration():
    """Test calibration objects with isotonic and sigmoid"""
    n_samples = 100
    X, y = make_classification(n_samples=2 * n_samples, n_features=6,
                               random_state=42)
    sample_weight = np.random.RandomState(seed=42).uniform(size=y.size)

    X -= X.min()  # MultinomialNB only allows positive X

    # split train and test
    X_train, y_train, sw_train = \
        X[:n_samples], y[:n_samples], sample_weight[:n_samples]
    X_test, y_test = X[n_samples:], y[n_samples:]

    # Naive-Bayes
    clf = MultinomialNB().fit(X_train, y_train, sample_weight=sw_train)
    prob_pos_clf = clf.predict_proba(X_test)[:, 1]

    pc_clf = CalibratedClassifierCV(clf, cv=y.size + 1)
    assert_raises(ValueError, pc_clf.fit, X, y)

    # Naive Bayes with calibration
    for this_X_train, this_X_test in [(X_train, X_test),
                                      (sparse.csr_matrix(X_train),
                                       sparse.csr_matrix(X_test))]:
        for method in ['isotonic', 'sigmoid']:
            pc_clf = CalibratedClassifierCV(clf, method=method, cv=2)
            # Note that this fit overwrites the fit on the entire training
            # set
            pc_clf.fit(this_X_train, y_train, sample_weight=sw_train)
            prob_pos_pc_clf = pc_clf.predict_proba(this_X_test)[:, 1]

            # Check that brier score has improved after calibration
            assert_greater(brier_score_loss(y_test, prob_pos_clf),
                           brier_score_loss(y_test, prob_pos_pc_clf))

            # Check invariance against relabeling [0, 1] -> [1, 2]
            pc_clf.fit(this_X_train, y_train + 1, sample_weight=sw_train)
            prob_pos_pc_clf_relabeled = pc_clf.predict_proba(this_X_test)[:, 1]
            assert_array_almost_equal(prob_pos_pc_clf,
                                      prob_pos_pc_clf_relabeled)

            # Check invariance against relabeling [0, 1] -> [-1, 1]
            pc_clf.fit(this_X_train, 2 * y_train - 1, sample_weight=sw_train)
            prob_pos_pc_clf_relabeled = pc_clf.predict_proba(this_X_test)[:, 1]
            assert_array_almost_equal(prob_pos_pc_clf,
                                      prob_pos_pc_clf_relabeled)

            # Check invariance against relabeling [0, 1] -> [1, 0]
            pc_clf.fit(this_X_train, (y_train + 1) % 2,
                       sample_weight=sw_train)
            prob_pos_pc_clf_relabeled = \
                pc_clf.predict_proba(this_X_test)[:, 1]
            if method == "sigmoid":
                assert_array_almost_equal(prob_pos_pc_clf,
                                          1 - prob_pos_pc_clf_relabeled)
            else:
                # Isotonic calibration is not invariant against relabeling
                # but should improve in both cases
                assert_greater(brier_score_loss(y_test, prob_pos_clf),
                               brier_score_loss((y_test + 1) % 2,
                                                prob_pos_pc_clf_relabeled))

        # check that calibration can also deal with regressors that have
        # a decision_function
        clf_base_regressor = CalibratedClassifierCV(Ridge())
        clf_base_regressor.fit(X_train, y_train)
        clf_base_regressor.predict(X_test)

        # Check failure cases:
        # only "isotonic" and "sigmoid" should be accepted as methods
        clf_invalid_method = CalibratedClassifierCV(clf, method="foo")
        assert_raises(ValueError, clf_invalid_method.fit, X_train, y_train)

        # base-estimators should provide either decision_function or
        # predict_proba (most regressors, for instance, should fail)
        clf_base_regressor = \
            CalibratedClassifierCV(RandomForestRegressor(), method="sigmoid")
        assert_raises(RuntimeError, clf_base_regressor.fit, X_train, y_train)
Example #15
    final_text = " ".join([token.lemma_ for token in doc])
    print(final_text)

    return final_text
#
# data["modified_sentence"]=data["question"].apply(Cleaning)
# print (data["modified_sentence"])

def generate_answer(predict_class):
    ans=random.choice(answer_dictionary[predict_class])
    return ans


from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
clf2 = MultinomialNB(alpha=0.001, class_prior=None, fit_prior=True).fit(X1_train, Y_train)

P=model.transform([Cleaning(question)])
predict2=clf2.predict(P)
print (predict2)

y_predict = clf2.predict(X1_test)
print(accuracy_score(Y_test,y_predict)*100)

# MLP: Multi-Layer Perceptron
from sklearn.metrics import accuracy_score
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV

clf6 = MLPClassifier(activation='relu',alpha=0.0019,hidden_layer_sizes=(300,), learning_rate='constant',power_t=1.5, solver='adam',random_state=15)
clf6.fit(X1_train,Y_train)
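# For symmetry with the Naive Bayes model above, a short assumed evaluation of the MLP
# on the same held-out split:
y_pred_mlp = clf6.predict(X1_test)
print(accuracy_score(Y_test, y_pred_mlp) * 100)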
Example #16
# sys.exit(0)

# print len(my_data), len(better_result)


def test(text):
    print(text)
    return text


# my_data = my_data[:len(my_data)-1]
vect = CountVectorizer(decode_error='ignore', preprocessor=test)
text_clf = Pipeline([
    ('vect', vect),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

print("Dataset length: %s " % len(my_data))
print("Training...")
my_clf = text_clf.fit(my_data[:, 4], my_data[:, 3])

print("# Features: %s" % len(vect.vocabulary_))

print("Done! \nClassifying test set...")
predicted = my_clf.predict(my_test_data[:, 4])

print(np.mean(predicted == my_test_data[:, 3]))

print("Accuracy: %.2f" % my_clf.score(my_data[:, 4], my_data[:, 3]))
# score() takes features and true labels, not predictions:
print("Accuracy: %.2f" % my_clf.score(my_test_data[:, 4], my_test_data[:, 3]))
Example #17
train_data = train_data.drop(cols_to_drop, axis=1)

# intermediate result
print(train_data.head())

# lowercase everything and strip out all unusual characters
train_data['text'] = train_data['text'].str.lower()
train_data['text'] = train_data['text'].apply(lambda elem: re.sub(
    r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?", "", elem))
train_data['text'] = train_data['text'].apply(
    lambda elem: re.sub(r"\d+", "", elem))

# intermediate result
print(train_data.head())

# set up the vectorizer that will turn the text into vectors
count_vectorizer = feature_extraction.text.CountVectorizer()
# convert the training data into vectors
train_vectors = count_vectorizer.fit_transform(train_data["text"])
# sample of the vectors
print(train_vectors)
# convert the test data into vectors
test_vectors = count_vectorizer.transform(test_df["text"])

clf = MultinomialNB(alpha=1, fit_prior=True, class_prior=None)
scores = model_selection.cross_val_score(clf,
                                         train_vectors,
                                         train_data["target"],
                                         cv=3,
                                         scoring="f1")
print(scores)
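# The cross-validation above only scores the training folds; a hedged follow-up that
# fits on the full training vectors and produces predictions for the test set (the test
# frame has no target column here, so only the predictions themselves are shown):
clf.fit(train_vectors, train_data["target"])
test_pred = clf.predict(test_vectors)
print(test_pred[:10])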
Example #18
    def mod_knn_class(y_endogenous, x_exogenous, train_ratio=0.7, folds=5):
        """
        :param y_endogenous:
        :param x_exogenous:
        :param train_ratio:
        :param folds:
        :return:
        """
        random_state = 123
        """Drop NaN"""
        y_endogenous.dropna(inplace=True)
        x_exogenous.dropna(inplace=True)
        """Transform data for LogReg fitting"""
        scaler = StandardScaler()
        std_data = scaler.fit_transform(x_exogenous.values)
        std_data = pd.DataFrame(std_data,
                                index=x_exogenous.index,
                                columns=x_exogenous.columns)
        """Shuffle Data for IMBALANCES"""
        from sklearn.utils import shuffle
        X_shuf, Y_shuf = shuffle(std_data, y_endogenous)
        X_shuf = X_shuf.to_numpy().astype(float)
        Y_shuf = Y_shuf.to_numpy().astype(int)
        """K-fold CV"""
        cv = StratifiedKFold(n_splits=folds, shuffle=False)
        """Establish Models Settings"""
        # White-Box: GLM
        lasso = LogisticRegression(penalty='l1',
                                   C=0.1,
                                   random_state=random_state,
                                   solver='liblinear',
                                   n_jobs=1)
        ridge = LogisticRegression(penalty='l2',
                                   C=0.1,
                                   random_state=random_state,
                                   solver='liblinear',
                                   n_jobs=1)
        log = LogisticRegression(class_weight='balanced',
                                 C=0.1,
                                 random_state=random_state,
                                 solver='liblinear',
                                 n_jobs=1)
        svc = SVC(C=0.1,
                  kernel='linear',
                  cache_size=100,
                  shrinking=True,
                  decision_function_shape='ovo',
                  probability=True)
        # Black-Box: Bagging
        rfc = RandomForestClassifier(random_state=random_state,
                                     bootstrap=True,
                                     max_depth=80,
                                     criterion='entropy',
                                     min_samples_leaf=3,
                                     min_samples_split=10,
                                     n_estimators=500,
                                     max_features=None)
        gbc = GradientBoostingClassifier(learning_rate=0.5,
                                         n_estimators=250,
                                         min_samples_split=200,
                                         max_depth=3)
        # Non-Linear
        nb = GaussianNB()
        gpc = GaussianProcessClassifier()
        mnb = MultinomialNB()
        bnb = BernoulliNB(binarize=True)
        knn = KNeighborsClassifier(n_neighbors=2)
        """Storage List Dictionary for Models"""
        en_models = [{
            'label': 'K Neighbors Classifier',
            'model': knn,
            'dict_metrics': {},
        }]
        """Loop Models"""
        for m in en_models:
            MOD = m['model']
            print(m['label'])
            # AUC storage
            mean_tprs_y, mean_fpr_x = [], np.linspace(0, 1, 100)
            fprs_x, tprs_y, aucs = [], [], []
            # Other Metrics Storage: Evaluation Metrics Dictionary
            dict_metrics = {
                'fold_no': [],  # 1
                'acc_score': [],  # 2
                'jaccard_ind': [],  # 3
                'conf_matrix': [],  # 4
                'f1_score': [],  # 5
                'log_loss': [],  # 6
                'feat_coef': [],  # 7
                'feat_names': [],  # 8
                'fprs': [],
                'tprs': []
            }
            # Train / Test Split
            i = 1  # Start Loop
            for train_ind, test_ind in cv.split(X_shuf, Y_shuf):
                # Train Test Split
                X_train, X_test = X_shuf[train_ind], X_shuf[test_ind]
                y_train, y_test = Y_shuf[train_ind], Y_shuf[test_ind]
                # Fit Model
                MOD.fit(X_train, y_train)
                # ROC Curve
                fpr, tpr, thresholds = roc_curve(
                    y_test,
                    MOD.predict_proba(X_test).T[1])
                roc_auc = auc(fpr, tpr)
                fprs_x.append(fpr)
                tprs_y.append(tpr)
                mean_tprs_y.append(interp(mean_fpr_x, fpr, tpr))
                aucs.append(roc_auc)

                # Fold Number
                fold_no = i

                # Accuracy Score
                y_pred = MOD.predict(X_test)
                acc_score = metrics.accuracy_score(y_test, y_pred)
                # Jaccard Index
                j_index = jaccard_similarity_score(y_true=y_test,
                                                   y_pred=y_pred)
                j_index_rnd = round(j_index, 2)
                # Confusion Matrix
                cm = confusion_matrix(y_test, y_pred)
                # F1 Score
                f1 = f1_score(y_test, y_pred)
                # Log Loss
                lg_loss = log_loss(y_test, y_pred)
                # Feature Importance
                try:
                    if m['label'] == 'Random Forest Classifier':
                        feature_imp = pd.Series(
                            rfc.feature_importances_,
                            index=x_exogenous.columns).sort_values(
                                ascending=False)
                        feature_coef = pd.Series(
                            feature_imp,
                            index=x_exogenous.columns).sort_values(
                                ascending=False)
                        dict_metrics['feat_coef'].append(feature_coef.values)
                        dict_metrics['feat_names'].append(feature_coef.index)
                    elif m['label'] == 'Gradient Boost Classifier':
                        feature_imp = pd.Series(
                            gbc.feature_importances_,
                            index=x_exogenous.columns).sort_values(
                                ascending=False)
                        feature_coef = pd.Series(
                            feature_imp,
                            index=x_exogenous.columns).sort_values(
                                ascending=False)
                        dict_metrics['feat_coef'].append(feature_coef.values)
                        dict_metrics['feat_names'].append(feature_coef.index)
                    elif m['label'] == 'none':
                        pass
                    else:
                        # Feature Coefficients
                        coefficients = MOD.coef_[0]
                        feature_coef = pd.Series(
                            coefficients,
                            index=x_exogenous.columns).sort_values(
                                ascending=False)
                        dict_metrics['feat_coef'].append(feature_coef.values)
                        dict_metrics['feat_names'].append(feature_coef.index)
                except Exception:  # (Valueerror, Attribute Error)
                    pass

                # Store Metrics
                dict_metrics['fold_no'].append(fold_no)
                dict_metrics['acc_score'].append(acc_score)
                dict_metrics['jaccard_ind'].append(j_index_rnd)
                dict_metrics['conf_matrix'].append(cm)
                dict_metrics['f1_score'].append(f1)
                dict_metrics['log_loss'].append(lg_loss)
                dict_metrics['fprs'].append(fpr)
                dict_metrics['tprs'].append(tpr)

                np.savetxt('/Users/Derrick-Vlad-/Desktop/' + 'FPR_KNN.csv',
                           fpr,
                           delimiter=",")
                np.savetxt('/Users/Derrick-Vlad-/Desktop/' + 'TPR_KNN.csv',
                           tpr,
                           delimiter=",")

                # Next Loop Indexer
                i = i + 1

            # Store All Metrics
            m['dict_metrics'] = dict_metrics
        """Aggregate results across all models"""
        labels = [i['label'] for i in en_models if 'label' in i]
        eva_all = [i['dict_metrics'] for i in en_models if 'dict_metrics' in i]
        accuracy = [i['acc_score'] for i in eva_all if 'acc_score' in i]
        f1 = [i['f1_score'] for i in eva_all if 'f1_score' in i]
        fprss = [i['fprs'] for i in eva_all if 'fprs' in i]
        tprss = [i['tprs'] for i in eva_all if 'tprs' in i]
        logL = [i['log_loss'] for i in eva_all if 'log_loss' in i]
        confmatrix = [i['conf_matrix'] for i in eva_all if 'conf_matrix' in i]
        # Prepare Data-frame
        # ACCURACY
        acc = np.vstack(accuracy)
        acc = np.transpose(acc)
        df1 = pd.DataFrame(acc, columns=labels)

        # F1 Score
        f1 = np.vstack(f1)
        f1 = np.transpose(f1)
        df2 = pd.DataFrame(f1, columns=labels)

        # FALSE POSITIVE RATES
        #fprs = np.vstack(fprss)  # [:, 0] OR [:, None]
        #fprs = np.transpose(fprs)
        #df3 = pd.DataFrame(fprs, columns=labels)
        print(fprss)

        # TRUE POSITIVE RATES
        #tprs = np.vstack(tprss)
        #tprs = np.transpose(tprs)
        #df4 = pd.DataFrame(tprs, columns=labels)
        print(tprss)

        # LOG LOSS SCORE
        logloss = np.vstack(logL)
        logloss = np.transpose(logloss)
        df5 = pd.DataFrame(logloss, columns=labels)

        # CONFUSION MATRIX
        # confmat = np.vstack(confmatrix)
        # confmat = np.transpose(confmat)
        # df6 = pd.DataFrame(confmat, columns=labels)
        print(confmatrix)

        results = Models()
        results.acc_score = df1
        results.f1_score = df2
        #results.fprs = df3
        #results.tprs = df4
        results.logloss = df5
        # results.confmat = df6

        return results
Example #19
Shows the attributes of unknown creatures
"""
print(atributos_test.head(5))
# Which creatures have the attributes above?
print(resultados_test.head(5))


"""
3. Make predictions
"""
# Make predictions
atributos_previsao = [0, 0, 0]
dados_previsao = [atributos_previsao]

# Create the model
modelo = MultinomialNB()
modelo.fit(atributos_train, resultados_train)
#resultado_previsao = modelo.predict(dados_previsao)

#print("Predicted creature:")
#print(resultado_previsao)

#print("Accuracy of " + str(accuracy_score(, resultado_previsao) * 100) + "%")

"""
The data variable is a Python object that behaves like a dictionary.
The important dictionary keys to consider are:

    - the classification label names (target_names)
    - the actual labels (target)
Example #20
class TestModelTypeChecking(object):
    """
    Test model type checking utilities
    """

    ##////////////////////////////////////////////////////////////////////
    ## is_estimator testing
    ##////////////////////////////////////////////////////////////////////

    def test_estimator_alias(self):
        """
        Assert isestimator aliases is_estimator
        """
        assert isestimator is is_estimator

    @pytest.mark.parametrize("model", ESTIMATORS, ids=obj_name)
    def test_is_estimator(self, model):
        """
        Test that is_estimator works for instances and classes
        """
        assert inspect.isclass(model)
        assert is_estimator(model)

        obj = model()
        assert is_estimator(obj)

    @pytest.mark.parametrize("cls", [
        list, dict, tuple, set, str, bool, int, float
    ], ids=obj_name)
    def test_not_is_estimator(self, cls):
        """
        Assert Python objects are not estimators
        """
        assert inspect.isclass(cls)
        assert not is_estimator(cls)

        obj = cls()
        assert not is_estimator(obj)

    def test_is_estimator_pipeline(self):
        """
        Test that is_estimator works for pipelines
        """
        assert is_estimator(Pipeline)
        assert is_estimator(FeatureUnion)

        model = Pipeline([
            ('reduce_dim', PCA()),
            ('linreg', LinearRegression())
        ])

        assert is_estimator(model)

    def test_is_estimator_search(self):
        """
        Test that is_estimator works for search
        """
        assert is_estimator(GridSearchCV)
        assert is_estimator(RandomizedSearchCV)

        model = GridSearchCV(SVR(), {'kernel': ['linear', 'rbf']})
        assert is_estimator(model)

    @pytest.mark.parametrize("viz,params", [
        (Visualizer, {}),
        (ScoreVisualizer, {'model': LinearRegression()}),
        (ModelVisualizer, {'model': LogisticRegression()})
    ], ids=lambda i: obj_name(i[0]))
    def test_is_estimator_visualizer(self, viz, params):
        """
        Test that is_estimator works for Visualizers
        """
        assert inspect.isclass(viz)
        assert is_estimator(viz)

        obj = viz(**params)
        assert is_estimator(obj)

    ##////////////////////////////////////////////////////////////////////
    ## is_regressor testing
    ##////////////////////////////////////////////////////////////////////

    def test_regressor_alias(self):
        """
        Assert isregressor aliases is_regressor
        """
        assert isregressor is is_regressor

    @pytest.mark.parametrize("model", REGRESSORS, ids=obj_name)
    def test_is_regressor(self, model):
        """
        Test that is_regressor works for instances and classes
        """
        assert inspect.isclass(model)
        assert is_regressor(model)

        obj = model()
        assert is_regressor(obj)

    @pytest.mark.parametrize("model",
        CLASSIFIERS+CLUSTERERS+TRANSFORMERS+DECOMPOSITIONS,
    ids=obj_name)
    def test_not_is_regressor(self, model):
        """
        Test that is_regressor does not match non-regressor estimators
        """
        assert inspect.isclass(model)
        assert not is_regressor(model)

        obj = model()
        assert not is_regressor(obj)

    def test_is_regressor_pipeline(self):
        """
        Test that is_regressor works for pipelines
        """
        assert not is_regressor(Pipeline)
        assert not is_regressor(FeatureUnion)

        model = Pipeline([
            ('reduce_dim', PCA()),
            ('linreg', LinearRegression())
        ])

        assert is_regressor(model)

    @pytest.mark.xfail(reason="grid search has no _estimator_type it seems")
    def test_is_regressor_search(self):
        """
        Test that is_regressor works for search
        """
        assert is_regressor(GridSearchCV)
        assert is_regressor(RandomizedSearchCV)

        model = GridSearchCV(SVR(), {'kernel': ['linear', 'rbf']})
        assert is_regressor(model)

    @pytest.mark.parametrize("viz,params", [
        (ScoreVisualizer, {'model': LinearRegression()}),
        (ModelVisualizer, {'model': Ridge()})
    ], ids=lambda i: obj_name(i[0]))
    def test_is_regressor_visualizer(self, viz, params):
        """
        Test that is_regressor works on visualizers
        """
        assert inspect.isclass(viz)
        assert not is_regressor(viz)

        obj = viz(**params)
        assert is_regressor(obj)

    ##////////////////////////////////////////////////////////////////////
    ## is_classifier testing
    ##////////////////////////////////////////////////////////////////////

    def test_classifier_alias(self):
        """
        Assert isclassifier aliases is_classifier
        """
        assert isclassifier is is_classifier

    @pytest.mark.parametrize("model", CLASSIFIERS, ids=obj_name)
    def test_is_classifier(self, model):
        """
        Test that is_classifier works for instances and classes
        """
        assert inspect.isclass(model)
        assert is_classifier(model)

        obj = model()
        assert is_classifier(obj)

    @pytest.mark.parametrize("model",
        REGRESSORS+CLUSTERERS+TRANSFORMERS+DECOMPOSITIONS,
    ids=obj_name)
    def test_not_is_classifier(self, model):
        """
        Test that is_classifier does not match non-classifier estimators
        """
        assert inspect.isclass(model)
        assert not is_classifier(model)

        obj = model()
        assert not is_classifier(obj)

    def test_classifier_pipeline(self):
        """
        Test that is_classifier works for pipelines
        """
        assert not is_classifier(Pipeline)
        assert not is_classifier(FeatureUnion)

        model = Pipeline([
            ('reduce_dim', PCA()),
            ('linreg', LogisticRegression())
        ])

        assert is_classifier(model)

    @pytest.mark.xfail(reason="grid search has no _estimator_type it seems")
    def test_is_classifier_search(self):
        """
        Test that is_classifier works for search
        """
        assert is_classifier(GridSearchCV)
        assert is_classifier(RandomizedSearchCV)

        model = GridSearchCV(SVC(), {'kernel': ['linear', 'rbf']})
        assert is_classifier(model)

    @pytest.mark.parametrize("viz,params", [
        (ScoreVisualizer, {'model': MultinomialNB()}),
        (ModelVisualizer, {'model': MLPClassifier()})
    ], ids=lambda i: obj_name(i[0]))
    def test_is_classifier_visualizer(self, viz, params):
        """
        Test that is_classifier works on visualizers
        """
        assert inspect.isclass(viz)
        assert not is_classifier(viz)

        obj = viz(**params)
        assert is_classifier(obj)

    ##////////////////////////////////////////////////////////////////////
    ## is_clusterer testing
    ##////////////////////////////////////////////////////////////////////

    def test_clusterer_alias(self):
        """
        Assert isclusterer aliases is_clusterer
        """
        assert isclusterer is is_clusterer

    @pytest.mark.parametrize("model", CLUSTERERS, ids=obj_name)
    def test_is_clusterer(self, model):
        """
        Test that is_clusterer works for instances and classes
        """
        assert inspect.isclass(model)
        assert is_clusterer(model)

        obj = model()
        assert is_clusterer(obj)

    @pytest.mark.parametrize("model",
        REGRESSORS+CLASSIFIERS+TRANSFORMERS+DECOMPOSITIONS,
    ids=obj_name)
    def test_not_is_clusterer(self, model):
        """
        Test that is_clusterer does not match non-clusterer estimators
        """
        assert inspect.isclass(model)
        assert not is_clusterer(model)

        obj = model()
        assert not is_clusterer(obj)

    def test_clusterer_pipeline(self):
        """
        Test that is_clusterer works for pipelines
        """
        assert not is_clusterer(Pipeline)
        assert not is_clusterer(FeatureUnion)

        model = Pipeline([
            ('reduce_dim', PCA()),
            ('kmeans', KMeans())
        ])

        assert is_clusterer(model)

    @pytest.mark.parametrize("viz,params", [
        (ModelVisualizer, {'model': KMeans()})
    ], ids=lambda i: obj_name(i[0]))
    def test_is_clusterer_visualizer(self, viz, params):
        """
        Test that is_clusterer works on visualizers
        """
        assert inspect.isclass(viz)
        assert not is_clusterer(viz)

        obj = viz(**params)
        assert is_clusterer(obj)

    ##////////////////////////////////////////////////////////////////////
    ## is_gridsearch testing
    ##////////////////////////////////////////////////////////////////////

    def test_gridsearch_alias(self):
        """
        Assert isgridsearch aliases is_gridsearch
        """
        assert isgridsearch is is_gridsearch

    @pytest.mark.parametrize("model", SEARCH, ids=obj_name)
    def test_is_gridsearch(self, model):
        """
        Test that is_gridsearch works correctly
        """
        assert inspect.isclass(model)
        assert is_gridsearch(model)

        obj = model(SVC, {"C": [0.5, 1, 10]})
        assert is_gridsearch(obj)

    @pytest.mark.parametrize("model",
        [MLPRegressor, MLPClassifier, Imputer], ids=obj_name)
    def test_not_is_gridsearch(self, model):
        """
        Test that is_gridsearch does not match non grid searches
        """
        assert inspect.isclass(model)
        assert not is_gridsearch(model)

        obj = model()
        assert not is_gridsearch(obj)

    ##////////////////////////////////////////////////////////////////////
    ## is_probabilistic testing
    ##////////////////////////////////////////////////////////////////////

    def test_probabilistic_alias(self):
        """
        Assert isprobabilistic aliases is_probabilistic
        """
        assert isprobabilistic is is_probabilistic

    @pytest.mark.parametrize("model", [
        MultinomialNB, GaussianNB, LogisticRegression, SVC,
        RandomForestClassifier, GradientBoostingClassifier, MLPClassifier,
    ], ids=obj_name)
    def test_is_probabilistic(self, model):
        """
        Test that is_probabilistic works correctly
        """
        assert inspect.isclass(model)
        assert is_probabilistic(model)

        obj = model()
        assert is_probabilistic(obj)

    @pytest.mark.parametrize("model", [
        MLPRegressor, Imputer, StandardScaler, KMeans,
        RandomForestRegressor,
    ], ids=obj_name)
    def test_not_is_probabilistic(self, model):
        """
        Test that is_probabilistic does not match non-probabilistic estimators
        """
        assert inspect.isclass(model)
        assert not is_probabilistic(model)

        obj = model()
        assert not is_probabilistic(obj)
        if word.lower() not in stopwords.words('english')
    ]

    return wordss


data['text'].apply(process_text).head()
data.head()

#Splitting into training and testing data. Training data 70%
x_train, x_test, y_train, y_test = train_test_split(data['text'],
                                                    data['class'],
                                                    test_size=0.3)

#Creating the Model
pipeline = Pipeline([
    ('bow', CountVectorizer(analyzer=clean_text)),
    ('tfidf', TfidfTransformer()),
    ('classifier', MultinomialNB()),  # training on TF-IDF vectors with a Naive Bayes classifier
])

#Training the model
pipeline.fit(x_train, y_train)

#Testing
predictions = pipeline.predict(x_test)
print(classification_report(y_test, predictions))

#Confusion Matrix
sns.heatmap(confusion_matrix(y_test, predictions), annot=True)
    bl1.update (ml6)
    '''

    _trainfeatures, _trainlabels, _testfeatures, _testlabels = split(bf1, bl1)

    #(features, labels) = adapt (bf1, bl1)
    (trainfeatures, trainlabels) = adapt (_trainfeatures, _trainlabels)
    (testfeatures, testlabels) = adapt (_testfeatures, _testlabels)

    #models = (RandomForestClassifier(n_estimators = 128, random_state=0), )#GaussianProcessClassifier(), ExtraTreesClassifier(n_estimators=120), AdaBoostClassifier(n_estimators=120), GradientBoostingClassifier(n_estimators=120), BaggingClassifier (n_estimators=120), SVC(kernel='rbf'), SVC(kernel='linear'), DecisionTreeClassifier(random_state=None), KNeighborsClassifier(n_neighbors=5), GaussianNB(), MultinomialNB(), BernoulliNB())
    #models = (ExtraTreesClassifier(n_estimators=128, random_state=0),  AdaBoostClassifier(n_estimators=120), GradientBoostingClassifier(n_estimators=120), BaggingClassifier (n_estimators=120), )#SVC(kernel='rbf'), SVC(kernel='linear'), DecisionTreeClassifier(random_state=None), KNeighborsClassifier(n_neighbors=5), GaussianNB(), MultinomialNB(), BernoulliNB())
    #models = (SVC(kernel='rbf'), SVC(kernel='linear'), DecisionTreeClassifier(random_state=None), KNeighborsClassifier(n_neighbors=5), GaussianNB(), MultinomialNB(), BernoulliNB())

    #models = (RandomForestClassifier(n_estimators = 128, random_state=0), )#SVC(kernel='rbf'), SVC(kernel='linear'), DecisionTreeClassifier(random_state=None), KNeighborsClassifier(n_neighbors=5), GaussianNB(), MultinomialNB(), BernoulliNB())

    models = (RandomForestClassifier(n_estimators = 128, random_state=0), SVC(kernel='rbf'), SVC(kernel='linear'), DecisionTreeClassifier(random_state=None), KNeighborsClassifier(n_neighbors=5), GaussianNB(), MultinomialNB(), BernoulliNB())

    #models = (RandomForestClassifier(n_estimators = 120, random_state=0), )#ExtraTreesClassifier(n_estimators=120), GradientBoostingClassifier(n_estimators=120), BaggingClassifier (n_estimators=120), SVC(kernel='linear'), DecisionTreeClassifier(random_state=None), KNeighborsClassifier(n_neighbors=5), MultinomialNB())

    #models = (RandomForestClassifier(n_estimators = 120, random_state=0), )#ExtraTreesClassifier(n_estimators=120), )#GradientBoostingClassifier(n_estimators=120), BaggingClassifier (n_estimators=120), SVC(kernel='linear'), DecisionTreeClassifier(random_state=None), KNeighborsClassifier(n_neighbors=5), MultinomialNB())

    #fsets = (FSET_FULL,FSET_NOICC, FSET_MIN, FSET_YYY_G, FSET_FULL_TOP, FSET_YYY_TOP, FSET_FULL_TOP_G, FSET_YYY_TOP_G)
    #fsets = (FSET_FULL, FSET_G, FSET_ICC, FSET_SEC, FSET_Y, FSET_YY, FSET_YYY):

    fsets = (FSET_FULL, FSET_G, FSET_ICC, FSET_SEC, FSET_Y, FSET_YYY)

    #fsets = (FSET_FULL, FSET_Y, FSET_YYY)

    #fsets = (FSET_FULL, FSET_G, FSET_ICC, FSET_SEC, FSET_YYY, FSET_FULL_TOP, FSET_YYY_TOP, FSET_FULL_TOP_G, FSET_YYY_TOP_G)
    #fsets = (FSET_FULL, FSET_G, FSET_ICC, FSET_SEC, FSET_YYY, FSET_FULL_TOP_G, FSET_YYY_TOP_G)
    #fsets = (FSET_FULL, FSET_G, FSET_SEC, FSET_YYY, FSET_FULL_TOP_G, FSET_YYY_TOP_G)
Example #23
def modelTraining(X_train, X_test, y_train, y_test, f):
    models = {}
    # Linear SVC
    try:
        lsvc = LinearSVC()
        y_pred = lsvc.fit(X_train, y_train).predict(X_test)
        model_accr = metrics.accuracy_score(y_test, y_pred) * 100
        models["Linear Support Vector Classifier"] = model_accr
        f.writelines(
            "\n            Accuracy of Linear Support Vector Classifier is " +
            str(model_accr))
    except:
        logging.info("LSVC is throwing exception")
        f.writelines("\n            LSVC is throwing exception")

    # KNN
    try:
        knn = KNeighborsClassifier()
        y_pred = knn.fit(X_train, y_train).predict(X_test)
        model_accr = metrics.accuracy_score(y_test, y_pred) * 100
        models["KNN Classifier"] = model_accr
        f.writelines("\n            Accuracy of KNN Classifier is " +
                     str(model_accr))
    except:
        logging.info("KNN is throwing exception")
        f.writelines("\n            KNN is throwing exception")

    # DTC
    try:
        clf_gini = DecisionTreeClassifier(criterion="gini", random_state=0)
        y_pred = clf_gini.fit(X_train, y_train).predict(X_test)
        model_accr = metrics.accuracy_score(y_test, y_pred) * 100
        models["Decision Tree Classifier - GINI"] = model_accr
        f.writelines(
            "\n            Accuracy of Decision Tree Classifier - GINI is " +
            str(model_accr))
    except:
        logging.info("DTC GINI is throwing exception")
        f.writelines("\n            DTC GINI is throwing exception")

    try:
        clf_entropy = DecisionTreeClassifier(criterion="entropy",
                                             random_state=0)
        y_pred = clf_entropy.fit(X_train, y_train).predict(X_test)
        model_accr = metrics.accuracy_score(y_test, y_pred) * 100
        models["Decision Tree Classifier - ENTROPY"] = model_accr
        f.writelines(
            "\n            Accuracy of Decision Tree Classifier - ENTROPY is "
            + str(model_accr))
    except:
        logging.info("DTC ENTROPY is throwing exception")
        f.writelines("\n            DTC ENTROPY is throwing exception")

    # Multinomial NB
    try:
        mnb_model = MultinomialNB()
        y_pred = mnb_model.fit(X_train, y_train).predict(X_test)
        model_accr = metrics.accuracy_score(y_test, y_pred) * 100
        models["Multinomial Naive Bayes"] = model_accr
        f.writelines("\n            Accuracy of Multinomial NB is " +
                     str(model_accr))
    except:
        logging.info("Multinomial NB is throwing exception")
        f.writelines("\n            Multinomial NB is throwing exception")

    # Bernoulli NB
    try:
        bnb_model = BernoulliNB()
        y_pred = bnb_model.fit(X_train, y_train).predict(X_test)
        model_accr = metrics.accuracy_score(y_test, y_pred) * 100
        models["Bernoulli Naive Bayes"] = model_accr
        f.writelines("\n            Accuracy of Bernoulli NB is " +
                     str(model_accr))
    except:
        logging.info("Bernoulli NB is throwing exception")
        f.writelines("\n            Bernoulli NB is throwing exception")

    # Gaussian NB
    try:
        gnb_model = GaussianNB()
        y_pred = gnb_model.fit(X_train, y_train).predict(X_test)
        model_accr = metrics.accuracy_score(y_test, y_pred) * 100
        models["Gaussian Naive Bayes"] = model_accr
        f.writelines("\n            Accuracy of GaussianNB is " +
                     str(model_accr))
    except:
        logging.info("GaussianNB is throwing exception")
        f.writelines("\n            GaussianNB is throwing exception")

    # ADB
    try:
        adb = AdaBoostClassifier(n_estimators=200, learning_rate=1)
        # Train Adaboost Classifer
        y_pred = adb.fit(X_train, y_train).predict(X_test)
        model_accr = metrics.accuracy_score(y_test, y_pred) * 100
        models["AdaBoost Classifier"] = model_accr
        f.writelines("\n            Accuracy of AdaBoost Classifier is " +
                     str(model_accr))
    except:
        logging.info("AdaBoost Classifier is throwing exception")
        f.writelines("\n            AdaBoost Classifier is throwing exception")

    # Random Forest Classifier
    try:
        rfc = RandomForestClassifier(n_estimators=100)
        y_pred = rfc.fit(X_train, y_train).predict(X_test)
        model_accr = metrics.accuracy_score(y_test, y_pred) * 100
        models["Random Forest Classifier"] = model_accr
        f.writelines("\n            Accuracy of Random Forest Classifier is " +
                     str(model_accr))
    except:
        logging.info("Random Forest Classifier is throwing exception")
        f.writelines(
            "\n            Random Forest Classifier is throwing exception")

    return models
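# modelTraining() returns a {model name: accuracy} dict; a small usage sketch, assuming
# the caller already holds X_train/X_test/y_train/y_test and an open report file f:
models = modelTraining(X_train, X_test, y_train, y_test, f)
best_model = max(models, key=models.get)
print("Best model:", best_model, "with accuracy", models[best_model])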
Example #24
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

df = pd.read_csv('spam_ham_dataset.csv', encoding="latin-1")
#df.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis=1, inplace=True)
#df['label'] = df['v1'].map({'ham': 0, 'spam': 1})
X = df['text']
y = df['label_num']
cv = CountVectorizer()
X = cv.fit_transform(X)  # Fit the Data
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.33,
                                                    random_state=42)
#Naive Bayes Classifier
clf = MultinomialNB()
clf.fit(X_train, y_train)
clf.score(X_test, y_test)
y_pred = clf.predict(X_test)
#print(classification_report(y_test, y_pred))

#from sklearn.externals import joblib
#joblib.dump(clf, 'NB_spam_model.pkl')

#NB_spam_model = open('NB_spam_model.pkl','rb')
#clf = joblib.load(NB_spam_model)
app = Flask(__name__)


@app.route("/")
def home1():
Ejemplo n.º 25
0
def bayes_classifier(data_train, class_labels_train):
    print("Fitting the classifier...")
    classifier = MultinomialNB(alpha=0.01)
    classifier.fit(data_train, class_labels_train)
    print("Classifier fitted...")
    return classifier
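
# Usage sketch (added; not part of the original snippet). The variables
# train_texts, train_labels and test_texts are hypothetical placeholders:
#
#     from sklearn.feature_extraction.text import CountVectorizer
#     vectorizer = CountVectorizer()
#     data_train = vectorizer.fit_transform(train_texts)
#     clf = bayes_classifier(data_train, train_labels)
#     predictions = clf.predict(vectorizer.transform(test_texts))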
Ejemplo n.º 26
0
import joblib  # sklearn.externals.joblib is deprecated/removed in recent scikit-learn
import pickle
import logging
import numpy as np
from sklearn.datasets import load_files
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

test_case = load_files('reuter2/training')

count_vect = CountVectorizer(decode_error='ignore', strip_accents='unicode')
X_train_counts = count_vect.fit_transform(test_case.data)
X_train_counts.shape

tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

clf = MultinomialNB().fit(X_train_tfidf, test_case.target)
docs_new = [
    'I like bees',
    'Construction of a unique downtown highrise that would provide both living and working space to local artists is still at least a year away from starting, project organizers say.'
]
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

predicted = clf.predict(X_new_tfidf)

for doc, category in zip(docs_new, predicted):
    print('%r => %s' % (doc, test_case.target_names[category]))
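
# Optional follow-up (added for illustration): per-class probabilities for the
# same unseen documents, using the already-fitted model.
probabilities = clf.predict_proba(X_new_tfidf)
for doc, probs in zip(docs_new, probabilities):
    best = int(np.argmax(probs))
    print('%r => %s (p=%.3f)' % (doc, test_case.target_names[best], probs[best]))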

text_clf = Pipeline([
    ('vect', CountVectorizer(decode_error='ignore')),
    ('tfidf', TfidfTransformer()),
Ejemplo n.º 27
0
import nltk
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.naive_bayes import MultinomialNB, BernoulliNB

testing_set = featuresets[:100]

# posterior = prior occurrences * likelihood / evidence

classifier = nltk.NaiveBayesClassifier.train(training_set)

# load
#classifier_f = open('naivebayes.pickle', 'rb')
#classifier = pickle.load(classifier_f)
#classifier_f.close()

print('Original Naive Bayes Algo accuracy:',
      (nltk.classify.accuracy(classifier, testing_set)) * 100)
classifier.show_most_informative_features(15)

MNB_classifier = SklearnClassifier(MultinomialNB())
MNB_classifier.train(training_set)
print('MNB_classifier Naive Bayes Algo accuracy:',
      (nltk.classify.accuracy(MNB_classifier, testing_set)) * 100)

#GaussianNB, BernoulliNB
#GaussianNB_classifier = SklearnClassifier(GaussianNB())
#GaussianNB_classifier.train(training_set)
#print('GaussianNB_classifier Naive Bayes Algo accuracy:', (nltk.classify.accuracy(GaussianNB_classifier, testing_set))*100)

BernoulliNB_classifier = SklearnClassifier(BernoulliNB())
BernoulliNB_classifier.train(training_set)
print('BernoulliNB_classifier Naive Bayes Algo accuracy:',
      (nltk.classify.accuracy(BernoulliNB_classifier, testing_set)) * 100)

#LogisticRegression, SGDClassifier
Ejemplo n.º 28
0
print(model.score(xtest, ytest))
# print(f1_score(ytest,y_pred))
# print(precision_score(ytest,y_pred))
# print(recall_score(ytest,y_pred))


# In[142]:


from sklearn.naive_bayes import MultinomialNB 


# In[143]:


nv = MultinomialNB()
model_nv = nv.fit(xtrain, ytrain)
y_pred_nv = model_nv.predict(xtest)
print(accuracy_score(ytest, y_pred_nv))


# In[144]:


from sklearn.tree import DecisionTreeClassifier


# In[145]:


dt = DecisionTreeClassifier()
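
# Continuation sketch (added; not in the original cell): fit and score the
# decision tree in the same way as the Naive Bayes model above.
model_dt = dt.fit(xtrain, ytrain)
y_pred_dt = model_dt.predict(xtest)
print(accuracy_score(ytest, y_pred_dt))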
Ejemplo n.º 29
0
# Filter for JJ (adjectives)
train_txt_filtered = [filter_tag(i, 'JJ') for i in train_txt_tag]
test_txt_filtered = [filter_tag(i, 'JJ') for i in test_txt_tag]

# Lemmatization
wnl = WordNetLemmatizer()
train_txt_lemma = [
    lemmatize(words=words, lemmatizer=wnl.lemmatize)
    for words in train_txt_filtered
]
test_txt_lemma = [
    lemmatize(words=words, lemmatizer=wnl.lemmatize)
    for words in test_txt_filtered
]

# Counts and NB model with scikit-learn

train_txt_sk = [' '.join(words) for words in train_txt_lemma]
test_txt_sk = [' '.join(words) for words in test_txt_lemma]

text_pipeline = Pipeline([('vect', CountVectorizer()),
                          ('tfidf', TfidfTransformer()),
                          ('nb', MultinomialNB())])

text_pipeline.fit(X=train_txt_sk, y=train_y)
pred = text_pipeline.predict(X=test_txt_sk)

cfm = confusion_matrix(y_true=test_y, y_pred=pred)
print(cfm)
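
# Follow-up sketch (added for illustration): summary metrics from the same
# predictions.
from sklearn.metrics import accuracy_score, classification_report
print(accuracy_score(y_true=test_y, y_pred=pred))
print(classification_report(y_true=test_y, y_pred=pred))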
Ejemplo n.º 30
0
def main():
    '''
    Notes for running:
    - written for Python 2.7 -> change the print statements if using Python 3
    - required deps -> install scikit-learn (google it)
    - edit the file paths below
    '''
    input_type = 'permissions'

    # good_path = '/home/josh/Documents/COSC/Research/APK_project/apk_repo/test_sets/large/v2/mal_badging_full_v2.txt'
    # mal_path = '/home/josh/Documents/COSC/Research/APK_project/apk_repo/test_sets/large/v2/benign_badging_full_v2.txt'
    results_dir = '/home/josh/Documents/COSC/Research/APK_project/DeepLearningResearch/Results/shallowResults/imbalanced-'
    good_path = "/home/noureldin/Desktop/workspace/freelancer/Olumerew/project1/DeepLearningResearch/Data/badging_med/ben_badging_med.txt"
    mal_path = "/home/noureldin/Desktop/workspace/freelancer/Olumerew/project1/DeepLearningResearch/Data/badging_med/mal_badging_med.txt"

    with open(good_path) as f:
        gdprm = f.readlines()
    with open(mal_path) as f:
        mlprm = f.readlines()

    features = gdprm + mlprm

    labels = np.array([])
    for x in gdprm:
        labels = np.append(labels, 0)
    for x in mlprm:
        labels = np.append(labels, 1)

    token_pattern = None
    if input_type == 'hardware':
        #token_pattern = 'android\.hardware\.[^\']*'
        token_pattern = "(?:\w|\.)+(?:hardware).(?:\w|\.)+"
    elif input_type == 'permissions':
        #token_pattern = 'android\.permission\.[^\']*'
        #token_pattern = "(?<=name=\')[^(?:p)]*(?:permission)[^\']*"
        token_pattern = "(?:\w|\.)+(?:permission).(?:\w|\.)+"
    else:
        #token_pattern = 'android\.(?:hardware|permission)\.[^\']*'
        #token_pattern = "(?<=name=\')[^(?:p|h)]*(?:permission|hardware)[^\']*"
        token_pattern = "(?:\w|\.)+(?:permission|hardware).(?:\w|\.)+"
    print token_pattern
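
    # Illustration (added; not in the original script): what the permissions
    # pattern extracts from a typical badging line. The sample string is made up,
    # and `re` is assumed to be imported (it is used later in this script).
    sample_line = "uses-permission: name='android.permission.INTERNET'"
    print re.findall(token_pattern, sample_line)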

    #count_vect = CountVectorizer(input=u'content', analyzer=u'word', token_pattern=token_pattern)
    count_vect = CountVectorizer(
        analyzer=partial(regexp_tokenize, pattern=token_pattern))

    time0 = timeit.default_timer()
    data_features = count_vect.fit_transform(features)
    time1 = timeit.default_timer()  #time to tokenize
    print type(features)
    print data_features.get_shape()
    #for x in count_vect.get_feature_names():
    #    print x
    print 'tokenize time: ' + str(time1 - time0)
    print '\n'

    words = list(
        map(lambda feature: re.split(token_pattern, feature), features))
    info = {'words': words, "labels": list(labels)}
    print('info=', json.dumps(info))

    #proportion of data to test on vs total
    ratios = [.8, .6, .4, .2]
    columns = [
        'avg_acc', 'fpos_rate', 'fneg_rate', 'precision', 'recall', 'f1_score',
        'avg_test_time', 'avg_train_time'
    ]
    indices = [.2, .4, .6, .8]
    print "BernoulliNB"
    bNBdf = pandas.DataFrame(columns=columns)
    print bNBdf
    for x in ratios:
        model_name = "BernoulliNB"
        BNclf = BernoulliNB()
        bNBdf = test_model(bNBdf, BNclf, data_features, labels, x)
        results_to_csv(bNBdf, model_name, results_dir, input_type)
        print '\n'
    print '---------------------------\n'
    print "MultiNomialNB"
    mnNBdf = pandas.DataFrame(columns=columns)  #, index=indices)
    for x in ratios:
        model_name = "MultinomialNB"
        NBclf = MultinomialNB()
        mnNBdf = test_model(mnNBdf, NBclf, data_features, labels, x)
        results_to_csv(mnNBdf, model_name, results_dir, input_type)
        print '\n'
    print '---------------------------\n'
    print "DecisionTree"
    dtdf = pandas.DataFrame(columns=columns)  #, index=indices)
    for x in ratios:
        model_name = "DecisionTree"
        DTclf = DecisionTreeClassifier()  #min_samples_split = 20)
        dtdf = test_model(dtdf, DTclf, data_features, labels, x)
        results_to_csv(dtdf, model_name, results_dir, input_type)
        print '\n'
    print '---------------------------\n'
    print "LogisticRegression"
    lgdf = pandas.DataFrame(columns=columns)  #, index=indices)
    for x in ratios:
        model_name = "Logistic_Regression"
        LRclf = LogisticRegression(C=10, solver='lbfgs')
        lgdf = test_model(lgdf, LRclf, data_features, labels, x)
        results_to_csv(lgdf, model_name, results_dir, input_type)
        print '\n'
    '''# alternative to shuffle_split