Example no. 1
                    items = []
                    if '),' in item:
                        items = item.split('),')
                    else:
                        items = item.split(',', 1)

                    for i in items:
                        i = i.strip()
                        i = i.strip('\'')
                        i = i.strip('\"')
                        nx += [i]

                for i in range(len(nx)):
                    if i % 2 == 0:
                        if nx[i] == 'base_estimator':
                            p['base_estimator'] = tree.DecisionTreeClassifier(
                                splitter='random', max_depth=1)
                        elif nx[i] == 'learning_rate':
                            p[nx[i].lstrip().rstrip()] = float(nx[i + 1])
                        elif nx[i] == 'n_estimators':
                            p[nx[i].lstrip().rstrip()] = int(nx[i + 1])
                        else:
                            p[nx[i].lstrip().rstrip()] = nx[i + 1] if type(
                                nx[i + 1]) != str else nx[i +
                                                          1].lstrip().rstrip()

            (accuracy_train, accuracy_test) = runDecisionTreeClassifier(
                x_train, y_train, x_test, y_test, p)
            print("\t".join([
                'decision tree', data,
                str(accuracy_train),
                str(accuracy_test), fs
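runDecisionTreeClassifier is called above but not included in this excerpt; a minimal sketch of what such a helper might look like (the signature is inferred from the call, everything else is an assumption):

from sklearn import tree

def runDecisionTreeClassifier(x_train, y_train, x_test, y_test, p):
    # Hypothetical helper (not shown in the excerpt). The parser above also collects
    # AdaBoost-style keys (base_estimator, learning_rate, n_estimators), so only the
    # keys DecisionTreeClassifier actually accepts are forwarded here.
    valid = tree.DecisionTreeClassifier().get_params()
    params = {k: v for k, v in p.items() if k in valid}
    clf = tree.DecisionTreeClassifier(**params)
    clf.fit(x_train, y_train)
    return clf.score(x_train, y_train), clf.score(x_test, y_test)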
Example no. 2
def main():
    if len(sys.argv) != 4 or not sys.argv[2] in [
            'freq', 'chi2'
    ] or not sys.argv[3] in [
            'MultinomialNB', 'GaussianNB', 'SVM', 'DecisionTree', 'KNN'
    ]:
        print('usage:\n\
python categorisation.py <N> <selection> <classifier>\n\n\
with:\n\
N = number of relevant terms by text\n\
selection = \'freq\' or \'chi2\' (feature selection method)\n\
classifier = \'MultinomialNB\' or \'GaussianNB\' or \'SVM\' or \'DecisionTree\' or \'KNN\''
              )
        exit()

    train_set = []
    test_set = []

    N = sys.argv[1]
    method = sys.argv[2]
    s = shelve.open('featuresDictonaries')
    if ('train_' + str(N)) in s:
        train_set = s['train_' + str(N)]
    if ('test_' + str(N)) in s:
        test_set = s['test_' + str(N)]
    s.close()

    if not (train_set and test_set):
        print('building freq features dictionaries...')
        for cat in reuters.categories():
            dfs = defaultdict(lambda: 0)
            tfs = defaultdict(lambda: defaultdict(lambda: 0))
            tfsidfs = defaultdict(lambda: 0)
            for file_id in reuters.fileids(cat):
                fileWords = []
                for w in set(reuters.words(file_id)) - set(
                        stopwords.words('english')):
                    if w not in fileWords:
                        dfs[w] += 1
                        fileWords.append(w)
                    tfs[file_id][w] += 1
            for file_id in tfs:
                for w in tfs[file_id]:
                    tfsidfs[w] = float(tfs[file_id][w]) / dfs[w]
                tfidfSorted = dict(
                    sorted(tfsidfs.items(),
                           key=operator.itemgetter(1),
                           reverse=True)[:int(N)])
                if file_id.startswith('train'):
                    train_set.append((tfidfSorted, cat))
                else:
                    test_set.append((tfidfSorted, cat))
        print('done')

        print('saving to featuresDictonaries...')
        s = shelve.open('featuresDictonaries')
        s['train_' + str(N)] = train_set
        s['test_' + str(N)] = test_set
        s.close()
        print('done')

    print('classifying...')

    pipeline = Pipeline([('chi2', SelectKBest(chi2, k=290)),
                         ('svm', svm.LinearSVC())])
    classifier = SklearnClassifier(pipeline)

    # chi2
    if sys.argv[2] == 'chi2':
        if sys.argv[3] == 'KNN':
            pipeline = Pipeline([('chi2', SelectKBest(chi2, k=290)),
                                 ('svm', KNeighborsClassifier(n_neighbors=5))])
            classifier = SklearnClassifier(pipeline)
        elif sys.argv[3] == 'MultinomialNB':
            pipeline = Pipeline([('chi2', SelectKBest(chi2, k=290)),
                                 ('svm', MultinomialNB())])
            classifier = SklearnClassifier(pipeline)
        elif sys.argv[3] == 'GaussianNB':
            pipeline = Pipeline([('chi2', SelectKBest(chi2, k=290)),
                                 ('svm', GaussianNB())])
            classifier = SklearnClassifier(pipeline, sparse=False)
        elif sys.argv[3] == 'DecisionTree':
            pipeline = Pipeline([('chi2', SelectKBest(chi2, k=290)),
                                 ('svm', tree.DecisionTreeClassifier())])
            classifier = SklearnClassifier(pipeline, sparse=False)
    else:
        if sys.argv[3] == 'KNN':
            classifier = SklearnClassifier(KNeighborsClassifier(n_neighbors=5))
        elif sys.argv[3] == 'MultinomialNB':
            classifier = SklearnClassifier(MultinomialNB())
        elif sys.argv[3] == 'GaussianNB':
            classifier = SklearnClassifier(GaussianNB(), sparse=False)
        elif sys.argv[3] == 'DecisionTree':
            classifier = SklearnClassifier(tree.DecisionTreeClassifier(),
                                           sparse=False)
        elif sys.argv[3] == 'SVM':
            classifier = SklearnClassifier(svm.LinearSVC())

    classifier.train(train_set)

    test_skl = []
    t_test_skl = []
    for d in test_set:
        test_skl.append(d[0])
        t_test_skl.append(d[1])

    p = classifier.classify_many(test_skl)  # batch_classify was renamed classify_many in NLTK 3

    print(classification_report(t_test_skl,
                                p,
                                labels=list(set(t_test_skl)),
                                target_names=reuters.categories()))
Example no. 3
from sklearn.datasets import load_iris
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import tree
sns.set(style="ticks")

df = sns.load_dataset("iris")
df02 = df.iloc[:,[0,2,4]] # select a pair of features plus the species column
print(df02)
sns.pairplot(df02, hue="species")
plt.show()
# ===================================
print('=======================================')
clf = tree.DecisionTreeClassifier()
train_index = [i for i in range(150) if i<30 or 50<=i<80 or 100<=i<130]
test_index = [i for i in range(150) if 30<=i<50 or 80<=i<100 or 130<=i<150]
train_data, train_target = df02.iloc[train_index,[0,1]],df02.iloc[train_index,2]
test_data, test_target = df02.iloc[test_index,[0,1]],df02.iloc[test_index,2]
clf = clf.fit(train_data, train_target)
print(clf)
# results
test_val = clf.predict(test_data)
print(test_val)
right = [i for i, j in zip(test_val,test_target) if i==j]
percent = len(right) / len(test_target)
print(percent)  # 0.95
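The accuracy above is computed by hand; scikit-learn's accuracy_score gives the same ratio in one call:

from sklearn.metrics import accuracy_score
print(accuracy_score(test_target, test_val))  # same value as the manual count above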
Example no. 4
 def decisionTree(self):
     print "***** Testing Decision Tree *****"
     clf = tree.DecisionTreeClassifier()
     scores = cross_val_score(clf, self.X_selected, self.y, cv=5)
     print scores, scores.mean()
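The method above assumes self.X_selected and self.y are already prepared and that cross_val_score is imported; a self-contained sketch of the same 5-fold evaluation, using the iris data as an illustrative stand-in:

from sklearn import tree
from sklearn.datasets import load_iris
from sklearn.model_selection import cross_val_score

X, y = load_iris(return_X_y=True)
clf = tree.DecisionTreeClassifier()
scores = cross_val_score(clf, X, y, cv=5)
print(scores, scores.mean())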
Example no. 5
def crossval(Matching, Mapping_exercises, Big_tbl, Worktlb):

    # Build a dataframe for the results
    results_cv = pd.DataFrame(columns=["Exercise","number", "Type_of_algorithm", "mean_Bcr_train", "mean_Bcr_test", 'Used_fold'])
    used_fold = 5
    '''Main loop'''
    # Pick one column corresponding to an exercise at a time and make it the label
    for exercise_number in Matching:
        # Extract the number of the exercise (example: 1001)
        name_of_exercise = exercise_number.replace("_frequency", "")
        # Extract the full name of the exercise (example: Exercise 1 (K): Circulation )
        name_of_exercise = Mapping_exercises['name'][
            Mapping_exercises.index[Mapping_exercises['number'] == int(name_of_exercise)].tolist()].values[0]
        # Create the label for the machine learning algorithm
        label = Big_tbl[exercise_number].notnull().astype(int).to_frame()

        mean_Bcr_train = 0
        mean_Bcr_test = 0
        kf = KFold(n_splits=used_fold, shuffle=True, random_state=42)  # Define the split - into "n_splits" folds
        iter = used_fold
        for train_index, test_index in kf.split(Worktlb):

            # Split the data and the label into test and train set
            train, test = Worktlb.iloc[train_index], Worktlb.iloc[test_index]
            label_train, label_test = label.iloc[train_index], label.iloc[test_index]

            # If trouble in train set

            if sum(label_train.values) == 0:
                print('Issue in k-fold of ' + str(name_of_exercise))
                # Do nothing
                mean_Bcr_train = mean_Bcr_train
                iter -= 1
            else:
                # Train prediction
                clf = tree.DecisionTreeClassifier(max_depth=5,class_weight ='balanced')
                clf = clf.fit(train, label_train)
                # Get the most important feature

                # Predict the label for train set
                train_pred = clf.predict(train)

                # confusion_matrix(y_true, y_pred)

                bcr_train = balanced_accuracy_score(label_train, train_pred)
                mean_Bcr_train = mean_Bcr_train + bcr_train

                # Test prediction with the model build on the train set
                test_pred = clf.predict(test)
                # confusion_matrix(y_true, y_pred)

                bcr_test = balanced_accuracy_score(label_test, test_pred)
                mean_Bcr_test = mean_Bcr_test + bcr_test
        # Add everything to the results table
        if iter != 0:
            mean_Bcr_train = mean_Bcr_train / used_fold
            mean_Bcr_test = mean_Bcr_test / used_fold
        results_cv = results_cv.append(
            {"Exercise": name_of_exercise,"number":exercise_number.replace("_frequency", ""), "Type_of_algorithm": "Tree", "mean_Bcr_train": mean_Bcr_train,
             "mean_Bcr_test": mean_Bcr_test, "Used_fold": iter}, ignore_index=True)

    return results_cv
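results_cv.append was removed in pandas 2.0; a minimal sketch of the usual replacement, collecting one dict per exercise and building the frame once at the end (the values below are dummies for illustration):

import pandas as pd

rows = []
rows.append({"Exercise": "Exercise 1", "number": "1001", "Type_of_algorithm": "Tree",
             "mean_Bcr_train": 0.9, "mean_Bcr_test": 0.8, "Used_fold": 5})  # dummy values
results_cv = pd.DataFrame(rows, columns=["Exercise", "number", "Type_of_algorithm",
                                         "mean_Bcr_train", "mean_Bcr_test", "Used_fold"])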
Example no. 6
y = targets = labels = train_df['Results of Last election'].values
y = np.array(y)
columns = ["Average of last 3 elections", "Average of last 5 elections",
           "Average of polls 1 mo before election (>0 = Repub)", "% of registered republicans",
           "% of registered democrats", "State unemployment rate", "Party of governers"]

features = train_df[list(columns)].values
features = np.array(features)
print("Y data \n" + str(y))
print("------------------------")
print("X data \n" + str(features))
print("-------------------------")
X = features
clf = tree.DecisionTreeClassifier(criterion="entropy", max_features=3)
clf = clf.fit(X, y)
print("X shape: " + str(X.shape))
print("Y shape: " + str(y.shape))
print("--------------------------")
f = tree.export_graphviz(clf, out_file="decisiontree.dot", feature_names=columns)
test_df = pd.read_csv('test/florida_test.csv')
features2 = test_df[list(columns)].values
features2 = np.array(features2)
# print(features2.shape)
importance = clf.feature_importances_
for i, o in zip(columns, importance):
    print(i, o)  # feature name and its importance (loop body not shown in the excerpt)
Example no. 7
Balanced_class_dataset.drop('Session_ID', axis=1, inplace=True)

#Split of the dataset into training set  and testing set
X_train, X_test, Y_train, Y_test = train_test_split(
    Balanced_class_dataset.iloc[:, :-1],
    Balanced_class_dataset['Buy_Outcome'],
    test_size=0.3,
    random_state=1)

#Define Decision Tree
Depths_Leaves = [(10, 5), (3, 7), (30, 1000), (10, 20)]
fposDT, trposDT, threshDT = [], [], []
for item in Depths_Leaves:
    print(item)
    clfDT = tree.DecisionTreeClassifier(max_depth=item[0],
                                        max_leaf_nodes=item[1],
                                        random_state=1)
    #Training the classifiers
    clfDT.fit(X_train, Y_train)
    #Test the trained model on the test set
    y_test_pred_DT = clfDT.predict(X_test)
    #Confusion matrix of our model towards the test data
    confMatrix_Test_DT = confusion_matrix(Y_test, y_test_pred_DT, labels=None)

    print(
        f'Decision Tree Depth: {clfDT.get_depth()}, Leaves: {clfDT.get_n_leaves()}'
    )
    print('Confusion Matrix')
    print(confMatrix_Test_DT, '\n')

    pr_y_test_pred_DT = clfDT.predict_proba(X_test)
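    # Hedged continuation (assumption): the fposDT/trposDT/threshDT lists above are
    # presumably meant to collect one ROC curve per (depth, leaves) setting.
    # Assumes `from sklearn.metrics import roc_curve` alongside confusion_matrix.
    fpr, tpr, thresh = roc_curve(Y_test, pr_y_test_pred_DT[:, 1])
    fposDT.append(fpr)
    trposDT.append(tpr)
    threshDT.append(thresh)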
Example no. 8
import pandas as pd
import numpy as np
from subprocess import call
from sklearn import tree
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

data = pd.read_csv("../Dataset/onehot.csv", delimiter=",")

xTrain, xTest, yTrain, yTest = train_test_split(data.iloc[:, :-1],
                                                data.iloc[:, -1:],
                                                test_size=0.33,
                                                random_state=0)
treeClassifier = tree.DecisionTreeClassifier(max_depth=12, max_features="sqrt")  # "auto" (an alias of "sqrt") was removed in scikit-learn 1.3
treeClassifier.fit(xTrain, yTrain)
yPredict = treeClassifier.predict(xTest)
yTrainPredict = treeClassifier.predict(xTrain)
print(accuracy_score(yTest, yPredict) * 100)
print(accuracy_score(yTrain, yTrainPredict) * 100)
# file = "../Visualization/binary.dot"
# tree.export_graphviz(treeClassifier, out_file=file, feature_names = data.columns[:-1], class_names = True, filled=True, rounded=True, special_characters=True)
# call(['dot', '-Tpng', file, '-o', 'binary.png', '-Gdpi=600'])
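If Graphviz is not available, tree.plot_tree (scikit-learn >= 0.21) draws the same tree with matplotlib only; a hedged alternative to the commented-out export above:

import matplotlib.pyplot as plt

plt.figure(figsize=(20, 10))
tree.plot_tree(treeClassifier, feature_names=list(data.columns[:-1]),
               filled=True, rounded=True)
plt.show()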
Example no. 9
#EDA
titanic_train.shape
titanic_train.info()

titanic_train1 = pd.get_dummies(titanic_train,
                                columns=['Pclass', 'Sex', 'Embarked'])
titanic_train1.shape
titanic_train1.info()
titanic_train1.head(6)

X_train = titanic_train1.drop(
    ['PassengerId', 'Age', 'Cabin', 'Ticket', 'Name', 'Survived'], axis=1)
y_train = titanic_train['Survived']

#Note that we take entire data into consideration in boosting.
dt_estimator = tree.DecisionTreeClassifier()
#ensemble.AdaBoostClassifier, passing base_estimator=dt_estimator and n_estimators (number of trees to grow) = 5
ada_tree_estimator1 = ensemble.AdaBoostClassifier(dt_estimator, n_estimators=5)
scores = model_selection.cross_val_score(ada_tree_estimator1,
                                         X_train,
                                         y_train,
                                         cv=10)
print(scores.mean())
ada_tree_estimator1.fit(X_train, y_train)

ada_tree_estimator1.estimators_

#extracting all the trees built by the AdaBoost algorithm
#This is only for display and understanding purposes and is not required in practice
n_tree = 0
#Since we gave n_estimators(no of. trees to grow) = 5, it builds 5 trees
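# Hedged sketch of the display loop announced above: export each of the 5 boosted
# trees to its own .dot file (file names are illustrative).
for n_tree, boosted_tree in enumerate(ada_tree_estimator1.estimators_):
    dot_data = tree.export_graphviz(boosted_tree, out_file=None,
                                    feature_names=list(X_train.columns))
    with open("ada_tree_" + str(n_tree) + ".dot", "w") as f:
        f.write(dot_data)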
y_train = y[ind[:split_ind]]
y_test = y[ind[split_ind:]]
############################### KNN ###########################################
#print('\nKNN')
model_KNN = KNeighborsClassifier(weights='distance')
start = time.time()
model_KNN.fit(x_train, y_train)
predTrain = model_KNN.predict(x_train)
pred = model_KNN.predict(x_test)
elapsed_time = time.time() - start
print('{0:.6f} '.format(elapsed_time))
print((np.sum(predTrain == y_train) / len(y_train)) * 100)
print((np.sum(pred == y_test) / len(y_test)) * 100)
############################### Decision Tree #################################
print('\nDecision Tree')
model_DT = tree.DecisionTreeClassifier(criterion='entropy')
#criterion='entropy',max_depth = 34,min_samples_split=2, splitter= 'best'
start = time.time()
model_DT.fit(x_train, y_train)
predTrain = model_DT.predict(x_train)
pred = model_DT.predict(x_test)
elapsed_time = time.time() - start
print('{0:.6f} '.format(elapsed_time))
print((np.sum(predTrain == y_train) / len(y_train)) * 100)
print((np.sum(pred == y_test) / len(y_test)) * 100, "\n")
############################### Random Forests ################################
#print('\nRandom Forests')
model_RF = RandomForestClassifier(n_estimators=10, max_features='log2')
#n_estimators=65 , criterion='entropy', max_features='auto' , max_depth = none,min_samples_split=2
start = time.time()
model_RF.fit(x_train, y_train)
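# Hedged continuation mirroring the KNN and Decision Tree blocks above.
predTrain = model_RF.predict(x_train)
pred = model_RF.predict(x_test)
elapsed_time = time.time() - start
print('{0:.6f} '.format(elapsed_time))
print((np.sum(predTrain == y_train) / len(y_train)) * 100)
print((np.sum(pred == y_test) / len(y_test)) * 100)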
Example no. 11
def trainVectorizer():
    train_set = sklearn.datasets.load_files(
        container_path=
        r"C:\Users\Lucas\Documents\EC\10º Período\RI\RI-part1\Treino",
        random_state=42)
    data_train = train_set.data

    count_vect = CountVectorizer()
    X_train_counts = count_vect.fit_transform(data_train)

    tfidf_transformer = TfidfTransformer()
    X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

    test_set = sklearn.datasets.load_files(
        container_path=
        r"C:\Users\Lucas\Documents\EC\10º Período\RI\RI-part1\Teste\html",
        random_state=42)
    data_test = test_set.data

    X_test_counts = count_vect.transform(data_test)
    X_test_tfidf = tfidf_transformer.transform(X_test_counts)

    #Train vectorizer
    start_time = time.time()
    clf_MLP_tf = MLPClassifier(hidden_layer_sizes=(10, 5),
                               solver='lbfgs').fit(X_train_tfidf,
                                                   train_set.target)
    end_time = time.time()
    train_time_mlp_tf = end_time - start_time

    start_time = time.time()
    clf_multinomial_tf = MultinomialNB().fit(X_train_tfidf, train_set.target)
    end_time = time.time()
    train_time_multinomial_tf = end_time - start_time

    start_time = time.time()
    clf_gaussian_tf = GaussianNB().fit(X_train_tfidf.toarray(),
                                       train_set.target)
    end_time = time.time()
    train_time_gaussian_tf = end_time - start_time

    start_time = time.time()
    clf_rf_tf = RandomForestClassifier(n_estimators=100).fit(
        X_train_tfidf, train_set.target)
    end_time = time.time()
    train_time_rf_tf = end_time - start_time

    start_time = time.time()
    clf_lr_tf = linear_model.LogisticRegression().fit(X_train_tfidf,
                                                      train_set.target)
    end_time = time.time()
    train_time_lr_tf = end_time - start_time

    start_time = time.time()
    clf_dt_tf = tree.DecisionTreeClassifier().fit(X_train_tfidf,
                                                  train_set.target)
    end_time = time.time()
    train_time_dt_tf = end_time - start_time

    start_time = time.time()
    clf_svm_tf = LinearSVC().fit(X_train_tfidf, train_set.target)
    end_time = time.time()
    train_time_svm_tf = end_time - start_time

    #Predict
    predicted_MLP_tf = clf_MLP_tf.predict(X_test_tfidf)
    predicted_multinomial_tf = clf_multinomial_tf.predict(X_test_tfidf)
    predicted_gaussian_tf = clf_gaussian_tf.predict(X_test_tfidf.toarray())
    predicted_rf_tf = clf_rf_tf.predict(X_test_tfidf)
    predicted_lr_tf = clf_lr_tf.predict(X_test_tfidf)
    predicted_dt_tf = clf_dt_tf.predict(X_test_tfidf)
    predicted_svm_tf = clf_svm_tf.predict(X_test_tfidf)

    #Saving the results
    saveVectorizer(predicted_MLP_tf, test_set, "MLP", "TF-IDF",
                   X_train_tfidf.shape, train_time_mlp_tf)
    saveVectorizer(predicted_multinomial_tf, test_set, "MultinomialNB",
                   "TF-IDF", X_train_tfidf.shape, train_time_multinomial_tf)
    saveVectorizer(predicted_gaussian_tf, test_set, "GaussianNB", "TF-IDF",
                   X_train_tfidf.shape, train_time_gaussian_tf)
    saveVectorizer(predicted_rf_tf, test_set, "RandomForest", "TF-IDF",
                   X_train_tfidf.shape, train_time_rf_tf)
    saveVectorizer(predicted_lr_tf, test_set, "LogisticRegression", "TF-IDF",
                   X_train_tfidf.shape, train_time_lr_tf)
    saveVectorizer(predicted_dt_tf, test_set, "DecisionTree", "TF-IDF",
                   X_train_tfidf.shape, train_time_dt_tf)
    saveVectorizer(predicted_svm_tf, test_set, "SVM", "TF-IDF",
                   X_train_tfidf.shape, train_time_svm_tf)
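The seven train/predict/save stanzas above differ only in the estimator; a small standalone sketch (tiny made-up corpus, illustrative names) shows the loop pattern that could replace them:

import time
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn import tree

docs = ["spam spam ham", "ham eggs", "spam eggs spam"]  # illustrative corpus
target = [1, 0, 1]
X = TfidfTransformer().fit_transform(CountVectorizer().fit_transform(docs))

for name, est in {"MultinomialNB": MultinomialNB(),
                  "DecisionTree": tree.DecisionTreeClassifier(),
                  "SVM": LinearSVC()}.items():
    start_time = time.time()
    est.fit(X, target)
    print(name, "train time:", time.time() - start_time)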
Example no. 12
#Build a decision tree classification model
from sklearn import tree
import pandas as pd
import time
from json import *

print("Scripts starts...")
start = time.time()

inputfile = 'data.xls'  #input data
outputfile = 'tree.xls'  #model output file
picture = 'tree.pdf'
data = pd.read_excel(inputfile)  #read the data
y = data.iloc[:, 62].to_numpy()  #label column
x = data.iloc[:, 0:46].to_numpy()  #feature columns (as_matrix was removed in recent pandas)
clf = tree.DecisionTreeClassifier(splitter='random')
clf.fit(x, y)
clf.predict(x)

end1 = time.time()
print("modeltime: %f s" % (end1 - start))

count = 0  #count the correctly predicted samples
for left, right in zip(clf.predict(x), y):
    if left == right:
        count += 1
print("Prediction accuracy: %f" % (float(count) / len(y)))

r = pd.DataFrame(clf.predict(x), columns=['prediction'])
pd.concat([data.iloc[:, :63], r], axis=1).to_excel(outputfile)
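picture = 'tree.pdf' is defined above but never used; a hedged sketch of the likely intent, rendering the fitted tree with the graphviz package (extra dependency assumed):

import graphviz

dot_data = tree.export_graphviz(clf, out_file=None, filled=True, rounded=True)
graphviz.Source(dot_data).render(picture.replace('.pdf', ''), format='pdf', cleanup=True)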
Example no. 13
from sklearn import datasets
iris = datasets.load_iris()

X = iris.data
Y = iris.target

from sklearn.model_selection import train_test_split  # cross_validation was removed in scikit-learn 0.20
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=.5)

from sklearn import tree
my_classifier = tree.DecisionTreeClassifier()

my_classifier.fit(X_train, Y_train)
predictions = my_classifier.predict(X_test)
from sklearn.metrics import accuracy_score
print("Decision Tree:-> ", accuracy_score(Y_test, predictions))

from sklearn.neighbors import KNeighborsClassifier
my_classifier = KNeighborsClassifier()

my_classifier.fit(X_train, Y_train)
predictions = my_classifier.predict(X_test)
from sklearn.metrics import accuracy_score
print("Kneighbors classifier:-> ", accuracy_score(Y_test, predictions))

from sklearn.ensemble import RandomForestClassifier
my_classifier = RandomForestClassifier(max_depth=5,
                                       n_estimators=10,
                                       max_features=1)

my_classifier.fit(X_train, Y_train)
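# Hedged continuation mirroring the two classifiers above: score the random forest
# on the same held-out split.
predictions = my_classifier.predict(X_test)
print("Random Forest:-> ", accuracy_score(Y_test, predictions))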
Example no. 14
 def __init__(self):
     self.trainer = "skLearn decisionTree"
     self.clf = tree.DecisionTreeClassifier()
     print("Using %s Classifier" % (self.trainer))
from sklearn.model_selection import GridSearchCV
from imblearn.combine import SMOTETomek
from imblearn.under_sampling import RandomUnderSampler, TomekLinks, NearMiss, ClusterCentroids
from imblearn.over_sampling import ADASYN
from imbalance.classifyCrossValidation import ClassifyCV
from imbalance.crossValidationStratified import CrossValidationStratified
import pandas as pd
import time, datetime

if __name__ == '__main__':

    trainSets = ['p2p_lendingclub_70_1percent.csv']
    testSets = ['p2p_lendingclub_30.csv']
    classe = 'loan_status'
    # define the list of classifiers
    clfs = [GaussianNB(), tree.DecisionTreeClassifier(), linear_model.LogisticRegression()]
    names = ["Naive Bayes", "Decision Tree", "Logistic Regression"]
    final_names = list()
    for set in testSets:
        for name in names:
            final_names.append(str(name+'_'+set[:-4]))

    # define the list of sampling techniques
    sTechniques = [RandomUnderSampler(random_state=1), SMOTE(random_state=1)]
    technique_names = ["RU", "SM"]


    def getParamsReSampling(reSamplingTechnique):
        if type(reSamplingTechnique) is SMOTETomek:
            return dict(smt__ratio=[0.8, 0.9, 1.0], smt__k=[1, 3, 5, 7], smt__m=[1, 3, 5, 7])
    def fit(self, X, y):
        """
        Function to train and construct the AdaBoostClassifier
        Inputs:
        X: pd.DataFrame with rows as samples and columns as features (shape of X is N X P) where N is the number of samples and P is the number of columns.
        y: pd.Series with rows corresponding to output variable (shape of Y is N)
        """
        self.out_classes = list(set(list(y)))
        self.data = X
        self.labels = y

        for estimator in range(self.n_estimators):
            print("--------------------------------------", estimator,
                  "----------------------------------------------")

            self.all_Xs.append(X)
            self.all_ys.append(y)

            total_samples = len(X)
            # print("total_samples:", total_samples)

            sample_weights = [1 / total_samples] * total_samples

            Dtree = tree.DecisionTreeClassifier(criterion='entropy',
                                                max_depth=1)
            # fit the estimator
            Dtree.fit(X, y, sample_weight=sample_weights)

            self.estimators_list.append(Dtree)

            y_hat = Dtree.predict(X)

            # accumulate the total weight of misclassified samples for the current estimator
            wrong_pred = 0
            index_wrong_pred = []
            for i in range(len(y)):
                if (y_hat[i] != y[i]):
                    wrong_pred += sample_weights[i]
                    index_wrong_pred.append(i)

            # add some delta value to prevent zero division err
            err = 0.00000001
            wrong_pred += err

            # calculate amount of say
            amount_of_say = 0.5 * (math.log2(((1 - wrong_pred) / wrong_pred)))
            self.all_amount_of_says.append(amount_of_say)

            # recompute the sample weights
            for i in range(len(y)):
                if (y_hat[i] != y[i]):
                    sample_weights[i] = sample_weights[i] * math.exp(
                        amount_of_say)
                else:
                    sample_weights[i] = sample_weights[i] * math.exp(
                        -amount_of_say)

            normalize_val = sum(sample_weights)
            # normalize sample weights
            sample_weights = [w / normalize_val for w in sample_weights]
            # create new data based on new sample weights
            X, y = self.new_data(X, y, sample_weights)
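    def predict(self, X):
        """Hedged sketch (not in the original excerpt): per-sample weighted vote of
        the stored stumps, using each round's amount_of_say computed in fit above."""
        votes = [{c: 0.0 for c in self.out_classes} for _ in range(len(X))]
        for stump, say in zip(self.estimators_list, self.all_amount_of_says):
            for i, pred in enumerate(stump.predict(X)):
                votes[i][pred] += say
        # pick, per sample, the class with the largest accumulated say
        return [max(v, key=v.get) for v in votes]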
sys.path.append("../tools/")
from email_preprocess import preprocess


### features_train and features_test are the features for the training
### and testing datasets, respectively
### labels_train and labels_test are the corresponding item labels
features_train, features_test, labels_train, labels_test = preprocess()




#########################################################
### your code goes here ###

from sklearn import tree
clf = tree.DecisionTreeClassifier(min_samples_split=40)
t0 = time()
clf=clf.fit(features_train,labels_train)
print "training time:", round(time()-t0, 3), "s"
t1 = time()
pred=clf.predict(features_test)
print "training time:", round(time()-t1, 3), "s"
from sklearn.metrics import accuracy_score
acc = accuracy_score(labels_test, pred)

print(acc)
#########################################################


Example no. 18
    #print(featureList)

#Vectorize the features
vec = DictVectorizer()
dummyX = vec.fit_transform(featureList).toarray()
#print(vec.feature_names_)
#print(dummyX)

#Vectorize the class labels
lb = preprocessing.LabelBinarizer()
dummyY = lb.fit_transform(labelList)
#print(dummyY)

#Classify with a decision tree
clf = tree.DecisionTreeClassifier(
    criterion="entropy"
)  #classifier: criterion defaults to the CART/Gini measure; "entropy" selects ID3-style information gain instead
clf = clf.fit(dummyX, dummyY)  #fit the model
#print(clf)

#Generate a .dot file to visualize the decision tree
with open(r"C:\Users\Administrator\Desktop\1.dot", "w") as f:
    f = tree.export_graphviz(clf,
                             feature_names=vec.get_feature_names(),
                             out_file=f)

#DOS command: open a command prompt and convert the .dot file to PDF for a clearer view of the tree
#dot -Tpdf C:\Users\Administrator\Desktop\1.dot -o output.pdf

#Use the tree for prediction: take the first row of the original data, then tweak it to build a new sample
oneRowX = dummyX[0, :]
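# Hedged continuation of the comment above: copy the first encoded row, flip one
# dummy feature as an illustrative change, and ask the tree for a prediction.
newRowX = oneRowX.copy()
newRowX[0] = 1 - newRowX[0]  # illustrative tweak of the first dummy feature
print(clf.predict([newRowX]))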
Example no. 19
                            min_samples_split=5,
                            random_state=1)
RF.fit(x_train, y_train)
predictions_RF = RF.predict(x_test)
probablity_RF = RF.predict_proba(x_test)
fpr_RF, tpr_RF, threshold_RF = roc_curve(y_test, probablity_RF[:, 1])

##(6) Linear Regression
LiR = LinearRegression()
LiR.fit(x_train, y_train)
predictions_LiR = LiR.predict(x_test)
#probablity_LiR= LiR.predict_proba(x_test)
fpr_LiR, tpr_LiR, threshold_LiR = roc_curve(y_test, predictions_LiR[:, 0])

###(7)Decision Tree
mode = tree.DecisionTreeClassifier(criterion='gini')
mode.fit(x_train, y_train)
predictions_tree = mode.predict(x_test)
probablity_tree = mode.predict_proba(x_test)
fpr_tree, tpr_tree, threshold_tree = roc_curve(y_test, probablity_tree[:, 1])

###(8) Deep learning
model = models.Sequential()
model.add(layers.Dense(16, activation='relu', input_shape=(105, )))
model.add(layers.Dense(16, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))
model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['accuracy'])
model.fit(x_train, y_train, epochs=4, batch_size=512)
y_pred_label = model.predict_classes(x_test)
Example no. 20
from itertools import product

import numpy as np
import matplotlib.pyplot as plt

from sklearn.datasets import load_iris
from sklearn import tree

iris = load_iris()
X = iris.data[:, [2, 3]]
y = iris.target

clf = tree.DecisionTreeClassifier(max_depth=2)
clf.fit(X, y)

# plt.plot()
# plt.scatter(X[:, 0], X[:, 1])
# plt.show()

x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.1), np.arange(y_min, y_max, 0.1))

plt.plot()
Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
plt.contourf(xx, yy, Z, alpha=0.4, cmap=plt.cm.rainbow)
plt.scatter(X[:, 0], X[:, 1], c=y, alpha=1, cmap=plt.cm.YlOrRd)
plt.title('Decision Tree')
plt.xlabel('Petal.Length')
plt.ylabel('Petal.Width')
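# In a non-interactive session the figure only appears once it is shown explicitly.
plt.show()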
Example no. 21
#test_data = r"/home/pgupta/Dropbox/Shared with Parth/test-dssm.only2.svm";

trainX, trainY = load_svmlight_file(train_data)
testX, testY = load_svmlight_file(test_data)

train_set = trainX.toarray()
test_set = testX.toarray()

trainY = [int(round(trainY[i])) for i in range(len(trainY))]
testY = [int(round(testY[i])) for i in range(len(testY))]

n_features=24
num_estimators = 20
##DTC
clf = tree.DecisionTreeClassifier(max_depth=None,
                                  min_samples_split=2,  # must be >= 2 in current scikit-learn
                                  random_state=0)
clf.fit(train_set, trainY)
y_predicted = clf.predict( train_set )
score_train = clf.score( train_set, trainY )
y_predicted = clf.predict( test_set )
score_test = clf.score( test_set, testY )
print ("DTC")
print (precision_recall_fscore_support(testY, y_predicted, average='binary'))
                    

##RFC
clf = ensemble.RandomForestClassifier(n_estimators=num_estimators, 
                                      max_features = 5,
                                      max_depth=None, 
                                      min_samples_split=2,
diag_map = {'B': 'benign', 'M': 'malignant'}
df['diagnosis'] = df['diagnosis'].map(diag_map)
labs = df['diagnosis']
df.drop(['Unnamed: 32', 'id', 'diagnosis'], axis=1, inplace=True)

#Split into train and test and fit decision trees
output_dir = output + 'decision_trees/'
if not os.path.exists(output_dir):
    os.makedirs(output_dir)
kf = KFold(n_splits=5, shuffle=True, random_state=42)
fold_accuracy = []
for train_indices, test_indices in kf.split(df):
    X_train, X_test = df.iloc[train_indices], df.iloc[test_indices]
    Y_train, Y_test = labs[train_indices], labs[test_indices]

    tree_model = tree.DecisionTreeClassifier(random_state=42)
    tree_model.fit(X_train, Y_train)
    preds = tree_model.predict(X_test)
    accuracy = round((sum(preds == Y_test) / len(Y_test)) * 100, 3)
    print(' '.join([
        'Fold',
        str(len(fold_accuracy) + 1), 'Accuracy:',
        str(accuracy) + '%'
    ]))

    fname = ' '.join(['Decision Tree Fold', str(len(fold_accuracy) + 1)])
    with open(output_dir + fname + '.dot', 'w') as f:
        export_graphviz(tree_model,
                        out_file=f,
                        filled=True,
                        rounded=True,
Example no. 23
from sklearn import tree
clf = tree.DecisionTreeClassifier(criterion="entropy",
                                  min_impurity_split=0.02,
                                  min_samples_split=370)
ll = []
tcpORudp = []
with open('traceDMA.txt') as f:
    content = f.readlines()
# you may also want to remove whitespace characters like `\n` at the end of each line
content = [x.strip() for x in content]

for i in range(len(content)):
    l = []
    a, b, c = content[i].split()
    #print(a)
    l.append(a)
    #print(b)
    l.append(b)
    #print(c)

    ll.append(l)
    tcpORudp.append(c)
print("XXXXXXXXX")
#print (ll)
#print (tcpORudp )
clf = clf.fit(ll, tcpORudp)
#prediction = clf.predict([[80, 1480]])
#print(prediction)

import graphviz
featurenames = ["PORT", "SIZE"]
Example no. 24
def decision_tree_classifier(train_x, train_y):
    from sklearn import tree
    model = tree.DecisionTreeClassifier()
    model.fit(train_x, train_y)
    return model
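A minimal usage sketch for the helper above (the iris data is only an illustrative stand-in):

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

X, y = load_iris(return_X_y=True)
train_x, test_x, train_y, test_y = train_test_split(X, y, test_size=0.3, random_state=0)
model = decision_tree_classifier(train_x, train_y)
print(model.score(test_x, test_y))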
Example no. 25
def main(multi_mode='ovo', winL=90, winR=90, do_preprocess=True, use_weight_class=True,
         maxRR=True, use_RR=True, norm_RR=True, compute_morph={''}, oversamp_method='', pca_k='', feature_selection='',
         do_cross_val='', C_value=0.001, gamma_value=0.0, reduced_DS=False, leads_flag=[1, 0]):
    print("Runing train_SVM.py!")

    # db_path = '/home/mondejar/dataset/ECG/mitdb/m_learning/scikit/'
    db_path = 'C:/Users/Matteo/Desktop/data_mining_prog/mit-bih-database/m_learning/scikit/'

    # Load train data
    [tr_features, tr_labels, tr_patient_num_beats] = load_mit_db('DS1', winL, winR, do_preprocess,
                                                                 maxRR, use_RR, norm_RR, compute_morph, db_path,
                                                                 reduced_DS, leads_flag)
    # Load Test data
    [eval_features, eval_labels, eval_patient_num_beats] = load_mit_db('DS2', winL, winR, do_preprocess,
                                                                       maxRR, use_RR, norm_RR, compute_morph, db_path,
                                                                       reduced_DS, leads_flag)
    if reduced_DS == True:
        np.savetxt('mit_db/' + 'exp_2_' + 'DS2_labels.csv', eval_labels.astype(int), '%.0f')
    else:
        np.savetxt('mit_db/' + 'DS2_labels.csv', eval_labels.astype(int), '%.0f')

        # if reduced_DS == True:
    #    np.savetxt('mit_db/' + 'exp_2_' + 'DS1_labels.csv', tr_labels.astype(int), '%.0f')
    # else:
    # np.savetxt('mit_db/' + 'DS1_labels.csv', tr_labels.astype(int), '%.0f')

    ##############################################################
    # 0) TODO if feature_Selection:
    # before oversamp!!?????

    # TODO perform normalization before the oversampling?
    if oversamp_method:
        # Filename
        oversamp_features_pickle_name = create_oversamp_name(reduced_DS, do_preprocess, compute_morph, winL, winR,
                                                             maxRR, use_RR, norm_RR, pca_k)

        # Do oversampling
        tr_features, tr_labels = perform_oversampling(oversamp_method, db_path + 'oversamp/python_mit',
                                                      oversamp_features_pickle_name, tr_features, tr_labels)

    # Normalization of the input data
    # scaled: zero mean unit variance ( z-score )
    scaler = StandardScaler()
    scaler.fit(tr_features)
    tr_features_scaled = scaler.transform(tr_features)

    # scaled: zero mean unit variance ( z-score )
    eval_features_scaled = scaler.transform(eval_features)
    ##############################################################
    # 0) ????????????? feature_Selection: also after Oversampling???
    if feature_selection:
        print("Runing feature selection")
        best_features = 7
        tr_features_scaled, features_index_sorted = run_feature_selection(tr_features_scaled, tr_labels,
                                                                          feature_selection, best_features)
        eval_features_scaled = eval_features_scaled[:, features_index_sorted[0:best_features]]
    # 1)
    if pca_k > 0:
        # Load if exists??
        # NOTE PCA do memory error!

        # NOTE 11 Enero: TEST WITH IPCA!!!!!!
        start = time.time()

        print("Runing IPCA " + str(pca_k) + "...")

        # Run PCA
        IPCA = sklearn.decomposition.IncrementalPCA(pca_k, batch_size=pca_k)  # gamma_pca

        # tr_features_scaled = KPCA.fit_transform(tr_features_scaled)
        IPCA.fit(tr_features_scaled)

        # Apply PCA on test data!
        tr_features_scaled = IPCA.transform(tr_features_scaled)
        eval_features_scaled = IPCA.transform(eval_features_scaled)

        """
        print("Runing TruncatedSVD (singular value decomposition (SVD)!!!) (alternative to PCA) " + str(pca_k) + "...")

        svd = decomposition.TruncatedSVD(n_components=pca_k, algorithm='arpack')
        svd.fit(tr_features_scaled)
        tr_features_scaled = svd.transform(tr_features_scaled)
        eval_features_scaled = svd.transform(eval_features_scaled)

        """
        end = time.time()

        print("Time runing IPCA (rbf): " + str(format(end - start, '.2f')) + " sec")
    ##############################################################
    # 2) Cross-validation:

    if do_cross_val:
        print("Runing cross val...")
        start = time.time()

        # TODO Save data over the k-folds and ranked by the best average values in separated files
        perf_measures_path = create_svm_model_name(
            'C:/Users/Matteo/Desktop/data_mining_prog/ecg-classification-master/python/results/' + multi_mode, winL,
            winR, do_preprocess,
            maxRR, use_RR, norm_RR, compute_morph, use_weight_class, feature_selection, oversamp_method, leads_flag,
            reduced_DS, pca_k, '/')

        # TODO implement this method! check to avoid NaN scores....

        if do_cross_val == 'pat_cv':  # Cross validation with one fold per patient
            cv_scores, c_values = run_cross_val(tr_features_scaled, tr_labels, tr_patient_num_beats, do_cross_val,
                                                len(tr_patient_num_beats))

            if not os.path.exists(perf_measures_path):
                os.makedirs(perf_measures_path)
            np.savetxt(perf_measures_path + '/cross_val_k-pat_cv_F_score.csv', (c_values, cv_scores.astype(float)),
                       "%f")

        elif do_cross_val == 'beat_cv':  # cross validation by class id samples
            k_folds = {5}
            for k in k_folds:
                ijk_scores, c_values = run_cross_val(tr_features_scaled, tr_labels, tr_patient_num_beats, do_cross_val,
                                                     k)
                # TODO Save data over the k-folds and ranked by the best average values in separated files
                perf_measures_path = create_svm_model_name(
                    'C:/Users/Matteo/Desktop/data_mining_prog/ecg-classification-master/python/results/' + multi_mode,
                    winL, winR, do_preprocess,
                    maxRR, use_RR, norm_RR, compute_morph, use_weight_class, feature_selection, oversamp_method,
                    leads_flag, reduced_DS, pca_k, '/')

                if not os.path.exists(perf_measures_path):
                    os.makedirs(perf_measures_path)
                np.savetxt(perf_measures_path + '/cross_val_k-' + str(k) + '_Ijk_score.csv',
                           (c_values, ijk_scores.astype(float)), "%f")

            end = time.time()
            print("Time runing Cross Validation: " + str(format(end - start, '.2f')) + " sec")
    else:

        ################################################################################################
        # 3) Train models

        models_path_randomForest = db_path + 'models/' + 'random_forest/'+multi_mode + '_rbf'
        models_path_kNN = db_path + 'models/' + 'kNN/' + multi_mode + '_rbf'
        models_path_c45 = db_path + 'models/' + 'c45/' + multi_mode + '_rbf'

        models_path_randomForest=create_svm_model_name(models_path_randomForest, winL, winR, do_preprocess,
                                               maxRR, use_RR, norm_RR, compute_morph, use_weight_class,
                                               feature_selection,
                                               oversamp_method, leads_flag, reduced_DS, pca_k, '_')
        models_path_kNN=create_svm_model_name(models_path_kNN, winL, winR, do_preprocess,
                                               maxRR, use_RR, norm_RR, compute_morph, use_weight_class,
                                               feature_selection,
                                               oversamp_method, leads_flag, reduced_DS, pca_k, '_')
        models_path_c45=create_svm_model_name(models_path_c45, winL, winR, do_preprocess,
                                               maxRR, use_RR, norm_RR, compute_morph, use_weight_class,
                                               feature_selection,
                                               oversamp_method, leads_flag, reduced_DS, pca_k, '_')


        if os.path.isfile(models_path_randomForest):
            # Load the trained model!
            randomForest = joblib.load(models_path_randomForest)

        else:
            print("Training model on MIT-BIH DS1: " + models_path_randomForest + "...")
            randomForest = RandomForestRegressor(n_estimators=100,random_state=42)
            start = time.time()
            randomForest.fit(tr_features_scaled, tr_labels)
            end = time.time()
            print("Trained completed!\n\t" + models_path_randomForest+ "\n \
                            \tTime required: " + str(format(end - start, '.2f')) + " sec")

            # Export model: save/write trained SVM model
            joblib.dump(randomForest, models_path_randomForest)

        if os.path.isfile(models_path_kNN):
            # Load the trained model!
            kNN = joblib.load(models_path_kNN)

        else:
            print("Training model on MIT-BIH DS1: " + models_path_kNN + "...")
            kNN = KNeighborsClassifier(n_neighbors=5)
            start = time.time()
            kNN.fit(tr_features_scaled, tr_labels)
            end = time.time()
            print("Trained completed!\n\t" + models_path_kNN + "\n \
                            \tTime required: " + str(format(end - start, '.2f')) + " sec")

            # Export model: save/write trained SVM model
            joblib.dump(kNN, models_path_kNN)

        if os.path.isfile(models_path_c45):
            # Load the trained model!
            c45 = joblib.load(models_path_c45)

        else:
            print("Training model on MIT-BIH DS1: " + models_path_c45 + "...")
            c45 = tree.DecisionTreeClassifier()
            start = time.time()
            c45.fit(tr_features_scaled, tr_labels)
            end = time.time()
            print("Trained completed!\n\t" + models_path_c45+ "\n \
                            \tTime required: " + str(format(end - start, '.2f')) + " sec")

            # Export model: save/write trained SVM model
            joblib.dump(c45, models_path_c45)

        # 4) Test SVM model
        print("Testing model on MIT-BIH DS2: " + models_path_randomForest + "...")

        # Evaluate the model on the training data
        perf_measures_path = create_svm_model_name(
            'C:/Users/Matteo/Desktop/data_mining_prog/ecg-classification-master/python/results/randomForest/' + multi_mode, winL,
            winR, do_preprocess,
            maxRR, use_RR, norm_RR, compute_morph, use_weight_class, feature_selection, oversamp_method, leads_flag,
            reduced_DS, pca_k, '/')

        print("Evaluation of randon forest on DS1 ...")
        eval_other_model(randomForest, tr_features_scaled, tr_labels, perf_measures_path,'Training')

        print("Evaluation of randon forest on DS2 ...")
        eval_other_model(randomForest, eval_features_scaled, eval_labels, perf_measures_path, 'Testing')

        perf_measures_path = create_svm_model_name(
            'C:/Users/Matteo/Desktop/data_mining_prog/ecg-classification-master/python/results/kNN/' + multi_mode,
            winL,
            winR, do_preprocess,
            maxRR, use_RR, norm_RR, compute_morph, use_weight_class, feature_selection, oversamp_method, leads_flag,
            reduced_DS, pca_k, '/')

        print("Evaluation of kNN on DS1 ...")
        eval_other_model(kNN, tr_features_scaled, tr_labels, perf_measures_path, 'Training')

        print("Evaluation of kNN forest on DS2 ...")
        eval_other_model(kNN, eval_features_scaled, eval_labels, perf_measures_path, 'Testing')

        perf_measures_path = create_svm_model_name(
            'C:/Users/Matteo/Desktop/data_mining_prog/ecg-classification-master/python/results/c45/' + multi_mode,
            winL,
            winR, do_preprocess,
            maxRR, use_RR, norm_RR, compute_morph, use_weight_class, feature_selection, oversamp_method, leads_flag,
            reduced_DS, pca_k, '/')

        print("Evaluation of randon forest on DS1 ...")
        eval_other_model(c45, tr_features_scaled, tr_labels, perf_measures_path, 'Training')

        print("Evaluation of randon forest on DS2 ...")
        eval_other_model(c45, eval_features_scaled, eval_labels, perf_measures_path, 'Testing')


        '''
        # TODO load best params from cross validation!

        use_probability = False

        model_svm_path = db_path + 'svm_models/' + multi_mode + '_rbf'

        model_svm_path = create_svm_model_name(model_svm_path, winL, winR, do_preprocess,
                                               maxRR, use_RR, norm_RR, compute_morph, use_weight_class,
                                               feature_selection,
                                               oversamp_method, leads_flag, reduced_DS, pca_k, '_')

        if gamma_value != 0.0:
            model_svm_path = model_svm_path + '_C_' + str(C_value) + '_g_' + str(gamma_value) + '.joblib.pkl'
        else:
            model_svm_path = model_svm_path + '_C_' + str(C_value) + '.joblib.pkl'

        print("Training model on MIT-BIH DS1: " + model_svm_path + "...")

        if os.path.isfile(model_svm_path):
            # Load the trained model!
            svm_model = joblib.load(model_svm_path)

        else:
            class_weights = {}
            for c in range(4):
                class_weights.update({c: len(tr_labels) / float(np.count_nonzero(tr_labels == c))})

            # class_weight='balanced', 
            if gamma_value != 0.0:  # NOTE 0.0 means 1/n_features default value
                svm_model = svm.SVC(C=C_value, kernel='rbf', degree=3, gamma=gamma_value,
                                    coef0=0.0, shrinking=True, probability=use_probability, tol=0.001,
                                    cache_size=200, class_weight=class_weights, verbose=False,
                                    max_iter=-1, decision_function_shape=multi_mode, random_state=None)
            else:
                svm_model = svm.SVC(C=C_value, kernel='rbf', degree=3, gamma='auto',
                                    coef0=0.0, shrinking=True, probability=use_probability, tol=0.001,
                                    cache_size=200, class_weight=class_weights, verbose=False,
                                    max_iter=-1, decision_function_shape=multi_mode, random_state=None)

            # Let's Train!

            start = time.time()
            svm_model.fit(tr_features_scaled, tr_labels)
            end = time.time()
            # TODO assert that the class_ID appears with the desired order, 
            # with the goal of ovo make the combinations properly
            print("Trained completed!\n\t" + model_svm_path + "\n \
                \tTime required: " + str(format(end - start, '.2f')) + " sec")

            # Export model: save/write trained SVM model
            joblib.dump(svm_model, model_svm_path)

            # TODO Export StandardScaler()
        
        #########################################################################
        '''

predicted = text_clf.predict(X_test)

print(metrics.classification_report(y_test, predicted))


# <h1>Decision Tree</h1>

# In[55]:


from sklearn import tree
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', tree.DecisionTreeClassifier()),
                     ])

text_clf.fit(X_train, y_train)


predicted = text_clf.predict(X_test)

print(metrics.classification_report(y_test, predicted))


# <h1>Random Forest</h1>

# In[56]:

Example no. 27
test_Boosting = True
cm_plot = True
times_plot = True

# Split into training and test data. Use random_state to get the same results in every run
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.3,
                                                    random_state=rs)
"""
Decision tree - test the decision tree for learning curve, model complexity, and hyperparameter tuning
"""
# Learning Curve, sample size, fit times
if test_DT:
    #print('learning curve processing')
    clf_dt = tree.DecisionTreeClassifier(random_state=rs)
    train_sizes = np.linspace(0.1, 1.0, 5)
    train_sizes, train_scores, test_scores, fit_times, _ = \
        learning_curve(clf_dt, X_train, y_train, cv=cv, n_jobs=-1, train_sizes=train_sizes, return_times=True)

    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    fit_times_mean = np.mean(fit_times, axis=1)
    fit_times_std = np.std(fit_times, axis=1)
    DT_train_mean = train_scores_mean
    DT_test_mean = test_scores_mean
    DT_fit_mean = fit_times_mean

    # plot learning curve
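    # Hedged sketch of the plot the comment above announces, built from the means and
    # standard deviations already computed (assumes matplotlib.pyplot imported as plt).
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1)
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1)
    plt.plot(train_sizes, train_scores_mean, 'o-', label='Training score')
    plt.plot(train_sizes, test_scores_mean, 'o-', label='Cross-validation score')
    plt.xlabel('Training examples')
    plt.ylabel('Score')
    plt.legend(loc='best')
    plt.show()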
Example no. 28
#Preprocessing
imp = SimpleImputer(missing_values=np.NaN, strategy='mean')
X_salida = imp.fit_transform(X_inicial)
Aprepro = preprocessing.normalize(X_salida)
Aprepro = preprocessing.scale(Aprepro)
aux1 = Aprepro
#print(Aprepro)
X = np.delete(aux1, 20, axis=1)
#print(len(X[1]))
#y=np.delete(Aprepro, np.arange(20), axis=1)
y = np.delete(X_inicial, np.arange(20), axis=1)
#print(len(y[1]))
#print(y)

from sklearn import tree
clasificador = tree.DecisionTreeClassifier(criterion='entropy')
clasificador.fit(X, y)

#Test data
#yp=pd.read_csv(r'test.csv')
#yp=datos.to_numpy()
#yp=np.delete(yp, 0, axis=1)

#print(len(yp[1]))
from sklearn import model_selection
from sklearn.metrics import confusion_matrix

X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.33)

clasificador.fit(X_train, y_train)
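# Hedged continuation: confusion_matrix is imported above but not yet used;
# evaluate the refitted tree on the held-out split.
y_pred = clasificador.predict(X_test)
print(confusion_matrix(y_test, y_pred))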
Example no. 29
# Print out how many wrong classifications it did
print("Number of mislabeled points out of a total %d points : %d" %
      (x_test.shape[0], (y_test != y_pred).sum()))
# Calculate the accuracy
correct = (y_test == y_pred).sum()
accuracy = correct / len(y_pred)
# Printing the accuracy
print('Accuracy NB:', accuracy * 100)

# Confusion Matrix
names = ['recurr', 'no-recurr']
metrics.plot_confusion_matrix(gnb, x_test, y_test, display_labels=names)

# Decision Tree classification
# Create an instance of a Decision Tree
clf = tree.DecisionTreeClassifier(criterion='gini', ccp_alpha=0.0075)
# Fit the model to the training data and predict the testing data
clf = clf.fit(x_train, y_train)
# Calculate the accuracy
tree_pred = clf.predict(x_test)
corr = (y_test == tree_pred).sum()
accuracy_tree = corr / len(tree_pred)
# Print out how many wrong classifications it did
print("Number of mislabeled points out of a total %d points : %d" %
      (x_test.shape[0], (y_test != tree_pred).sum()))
# Printing the accuracy
print('Accuracy Tree:', accuracy_tree * 100)

# Matrix

metrics.plot_confusion_matrix(clf, x_test, y_test, display_labels=names)
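metrics.plot_confusion_matrix, used twice above, was deprecated in scikit-learn 1.0 and removed in 1.2; a hedged equivalent for newer versions:

from sklearn.metrics import ConfusionMatrixDisplay

ConfusionMatrixDisplay.from_estimator(clf, x_test, y_test, display_labels=names)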
import sklearn.metrics as metrics
iris = datasets.load_iris()
# X = iris.data[:, [2, 3]]
X = iris.data
y = iris.target
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
sc.fit(X)
X = sc.transform(X)
# split data into train and test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.3,
                                                    random_state=0)
clf = tree.DecisionTreeClassifier(criterion='entropy', random_state=0)
clf.fit(X_train, y_train)
# generate evaluation metrics
print("Train - Accuracy :",
      metrics.accuracy_score(y_train, clf.predict(X_train)))
print("Train - Confusion matrix :",
      metrics.confusion_matrix(y_train, clf.predict(X_train)))
print("Train - classification report :",
      metrics.classification_report(y_train, clf.predict(X_train)))
print("Test - Accuracy :", metrics.accuracy_score(y_test, clf.predict(X_test)))
print("Test - Confusion matrix :",
      metrics.confusion_matrix(y_test, clf.predict(X_test)))
print("Test - classification report :",
      metrics.classification_report(y_test, clf.predict(X_test)))
tree.export_graphviz(clf, out_file='tree.dot')