def classifier(data, y, model="forest"):
    """Fit and return a classification estimator chosen by name.

    Parameters
    ----------
    data : feature matrix passed straight to ``est.fit``.
    y : target labels; must support ``nunique()`` (e.g. a pandas Series)
        when ``model == "logistic"``.
    model : one of "forest", "tree", "extra", "logistic", "svm", "boost",
        "neural" (default "forest").

    Returns
    -------
    The fitted scikit-learn estimator.

    Raises
    ------
    ValueError
        If ``model`` is not a recognized name (previously an unknown name
        crashed with UnboundLocalError at ``est.fit``).
    """
    # Imports stay inside each branch so only the selected model's module
    # is loaded.
    if model == "forest":
        from sklearn.ensemble import RandomForestClassifier as rfc
        est = rfc(n_estimators=10, n_jobs=-1)
    elif model == "tree":
        from sklearn.tree import DecisionTreeClassifier as dtc
        est = dtc()
    elif model == "extra":
        from sklearn.ensemble import ExtraTreesClassifier as etc
        est = etc(n_estimators=10, n_jobs=-1)
    elif model == "logistic":
        from sklearn.linear_model import LogisticRegression as lr
        # The multinomial solver is only needed for more than two classes.
        cases = y.nunique()
        if cases > 2:
            est = lr(solver="newton-cg", multi_class="multinomial")
        else:
            est = lr(n_jobs=-1)
    elif model == "svm":
        from sklearn.svm import SVC as svc
        est = svc()
    elif model == "boost":
        from sklearn.ensemble import GradientBoostingClassifier as gbc
        est = gbc(n_estimators=10)
    elif model == "neural":
        from sklearn.neural_network import MLPClassifier as nnc
        est = nnc(max_iter=10, learning_rate_init=1)
    else:
        # Fail fast with a clear message instead of an unbound-local crash.
        raise ValueError("unknown model name: %r" % (model,))
    est.fit(data, y)
    return est
def regression(data, y, model="forest"):
    """Fit and return a regression estimator chosen by name.

    Parameters
    ----------
    data : feature matrix passed straight to ``est.fit``.
    y : regression targets.
    model : one of "forest", "tree", "extra", "linear", "svm", "boost",
        "neural" (default "forest").

    Returns
    -------
    The fitted scikit-learn estimator.

    Raises
    ------
    ValueError
        If ``model`` is not a recognized name (previously an unknown name
        crashed with UnboundLocalError at ``est.fit``).
    """
    # Imports stay inside each branch so only the chosen model's module
    # gets loaded.
    if model == "forest":
        from sklearn.ensemble import RandomForestRegressor as rfc
        est = rfc(n_estimators=10, n_jobs=-1)
    elif model == "tree":
        from sklearn.tree import DecisionTreeRegressor as dtc
        est = dtc()
    elif model == "extra":
        from sklearn.ensemble import ExtraTreesRegressor as etc
        est = etc(n_estimators=10, n_jobs=-1)
    elif model == "linear":
        from sklearn.linear_model import LinearRegression as lr
        # Fix from review: dropped the leftover `cases = y.nunique()`
        # copied from the classifier twin — it was unused here and crashed
        # for non-pandas targets.
        est = lr(n_jobs=-1)
    elif model == "svm":
        from sklearn.svm import SVR as svc
        est = svc()
    elif model == "boost":
        from sklearn.ensemble import GradientBoostingRegressor as gbc
        est = gbc(n_estimators=10)
    elif model == "neural":
        from sklearn.neural_network import MLPRegressor as nnc
        est = nnc(max_iter=10, learning_rate_init=1)
    else:
        raise ValueError("unknown model name: %r" % (model,))
    est.fit(data, y)
    return est
def initialize_models(X_train, y_train, X_test, y_test, accuracy, fscore):
    """Train three benchmark classifiers on 1%, 10% and 100% of the
    training data, visualize the collected metrics, and return the
    AdaBoost model (the third learner)."""
    # The three candidate learners, in fixed order; the last one is returned.
    learners = [dtc(random_state=13), rfc(random_state=13), abc(random_state=13)]

    # Sample counts for 1%, 10% and 100% of the training set, as ints.
    n_total = len(y_train)
    sample_sizes = [n_total // 100, n_total // 10, n_total]

    # results[model name][i] holds the metrics for the i-th sample size.
    results = {}
    for learner in learners:
        results[learner.__class__.__name__] = {
            i: train_predict(learner, size, X_train, y_train, X_test, y_test)
            for i, size in enumerate(sample_sizes)
        }

    # Metric visualization across the three supervised models.
    vs.evaluate(results, accuracy, fscore)
    return learners[-1]
def fit_model(X, y):
    """ Performs grid search over the 'max_depth' parameter for a
        decision tree regressor trained on the input data [X, y],
        returning the best fitted estimator. """
    # Create cross-validation sets from the training data
    # sklearn version 0.18: ShuffleSplit(n_splits=10, test_size=0.1, train_size=None, random_state=None)
    # sklearn version 0.17: ShuffleSplit(n, n_iter=10, test_size=0.1, train_size=None, random_state=None)
    cv_sets = ShuffleSplit(X.shape[0], n_iter=10, test_size=0.20, random_state=0)

    # Decision tree regressor to tune.
    regressor = dtc()

    # Search 'max_depth' over 1..10.
    params = {'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]}

    # Transform 'performance_metric' into a scoring function using 'make_scorer'.
    scoring_fnc = make_scorer(performance_metric)

    # Fix from review: `cv_sets` was built but never passed to the grid
    # search, so the default CV splitter was silently used; wire it in as
    # the surrounding comments intended.
    grid = gscv(regressor, params, scoring=scoring_fnc, cv=cv_sets)

    # Fit the grid search object to the data to compute the optimal model
    grid = grid.fit(X, y)

    # Return the optimal model after fitting the data
    return grid.best_estimator_
def DecisionTree():
    """Cross-validate a DecisionTreeClassifier on the module-level x / y.

    Runs 10-fold CV collecting macro precision/recall/F1, then predicts
    on x. Fixes from review: removed a stray `w` token that made this
    function a SyntaxError, fixed the `socres` typo, and dropped its
    no-op bare-expression echo.
    """
    scoring = ['precision_macro', 'recall_macro', 'f1_macro']
    clf = dtc()
    scores = cross_validate(clf, x, y, scoring=scoring, cv=10,
                            return_train_score=False)
    # NOTE(review): cross_validate fits clones, so `clf` itself is still
    # unfitted here and this predict raises NotFittedError — a
    # clf.fit(x, y) is presumably missing; behavior left unchanged.
    clf.predict(x)
def __init__(self, pathToData):
    """Configure an AdaBoost-over-decision-tree experiment on the
    'Letter' dataset.

    Parameters
    ----------
    pathToData : path to the data file; stored for later loading.
    """
    self.dataFilePath = pathToData
    self.algoname = 'Boosting'
    self.datasetName = 'Letter'
    # Weak learner: a class-weight-balanced decision tree.
    self.baseEstimater = dtc(class_weight='balanced')
    # x = {'base_estimator': self.baseEstimater,
    #      'base_estimator__max_depth': 15}
    # Discrete SAMME boosting over the weak learner.
    self.classifier = abc(base_estimator=self.baseEstimater, algorithm='SAMME')
    # self.classifier.set_params(**x)
    # Number of cross-validation folds used elsewhere in this class.
    self.cv = 5
def decision_tree_implementation(df, x, x_train, y_train, x_test, y_test):
    """Fit a decision tree on the mushroom split, render the fitted tree
    to the 'mushroom' graphviz output, and print a classification report
    for the held-out data."""
    print("Decision Tree")
    print("*************")

    classifier = dtc(random_state=0)
    classifier.fit(x_train, y_train)

    # Export the fitted tree as DOT source and render it to disk.
    dot_source = tree.export_graphviz(classifier,
                                      out_file=None,
                                      feature_names=x.columns,
                                      filled=True,
                                      special_characters=True)
    graphviz.Source(dot_source).render("mushroom")

    # Evaluate on the held-out split.
    predictions = classifier.predict(x_test)
    print(classification_report(y_test, predictions))
def classification(file, X, Y, x, y):
    """Brute-force grid search over DecisionTreeClassifier settings.

    Trains one tree per hyper-parameter combination on (X, Y), scores it
    on (x, y), and hands the accuracy/parameter lists to `_results`.

    NOTE(review): every iterable except `criterion` (splitter, max_depth,
    min_samples_split, min_samples_leaf, ...) must be defined at module
    level — confirm they exist where this is called. `presort` was
    removed from sklearn in 0.24, so this positional argument list only
    works on older sklearn versions — verify the pinned version.
    """
    param = []
    acc = []
    criterion = ['gini', 'entropy']
    for i in it.product(criterion, splitter, max_depth, min_samples_split,
                        min_samples_leaf, min_weight_fraction_leaf,
                        max_features, random_state, max_leaf_nodes,
                        min_impurity_decrease, min_impurity_split,
                        class_weight, presort):
        # print(*i)
        # Positional args follow DecisionTreeClassifier's parameter order.
        dtree = dtc(*i)
        dtree.fit(X, Y)
        # print('Accuracy: ' + str(dtree.score(x,y)) + '\n')
        acc.append(dtree.score(x, y))
        param.append([*i])
    _results(file, acc, param)
def classification(self, metric, folds, alphas, graph):
    """Benchmark a battery of classifiers with stratified K-fold CV,
    display a ranked score table, and optionally box-plot the folds.

    Parameters
    ----------
    metric, alphas : accepted for interface symmetry; not used in this
        body (NOTE(review): confirm whether `metric` was meant to drive
        the CV scoring).
    folds : number of StratifiedKFold splits.
    graph : if truthy, draw a comparison box plot of per-fold scores.

    Returns
    -------
    None — models are stored on self.models and results are displayed.
    """
    # Figure size scales with the configured report width.
    size = 1.3 * self.report_width // 10
    # Candidate models, keyed by display name.
    models = {}
    models["K nearest neighbors classifier K2"] = knnc(n_neighbors=2)
    models["K nearest neighbors classifier K5"] = knnc(n_neighbors=5)
    models["K nearest neighbors classifier K10"] = knnc(n_neighbors=10)
    models["Decision tree classifier"] = dtc()
    models["Logistic classifier"] = logitc()
    models["SVM classifier with RBF kernel"] = svc(gamma='scale')
    models["SVM classifier with linear kernel"] = svc(kernel='linear')
    models["Gaussian naive bayes"] = gnbc()
    models["Bernoulli naive bayes"] = bnbc()
    models["SGD classifier"] = sgdc(max_iter=10000)
    models["Random forest classifier"] = rfc(n_estimators=100)
    models["Gradient boosting classifier"] = gbc()
    self.models = models
    print('\n')
    print(self.report_width * '*', '\n*')
    print('* CLASSIFICATION RESULTS - BEFORE PARAMETERS BOOSTING \n*')
    kf = StratifiedKFold(n_splits=folds, shuffle=True)
    results = []
    names = []
    # Cross-validate every candidate on the training split; failed fits
    # score NaN instead of raising.
    for model_name in models:
        cv_scores = cross_val_score(models[model_name], self.Xt_train, self.yt_train.values.ravel(), cv=kf, error_score=np.nan)
        results.append(cv_scores)
        names.append(model_name)
    print(self.report_width * '*', '')
    # Rank models by mean CV score; 'VC' is the coefficient of variation (%).
    report = pd.DataFrame({'Classifier': names, 'Score': results})
    report['Score (avg)'] = report.Score.apply(lambda x: x.mean())
    report['Score (std)'] = report.Score.apply(lambda x: x.std())
    report['Score (VC)'] = 100 * report['Score (std)'] / report['Score (avg)']
    report.sort_values(by='Score (avg)', inplace=True, ascending=False)
    report.drop('Score', axis=1, inplace=True)
    display(report)
    print('\n')
    if graph:
        fig, ax = plt.subplots(figsize=(size, 0.5 * size))
        plt.title('Classifier Comparison')
        #ax = fig.add_subplot(111)
        plt.boxplot(results)
        ax.set_xticklabels(names)
        plt.xticks(rotation=45)
        plt.subplots_adjust(hspace=0.0)
        plt.show()
    return None
def run_min_samples_leaf(training_data, training_labels, validation_data, validation_labels):
    """Sweep min_samples_leaf over 1..50 for an entropy decision tree,
    plot training/validation accuracy, and return the leaf size with the
    best validation accuracy."""
    min_samples_leaf_list = range(1, 51)
    training_accuracy_list = []
    validation_accuracy_list = []
    for this_min_samples_leaf in min_samples_leaf_list:
        print('Processing min samples leaf: ' + str(this_min_samples_leaf) + '/' + str(len(min_samples_leaf_list)))
        clf = dtc(criterion='entropy', min_samples_leaf=this_min_samples_leaf)
        (training_accuracy, validation_accuracy) = get_training_accuracy.run(clf, training_data, training_labels, validation_data, validation_labels)
        training_accuracy_list.append(training_accuracy)
        validation_accuracy_list.append(validation_accuracy)
        # Rewind the console cursor so the progress line is overwritten.
        print(CURSOR_UP_ONE + ERASE_LINE + CURSOR_UP_ONE)
    # Plot data ------------------------------------------------------------------------------------
    # Convert accuracy fractions to percentages for plotting.
    training_accuracy_list = [training_accuracy*100 for training_accuracy in training_accuracy_list]
    validation_accuracy_list = [validation_accuracy*100 for validation_accuracy in validation_accuracy_list]
    pylab.plot(min_samples_leaf_list, training_accuracy_list)
    pylab.plot(min_samples_leaf_list, validation_accuracy_list)
    pylab.xlabel('Min Samples Leaf')
    pylab.ylabel('Accuracy (% out of 100)')
    pylab.title('Training and Validation Accuracy as function of Min Samples Leaf')
    pylab.legend(['Training Accuracy', 'Validation Accuracy'], loc=2)
    pylab.grid(True)
    pylab.savefig("Accuracy_vs_Min_Samples_Leaf.png")
    #pylab.show()
    pylab.close()
    pylab.clf()
    # End plot data --------------------------------------------------------------------------------
    # Pick the leaf size with the highest validation accuracy.
    (best_index, best_accuracy) = max(enumerate(validation_accuracy_list), key = itemgetter(1))
    best_min_samples_leaf = min_samples_leaf_list[best_index]
    return best_min_samples_leaf
def decisionTree(self, screens, sym1, sym2, sym3, sym4, sym5):
    """Predict a disease from five symptoms with a decision tree and show
    the result in a Tkinter Text widget.

    Assumes self.X/self.Y (training data), self.XTest/self.YTest,
    self.symptoms, self.newList and self.diseases were prepared elsewhere
    — confirm against the rest of the class.
    """
    self.decisionText = Text(screens, height=1, width=30, bg="orange", fg="black")
    self.decisionText.grid(row=1, column=7, padx=10)
    self.decisionTreeClass = dtc()
    self.decisionTreeClass = self.decisionTreeClass.fit(self.X, self.Y)
    # Check the accuracy of the algorithm
    self.YPrediction = self.decisionTreeClass.predict(self.XTest)
    print(accs(self.YTest, self.YPrediction))
    print(accs(self.YTest, self.YPrediction, normalize=False))
    # One-hot mark the client's symptoms against the known symptom list.
    self.clientSymp = [sym1, sym2, sym3, sym4, sym5]
    for s in range(0, len(self.symptoms)):
        for t in self.clientSymp:
            if t == self.symptoms[s]:
                self.newList[s] = 1
    self.inputs = [self.newList]
    self.predictions = self.decisionTreeClass.predict(self.inputs)
    self.predicted = self.predictions[0]
    # Search for the predicted label among the known disease indices.
    self.ans = 'no'
    for a in range(0, len(self.diseases)):
        if self.predicted == a:
            self.ans = 'yes'
            break
    if self.ans == 'yes':
        self.decisionText.delete("1.0", END)
        # `a` still holds the matching index from the loop above.
        self.decisionText.insert(END, self.diseases[a])
    else:
        self.decisionText.delete("1.0", END)
        self.decisionText.insert(END, "Not Found")
def trainClassifier(): trainMat=[] training=[] trainMat=preprocess(trainfile) training=prepare(trainMat) labels=training[1] ''' Fitting the training data into the decision tree ''' topicClf = dtc(criterion='entropy',random_state=0) topicClf.fit(training[0],labels) #Cross validating the results using 10% of the training set as the test set ''' X_train, X_test, y_train, y_test = cross_validation.train_test_split( training[0], labels, test_size=0.1, random_state=0) print "Cross Validation Score" print topicClf.score(X_test, y_test) ''' '''
# Numeric-column scaling step (log-transforms the listed columns).
scale_num = ('scale_num', Scale_NumCols(['Age', 'SibSp', 'Parch', 'Fare'], take_log=True))
# Preprocessing: NA handling -> categorical encoding (drops Name/Ticket) -> scaling.
pipeline = Pipeline([('deal_na', Deal_NAs()), ('encode_cat', Encode_CatCols(drop=['Name', 'Ticket'])), scale_num])
#X_prepared = pipeline.fit_transform(X_)
X_train_p = pipeline.fit_transform(X_train)
X_vali_p = pipeline.transform(X_vali)
from sklearn.linear_model import LogisticRegression as lr
from sklearn.tree import DecisionTreeClassifier as dtc
from sklearn.ensemble import RandomForestClassifier as rfc
from sklearn.ensemble import AdaBoostClassifier as abc
from sklearn.ensemble import GradientBoostingClassifier as gbc
# NOTE(review): `model` is rebound four times — only the last assignment
# (gradient boosting) is actually fitted; the earlier lines are dead code
# apparently kept for quick experiment switching.
model = lr(C=1)
model = dtc(min_samples_split=10, max_features=5)
model = abc(dtc(max_depth=4), n_estimators=100)
model = gbc(n_estimators=200)
#model = rfc(n_estimators=200 ,min_samples_split = 5)
model.fit(X_train_p, Y_train)
# print(model.score(X_train_p, Y_train))
# print(model.score(X_vali_p, Y_vali))
# coef_df = pd.DataFrame({'name':X_train_p.columns.tolist(), 'coef':model.coef_[0]})
# coef_df.sort_values('coef', ascending = False)
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
# Validation-set report for the fitted model.
Y_pred = model.predict(X_vali_p)
print(classification_report(Y_vali, Y_pred))
print(submit.head())
# Encode Embarked # [C, S, Q] train["Embarked_C"] = train["Embarked"] == "C" train["Embarked_S"] = train["Embarked"] == "S" train["Embarked_Q"] = train["Embarked"] == "Q" test["Embarked_C"] = test["Embarked"] == "C" test["Embarked_S"] = test["Embarked"] == "S" test["Embarked_Q"] = test["Embarked"] == "Q" #train # feature_names = ["Pclass","Sex_encode","Fare_fillin","Embarked_C","Embarked_S","Embarked_Q",] feature_names = ["Pclass","Sex_encode","Embarked_C","Embarked_S","Embarked_Q",] X_train = train[feature_names] y_train = train['Survived'] X_test = test[feature_names] from sklearn.tree import ExtraTreeClassifier as dtc model = dtc(max_depth=5) predictions = model.fit(X_train, y_train).predict(X_test) print (predictions) submission = pd.read_csv("./data/titanic/gender_submission.csv", index_col="PassengerId") submission["Survived"] = predictions submission.to_csv("./data/titanic/result_decisiontree.csv") # the accuray is 0.77990 (above 77%)
random_state=0) print( cl('X_train shape : {}'.format(X_train.shape), attrs=['bold'], color='red')) print(cl('X_test shape : {}'.format(X_test.shape), attrs=['bold'], color='red')) print( cl('y_train shape : {}'.format(y_train.shape), attrs=['bold'], color='green')) print( cl('y_test shape : {}'.format(y_test.shape), attrs=['bold'], color='green')) model = dtc(criterion='entropy', max_depth=4) model.fit(X_train, y_train) pred_model = model.predict(X_test) print( cl('Accuracy of the model is {:.0%}'.format( accuracy_score(y_test, pred_model)), attrs=['bold'])) feature_names = df.columns[:5] target_names = df['Drug'].unique().tolist() plot_tree(model, feature_names=feature_names, class_names=target_names,
#%% import math def validation(l1, l2): match_count = 0 if (len(l1) != len(l2)): print("two list is not same length") return -1 for i in range(len(l1)): if (l1[i] == l2[i]): match_count = match_count + 1 return str(math.floor(match_count / len(l1) * 100)) + "%" #%% from sklearn.tree import DecisionTreeClassifier as dtc model = dtc() predictions = model.fit(X=train[features], y=train[label]).predict(X=test[features]) print(validation(predictions, list(test[label]))) #%% model = cb.Classifier() model.class_names model.fit(train, features, label) test[features] predictions = model.getPredictions(test[features], mode='CLASS') predictions print(validation(predictions, list(test[label])))
for l1 in fbank_feat[50:100, :]: for l2 in l1: fl2.append(l2) fl = ['%.4f' % elem for elem in fl] fl2 = ['%.4f' % elem for elem in fl2] for l1 in fl: fList.append(l1) for l1 in fl2: fList.append(l1) mfList.append(fList) fList = [] n = n + 1 path1 = '/home/hp/Desktop/Trainingsamplesmono/' path2 = '/home/hp/Desktop/set9/' clf = dtc() # this class is used to make decision tree build(path1) clf.fit(mfList, labels) dot_data = tree.export_graphviz(clf, out_file=None) graph = pydotplus.graph_from_dot_data(dot_data) graph.write_pdf("iris1.pdf") mfList = [] #clear mflist build(path2) res = clf.predict(mfList) # prediction of sentiments print(res)
from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier as dtc
from sklearn.metrics import auc as skauc  # NOTE(review): imported but unused in this chunk

# Load iris and shuffle features and labels together (column-stacked so
# rows stay aligned).
iris_data = datasets.load_iris()
vals = iris_data["data"]
target = iris_data["target"]
from numpy import random, c_ as add_col
shuffled_data = add_col[vals, target]
random.shuffle(shuffled_data)
vals = shuffled_data[:, :-1]
target = shuffled_data[:, -1]
dt = dtc(criterion="gini", splitter="best")
tpr = []
fpr = []
# Grow the training prefix one sample at a time; evaluate on the rest.
for i in range(1, vals.shape[0] - 1):
    classifier = dt.fit(X=vals[:i, :], y=target[:i])
    preds_array = classifier.predict(vals[i:, :])
    true_labels = target[i:]
    # Class "0" vs all
    # NOTE(review): `Valclass` is a project helper not visible here —
    # presumably a confusion-matrix/ROC utility; verify its contract.
    cm = Valclass.confusion_matrix(true_labels, preds_array, warnings=False)
    if cm is not None:
        aux1 = Valclass.tpr(cm)
for w in words: bag.append(1) if w in pattern_words else bag.append(0) # output is a '0' for each tag and '1' for current tag output_row = list(output_empty) output_row[classes.index(doc[1])] = 1 training.append([bag, output_row]) # shuffle our features and turn into np.array random.shuffle(training) training = np.array(training) # create train and test lists train_x = list(training[:,0]) train_y = list(training[:,1]) model=dtc(criterion = "entropy", random_state=100) model.fit(train_x,train_y) # save all of our data structures #pickle.dump( {'words':words, 'classes':classes, 'train_x':train_x, 'train_y':train_y}, open( "training_data", "wb" ) ) # restore all of our data structures #data = pickle.load( open( "training_data", "rb" ) ) #words = data['words'] #classes = data['classes'] #train_x = data['train_x'] #train_y = data['train_y'] def clean_up_sentence(sentence): # tokenize the pattern sentence_words = nltk.word_tokenize(sentence) # stem each word sentence_words = [stemmer.stem(word.lower()) for word in sentence_words] return sentence_words
import pandas as pd
from sklearn.tree import DecisionTreeClassifier as dtc  # to train the model
from sklearn.model_selection import train_test_split as tts  # to split train and test
from sklearn.metrics import accuracy_score as asc  # to calculate the accuracy

# Load the dataset and separate the features from the 'genre' target.
my_data = pd.read_csv('music.csv')
# Fix from review: renamed `input`/`output`, which shadowed the `input`
# builtin (and read ambiguously), to descriptive local names.
features = my_data.drop(columns=['genre'])
target = my_data['genre']

# 80/20 train/test split.
X_train, X_test, y_train, y_test = tts(
    features, target, test_size=0.2)  # it returns a tuple of size 4

my_model = dtc()
#my_model.fit(features, target)
my_model.fit(X_train, y_train)  # training data
#pred=my_model.predict([[21,1],[34,0]])
print(X_test)
pred = my_model.predict(X_test)
accuracy = asc(y_test, pred)
print(pred)
print('the accurancy is: ', accuracy)
from cross_validation import cross_validation as CV
import matplotlib.pyplot as plt
from feature_selection import feature_selection

#Loading data
# Binary-label train/test splits plus original-train and final hold-out sets.
x_train = np.loadtxt('../Data/x_train.txt')
y_train_binary = np.loadtxt('../Data/y_train_binary.txt')
x_test = np.loadtxt('../Data/x_test.txt')
y_test_binary = np.loadtxt('../Data/y_test_binary.txt')
x_orig_train = np.loadtxt('../Data/x_orig_train.txt')
y_orig_train_binary = np.loadtxt('../Data/y_orig_train_binary.txt')
x_final_test = np.loadtxt('../Data/x_final_test.txt')
y_final_test_binary = np.loadtxt('../Data/y_final_test_binary.txt')

#Modeling classifier
clf = dtc(max_depth = 3)

#Calling feature selection methods
# KBest returns the (transformed) data sets and predictions y_out;
# presumably it also fits clf internally — verify against its definition.
fs = feature_selection()
#clf,x_train,x_test,x_final_test,y_out = fs.PCASelection(x_train,y_train_binary,x_test,y_test_binary,x_final_test,clf)
clf,x_train,x_test,x_final_test,y_out = fs.KBest(x_train,y_train_binary,x_test,y_test_binary,x_final_test,clf)
#clf.fit (x_train,y_train_binary)
#y_out = clf.predict(x_test)

#Printing scores (Python 2 print statements)
score = clf.score(x_test,y_test_binary)
print "Score : ", score
print "Precision recall f-score support : " , prfs(y_test_binary,y_out)

#Cross validation
''' Step.3 Training the model ''' trainMat=[] training=[] trainMat=preprocess(trainfile) training=prepare(trainMat) labels=training[1] #Decision Tree #topicClf=dtc(criterion='entropy',random_state=0) #topicClf=dtc(random_state=0) #topicClf = RFC(n_estimators=12, max_features=5, random_state=0) topicClf=ovr(dtc(criterion='entropy',random_state=0)) topicClf.fit(training[0],labels) #print topicClf.multilabel_ #scores = cross_val_score(topicClf, training[0], labels) #print "Mean2: ",scores.mean() ''' topicClf = dtc(max_depth=None, min_samples_split=1,random_state=0) scores = cross_val_score(topicClf, training[0], labels) print "Mean1: ", scores.mean() topicClf = ETC(n_estimators=10, max_depth=None,min_samples_split=1, random_state=0) scores = cross_val_score(topicClf, training[0], labels) print "Mean3: ",scores.mean()
print(train.shape, test.shape) # train[features] # training X #%% import math def validation(l1, l2): match_count = 0 if(len(l1)!=len(l2)): print("two list is not same length") return -1 for i in range(len(l1)): if(l1[i]==l2[i]): match_count = match_count+1 return str(math.floor(match_count/len(l1) * 100)) +"%" #%% from sklearn.tree import DecisionTreeClassifier as dtc model = dtc() predictions = model.fit(X=train[features], y=train[label]).predict(X=test[features]) print(validation(predictions, list(test[label]))) #%% model = cb.Classifier() model.class_names model.fit(train, features, label) test[features] predictions = model.getPredictions(test[features], mode='CLASS') predictions print(validation(predictions, list(test[label])))
'Percentage_of_disposable_income', 'Duration_in_Present_Residence', 'Age_in_years', 'No_of_Credits_at_this__Bank', 'No_of_dependents', 'Creditability' ] categorical_columns = [] for i in df.columns: if i not in numerical_columns: df[i] = df[i].astype(str) categorical_columns.append(i) for i in categorical_columns: dummies = pd.get_dummies(df[i], prefix=i) df = pd.concat([df, dummies], axis=1) y = df['Creditability'] x = df.drop('Creditability', axis=1) #print(x.info()) print(x.shape) x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0) clf = dtc(random_state=0) clf.fit(x_train, y_train) print(clf.score(x_train, y_train)) print(clf.score(x_test, y_test))
# Restrict the data to one sex, then standard-scale the features.
sex = 'F'
scaler = StandardScaler()
data_partial = data[data['Sex'] == sex].drop('Sex', axis=1)
# corr_matrix_f, corr_matrix_m = data_f.corr(), data_m.corr()
# plot_corr_matrices(corr_matrix_f, corr_matrix_m)
y = data_partial['EmoState']
# NOTE(review): the scaler is fit on the full data BEFORE the split —
# mild information leakage into the test set; fit on X_train only if
# that matters for this analysis.
X = scaler.fit_transform(data_partial.drop('EmoState', axis=1))
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=71)
# Candidate models compared by 10-fold CV accuracy on the training split.
models = (('DTC', dtc()),
          ('SVM', svc(C=10)),
          ('KNN', knc(n_neighbors=10)),
          ('SGDC', sgdc()),
          ('GNBC', gnbc()),
          ('MLPC', mlpc(max_iter=1000, learning_rate='adaptive')))
results = []
names = []
seed = 13
scoring = 'accuracy'
for name, model in models:
    kfold = model_selection.KFold(n_splits=10, random_state=seed, shuffle=True)
    cv_results = model_selection.cross_val_score(model, X_train, y_train, cv=kfold, scoring=scoring)
model.fit( X , y ) X_test = [ row[ :-1 ] for row in test_data ] y_real = [ row[ -1 ] for row in test_data ] y_pred = model.predict( X_test ) print report( y_real , y_pred ) tp = lambda x : 1 if x == 'spam' else 0 real = [ tp( v ) for v in y_real ] pred = [ tp( v ) for v in y_pred ] print mean_absolute_error( real , pred ) print mean_squared_error( real , pred ) if __name__ == '__main__' : if len( sys.argv ) > 2 : train_fpath , test_fpath = sys.argv[ 1: ] train_data = import_csv( train_fpath ) test_data = import_csv( test_fpath ) ''' DECISION TREE ''' cf = dtc( criterion = 'gini' , max_depth = 50 ) classify( cf , train_data , test_data , 'decision_tree' ) ''' NEAREST NEIGHBORS ''' cf = knc( n_neighbors = 1 , metric = 'hamming' ) classify( cf , train_data , test_data , 'knearest_neighbors' ) ''' NAIVE BAYES ''' cf = mnb( alpha = 100.0 ) classify( cf , train_data , test_data , 'naive_bayes' ) else : print "Usage python %s [train_csv_file] [test_csv_file]" % sys.argv[ 0 ]
import matplotlib.pyplot as plt

diab = pd.read_csv('diabetes.csv')
# print(diab)
# print(diab.columns)
# print(diab.isna().sum())
# print(diab.shape)
# print(diab.tail(5))
# Three-feature subset; integer-encode the class label.
X = diab[['plas', 'insu', 'mass']]
y = diab['class']
le = LabelEncoder()
y = le.fit_transform(y)
# print(y)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=109)
# A depth-1 tree: effectively a single-feature threshold stump.
model = dtc(criterion="gini", max_depth=1)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print("Accuracy:", metrics.accuracy_score(y_pred, y_test))
fig, axes = plt.subplots(nrows = 1,ncols = 1,figsize = (4,4), dpi=100)
fn=['plas', 'insu', 'mass']
# NOTE(review): LabelEncoder orders classes alphabetically — confirm
# these display names are listed in the encoded class order.
cn=['possitive', 'negative']
tree.plot_tree(model,feature_names = fn,class_names=cn,filled = True, );
plt.show()
text_representation = tree.export_text(model)
print(text_representation)
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier as dtc
from sklearn import metrics

n_class = 3
colors = 'ryb'
step = 0.2  # mesh resolution for the decision-surface grid
iris = load_iris()
# One subplot per 2-feature pair drawn from the four iris features.
for pairdx, pair in enumerate([[0, 1], [0, 2], [0, 3], [1, 2], [1, 3], [2, 3]]):
    x = iris.data[:, pair]
    y = iris.target
    clf = dtc()
    clf.fit(x, y)
    plt.subplot(2, 3, pairdx + 1)
    # Mesh covering the feature range with a 1-unit margin on each side.
    x_min, x_max = x[:, 0].min() - 1, x[:, 0].max() + 1
    y_min, y_max = x[:, 1].min() - 1, x[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, step), np.arange(y_min, y_max, step))
    plt.tight_layout(h_pad=0.5, w_pad=0.5, pad=2.5)
    # Predict over the mesh and draw the class-boundary contours.
    z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    z = z.reshape(xx.shape)
    cs = plt.contour(xx, yy, z, cmap=plt.cm.RdYlBu)
    plt.xlabel(iris.feature_names[pair[0]])
    plt.ylabel(iris.feature_names[pair[1]])
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier as dtc
from time import time
from random_dataset import random_dataset
import numpy as np
#from sklearn.metrics import accuracy_score

# Training Model
# Compare NB / linear SVM / decision tree fit time and accuracy on the
# same generated dataset (Python 2 print statements).
features_train, labels_train, features_test, labels_test = random_dataset()
model1 = GaussianNB()
model2 = SVC(kernel='linear')
model3 = dtc()

t1 = time()
model1.fit(features_train, labels_train)
print "Training time NB : ", round(time() - t1, 3), "s"
accuracy1 = model1.score(features_test, labels_test)
print "Accuracy of NB:", accuracy1

t2 = time()
model2.fit(features_train, labels_train)
print "Training time SVM : ", round(time() - t2, 3), "s"
accuracy2 = model2.score(features_test, labels_test)
print "Accuracy of SVM:", accuracy2

t3 = time()
model3.fit(features_train, labels_train)
# Align the two frames to a common column set: columns missing from one
# side are copied over and filled with False.
# (`missing_headers` on the first line comes from the preceding, unseen chunk.)
validation_data[missing_headers] = validation_data[missing_headers].applymap(lambda x: False)
# NOTE(review): modern pandas Index has no `.diff`; it was renamed to
# `.difference` in pandas 0.15 — confirm the pinned (Python 2-era) pandas
# version, otherwise this line raises AttributeError.
missing_headers = validation_data.columns.diff(training_data.columns)
if len(missing_headers) > 0:
    training_data[missing_headers] = validation_data[missing_headers]
    training_data[missing_headers] = training_data[missing_headers].applymap(lambda x: False)

# Process Decision Tree
# Sweep the two hyper-parameters, then fit the final tree with the best of each.
best_max_depth = classifier_run.run_max_depth(training_data, training_labels, validation_data, validation_labels)
best_min_samples_leaf = classifier_run.run_min_samples_leaf(training_data, training_labels, validation_data, validation_labels)
print('Optimal max depth was: ' + str(best_max_depth))
print('Optimal min samples leaf: ' + str(best_min_samples_leaf))
clf = dtc(criterion='entropy', max_depth=best_max_depth, min_samples_leaf=best_min_samples_leaf)
clf.fit(training_data, training_labels)
# Render the top two levels of the fitted tree to a PDF (Python 2 StringIO).
dot_data = StringIO.StringIO()
export_graphviz(clf, out_file=dot_data, max_depth=2)
graph = pydot.graph_from_dot_data(dot_data.getvalue())
graph.write_pdf("Decision_Tree.pdf")
# Random-forest sweeps: default settings vs. a fixed depth/leaf variant.
(best_n_estimator, best_n_estimator_accuracy) = classifier_run.run_random_forest(
    training_data, training_labels, validation_data, validation_labels)
(best_n_estimator_modified, best_n_estimator_modified_accuracy) = classifier_run.run_random_forest(
    training_data, training_labels, validation_data, validation_labels, best_max_depth=30, best_min_samples_leaf=1)
# Fit the linear SVM and report training-set precision.
svm_classifier = clf.fit(training_features, training_labels)
predictions = svm_classifier.predict(training_features)
print("Precision of linear SVM classifier is:")
precision = calculate_precision(predictions, training_labels)
print("Training data\t" + str(precision))
predictions = svm_classifier.predict(test_features)
#precision = calculate_precision(predictions,test_gold_labels)
#print("Test data\t" + str(precision))

#Real time tesing
#real_time_test(svm_classifier,vocabulary_mv)

##Decision tree algorithm
from sklearn.tree import DecisionTreeClassifier as dtc
clf_gini = dtc(criterion="gini", random_state=100, max_depth=3, min_samples_leaf=5)
dt_classifier = clf_gini.fit(training_features, training_labels)
# NOTE(review): these predictions come from TRAINING features but are
# scored against test_gold_labels and printed as "Test data" — this
# looks like it was meant to predict on test_features; verify.
predictions = dt_classifier.predict(training_features)
print("Precision of DecisionTreeClassifier is")
precision = calculate_precision(predictions, test_gold_labels)
print("Test data\t" + str(precision))

#Real time tesing
real_time_test(dt_classifier, vocabulary_mv)

##Implementation of logistice regression
# NOTE(review): despite the comment, this imports LinearRegression
# (ordinary least squares), not LogisticRegression.
from sklearn.linear_model import LinearRegression as lr
lmModel = lr()
lm_classifier = lmModel.fit(training_features, training_labels)
predictions = lm_classifier.predict(test_features)
print("Precision of LinearRegression is")
# Label-encode the categorical feature columns (sex, BP, cholesterol).
X[:, 1] = le_sex.transform(X[:, 1])
le_BP = pproc.LabelEncoder()
le_BP.fit(['LOW', 'NORMAL', 'HIGH'])
X[:, 2] = le_BP.transform(X[:, 2])
le_chol = pproc.LabelEncoder()
le_chol.fit(['LOW', 'NORMAL', 'HIGH'])
X[:, 3] = le_chol.transform(X[:, 3])
print(X[0:5])
y = data['Drug']
print(y[0:5])

from sklearn.model_selection import train_test_split as tts
X_trn, X_test, y_trn, y_test = tts(X, y, test_size=0.3, random_state=3)
print(X_trn.shape)
print(y_trn.shape)

#modelling from here now
drugtree = dtc(criterion='entropy', max_depth=4)
drugtree.fit(X_trn, y_trn)
predtree = drugtree.predict(X_test)
print(predtree[0:5])
print(y_test[0:5])

#finding the accuracy of model
from sklearn import metrics
import matplotlib.pyplot as plt
print("decision tree accuracy:", metrics.accuracy_score(y_test, predtree))

#visualization
# NOTE(review): sklearn.externals.six was removed in sklearn 0.23 — on a
# newer sklearn, import StringIO from `six` or use `io.StringIO` instead.
from sklearn.externals.six import StringIO
import pydotplus
import matplotlib.image as mpimg
from sklearn import tree
dot_data = StringIO()
filename = "drugtree.png"
# Axis labels for the preceding model's confusion-matrix heatmap.
plt.xlabel('Predicted')
plt.ylabel('Truth')
print(classification_report(y_test, y_pred))

#applying k-fold cross validation
from sklearn.model_selection import cross_val_score as cvs
accuracies = cvs(estimator=classifier,X=x_train,y=y_train,cv=10)
print(accuracies.mean())
print(accuracies.std())

"""Decision Tree"""

#fitting decision tree classifier to the training set
from sklearn.tree import DecisionTreeClassifier as dtc
classifier = dtc(criterion='entropy' , random_state=0)
classifier.fit(x_train, y_train)

#predicting the test set results
y_pred=classifier.predict(x_test)
from sklearn.metrics import confusion_matrix, classification_report
cm=confusion_matrix(y_test, y_pred)
# Confusion-matrix heatmap for the decision tree.
plt.figure(figsize = (5,5))
sns.heatmap(cm, annot=True)
plt.xlabel('Predicted')
plt.ylabel('Truth')
print(classification_report(y_test, y_pred))
import sys import sklearn from classifier_utils import * from sklearn.tree import DecisionTreeClassifier as dtc if __name__ == '__main__' : if len( sys.argv ) > 3 : infilepath , crit , depth = sys.argv[ 1: ] data = import_csv( infilepath ) cf = dtc( criterion = crit , max_depth = int( depth ) ) stats = cross_validation( data , cf ) print "PARAMS: criterion=%s , max_depth=%s" % ( crit, depth ) print_stats( stats ) else : print "Usage python %s [csv_file] [neighbors] [distance]" % sys.argv[ 0 ]
# Results of the hand-rolled implementation computed above this chunk.
print(f"Manual Training Accuracy: {training_accuracy:.2%}")
print(f"Manual Test Accuracy: {test_accuracy:.2%}")

# =============================================================================
# Compare to actual function using pandas and sklearn
# =============================================================================
df = pd.read_csv("iris.csv")
# Stratified 75/25 split so class balance is preserved; fixed seed for
# reproducibility.
train, test = train_test_split(df, train_size=.75, stratify=df["species"], random_state=7)
target = ["species"]
X_train = train.drop(target, axis=1)
y_train = train[target]
X_test = test.drop(target, axis=1)
y_test = test[target]
clf = dtc(max_depth=5, min_samples_split=4)
clf.fit(X_train, y_train)

# Find accuracy of sklearn implementation
training_accuracy = clf.score(X_train, y_train)
test_accuracy = clf.score(X_test, y_test)
print(f"Sklearn Train Score: {training_accuracy:.2%}")
print(f"Sklearn Test Score:, {test_accuracy:.2%}")
def main():
    """Iris experiment: pick k for KNN via 5-fold CV, then compare KNN (k=2)
    against a decision tree on each of the remaining folds.

    Reads the CSV path from sys.argv[1] through the project helper getDF().
    """
    df = getDF(sys.argv[1])
    # Shuffle once so the 5 folds of 30 rows each are not class-ordered.
    df_shuffled = df.sample(frac=1)
    df = df_shuffled.reset_index(drop=True)
    df_list = [df[i:i + 30] for i in range(0, 150, 30)]

    k_vals = {}
    all_accuracies = []

    for fold in range(5):
        print(fold)
        # FIX: rebuild the train/test frames for every fold. The original kept
        # concatenating onto the same frames (reset only AFTER the loop), so
        # later folds trained and tested on growing, overlapping data.
        test_knn = df_list[fold].copy()
        train_knn = pd.concat([df_list[j] for j in range(5) if j != fold])

        x_test_knn = test_knn.loc[:, :'petal_w']
        y_test_knn = test_knn.loc[:, ['result']]
        x_train_knn = train_knn.loc[:, :'petal_w']
        y_train_knn = train_knn.loc[:, ['result']]

        accuracies = []
        # FIX: the inner loop reused `i`, shadowing the fold index.
        for k in range(1, 51):
            knn_classifier = knn(n_neighbors=k)
            knn_classifier.fit(x_train_knn, y_train_knn)
            knn_y_pred = knn_classifier.predict(x_test_knn)
            accuracies.append(accuracy_score(y_test_knn, knn_y_pred))
        all_accuracies.append(accuracies)

    # Average accuracy across the 5 folds for each candidate k.
    for k_idx in range(50):
        total = sum(fold_acc[k_idx] for fold_acc in all_accuracies)
        k_vals[k_idx + 1] = total / 5
    # FIX: k_vals was printed twice in the original; once suffices.
    print(k_vals)

    # FIX: the original referenced undefined x_test/y_test here (NameError).
    # Hold out fold 0 as the common test set, matching df_list[1:] below.
    x_test = df_list[0].loc[:, :'petal_w']
    y_test = df_list[0].loc[:, ['result']]

    num = 1
    for training_df in df_list[1:]:
        x_train = training_df.loc[:, :'petal_w']
        y_train = training_df.loc[:, ['result']]

        knn_classifier = knn(n_neighbors=2)
        knn_classifier.fit(x_train, y_train)

        dtree_classifier = dtc(criterion='entropy', random_state=100,
                               max_depth=8, min_samples_leaf=4)
        dtree_classifier.fit(x_train, y_train)

        knn_y_pred = knn_classifier.predict(x_test)
        dtree_y_pred = dtree_classifier.predict(x_test)
        # print('{}: Dtree Accuracy - {}'.format(num, accuracy_score(y_test, dtree_y_pred)))
        # print('Report: {}'.format(classification_report(y_test, dtree_y_pred)))
        # print('{}: KNN Accuracy - {}'.format(num, accuracy_score(y_test, knn_y_pred)))
        # print('Report: {}'.format(classification_report(y_test, knn_y_pred)))
        # print('-'*50)
        num += 1
    return
# (continuation) Fit a depth-1 decision tree ("stump") predicting Direction
# from the `covid` dataframe and feature matrix `X` prepared above.
y = covid['Direction']
covid.info()

# print(set(alc["Series_reference"]))
# Series_reference_encoded = le.fit_transform(alc["Series_reference"])
# print('Series_reference_encoded', Series_reference_encoded)

from sklearn.model_selection import train_test_split
# 90/10 split; random_state pins the partition for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=50)

model = dtc(criterion="entropy", max_depth=1)  # max_depth=1 -> single split
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print(set(covid['Direction']))
# NOTE(review): args are (y_pred, y_test) — accuracy is symmetric so the value
# is the same, but (y_true, y_pred) is the documented order; consider swapping.
print("Accuracy:", metrics.accuracy_score(y_pred, y_test))

fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(4, 4), dpi=266)
fn = ['Value', 'Cumulative', 'Year', 'Commodity', 'Country']  # feature names for the plot
cn = ['right', 'left', 'center']  # class labels shown in the rendered tree
tree.plot_tree(model, feature_names=fn, class_names=cn, filled=True)

# Plain-text dump of the same tree structure.
text_representation = tree.export_text(model)
print(text_representation)
def classification(self, metric, folds, printt=True, graph=False):
    """Benchmark a battery of sklearn classifiers with stratified k-fold CV.

    Parameters
    ----------
    metric : str
        sklearn scoring name passed to cross_val_score.
    folds : int
        Number of StratifiedKFold splits.
    printt : bool
        If True, print the ranked performance report.
    graph : bool
        If True, draw a boxplot of per-model CV scores.

    Side effects: stores the candidate models on self.models, the ranked
    report on self.report_performance, and (if graph) the figure on
    self.graphs_model. Returns None.
    """
    size = self.graph_width
    # Binary vs multiclass decides which logistic-regression variants to add.
    if len(self.y.iloc[:,0].unique()) > 2:
        struct = 'multiclass'
    else:
        struct = 'binary'
    # significant model setup differences should be listed as different models
    models = {}
    models["Linear discriminant analysis"] = ldac()
    models["Nearest centroid classifier euclidian"] = ncc(metric='euclidean')
    models["Nearest centroid classifier manhattan"] = ncc(metric='manhattan')
    models["K nearest neighbors classifier K2"] = knnc(n_neighbors=2)
    models["K nearest neighbors classifier K5"] = knnc(n_neighbors=5)
    models["K nearest neighbors classifier K10"] = knnc(n_neighbors=10)
    models["Decision tree classifier"] = dtc()
    models["Gaussian naive bayes"] = gnbc()
    models["Bernoulli naive bayes"] = bnbc(binarize=0.5)
    models["Multinomial naive bayes"] = mnbc()
    models["SGD classifier"] = sgdc(max_iter=10000)
    models["Ridge classifier"] = rc()
    # SVMs only on small sample counts — presumably because their training
    # cost grows steeply with n (TODO confirm threshold choice).
    if len(self.Xt_train) < 10000:
        models["SVM classifier RBF"] = svc(gamma='scale')
        models["SVM classifier Linear"] = svc(kernel='linear')
        models["SVM classifier Poly"] = svc(kernel='poly')
    # Ensembles only when the data is small or low-dimensional.
    if self.Xt_train.shape[0] < 10000 or self.Xt_train.shape[1] < 5:
        models["Gradient boosting classifier"] = gbc()
        models["Random forest classifier"] = rfc(n_estimators=100)
    if struct == 'multiclass':
        models["Logistic classifier multinomial"] = logitc(multi_class='multinomial', solver='lbfgs')
        models["Logistic classifier auto"] = logitc(multi_class='auto')
        models["Logistic One vs Rest"] = ovrc(logitc())
        models["Logistic One vs One"] = ovoc(logitc())
    if struct == 'binary':
        models["Logistic classifier"] = logitc(max_iter=2000)
    self.models = models
    kf = StratifiedKFold(n_splits=folds, shuffle=True)
    results = []
    names = []
    et = []  # elapsed wall-clock seconds per model
    for model_name in models:
        start = time.time()
        # error_score=np.nan lets one failing model not abort the whole sweep
        cv_scores = cross_val_score(models[model_name], self.Xt_train, self.yt_train, cv=kf, scoring=metric, error_score=np.nan)
        results.append(cv_scores)
        names.append(model_name)
        et.append((time.time() - start))
        #print(model_name, time.time() - start)
    # Rank models by mean CV score; VC = coefficient of variation in percent.
    report = pd.DataFrame({'Model': names, 'Score': results, 'Elapsed Time': et})
    report['Score (avg)'] = report.Score.apply(lambda x: x.mean())
    report['Score (std)'] = report.Score.apply(lambda x: x.std())
    report['Score (VC)'] = 100 * report['Score (std)'] / report['Score (avg)']
    report.sort_values(by='Score (avg)', inplace=True, ascending=False)
    report.drop('Score', axis=1, inplace=True)
    report.reset_index(inplace=True, drop=True)
    self.report_performance = report
    if printt:
        print('\n')
        print(self.report_width * '*', '\n*')
        print('* CLASSIFICATION RESULTS - BEFORE PARAMETERS BOOSTING \n*')
        print(self.report_width * '*', '')
        print(report)
        print('\n')
    if graph:
        fig, ax = plt.subplots(figsize=(size, 0.5 * size))
        plt.title('Classifier Comparison')
        #ax = fig.add_subplot(111)
        plt.boxplot(results)
        ax.set_xticklabels(names)
        plt.xticks(rotation=45)
        plt.subplots_adjust(hspace=0.0, bottom=0.25)
        self.graphs_model.append(fig)
        plt.show()
    return None