class ExtraTreeClassifier(Classifier):
    """Cuisine classifier backed by sklearn's extra tree (ETC).

    The underlying tree is fitted lazily on the first call to classify(),
    using the full training matrix from the matrix database.
    """

    def __init__(self, matrixdatabase):
        # Source of the training matrix and of per-recipe feature rows.
        self._matrix_database = matrixdatabase
        self._has_fit = False
        self._etc = ETC()

    def learn(self, ingredients, cuisine):
        # Training happens in bulk inside classify(); nothing to do per-recipe.
        return

    def classify(self, ingredients):
        """Return the predicted cuisine label for `ingredients`."""
        if not self._has_fit:
            matrix, classes = self._matrix_database.make_train_matrix()
            self._etc = self._etc.fit(matrix, classes)
            print('Fitting complete...')  # was a Python-2-only print statement
            self._has_fit = True
        output = self._etc.predict(self._matrix_database.make_row_from_recipe(ingredients))
        return output[0]
def myclassify(numfiers=5,xtrain=xtrain,ytrain=ytrain,xtest=xtest,ytest=ytest):
    """Fit up to `numfiers` classifiers on (xtrain, ytrain) and print each
    one's accuracy on (xtest, ytest).

    The first classifier always runs; each further classifier only runs while
    the budget is not used up (the shrunken-centroid sweep runs entirely under
    a single budget check, as before).  Nothing is returned; results are
    printed.

    Fixes vs. the previous version: labels are stored in lockstep with the
    scores, so the report can no longer be misaligned (the old static label
    list still contained entries for commented-out classifiers and only one
    label for the seven shrunken-centroid scores).
    """
    scores = []
    labels = []

    def record(label, clf):
        # Fit one model and store its test accuracy next to its label.
        clf.fit(xtrain, ytrain)
        scores.append(clf.score(xtest, ytest))
        labels.append(label)

    record("BaggingETC (with bootstraps set to false)",
           BaggingClassifier(ETC(), bootstrap=False, bootstrap_features=False))
    if len(scores) < numfiers:
        record("ETC", ETC())
    if len(scores) < numfiers:
        record("BaggingETC", BaggingClassifier(ETC()))
    if len(scores) < numfiers:
        # Quadratic discriminant analysis - quadratic decision boundary.
        record("QDA", quadda())
    if len(scores) < numfiers:
        record("DTC", DTC())
    if len(scores) < numfiers:
        # Classifies based on the k nearest neighbors.
        record("KNN (default)", neighbors.KNeighborsClassifier())
    if len(scores) < numfiers:
        # Linear discriminant analysis - linear decision boundary.
        record("LDA", linda())
    if len(scores) < numfiers:
        record("RFC", RFC())
    if len(scores) < numfiers:
        record("BaggingRFC (with bootstraps set to false)",
               BaggingClassifier(RFC(), bootstrap=False, bootstrap_features=False))
    if len(scores) < numfiers:
        record("BaggingSVC (with bootstraps set to false)",
               BaggingClassifier(SVC(), bootstrap=False, bootstrap_features=False))
    if len(scores) < numfiers:
        record("RFC (bootstrap false)", RFC(bootstrap=False))
    if len(scores) < numfiers:
        record("GBC", GBC())
    if len(scores) < numfiers:
        record("knn (n_neighbors = 10)",
               neighbors.KNeighborsClassifier(n_neighbors=10))
    if len(scores) < numfiers:
        record("knn (n_neighbors = 3)",
               neighbors.KNeighborsClassifier(n_neighbors=3))
    if len(scores) < numfiers:
        record("knn (ball tree algorithm)",
               neighbors.KNeighborsClassifier(algorithm='ball_tree'))
    if len(scores) < numfiers:
        record("knn (kd_tree algorithm)",
               neighbors.KNeighborsClassifier(algorithm='kd_tree'))
    if len(scores) < numfiers:
        record("Nearest Centroid", NearestCentroid())
    if len(scores) < numfiers:
        # Nearest shrunken centroid: the whole sweep runs under a single
        # budget check, mirroring the original control flow.
        for shrinkage in [None, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5]:
            record("Shrunken Centroid (threshold = {})".format(shrinkage),
                   NearestCentroid(shrink_threshold=shrinkage))
    if len(scores) < numfiers:
        record("ABC", ABC())

    for label, score in zip(labels, scores):
        print("{} classifier has percent correct {}".format(label, score))
# In[21]:
### TREESSSSS
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier as DTC

# Decision tree baseline.  Each estimator is now fitted exactly once;
# printing the return value of fit() shows the (fitted) estimator, which is
# what the old duplicate fit-and-print produced.
tree1 = DTC()
print(tree1)
print(tree1.fit(xtrain, ytrain1))
print(tree1.score(xtest, ytest1))

# In[22]:
from sklearn.tree import ExtraTreeClassifier as ETC

# Extremely randomized single tree.
tree2 = ETC()
print(tree2)
print(tree2.fit(xtrain, ytrain1))
print(tree2.score(xtest, ytest1))

# In[23]:
from sklearn.ensemble import BaggingClassifier

# Bagged ensemble of extra trees.
bagging1 = BaggingClassifier(ETC())
bagging1.fit(xtrain, ytrain1)
print(bagging1.score(xtest, ytest1))

# In[24]:
def build_separate_tree(X, y, max_features, max_depth, min_samples_split):
    """Fit and return one ExtraTreeClassifier with the given capacity limits.

    X, y              -- training features and labels
    max_features      -- features considered per split
    max_depth         -- maximum tree depth
    min_samples_split -- minimum samples required to split a node
    """
    model = ExtraTreeClassifier(
        max_features=max_features,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
    )
    return model.fit(X, y)
def __init__(self, matrixdatabase):
    """Remember the matrix database and prepare an unfitted extra tree.

    The estimator is fitted lazily later, so only the flag and the empty
    model are set up here.
    """
    self._has_fit = False      # becomes True once the tree has been fitted
    self._etc = ETC()          # unfitted estimator, replaced on first fit
    self._matrix_database = matrixdatabase
def myclassify_AudPow(numfiers,xtrain_1,xtrain_2,ytrain_1,ytrain_2,xtest):
    """Fit up to `numfiers` classifiers on the concatenated training folds
    and return window-mode predictions for xtest.

    Returns (predictionStringMat, finalPredMat): one prediction string per
    classifier, plus the flattened integer mode predictions.

    The twenty copy-pasted fit/predict sections are replaced by a single
    loop over lazily-constructed classifier factories; order, the numfiers
    budget, and the "first classifier always runs" behaviour are preserved.
    """
    # Remove NaN, Inf, and -Inf rows from the xtest feature matrix.
    xtest = xtest[~np.isnan(xtest).any(axis=1), :]
    xtest = xtest[~np.isinf(xtest).any(axis=1), :]

    # Merge the two training folds; labels flattened to 1-D.
    xtrain = np.append(xtrain_1, xtrain_2, 0)
    ytrain = np.ravel(np.append(ytrain_1, ytrain_2))

    xtrunclength = sio.loadmat('../Files/xtrunclength.mat')['xtrunclength'][0]

    # If xtest is NxM, predictionMat is N x numfiers: one prediction column
    # per classifier.
    predictionMat = np.empty((xtest.shape[0], numfiers))
    predictionStringMat = []
    finalPredMat = []

    def _voting():
        # Majority vote over five very different model families.
        return VotingClassifier(estimators=[('svc', SVC()),
                                            ('rfc', RFC(bootstrap=False)),
                                            ('etc', ETC()),
                                            ('knn', neighbors.KNeighborsClassifier()),
                                            ('qda', quadda())])

    # One factory per classifier, in the original priority order; building
    # lazily avoids constructing models the numfiers budget skips.
    factories = [
        lambda: BaggingClassifier(ETC(), bootstrap=False, bootstrap_features=False),
        ETC,
        lambda: BaggingClassifier(ETC()),
        _voting,
        SVC,
        quadda,      # quadratic decision boundary
        DTC,
        neighbors.KNeighborsClassifier,
        linda,       # linear decision boundary
        RFC,
        lambda: BaggingClassifier(RFC(), bootstrap=False, bootstrap_features=False),
        lambda: BaggingClassifier(SVC(), bootstrap=False, bootstrap_features=False),
        lambda: RFC(bootstrap=False),
        GBC,
        lambda: neighbors.KNeighborsClassifier(n_neighbors=10),
        lambda: neighbors.KNeighborsClassifier(n_neighbors=3),
        lambda: neighbors.KNeighborsClassifier(algorithm='ball_tree'),
        lambda: neighbors.KNeighborsClassifier(algorithm='kd_tree'),
        NearestCentroid,
        ABC,
    ]

    count = 0
    for make in factories:
        # The first classifier always runs (as before); the rest respect the
        # numfiers budget.
        if count > 0 and count >= numfiers:
            break
        clf = make()
        clf.fit(xtrain, ytrain)
        predictionMat[:, count] = clf.predict(xtest)
        count += 1

    # Collapse each prediction column to per-window modes, then to strings.
    for colCount in range(predictionMat.shape[1]):
        tempCol = predictionMat[:, colCount]
        modeCol = predWindowVecModeFinder(tempCol, xtrunclength)
        predictionStringMat.append(predVec2Str(modeCol))
        finalPredMat += map(int, modeCol)

    return predictionStringMat, finalPredMat
def classify(topicmodel, plotconfusionmatrix=False, multilabel=False):
    """Train and evaluate a battery of classifiers on a topic-model dataset.

    topicmodel          -- indexable; [1] holds the class labels, [2] holds
                           the per-document feature dicts (fed to
                           DictVectorizer)
    plotconfusionmatrix -- when True, plot each classifier's confusion matrix
    multilabel          -- when True, use the multi-label classifier bank
                           (LabelPowerset wrappers / MLkNN) with
                           indicator-matrix targets

    Prints cross-validation scores and (single-label) confusion matrices;
    returns nothing.
    """
    # Display names for the single-label classifiers, kept in lockstep
    # order with `classifiers` below.
    names = [
        #"Dummy",
        "Logistic Regression",
        "Nearest Neighbors",
        "Linear SVM",
        "RBF SVM",
        "Gaussian Process",
        "Decision Tree",
        "Random Forest",
        "Neural Net",
        "AdaBoost",
        "Naive Bayes"
    ]
    classifiers = [
        #DummyClassifier(strategy='most_frequent',random_state=10),
        LogisticRegression(C=1e5, multi_class="ovr"),
        KNeighborsClassifier(5),
        SVC(kernel="linear", C=0.025),
        SVC(kernel='rbf', gamma=2, C=1),
        GaussianProcessClassifier(1.0 * RBF(1.0), warm_start=True),
        DecisionTreeClassifier(max_depth=5),
        RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
        MLPClassifier(alpha=1),
        AdaBoostClassifier(),
        GaussianNB()
    ]
    # Multi-label bank: mostly LabelPowerset-wrapped versions of the above.
    multinames = [
        "Logistic Regression", 'MLkNN', 'Decision Tree', 'Extra Tree', 'KNN',
        'Neural Net', 'Random Forest', 'Naive Bayes', "RBF SVM", "Linear SVM"
    ]
    multiclassifiers = [
        LabelPowerset(LogisticRegression(C=1e5)),
        MLkNN(k=5, s=1.0, ignore_first_neighbours=0),
        LabelPowerset(DecisionTreeClassifier(max_depth=5)),
        LabelPowerset(ExtraTreeClassifier(max_depth=5)),
        LabelPowerset(KNeighborsClassifier(5)),
        LabelPowerset(MLPClassifier(alpha=1)),
        LabelPowerset(
            RandomForestClassifier(max_depth=5, n_estimators=10,
                                   max_features=1)),
        LabelPowerset(GaussianNB()),
        LabelPowerset(SVC(kernel='rbf', gamma=2, C=1)),
        LabelPowerset(SVC(kernel="linear", C=0.025)),
        #RidgeClassifierCV()
        #GaussianProcessClassifier(1.0 * RBF(1.0), warm_start=True, multi_class= "one_vs_rest")
    ]
    # Vectorize the per-document feature dicts into a dense matrix.
    measurements = topicmodel[2]
    vec = DictVectorizer()
    X = vec.fit_transform(measurements).toarray()
    print(vec.get_feature_names())
    #print(X)
    classlabels = topicmodel[1]
    # Multi-label targets need an indicator matrix; single-label uses the
    # raw labels directly.
    Y = (classlabels
         if multilabel == False else ToIndicatorMatrix(classlabels))
    #print(X)
    #print(y)
    # X_train, X_test, y_train, y_test = \
    # train_test_split(X, y, test_size=.2, random_state=42)
    # Distinct classes (flattened across sublists in the multi-label case).
    classes = (list(set(classlabels)) if multilabel == False else list(
        set([classe for sublist in classlabels for classe in sublist])))
    #Number of cross validations
    cvn = 5
    print(('\n Start multi-label classification!'
           if multilabel else 'Start single-label classification!'))
    print('Labels (classes):')
    print(classes)
    #see https://www.analyticsvidhya.com/blog/2017/08/introduction-to-multi-label-classification/
    #http://scikit.ml/api/index.html
    print('\n Results of the model evaluation: \n')
    scores = [
        'accuracy', 'precision_weighted', 'recall_weighted', 'f1_weighted'
    ]
    #Naive model (majority vote) -- the baseline every classifier should beat.
    clf = DummyClassifier(strategy='most_frequent', random_state=10)
    if multilabel:
        clf = LabelPowerset(clf)
    # NOTE(review): four separate CV runs, one per metric, so the folds are
    # refit four times per model.
    dummyscores = cross_val_score(clf, X, Y, cv=cvn, scoring=scores[0])
    dummyprescores = cross_val_score(clf, X, Y, cv=cvn, scoring=scores[1])
    dummyrescores = cross_val_score(clf, X, Y, cv=cvn, scoring=scores[2])
    dummyfescores = cross_val_score(clf, X, Y, cv=cvn, scoring=scores[3])
    print(
        "\n {}-CV naive classifier (most frequent class): {}: {} (+/- {}), {}: {}, {}: {}, {}: {}"
        .format(cvn, scores[0], dummyscores.mean(), dummyscores.std(),
                scores[1], dummyprescores.mean(), scores[2],
                dummyrescores.mean(), scores[3], dummyfescores.mean()))
    # Refit on the full dataset for the confusion matrix / report below.
    y_pred = clf.fit(X, Y).predict(X)
    if multilabel == False:
        print("Fitting on entire dataset (no CV):")
        cnf_matrix = metrics.confusion_matrix(Y, y_pred, labels=classes)
        print(cnf_matrix)
        print(metrics.classification_report(Y, y_pred, labels=classes))
    else:
        # Custom multi-label scoring helper (accuracy, precision, recall,
        # f1, coverage, hamming loss, jaccard -- in that order).
        myscores = myCVAScore(clf, X, Y, cvn)
        print(
            "\n {}-CV naive classifier (my own) {}: {}, {}: {}, {}: {}, {}: {}, {}: {}, {}: {}, {}: {}"
            .format(cvn, scores[0], myscores[0], scores[1], myscores[1],
                    scores[2], myscores[2], scores[3], myscores[3],
                    'coverage', myscores[4], 'hamming loss', myscores[5],
                    'jaccard', myscores[6]))
        print("Fitting on entire dataset (no CV):")
        print('subset accuracy: {}'.format(accuracy_score(y_pred, Y)))
    # iterate over classifiers: pick the bank matching the task type.
    classifiers = (multiclassifiers if multilabel else classifiers)
    names = (multinames if multilabel else names)
    for name, clf in zip(names, classifiers):
        #standard scores given by sklearn
        accscores = cross_val_score(clf, X, Y, cv=cvn, scoring=scores[0])
        pscores = cross_val_score(clf, X, Y, cv=cvn, scoring=scores[1])
        rscores = cross_val_score(clf, X, Y, cv=cvn, scoring=scores[2])
        fscores = cross_val_score(clf, X, Y, cv=cvn, scoring=scores[3])
        print("\n {}-CV {}: {}: {} (+/- {}), {}: {}, {}: {}, {}: {}".format(
            cvn, name, scores[0], accscores.mean(), accscores.std(),
            scores[1], pscores.mean(), scores[2], rscores.mean(), scores[3],
            fscores.mean()))
        # Refit on the whole dataset for the report / tree export below.
        clffit = clf.fit(X, Y)
        y_pred = clffit.predict(X)
        if multilabel == False:
            print("Fitting on entire dataset (no CV):")
            cnf_matrix = metrics.confusion_matrix(Y, y_pred, labels=classes)
            print(cnf_matrix)
            print(metrics.classification_report(Y, y_pred, labels=classes))
        else:
            #my own scores for multilabel classification
            myscores = myCVAScore(clf, X, Y, cvn)
            print(
                "\n {}-CV {} (my own): {}: {}, {}: {}, {}: {}, {}: {}, {}: {}, {}: {}, {}: {}"
                .format(cvn, name, scores[0], myscores[0], scores[1],
                        myscores[1], scores[2], myscores[2], scores[3],
                        myscores[3], 'coverage', myscores[4], 'hamming loss',
                        myscores[5], 'jaccard', myscores[6]))
            print("Fitting on entire dataset (no CV):")
            print(' subset accuracy: {}'.format(accuracy_score(y_pred, Y)))
        #print decision tree
        # NOTE(review): import inside the loop; harmless (module caching)
        # but would normally live at the top of the file.
        from sklearn import tree
        if name == "Decision Tree" and multilabel == False:
            tree.export_graphviz(clffit,
                                 out_file='tree.dot',
                                 class_names=sorted(classes),
                                 feature_names=vec.get_feature_names())
        # Plot non-normalized confusion matrix
        #plt.figure()
        np.set_printoptions(precision=2)
        if plotconfusionmatrix:
            # NOTE(review): in the multilabel branch cnf_matrix is never
            # assigned in this loop, so this plots a stale matrix from an
            # earlier single-label run or raises NameError -- confirm intent.
            plot_confusion_matrix(cnf_matrix,
                                  classes=classes,
                                  title='Confusion matrix for ' + name)
def main():
    """Load image representations, preprocess them (label cleanup, scaling,
    PCA), then evaluate and hyper-parameter-tune several AdaBoost variants."""
    #dataset = pd.read_csv('good_representations_aug.csv')
    dataset = pd.read_csv('inception_representations_aug.csv')
    X = dataset.iloc[:, :-1].values  # all columns but the last are features
    y = dataset.iloc[:, -1].values   # last column: label as a file path
    all_objects = [
        "Vase", "Teapot", "Bottle", "Spoon", "Plate", "Mug", "Knife", "Fork",
        "Flask", "Bowl"
    ]
    #The csv has file path for class labels. This code cleans up this path and adds class name
    for index in range(y.size):
        for obj in all_objects:
            if obj in y[index]:
                y[index] = obj
                break
    #Converting labels from strings to integer encoded
    encoder = LabelEncoder()
    # NOTE(review): y_enc is never used below -- the train/test split uses
    # the string labels in y.
    y_enc = encoder.fit_transform(y)
    #Feature Scaling before PCA
    sc = StandardScaler()
    X = sc.fit_transform(X)
    #Dimensionality reduction with PCA
    #for GOOD 125 features accounts for 99.9% variance
    #pca = PCA(n_components=230)
    #for inception 500 features accounts for 99.9% variance
    pca = PCA(n_components=500)
    X = pca.fit_transform(X)
    #Dataset is imbalanced. To account for this, we upsample the representations to get new samples
    sample_count = 100
    # NOTE(review): `upsampled_dataset` is not defined anywhere in this
    # function -- the upsampling step that should produce it (presumably
    # using sample_count) appears to be missing; as written the next two
    # lines raise NameError.  They also discard the scaled/PCA-reduced X.
    #The upsampled dataset is split into Dependent and independent variables
    X = upsampled_dataset.iloc[:, :-1].values
    y = upsampled_dataset.iloc[:, -1].values
    # Splitting the dataset into the Training set and Test set
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=0)
    # AdaBoost with several different base estimators.
    ada_base = AdaBoostClassifier()
    ada_deci = AdaBoostClassifier(DecisionTreeClassifier())
    ada_extr = AdaBoostClassifier(ExtraTreeClassifier())
    ada_logr = AdaBoostClassifier(LogisticRegression())
    ada_svml = AdaBoostClassifier(SVC(probability=True, kernel='linear'))
    models = [ada_base, ada_deci, ada_extr, ada_logr, ada_svml]
    model_names = [
        'Base', 'DecisonTree', 'ExtraTree', 'LogisticRegression', 'SVM'
    ]
    s = ['accuracy']
    # 10-fold evaluation of the untuned models.
    r = fill_results_df(models, model_names, s, X_train, X_test, y_train,
                        y_test, 10)
    print('Results from untuned classifiers', r)
    # Find best parameters -- each variant is cloned so tuning starts from
    # a fresh, unfitted estimator.
    base = clone(ada_base)
    ada_base_hyperparameter_tuning(base, X, y)
    deci = clone(ada_deci)
    ada_deci_hyperparameter_tuning(deci, X, y)
    extr = clone(ada_extr)
    ada_extr_hyperparameter_tuning(extr, X, y)
    svml = clone(ada_svml)
    ada_svml_hyperparameter_tuning(svml, X, y)
    logr = clone(ada_logr)
    ada_logr_hyperparameter_tuning(logr, X, y)
def do_classification(clm, data_fname, clm_type):
    """Sweep a battery of classifiers and hyper-parameters over the data in
    `data_fname`, writing every cross-validation result to a timestamped CSV
    under ./output.

    clm       -- column names; clm[:-2] are features, clm[-2:] the two targets
    data_fname -- path to the input CSV
    clm_type  -- tag embedded in the output file name

    Fixes: output file handle is now closed on any exception (`with`), the
    two bare `except:` clauses no longer swallow SystemExit/KeyboardInterrupt,
    and the four duplicated SVM sections share one sweep loop.
    """
    d_name = "output"
    if os.path.isdir(d_name) is False:
        os.mkdir(d_name)
    fname = os.path.basename(data_fname).replace('.csv', '')
    fn = 'result_' + fname + "_type" + str(clm_type) + "_" +\
        datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S') + '.csv'
    csv_out_fname = os.path.join(d_name, fn)  # fixed 'fnamee' typo

    # `with` guarantees the CSV is flushed/closed even if a sweep raises.
    with open(csv_out_fname, 'w') as fi:
        csv_out = csv.writer(fi, delimiter=',')

        # Create dataframe for training; drop physiologically implausible rows.
        base_df = pd.read_csv(data_fname)
        df = base_df[clm]
        df = df[df['heartRate'] > 40]
        df = df[df['skinTemperature'] > 10]
        df = df[df['met'] > 0.4]
        X_train = df[clm[:-2]]
        # Two target columns; every sweep runs once per target (t in [0, 1]).
        Y_train = [df[clm[-2]], df[clm[-1]]]

        # Model: Decision Tree -- sweep max_depth
        ML_NAME = 'Decision Tree'
        depth_list = np.concatenate(
            (np.arange(1, 10), np.arange(10, 20, 2), np.arange(20, 50, 5),
             np.arange(50, 100, 10), np.arange(150, 1000, 50)))
        for t in [0, 1]:
            for depth in depth_list:
                clf = DecisionTreeClassifier(class_weight=None,
                                             criterion='entropy',
                                             max_depth=depth,
                                             max_features=None,
                                             max_leaf_nodes=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             presort=False,
                                             random_state=None,
                                             splitter='best')
                do_cross_validation(ML_NAME, clf, X_train, Y_train[t], t,
                                    csv_out, fname, depth)

        # Model: Extra Tree Classifier (no hyper-parameter sweep)
        ML_NAME = 'Extremely randomized tree classifier'
        for t in [0, 1]:
            clf = ExtraTreeClassifier()
            do_cross_validation(ML_NAME, clf, X_train, Y_train[t], t, csv_out,
                                fname, 0)

        # Model: Gaussian Naive Bayes
        ML_NAME = 'Gaussian Naive Bayes'
        for t in [0, 1]:
            clf = GaussianNB(priors=None)
            do_cross_validation(ML_NAME, clf, X_train, Y_train[t], t, csv_out,
                                fname, 0)

        # Model: Multivariate Bernoulli -- sweep the smoothing alpha
        ML_NAME = 'Multivariate Bernoulli Model'
        alphas = np.concatenate(
            (np.arange(0.1, 1, 0.2), np.arange(1, 10), np.arange(10, 20, 2),
             np.arange(20, 50, 5), np.arange(50, 150, 10)))
        for t in [0, 1]:
            for a in alphas:
                clf = BernoulliNB(alpha=a)
                do_cross_validation(ML_NAME, clf, X_train, Y_train[t], t,
                                    csv_out, fname, a)

        # Model: AdaBoost -- sweep the number of estimators
        ML_NAME = 'AdaBoost classifier'
        noestimator = np.arange(5, 1000, 20)
        for t in [0, 1]:
            for n in noestimator:
                clf = AdaBoostClassifier(algorithm='SAMME.R',
                                         base_estimator=None,
                                         learning_rate=0.1,
                                         n_estimators=n,
                                         random_state=None)
                do_cross_validation(ML_NAME, clf, X_train, Y_train[t], t,
                                    csv_out, fname, n)

        # Model: Gradient Boosting -- sweep the number of estimators
        ML_NAME = 'Gradient Boosting Classifier'
        noestimator = np.arange(5, 1000, 20)
        for t in [0, 1]:
            for n in noestimator:
                clf = GradientBoostingClassifier(criterion='friedman_mse',
                                                 init=None,
                                                 learning_rate=0.1,
                                                 loss='deviance',
                                                 max_depth=4,
                                                 max_features=None,
                                                 max_leaf_nodes=None,
                                                 min_impurity_decrease=0.0,
                                                 min_impurity_split=None,
                                                 min_samples_leaf=1,
                                                 min_samples_split=2,
                                                 min_weight_fraction_leaf=0.0,
                                                 n_estimators=n,
                                                 presort='auto',
                                                 random_state=None,
                                                 subsample=1.0,
                                                 verbose=0,
                                                 warm_start=False)
                do_cross_validation(ML_NAME, clf, X_train, Y_train[t], t,
                                    csv_out, fname, n)

        # Model: Random Forest -- sweep the number of estimators
        ML_NAME = 'Random Forest Classifier'
        noestimator = np.concatenate(
            (np.arange(1, 10), np.arange(10, 20, 2), np.arange(20, 50, 5),
             np.arange(50, 150, 10), np.arange(150, 1000, 50)))
        for t in [0, 1]:
            for n in noestimator:
                clf = RandomForestClassifier(n_estimators=n)
                do_cross_validation(ML_NAME, clf, X_train, Y_train[t], t,
                                    csv_out, fname, n)

        # Models: Support Vector Machines -- sweep the C penalty for each of
        # the four kernels (one shared loop replaces four duplicated blocks).
        c_values = np.concatenate(
            (np.arange(0.1, 1, 0.2), np.arange(1, 10), np.arange(10, 20, 2),
             np.arange(20, 50, 5), np.arange(50, 150, 10)))
        for kernel, label in (('rbf', 'RBF'), ('poly', 'poly'),
                              ('sigmoid', 'Sigmoid'), ('linear', 'Linear')):
            ML_NAME = 'Support Vector Machines - ' + label
            for t in [0, 1]:
                for c in c_values:
                    clf = SVC(C=c, kernel=kernel)
                    do_cross_validation(ML_NAME, clf, X_train, Y_train[t], t,
                                        csv_out, fname, c)

        # Model: KNeighborsClassifier -- sweep k; values that raise (e.g.
        # k larger than the sample count) are skipped.
        ML_NAME = 'KNeighborsClassifier'
        n_neighbors = np.concatenate(
            (np.arange(1, 10), np.arange(10, 20, 2), np.arange(20, 50, 5),
             np.arange(50, 150, 10)))
        for t in [0, 1]:
            for n in n_neighbors:
                try:
                    clf = KNeighborsClassifier(n_neighbors=n)
                    do_cross_validation(ML_NAME, clf, X_train, Y_train[t], t,
                                        csv_out, fname, n)
                except Exception:  # was bare except: keep Ctrl-C working
                    pass

        # Model: Radius Neighbors -- sweep the radius; failures skipped as above.
        ML_NAME = 'Radius Neighbors Classifier'
        n_neighbors = np.concatenate(
            (np.arange(1, 10), np.arange(10, 20, 2), np.arange(20, 50, 5),
             np.arange(50, 150, 10)))
        for t in [0, 1]:
            for n in n_neighbors:
                try:
                    clf = RadiusNeighborsClassifier(radius=n)
                    do_cross_validation(ML_NAME, clf, X_train, Y_train[t], t,
                                        csv_out, fname, n)
                except Exception:
                    pass

        # Model: NearestCentroid
        ML_NAME = 'Nearest Centroid Classifier'
        for t in [0, 1]:
            clf = NearestCentroid()
            do_cross_validation(ML_NAME, clf, X_train, Y_train[t], t, csv_out,
                                fname, 0)
# Four models, all seeded with random_state=100 for reproducibility.
clf_gini = DecisionTreeClassifier(random_state=100, max_depth=3,
                                  min_samples_leaf=5)          # gini tree
clf_entropy = DecisionTreeClassifier(criterion='entropy', random_state=100,
                                     max_depth=3, min_samples_leaf=5)
clf_svc = svm.SVC(random_state=100, kernel='poly')             # poly-kernel SVM
clf_ext = ExtraTreeClassifier(random_state=100, max_depth=3,
                              min_samples_leaf=5)              # extra tree

# Train every model on the same split.
for _model in (clf_gini, clf_entropy, clf_svc, clf_ext):
    _model.fit(X_train, y_train)

# Test-set predictions, one vector per model.
y_pred_gi = clf_gini.predict(X_test)
y_pred_en = clf_entropy.predict(X_test)
y_pred_sv = clf_svc.predict(X_test)
y_pred_et = clf_ext.predict(X_test)

# Report accuracy as percentages.
for _label, _preds in (("Gini", y_pred_gi), ("Entropy", y_pred_en),
                       ("SVM", y_pred_sv), ("Extra tree", y_pred_et)):
    print("{} accuracy score: ".format(_label),
          accuracy_score(y_test, _preds) * 100)

print(y_test)
print(y_pred_sv)
# print(multilabel_confusion_matrix(y_test_og, preds))
clear_session()

# Fit each classical baseline on the same split and report its accuracy,
# clearing the Keras session between models.
for _label, classifier in (("SVC", svm.SVC()),
                           ("LinearSVC", svm.LinearSVC()),
                           ("ExtraTreeClassifier", ExtraTreeClassifier())):
    classifier.fit(X_train, y_train_og)
    preds = classifier.predict(X_test)
    print("{} Accuracy:".format(_label), accuracy_score(y_test_og, preds))
    # print("Confusion Matrix:")
    # print(multilabel_confusion_matrix(y_test_og, preds))
    clear_session()
def ML_model_Geno(args):
    """Train five model families on each available genotype matrix and save
    their class-1 probabilities for the validation and test splits.

    Output files keep the original naming scheme:
    <outpath>/ML_Geno/<s>_<t>_<model>_{valid,test}.npy  where s indexes the
    genotype p-value threshold (5E3..5E6) and t the hyper-parameter setting.

    Fixes: directory creation uses os.makedirs instead of shelling out to
    `mkdir`; the five copy-pasted fit/predict_proba/save sections share one
    helper.
    """
    out_dir = args.outpath + '/ML_Geno'
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)  # was os.system('mkdir ...'): fails on spaces/parents

    disease_train = np.load(args.outpath + '/train/disease_train.npy')

    # Per-t hyper-parameter grids (index t-1).
    Cxlist_NN = [(20, 20, 20), (30, 30), (10, 10, 10, 10), (30, 20, 10)]
    Cxlist_ada = [DecisionTreeClassifier(), LogisticRegression(),
                  ExtraTreeClassifier(), GaussianNB()]
    Cxlist_GB = [0.0001, 0.001, 0.01, 0.1]
    Cxlist_LR = [0.0001, 0.001, 0.01, 0.1]
    Cxlist_RF = [0.0001, 0.001, 0.01, 0.1]

    # Only the thresholds whose training matrix actually exists are run.
    suffixes = {1: '5E3', 2: '5E4', 3: '5E5', 4: '5E6'}
    slist = [s for s in (1, 2, 3, 4)
             if os.path.exists(args.outpath + '/train/genotype_train_' +
                               suffixes[s] + '.npy')]

    for s in slist:
        suf = suffixes[s]
        Geno_train = np.load(args.outpath + '/train/genotype_train_' + suf + '.npy')
        Geno_valid = np.load(args.outpath + '/valid/genotype_valid_' + suf + '.npy')
        Geno_test = np.load(args.outpath + '/test/genotype_test_' + suf + '.npy')

        for t in [1, 2, 3, 4]:

            def save_probs(model, tag):
                # Fit on train, save P(class==1) for valid and test splits.
                model.fit(Geno_train, disease_train)
                prefix = out_dir + '/' + str(s) + '_' + str(t) + '_' + tag + '_'
                np.save(prefix + 'valid.npy',
                        model.predict_proba(Geno_valid)[:, 1])
                np.save(prefix + 'test.npy',
                        model.predict_proba(Geno_test)[:, 1])

            save_probs(MLPClassifier(hidden_layer_sizes=Cxlist_NN[t - 1],
                                     max_iter=1000), 'NN')
            save_probs(AdaBoostClassifier(base_estimator=Cxlist_ada[t - 1]),
                       'ada')
            save_probs(GradientBoostingClassifier(
                min_impurity_decrease=Cxlist_GB[t - 1]), 'GB')
            save_probs(LogisticRegression(penalty='l1', C=Cxlist_LR[t - 1],
                                          max_iter=10000), 'LR')
            save_probs(RandomForestClassifier(
                min_impurity_decrease=Cxlist_RF[t - 1]), 'RF')
def main():
    """Interactively train one of many scikit-learn models on troll-tweet CSVs.

    usage: ./troll_identifier.py [TRAIN DATASET] [TEST/DEV DATASET]

    The user picks a learning type (supervised / semi-supervised / unsupervised),
    then a method and a concrete model; the script fits it on the training CSV
    and reports per-row predictions plus an accuracy summary.

    BUG FIXES vs. the original:
    - ``input()`` returns ``str`` in Python 3; the original compared it to int
      literals, so no menu branch could ever match.  Every selection is now
      cast with ``int()``.
    - The local ``type`` shadowed the builtin; renamed to ``learning_type``.
    - The report file wrote a 4-column header but 3-column rows, used
      ``y_train[i]`` where ``y_test[i]`` was meant, and wrote no newlines.
    - ``print('centroids: ' + ndarray)`` raised TypeError; now formatted.
    - The nearest-neighbors branch referenced undefined ``nbrs``/``X`` and the
      accuracy loop crashed for non-clustering unsupervised methods.
    """
    # Checks for correct number of arguments.
    if len(sys.argv) != 3:
        print('usage: ./troll_identifier.py [TRAIN DATASET] [TEST/DEV DATASET]')
        sys.exit()

    # Set up dataset: columns 0 and 1 are identifiers, last column is the label.
    data_train = pd.read_csv(sys.argv[1])
    data_test = pd.read_csv(sys.argv[2])
    print('train: {}'.format(sys.argv[1]))
    print('test: {}'.format(sys.argv[2]))
    x_train = data_train.drop(
        [data_train.columns[0], data_train.columns[1], data_train.columns[-1]],
        axis=1).apply(pd.to_numeric, errors='ignore')
    y_train = pd.Series(data_train.iloc[:, -1])
    x_test = data_test.drop(
        [data_test.columns[0], data_test.columns[1], data_test.columns[-1]],
        axis=1).apply(pd.to_numeric, errors='ignore')
    y_test = pd.Series(data_test.iloc[:, -1])

    learning_type = int(input('type: [1: supervised, 2: semi-supervised, 3: unsupervised] '))
    if learning_type == 1:
        method = int(input('method: [1: classification, 2: regression] '))
        if method == 1:
            classifier = int(input(
                'classifier: [1: decision tree, 2: extra tree, 3: extra trees, 4: k nearest neighbor, 5: naive bayes, 6: radius neighbors, 7: random forest, 8: support vector machine, 9: gradient boosting, 10: gaussian process, 11: stochastic gradient descent, 12: passive aggressive, 13: nearest centroid, 14: perceptron, 15: multi-layer perceptron, 16: ada boost] '
            ))
            if classifier == 1:
                criterion = int(input('criterion: [1: gini, 2: entropy] '))
                if criterion == 1:
                    print(learning_type, method, classifier, criterion)
                    model = DecisionTreeClassifier(criterion='gini')
                elif criterion == 2:
                    print(learning_type, method, classifier, criterion)
                    model = DecisionTreeClassifier(criterion='entropy')
                else:
                    print('no criterion chosen')
                    exit()
            elif classifier == 2:
                print(learning_type, method, classifier)
                model = ExtraTreeClassifier()
            elif classifier == 3:
                print(learning_type, method, classifier)
                model = ExtraTreesClassifier()
            elif classifier == 4:
                n = int(input('n: [1: 1, 2: 3: 3: 5] '))
                if n == 1:
                    print(learning_type, method, classifier, n)
                    model = KNeighborsClassifier(n_neighbors=1)
                elif n == 2:
                    print(learning_type, method, classifier, n)
                    model = KNeighborsClassifier(n_neighbors=3)
                elif n == 3:
                    print(learning_type, method, classifier, n)
                    model = KNeighborsClassifier(n_neighbors=5)
                else:
                    print('no n chosen')
                    exit()
            elif classifier == 5:
                version = int(input(
                    'version: [1: gaussian, 2: bernoulli, 3: multinomial, 4: complement] '
                ))
                if version == 1:
                    print(learning_type, method, classifier, version)
                    model = GaussianNB()
                elif version == 2:
                    print(learning_type, method, classifier, version)
                    model = BernoulliNB()
                elif version == 3:
                    print(learning_type, method, classifier, version)
                    model = MultinomialNB()
                elif version == 4:
                    print(learning_type, method, classifier, version)
                    model = ComplementNB()
                else:
                    print('no version chosen')
                    exit()
            elif classifier == 6:
                print(learning_type, method, classifier)
                model = RadiusNeighborsClassifier(radius=1.0)
            elif classifier == 7:
                print(learning_type, method, classifier)
                model = RandomForestClassifier(n_estimators=50, random_state=1)
            elif classifier == 8:
                print(learning_type, method, classifier)
                model = LinearSVC(multi_class='crammer_singer')  # multi_class='ovr'
            elif classifier == 9:
                print(learning_type, method, classifier)
                model = GradientBoostingClassifier()
            elif classifier == 10:
                print(learning_type, method, classifier)
                model = GaussianProcessClassifier(multi_class='one_vs_one')
                # model = GaussianProcessClassifier(multi_class='one_vs_rest')
            elif classifier == 11:
                print(learning_type, method, classifier)
                model = SGDClassifier()
            elif classifier == 12:
                print(learning_type, method, classifier)
                model = PassiveAggressiveClassifier()
            elif classifier == 13:
                print(learning_type, method, classifier)
                model = NearestCentroid()
            elif classifier == 14:
                print(learning_type, method, classifier)
                model = Perceptron(tol=1e-3, random_state=0)
            elif classifier == 15:
                print(learning_type, method, classifier)
                model = MLPClassifier()
            elif classifier == 16:
                print(learning_type, method, classifier)
                model = AdaBoostClassifier(n_estimators=100)
            else:
                print('no classifier chosen')
                exit()

            # Train the model using the training sets and check score.
            model.fit(x_train, y_train)
            model.score(x_train, y_train)
            # Predict output.
            predictions = pd.Series(model.predict(x_test))

            filename = '{},{},{}.txt'.format(learning_type, method, classifier)
            with open(filename, 'w') as output:
                # Consistent 3-column report: true label, prediction, match flag.
                output.write('{:10}\t{:10}\t{:10}\n'.format('actual', 'predict', 'match?'))
                for i in range(len(predictions)):
                    match = y_test[i] == predictions[i]
                    output.write('{:10}\t{:10}\t{:10}\n'.format(
                        y_test[i], predictions[i], str(match)))
                output.write('accuracy: {:7.2f}%\n'.format(
                    100 * accuracy_score(y_test, predictions)))
            print('accuracy: {:7.2f}%'.format(
                100 * accuracy_score(y_test, predictions)))
            print(classification_report(
                y_test, predictions,
                target_names=['RightTroll', 'LeftTroll', 'Other']))
            print(confusion_matrix(
                y_test, predictions, labels=["RightTroll", "LeftTroll", "Other"]))
        elif method == 2:
            # transform into binary classification problem
            # y_train = y_train.apply(lambda x: 0 if x == 'Other' else 1)
            # y_test = y_test.apply(lambda x: 0 if x == 'Other' else 1)
            # transform string labels into integers
            # le = LabelEncoder()
            # le.fit(y_train)
            # y_train = le.transform(y_train)
            # y_test = le.transform(y_test)
            regressor = int(input(
                'regressor: [1: linear discriminant analysis, 2: logistic regression, 3: ridge regression, 4: quadratic discriminant analysis, 5: linear regression, 6: decision tree regression, 7: pls regression, 8: pls canonical, 9: canonical correlation analysis, 10: lasso, 11: multi-task lasso, 12: elastic net, 13: multi-task elastic net, 14: least angle regression, 15: least angle regression lasso, 16: orthogonal matching pursuit, 17: bayesian ridge, 18: automatic relevence determination, 19: theil sen regression, 20: huber regressor, 21: random sample consensus] '
            ))
            if regressor == 1:
                print(learning_type, method, regressor)
                model = LinearDiscriminantAnalysis()
            elif regressor == 2:
                print(learning_type, method, regressor)
                model = LogisticRegression(
                    solver='lbfgs', multi_class='multinomial')  # 'newton-cg'
            elif regressor == 3:
                print(learning_type, method, regressor)
                model = RidgeClassifier()
            elif regressor == 4:
                print(learning_type, method, regressor)
                model = QuadraticDiscriminantAnalysis()
            elif regressor == 5:
                strategy = int(input('strategy: [1: one vs rest, 2: one vs one] '))
                if strategy == 1:
                    print(learning_type, method, strategy, regressor)
                    model = OneVsRestClassifier(LinearRegression())
                elif strategy == 2:
                    print(learning_type, method, strategy, regressor)
                    model = OneVsOneClassifier(LinearRegression())
                else:
                    print('no strategy selected')
                    exit()
            elif regressor == 6:
                strategy = int(input('strategy: [1: one vs rest, 2: one vs one] '))
                if strategy == 1:
                    print(learning_type, method, strategy, regressor)
                    model = OneVsRestClassifier(DecisionTreeRegressor())
                elif strategy == 2:
                    print(learning_type, method, strategy, regressor)
                    model = OneVsOneClassifier(DecisionTreeRegressor())
                else:
                    print('no strategy selected')
                    exit()
            elif regressor == 7:
                print(learning_type, method, regressor)
                model = PLSRegression(n_components=2)
            elif regressor == 8:
                print(learning_type, method, regressor)
                model = PLSCanonical(n_components=2)
            elif regressor == 9:
                print(learning_type, method, regressor)
                model = CCA(n_components=1)
            elif regressor == 10:
                print(learning_type, method, regressor)
                model = Lasso(alpha=0.1)
            elif regressor == 11:
                print(learning_type, method, regressor)
                model = MultiTaskLasso(alpha=0.1)
            elif regressor == 12:
                print(learning_type, method, regressor)
                model = ElasticNet(random_state=0)
            elif regressor == 13:
                print(learning_type, method, regressor)
                model = MultiTaskElasticNet(random_state=0)
            elif regressor == 14:
                print(learning_type, method, regressor)
                model = Lars(n_nonzero_coefs=1)
            elif regressor == 15:
                print(learning_type, method, regressor)
                model = LassoLars(alpha=.1)
            elif regressor == 16:
                print(learning_type, method, regressor)
                model = OrthogonalMatchingPursuit()
            elif regressor == 17:
                print(learning_type, method, regressor)
                model = BayesianRidge()
            elif regressor == 18:
                print(learning_type, method, regressor)
                model = ARDRegression()
            elif regressor == 19:
                print(learning_type, method, regressor)
                model = TheilSenRegressor(random_state=0)
            elif regressor == 20:
                print(learning_type, method, regressor)
                model = HuberRegressor()
            elif regressor == 21:
                print(learning_type, method, regressor)
                model = RANSACRegressor(random_state=0)
            else:
                print('no regressor chosen')
                exit()

            # Train the model using the training sets and check score.
            model.fit(x_train, y_train)
            model.score(x_train, y_train)
            # print('coefficient:', model.coef_)
            # print('intercept:', model.intercept_)
            # Predict output.
            predictions = pd.Series(model.predict(x_test))
            print('{:10}\t{:10}\t{:10}'.format('actual', 'predict', 'match?'))
            # Calculate accuracy by exact match against the test labels.
            numerator = 0.0
            denominator = float(len(predictions))
            for i in range(len(predictions)):
                match = y_test[i] == predictions[i]
                numerator += 1 if match else 0
                print('{:10}\t{:10}\t{:10}'.format(y_test[i], predictions[i], str(match)))
            print('accuracy = {:7.2f}%'.format(100 * numerator / denominator))
        else:
            print('no method chosen')
            exit()
    elif learning_type == 2:
        classifier = int(input('classifier: [1: label propagation, 2: label spreading] '))
        if classifier == 1:
            print(learning_type, classifier)
            model = LabelPropagation()
        elif classifier == 2:
            print(learning_type, classifier)
            model = LabelSpreading()
        else:
            print('no classifier chosen')
            exit()
        # Train the model using the training sets and check score.
        model.fit(x_train, y_train)
        model.score(x_train, y_train)
        # Predict output.
        predictions = pd.Series(model.predict(x_test))
        print('{:10}\t{:10}\t{:10}'.format('actual', 'predict', 'match?'))
        numerator = 0.0
        denominator = float(len(predictions))
        for i in range(len(predictions)):
            match = y_test[i] == predictions[i]
            numerator += 1 if match else 0
            print('{:10}\t{:10}\t{:10}'.format(y_test[i], predictions[i], str(match)))
        print('accuracy = {:7.2f}%'.format(100 * numerator / denominator))
    elif learning_type == 3:
        method = int(input(
            'method: [1: clustering, 2: random trees embedding, 3: nearest neighbors] '
        ))
        predictions = None  # only the clustering branch yields label-like output
        if method == 1:
            clusterer = int(input('clustere: [1: k means]'))
            if clusterer == 1:
                clusters = int(input('clusters: [1: 1, 2: 2, 3: 3] '))
                if clusters == 1:
                    print(learning_type, method, clusters)
                    model = KMeans(n_clusters=1, random_state=0)
                elif clusters == 2:
                    print(learning_type, method, clusters)
                    model = KMeans(n_clusters=2, random_state=0)
                elif clusters == 3:
                    print(learning_type, method, clusters)
                    model = KMeans(n_clusters=3, random_state=0)
                else:
                    print('no clusters chosen')
                    exit()
            else:
                print('no clusterer chosen')
                exit()
            # Train the model using the training sets and check score.
            model.fit(x_train)
            # NOTE(review): cluster ids are arbitrary ints; comparing them to the
            # string labels below only "matches" by accident — confirm intent.
            predictions = model.predict(x_test)
            print('{:10}\t{:10}\t{:10}'.format('actual', 'predict', 'match?'))
            print('centroids: {}'.format(model.cluster_centers_))
            # print('labels: {}'.format(model.labels_))
        elif method == 2:
            model = RandomTreesEmbedding()
            model.fit(x_train)
            # apply() returns per-tree leaf indices, not class labels; no
            # label-accuracy can be computed for this method.
            leaf_indices = model.apply(x_test)
            print('leaf indices: {}'.format(leaf_indices))
        elif method == 3:
            model = NearestNeighbors(n_neighbors=2, algorithm='ball_tree')
            model.fit(x_train)
            distances, indices = model.kneighbors(x_test)
            print('distances: {}'.format(distances))
            print('indices: {}'.format(indices))
        else:
            print('no method chosen')
            exit()
        # Accuracy is only meaningful when label-like predictions exist.
        if predictions is not None:
            numerator = 0.0
            denominator = float(len(predictions))
            for i in range(len(predictions)):
                match = y_test[i] == predictions[i]
                numerator += 1 if match else 0
                print('{:10}\t{:10}\t{:10}'.format(y_test[i], predictions[i], str(match)))
            print('accuracy = {:7.2f}%'.format(100 * numerator / denominator))
    else:
        print('no type chosen')
        exit()
#%% for col in cols: hotlab[col] = data[col] promoted = data[["is_promoted"]] #%% x_train, x_test, y_train, y_test = train_test_split(hotlab, promoted) sm = SMOTE(random_state=20) train_input_new, train_output_new = sm.fit_sample(x_train, y_train) #%% class1 = ExtraTreeClassifier() class1.fit(x_train, y_train) pred1 = class1.predict(x_test) score = f1_score(y_test, pred1) #%% confussion = confusion_matrix(y_test, pred1) #%% #For submission submission_data = pd.read_csv("D:\\Hackathons\\Promotion\\test_2umaH9m.csv") #%% submission_data["education"] = submission_data["education"].fillna("Unknown") submission_data["previous_year_rating"] = submission_data["previous_year_rating"].fillna(np.mean(submission_data["previous_year_rating"]))
class stacked_generalization():
    """Stacked generalization (stacking) ensemble.

    Tier-1 base classifiers (logistic regression, multinomial NB, linear SVC,
    extra tree) are trained on B-1 of B stratified blocks of the training data;
    their predictions on the held-out block become the meta-features used to
    train a Tier-2 (meta) classifier.

    BUG FIX vs. the original: the cv_scores dict returned by
    ``train_stacked_generalization_CV`` mapped "test_precision_weighted" to the
    recall array and "test_recall_weighted" to the precision array — the two
    keys were swapped.
    """

    def __init__(self, data, target):
        self.data = data
        if len(target.shape) == 2:
            # Convert 2-dim target array into 1-dim target array.
            self.target = target.reshape(target.shape[0])
        else:
            self.target = target
        # Per-fold train/test slices, set by train_stacked_generalization_CV.
        self.training_data = None
        self.training_target = None
        self.test_data = None
        self.test_target = None
        # Construct Tier-1 (base) classifiers.
        self.Tier1_classifier1 = LogisticRegression(solver="lbfgs")
        self.Tier1_classifier2 = MultinomialNB()
        self.Tier1_classifier3 = LinearSVC(penalty="l2")
        self.Tier1_classifier4 = ExtraTreeClassifier()
        # self.Tier1_classifier5 = SGDClassifier(max_iter=1000, tol=1e-3)
        # Construct Tier-2 (meta) classifier.
        # Alternatives tried: LogisticRegression, MultinomialNB, LinearSVC,
        # XGBClassifier, RandomForestClassifier.
        self.meta_classifier = ExtraTreeClassifier()

    def TrainingData_Stratified_KFold_split(self, n_split=5, shuffle=False):
        """Partition the current training fold into n_split stratified blocks.

        Returns parallel lists (data, target) of the B-1-block training sets
        and the single-block evaluation sets.  n_splits cannot be greater than
        the number of members in each class.
        """
        skf_blocks = StratifiedKFold(n_splits=n_split, shuffle=shuffle)
        training_blocks_index = []
        evaluation_blocks_index = []
        for trainingBlock_index, evaluationBlock_index in skf_blocks.split(
                self.training_data, self.training_target):
            training_blocks_index.append(trainingBlock_index)
            evaluation_blocks_index.append(evaluationBlock_index)
        training_blocks_data = [
            self.training_data[index, :] for index in training_blocks_index
        ]
        training_blocks_target = [
            self.training_target[index] for index in training_blocks_index
        ]
        evaluation_blocks_data = [
            self.training_data[index, :] for index in evaluation_blocks_index
        ]
        evaluation_blocks_target = [
            self.training_target[index] for index in evaluation_blocks_index
        ]
        return training_blocks_data, training_blocks_target, evaluation_blocks_data, evaluation_blocks_target

    def train_meta_classifier(self):
        """Train the meta classifier on out-of-fold Tier-1 predictions."""
        training_blocks_data, training_blocks_target, evaluation_blocks_data, evaluation_blocks_target = self.TrainingData_Stratified_KFold_split(
        )
        # The classification outputs of all Tier-1 classifiers on each
        # evaluation block are collected here.
        Tier1_outputs = []
        for block in range(len(training_blocks_data)):
            # All Tier-1 base classifiers fit the B-1 training blocks.
            self.Tier1_classifier1.fit(training_blocks_data[block],
                                       training_blocks_target[block])
            self.Tier1_classifier2.fit(training_blocks_data[block],
                                       training_blocks_target[block])
            self.Tier1_classifier3.fit(training_blocks_data[block],
                                       training_blocks_target[block])
            self.Tier1_classifier4.fit(training_blocks_data[block],
                                       training_blocks_target[block])
            # self.Tier1_classifier5.fit(training_blocks_data[block], training_blocks_target[block])
            # Predict on the held-out evaluation block; each output is reshaped
            # to a column vector so the per-classifier columns can be stacked.
            output_C1 = self.Tier1_classifier1.predict(
                evaluation_blocks_data[block])
            output_C1 = output_C1.reshape(output_C1.shape[0], 1)
            output_C2 = self.Tier1_classifier2.predict(
                evaluation_blocks_data[block])
            output_C2 = output_C2.reshape(output_C2.shape[0], 1)
            output_C3 = self.Tier1_classifier3.predict(
                evaluation_blocks_data[block])
            output_C3 = output_C3.reshape(output_C3.shape[0], 1)
            output_C4 = self.Tier1_classifier4.predict(
                evaluation_blocks_data[block])
            output_C4 = output_C4.reshape(output_C4.shape[0], 1)
            # output_C5 = self.Tier1_classifier5.predict(evaluation_blocks_data[block])
            # output_C5 = output_C5.reshape(output_C5.shape[0], 1)
            block_outputs = np.hstack((output_C1, output_C2, output_C3,
                                       output_C4))  # horizontally combined
            Tier1_outputs.append(block_outputs)
        # Vertically combine all blocks' Tier-1 outputs (np.vstack accepts a list),
        # and concatenate the matching real labels.
        Tier1_outputs = np.vstack(Tier1_outputs)
        evaluation_blocks_target = np.concatenate([
            eva_block_target for eva_block_target in evaluation_blocks_target
        ])
        # The meta classifier learns to map Tier-1 predictions to true labels.
        self.meta_classifier.fit(Tier1_outputs, evaluation_blocks_target)
        print("The training of meta classifier is finished")

    def train_stacked_generalization_CV(self, n_split=5, shuffle=False):
        """Evaluate the whole stack with stratified cross-validation.

        Returns a dict of per-fold accuracy / precision / recall arrays.
        n_splits cannot be greater than the number of members in each class.
        """
        skf_cv = StratifiedKFold(n_splits=n_split, shuffle=shuffle)
        # Create the per-fold train/test index lists.
        training_sets_index = []
        test_sets_index = []
        for training_index, test_index in skf_cv.split(self.data, self.target):
            training_sets_index.append(training_index)
            test_sets_index.append(test_index)
        training_sets_data = [
            self.data[index, :] for index in training_sets_index
        ]
        training_sets_target = [
            self.target[index] for index in training_sets_index
        ]
        test_sets_data = [self.data[index, :] for index in test_sets_index]
        test_sets_target = [self.target[index] for index in test_sets_index]
        # Per-fold metric accumulators.
        test_cv_accuracy = []
        test_cv_recall = []
        test_cv_precision = []
        time_start = time.time()  # start time
        for cv_time in range(n_split):
            self.training_data = training_sets_data[cv_time]
            self.training_target = training_sets_target[cv_time]
            self.test_data = test_sets_data[cv_time]
            self.test_target = test_sets_target[cv_time]
            # Train the meta classifier on out-of-fold Tier-1 predictions.
            self.train_meta_classifier()
            # Retrain all Tier-1 base classifiers on the full training fold.
            self.Tier1_classifier1.fit(self.training_data, self.training_target)
            self.Tier1_classifier2.fit(self.training_data, self.training_target)
            self.Tier1_classifier3.fit(self.training_data, self.training_target)
            self.Tier1_classifier4.fit(self.training_data, self.training_target)
            # self.Tier1_classifier5.fit(self.training_data, self.training_target)
            # Tier-1 predictions on the test fold, as column vectors.
            testset_output_C1 = self.Tier1_classifier1.predict(self.test_data)
            testset_output_C1 = testset_output_C1.reshape(
                testset_output_C1.shape[0], 1)
            testset_output_C2 = self.Tier1_classifier2.predict(self.test_data)
            testset_output_C2 = testset_output_C2.reshape(
                testset_output_C2.shape[0], 1)
            testset_output_C3 = self.Tier1_classifier3.predict(self.test_data)
            testset_output_C3 = testset_output_C3.reshape(
                testset_output_C3.shape[0], 1)
            testset_output_C4 = self.Tier1_classifier4.predict(self.test_data)
            testset_output_C4 = testset_output_C4.reshape(
                testset_output_C4.shape[0], 1)
            # testset_output_C5 = self.Tier1_classifier5.predict(self.test_data)
            # testset_output_C5 = testset_output_C5.reshape(testset_output_C5.shape[0], 1)
            testset_outputs_Tier1 = np.hstack(
                (testset_output_C1, testset_output_C2, testset_output_C3,
                 testset_output_C4))
            # The meta classifier predicts test labels from Tier-1 outputs.
            testset_outputs_meta = self.meta_classifier.predict(
                testset_outputs_Tier1)
            # Round predictions (needed when the meta classifier regresses,
            # e.g. xgboost).
            testset_outputs_meta = np.round(testset_outputs_meta)
            test_cv_accuracy.append(
                accuracy_score(self.test_target, testset_outputs_meta))
            test_cv_recall.append(
                recall_score(self.test_target, testset_outputs_meta))
            test_cv_precision.append(
                precision_score(self.test_target, testset_outputs_meta))
        # Arrays support mean/min/max/std downstream.
        test_cv_accuracy = np.array(test_cv_accuracy)
        test_cv_recall = np.array(test_cv_recall)
        test_cv_precision = np.array(test_cv_precision)
        time_end = time.time()  # end time
        print("\nTime cost: ", time_end - time_start, "seconds")
        # FIXED: precision/recall keys were swapped in the original.
        cv_scores = {
            "test_accuracy": test_cv_accuracy,
            "test_precision_weighted": test_cv_precision,
            "test_recall_weighted": test_cv_recall
        }
        return cv_scores
X_train, X_test, y_train, y_test = train_test_split(data_values, data_labels, test_size=0.25, random_state=42) # In[ ]: from sklearn.grid_search import GridSearchCV from sklearn import metrics from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier parameters = { 'base_estimator': [ DecisionTreeClassifier(max_depth=3), DecisionTreeClassifier(max_depth=4), ExtraTreeClassifier(max_depth=4) ], 'learning_rate': [0.01, 0.1, 0.5, 1.], 'n_estimators': [5, 10, 15, 20, 30, 40, 50, 75, 100, 125], 'algorithm': ['SAMME', 'SAMME.R'] } model = AdaBoostClassifier() AdaBoostClf = GridSearchCV(model, param_grid=parameters) AdaBoostClf.fit(X_train, y_train) score = AdaBoostClf.score(X_test, y_test) prediction = AdaBoostClf.predict(X_test) print("Accuracy using ", AdaBoostClf, " classifier is: ", score) print("-------------------------------------------") print("Below is the confusion Matrix for ", AdaBoostClf) print(metrics.confusion_matrix(y_test, prediction))
def third_generation(X, y, size=200, seed=None):
    """Build a large heterogeneous pool of classifiers and sample `size` of them.

    The pool covers MLPs, kNN, polynomial/RBF SVMs, (extra) decision trees,
    AdaBoost, bagging ensembles and their members, random-forest members,
    logistic regression and SGD variants.  Every unfitted model is fitted on
    (X, y), then a seeded random permutation selects `size` estimators.

    BUG FIX vs. the original: ``bag_et_clf`` was built from ``bag_dt`` (the
    decision-tree bagger) instead of ``bag_et`` (the extra-tree bagger).

    Returns
    -------
    (estimators, pool_name)
        NOTE(review): pool_name is the full, un-permuted name list, so names do
        NOT line up with the sampled `estimators` — confirm callers expect this.
    """
    mlp_parameters = list(itertools.product([1, 2, 4, 8, 32, 128],
                                            [0, 0.2, 0.5, 0.9],
                                            [0.1, 0.3, 0.6]))
    mlp_clf = [
        MLPClassifier(hidden_layer_sizes=(h, ), momentum=m, learning_rate_init=a)
        for (h, m, a) in mlp_parameters
    ]
    mlp_name = ['mlp_{0}_{1}_{2}'.format(*param) for param in mlp_parameters]

    # 40 neighbor counts spread linearly from 1 to the dataset size.
    neigbhors_number = [int(i) for i in np.linspace(1, X.shape[0], 40)]
    weighting_methods = ['uniform', 'distance']
    knn_clf = [
        KNeighborsClassifier(n_neighbors=nn, weights=w)
        for (nn, w) in itertools.product(neigbhors_number, weighting_methods)
    ]
    knn_name = [
        'knn_{0}_{1}'.format(*param) for param in itertools.product(
            neigbhors_number, ['uniform', 'distance'])
    ]

    C = np.logspace(-3, 7, num=11)
    degree = [2, 3, 4]
    gamma = [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 2]
    svm_clf_poly = [
        SVC(C=c, kernel='poly', degree=d)
        for (c, d) in itertools.product(C, degree)
    ]
    svm_clf_poly_name = [
        'svm_poly_{0}_{1}'.format(*param)
        for param in itertools.product(C, degree)
    ]
    svm_clf_rbf = [
        SVC(C=c, kernel='rbf', gamma=g)
        for (c, g) in itertools.product(C, gamma)
    ]
    svm_clf_rbf_name = [
        'svm_rbf_{0}_{1}'.format(*param)
        for param in itertools.product(C, gamma)
    ]

    # Shared criterion/depth/features/splitter grid for DT and ET.
    dt_params = list(itertools.product(['gini', 'entropy'],
                                       [1, 2, 3, 4, 5, None],
                                       [None, 'sqrt', 'log2'],
                                       ['best', 'random']))
    dt_clf = [
        DecisionTreeClassifier(criterion=c, max_depth=d, max_features=f,
                               splitter=s) for (c, d, f, s) in dt_params
    ]
    dt_name = ['dt_{0}_{1}_{2}_{3}'.format(*param) for param in dt_params]
    et_clf = [
        ExtraTreeClassifier(criterion=c, max_depth=d, max_features=f,
                            splitter=s) for (c, d, f, s) in dt_params
    ]
    et_name = ['et_{0}_{1}_{2}_{3}'.format(*param) for param in dt_params]

    ada_params = list(itertools.product([2**i for i in range(1, 14)],
                                        [1, 2, 3]))
    ada_dt_clf = [
        AdaBoostClassifier(n_estimators=n,
                           base_estimator=DecisionTreeClassifier(max_depth=m))
        for (n, m) in ada_params
    ]
    ada_et_clf = [
        AdaBoostClassifier(n_estimators=n,
                           base_estimator=ExtraTreeClassifier(max_depth=m))
        for (n, m) in ada_params
    ]
    ada_dt_name = ['ada_dt_{0}_{1}'.format(*param) for param in ada_params]
    ada_et_name = ['ada_et_{0}_{1}'.format(*param) for param in ada_params]

    # Bagging ensembles are fitted here because their individual members
    # (estimators_) also join the pool, pre-fitted.
    nb_bag_est = 50
    nb_bag_stumps = 200
    bag_dt = BaggingClassifier(n_estimators=nb_bag_est,
                               base_estimator=DecisionTreeClassifier())
    bag_et = BaggingClassifier(n_estimators=nb_bag_est,
                               base_estimator=ExtraTreeClassifier())
    bag_stumps = BaggingClassifier(
        n_estimators=nb_bag_stumps,
        base_estimator=DecisionTreeClassifier(max_depth=1))
    bag_dt.fit(X, y)
    bag_et.fit(X, y)
    bag_stumps.fit(X, y)
    dt_bag_clf = bag_dt.estimators_
    et_bag_clf = bag_et.estimators_
    stump_bag_clf = bag_stumps.estimators_
    dt_bag_name = ['dt_bag_{0}'.format(nb_est) for nb_est in range(nb_bag_est)]
    et_bag_name = ['et_bag_{0}'.format(nb_est) for nb_est in range(nb_bag_est)]
    stump_bag_name = [
        'stump_bag_{0}'.format(nb_est) for nb_est in range(nb_bag_stumps)
    ]
    bag_dt_clf = [bag_dt]
    bag_et_clf = [bag_et]  # FIXED: original re-used bag_dt here
    bag_stump_clf = [bag_stumps]
    bag_dt_name = ['bag_dt_{0}'.format(str(nb_bag_est))]
    bag_et_name = ['bag_et_{0}'.format(str(nb_bag_est))]
    bag_stump_name = ['bag_stump_{0}'.format(str(200))]

    nb_rf = 15
    rf = RandomForestClassifier(n_estimators=nb_rf)
    rf.fit(X, y)
    dt_rf_clf = rf.estimators_
    dt_rf_name = ['dt_rf_{0}'.format(nb_est) for nb_est in range(nb_rf)]

    log_parameters = list(itertools.product(['l1', 'l2'],
                                            np.logspace(-5, 9, num=15),
                                            [True, False]))
    log_clf = [
        LogisticRegression(penalty=l, C=c, fit_intercept=f)
        for (l, c, f) in log_parameters
    ]
    log_name = ['log_{0}_{1}_{2}'.format(*param) for param in log_parameters]

    sgd_parameters = list(
        itertools.product([
            'hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron',
            'squared_loss', 'huber', 'epsilon_insensitive',
            'squared_epsilon_insensitive'
        ], ['elasticnet'], [True, False], np.arange(0, 1.1, 0.1)))
    sgd_clf = [
        SGDClassifier(loss=l, penalty=p, fit_intercept=f, l1_ratio=l1)
        for (l, p, f, l1) in sgd_parameters
    ]
    sgd_name = [
        'sgd_{0}_{1}_{2}_{3}'.format(*param) for param in sgd_parameters
    ]

    pool = mlp_clf + knn_clf + svm_clf_poly + svm_clf_rbf + dt_clf + et_clf + ada_dt_clf + ada_et_clf + \
        dt_bag_clf + et_bag_clf + stump_bag_clf + bag_dt_clf + bag_et_clf + bag_stump_clf + dt_rf_clf + \
        log_clf + sgd_clf
    pool_name = mlp_name + knn_name + svm_clf_poly_name + svm_clf_rbf_name + dt_name + et_name + ada_dt_name + \
        ada_et_name + dt_bag_name + et_bag_name + stump_bag_name + bag_dt_name + bag_et_name + \
        bag_stump_name + dt_rf_name + log_name + sgd_name

    # Fit any model that is not already fitted (bagging/RF members are).
    for model in pool:
        if not check_model_is_fitted(model, X[0, :].reshape((1, -1))):
            model.fit(X, y)

    # Seeded random subset of the pool.
    np.random.seed(seed)
    order = np.random.permutation(range(len(pool)))
    estimators = [pool[i] for i in order[:size]]
    return estimators, pool_name
def run():
    """Titanic-style pipeline: load CSVs, transform features, fit one (or all)
    of a fixed set of models, and write submission predictions to CSV.

    Relies on project helpers defined elsewhere in this file/package:
    split_data, transform_data, run_model, paths.
    """
    data = pd.read_csv('train.csv')
    submission_data = pd.read_csv('test.csv')
    # NOTE(review): test_size=0.01 leaves almost no held-out data — confirm
    # this tiny split is intentional.
    train_data, test_data = split_data(data, test_size=0.01)
    num_atribs = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']
    cat_atribs = ['Sex', 'Embarked']
    transformed_train_data = transform_data(train_data,
                                            num_atribs=num_atribs,
                                            cat_atribs=cat_atribs)
    transformed_test_data = transform_data(test_data,
                                           num_atribs=num_atribs,
                                           cat_atribs=cat_atribs)
    # no_fit=True: reuse the transformers fitted on the training data —
    # presumably to avoid leakage; verify against transform_data.
    transformed_submission_data = transform_data(submission_data,
                                                 num_atribs=num_atribs,
                                                 cat_atribs=cat_atribs,
                                                 no_fit=True)
    train_data_labels = train_data['Survived']
    test_data_labels = test_data['Survived']
    # Candidate models; model_num_single_model selects one, or None runs all.
    models = {
        0: RandomForestClassifier(max_depth=None,
                                  max_leaf_nodes=None,
                                  warm_start=True),
        1: LinearSVC(),
        2: NuSVC(),
        3: SVC(C=1.0),
        4: DecisionTreeClassifier(),
        5: ExtraTreeClassifier(),
        6: GaussianNB(),
        7: KNeighborsClassifier(),
        8: MLPClassifier(max_iter=1000),
        9: AdaBoostClassifier(),
        10: GaussianProcessClassifier(),
    }
    model_num_single_model = 0  # MLP, SVC, RandForest
    # param_dist = {
    #     'C': list(range(1, 15)),
    #     'kernel': ['rbf', 'sigmoid'],  #['rbf'], 'poly', 'linear', 'sigmoid', 'precomputed'],
    #     'degree': [3],
    #     'gamma': ['auto'],
    #     'coef0': [0.0],
    #     'shrinking': [True],
    #     'probability': [False],
    #     'tol': [1e-3],
    #     'cache_size': list(range(1, 2000)),
    #     'class_weight': [None],
    #     'verbose': [False],
    #     'max_iter': [-1],
    #     'decision_function_shape': ['ovr'],
    #     'random_state': [42]
    # }
    #
    # transformed_train_data, saved_pasid_train = drop_passenger_iD(transformed_train_data)
    # transformed_submission_data, saved_pasid_pred = drop_passenger_iD(transformed_submission_data)
    #
    # rand_search = RandomizedSearchCV(models[model_num_single_model], n_iter=10000, param_distributions=param_dist)
    # rand_search.fit(transformed_train_data, train_data_labels)
    # print(rand_search.best_estimator_)
    # submission_prediction = rand_search.predict(transformed_submission_data)
    #
    # df = pd.DataFrame()
    # df['PassengerId'] = saved_pasid_pred
    # df['Survived'] = submission_prediction
    #
    # df.to_csv(
    #     '{}submission_data_randcv_SVC.csv'.format(paths.get('saved_predictions_path')), index=False)
    if model_num_single_model is None:
        # Run every candidate model and write one submission file per model.
        for _, model in models.items():
            # prediction_on_train_split = run_model(model, transformed_train_data, train_data_labels, transformed_test_data)
            submission_prediction = run_model(model, transformed_train_data,
                                              train_data_labels,
                                              transformed_submission_data)
            submission_prediction.to_csv('{}submission_data_{}.csv'.format(
                paths.get('saved_predictions_path'),
                type(model).__name__),
                                         index=False)
    else:
        # Run only the selected model.
        model = models[model_num_single_model]
        submission_prediction = run_model(model, transformed_train_data,
                                          train_data_labels,
                                          transformed_submission_data)
        submission_prediction.to_csv('{}submission_data_{}.csv'.format(
            paths.get('saved_predictions_path'),
            type(model).__name__),
                                     index=False)
    # NOTE(review): despite the file name, this baseline writes Survived=0.0
    # (float zeros) for every passenger id 892..1309 — confirm intent.
    df = pd.DataFrame()
    df['PassengerId'] = list(range(892, 1310))
    df['Survived'] = np.zeros(1310 - 892)
    df.to_csv('all_true.csv', index=False)
def main():
    """
    Given training data, this script trains and tracks each prediction for
    several algorithms and saves the predictions and ground truth to a CSV file.

    BUG FIX vs. the original: string comparisons used the identity operator
    (``Y is 'r'`` / ``Y is not 'r'``), which is implementation-dependent and a
    SyntaxWarning on modern Python; replaced with ``==`` / ``!=``.
    """
    # Parameters for the training and predictions.
    CV = 10
    subsets = ('fiss', 'act', 'fissact', 'all')
    subset = subsets[2]
    pkl_base = './pkl_trainsets/2jul2018/2jul2018_trainset'

    for trainset in ('1', '2'):
        pkl = pkl_base + trainset + '_nucs_' + subset + '_not-scaled.pkl'
        trainXY = pd.read_pickle(pkl)
        trainX, rY, cY, eY, bY = splitXY(trainXY)
        if subset == 'all':
            top_n = 100
            nuc_set = top_nucs(trainX, top_n)
            trainX = filter_nucs(trainX, nuc_set, top_n)
        trainX = scale(trainX)

        # Loop through each reactor parameter to do separate predictions.
        for Y in ('r', 'b', 'c', 'e'):
            # Get param names and set ground truth.
            if Y == 'c':
                trainY = cY
                parameter = 'cooling'
            elif Y == 'e':
                trainY = eY
                parameter = 'enrichment'
            elif Y == 'b':
                trainY = bY
                parameter = 'burnup'
            else:
                trainY = rY
                parameter = 'reactor'

            #######################
            # optimize parameters #
            #######################
            # Regression learners by default; 'r' (reactor type) is the one
            # classification target and swaps in classifiers + accuracy.
            score = 'explained_variance'
            kfold = KFold(n_splits=CV, shuffle=True)
            alg1_init = DecisionTreeRegressor()
            alg2_init = ExtraTreeRegressor()
            alg3_init = BayesianRidge()
            if Y == 'r':
                score = 'accuracy'
                kfold = StratifiedKFold(n_splits=CV, shuffle=True)
                alg1_init = DecisionTreeClassifier(class_weight='balanced')
                alg2_init = ExtraTreeClassifier(class_weight='balanced')
                alg3_init = GaussianNB()

            # CV search the hyperparams.
            # alg1: decision tree
            alg1_grid = {
                "max_depth": np.linspace(3, 90).astype(int),
                "max_features": np.linspace(5, len(trainXY.columns) - 6).astype(int)
            }
            alg1_opt = RandomizedSearchCV(estimator=alg1_init,
                                          param_distributions=alg1_grid,
                                          n_iter=20,
                                          scoring=score,
                                          n_jobs=-1,
                                          cv=kfold,
                                          return_train_score=True)
            alg1_opt.fit(trainX, trainY)
            alg1_init = alg1_opt.best_estimator_
            d1 = alg1_opt.best_params_['max_depth']
            f1 = alg1_opt.best_params_['max_features']

            # alg2: extra tree (same grid as alg1)
            alg2_grid = alg1_grid
            alg2_opt = RandomizedSearchCV(estimator=alg2_init,
                                          param_distributions=alg2_grid,
                                          n_iter=20,
                                          scoring=score,
                                          n_jobs=-1,
                                          cv=kfold,
                                          return_train_score=True)
            alg2_opt.fit(trainX, trainY)
            alg2_init = alg2_opt.best_estimator_
            d2 = alg2_opt.best_params_['max_depth']
            f2 = alg2_opt.best_params_['max_features']

            # alg3: Bayesian ridge — regression only, so skipped for 'r'.
            alg3_grid = {
                'n_iter': np.linspace(50, 1000).astype(int),
                'alpha_1': np.logspace(-8, 2),
                'alpha_2': np.logspace(-8, 2),
                'lambda_1': np.logspace(-8, 2),
                'lambda_2': np.logspace(-8, 2)
            }
            if Y != 'r':
                alg3_opt = RandomizedSearchCV(estimator=alg3_init,
                                              param_distributions=alg3_grid,
                                              n_iter=20,
                                              scoring=score,
                                              n_jobs=-1,
                                              cv=kfold,
                                              return_train_score=True)
                alg3_opt.fit(trainX, trainY)
                alg3_init = alg3_opt.best_estimator_
                it = alg3_opt.best_params_['n_iter']
                a1 = alg3_opt.best_params_['alpha_1']
                a2 = alg3_opt.best_params_['alpha_2']
                l1 = alg3_opt.best_params_['lambda_1']
                l2 = alg3_opt.best_params_['lambda_2']

            # Save the best hyperparameters (appending, one section per target).
            param_file = 'trainset_' + trainset + '_hyperparameters_alt-algs.txt'
            with open(param_file, 'a') as pf:
                pf.write(
                    'The following parameters are best from the randomized search for the {} parameter prediction:\n'
                    .format(parameter))
                pf.write('max depth for dtree is {}\n'.format(d1))
                pf.write('max features for dtree is {}\n'.format(f1))
                pf.write('max depth for xtree is {}\n'.format(d2))
                pf.write('max features for xtree is {}\n'.format(f2))
                if Y != 'r':
                    pf.write('num iterations for bayes reg is {}\n'.format(it))
                    pf.write('alpha 1 for bayes reg is {}\n'.format(a1))
                    pf.write('alpha 2 for bayes reg is {}\n'.format(a2))
                    pf.write('lambda 1 for bayes reg is {}\n'.format(l1))
                    pf.write('lambda 2 for bayes reg is {}\n'.format(l2))

            ########################
            # run predictions, etc #
            ########################
            #scores = ['explained_variance', 'neg_mean_absolute_error']
            #if Y == 'r':
            #    scores = ['accuracy', ]
            #csv_name = 'trainset_' + trainset + '_' + subset + '_' + parameter
            #
            #print("The {} predictions in trainset {} are beginning\n".format(parameter, trainset), flush=True)
            #
            ## track predictions
            #track_predictions(trainX, trainY, alg1_init, alg2_init, alg3_init, scores, kfold, csv_name)
            #print("\t Prediction tracking done\n", flush=True)
            ## calculate errors and scores
            #errors_and_scores(trainX, trainY, alg1_init, alg2_init, alg3_init, scores, kfold, csv_name)
            #print("\t CV scoring done\n", flush=True)
            ## learning curves
            #learning_curves(trainX, trainY, alg1_init, alg2_init, alg3_init, kfold, csv_name)
            #print("\t Learning curves done\n", flush=True)
            #
            #print("The {} predictions in trainset {} are complete\n".format(parameter, trainset), flush=True)
            #print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n", flush=True)
    return
def main(tmpdir: str):
    """Run a small AutoML search over five candidate classifiers and report
    the held-out accuracy of the best refit model.

    :param tmpdir: scratch folder for the on-disk hyperparameter repository;
        deleted at the end of the run.
    """
    # --- candidate models, each paired with its hyperparameter space ---
    dt_candidate = SKLearnWrapper(
        DecisionTreeClassifier(),
        HyperparameterSpace({
            'criterion': Choice(['gini', 'entropy']),
            'splitter': Choice(['best', 'random']),
            'min_samples_leaf': RandInt(2, 5),
            'min_samples_split': RandInt(2, 4)
        }))

    et_candidate = SKLearnWrapper(
        ExtraTreeClassifier(),
        HyperparameterSpace({
            'criterion': Choice(['gini', 'entropy']),
            'splitter': Choice(['best', 'random']),
            'min_samples_leaf': RandInt(2, 5),
            'min_samples_split': RandInt(2, 4)
        }))

    # Linear models get their target flattened first, hence the
    # OutputTransformerWrapper(NumpyRavel()) step.
    ridge_candidate = Pipeline([
        OutputTransformerWrapper(NumpyRavel()),
        SKLearnWrapper(
            RidgeClassifier(),
            HyperparameterSpace({
                'alpha': Choice([0.0, 1.0, 10.0, 100.0]),
                'fit_intercept': Boolean(),
                'normalize': Boolean()
            }))
    ]).set_name('RidgeClassifier')

    logreg_candidate = Pipeline([
        OutputTransformerWrapper(NumpyRavel()),
        SKLearnWrapper(
            LogisticRegression(),
            HyperparameterSpace({
                'C': LogUniform(0.01, 10.0),
                'fit_intercept': Boolean(),
                'penalty': Choice(['none', 'l2']),
                'max_iter': RandInt(20, 200)
            }))
    ]).set_name('LogisticRegression')

    forest_candidate = Pipeline([
        OutputTransformerWrapper(NumpyRavel()),
        SKLearnWrapper(
            RandomForestClassifier(),
            HyperparameterSpace({
                'n_estimators': RandInt(50, 600),
                'criterion': Choice(['gini', 'entropy']),
                'min_samples_leaf': RandInt(2, 5),
                'min_samples_split': RandInt(2, 4),
                'bootstrap': Boolean()
            }))
    ]).set_name('RandomForestClassifier')

    # One pipeline; the AutoML loop picks exactly one candidate per trial.
    # See also ChooseOneStepOf documentation: https://www.neuraxle.org/stable/api/steps/neuraxle.steps.flow.html#neuraxle.steps.flow.ChooseOneStepOf
    search_pipeline = Pipeline([
        ChooseOneStepOf([
            dt_candidate,
            et_candidate,
            ridge_candidate,
            logreg_candidate,
            forest_candidate
        ])
    ])

    # See also AutoML documentation: https://www.neuraxle.org/stable/api/metaopt/neuraxle.metaopt.auto_ml.html#neuraxle.metaopt.auto_ml.AutoML
    automl_loop = AutoML(
        pipeline=search_pipeline,
        hyperparams_optimizer=RandomSearchSampler(),
        validation_splitter=ValidationSplitter(validation_size=0.20),
        scoring_callback=ScoringCallback(accuracy_score, higher_score_is_better=True),
        n_trials=7,
        epochs=1,
        hyperparams_repository=HyperparamsOnDiskRepository(cache_folder=tmpdir),
        refit_best_trial=True,
        continue_loop_on_error=False)

    # Load data and launch the AutoML loop.
    X_train, y_train, X_test, y_test = generate_classification_data()
    automl_loop = automl_loop.fit(X_train, y_train)

    # refit_best_trial=True means the loop itself can predict with the
    # best trial's refit model.
    y_pred = automl_loop.predict(X_test)
    accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
    print("Test accuracy score:", accuracy)

    # Drop the on-disk trial repository.
    shutil.rmtree(tmpdir)
def trainModel():
    """Load the engineered census dataset, run the full preprocessing /
    feature-selection / balancing pipeline, then grid-search and evaluate an
    XGBoost classifier via the nested ``testClassifier`` helper.

    Side effects: reads a pickle from disk, draws a PCA scree plot, and
    prints evaluation metrics. Returns nothing.
    """
    def _kept(mask, columns):
        # Column labels whose selector support flag is True, original order.
        return [col for keep, col in zip(mask, columns) if keep]

    # Read processed dataframe produced by the feature-engineering step.
    df = joblib.load('J:\Datasets\Exercises\Exercise5\EngineeredDataset.pkl')

    # Move test-set indicator to the 2nd position of the dataframe.
    cols = list(df)
    cols.insert(1, cols.pop(cols.index('TestSet')))
    # FIX: `.ix` was deprecated in pandas 0.20 and later removed; `.loc`
    # does the same label-based selection since `cols` holds column labels.
    df = df.loc[:, cols]

    # Split dataframe into target, test-set flag and features.
    y = df.iloc[:, 0]
    flag = pd.DataFrame(df.iloc[:, 1])
    X = df.iloc[:, 2:]

    # Standard-scale (zero mean, unit variance) so large-valued features
    # won't heavily influence the model.
    sc = StandardScaler()
    colNames = X.columns
    X = pd.DataFrame(sc.fit_transform(X), columns=colNames)

    # Drop low-variance features (0.16 = 0.8 * (1 - 0.8), i.e. boolean
    # features that are the same value in more than 80% of samples).
    colNames = X.columns
    sel = VarianceThreshold(threshold=0.16)
    X = pd.DataFrame(sel.fit_transform(X), columns=_kept(sel.get_support(), colNames))

    # Dimensionality reduction using PCA.
    pca = PCA(n_components=14)
    pca.fit(X)

    # PCA scree plot — aid in determining the number of components.
    plt.figure(1, figsize=(4, 3))
    plt.clf()
    plt.axes([.2, .2, .7, .7])
    plt.plot(pca.explained_variance_, linewidth=2)
    plt.axis('tight')
    plt.xlabel('n_components')
    plt.ylabel('explained_variance_')
    #plt.show()

    # Append principal components to the original features: extra signal for
    # the model, and weak PCs get dropped by the later selection stages.
    dfPCA = pd.DataFrame(pca.transform(X))
    dfPCA.columns = ['PCA' + str(col) for col in dfPCA.columns]
    X = pd.merge(X, dfPCA, left_index=True, right_index=True)

    # Univariate feature selection (ANOVA F-values), keep the best half.
    colNames = X.columns
    selection_Percent = SelectPercentile(percentile=50)
    X_sel = selection_Percent.fit_transform(X, y)
    X = pd.DataFrame(X_sel, columns=_kept(selection_Percent.get_support(), colNames))

    # Tree-based feature selection.
    clf = ExtraTreeClassifier()
    clf = clf.fit(X, y)
    colNames = X.columns
    sel = SelectFromModel(clf, prefit=True)
    X = pd.DataFrame(sel.transform(X), columns=_kept(sel.get_support(), colNames))

    # Re-attach the split flag and the target.
    X['TestSet'] = flag['TestSet'].tolist()
    X['Target'] = y.tolist()

    # Encode target to binary for the ROC-AUC metric (0: under 50k, 1: over).
    le = LabelEncoder()
    X['Target'] = le.fit_transform(X['Target'])

    # Split held-out test rows from learning rows. FIX: .copy() so the
    # in-place drop below mutates an independent frame instead of a view of
    # X (avoids pandas' chained-assignment pitfall/warning).
    dfTest = X.loc[X['TestSet'] == 1].copy()
    X = X.loc[X['TestSet'] == 0].copy()

    # Test-set target, then strip the helper columns from both frames.
    dfTestTarget = dfTest['Target']
    dfTest.drop(['TestSet', 'Target'], axis=1, inplace=True)
    y = X['Target']
    X.drop(['TestSet', 'Target'], axis=1, inplace=True)
    colNames = X.columns

    # The classes are heavily imbalanced; under-sample the majority class.
    # Combined samplers (e.g. SMOTEENN) were too computationally expensive.
    rus = RandomUnderSampler()
    # NOTE(review): newer imbalanced-learn renamed fit_sample ->
    # fit_resample; keep whichever matches the installed version.
    X, y = rus.fit_sample(X, y)
    #sme = SMOTEENN(n_jobs=-1)
    #X, y, = sme.fit_sample(X, y)
    X = pd.DataFrame(X, columns=colNames)
    y = pd.Series(y, name='Target')

    # Define train/test variables used by the nested helper below.
    X_train = X
    y_train = y
    X_test = dfTest
    y_test = dfTestTarget

    def testClassifier(clf):
        """Grid-search `clf`, print hold-out metrics for both the tuned and
        the plainly-fitted model, and return the fitted classifier."""
        # XGB tuning grid — concept, not in use.
        param_grid = [{
            'max_depth': range(2, 6, 2),
            'min_child_weight': range(2, 6, 2),
            'n_estimators': range(100, 200, 75),
            'learning_rate': [0.1],
            'gamma': [0, 1, 10],
            'subsample': [0.6, 0.8],
            'colsample_bytree': [0.6, 0.8],
            'reg_alpha': [1, 10],
            'reg_lambda': [1, 10]
        }]
        fit_params = {
            "early_stopping_rounds": 8,
            "eval_metric": "map",
            "eval_set": [[X_test, y_test]],
            "verbose": False
        }
        # NOTE(review): `fit_params` as a GridSearchCV constructor argument
        # was removed in scikit-learn 0.21+; on newer versions pass
        # **fit_params to grid.fit() instead.
        grid = GridSearchCV(clf,
                            param_grid,
                            fit_params=fit_params,
                            cv=3,
                            verbose=1,
                            n_jobs=-1,
                            scoring='average_precision')
        fitted_classifier = grid.fit(X_train, y_train)
        print(grid.best_score_, grid.best_params_)

        predictions = fitted_classifier.predict(X_test)
        score1 = metrics.accuracy_score(y_test.values, predictions)
        score2 = metrics.roc_auc_score(y_test.values, predictions)
        score3 = metrics.cohen_kappa_score(y_test.values, predictions)
        score4 = metrics.classification_report(y_test.values, predictions)
        print('Accuracy score, ROC AUC, Cohen Kappa')
        print(score1, score2, score3)
        print('Classification Report')
        print(score4)

        # Plain fit (no hyperparameter search) for comparison.
        print('Normal Fit')
        clf.fit(X_train, y_train)
        scoresCV = cross_val_score(clf, X_train, y_train, cv=3, verbose=0, n_jobs=-1)
        trainPredictions = clf.predict(X_train)
        testPredictions = clf.predict(X_test)
        score1 = metrics.accuracy_score(y_test.values, testPredictions)
        score2 = metrics.roc_auc_score(y_test.values, testPredictions)
        score3 = metrics.cohen_kappa_score(y_test.values, testPredictions)
        score4 = metrics.classification_report(y_test.values, testPredictions)
        print('Train score: ', metrics.accuracy_score(y_train.values, trainPredictions))
        print('CV score: ', scoresCV)
        print('Accuracy score, ROC AUC, Cohen Kappa')
        print(score1, score2, score3)
        print('Classification Report')
        print(score4)

        # Observations:
        # WITH under-sampling: low class-1 precision (~0.28, too many <50k
        # labeled >50k — likely an after-effect of under-sampling) but high
        # class-1 recall (~0.90, finds nearly all positives).
        # WITHOUT under-sampling: high class-1 precision (~0.76) but low
        # class-1 recall (~0.39, misses many positives).
        return clf

    print('XGB')
    gb = xgboost.XGBClassifier()
    clf = testClassifier(gb)
######### Create New Dataframe r2 = result.drop(["index"], axis=1) # check new class counts r2.fraudulent.value_counts() ################################################################### ########################## Feature Selection ###################### ################################################################### #Feature Selection using Tree Classifier a = r2.iloc[:, :8] #independent columns b = r2.iloc[:, -1] #target column model = ExtraTreeClassifier() model.fit(a, b) print(model.feature_importances_ ) #use inbuilt class feature_importances of tree based classifiers #plot graph of feature importances for better visualization feat_importances = pd.Series(model.feature_importances_, index=a.columns) feat_importances.nlargest(8).plot(kind='barh') #Almost all 8 variables are contributing towards output variable. ############################################################### ####################### Cross Validation ###################### ###############################################################
from utils.IO import writeResToFile, loadResFromFile, getResultsFromFileAsArray, saveTableToFile
from utils.plot import generateChart
import pandas
from datasets.dataUrls import dataURL

# Pick dataset #20 from the URL list and load it from the local datasets dir.
x = dataURL[20]
url = "D:/projekty/UM/datasets/" + x + ".dat"
dataframe = pandas.read_csv(url)
array = dataframe.values

# Last column holds the class label; binarize 'positive' -> 1, else 0.
tmp = array[:, -1]
tmp2 = tmp == 'positive'
X = array[:, :-1]
y = tmp2.astype(int)

# Classification tree with default parameter values (fixed seed).
clf = ExtraTreeClassifier(random_state=1410)

# Repeated stratified 5-fold cross-validation.
# NOTE(review): the original comment said "(10x5)", but n_repeats=2 gives
# 2 x 5 = 10 folds total — matching the `5 * 2` axis of `scores` below.
rskf = RepeatedStratifiedKFold(n_splits=5, n_repeats=2, random_state=42)
# scores[preprocessing method, fold, metric]
scores = np.zeros((len(preprocs), 5 * 2, len(metrics)))

for fold_id, (train, test) in enumerate(rskf.split(X, y)):
    for preproc_id, preproc in enumerate(preprocs):
        # Fresh unfitted copy of the estimator for every fold/preproc pair.
        clf = clone(clf)
        if preprocs[preproc] == None:
            # No resampling: use the raw training fold.
            X_train, y_train = X[train], y[train]
        else:
            # Rebalance the training fold with the configured resampler.
            X_train, y_train = preprocs[preproc].fit_resample(
                X[train], y[train])
classifiers.append(GaussianNB()) #Nearest Neighbors classifiers.append(KNeighborsClassifier()) #Discrimnant analysis classifiers.append(LinearDiscriminantAnalysis()) #Support vector machine classifiers.append(SVC(random_state=random_state, probability=True)) classifiers.append(NuSVC(random_state=random_state, probability=True)) classifiers.append(LinearSVC(random_state=random_state)) #Trees classifiers.append(DecisionTreeClassifier(random_state=random_state)) classifiers.append(ExtraTreeClassifier(random_state=random_state)) """ Accuracy cross validation for algorithms """ cf_results_acc = [] for classifier in classifiers: cf_results_acc.append( cross_val_score(classifier, features_train, y=target_train, scoring="accuracy", cv=kfold, n_jobs=4)) #Means and standard deviation for each machine learning model utilized cf_means_acc = []
class_names = bi_class_target_attrs, filled = True, rounded = True, special_characters = True)
# ^ tail of an export_graphviz(...) call whose opening is above this excerpt.

# Render the exported CART tree to PDF and print its hold-out metrics.
print(check_output('dot -Tpdf cart.dot -o cart.pdf', shell = True))
print("Accuracy = %s"%accuracy_score(rnd_test_y, clf_cart.predict(rnd_test_X)))
print("Precision = %s"%precision_score(rnd_test_y, clf_cart.predict(rnd_test_X)))
print("Recall = %s"%recall_score(rnd_test_y, clf_cart.predict(rnd_test_X)))
print("F = %s"%fbeta_score(rnd_test_y, clf_cart.predict(rnd_test_X), beta=1))
print("Confusion matrix = %s"%confusion_matrix(rnd_test_y, clf_cart.predict(rnd_test_X)))
roc_auc_scorer = get_scorer("roc_auc")
print("ROC AUC = %s"%roc_auc_scorer(clf_cart, rnd_test_X, rnd_test_y))
# ROC curve from the positive-class probabilities (column 1).
fpr, tpr, thresholds = roc_curve(rnd_test_y, clf_cart.predict_proba(rnd_test_X)[:, 1])
axes_roc.plot(fpr, tpr, label = 'CART-2')

## randomized tree with default setting
clf_rnd_tree = ExtraTreeClassifier()
clf_rnd_tree.fit(rnd_training_X, rnd_training_y)
export_graphviz(clf_rnd_tree,
                out_file = 'default_rnd_tree.dot',
                feature_names = attribute_names,
                class_names = bi_class_target_attrs,
                filled = True,
                rounded = True,
                special_characters = True)
print(check_output('dot -Tpdf default_rnd_tree.dot -o default_rnd_tree.pdf', shell = True))
# Same hold-out metrics for the randomized tree, for comparison with CART.
print("Accuracy = %s"%accuracy_score(rnd_test_y, clf_rnd_tree.predict(rnd_test_X)))
print("Precision = %s"%precision_score(rnd_test_y, clf_rnd_tree.predict(rnd_test_X)))
print("Recall = %s"%recall_score(rnd_test_y, clf_rnd_tree.predict(rnd_test_X)))
print("F = %s"%fbeta_score(rnd_test_y, clf_rnd_tree.predict(rnd_test_X), beta=1))
print("Confusion matrix = %s"%confusion_matrix(rnd_test_y, clf_rnd_tree.predict(rnd_test_X)))
fpr, tpr, thresholds = roc_curve(rnd_test_y, clf_rnd_tree.predict_proba(rnd_test_X)[:, 1])
axes_roc.plot(fpr, tpr, label = "Randomized tree-1")
axes_roc.set_title("ROC of CART and a randomized tree")
class ExtraTreeClass:
    """
    Name : ExtraTreeClassifier
    Attribute : None
    Method : predict, predict_by_cv, save_model
    """
    def __init__(self):
        # Algorithm name (also used as the saved-model filename stem).
        self._name = 'extratree'

        # Base path: the parent directory of this file.
        self._f_path = os.path.abspath(
            os.path.join(os.path.dirname(os.path.abspath(__file__)), os.pardir))

        # Suppress warning messages.
        warnings.filterwarnings('ignore')

        # Load the raw sample data.
        data = pd.read_csv(self._f_path + "/classifier/resource/classifier_sample.csv", sep=",", encoding="utf-8")

        # Separate features from the "quality" label column.
        self._x = data.drop("quality", axis=1)
        self._y = data["quality"]

        # Train/test split (80/20, shuffled, fixed seed).
        self._x_train, self._x_test, self._y_train, self._y_test = train_test_split(
            self._x, self._y, test_size=0.2, shuffle=True, random_state=42)

        # Instantiate the model.
        self._model = ExtraTreeClassifier()

        # Fit on the training split (done eagerly at construction time).
        self._model.fit(self._x_train, self._y_train)

    # Plain hold-out prediction.
    def predict(self):
        # Predict the test split.
        y_pred = self._model.predict(self._x_test)

        # Print the classification report.
        print(classification_report(self._y_test, y_pred))
        score = accuracy_score(self._y_test, y_pred)

        # Show the accuracy score.
        print(f'Score = {score}')

        # Return the score.
        return score

    # Cross-validated prediction (5-fold over the full dataset).
    def predict_by_cv(self):
        cv = KFold(n_splits=5, shuffle=True)

        # Only proceed when the model exposes .score (required by
        # cross_val_score's default scoring).
        if hasattr(self._model, "score"):
            cv_score = cross_val_score(self._model, self._x, self._y, cv=cv)

            # Show the per-fold scores.
            print(f'Score = {cv_score}')

            # Return the per-fold score array.
            return cv_score
        else:
            raise Exception('Not Support CrossValidation')

    # GridSearchCV prediction — not implemented yet.
    def predict_by_gs(self):
        pass

    # Save or refresh the persisted model.
    def save_model(self, renew=False):
        # Save the model for the first time.
        if not renew:
            joblib.dump(self._model, self._f_path + f'/model/{self._name}.pkl')
        else:
            # Replace an existing model, archiving the old file with a
            # timestamp suffix before writing the new one.
            if os.path.isfile(self._f_path + f'/model/{self._name}.pkl'):
                os.rename(
                    self._f_path + f'/model/{self._name}.pkl',
                    self._f_path + f'/model/{str(self._name) + str(time.time())}.pkl')
            joblib.dump(self._model, self._f_path + f'/model/{self._name}.pkl')

    def __del__(self):
        # Release data and model references on garbage collection.
        del self._x_train, self._x_test, self._y_train, self._y_test, self._x, self._y, self._model
# data = '' with open(fname) as f: for s in f: tmp = map(int, s.split()) labels.append(tmp[-1]) res.append(tmp[:-1]) # data += (str(tmp)[1:-1]).replace(',', '')+'\n' # with open('out.txt', 'w') as o: # o.write(str(data)[1:-1]) return res, labels X, Y = readData('german.data-numeric.txt') Xt = X[:-200] ; Yt = Y[:-200] XT = X[-200:] ; YT = Y[-200:] print len(Xt) clf = ExtraTreeClassifier(max_depth=None, random_state=0) clf = clf.fit(Xt, Yt) #proba = clf.predict_proba(XT) #print len(proba) #print proba err = 0 for i, x in enumerate(XT): if clf.predict(x) != YT[i]: prob = clf.predict_proba(x) # print prob err += 1 print err
def _fit_and_report(estimator):
    # Fit on the train split, predict the test split, and print the usual
    # summary: classification report, confusion matrix, accuracy.
    estimator.fit(X_train, y_train)
    predicted = estimator.predict(X_test)
    print(classification_report(y_test, predicted))
    print(confusion_matrix(y_test, predicted))
    print('accuracy is', accuracy_score(predicted, y_test))
    return estimator, predicted


# DecisionTreeClassifier baseline.
Model, y_pred = _fit_and_report(DecisionTreeClassifier())

# ExtraTreeClassifier
from sklearn.tree import ExtraTreeClassifier

Model, y_pred = _fit_and_report(ExtraTreeClassifier())

import numpy as np


def sigmoid(z):
    """Logistic sigmoid, elementwise for array-like z."""
    return 1.0 / (np.exp(-z) + 1)
# print("CV error = %f +-%f" % (np.mean(scores), np.std(scores))) # print "Cross validation" scores = cross_val_score(RandomForestClassifier(), training, classes, cv=KFold(n=len(training), n_folds=5, random_state=42), scoring="accuracy") print("CV error = %f +-%f" % (1. - np.mean(scores), np.std(scores))) print("Accuracy =", accuracy_score(y_test, tlf.predict(X_test))) print("Precision =", precision_score(y_test, tlf.predict(X_test))) print("Recall =", recall_score(y_test, tlf.predict(X_test))) print("F =", fbeta_score(y_test, tlf.predict(X_test), beta=1)) print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" print "Extra Tree classifier" rlf = ExtraTreeClassifier() rlf.fit(training, classes) print("Training error =", zero_one_loss(classes, rlf.predict(training))) X_train, X_test, y_train, y_test = train_test_split(training, classes) rlf = ExtraTreeClassifier() rlf.fit(X_train, y_train) print("Training error =", zero_one_loss(y_train, rlf.predict(X_train))) print("Test error =", zero_one_loss(y_test, rlf.predict(X_test))) scores = [] print "K-fold cross validation" for train, test in KFold(n=len(training), n_folds=5, random_state=42): X_train, y_train = training[train], classes[train] X_test, y_test = training[test], classes[test]
#df.drop('character', axis=0, inplace=True) #df = df.astype(np.uint8) ############################### df_sample = df.sample(frac=0.1, random_state=0) names = [ 'RidgeClassifier', 'BernoulliNB', 'GaussianNB', 'ExtraTreeClassifier', 'DecisionTreeClassifier', 'NearestCentroid', 'KNeighborsClassifier', 'ExtraTreesClassifier', 'RandomForestClassifier' ] classifiers = [ RidgeClassifier(), BernoulliNB(), GaussianNB(), ExtraTreeClassifier(), DecisionTreeClassifier(), NearestCentroid(), KNeighborsClassifier(), ExtraTreesClassifier(), RandomForestClassifier() ] test_scores, train_scores, fit_time, score_time = [], [], [], [] return_train_score = "warn" for clf in classifiers: scores = cross_validate(clf, df_sample.iloc[:, :-1], df_sample.iloc[:, -1], return_train_score=True) test_scores.append(scores['test_score'].mean()) train_scores.append(scores['train_score'].mean())
def myclassify_practice_set(numfiers,xtrain,ytrain,xtltrain,xtltest,xtest,ytarget=None,testing=False,grids='ABCDEFGHI'):
    """Fit up to `numfiers` classifiers on (xtrain, ytrain) and collect their
    per-window predictions for xtest.

    xtrain/ytrain are the training set; xtltrain and xtltest are the indices
    of the corresponding recordings (for the practice set, xtltest is
    xtrunclength). ytarget is optional and depends on whether a labeled
    testing set is used.

    Returns (predictionStringMat, targetStringMat, finalPredMat).
    """
    # Remove NaN, Inf and -Inf rows from the test feature matrix (and the
    # matching recording-index / target entries).
    xtest,xtltest,ytarget = removeNanAndInf(xtest,xtltest,ytarget)

    ytrain = np.ravel(ytrain)
    ytarget = np.ravel(ytarget)

    def _voting():
        # Majority vote over five very different base classifiers.
        return VotingClassifier(estimators=[('svc', SVC()), ('rfc', RFC(bootstrap=False)), ('etc', ETC()), ('knn', neighbors.KNeighborsClassifier()), ('qda', quadda())])

    # Classifier factories in the fixed order their predictions are
    # reported; only the first `numfiers` entries get instantiated/fitted.
    factories = [
        _voting,
        lambda: BaggingClassifier(ETC(), bootstrap=False, bootstrap_features=False),
        lambda: ETC(),
        lambda: BaggingClassifier(ETC()),
        lambda: SVC(),
        lambda: quadda(),  # quadratic decision boundary
        lambda: DTC(),
        lambda: neighbors.KNeighborsClassifier(),
        lambda: linda(),  # linear decision boundary
        lambda: RFC(),
        lambda: BaggingClassifier(RFC(), bootstrap=False, bootstrap_features=False),
        lambda: BaggingClassifier(SVC(), bootstrap=False, bootstrap_features=False),
        lambda: RFC(bootstrap=False),
        lambda: GBC(),
        lambda: neighbors.KNeighborsClassifier(n_neighbors=10),
        lambda: neighbors.KNeighborsClassifier(n_neighbors=3),
        lambda: neighbors.KNeighborsClassifier(algorithm='ball_tree'),
        lambda: neighbors.KNeighborsClassifier(algorithm='kd_tree'),
        lambda: NearestCentroid(),
        lambda: ABC(),
    ]

    # If xtest is NxM, predictionMat is N x numfiers: one prediction column
    # per classifier. NOTE(review): as in the original code, columns beyond
    # len(factories) would stay uninitialized (np.empty) if numfiers is
    # larger than the number of available classifiers.
    predictionMat = np.empty((xtest.shape[0], numfiers))
    count = 0
    for make_classifier in factories:
        if count >= numfiers:
            break
        model = make_classifier()
        model.fit(xtrain, ytrain)
        predictionMat[:, count] = model.predict(xtest)
        count += 1

    predictionStringMat = []
    finalPredMat = []
    targetStringMat = []
    targets1 = []
    predictions1 = []
    for colCount in range(predictionMat.shape[1]):
        tempCol = predictionMat[:, colCount]
        # Collapse per-row predictions into one mode prediction per window.
        if testing:
            modeCol = temppredWindowVecModeFinder(tempCol, xtltest, 4, grids, isPrint=0)
        else:
            modeCol = predWindowVecModeFinder(tempCol, xtltest, 4, isPrint=0)
        ytarg = predWindowVecModeFinder(ytarget, xtltest, 1, isPrint=0)
        if testing:
            modeStr = temppredVec2Str(modeCol, grids)
        else:
            modeStr = predVec2Str(modeCol)
        modeStrans = predVec2Str(ytarg)
        predictionStringMat.append(modeStr)
        predictions1.append(modeCol)
        finalPredMat += list(map(int, modeCol))
        targetStringMat.append(modeStrans)
        targets1.append(ytarg)

    if testing == False:
        # FIX: the original tested `ytarget != None`, which becomes an
        # elementwise array comparison once ytarget has been ravelled to an
        # ndarray and can raise "truth value is ambiguous". Use an identity
        # check instead.
        if ytarget is not None:
            confusionme = confusion_matrix(targets1[0], predictions1[0])
            #print "Confusion Matrix is: "
            #print confusionme

    return predictionStringMat, targetStringMat, finalPredMat
def all_classifier_models():
    """Fit a broad zoo of sklearn-compatible classifiers on the module-level
    train/test split and collect their accuracies.

    Models that fail to fit/predict are skipped with a printed notice.

    Returns:
        metrix: [name, train accuracy %, test accuracy %] per fitted model
        test_accuracy: test accuracies (%) in fit order
        names: names of the models that fitted successfully
    """
    models = []
    metrix = []
    c_report = []  # per-model classification reports (collected, not returned)
    models.append(('LogisticRegression', LogisticRegression(solver='liblinear', multi_class='ovr')))
    models.append(('LinearDiscriminantAnalysis', LinearDiscriminantAnalysis()))
    models.append(('KNeighborsClassifier', KNeighborsClassifier()))
    models.append(('DecisionTreeClassifier', DecisionTreeClassifier()))
    models.append(('GaussianNB', GaussianNB()))
    models.append(('RandomForestClassifier', RandomForestClassifier(n_estimators=100)))
    models.append(('SVM', SVC(gamma='auto')))
    models.append(('Linear_SVM', LinearSVC()))
    models.append(('XGB', XGBClassifier()))
    models.append(('SGD', SGDClassifier()))
    models.append(('Perceptron', Perceptron()))
    models.append(('ExtraTreeClassifier', ExtraTreeClassifier()))
    models.append(('OneClassSVM', OneClassSVM(gamma = 'auto')))
    models.append(('NuSVC', NuSVC()))
    models.append(('MLPClassifier', MLPClassifier(solver='lbfgs', alpha=1e-5, random_state=1)))
    models.append(('RadiusNeighborsClassifier', RadiusNeighborsClassifier(radius=2.0)))
    models.append(('OutputCodeClassifier', OutputCodeClassifier(estimator=RandomForestClassifier(random_state=0),random_state=0)))
    models.append(('OneVsOneClassifier', OneVsOneClassifier(estimator = RandomForestClassifier(random_state=1))))
    models.append(('OneVsRestClassifier', OneVsRestClassifier(estimator = RandomForestClassifier(random_state=1))))
    models.append(('LogisticRegressionCV', LogisticRegressionCV()))
    models.append(('RidgeClassifierCV', RidgeClassifierCV()))
    models.append(('RidgeClassifier', RidgeClassifier()))
    models.append(('PassiveAggressiveClassifier', PassiveAggressiveClassifier()))
    models.append(('GaussianProcessClassifier', GaussianProcessClassifier()))
    models.append(('HistGradientBoostingClassifier', HistGradientBoostingClassifier()))
    estimators = [('rf', RandomForestClassifier(n_estimators=10, random_state=42)),('svr', make_pipeline(StandardScaler(),LinearSVC(random_state=42)))]
    models.append(('StackingClassifier', StackingClassifier(estimators=estimators, final_estimator=LogisticRegression())))
    clf1 = LogisticRegression(multi_class='multinomial', random_state=1)
    clf2 = RandomForestClassifier(n_estimators=50, random_state=1)
    clf3 = GaussianNB()
    models.append(('VotingClassifier', VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3)], voting='hard')))
    models.append(('AdaBoostClassifier', AdaBoostClassifier()))
    models.append(('GradientBoostingClassifier', GradientBoostingClassifier()))
    models.append(('BaggingClassifier', BaggingClassifier()))
    models.append(('ExtraTreesClassifier', ExtraTreesClassifier()))
    models.append(('CategoricalNB', CategoricalNB()))
    models.append(('ComplementNB', ComplementNB()))
    models.append(('BernoulliNB', BernoulliNB()))
    models.append(('MultinomialNB', MultinomialNB()))
    models.append(('CalibratedClassifierCV', CalibratedClassifierCV()))
    models.append(('LabelPropagation', LabelPropagation()))
    models.append(('LabelSpreading', LabelSpreading()))
    models.append(('NearestCentroid', NearestCentroid()))
    models.append(('QuadraticDiscriminantAnalysis', QuadraticDiscriminantAnalysis()))
    models.append(('GaussianMixture', GaussianMixture()))
    models.append(('BayesianGaussianMixture', BayesianGaussianMixture()))

    test_accuracy = []
    names = []
    for name, model in models:
        try:
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            train_acc = round(model.score(X_train, y_train) * 100, 2)
            test_acc = metrics.accuracy_score(y_test, y_pred) * 100
            c_report.append(classification_report(y_test, y_pred))
            test_accuracy.append(test_acc)
            names.append(name)
            metrix.append([name, train_acc, test_acc])
        except Exception:
            # FIX: narrowed from a bare `except:` so KeyboardInterrupt and
            # SystemExit are no longer swallowed. Some models legitimately
            # fail here (e.g. MultinomialNB on negative-valued features).
            print("Exception Occurred :", name)
    return metrix,test_accuracy,names