def test_ecoc_float_y():
    # OutputCodeClassifier must reject continuous (float) targets and a
    # non-positive code_size, raising ValueError in both cases.
    X = iris.data
    y = iris.data[:, 0]

    occ = OutputCodeClassifier(LinearSVC())
    assert_raise_message(ValueError, "Unknown label type", occ.fit, X, y)

    occ = OutputCodeClassifier(LinearSVC(), code_size=-1)
    expected = "code_size should be greater than 0, got -1"
    assert_raise_message(ValueError, expected, occ.fit, X, y)
def test_ecoc_fit_predict():
    # ECOC must work both with a base estimator exposing decision_function
    # (LinearSVC) and one exposing predict_proba (MultinomialNB); in each
    # case code_size=2 yields n_classes * 2 fitted binary estimators.
    for base_estimator in (LinearSVC(), MultinomialNB()):
        ecoc = OutputCodeClassifier(base_estimator, code_size=2)
        ecoc.fit(iris.data, iris.target).predict(iris.data)
        assert_equal(len(ecoc.estimators_), n_classes * 2)
def test_ecoc_float_y():
    # The classifier must refuse continuous targets ("Unknown label type")
    # and a non-positive code_size, raising ValueError with a clear message.
    X, y = iris.data, iris.data[:, 0]

    with pytest.raises(ValueError, match="Unknown label type"):
        OutputCodeClassifier(LinearSVC()).fit(X, y)

    with pytest.raises(ValueError,
                       match="code_size should be greater than 0, got -1"):
        OutputCodeClassifier(LinearSVC(), code_size=-1).fit(X, y)
def menu(mode, mult_clf_mode, bin_clf_mode):
    """Dispatch training/testing from the three CLI flags.

    bin_clf_mode selects the binary base classifier, mult_clf_mode the
    multiclass wrapper, and mode whether to train or test.  Any invalid
    flag prints a usage hint (Portuguese) and exits.
    """
    # Third argument: binary base classifier.
    if bin_clf_mode == "--svc":
        bin_clf = svm.SVC(class_weight='balanced')
        filename = "svc"
    elif bin_clf_mode == "--mlp":
        bin_clf = MLPClassifier()
        filename = "mlp"
    else:
        print("Escolha o terceiro argumento como --svc ou --mlp")
        exit()

    # Second argument: multiclass strategy wrapping the base classifier.
    if mult_clf_mode == "--ovr":
        mult_clf = OneVsRestClassifier(bin_clf, n_jobs=-1)
        filename = "ovr_" + filename
    elif mult_clf_mode == "--ovo":
        mult_clf = OneVsOneClassifier(bin_clf, n_jobs=-1)
        filename = "ovo_" + filename
    elif mult_clf_mode == "--eoc":
        mult_clf = OutputCodeClassifier(bin_clf, code_size=3.0, n_jobs=-1)
        filename = "eoc_" + filename
    else:
        print("Escolha o segundo argumento como --ovr ou --ovo ou --eoc")
        exit()

    # First argument: train a new model or evaluate a saved one.
    if mode == "--train":
        training(mult_clf, filename)
    elif mode == "--test":
        test(filename)
    else:
        print("Escolha o primeiro argumento como --train ou --test")
        exit()
def test_ecoc_float_y():
    # ECOC should raise on continuous (float) targets.
    X, y = iris.data, iris.data[:, 0]
    occ = OutputCodeClassifier(LinearSVC())
    assert_raise_message(ValueError, "Unknown label type", occ.fit, X, y)
def test_ecoc_gridsearch():
    # Grid-searching over the wrapped estimator's C must select one of the
    # candidate values and expose it on the fitted sub-estimators.
    candidate_Cs = [0.1, 0.5, 0.8]
    ecoc = OutputCodeClassifier(LinearSVC(random_state=0), random_state=0)
    search = GridSearchCV(ecoc, {'estimator__C': candidate_Cs})
    search.fit(iris.data, iris.target)
    assert_true(search.best_estimator_.estimators_[0].C in candidate_Cs)
def run_test(**kwargs):
    """Train and score an ECOC/LinearSVC model on HOG features of the faces.

    Returns
    -------
    tuple : (training wall-clock seconds, test-set accuracy).
    """
    faces = fetch_sw_orl()

    def _hog_features(images):
        # One HOG descriptor per image.  hog() returns (descriptor, image)
        # because visualize=True, so keep only the descriptor.
        return [hog(img.reshape(faces.shape), orientations=8,
                    pixels_per_cell=(PPC, PPC), cells_per_block=(1, 1),
                    visualize=True, multichannel=False)[0]
                for img in images]

    tic = time.time()
    # Stratified 80/20 split so every subject appears in both parts.
    X_train, X_test, y_train, y_true = train_test_split(
        faces.data, faces.target, test_size=0.2, stratify=faces.target)
    clf = OutputCodeClassifier(LinearSVC(random_state=0), code_size=2)
    clf.fit(_hog_features(X_train), y_train)
    tok = time.time()

    y_pred = clf.predict(_hog_features(X_test))
    return tok - tic, accuracy_score(y_true, y_pred)
def get_classifier_by_type(clftype, model_train_feature, model_train_label,
                           Classifier, kwargs):
    """ Get classifiers """
    # Fit `Classifier(**kwargs)` on the training data, optionally wrapped in
    # one of sklearn's multiclass meta-estimators selected by `clftype`:
    #   'multiclass' - the estimator's own multiclass handling
    #   'onevsrest'  - one-vs-rest wrapper
    #   'onevsone'   - one-vs-one wrapper (n_jobs=-1: all cores)
    #   'occ'        - error-correcting output codes, code_size=2
    # Any other value prints an error and terminates the process.
    # NOTE: Python 2 print-statement syntax throughout.
    print 'Train multi-class classifiers, type = %s' % (clftype)
    if clftype == 'multiclass':
        clf = Classifier(**kwargs).fit(model_train_feature, model_train_label)
    elif clftype == 'onevsrest':
        from sklearn.multiclass import OneVsRestClassifier
        clf = OneVsRestClassifier(Classifier(**kwargs)).fit(
            model_train_feature, model_train_label)
    elif clftype == 'onevsone':
        from sklearn.multiclass import OneVsOneClassifier
        clf = OneVsOneClassifier(Classifier(**kwargs),
                                 n_jobs=-1).fit(model_train_feature,
                                                model_train_label)
    elif clftype == 'occ':
        from sklearn.multiclass import OutputCodeClassifier
        clf = OutputCodeClassifier(Classifier(**kwargs), code_size=2,
                                   random_state=0).fit(model_train_feature,
                                                       model_train_label)
    else:
        print 'Unsupported clf type:', clftype
        sys.exit(1)
    return clf
def af_vecAvg_MaxEnt_OutputCode(data):
    # Averaged word2vec features -> logistic regression wrapped in an
    # error-correcting output code (code_size=10); run through Job CV.
    vectorizer = Word2VecTransformer(
        fld.get_path(fld.model_meta_data, fl_word_vectors), dim=300,
        all_text_data=list(data.df[data.fs_ind]))
    model = OutputCodeClassifier(LogisticRegression(), code_size=10)
    pipeline = Pipeline(steps=[("vecAvg", vectorizer), ('m', model)])
    job = Job('af_vecAvg_MaxEnt_OutputCode', cv=cv_n_fold)
    job.run(pipeline, dict(m__estimator__C=[0.01]), data)
    return None
def voting_classifier():
    """Build a VotingClassifier ensemble and a hyper-parameter search over it.

    Members come from the module-level `classifier_names`; the voting mode is
    derived from `classification_method` and the search strategy from
    `tuning_method`.  Returns the unfitted GridSearchCV/RandomizedSearchCV.
    """
    # create the classifier objects
    f_sel = SVC(C=1, kernel='linear', shrinking=True, class_weight='balanced')
    classifiers = {
        'knn':Pipeline([('f_sel',SelectFromModel(f_sel)), ('clf',KNeighborsClassifier())]),
        'logistic':LogisticRegression(),
        'lda':LinearDiscriminantAnalysis(),
        'svm':Pipeline([('f_sel',SelectFromModel(f_sel)), ('clf',SVC())]),
        'tree':DecisionTreeClassifier(),
        'randomforest':RandomForestClassifier(),
        'extratrees':ExtraTreesClassifier(),
        'gradboost':GradientBoostingClassifier(),
        'adaboost':AdaBoostClassifier(),
        'mlp':MLPClassifier(),
        'ecoc':OutputCodeClassifier(SVC(C=2,kernel='linear',shrinking=True,probability=True,class_weight='balanced'), code_size=2)}
    # create ensemble of the classifiers (keep only the configured names)
    clfs = []
    [clfs.append((name,classifiers.get(name))) for name in classifier_names]
    # create the voting classifier
    # NOTE(review): assumes `classification_method` starts with 'hard'/'soft'
    # so the first four characters are a valid `voting` value — confirm.
    voting_type = classification_method[0:4]
    eclf = VotingClassifier(estimators=clfs, voting=voting_type)
    # specify parameters of the classifiers
    # (numbers in the trailing comments are previously observed accuracies)
    param_set = {}
    if 'knn' in classifier_names:
        #89.9,90.8 'n_neighbors':17, 'p':1, 'weights':'distance'
        param_set.update({'knn__clf__n_neighbors':[17], 'knn__clf__p':[1],
                          'knn__clf__weights':['distance'],
                          'knn__clf__algorithm':['auto'], 'knn__clf__n_jobs':[3]})
    if 'logistic' in classifier_names:
        #94.4 'C':1, 'solver':'newton-cg'
        param_set.update({'logistic__C':[2], 'logistic__solver':['lbfgs'],
                          'logistic__class_weight':['balanced'],
                          'logistic__max_iter':[100]})
    if 'lda' in classifier_names:
        #94.9 'solver':'lsqr'
        param_set.update({'lda__solver':['lsqr'], 'lda__shrinkage':['auto']})
    if 'svm' in classifier_names:
        #95.3 'C':1, 'kernel':'linear'
        param_set.update({'svm__clf__C':[2], 'svm__clf__kernel':['linear'],
                          'svm__clf__shrinking':[True], 'svm__clf__probability':[True],
                          'svm__clf__class_weight':['balanced'],
                          'svm__clf__decision_function_shape':['ovo']})
    if 'tree' in classifier_names:
        #82.3 'max_depth':15
        param_set.update({'tree__max_depth':[10,15,20], 'tree__class_weight':['balanced'],
                          'tree__presort':[True]})
    if 'randomforest' in classifier_names:
        #91.8 'n_estimators':300, 'min_samples_leaf':None, 'max_depth':25
        param_set.update({'randomforest__n_estimators':[100],
                          'randomforest__max_features':[10,25,50],
                          'randomforest__min_samples_leaf':[50]
                          ,'randomforest__max_depth':[None],
                          'randomforest__bootstrap':[True],
                          'randomforest__class_weight':['balanced'],
                          'randomforest__oob_score':[True],
                          'randomforest__n_jobs':[3]})
    if 'extratrees' in classifier_names:
        #92.8 'n_estimators':500, 'max_depth':50
        param_set.update({'extratrees__n_estimators':[300],
                          'extratrees__max_features':['auto'],
                          'extratrees__min_samples_leaf':[50],
                          'extratrees__max_depth':[None],
                          'extratrees__bootstrap':[False],
                          'extratrees__class_weight':['balanced'],
                          'extratrees__oob_score':[False],
                          'extratrees__n_jobs':[3]})
    if 'gradboost' in classifier_names:
        #92.3 'n_estimators':100, 'learning_rate':0.1, 'min_samples_leaf':50
        param_set.update({'gradboost__n_estimators':[100],
                          'gradboost__max_features':['auto'],
                          'gradboost__learning_rate':[0.1],
                          'gradboost__min_samples_leaf':[50]})
    if 'adaboost' in classifier_names:
        param_set.update({'adaboost__n_estimators':[100],
                          'adaboost__learning_rate':[0.1]})
    if 'mlp' in classifier_names:
        # 95.0 'hidden_layer_sizes':(50,), 'alpha':10, 'solver':'lbfgs'
        param_set.update({'mlp__hidden_layer_sizes':[(50,)], 'mlp__alpha':[10],
                          'mlp__solver':['lbfgs']})
    # run grid search or randomized search
    # NOTE(review): any other `tuning_method` leaves `search` undefined and
    # the return raises NameError — confirm callers only pass grid/rand.
    if tuning_method=='grid':
        search = GridSearchCV(eclf, param_grid=param_set, cv=2, n_jobs=3)
    elif tuning_method=='rand':
        search = RandomizedSearchCV(eclf, param_distributions=param_set,
                                    n_iter=10, cv=2, n_jobs=3)
    return search
def clasificar_ECOC(X, y, df, trainInputs, trainOutputs, testInputs, testOutputs, graphname):
    """Fit an error-correcting-output-codes classifier over decision trees.

    Trains on (trainInputs, trainOutputs), prints the train/test CCR
    (accuracy) and the test predictions, and returns the test accuracy.
    X, y and df are kept for signature compatibility with sibling
    classifiers but are not used here.
    """
    print("\n[" + str(graphname) + "]")
    # Removed unused local `kernelRBF = 1.0*RBF(1.0)` — leftover from a
    # Gaussian-process variant; it was never referenced.
    clf = OutputCodeClassifier(estimator=DecisionTreeClassifier())
    clf = clf.fit(trainInputs, trainOutputs)
    precisionTrain = clf.score(trainInputs, trainOutputs)
    precisionTest = clf.score(testInputs, testOutputs)
    print("\tCCR train = %.2f%% | CCR test = %.2f%%" % (precisionTrain*100, precisionTest*100))
    prediccion_test = clf.predict(testInputs)
    print(prediccion_test)
    print(testOutputs)
    return precisionTest
def aa_tfidf_MaxEnt_OutputCode(data):
    # Tf-idf features -> logistic regression wrapped in an error-correcting
    # output code (code_size=10); run through the Job cross-validation.
    features = TfidfVectorizer(stop_words='english', max_features=2000, min_df=5)
    model = OutputCodeClassifier(LogisticRegression(), code_size=10)
    grid = dict(tfidf__norm=['l2'],
                tfidf__ngram_range=[(1, 2)],
                m__estimator__C=[0.01])
    job = Job('aa_tfidf_MaxEnt_OutputCode', cv=cv_n_fold)
    job.run(Pipeline(steps=[("tfidf", features), ('m', model)]), grid, data)
    return None
def ab_tfidf_elasticnet_OutputCode(data):
    # Tf-idf features -> elastic-net SGD classifier wrapped in a wide
    # (code_size=100) error-correcting output code.
    vectorizer = TfidfVectorizer(stop_words='english', min_df=5)
    sgd = SGDClassifier(penalty="elasticnet")
    pipeline = Pipeline(steps=[("tfidf", vectorizer),
                               ('elnet', OutputCodeClassifier(sgd, code_size=100))])
    grid = dict(tfidf__norm=['l2'],
                tfidf__ngram_range=[(1, 2)],          # also tried: [(1, 3)]
                elnet__estimator__alpha=[0.0001],     # [1e-5, 1e-4, 1e-3, 1e-2, 1e-1]
                elnet__estimator__l1_ratio=[0.1])     # [0.1, 0.5, 0.8, 0.9, 0.99]
    job = Job('ab_tfidf_elasticnet_OutputCode', cv=cv_n_fold)
    job.run(pipeline, grid, data)
    return None
def train(corpus):
    """Train an ECOC-wrapped LinearSVC on features parsed from *corpus*.

    Returns
    -------
    (clf, vec) : the fitted OutputCodeClassifier and its DictVectorizer
        (needed later to transform new feature dicts the same way).
    """
    # Renamed local `time` -> `start_time`: it shadowed the stdlib module name.
    start_time = datetime.datetime.now()
    logging.info('Static Embedding Oracle')
    Y, X_dic = EmbeddingOracle.parseCorpus(corpus.trainingSents, EmbeddingOracle)
    vec = DictVectorizer()
    X = vec.fit_transform(X_dic)
    clf = OutputCodeClassifier(LinearSVC(random_state=0), code_size=2,
                               random_state=0)
    clf.fit(X, Y)
    # Fixed typo in the log message ("Traingin" -> "Training"); value is
    # elapsed whole minutes.
    logging.info('Training Time: '
                 + str(int((datetime.datetime.now() - start_time).seconds / 60.)))
    return clf, vec
def test_ecoc_delegate_sparse_base_estimator():
    # Non-regression test for
    # https://github.com/scikit-learn/scikit-learn/issues/17218
    # ECOC must delegate sparse-input support to its base estimator.
    X, y = iris.data, iris.target
    X_sp = sp.csc_matrix(X)

    # A base estimator configured to reject sparse input.
    dense_only = CheckingClassifier(
        check_X=check_array,
        check_X_params={"ensure_2d": True, "accept_sparse": False},
    )
    ecoc = OutputCodeClassifier(dense_only, random_state=0)
    with pytest.raises(TypeError, match="A sparse matrix was passed"):
        ecoc.fit(X_sp, y)
    ecoc.fit(X, y)
    with pytest.raises(TypeError, match="A sparse matrix was passed"):
        ecoc.predict(X_sp)

    # With a sparse-capable base estimator the whole pipeline must run.
    ecoc = OutputCodeClassifier(LinearSVC(random_state=0))
    ecoc.fit(X_sp, y).predict(X_sp)
    assert len(ecoc.estimators_) == 4
def scikit_outputcode(X, y, X_test, y_test=None): from sklearn.multiclass import OutputCodeClassifier from sklearn.svm import LinearSVC predictions = OutputCodeClassifier(LinearSVC(random_state=0), code_size=2, random_state=0).fit(X, y).predict(X_test) correctcount = 0 totalcount = 0 for index, each in enumerate(predictions): if y_test[index] == each: correctcount += 1 totalcount += 1 print str(correctcount) + " / " + str(totalcount) + " = " + str( float(correctcount) / totalcount)
def ECOC():
    # Evaluate the error-correcting-output-codes strategy (kNN base
    # estimator, k=5) on every ARFF dataset listed in `lista_datasets`.
    print('Aplicando metodo multiclase ERROR CORRECTING OUTPUT CODES')
    for indice in lista_datasets:
        print('Base de datos: ' + str(indice))
        raw = arff.loadarff('./datasets/' + str(indice))
        frame = pd.DataFrame(raw[0])
        features = frame.iloc[:, frame.columns != 'class']
        targets = pd.factorize(frame['class'])[0]
        X_train, X_test, Y_train, Y_test = train_test_split(features, targets,
                                                            test_size=0.25)
        clf = OutputCodeClassifier(KNeighborsClassifier(n_neighbors=5),
                                   code_size=2, random_state=0)
        clf.fit(X_train, Y_train)
        print('Porcentaje de bien clasificados ERROR CORRECTING OUTPUT CODES')
        print(clf.score(X_test, Y_test))
        print('--------------------------')
def _model1(self, visDataObjects, features, labels):
    """Ted's round one.

    Find max margin in:
        for t in vis_types:
            for x in columns:
                yield margin(x_axis | t, x)
    Repeat for y. Then we basis so (independently) pick the best axis
    assignment for a chart type.
    """
    from sklearn.multiclass import OutputCodeClassifier
    from sklearn.svm import LinearSVC
    # NOTE(review): stub — the classifier is constructed but never fitted or
    # returned, and visDataObjects/features/labels are unused so far.
    clf = OutputCodeClassifier(LinearSVC(random_state=0), code_size=2,
                               random_state=0)
    pass
def __init__(self, labels, data, load=False, save=False):
    """Build (or load from disk) a DictVectorizer + ECOC/LinearSVC classifier.

    Parameters
    ----------
    labels : training labels, one per sample in `data`.
    data : sequence of feature dicts fed to DictVectorizer.
    load : if True, unpickle classifier/vectorizer from the module-level
        paths `clfData`/`vecData` and skip training entirely.
    save : if True, pickle the freshly trained classifier/vectorizer.
    """
    if load:
        with open(clfData, 'rb') as input:
            self.classifier = pickle.load(input)
        with open(vecData, 'rb') as input:
            self.verctorizer = pickle.load(input)
        return
    # NOTE(review): attribute name `verctorizer` is misspelled but kept —
    # code outside this block may already reference it.
    self.verctorizer = DictVectorizer()
    featureVec = self.verctorizer.fit_transform(data)
    self.classifier = OutputCodeClassifier(LinearSVC(random_state=0),
                                           code_size=2, random_state=0)
    # self.classifier = LogisticRegression( solver='sag')
    self.classifier.fit(featureVec, labels)
    if save:
        with open(clfData, 'wb') as output:
            pickle.dump(self.classifier, output, pickle.HIGHEST_PROTOCOL)
        with open(vecData, 'wb') as output:
            pickle.dump(self.verctorizer, output, pickle.HIGHEST_PROTOCOL)
def evaluateOutputCode(X, Y, printReport=False): time = datetime.datetime.now() X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42) clf = OutputCodeClassifier(LinearSVC(random_state=0), code_size=2, random_state=0) clf.fit(X_train, Y_train) if printReport: print 'Training time:' + str(datetime.datetime.now() - time) print 'Evaluation result: OneVsOne: ' + str( clf.score(X_test, Y_test)) Y_test = clf.predict(X_test) if printReport: print '0: ' + str((Y_test == 0).sum()) print '1: ' + str((Y_test == 1).sum()) print '2: ' + str((Y_test == 2).sum()) return [clf.score(X_test, Y_test), (Y_test == 1).sum(), clf]
def _multiclass_refit(self, clf):
    """Return advanced choices of the classification method"""
    # Wrap `clf` in the multiclass meta-estimator named by
    # self.args.multiclass; an unrecognized value returns `clf` unchanged.
    strategy = self.args.multiclass
    if strategy == 'one-vs-rest':
        from sklearn.multiclass import OneVsRestClassifier
        print('[ML] Using one-vs-rest method to re-train')
        return OneVsRestClassifier(clf)
    if strategy == 'one-vs-one':
        from sklearn.multiclass import OneVsOneClassifier
        # one-vs-one cannot produce class probabilities, so get_prob is
        # forced off before re-training.
        self.args.get_prob = False
        print('[ML] Using one-vs-one method to re-train')
        print('[ML] WARNING: Set get_prob to False')
        return OneVsOneClassifier(clf)
    if strategy == 'error-correcting':
        from sklearn.multiclass import OutputCodeClassifier
        print('[ML] Using error-correcting method to re-train')
        return OutputCodeClassifier(clf, code_size=2)
    return clf
def OutputCodeClassifier(data, label, pred_data, pred_last):
    '''Fit an ECOC-wrapped LinearSVC and report train/test accuracy.

    Recorded results (from the original notes):
        0.76473194506
        Number of mislabeled points out of a total 841 points : 211
        0.749108204518
    Inputs need normalization.  (Translated from the original Chinese note.)

    NOTE(review): this function shadows sklearn's OutputCodeClassifier
    class at module scope; the local import below is what keeps the
    constructor call inside working.  (Python 2 print syntax.)
    '''
    data = np.array(data)
    pred_data = np.array(pred_data)
    label = np.array(label)
    pred_last = np.array(pred_last)
    from sklearn.multiclass import OutputCodeClassifier
    from sklearn.svm import LinearSVC
    clf = OutputCodeClassifier(LinearSVC(random_state=0), code_size=2,
                               random_state=0)
    clf.fit(data, label)
    print clf.score(data, label)  # training-set accuracy
    pred_result = clf.predict(pred_data)
    print("Number of mislabeled points out of a total %d points : %d" %
          (pred_data.shape[0], (pred_last != pred_result).sum()))
    print clf.score(pred_data, pred_last)  # held-out accuracy
    return pred_result
def getFitness(individual, X, y):
    """ Feature subset fitness function """
    # Fitness = mean 5-fold CV accuracy of a classifier trained on the
    # feature subset encoded by `individual` (0 = drop column, else keep).
    # An individual that selects no feature at all scores 0.
    if individual.count(0) == len(individual):
        return (0,)

    # Indices of the columns switched off by this individual.
    cols = [index for index in range(len(individual)) if individual[index] == 0]
    # Drop them and one-hot-encode the remainder.
    X_parsed = X.drop(X.columns[cols], axis=1)
    X_subset = pd.get_dummies(X_parsed)

    # Fix: the original stacked ~40 dead `clf = ...()` assignments where only
    # the last could matter, and several of them (ClassifierChain(),
    # MultiOutputClassifier(), OneVsOneClassifier(), OneVsRestClassifier(),
    # OutputCodeClassifier(), StackingClassifier(), VotingClassifier())
    # raise TypeError when constructed without a base estimator, so fit was
    # never reached.  Keep a single classifier that constructs cleanly.
    clf = SVC()
    clf.fit(X_subset, y)
    return (avg(cross_val_score(clf, X_subset, y, cv=5)),)
def test_ecoc_exceptions():
    # predict() on a never-fitted ECOC must raise ValueError.
    unfitted = OutputCodeClassifier(LinearSVC(random_state=0))
    assert_raises(ValueError, unfitted.predict, [])
def __init__(self, X, y, people, df_features, feature_names, conf_dict):
    """Store the dataset and build the classifier / feature-selection registries.

    X, y : training matrix and labels (y is also kept as a numpy array).
    people : per-sample metadata frame (index reset for positional access).
    df_features, feature_names : feature frame and its column names.
    conf_dict : run configuration (feature_selection flag, num_features,
        one_vs_all_type).
    """
    self.X = X
    self.y = np.array(y)
    self.feature_names = feature_names
    self.people = people.reset_index(drop=True)
    self.X_df = df_features
    self.y_df = y.reset_index(drop=True)
    # NOTE(review): `config_dict` below is a module-level dict, distinct from
    # the `conf_dict` parameter used further down — confirm the mix is
    # intentional.
    self.app_list = config_dict["app_list"]
    self.labels_numeric = {name: i for i, name in enumerate(self.app_list)}
    self.n_classes = len(self.labels_numeric)
    self.clf_name = config_dict["classifier"]
    # self.feature_selection = config_dict["feature_selection"] #True/False
    # self.num_features = config_dict["num_features"]
    # self.one_vs_all_type = config_dict["one_vs_all_type"]
    self.feature_selection = conf_dict["feature_selection"] #True/False
    self.num_features = conf_dict["num_features"]
    self.one_vs_all_type = conf_dict["one_vs_all_type"]
    self.chosen_feature_names = None
    self.chosen_features_all_folds = []
    # Registry of candidate classifiers, keyed by short name.
    self.clf_dict = {}
    #self.clf_dict["one_vs_all"] = OneVsRestClassifier(SVC(kernel='rbf', C=1000, gamma=0.001))
    self.clf_dict["output_code"] = OutputCodeClassifier(
        SVC(kernel='rbf', C=1000, gamma=0.001), code_size=2, random_state=0)
    params_rf = {
        'n_estimators': 100,
        'max_depth': 20,
        'max_features': 'sqrt',
        'min_samples_leaf': 1,
        'min_samples_split': 10,
        'random_state': 0
    }
    #params_rf = {'n_estimators': 100, 'min_samples_split': 10, 'min_samples_leaf': 3, 'max_features': 'auto', 'max_depth': 20, 'random_state': 0}
    self.clf_dict["rf"] = RandomForestClassifier(**params_rf)
    # self.clf_dict["svm"] = SVC(kernel='rbf', C=1000, gamma=0.001)
    # self.clf_dict["svm"] = SVC(kernel='linear', C=1, gamma=0.001)
    params_svm = {
        'C': 10,
        'degree': 2,
        'gamma': 'scale',
        'kernel': 'sigmoid'
    }
    self.clf_dict["svm"] = SVC(**params_svm)
    # Naive Bayes classifier is a general term which refers to conditional
    # independence of each of the features in the model, while Multinomial
    # Naive Bayes classifier is a specific instance which uses a multinomial
    # distribution for each of the features.
    self.clf_dict["nb"] = MultinomialNB(alpha=0.00001)
    self.clf_dict["gnb"] = GaussianNB(var_smoothing=0.05)
    self.clf_dict["knn"] = KNeighborsClassifier(n_neighbors=8)
    params_dt = {
        'criterion': 'gini',
        'max_depth': 20,
        'max_features': 'auto',
        'min_samples_leaf': 2,
        'min_samples_split': 2,
        'random_state': 42,
        'splitter': 'best'
    }
    self.clf_dict["dt"] = DecisionTreeClassifier(**params_dt)
    # one-vs-all wraps whichever base classifier the config names.
    self.clf_dict["one_vs_all"] = OneVsRestClassifier(
        self.clf_dict[conf_dict["one_vs_all_type"]])
    # Registry of feature-selection strategies.
    self.fs_dict = {}
    self.fs_dict["selectKbest_chi2"] = SelectKBest(chi2, k=self.num_features)
    self.fs_dict["selectKbest_fclassif"] = SelectKBest(f_classif,
                                                      k=self.num_features)
WordNetLemmatizer().lemmatize(re.sub('[^A-Za-z]', ' ', w)) for w in entry['ingredients'] ] test_ingredients.append(' '.join(ings)) #used to encode labels as numbers for use with RandomForestClassifier le = LabelEncoder() #encode cuisines as numbers train_cuisines = le.fit_transform(train_cuisines) #used to create bag of ingredients vocabulary and create features for each entry vectorizer = CountVectorizer() train_features = vectorizer.fit_transform(train_ingredients).toarray() test_features = vectorizer.transform(test_ingredients).toarray() clf = OutputCodeClassifier(LinearSVC(random_state=0), code_size=2, random_state=2) result = clf.fit(train_features, train_cuisines).predict(test_features) output = pd.DataFrame(data={ 'id': test_ids, 'cuisine': le.inverse_transform(result) }) #force explicit ordering of columns output = output[['id', 'cuisine']] output.to_csv('ecoc.csv', index=False)
def main():
    """Run 10-fold ECOC/SGD genre classification on MFCC+HCDF features.

    Loads pickled feature vectors and a CSV of labels, shuffles them
    together, cross-validates an OutputCodeClassifier over SGD, writes the
    per-fold zero-one losses and the (binarized) ECOC code book to CSV, and
    prints the accumulated confusion matrix inputs.  (Python 2 syntax:
    print statements, range() treated as a shuffleable list, old KFold API.)
    """
    filenameLB = 'mfcc_lb.csv'
    allsongcat = pickle.load(open('mfcc_fv.p', 'rb'))
    hcdf = pickle.load(open('hcdf_fv.p', 'rb'))
    # Last row of the CSV wins: `labels` ends up as the final row read.
    with open('mfcc_lb.csv') as f:
        reader = csv.reader(f)
        for row in reader:
            labels = row
    # select training and test sets
    '''
    TEidx = np.array(random.sample(range(0,1000), 100))
    training = []
    test = []
    trainingLB = []
    testLB = []
    # make numpy arrays
    for i in range(1000):
        if i in TEidx:
            test.append(featureDict[i])
            testLB.append(int(labels[i]))
        else:
            training.append(featureDict[i])
            trainingLB.append(int(labels[i]))
    # fit with classifier and predict
    X = np.array(training)
    Y = np.array(trainingLB)
    '''
    # Concatenate the two feature sets, then shuffle features and labels in
    # lockstep through a shuffled index list.
    l = [allsongcat, hcdf]
    all_feats = combineFeatures(l)
    feats_shuf = []
    labels_shuf = []
    index_shuf = range(len(labels))
    shuffle(index_shuf)
    for i in index_shuf:
        feats_shuf.append(all_feats[i])
        labels_shuf.append(labels[i])
    X = np.array(feats_shuf)
    Y = np.array(labels_shuf)
    kf = KFold(1000, n_folds=10)
    #rf = RandomForestClassifier(n_estimators=50, max_features = 'log2')
    sgd = SGDClassifier(loss="hinge", penalty="l2")
    #svc = svm.SVC(kernel='linear')
    # NOTE(review): dtree and lsvc are built but unused — alternatives kept
    # around from earlier experiments.
    dtree = DecisionTreeClassifier(max_depth=3)
    lsvc = LinearSVC(random_state=0)
    cla = OutputCodeClassifier(sgd, code_size=128, random_state=0)
    cm_all = np.zeros((10, 10), dtype=np.int)
    cb = np.zeros((10, 20))
    losses = []
    with open('ECOC_sgd_error.csv', 'w') as f1:
        wrtest = csv.writer(f1, quoting=csv.QUOTE_NONNUMERIC,
                            lineterminator='\n')
        scores = 0.0
        for train, test in kf:
            X_train, X_test, y_train, y_test = X[train], X[test], Y[train], Y[
                test]
            cla.fit(X_train, y_train)
            predictions = cla.predict(X_test)
            loss = zero_one_loss(predictions, y_test)
            losses.append(loss)
            scores += loss
            # print y_test
            # print predictions
            cb = cla.code_book_
            np.savetxt('codebook.csv', cb, delimiter=',')
            # Compute confusion matrix
            cm = confusion_matrix(
                y_test,
                predictions,
                labels=['1', '2', '3', '4', '5', '6', '7', '8', '9', '10'])
            np.set_printoptions(precision=2)
            #print(cm_all)
            cm_all = np.add(cm_all, cm)
        # make ECOC coding matrix 0-1 binary
        cb[cb <= 0] = 0
        wrtest.writerow(losses)
        print cb
        # NOTE(review): `scores` accumulates zero-one LOSS, so this prints
        # mean loss (not accuracy) over the 10 folds.
        print scores / 10
def single_classifier(clf_name):
    """Build a hyper-parameter search over one named classifier.

    The classifier selected by `clf_name` is placed in a Pipeline behind the
    feature-selection step chosen by the module-level `f_sel_method`; the
    search object (GridSearchCV or RandomizedSearchCV, per `tuning_method`)
    is returned unfitted.
    """
    # create the classifier objects
    classifiers = {
        'knn':KNeighborsClassifier(),
        'logistic':LogisticRegression(),
        'lda':LinearDiscriminantAnalysis(),
        'svm':SVC(),
        'tree':DecisionTreeClassifier(),
        'randomforest':RandomForestClassifier(),
        'extratrees':ExtraTreesClassifier(),
        'gradboost':GradientBoostingClassifier(),
        'adaboost':AdaBoostClassifier(),
        'mlp':MLPClassifier(),
        'ecoc':OutputCodeClassifier(SVC(C=2,kernel='linear',shrinking=True,class_weight='balanced'), code_size=2)}
    # feature selection using a pipeline
    if f_sel_method=='none':
        pipe = Pipeline([('clf',classifiers[clf_name])])
        param_set = {}
    elif f_sel_method=='anova':
        pipe = Pipeline([('f_sel',SelectPercentile(score_func=f_classif)),
                         ('clf',classifiers[clf_name])])
        param_set = {'f_sel__percentile':[25,50,75,100]}
    elif f_sel_method=='mutualinfo':
        pipe = Pipeline([('f_sel',SelectPercentile(score_func=mutual_info_classif)),
                         ('clf',classifiers[clf_name])])
        param_set = {'f_sel__percentile':[25,50,75,100]}
    elif f_sel_method=='recursivesvm':
        f_sel = SVC(C=1, kernel='linear', shrinking=True, class_weight='balanced')
        pipe = Pipeline([('f_sel',RFECV(estimator=f_sel)),
                         ('clf',classifiers[clf_name])])
        param_set = {'f_sel__step':[10], 'f_sel__cv':[2], 'f_sel__scoring':['accuracy']}
    elif f_sel_method=='frommodelsvm':
        f_sel = SVC(C=1, kernel='linear', shrinking=True, class_weight='balanced')
        pipe = Pipeline([('f_sel',SelectFromModel(f_sel)),
                         ('clf',classifiers[clf_name])])
        param_set = {}
    elif f_sel_method=='frommodeltree':
        f_sel = ExtraTreesClassifier(n_estimators=100, class_weight='balanced')
        pipe = Pipeline([('f_sel',SelectFromModel(f_sel)),
                         ('clf',classifiers[clf_name])])
        param_set = {}
    # specify parameters of the classifiers
    # (numbers in the comments are previously observed accuracies)
    if clf_name=='knn':
        #89.9,90.8 'n_neighbors':17, 'p':1, 'weights':'distance'
        param_set.update({'clf__n_neighbors':[1,9,13,17,25,50], 'clf__p':[1,2,3,5],
                          'clf__weights':['distance'], 'clf__algorithm':['auto'],
                          'clf__n_jobs':[3]})
    elif clf_name=='logistic':
        #94.4 'C':1, 'solver':'newton-cg'
        param_set.update({'clf__C':[1,2,3,4], 'clf__solver':['newton-cg'],
                          'clf__class_weight':['balanced'], 'clf__max_iter':[100]})
    elif clf_name=='lda':
        #94.9 'solver':'lsqr'
        param_set.update({'clf__solver':['lsqr','eigen'], 'clf__shrinkage':['auto']})
    elif clf_name=='svm':
        #95.3 'C':1, 'kernel':'linear'
        param_set.update({'clf__C':[0.75,1,1.25,1.5,2], 'clf__kernel':['linear'],
                          'clf__shrinking':[True], 'clf__probability':[False],
                          'clf__class_weight':['balanced'],
                          'clf__decision_function_shape':['ovr']})
    elif clf_name=='tree':
        #82.3 'max_depth':15
        param_set.update({'clf__min_samples_leaf':[10,50,75,100],
                          'clf__class_weight':['balanced'], 'clf__presort':[True]})
    elif clf_name=='randomforest':
        #91.8 'n_estimators':300, 'min_samples_leaf':None, 'max_depth':25
        param_set.update({'clf__n_estimators':[500,1000], 'clf__max_features':[5,10,25],
                          'clf__min_samples_leaf':[1,10,25] ,'clf__max_depth':[None],
                          'clf__bootstrap':[True], 'clf__class_weight':['balanced'],
                          'clf__oob_score':[False], 'clf__n_jobs':[3]})
    elif clf_name=='extratrees':
        #92.8 'n_estimators':500, 'max_depth':50
        param_set.update({'clf__n_estimators':[100,500,1000],
                          'clf__max_features':[5,10,20,25,50,100,150],
                          'clf__min_samples_leaf':[1,10,25,50,100],
                          'clf__max_depth':[None], 'clf__bootstrap':[False],
                          'clf__class_weight':['balanced'], 'clf__oob_score':[False],
                          'clf__n_jobs':[3]})
    elif clf_name=='gradboost':
        #92.3 'n_estimators':100, 'learning_rate':0.1, 'min_samples_leaf':50
        param_set.update({'clf__n_estimators':[100], 'clf__max_features':['auto'],
                          'clf__learning_rate':[0.1], 'clf__min_samples_leaf':[50]})
    elif clf_name=='adaboost':
        #57.9 'n_estimators':100, 'learning_rate':0.1
        param_set.update({'clf__n_estimators':[100,500],
                          'clf__learning_rate':[0.01,0.1]})
    elif clf_name=='mlp':
        #95.0 'hidden_layer_sizes':(50,), 'alpha':10, 'solver':'lbfgs'
        param_set.update({'clf__hidden_layer_sizes':[(50,),(60,),(100,)],
                          'clf__alpha':[0.5,1,2,5,7], 'clf__solver':['adam']})
    elif clf_name=='ecoc':
        # ECOC is run with its fixed constructor settings only.
        param_set.update({})
    # run grid search or randomized search
    # NOTE(review): an unrecognized `tuning_method` leaves `search` undefined
    # and the return raises NameError — confirm only grid/rand are passed.
    if tuning_method=='grid':
        search = GridSearchCV(pipe, param_grid=param_set, cv=2, n_jobs=3)
    elif tuning_method=='rand':
        search = RandomizedSearchCV(pipe, param_distributions=param_set, n_iter=10,
                                    cv=2, n_jobs=3)
    return search
# predict predictions = classifier.predict(valid_X) accuracy_score(valid_label, predictions) from sklearn.metrics import accuracy_score accuracy_score(y_test, predictions) ## from sklearn.multiclass import OneVsRestClassifier from sklearn.ensemble import (RandomTreesEmbedding, RandomForestClassifier, GradientBoostingClassifier) from sklearn.multiclass import OutputCodeClassifier classifier = OutputCodeClassifier(GradientBoostingClassifier(max_depth=5, n_estimators=14), code_size=2, random_state=0) classifier.fit(X_train, y_train) predictions = classifier.predict(X_test) accuracy_score(y_test, predictions) # creating a confusion matrix cm = confusion_matrix(y_test, dtree_predictions) ### test data test['age_bin'] = test['age'].apply(lambda x: age_bin(x)) test = test[~test['image_name'].isin(wrong_im_test)] encode_columns_test = test[['age_bin', 'gender', 'view_position']]
def test_ecoc_exceptions():
    # predict() on a never-fitted ECOC must raise NotFittedError.
    unfitted = OutputCodeClassifier(LinearSVC(random_state=0))
    with pytest.raises(NotFittedError):
        unfitted.predict([])