def test_ecoc_fit_predict():
    """ECOC fit/predict smoke test: the wrapper must build
    n_classes * code_size underlying binary estimators."""
    # A classifier which implements decision_function.
    ecoc = OutputCodeClassifier(LinearSVC(), code_size=2)
    ecoc.fit(iris.data, iris.target).predict(iris.data)
    assert_equal(len(ecoc.estimators_), n_classes * 2)

    # A classifier which implements predict_proba.
    ecoc = OutputCodeClassifier(MultinomialNB(), code_size=2)
    ecoc.fit(iris.data, iris.target).predict(iris.data)
    assert_equal(len(ecoc.estimators_), n_classes * 2)
def train_svm(labels, array, num_folds, num_jobs, params=2):
    """Tune an SVM with randomized search and return the re-trained best model.

    With more than two classes an OutputCodeClassifier wrapping an SVC is
    tuned (parameter names prefixed ``estimator__``); otherwise a plain SVC.

    Parameters
    ----------
    labels : sequence of training class labels (one per row of `array`).
    array : feature matrix accepted by scikit-learn estimators.
    num_folds : number of cross-validation folds for the search.
    num_jobs : parallel jobs for RandomizedSearchCV.
    params : number of parameter settings sampled by RandomizedSearchCV.

    Returns
    -------
    A classifier fitted on (array, labels) with the best found parameters.
    """
    # obtain the best parameter settings for an svm outputcode classifier
    if len(labels) > 2:
        print("outputcodeclassifier")
        param_grid = {'estimator__C': [0.001, 0.005],
                      'estimator__kernel': ['linear', 'rbf'],
                      'estimator__gamma': [0.0005, 0.001],
                      'estimator__degree': [1]}
        model = OutputCodeClassifier(svm.SVC(probability=True))
    else:
        print("svc model")
        param_grid = {'C': [0.001, 0.005, 0.01, 0.5, 1, 5, 10, 50, 100, 500, 1000],
                      'kernel': ['linear', 'rbf', 'poly'],
                      'gamma': [0.0005, 0.002, 0.008, 0.032, 0.128, 0.512, 1.024, 2.048],
                      'degree': [1, 2, 3, 4]}
        model = svm.SVC(probability=True)
    paramsearch = RandomizedSearchCV(model, param_grid, cv=num_folds, verbose=2,
                                     n_iter=params, n_jobs=num_jobs)
    print("Grid search...")
    paramsearch.fit(array, numpy.asarray(labels))
    print("Prediction...")
    parameters = paramsearch.best_params_
    for parameter in parameters.keys():
        print(parameter + ": " + str(parameters[parameter]) + "\n")
    print("best score: " + str(paramsearch.best_score_) + "\n\n")
    # Re-train a classifier with the best parameters and return the FITTED
    # model. (The original returned the unfitted inner SVC in the multiclass
    # branch and never called fit at all in the binary branch.)
    if len(labels) > 2:
        base = svm.SVC(probability=True,
                       C=parameters['estimator__C'],
                       kernel=parameters['estimator__kernel'],
                       gamma=parameters['estimator__gamma'],
                       degree=parameters['estimator__degree'])
        best = OutputCodeClassifier(base, n_jobs=1)
    else:
        best = svm.SVC(probability=True,
                       C=parameters['C'],
                       kernel=parameters['kernel'],
                       gamma=parameters['gamma'],
                       degree=parameters['degree'])
    best.fit(array, labels)
    return best
def test_ecoc_fit_predict():
    """ECOC must train code_size binary problems per class (2 * n_classes here)."""
    # Base estimator exposing decision_function.
    classifier = OutputCodeClassifier(LinearSVC(random_state=0),
                                      code_size=2, random_state=0)
    classifier.fit(iris.data, iris.target).predict(iris.data)
    assert_equal(len(classifier.estimators_), n_classes * 2)

    # Base estimator exposing predict_proba.
    classifier = OutputCodeClassifier(MultinomialNB(), code_size=2,
                                      random_state=0)
    classifier.fit(iris.data, iris.target).predict(iris.data)
    assert_equal(len(classifier.estimators_), n_classes * 2)
def __init__(self, labels, data, load=False, save=False):
    """Build (or load from disk) a DictVectorizer + ECOC LinearSVC pair.

    When `load` is true the pickled classifier/vectorizer are restored from
    the clfData/vecData paths and training is skipped entirely. When `save`
    is true the freshly trained pair is pickled back to the same paths.
    """
    if load:
        with open(clfData, 'rb') as fh:
            self.classifier = pickle.load(fh)
        with open(vecData, 'rb') as fh:
            self.verctorizer = pickle.load(fh)
        return

    self.verctorizer = DictVectorizer()
    featureVec = self.verctorizer.fit_transform(data)
    self.classifier = OutputCodeClassifier(LinearSVC(random_state=0),
                                           code_size=2, random_state=0)
    # self.classifier = LogisticRegression( solver='sag')
    self.classifier.fit(featureVec, labels)

    if save:
        with open(clfData, 'wb') as fh:
            pickle.dump(self.classifier, fh, pickle.HIGHEST_PROTOCOL)
        with open(vecData, 'wb') as fh:
            pickle.dump(self.verctorizer, fh, pickle.HIGHEST_PROTOCOL)
def af_vecAvg_MaxEnt_OutputCode(data):
    """Run the word2vec-average -> logistic-regression ECOC experiment."""
    job = Job('af_vecAvg_MaxEnt_OutputCode', cv=cv_n_fold)
    averager = Word2VecTransformer(
        fld.get_path(fld.model_meta_data, fl_word_vectors),
        dim=300,
        all_text_data=list(data.df[data.fs_ind]))
    model = OutputCodeClassifier(LogisticRegression(), code_size=10)
    pipeline = Pipeline(steps=[("vecAvg", averager), ('m', model)])
    parameters = dict(m__estimator__C=[0.01])
    job.run(pipeline, parameters, data)
    return None
def voting_classifier():
    """Assemble a VotingClassifier over the globally selected `classifier_names`
    and wrap it in a grid or randomized hyper-parameter search.

    Reads module-level globals: classifier_names, classification_method (its
    first four characters give the voting type, e.g. 'soft'/'hard') and
    tuning_method ('grid' or 'rand').

    Returns the unfitted GridSearchCV / RandomizedSearchCV object.
    NOTE(review): if tuning_method is neither 'grid' nor 'rand', `search` is
    never bound and the return raises NameError — confirm callers validate it.
    """
    # create the classifier objects
    f_sel = SVC(C=1, kernel='linear', shrinking=True, class_weight='balanced')
    classifiers = {
        'knn':Pipeline([('f_sel',SelectFromModel(f_sel)), ('clf',KNeighborsClassifier())]),
        'logistic':LogisticRegression(),
        'lda':LinearDiscriminantAnalysis(),
        'svm':Pipeline([('f_sel',SelectFromModel(f_sel)), ('clf',SVC())]),
        'tree':DecisionTreeClassifier(),
        'randomforest':RandomForestClassifier(),
        'extratrees':ExtraTreesClassifier(),
        'gradboost':GradientBoostingClassifier(),
        'adaboost':AdaBoostClassifier(),
        'mlp':MLPClassifier(),
        'ecoc':OutputCodeClassifier(SVC(C=2,kernel='linear',shrinking=True,probability=True,class_weight='balanced'), code_size=2)}
    # create ensemble of the classifiers
    clfs = []
    [clfs.append((name,classifiers.get(name))) for name in classifier_names]
    # create the voting classifier
    voting_type = classification_method[0:4]
    eclf = VotingClassifier(estimators=clfs, voting=voting_type)
    # specify parameters of the classifiers
    # (numeric comments record previously observed accuracies / best settings)
    param_set = {}
    if 'knn' in classifier_names:
        #89.9,90.8 'n_neighbors':17, 'p':1, 'weights':'distance'
        param_set.update({'knn__clf__n_neighbors':[17], 'knn__clf__p':[1], 'knn__clf__weights':['distance'], 'knn__clf__algorithm':['auto'], 'knn__clf__n_jobs':[3]})
    if 'logistic' in classifier_names:
        #94.4 'C':1, 'solver':'newton-cg'
        param_set.update({'logistic__C':[2], 'logistic__solver':['lbfgs'], 'logistic__class_weight':['balanced'], 'logistic__max_iter':[100]})
    if 'lda' in classifier_names:
        #94.9 'solver':'lsqr'
        param_set.update({'lda__solver':['lsqr'], 'lda__shrinkage':['auto']})
    if 'svm' in classifier_names:
        #95.3 'C':1, 'kernel':'linear'
        param_set.update({'svm__clf__C':[2], 'svm__clf__kernel':['linear'], 'svm__clf__shrinking':[True], 'svm__clf__probability':[True], 'svm__clf__class_weight':['balanced'], 'svm__clf__decision_function_shape':['ovo']})
    if 'tree' in classifier_names:
        #82.3 'max_depth':15
        param_set.update({'tree__max_depth':[10,15,20], 'tree__class_weight':['balanced'], 'tree__presort':[True]})
    if 'randomforest' in classifier_names:
        #91.8 'n_estimators':300, 'min_samples_leaf':None, 'max_depth':25
        param_set.update({'randomforest__n_estimators':[100], 'randomforest__max_features':[10,25,50], 'randomforest__min_samples_leaf':[50] ,'randomforest__max_depth':[None], 'randomforest__bootstrap':[True], 'randomforest__class_weight':['balanced'], 'randomforest__oob_score':[True], 'randomforest__n_jobs':[3]})
    if 'extratrees' in classifier_names:
        #92.8 'n_estimators':500, 'max_depth':50
        param_set.update({'extratrees__n_estimators':[300], 'extratrees__max_features':['auto'], 'extratrees__min_samples_leaf':[50], 'extratrees__max_depth':[None], 'extratrees__bootstrap':[False], 'extratrees__class_weight':['balanced'], 'extratrees__oob_score':[False], 'extratrees__n_jobs':[3]})
    if 'gradboost' in classifier_names:
        #92.3 'n_estimators':100, 'learning_rate':0.1, 'min_samples_leaf':50
        param_set.update({'gradboost__n_estimators':[100], 'gradboost__max_features':['auto'], 'gradboost__learning_rate':[0.1], 'gradboost__min_samples_leaf':[50]})
    if 'adaboost' in classifier_names:
        param_set.update({'adaboost__n_estimators':[100], 'adaboost__learning_rate':[0.1]})
    if 'mlp' in classifier_names:
        # 95.0 'hidden_layer_sizes':(50,), 'alpha':10, 'solver':'lbfgs'
        param_set.update({'mlp__hidden_layer_sizes':[(50,)], 'mlp__alpha':[10], 'mlp__solver':['lbfgs']})
    # run grid search or randomized search
    if tuning_method=='grid':
        search = GridSearchCV(eclf, param_grid=param_set, cv=2, n_jobs=3)
    elif tuning_method=='rand':
        search = RandomizedSearchCV(eclf, param_distributions=param_set, n_iter=10, cv=2, n_jobs=3)
    return search
def aa_tfidf_MaxEnt_OutputCode(data):
    """Run the tf-idf -> logistic-regression ECOC experiment."""
    job = Job('aa_tfidf_MaxEnt_OutputCode', cv=cv_n_fold)
    vectorizer = TfidfVectorizer(stop_words='english', max_features=2000,
                                 min_df=5)
    model = OutputCodeClassifier(LogisticRegression(), code_size=10)
    pipeline = Pipeline(steps=[("tfidf", vectorizer), ('m', model)])
    parameters = dict(tfidf__norm=['l2'],
                      tfidf__ngram_range=[(1, 2)],
                      m__estimator__C=[0.01])
    job.run(pipeline, parameters, data)
    return None
def test_ecoc_float_y():
    """OutputCodeClassifier must reject continuous targets and bad code_size."""
    X = iris.data
    y = iris.data[:, 0]

    # Continuous (float) targets are not valid classification labels.
    clf = OutputCodeClassifier(LinearSVC())
    with pytest.raises(ValueError, match="Unknown label type"):
        clf.fit(X, y)

    # A non-positive code_size must be rejected before fitting.
    clf = OutputCodeClassifier(LinearSVC(), code_size=-1)
    with pytest.raises(ValueError,
                       match="code_size should be greater than 0, got -1"):
        clf.fit(X, y)
def ab_tfidf_elasticnet_OutputCode(data):
    """Run the tf-idf -> elastic-net SGD ECOC experiment."""
    job = Job('ab_tfidf_elasticnet_OutputCode', cv=cv_n_fold)
    sgd = SGDClassifier(penalty="elasticnet")
    pipeline = Pipeline(steps=[
        ("tfidf", TfidfVectorizer(stop_words='english', min_df=5)),
        ('elnet', OutputCodeClassifier(sgd, code_size=100))])
    parameters = dict(
        tfidf__norm=['l2'],
        tfidf__ngram_range=[(1, 2)],              # [(1, 3)]
        elnet__estimator__alpha=[0.0001],         # [1e-5, 1e-4, 1e-3, 1e-2, 1e-1]
        elnet__estimator__l1_ratio=[0.1])         # [0.1, 0.5, 0.8, 0.9, 0.99]
    job.run(pipeline, parameters, data)
    return None
def scikit_outputcode(X, y, X_test, y_test=None): from sklearn.multiclass import OutputCodeClassifier from sklearn.svm import LinearSVC predictions = OutputCodeClassifier(LinearSVC(random_state=0), code_size=2, random_state=0).fit(X, y).predict(X_test) correctcount = 0 totalcount = 0 for index, each in enumerate(predictions): if y_test[index] == each: correctcount += 1 totalcount += 1 print str(correctcount) + " / " + str(totalcount) + " = " + str( float(correctcount) / totalcount)
def clasificar_ECOC(X, y, df, trainInputs, trainOutputs, testInputs, testOutputs, graphname):
    """Fit a decision-tree-based ECOC classifier and report train/test CCR.

    X, y and df are kept for signature compatibility with sibling
    clasificar_* helpers but are not used here. The dead
    ``kernelRBF = 1.0 * RBF(1.0)`` computation of the original (built and
    immediately discarded) has been removed.

    Returns the test-set accuracy as a fraction in [0, 1].
    """
    print("\n[" + str(graphname) + "]")
    clf = OutputCodeClassifier(estimator=DecisionTreeClassifier())
    clf = clf.fit(trainInputs, trainOutputs)
    precisionTrain = clf.score(trainInputs, trainOutputs)
    precisionTest = clf.score(testInputs, testOutputs)
    print("\tCCR train = %.2f%% | CCR test = %.2f%%" % (precisionTrain * 100, precisionTest * 100))
    prediccion_test = clf.predict(testInputs)
    print(prediccion_test)
    print(testOutputs)
    return precisionTest
def OutputCodeClassifier(data, label, pred_data, pred_last):
    '''
    Error-correcting output-code classification with a LinearSVC base model.

    Recorded results:
    0.76473194506
    Number of mislabeled points out of a total 841 points : 211
    0.749108204518
    Inputs need to be normalized first.

    data, label   -- training features and labels.
    pred_data     -- features to predict.
    pred_last     -- true labels for pred_data (used only for reporting).
    Returns the predictions for pred_data.

    NOTE(review): this function shadows sklearn's OutputCodeClassifier at
    module level; the local import below rebinds the name inside the body,
    so the constructor call resolves to the sklearn class.
    '''
    data = np.array(data)
    pred_data = np.array(pred_data)
    label = np.array(label)
    pred_last = np.array(pred_last)
    from sklearn.multiclass import OutputCodeClassifier
    from sklearn.svm import LinearSVC
    clf = OutputCodeClassifier(LinearSVC(random_state=0), code_size=2, random_state=0)
    clf.fit(data, label)
    # Training-set accuracy.
    print clf.score(data, label)
    pred_result = clf.predict(pred_data)
    print("Number of mislabeled points out of a total %d points : %d" % (pred_data.shape[0], (pred_last != pred_result).sum()))
    # Held-out accuracy.
    print clf.score(pred_data, pred_last)
    return pred_result
def _model1(self, visDataObjects, features, labels):
    """Ted's round one. Find max margin in:

    for t in vis_types:
        for x in columns:
            yield margin(x_axis | t, x)

    Repeat for y. Then we basis so (independently) pick the best axis
    assignment for a chart type.
    """
    # NOTE(review): stub — the ECOC classifier is constructed but never
    # fitted or used, and all parameters are currently ignored.
    from sklearn.multiclass import OutputCodeClassifier
    from sklearn.svm import LinearSVC
    clf = OutputCodeClassifier(LinearSVC(random_state=0), code_size=2, random_state=0)
    pass
def _multiclass_refit(self, clf): """Return advanced choices of the classification method""" if self.args.multiclass == 'one-vs-rest': from sklearn.multiclass import OneVsRestClassifier print('[ML] Using one-vs-rest method to re-train') clf = OneVsRestClassifier(clf) elif self.args.multiclass == 'one-vs-one': from sklearn.multiclass import OneVsOneClassifier self.args.get_prob = False print('[ML] Using one-vs-one method to re-train') print('[ML] WARNING: Set get_prob to False') clf = OneVsOneClassifier(clf) elif self.args.multiclass == 'error-correcting': from sklearn.multiclass import OutputCodeClassifier print('[ML] Using error-correcting method to re-train') clf = OutputCodeClassifier(clf, code_size=2) return clf
def evaluateOutputCode(X, Y, printReport=False): time = datetime.datetime.now() X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42) clf = OutputCodeClassifier(LinearSVC(random_state=0), code_size=2, random_state=0) clf.fit(X_train, Y_train) if printReport: print 'Training time:' + str(datetime.datetime.now() - time) print 'Evaluation result: OneVsOne: ' + str( clf.score(X_test, Y_test)) Y_test = clf.predict(X_test) if printReport: print '0: ' + str((Y_test == 0).sum()) print '1: ' + str((Y_test == 1).sum()) print '2: ' + str((Y_test == 2).sum()) return [clf.score(X_test, Y_test), (Y_test == 1).sum(), clf]
def train_svm(self,params = 10):
    """Randomized-search SVM tuning, then retrain the best model on the full
    training data.

    Uses an OutputCodeClassifier-wrapped SVC when there are more than two
    classes in self.labels (parameter names prefixed ``estimator__``),
    otherwise a plain SVC. The search report is stored in self.outstring and
    the fitted model in self.clf.

    params -- number of parameter settings sampled by RandomizedSearchCV.
    """
    #obtain the best parameter settings for an svm outputcode classifier
    if len(self.labels) > 2:
        print("outputcodeclassifier")
        param_grid = {'estimator__C': [0.001, 0.005, 0.01, 0.5, 1, 5, 10, 50, 100, 500, 1000],
                      'estimator__kernel': ['linear','rbf','poly'],
                      'estimator__gamma': [0.0005, 0.002, 0.008, 0.032, 0.128, 0.512, 1.024, 2.048],
                      'estimator__degree': [1,2,3,4]}
        model = OutputCodeClassifier(svm.SVC(probability=True))
    else:
        print("svc model")
        param_grid = {'C': [0.001, 0.005, 0.01, 0.5, 1, 5, 10, 50, 100, 500, 1000],
                      'kernel': ['linear','rbf','poly'],
                      'gamma': [0.0005, 0.002, 0.008, 0.032, 0.128, 0.512, 1.024, 2.048],
                      'degree': [1,2,3,4]}
        model = svm.SVC(probability=True)
    paramsearch = RandomizedSearchCV(model, param_grid, cv=5, verbose=2,n_iter = params,n_jobs=self.jobs)
    print("Grid search...")
    paramsearch.fit(self.training_csr,numpy.asarray(self.trainlabels))
    print("Prediction...")
    #print the best parameters to the file
    parameters = paramsearch.best_params_
    self.outstring = "best parameter settings:\n"
    for parameter in parameters.keys():
        self.outstring += (parameter + ": " + str(parameters[parameter]) + "\n")
    self.outstring += ("best score: " + str(paramsearch.best_score_) + "\n\n")
    #train an svm outputcode classifier using the best parameters
    if len(self.labels) > 2:
        clf = svm.SVC(probability=True, C=parameters['estimator__C'], kernel=parameters['estimator__kernel'],gamma=parameters['estimator__gamma'], degree=parameters['estimator__degree'])
        self.clf = OutputCodeClassifier(clf,n_jobs=self.jobs)
        self.clf.fit(self.training_csr,self.trainlabels)
    else:
        self.clf = svm.SVC(probability=True, C=parameters['C'], kernel=parameters['kernel'],gamma=parameters['gamma'], degree=parameters['degree'])
        self.clf.fit(self.training_csr,self.trainlabels)
# Train/evaluate an ECOC LinearSVC on features loaded from an svmlight file.
from sklearn.multiclass import OutputCodeClassifier
from sklearn.svm import LinearSVC
from sklearn.datasets import load_svmlight_file
import numpy as np
import sklearn

# Fraction of the data held out for testing.
TEST_SPLIT = .2

X, Y = load_svmlight_file("ablated_features.txt")
num_instances = len(Y)
# NOTE(review): despite its name, num_test is the number of TRAINING rows
# (80% of the data); the remaining 20% form the test set below.
num_test = int((1 - TEST_SPLIT) * num_instances)
# Shuffle rows before splitting.
indices = np.arange(num_instances)
np.random.shuffle(indices)
X = X[indices]
Y = Y[indices]
X_train = X[:num_test]
Y_train = Y[:num_test]
X_test = X[num_test:]
Y_test = Y[num_test:]
# print X_train.shape[0], X_test.shape[0]
clf = OutputCodeClassifier(LinearSVC(random_state=0), code_size=20, random_state=0)
preds = clf.fit(X_train, Y_train).predict(X_test)
print sklearn.metrics.accuracy_score(Y_test, preds)
# predict predictions = classifier.predict(valid_X) accuracy_score(valid_label, predictions) from sklearn.metrics import accuracy_score accuracy_score(y_test, predictions) ## from sklearn.multiclass import OneVsRestClassifier from sklearn.ensemble import (RandomTreesEmbedding, RandomForestClassifier, GradientBoostingClassifier) from sklearn.multiclass import OutputCodeClassifier classifier = OutputCodeClassifier(GradientBoostingClassifier(max_depth=5, n_estimators=14), code_size=2, random_state=0) classifier.fit(X_train, y_train) predictions = classifier.predict(X_test) accuracy_score(y_test, predictions) # creating a confusion matrix cm = confusion_matrix(y_test, dtree_predictions) ### test data test['age_bin'] = test['age'].apply(lambda x: age_bin(x)) test = test[~test['image_name'].isin(wrong_im_test)] encode_columns_test = test[['age_bin', 'gender', 'view_position']]
print('NB for BOW KF1',NB_BOW_KF1)
print('NB for TF-IDF KF1',NB_TFIDF_KF1)
# Fold-2 Naive Bayes accuracy on the BOW / TF-IDF representations.
# NOTE(review): predictions are made on X4 but the denominator is
# X2.shape[0] — confirm X2 and X4 have the same number of rows.
NB_BOW_KF2=np.sum(clf.predict(X4)==Y4.values.tolist())/X2.shape[0];
NB_TFIDF_KF2=np.sum(clf2.predict(X4_tf)==Y4.values.tolist())/X2.shape[0];
print('NB for BOW KF2',NB_BOW_KF2)
print('NB for TF-IDF KF2',NB_TFIDF_KF2)

# In[66]:

#SVM
from sklearn.multiclass import OutputCodeClassifier
from sklearn.svm import LinearSVC
clf = OutputCodeClassifier(LinearSVC(random_state=0),code_size=2, random_state=0)
# Fold-1 SVM accuracy on bag-of-words features.
SVM_BOW_KF1=np.sum(clf.fit(X_train, y_train).predict(X2)==Y2.values.tolist())/X2.shape[0]
# save the model to disk
filename = 'finalized_model3.sav'
pickle.dump(clf, open(filename, 'wb'))
clf1 = pickle.load(open(filename, 'rb'))
# Fold-1 SVM accuracy on TF-IDF features (same clf object, refitted in place).
SVM_TFIDF_KF1=np.sum(clf.fit(tfidf_train, y_train).predict(X2_tf)==Y2.values.tolist())/X2.shape[0]
# save the model to disk
# NOTE(review): this rebinds clf2 (used above as the NB TF-IDF model) to the
# pickled SVM — confirm that is intended.
filename = 'finalized_model4.sav'
pickle.dump(clf, open(filename, 'wb'))
clf2 = pickle.load(open(filename, 'rb'))
#check details print(f'The size of the data is {breast.data.shape}') print(f'There are {breast.target_names} classifiers') # split the dataset into training and testing x_train, x_test, y_train, y_test = train_test_split(breast.data, breast.target, test_size=0.2) # creating a classification clf_1 = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=42) clf_2 = OutputCodeClassifier(LinearSVC(random_state=0), code_size=2, random_state=42) # train the classifier with training data clf_1.fit(x_train, y_train) clf_2.fit(x_train, y_train) # find y_pred prediction best on x_test data y_pred_1 = clf_1.predict(x_test) y_pred_2 = clf_2.predict(x_test) # calculate accuracy of y_pred using y_test print(f'accuracy {accuracy_score(y_test, y_pred_1)}') print(f'accuracy {accuracy_score(y_test, y_pred_2)}') # use classification_report function to print more information
train_ingredients.append(' '.join(ings)) #construct test_ingredients for entry in test_set: ings = [WordNetLemmatizer().lemmatize(re.sub('[^A-Za-z]', ' ', w)) for w in entry['ingredients']] test_ingredients.append(' '.join(ings)) #used to encode labels as numbers for use with RandomForestClassifier le = LabelEncoder() #encode cuisines as numbers train_cuisines = le.fit_transform(train_cuisines) #used to create bag of ingredients vocabulary and create features for each entry vectorizer = CountVectorizer() train_features = vectorizer.fit_transform(train_ingredients).toarray() test_features = vectorizer.transform(test_ingredients).toarray() clf = OutputCodeClassifier(LinearSVC(random_state=0), code_size=2, random_state=2) result = clf.fit(train_features, train_cuisines).predict(test_features) output = pd.DataFrame(data={'id':test_ids, 'cuisine':le.inverse_transform(result)}) #force explicit ordering of columns output = output[['id', 'cuisine']] output.to_csv('ecoc.csv', index=False)
def main():
    """10-fold ECOC (SGD base) genre classification over combined MFCC+HCDF
    features; writes per-fold zero-one losses and the ECOC codebook to CSV.

    NOTE(review): indentation of the tail (codebook binarisation and the
    losses writerow) was reconstructed as being inside the `with` block,
    since `wrtest` writes would fail on a closed file otherwise — confirm.
    """
    filenameLB = 'mfcc_lb.csv'
    # Pickled per-song feature vectors.
    allsongcat = pickle.load(open('mfcc_fv.p', 'rb'))
    hcdf = pickle.load(open('hcdf_fv.p', 'rb'))
    # The last row read wins; labels is the final row of the CSV.
    with open('mfcc_lb.csv') as f:
        reader = csv.reader(f)
        for row in reader:
            labels = row
    # select training and test sets
    '''
    TEidx = np.array(random.sample(range(0,1000), 100))
    training = []
    test = []
    trainingLB = []
    testLB = []
    # make numpy arrays
    for i in range(1000):
        if i in TEidx:
            test.append(featureDict[i])
            testLB.append(int(labels[i]))
        else:
            training.append(featureDict[i])
            trainingLB.append(int(labels[i]))
    # fit with classifier and predict
    X = np.array(training)
    Y = np.array(trainingLB)
    '''
    l = [allsongcat, hcdf]
    all_feats = combineFeatures(l)
    # Shuffle features and labels together via a shared index permutation.
    feats_shuf = []
    labels_shuf = []
    index_shuf = range(len(labels))
    shuffle(index_shuf)
    for i in index_shuf:
        feats_shuf.append(all_feats[i])
        labels_shuf.append(labels[i])
    X = np.array(feats_shuf)
    Y = np.array(labels_shuf)
    kf = KFold(1000, n_folds=10)
    #rf = RandomForestClassifier(n_estimators=50, max_features = 'log2')
    sgd = SGDClassifier(loss="hinge", penalty="l2")
    #svc = svm.SVC(kernel='linear')
    # NOTE(review): dtree and lsvc are built but unused in this version.
    dtree = DecisionTreeClassifier(max_depth=3)
    lsvc = LinearSVC(random_state=0)
    cla = OutputCodeClassifier(sgd, code_size=128, random_state=0)
    cm_all = np.zeros((10, 10), dtype=np.int)
    cb = np.zeros((10, 20))
    losses = []
    with open('ECOC_sgd_error.csv', 'w') as f1:
        wrtest = csv.writer(f1, quoting=csv.QUOTE_NONNUMERIC, lineterminator='\n')
        scores = 0.0
        for train, test in kf:
            X_train, X_test, y_train, y_test = X[train], X[test], Y[train], Y[
                test]
            cla.fit(X_train, y_train)
            predictions = cla.predict(X_test)
            loss = zero_one_loss(predictions, y_test)
            losses.append(loss)
            scores += loss
            # print y_test
            # print predictions
            # Overwrite cb with the fitted fold's code book and persist it.
            cb = cla.code_book_
            np.savetxt('codebook.csv', cb, delimiter=',')
            # Compute confusion matrix
            cm = confusion_matrix(
                y_test,
                predictions,
                labels=['1', '2', '3', '4', '5', '6', '7', '8', '9', '10'])
            np.set_printoptions(precision=2)
            #print(cm_all)
            cm_all = np.add(cm_all, cm)
        # make ECOC coding matrix 0-1 binary
        cb[cb <= 0] = 0
        wrtest.writerow(losses)
        print cb
        print scores / 10
def oc_classify(X, Y):
    """Fit an ECOC LinearSVC on (X, Y) and return the fitted classifier.

    code_size is set to the number of distinct classes in Y. The original
    used ``np.count_nonzero(sp.unique(Y))``, which silently excluded class 0
    from the count (and relied on the long-removed ``scipy.unique`` alias of
    ``numpy.unique``).
    """
    size = np.unique(Y).size
    clf = OutputCodeClassifier(LinearSVC(), code_size=size)
    clf.fit(X, Y)
    return clf
# Bag-of-words + tf-idf features, ECOC LinearSVC prediction, CSV submission.
print "Running Feature Extraction.."
vectorizer = CountVectorizer() #initialise Bag of words
train_count = vectorizer.fit_transform(train.Phrase)
print "Bag of words Counts: ", train_count.shape

#Tf-Idf Transformer
print "Running Tf-Idf Transformer"
tf_idf = TfidfTransformer() #initialise Tf-Idf Transformer
train_tf_idf = tf_idf.fit_transform(train_count)
print "Tf-Idf : ", train_tf_idf.shape

#Process the test set (transform only — vocabularies come from training)
print "Processing Test set.. \n"
test_count = vectorizer.transform(test.Phrase)
test_tf_idf = tf_idf.transform(test_count)

print "Training the Model and predicting on the Test data.."
predicted1 = OutputCodeClassifier(LinearSVC(random_state=0), code_size=2, random_state=0).fit(
    train_tf_idf, train.Sentiment).predict(test_tf_idf)

print "Writing the output in a csv file..."
output = pd.DataFrame(data={
    "PhraseId": test.PhraseId,
    "Sentiment": predicted1
})
output.to_csv("Sentiment Analysis on Movie Reviews -- OutputCode", index=False, quoting=3)
def single_classifier(clf_name):
    """Build a feature-selection + classifier pipeline for `clf_name` and wrap
    it in a grid or randomized hyper-parameter search.

    Reads module-level globals: f_sel_method (feature-selection strategy) and
    tuning_method ('grid' or 'rand').

    Returns the unfitted GridSearchCV / RandomizedSearchCV object.
    NOTE(review): an unrecognised f_sel_method or tuning_method leaves
    `pipe` / `search` unbound and raises NameError — confirm callers
    validate those globals.
    """
    # create the classifier objects
    classifiers = {
        'knn':KNeighborsClassifier(),
        'logistic':LogisticRegression(),
        'lda':LinearDiscriminantAnalysis(),
        'svm':SVC(),
        'tree':DecisionTreeClassifier(),
        'randomforest':RandomForestClassifier(),
        'extratrees':ExtraTreesClassifier(),
        'gradboost':GradientBoostingClassifier(),
        'adaboost':AdaBoostClassifier(),
        'mlp':MLPClassifier(),
        'ecoc':OutputCodeClassifier(SVC(C=2,kernel='linear',shrinking=True,class_weight='balanced'), code_size=2)}
    # feature selection using a pipeline
    if f_sel_method=='none':
        pipe = Pipeline([('clf',classifiers[clf_name])])
        param_set = {}
    elif f_sel_method=='anova':
        pipe = Pipeline([('f_sel',SelectPercentile(score_func=f_classif)), ('clf',classifiers[clf_name])])
        param_set = {'f_sel__percentile':[25,50,75,100]}
    elif f_sel_method=='mutualinfo':
        pipe = Pipeline([('f_sel',SelectPercentile(score_func=mutual_info_classif)), ('clf',classifiers[clf_name])])
        param_set = {'f_sel__percentile':[25,50,75,100]}
    elif f_sel_method=='recursivesvm':
        f_sel = SVC(C=1, kernel='linear', shrinking=True, class_weight='balanced')
        pipe = Pipeline([('f_sel',RFECV(estimator=f_sel)), ('clf',classifiers[clf_name])])
        param_set = {'f_sel__step':[10], 'f_sel__cv':[2], 'f_sel__scoring':['accuracy']}
    elif f_sel_method=='frommodelsvm':
        f_sel = SVC(C=1, kernel='linear', shrinking=True, class_weight='balanced')
        pipe = Pipeline([('f_sel',SelectFromModel(f_sel)), ('clf',classifiers[clf_name])])
        param_set = {}
    elif f_sel_method=='frommodeltree':
        f_sel = ExtraTreesClassifier(n_estimators=100, class_weight='balanced')
        pipe = Pipeline([('f_sel',SelectFromModel(f_sel)), ('clf',classifiers[clf_name])])
        param_set = {}
    # specify parameters of the classifiers
    # (numeric comments record previously observed accuracies / best settings)
    if clf_name=='knn':
        #89.9,90.8 'n_neighbors':17, 'p':1, 'weights':'distance'
        param_set.update({'clf__n_neighbors':[1,9,13,17,25,50], 'clf__p':[1,2,3,5], 'clf__weights':['distance'], 'clf__algorithm':['auto'], 'clf__n_jobs':[3]})
    elif clf_name=='logistic':
        #94.4 'C':1, 'solver':'newton-cg'
        param_set.update({'clf__C':[1,2,3,4], 'clf__solver':['newton-cg'], 'clf__class_weight':['balanced'], 'clf__max_iter':[100]})
    elif clf_name=='lda':
        #94.9 'solver':'lsqr'
        param_set.update({'clf__solver':['lsqr','eigen'], 'clf__shrinkage':['auto']})
    elif clf_name=='svm':
        #95.3 'C':1, 'kernel':'linear'
        param_set.update({'clf__C':[0.75,1,1.25,1.5,2], 'clf__kernel':['linear'], 'clf__shrinking':[True], 'clf__probability':[False], 'clf__class_weight':['balanced'], 'clf__decision_function_shape':['ovr']})
    elif clf_name=='tree':
        #82.3 'max_depth':15
        param_set.update({'clf__min_samples_leaf':[10,50,75,100], 'clf__class_weight':['balanced'], 'clf__presort':[True]})
    elif clf_name=='randomforest':
        #91.8 'n_estimators':300, 'min_samples_leaf':None, 'max_depth':25
        param_set.update({'clf__n_estimators':[500,1000], 'clf__max_features':[5,10,25], 'clf__min_samples_leaf':[1,10,25] ,'clf__max_depth':[None], 'clf__bootstrap':[True], 'clf__class_weight':['balanced'], 'clf__oob_score':[False], 'clf__n_jobs':[3]})
    elif clf_name=='extratrees':
        #92.8 'n_estimators':500, 'max_depth':50
        param_set.update({'clf__n_estimators':[100,500,1000], 'clf__max_features':[5,10,20,25,50,100,150], 'clf__min_samples_leaf':[1,10,25,50,100], 'clf__max_depth':[None], 'clf__bootstrap':[False], 'clf__class_weight':['balanced'], 'clf__oob_score':[False], 'clf__n_jobs':[3]})
    elif clf_name=='gradboost':
        #92.3 'n_estimators':100, 'learning_rate':0.1, 'min_samples_leaf':50
        param_set.update({'clf__n_estimators':[100], 'clf__max_features':['auto'], 'clf__learning_rate':[0.1], 'clf__min_samples_leaf':[50]})
    elif clf_name=='adaboost':
        #57.9 'n_estimators':100, 'learning_rate':0.1
        param_set.update({'clf__n_estimators':[100,500], 'clf__learning_rate':[0.01,0.1]})
    elif clf_name=='mlp':
        #95.0 'hidden_layer_sizes':(50,), 'alpha':10, 'solver':'lbfgs'
        param_set.update({'clf__hidden_layer_sizes':[(50,),(60,),(100,)], 'clf__alpha':[0.5,1,2,5,7], 'clf__solver':['adam']})
    elif clf_name=='ecoc':
        param_set.update({})
    # run grid search or randomized search
    if tuning_method=='grid':
        search = GridSearchCV(pipe, param_grid=param_set, cv=2, n_jobs=3)
    elif tuning_method=='rand':
        search = RandomizedSearchCV(pipe, param_distributions=param_set, n_iter=10, cv=2, n_jobs=3)
    return search
# NOTE(review): this first line is the continuation/closing of a print(...)
# call whose opening lies outside this chunk.
knn.fit(train_ft, train_label).score(test_ft, test_label))
print('LogisticRegression score: %f' % logistic.fit(train_ft, train_label).score(test_ft, test_label))

# SVM
# Manual grid search over C and gamma for an ECOC-wrapped RBF SVC, tracking
# the best accuracy seen so far.
list_of_acc = list()
accur = 0
# for c in np.logspace(-2, 10, 5):
c = 1000
# for c in np.logspace(-2, 10, 5):
# for c in np.logspace(-2, 10, 5):
for c in [100, 1000, 10000, 100000]:
    for g in np.logspace(-9, 3, 13):
        clf = OutputCodeClassifier(svm.SVC(random_state=0, gamma=g, C=c), code_size=10, random_state=0)
        accur_temp = clf.fit(svmtrain, svmtrainlabel).score(svmtest, svmtestlabel)
        if accur < accur_temp:
            accur = accur_temp
            # NOTE(review): gamma records the best g but is never read later
            # in this chunk; accur is also never reset per c, so it tracks
            # the global best — confirm both are intended.
            gamma = g
    print(c, g, accur)
    list_of_acc.append(accur)
print(np.mean(list_of_acc))
def getFitness(individual, X, y):
    """Feature subset fitness function.

    `individual` is a 0/1 gene mask over X's columns; columns whose gene is 0
    are dropped. Fitness is the mean 5-fold cross-validation accuracy of the
    classifier on the remaining (one-hot-encoded) features, returned as a
    1-tuple as DEAP-style evolutionary APIs require; the all-zero mask scores
    (0,).
    """
    if individual.count(0) == len(individual):
        # Every feature disabled: worst possible fitness.
        return (0,)

    # Indices of dropped features (gene == 0).
    cols = [index for index in range(len(individual)) if individual[index] == 0]

    # Feature subset, one-hot encoded for categorical columns.
    X_parsed = X.drop(X.columns[cols], axis=1)
    X_subset = pd.get_dummies(X_parsed)

    # Classifier under evaluation. The original assigned ~35 classifiers to
    # `clf` in sequence (so only the last binding could matter), and several
    # of those meta-estimators (ClassifierChain, MultiOutputClassifier,
    # OneVsOneClassifier, OneVsRestClassifier, OutputCodeClassifier,
    # StackingClassifier, VotingClassifier) raise TypeError when constructed
    # without an estimator argument — the chain crashed before any
    # evaluation. Keep one working classifier; swap here to experiment.
    clf = SVC()

    clf.fit(X_subset, y)
    # score = cross_val_score(clf, X, y, cv=5)
    return (avg(cross_val_score(clf, X_subset, y, cv=5)),)
# -*- coding: utf-8 -*- """ Created on Fri May 24 20:38:46 2019 @author: pathouli """ import pandas as pd from sklearn.multiclass import OutputCodeClassifier from sklearn.svm import LinearSVC the_path = 'C:/Users/pathouli/myStuff/academia/torhea/projects/groupC/' allstate_data = pd.read_csv(the_path + 'train.csv', sep=",") clf = OutputCodeClassifier(LinearSVC(random_state=0), code_size=2, random_state=0) label_cols = ['A', 'B', 'C', 'D', 'E', 'F', 'G'] X_cols = allstate_data.columns.difference(label_cols) X = allstate_data[X_cols][1:10000] y = allstate_data[label_cols][1:10000] #small sample to test clf.fit(X, y).predict(X) # https://www.kaggle.com/c/allstate-purchase-prediction-challenge/data
X_train = training2 y_train = labels[100:172,i] X_test = sample2 y_test = labels[272:,i] else: X_train = training y_train = labels[:172,i] X_test = sampletest y_test = labels[172:,i] box = np.zeros([6,6]) accuracy = np.zeros(100) for m in range(0,100): posterior = np.empty([100,72,6]) gbc = GradientBoostingClassifier(n_estimators=60, max_depth=3) occ = OutputCodeClassifier(gbc) y_pred = occ.fit(X_train, y_train).predict(X_test) n=0 for i in range(0,len(y_pred)): if y_pred[i] == y_test[i]: #print i, y_pred[i], y_test[i] n = n+1 accuracy[m] = accuracy[m]+1 box[y_test[i]-1,y_pred[i]-1] = box[y_test[i]-1,y_pred[i]-1] + 1 #posterior[m] = knc.predict_proba(X_test) print np.mean(accuracy)/0.72, np.std(accuracy)/0.72 #print sum(accuracy[0:8])/8.0, sum(accuracy[8:18])/10.0, sum(accuracy[18:30])/12.0, sum(accuracy[56:72])/16.0, sum(accuracy[30:43])/13.0, sum(accuracy[43:56])/13.0, sum(accuracy)/72.0 ''' means = np.empty([72,6]) stds = np.empty([72,6])
def __init__(self, X, y, people, df_features, feature_names, conf_dict):
    """Store the dataset and configuration; build classifier and feature-selector registries.

    X, y -- feature matrix and labels; y is kept both as an ndarray (self.y)
        and as a re-indexed pandas object (self.y_df).
    people -- per-sample metadata, re-indexed to align with y_df.
    df_features -- DataFrame view of the features.
    feature_names -- column names of X.
    conf_dict -- experiment configuration; must provide "app_list",
        "classifier", "feature_selection", "num_features" and "one_vs_all_type".
    """
    self.X = X
    self.y = np.array(y)
    self.feature_names = feature_names
    self.people = people.reset_index(drop=True)
    self.X_df = df_features
    self.y_df = y.reset_index(drop=True)
    # BUG FIX: the next four reads previously went through a name
    # `config_dict` (a leftover from a rename -- the commented-out lines in
    # the original show the switch to `conf_dict`), which is a NameError
    # unless a same-named global happens to exist.  All configuration is now
    # read from the `conf_dict` parameter.
    self.app_list = conf_dict["app_list"]
    # Map each app/class name to a stable numeric label.
    self.labels_numeric = {name: i for i, name in enumerate(self.app_list)}
    self.n_classes = len(self.labels_numeric)
    self.clf_name = conf_dict["classifier"]
    self.feature_selection = conf_dict["feature_selection"]  # True/False
    self.num_features = conf_dict["num_features"]
    self.one_vs_all_type = conf_dict["one_vs_all_type"]
    self.chosen_feature_names = None
    self.chosen_features_all_folds = []

    # Registry of candidate classifiers, keyed by short name; self.clf_name
    # selects among them elsewhere.
    self.clf_dict = {}
    self.clf_dict["output_code"] = OutputCodeClassifier(
        SVC(kernel='rbf', C=1000, gamma=0.001), code_size=2, random_state=0)
    params_rf = {
        'n_estimators': 100,
        'max_depth': 20,
        'max_features': 'sqrt',
        'min_samples_leaf': 1,
        'min_samples_split': 10,
        'random_state': 0
    }
    self.clf_dict["rf"] = RandomForestClassifier(**params_rf)
    params_svm = {
        'C': 10,
        'degree': 2,
        'gamma': 'scale',
        'kernel': 'sigmoid'
    }
    self.clf_dict["svm"] = SVC(**params_svm)
    # MultinomialNB assumes multinomially distributed (count-like) features;
    # GaussianNB assumes continuous features.
    self.clf_dict["nb"] = MultinomialNB(alpha=0.00001)
    self.clf_dict["gnb"] = GaussianNB(var_smoothing=0.05)
    self.clf_dict["knn"] = KNeighborsClassifier(n_neighbors=8)
    params_dt = {
        'criterion': 'gini',
        'max_depth': 20,
        'max_features': 'auto',
        'min_samples_leaf': 2,
        'min_samples_split': 2,
        'random_state': 42,
        'splitter': 'best'
    }
    self.clf_dict["dt"] = DecisionTreeClassifier(**params_dt)
    # One-vs-rest wrapper around whichever base classifier the config names.
    self.clf_dict["one_vs_all"] = OneVsRestClassifier(
        self.clf_dict[conf_dict["one_vs_all_type"]])

    # Feature-selection strategies, both keeping the top num_features columns.
    self.fs_dict = {}
    self.fs_dict["selectKbest_chi2"] = SelectKBest(chi2, k=self.num_features)
    self.fs_dict["selectKbest_fclassif"] = SelectKBest(f_classif, k=self.num_features)
# Python 2 fragment: build Haar-feature tables for train and test images,
# then fit an ECOC classifier over AdaBoosted decision stumps.
# Each row = concatenated Haar features over the predefined rectangles.
for ind, im in enumerate(images):
    row = []
    for (top_left, bottom_right) in rectangles:
        row += get_haar_features(im, top_left, bottom_right)
    train_ecoc_table[ind] = row

# 200 columns: presumably 100 rectangles x 2 features each -- TODO confirm
# against the rectangle-generation code.
test_ecoc_table = np.zeros(shape=(np.shape(test_images)[0], 200))
for ind, im in enumerate(test_images):
    row = []
    for (top_left, bottom_right) in rectangles:
        row += get_haar_features(im, top_left, bottom_right)
    test_ecoc_table[ind] = row

# ECOC over AdaBoost(decision stumps); code_size=5 -> 5 * n_classes binary learners.
clf = OutputCodeClassifier(AdaBoostClassifier(DecisionTreeClassifier(max_depth=1), n_estimators=200), code_size=5, random_state=0)
clf.fit(train_ecoc_table, labels)

# Accuracy = fraction of exact label matches (Python 2 print statements).
train_pred = np.array(clf.predict(train_ecoc_table))
print "Digits Training Accuracy: %f" % (np.sum(train_pred == np.array(labels)).astype(np.float)/np.shape(train_pred)[0])
test_pred = np.array(clf.predict(test_ecoc_table))
print "Digits Testing Accuracy: %f" % (np.sum(test_pred == np.array(test_labels)).astype(np.float)/np.shape(test_pred)[0])

# ecoc_table = []
# for im in images:
#
# im_preprocess = np.matrix([[np.sum(im[:i,:j]) for i in range(1, 29)] for j in range(1, 29)])
#
# def get_black_rectangle(top_left, bottom_right):
#     x1, y1 = top_left
def test_ecoc_exceptions():
    """predict() on a never-fitted OutputCodeClassifier raises NotFittedError."""
    unfitted = OutputCodeClassifier(LinearSVC(random_state=0))
    with pytest.raises(NotFittedError):
        unfitted.predict([])
def test_ecoc_delegate_sparse_base_estimator():
    # Non-regression test for
    # https://github.com/scikit-learn/scikit-learn/issues/17218
    X, y = iris.data, iris.target
    X_sparse = sp.csc_matrix(X)

    # A mock estimator configured to reject sparse input outright.
    dense_only = CheckingClassifier(
        check_X=check_array,
        check_X_params={
            "ensure_2d": True,
            "accept_sparse": False
        },
    )

    ecoc = OutputCodeClassifier(dense_only, random_state=0)

    # Sparse input must reach the base estimator untouched (and be rejected
    # there) rather than being densified by the meta-estimator.
    with pytest.raises(TypeError, match="A sparse matrix was passed"):
        ecoc.fit(X_sparse, y)

    ecoc.fit(X, y)
    with pytest.raises(TypeError, match="A sparse matrix was passed"):
        ecoc.predict(X_sparse)

    # Smoke test: a sparse-capable base estimator works end to end.
    ecoc = OutputCodeClassifier(LinearSVC(random_state=0))
    ecoc.fit(X_sparse, y).predict(X_sparse)
    assert len(ecoc.estimators_) == 4
                            # NOTE(review): fragment begins mid-call -- the
                            # opening of this constructor and the preceding
                            # if/elif branches are outside this view.
                            C=penalty, random_state=109))
elif class_type == 'ovo':
    clf = SVC(
        kernel=kernel, gamma=1, coef0=coef, degree=degree, max_iter=max_iter,
        C=penalty, random_state=109)
    # SVC is ovo by default, contrary to documentation
elif class_type == 'ecoc':
    # Error-correcting output codes wrapped around the same SVC settings.
    clf = OutputCodeClassifier(SVC(kernel=kernel, gamma=1, coef0=coef,
                                   degree=degree, max_iter=max_iter,
                                   C=penalty, random_state=109),
                               random_state=109)

# remove convergence warning printouts from SVM training
if silence == 1:
    warnings.filterwarnings("ignore")

# Train the model using the training sets
if silence == 0:
    print("Training SVM using %s classification" % class_type)
clf.fit(X_train, y_train)

#########################################
# Generate the configuration matrix for the SVM
#########################################
# this will be used for manual classification
        # NOTE(review): fragment begins inside a list comprehension; the
        # assignment it belongs to (presumably `ings = [`) is outside this view.
        # Strips non-letters, then lemmatizes each ingredient token.
        WordNetLemmatizer().lemmatize(re.sub('[^A-Za-z]', ' ', w))
        for w in entry['ingredients']
    ]
    test_ingredients.append(' '.join(ings))

#used to encode labels as numbers for use with RandomForestClassifier
le = LabelEncoder()

#encode cuisines as numbers
train_cuisines = le.fit_transform(train_cuisines)

#used to create bag of ingredients vocabulary and create features for each entry
vectorizer = CountVectorizer()
train_features = vectorizer.fit_transform(train_ingredients).toarray()
# Test set is transformed with the training vocabulary only (no refit).
test_features = vectorizer.transform(test_ingredients).toarray()

# ECOC meta-classifier over LinearSVC; code_size=2 -> 2 * n_classes binary problems.
clf = OutputCodeClassifier(LinearSVC(random_state=0), code_size=2, random_state=2)
result = clf.fit(train_features, train_cuisines).predict(test_features)

# Decode numeric predictions back to cuisine names for the submission file.
output = pd.DataFrame(data={
    'id': test_ids,
    'cuisine': le.inverse_transform(result)
})

#force explicit ordering of columns
output = output[['id', 'cuisine']]
output.to_csv('ecoc.csv', index=False)
class Classifier():
    """Bag-of-ngram text classifier with pluggable scaling and model back ends.

    Training/test instances are dicts with (at least) keys "label", "ngrams",
    "features", "sparse" and (per test set) "out"/"meta".  Workflow: build the
    feature index, vectorize, then train one of svm / nb / dt and predict.
    NOTE(review): relies on module-level names (gen_functions, lineconverter,
    weight_features, svm, naive_bayes, tree, cross_validation, csr_matrix,
    RandomizedSearchCV, OutputCodeClassifier, numpy, math, re, pickle,
    multiprocessing, defaultdict, deepcopy) imported elsewhere in the file.
    """

    def __init__(self,trainlist,testlist,scaling = "binary",jobs=16,directory=False, features = False, feature_info = False):
        # scaling: "binary" | "log" | "tfidf" -- how sparse counts become vector values.
        # jobs: parallelism for feature counting and grid search.
        self.training = trainlist
        self.test = testlist #self.test should be a list with multiple lists for each testset
        self.scaling = scaling
        self.jobs = jobs
        self.directory = directory
        self.feature_status = {}   # feature -> bool: keep (True) or prune (False)
        self.outstring = False     # filled by train_svm with best-parameter report
        self.features = features
        self.feature_info = feature_info  # feature -> column index

    def count_feature_frequency(self):
        """Count ngram frequencies over the training set using worker processes."""
        def ff(instances,queue):
            # Worker: count ngrams in one chunk, push the partial dict back.
            feature_frequency = defaultdict(int)
            for i,instance in enumerate(instances):
                for feature in instance["ngrams"]:
                    feature_frequency[feature] += 1
            queue.put(feature_frequency)

        print(len(self.training))
        q = multiprocessing.Queue()
        chunks = gen_functions.make_chunks(self.training,self.jobs)
        for chunk in chunks:
            p = multiprocessing.Process(target=ff,args=[chunk,q])
            p.start()
        # Drain exactly one result per chunk (blocks until all workers report).
        ds = []
        while True:
            l = q.get()
            ds.append(l)
            if len(ds) == len(chunks):
                break
        # Merge the partial counts, then order features by descending frequency.
        self.feature_frequency = defaultdict(int)
        for d in ds:
            for k in d:
                self.feature_frequency[k] += d[k]
        self.features = sorted(self.feature_frequency, key=self.feature_frequency.get, reverse=True)

    def make_feature_labellist(self):
        """Map each ngram to the list of (integer) labels it co-occurs with.

        Instances whose label is not parseable as int are skipped silently.
        """
        feature_labellist = defaultdict(list)
        for instance in self.training:
            try:
                label = int(instance["label"])
                for feature in instance["ngrams"]:
                    feature_labellist[feature].append(label)
            except:
                continue
        self.feature_labellist = feature_labellist

    def prune_features(self):
        """Drop from every training instance the ngrams not marked True in feature_status."""
        for instance in self.training:
            new_features = []
            #print feature_status
            for f in instance["ngrams"]:
                try:
                    # KeyError (feature never scored) also drops the feature.
                    if self.feature_status[f]:
                        new_features.append(f)
                except:
                    continue
            instance["ngrams"] = new_features
#            queue.put(instance)

    def convert_features(self,convert_list):
        """Rewrite ngrams in place using the old-name -> new-name mapping convert_list."""
        for instance in self.training:
            new_features = []
            #print feature_status
            #print instance["features"]
            for i,f in enumerate(instance["ngrams"]):
                if f in convert_list.keys():
                    instance["ngrams"][i] = convert_list[f]
            #print instance["features"]

    def filter_stdev(self,threshold,prop):
        """Collapse features matching prefix `prop` whose label spread is small.

        Features with label std-dev > threshold (or <= 2 observations) are
        pruned; the survivors are renamed to "<abs(median label)>_days".
        """
        self.make_feature_labellist()
        feature_convert = {}
        new_features = []
        for feature in self.feature_labellist.keys():
            if re.search(r"^" + prop,feature):
                if gen_functions.return_standard_deviation(self.feature_labellist[feature]) > threshold or len(self.feature_labellist[feature]) <= 2:
                    self.feature_status[feature] = False
                else:
                    new_feature = str(abs(int(numpy.median(self.feature_labellist[feature])))) + "_days"
                    feature_convert[feature] = new_feature
                    new_features.append(new_feature)
                    self.feature_status[new_feature] = True
            else:
                # Features outside the prefix are kept unchanged.
                self.feature_status[feature] = True
                new_features.append(feature)
        self.convert_features(feature_convert)
        self.prune_features()
        self.features = list(set(new_features))

    def prune_features_topfrequency(self,n):
        """Keep only the n most frequent features (self.features is frequency-sorted)."""
        #generate feature_frequency dict
        for f in self.features[:n]:
            self.feature_status[f] = True
        for f in self.features[n:]:
            self.feature_status[f] = False
        self.features = self.features[:n]
        self.prune_features()

    def balance_data(self):
        """Up/down-sample each class to the median class size (3+ classes only).

        NOTE(review): with 2 or fewer classes new_training stays empty and the
        training set is wiped -- presumably never called in that case; verify.
        """
        label_instances = defaultdict(list)
        new_training = []
        for instance in self.training:
            label = instance["label"]
            label_instances[label].append(instance)
        if len(label_instances.keys()) > 2:
            median = int(numpy.median(numpy.array([len(label_instances[x]) for \
                x in label_instances.keys()])))
            for label in label_instances.keys():
                if len(label_instances[label]) == median:
                    new_training.extend(label_instances[label])
                else:
                    instances = lineconverter.Lineconverter(label_instances[label])
                    if len(instances.lines) < median:
                        instances.sample(median-len(instances.lines),sample_type="up")
                    else:
                        instances.sample(len(instances.lines)-median)
                    new_training.extend(instances.lines)
        self.training = new_training

    def index_features(self,ind = 0):
        """Assign column indices (starting at ind) and fill "sparse" count dicts.

        Training instances are rebuilt via sparsify(); test instances get their
        "sparse" dicts filled in place.  Unknown test ngrams are ignored.
        """
        feature_frequency=defaultdict(int)
        self.feature_info={}
        #print self.features
        for i,feature in enumerate(self.features):
            self.feature_info[feature]=i+ind

        def sparsify(instances,writelist):
            # Build column-index -> count dicts; unknown features are skipped.
            for instance in instances:
                sparse_features = defaultdict(int)
                for feature in instance["ngrams"]:
                    try:
                        sparse_features[self.feature_info[feature]] += 1
                    except:
                        continue
                instance["sparse"] = sparse_features
                writelist.append(instance)

        new_instances = []
        sparsify(self.training,new_instances)
        self.training = new_instances
        for tset in self.test:
            for instance in tset["instances"]:
                sparse_features = defaultdict(int)
                for feature in instance["ngrams"]:
                    try:
                        sparse_features[self.feature_info[feature]] += 1
                    except:
                        continue
                instance["sparse"] = sparse_features

    def vectorize(self,instances):
        """Return a dense matrix (list of lists) for the given instances.

        Sparse counts are scaled per self.scaling (binary / log10 / tf-idf),
        then each instance's extra dense "features" are appended at the end.
        """
        zerolist = [float(0)] * len(self.feature_info.keys())
        matrix = []
        for instance in instances:
            featurev = zerolist[:]
            for feature in instance["sparse"].keys():
                if self.scaling == "binary":
                    featurev[feature] = float(1)
                elif self.scaling == "log":
                    featurev[feature] = math.log(instance["sparse"][feature],10)
                elif self.scaling == "tfidf":
                    featurev[feature] = instance["sparse"][feature] * self.idf[feature]
            for feat in instance["features"]:
                featurev.append(feat)
            matrix.append(featurev)
        return matrix

    def model_necessities(self):
        """Build label maps, idf weights (tfidf only) and the CSR training matrix."""
        #generate scipy libsvm input
        self.trainlabels_raw = [x["label"] for x in self.training]
        self.labels = set(self.trainlabels_raw)
        # NOTE(review): label->index mapping depends on set iteration order,
        # so numeric labels are not stable across runs -- confirm acceptable.
        labeldict = dict(zip(self.labels,range(len(self.labels))))
        self.labeldict_back = dict(zip(range(len(self.labels)),self.labels))
        if self.scaling == "tfidf":
            self.idf = weight_features.return_idf(self.training)
        self.trainingvectors = self.vectorize(self.training)
        self.training_csr = csr_matrix(self.trainingvectors)
        self.trainlabels = [labeldict[x["label"]] for x in self.training]

    def predict(self,ts):
        """Classify test instances one by one with the trained self.clf.

        Returns rows of [identifier, "<gold> <predicted>", probabilities-string];
        the identifier is meta[5] when 6 meta fields exist, otherwise the
        instance's non-underscore ngrams joined by spaces.
        """
        testvectors = self.vectorize(ts)
        predictions = []
        for i,t in enumerate(testvectors):
            # NOTE(review): predicting on a single flat vector -- newer sklearn
            # requires a 2-D array; verify against the pinned sklearn version.
            classification = self.clf.predict(t)
            proba = self.clf.predict_proba(t)
            classification_label = self.labeldict_back[classification[0]]
            if len(ts[0]["meta"]) == 6:
                predictions.append([ts[i]["meta"][5], ts[i]["label"] + " " + classification_label, \
                    " ".join([str(round(x,2)) for x in proba.tolist()[0]])])
            else:
                predictions.append([" ".join([x for x in ts[i]["ngrams"] if not
                    re.search("_",x)]), ts[i]["label"] + " " + classification_label, \
                    " ".join([str(round(x,2)) for x in proba.tolist()[0]])])
        return predictions

    def train_svm(self,params = 10):
        """Randomized-search SVM hyperparameters, then fit self.clf with the best.

        Multiclass (>2 labels) uses an OutputCodeClassifier wrapper (hence the
        estimator__ prefixes in the grid); binary uses a plain SVC.
        params -- n_iter for RandomizedSearchCV.
        """
        #obtain the best parameter settings for an svm outputcode classifier
        if len(self.labels) > 2:
            print("outputcodeclassifier")
            param_grid = {'estimator__C': [0.001, 0.005, 0.01, 0.5, 1, 5, 10, 50, 100, 500, 1000],
                'estimator__kernel': ['linear','rbf','poly'],
                'estimator__gamma': [0.0005, 0.002, 0.008, 0.032, 0.128, 0.512, 1.024, 2.048],
                'estimator__degree': [1,2,3,4]}
            model = OutputCodeClassifier(svm.SVC(probability=True))
        else:
            print("svc model")
            param_grid = {'C': [0.001, 0.005, 0.01, 0.5, 1, 5, 10, 50, 100, 500, 1000],
                'kernel': ['linear','rbf','poly'],
                'gamma': [0.0005, 0.002, 0.008, 0.032, 0.128, 0.512, 1.024, 2.048],
                'degree': [1,2,3,4]}
            model = svm.SVC(probability=True)
        paramsearch = RandomizedSearchCV(model, param_grid, cv=5, verbose=2,n_iter = params,n_jobs=self.jobs)
        print("Grid search...")
        paramsearch.fit(self.training_csr,numpy.asarray(self.trainlabels))
        print("Prediction...")
        #print the best parameters to the file
        parameters = paramsearch.best_params_
        self.outstring = "best parameter settings:\n"
        for parameter in parameters.keys():
            self.outstring += (parameter + ": " + str(parameters[parameter]) + "\n")
        self.outstring += ("best score: " + str(paramsearch.best_score_) + "\n\n")
        #train an svm outputcode classifier using the best parameters
        if len(self.labels) > 2:
            clf = svm.SVC(probability=True, C=parameters['estimator__C'],
                kernel=parameters['estimator__kernel'],gamma=parameters['estimator__gamma'],
                degree=parameters['estimator__degree'])
            self.clf = OutputCodeClassifier(clf,n_jobs=self.jobs)
            self.clf.fit(self.training_csr,self.trainlabels)
        else:
            self.clf = svm.SVC(probability=True, C=parameters['C'],
                kernel=parameters['kernel'],gamma=parameters['gamma'],
                degree=parameters['degree'])
            self.clf.fit(self.training_csr,self.trainlabels)

    def train_nb(self):
        """Fit a Multinomial Naive Bayes model on the CSR training matrix."""
        self.clf = naive_bayes.MultinomialNB()
        self.clf.fit(self.training_csr,self.trainlabels)

    def train_decisiontree(self):
        """Fit a decision tree (needs a dense matrix, hence toarray())."""
        self.clf = tree.DecisionTreeClassifier()
        self.clf.fit(self.training_csr.toarray(),self.trainlabels)

    def tenfold_train(self,voting,classifiers = [],p = 10):
        """Add per-classifier prediction features to training data via 10-fold CV.

        For each fold, trains the requested classifiers ("svm"/"nb"/"dt") on
        the other nine folds and records their predictions as "___<name>"
        features on the held-out instances.
        NOTE(review): mutable default `classifiers = []` -- shared across
        calls if ever mutated; callers appear to pass their own list.
        """
        kf = cross_validation.KFold(len(self.training), n_folds=10)
        training = deepcopy(self.training)
        feat = deepcopy(self.features)
        fi = deepcopy(self.feature_info)
        if voting == "weighted":
            # Weighted voting: replace all features with one slot per classifier.
            self.feature_info = {}
            self.features = []
            for instance in self.training:
                instance["sparse"] = defaultdict(int)
                instance["ngrams"] = []
            len_features = len(self.features)
            for i,fn in enumerate(classifiers):
                featurename = "___" + fn
                self.feature_info[featurename] = len_features + i
                self.features.append(featurename)
        for train_index, test_index in kf:
            train = deepcopy([training[x] for x in train_index])
            test = deepcopy([training[y] for y in test_index])
            # Fresh sub-classifier trained on this fold's training portion.
            cl = Classifier(train,test,features = feat,feature_info = fi)
            cl.model_necessities()
            if "svm" in classifiers:
                cl.train_svm(params = p)
                predictions = cl.predict(test)
                for i,j in enumerate(test_index):
                    # predictions[i][1] is "<gold> <predicted>"; take the prediction.
                    prediction = int(float(predictions[i][1].split()[1]))
                    self.training[j]["sparse"][self.feature_info["___svm"]] = prediction
                    if prediction == 1:
                        self.training[j]["ngrams"].append("___svm")
            if "nb" in classifiers:
                cl.train_nb()
                predictions = cl.predict(test)
                for i,j in enumerate(test_index):
                    prediction = int(float(predictions[i][1].split()[1]))
                    self.training[j]["sparse"][self.feature_info["___nb"]] = prediction
                    if prediction == 1:
                        self.training[j]["ngrams"].append("___nb")
            if "dt" in classifiers:
                cl.train_decisiontree()
                predictions = cl.predict(test)
                for i,j in enumerate(test_index):
                    prediction = int(float(predictions[i][1].split()[1]))
                    self.training[j]["sparse"][self.feature_info["___dt"]] = prediction
                    if prediction == 1:
                        self.training[j]["ngrams"].append("___dt")

    def return_classification_features(self):
        """Return, per test set, the list of predicted labels (as ints)."""
        prediction_features_testset = []
        for tset in self.test:
            prediction_features = []
            predictions = self.predict(tset["instances"])
            for i,prediction in enumerate(predictions):
                prediction_features.append(int(float(predictions[i][1].split()[1])))
            prediction_features_testset.append(prediction_features)
        return prediction_features_testset

    def add_classification_features(self,featuredict,featurenames,voter):
        """Inject classifier-vote features into the test sets.

        featuredict[i][j][fn] holds the value of feature fn for instance j of
        test set i.  "majority" voting resets the feature index first;
        any voter other than "arbiter" wipes existing instance features.
        NOTE(review): statement grouping around the `if voter == "majority"`
        branch is reconstructed from collapsed formatting -- verify.
        """
        if voter == "majority":
            self.feature_info = {}
        len_features = len(self.feature_info.keys())
        for i,fn in enumerate(featurenames):
            self.feature_info[fn] = len_features + i
            self.features.append(fn)
        for i,tset in enumerate(self.test):
            for j,instance in enumerate(tset["instances"]):
                if voter != "arbiter":
                    tset["instances"][j]["sparse"] = defaultdict(int)
                    tset["instances"][j]["ngrams"] = []
                for fn in featurenames:
                    tset["instances"][j]["sparse"][self.feature_info[fn]] = featuredict[i][j][fn]
                    tset["instances"][j]["ngrams"].append(fn)

    def append_classifier_labelings(self):
        """Add a "___append" feature carrying each instance's "append" value."""
        len_features = len(self.feature_info.keys())
        self.feature_info["___append"] = len_features
        self.features.append("___append")
        for instance in self.training:
            instance["sparse"][self.feature_info["___append"]] = instance["append"]
            if instance["append"] == 1:
                instance["features"].append("___append")
        for tset in self.test:
            for instance in tset["instances"]:
                instance["sparse"][self.feature_info["___append"]] = instance["append"]
                if instance["append"] == 1:
                    instance["features"].append("___append")

    def output_data(self):
        """Write features.txt, train.txt and test.txt next to the first test set's output path."""
        if re.search(".txt",self.test[0]["out"]):
            outdir = self.test[0]["out"][:-4] + "_"
        else:
            outdir = self.test[0]["out"]
        #output features
        #featureout = codecs.open(outdir + "features.txt","w","utf-8")
        featureout = open(outdir + "features.txt", "w", encoding = "utf-8")
        for feature in sorted(self.feature_info, key=self.feature_info.get):
            featureout.write(feature + "\t" + str(self.feature_info[feature]) + "\n")
        featureout.close()
        #output trainfile
        #trainout = codecs.open(outdir + "train.txt","w","utf-8")
        trainout = open(outdir + "train.txt", "w", encoding = "utf-8")
        for instance in self.training:
            trainout.write(instance["label"] + " " + ",".join(instance["ngrams"]) + " " + ",".join([str(x) for x in instance["sparse"].keys()]) + "\n")
        trainout.close()
        #output testfile
        #testout = codecs.open(outdir + "test.txt","w","utf-8")
        # NOTE(review): all test sets are appended to one file and it is never
        # closed here -- verify that this is intended.
        testout = open(outdir + "test.txt", "w", encoding = "utf-8")
        for i,tset in enumerate(self.test):
            #testout = codecs.open(outdir + "test" + str(i) + ".txt","w","utf-8")
            for instance in tset["instances"]:
                testout.write(instance["label"] + " " + ",".join(instance["ngrams"]) + " " + ",".join([str(x) for x in instance["sparse"].keys()]) + "\n")

    def test_model(self):
        """Predict every test set and write one *_predictions.txt per set."""
        for tset in self.test:
            testresults = self.predict(tset["instances"])
            #outfile = codecs.open(tset["out"] + "predictions.txt","w","utf-8")
            if re.search(".txt",tset["out"]):
                outstring = tset["out"][:-4] + "_predictions.txt"
            else:
                outstring = tset["out"] + "predictions.txt"
#            outfile = codecs.open(outstring,"w","utf-8")
            outfile = open(outstring, "w", encoding = "utf-8")
            # Prepend the best-parameter report when train_svm produced one.
            if self.outstring:
                outfile.write(self.outstring)
            for instance in testresults:
                outfile.write("\t".join(instance) + "\n")
            outfile.close()

    def save_model(self):
        """Pickle the trained classifier and dump vocabulary + idf weights per test set."""
        for tset in self.test:
            outfile = tset["out"][:-4] + "_model.joblib.pkl"
            #with open(outfile, 'wb') as fid:
            #    cPickle.dump(self.clf, fid)
            with open(outfile, 'wb') as fid:
                pickle.dump(self.clf, fid)
            #_ = joblib.dump(, outfile, compress=9)
            #outvocabulary = codecs.open(tset["out"] + "vocabulary.txt","w","utf-8")
            outstring = tset["out"][:-4] + "_vocabulary.txt"
            #outvocabulary = codecs.open(outstring,"w","utf-8")
            outvocabulary = open(outstring, "w", encoding = "utf-8")
            for feature in self.features:
                outvocabulary.write(feature + "\n")
            outvocabulary.close()
            # NOTE(review): self.idf only exists when scaling == "tfidf" --
            # this raises AttributeError otherwise; verify callers.
            #outidf = codecs.open(tset["out"][:-4] + "_idfs.txt","w","utf-8")
            outidf = open(tset["out"][:-4] + "_idfs.txt", "w", encoding = "utf-8")
            for key in self.idf.keys():
                outidf.write(str(key) + "\t" + str(self.idf[key]) + "\n")
            outidf.close()
def test_ecoc_exceptions():
    """predict() before fit() must raise (sklearn's NotFittedError derives from ValueError)."""
    unfitted = OutputCodeClassifier(LinearSVC(random_state=0))
    assert_raises(ValueError, unfitted.predict, [])
# NOTE(review): fragment begins mid-script -- the per-seed loop producing
# `s`, `acc`, `prediction` and `acc_array` is outside this view.
print("Accuracy using MLPClassifier and Random Seed:", s, ":", str(acc))
print(confusion_matrix(label_test, prediction))

print("Mean Accuracy using MLPClassifier Classifier: ", np.array(acc_array).mean())

#----------------------------------------------------------------
# Init the Models for Comparision
#----------------------------------------------------------------
models = [
    BaggingClassifier(),
    RandomForestClassifier(),
    AdaBoostClassifier(),
    KNeighborsClassifier(),
    GaussianNB(),
    tree.DecisionTreeClassifier(),
    svm.SVC(kernel='linear', C=1),
    # NOTE(review): the name list below calls this entry "OutputCodeClassifier
    # with Linear SVM", but the base estimator here is BaggingClassifier --
    # one of the two is stale; confirm which was intended.
    OutputCodeClassifier(BaggingClassifier()),
    OneVsRestClassifier(svm.SVC(kernel='linear'))
]
# Display names aligned index-for-index with `models`.
model_names = [
    "Bagging with DT",
    "Random Forest",
    "AdaBoost",
    "KNN",
    "Naive Bayes",
    "Decision Tree",
    "Linear SVM",
    "OutputCodeClassifier with Linear SVM",
    "OneVsRestClassifier with Linear SVM"
]

#----------------------------------------------------------------
# Run Each Model
#----------------------------------------------------------------
for model, name in zip(models, model_names):
    model.fit(data_train, label_train)

    # Display the relative importance of each attribute
    if name == "Random Forest":