from sklearn.svm import LinearSVC
from sklearn.exceptions import NotFittedError
import joblib


class Classifier():
    def __init__(self, model_filename=None):
        if model_filename is not None:
            self.load_model(model_filename)
        else:
            self.linSVC_obj = LinearSVC()

    def save_model(self, filename):
        # Save the LinearSVC object to file; compress=3 also keeps the model in a single file.
        joblib.dump(self.linSVC_obj, filename, compress=3)

    def load_model(self, filename):
        # Load the LinearSVC object from file.
        self.linSVC_obj = joblib.load(filename)

    def train(self, X, Y):
        assert X.ndim == 2, "Classifier training data X.ndim is %d instead of 2" % X.ndim
        assert Y.ndim == 1, "Classifier training data Y.ndim is %d instead of 1" % Y.ndim
        # Train the model.
        self.linSVC_obj.fit(X, Y)

    def get_predictions(self, X):
        assert X.ndim == 2, "Classifier prediction data X.ndim is %d instead of 2" % X.ndim
        # Get the predicted classes.
        try:
            return self.linSVC_obj.predict(X)
        except NotFittedError:
            raise NotFittedError("Classification model cannot predict without being trained first. "
                                 + "Train the classification model at least once to prevent this error.")
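# A minimal usage sketch for the Classifier wrapper above. The arrays and the
# "model.pkl" path are made up for illustration; only the class itself comes
# from the code above.
import numpy as np

X_demo = np.array([[0.0, 1.0], [1.0, 0.0], [0.9, 0.1], [0.1, 0.9]])
Y_demo = np.array([0, 1, 1, 0])

clf = Classifier()
clf.train(X_demo, Y_demo)
clf.save_model("model.pkl")

# Re-loading through the constructor restores the fitted LinearSVC,
# so predictions work immediately without retraining.
clf_restored = Classifier(model_filename="model.pkl")
print(clf_restored.get_predictions(X_demo))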
def run_ratio(self, dataset, set_size):
    '''
    Compare several competing methods while changing the ratio of the positive
    class in the dataset. We use a binary-class dataset for ease of interpretation.
    '''
    X_train_full, y_train_full, X_test, y_test = dataset
    X_train, y_train = self.get_sub_set_with_size([X_train_full, y_train_full], set_size)
    test_set_original = (X_test, y_test)

    large = ENMLT(LinearSVC)
    large.fit(X_train, y_train)

    simple = LinearSVC()
    simple.fit(X_train, y_train)

    for r in numpy.arange(0.05, 1.0, 0.05):
        # Generate a new test set with the desired positive proportion.
        X_test_new, y_test_new = SetGen.with_pos_ratio(test_set_original, r, pos_label=1)

        y_pred = large.predict(X_test_new)
        cm = confusion_matrix(y_test_new, y_pred)
        acc1 = self.accuracy(cm)

        y_pred = simple.predict(X_test_new)
        cm = confusion_matrix(y_test_new, y_pred)
        acc2 = self.accuracy(cm)

        print "%.2f, %f, %f" % (r, acc1, acc2)
def fit(self, X, y):
    # The smaller C, the stronger the regularization.
    # The more regularization, the more sparsity.
    self.transformer_ = LinearSVC(C=1000, penalty="l1", dual=False, tol=1e-3)
    X = self.transformer_.fit_transform(X, y)
    return LinearSVC.fit(self, X, y)
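# Note: LinearSVC's transform()/fit_transform() only existed in older
# scikit-learn releases (the method was deprecated and later removed).
# A sketch of the same l1-based selection step written against the modern
# API with SelectFromModel; `fit_modern` is a hypothetical name.
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectFromModel

def fit_modern(self, X, y):
    lsvc = LinearSVC(C=1000, penalty="l1", dual=False, tol=1e-3).fit(X, y)
    self.transformer_ = SelectFromModel(lsvc, prefit=True)
    X = self.transformer_.transform(X)
    return LinearSVC.fit(self, X, y)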
def train(dataset):
    print "Reading dataset ..."
    features = np.array(dataset.data, "int16")
    labels = np.array(dataset.target, "int")
    nExamples = features.shape[0]

    # Compute HOGs for each image in the database
    print "Extracting features for " + str(nExamples) + " training examples ... ",
    sys.stdout.flush()
    startTime = time.clock()
    list_hog_fd = []
    for feature in features:
        fd = hog(
            feature.reshape((28, 28)),
            orientations=9,
            pixels_per_cell=(14, 14),
            cells_per_block=(1, 1),
            visualise=False
        )
        list_hog_fd.append(fd)
    hog_features = np.array(list_hog_fd, "float64")
    elapsedTime = time.clock() - startTime
    print "{0:.3f}s ({1:.4f}s/example)".format(elapsedTime, elapsedTime / nExamples)

    print "Training ... ",
    sys.stdout.flush()
    startTime = time.clock()
    clf = LinearSVC()
    clf.fit(hog_features, labels)
    elapsedTime = time.clock() - startTime
    print "{0:.3f}s".format(elapsedTime)

    print "Saving model to " + MODEL_FILE
    joblib.dump(clf, MODEL_FILE, compress=3)
    print "Training finished ..."
def svm_vecteur():
    "Interpret the images as pixel vectors and classify them with a linear SVM"
    best = np.zeros(4)
    for npix in range(50, 200, 50):
        _, data, target, _ = utils.chargementVecteursImages(mer, ailleurs, 1, -1, npix)
        # Note: random.seed() returns None, so random_state is effectively unset here.
        X_train, X_test, Y_train, Y_test = train_test_split(data, target, test_size=0.3, random_state=random.seed())
        for iterations in range(250, 1000, 250):
            start_time = time.time()
            svc = LinearSVC(random_state=random.seed(), max_iter=iterations)
            x1 = np.array(X_train)
            x1 = np.reshape(x1, (x1.shape[0], x1.shape[2]))
            x2 = np.array(X_test)
            x2 = np.reshape(x2, (x2.shape[0], x2.shape[2]))
            svc.fit(X=x1, y=Y_train)
            score = svc.score(x2, Y_test)
            end_time = time.time()
            if score > best[0]:
                best[0] = score
                best[1] = iterations
                best[2] = end_time - start_time
                best[3] = npix
    # best[2] holds the elapsed time; the original mistakenly printed best[3] (npix) in the time column.
    print("| Linear SVM | V.Pix {:4.0f} | iterations={:1.0f} | {:10.3f}ms | {:1.3f} |".format(best[3], best[1], best[2] * 1000, best[0]))
def linear_svc(train_bow, train_labels, test_bow, test_labels, bow_indexes):
    print("Training linear svc")
    svc_classifier = LinearSVC()
    svc_classifier.fit(train_bow, train_labels)
    print("Testing linear svc")
    test(svc_classifier, "svc", test_bow, test_labels, bow_indexes)
def processMBHval():
    for featType in ['MBH']:
        names = getnames()
        gtlabels = readpkl('{}data/labels.pkl'.format(baseDir))
        indexs = readpkl('{}data/indexs.pkl'.format(baseDir))
        actionIDs, taxonomy, database = readannos()
        print 'getting training data.... ',
        xtrain, ytrain = getdataVal(database, indexs, gtlabels, 'training', featType)
        print 'got it!! and shape is ', np.shape(xtrain)
        # print 'getting validation data.... ',
        # xval, yval = getdata(database, indexs, gtlabels, 'validation', featType)
        # print 'got it!! and shape is ', np.shape(xval)
        if featType == 'IMS':
            jobs = 16
            c = 0.01
        else:
            jobs = 16
            c = 10
        clf = LinearSVC(C=c)
        clf = clf.fit(xtrain, ytrain)
        saveName = '{}data/train-valSVM-{}.pkl'.format(baseDir, featType)
        with open(saveName, 'w') as f:
            pickle.dump(clf, f)
def train_classifier():
    pos_feat_path = positive_features_path
    neg_feat_path = negative_features_path
    model_path = classifier_model_path

    feature_vectors = []
    labels = []

    for feat_path in glob.glob(os.path.join(pos_feat_path, "*.feat")):
        fd = joblib.load(feat_path)
        print len(fd)
        if len(fd):
            fd = fd.astype(numpy.object)
            feature_vectors.append(fd)
            labels.append(1)

    for feat_path in glob.glob(os.path.join(neg_feat_path, "*.feat")):
        fd = joblib.load(feat_path)
        print len(fd)
        if len(fd):
            fd = fd.astype(numpy.object)
            feature_vectors.append(fd)
            labels.append(0)

    classifier = LinearSVC()
    print "Training classifier"
    classifier.fit(feature_vectors, labels)
    print "Classifier successfully trained"
    if not os.path.isdir(os.path.split(model_path)[0]):
        os.makedirs(os.path.split(model_path)[0])
    joblib.dump(classifier, model_path)
def fit(self, X, Y, W):
    # Note: the sample weights W are accepted but not used here.
    clf = LinearSVC(penalty=self.penalty, loss=self.loss, dual=self.dual,
                    tol=self.tol, C=self.C, multi_class=self.multi_class,
                    fit_intercept=self.fit_intercept,
                    intercept_scaling=self.intercept_scaling,
                    random_state=self.random_state)
    return LinearSVMClassifier(clf.fit(X, Y.reshape(-1)))
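# If per-sample weights are actually wanted, recent scikit-learn versions of
# LinearSVC.fit accept a sample_weight argument. A sketch (hypothetical
# `fit_weighted` name, reusing only self.C and self.random_state; wiring the
# remaining parameters through is left to the surrounding class):
def fit_weighted(self, X, Y, W):
    clf = LinearSVC(C=self.C, random_state=self.random_state)
    return LinearSVMClassifier(clf.fit(X, Y.reshape(-1), sample_weight=W))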
def train(train_input, train_output, test_input, test_output):
    # Training module
    # Choose a model
    # model = MultinomialNB()
    # model = GaussianNB()
    # model = SGDClassifier()
    # model = SVC(kernel='linear')  # this one is very slow
    model = LinearSVC()
    # model = RandomForestClassifier(max_depth=2, n_estimators=500)
    # model = AdaBoostClassifier(n_estimators=500, base_estimator=DecisionTreeClassifier(max_depth=10))

    # Train & evaluate
    model.fit(train_input, train_output)
    pred_train = model.predict(train_input)
    pred_test = model.predict(test_input)
    label_size = max(train_output) + 1
    train_ratio = cal_accuracy(pred_train, train_output)
    train_recal = cal_recall(pred_train, train_output, label_size)
    # print(test_output)
    print(list(pred_test))
    test_ratio = cal_accuracy(pred_test, test_output)
    test_recal = cal_recall(pred_test, test_output, label_size)
    print('%f\t%f' % (train_ratio, test_ratio))
    print('%f\t%f' % (train_recal, test_recal))
def stump(X, y):
    score = cross_val_score(LinearSVC(), X, y, cv=5, n_jobs=5,
                            scoring='average_precision')
    clf = LinearSVC()
    clf.fit(X, y)
    coef = clf.coef_[0, 0]
    inter = clf.intercept_[0]
    return np.mean(score), np.sign(coef), inter / np.abs(coef)
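# Quick sanity check of what `stump` returns, on made-up 1-D data: the fitted
# decision function is coef*x + inter, so the boundary sits at x0 = -inter/coef,
# and x0 = -sign(coef) * (inter/|coef|) recovers it from the returned pair.
import numpy as np

rng = np.random.RandomState(0)
X_demo = np.r_[rng.normal(-2, 0.5, 50), rng.normal(2, 0.5, 50)].reshape(-1, 1)
y_demo = np.r_[np.zeros(50), np.ones(50)]

ap, sign, ratio = stump(X_demo, y_demo)
print("AP=%.3f, decision boundary at x0=%.3f" % (ap, -sign * ratio))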
def linearSVM(self):
    '''
    Two-class classification with a linear SVM
    args : ->
    dst  : ->
    param: ->
    '''
    # Training data
    data_training_tmp = np.loadtxt('../../../data/statistical_data/CodeIQ_auth.txt', delimiter=' ')
    data_training = [[x[0], x[1]] for x in data_training_tmp]
    label_training = [int(x[2]) for x in data_training_tmp]

    # Test data (here the same file as the training data)
    data_test = np.loadtxt('../../../data/statistical_data/CodeIQ_auth.txt', delimiter=' ')
    print np.array(data_test).shape

    # Training
    estimator = LinearSVC(C=1.0)
    estimator.fit(data_training, label_training)

    # Prediction
    label_prediction = estimator.predict(data_test[:, 0:2])
    print(label_prediction)
    print
class TrainingTesting():
    def __init__(self):
        self.y = []
        self.noa = 0
        self.author_names = []
        self.train_data = []
        self.author_files = os.listdir(path + "/generated_files")
        # print author_names
        for author in self.author_files:
            self.author_names.append(author[:-4])
            text1 = open(path + "/generated_files/" + author, "r").read().split("\n")
            # print text1[1:-1]
            for txt in text1[1:-1]:
                t = []
                self.y.append(self.noa)
                # t.append(self.noa)
                for i in txt.split(",")[1:-1]:
                    t.append(float(i))
                self.train_data.append(t)
            self.noa += 1
        # print self.y
        # print self.train_data

    def train(self):
        self.clfr = LinearSVC()
        self.clfr.fit(self.train_data, self.y)
        # print self.author_names[clfr.predict(self.train_data[0])[0]]

    def test(self, test_data):
        self.correct_author_name = self.author_names[self.clfr.predict(test_data)[0]]
class Classifier:
    def __init__(self, ctype):
        self.ctype = ctype

    def train(self, data, labels):
        if self.ctype == "SVM":
            self.model = LinearSVC()
            self.model.fit(data, labels)
        elif self.ctype == "Decision":
            print "Unsupported"
        elif self.ctype == "Chi-Squared":
            self.model_data = data
            self.model_labels = labels

    def predict(self, data):
        if self.ctype == "SVM":
            return self.model.predict(data)
        elif self.ctype == "Decision":
            print "Unsupported"
        elif self.ctype == "Chi-Squared":
            predictions = []
            for sample, test_hist in enumerate(data):
                # Storing the first distance by default
                lowest_score = cv2.compareHist(np.array(self.model_data[0], dtype=np.float32),
                                               np.array(test_hist, dtype=np.float32),
                                               method=1)
                predictions.append(self.model_labels[0])
                # Going through the rest of the data
                for index, train_hist in enumerate(self.model_data):
                    score = cv2.compareHist(np.array(train_hist, dtype=np.float32),
                                            np.array(test_hist, dtype=np.float32),
                                            method=1)
                    if score < lowest_score:
                        lowest_score = score
                        predictions[sample] = self.model_labels[index]
            return predictions
def run_model(X_train, y_train, X_test, y_test, model, layer, C=1.0):
    """
    Implement sklearn LinearSVC model and fit to training data. Predict labels
    of test data. Dump the training data, training labels, test features, test
    labels, predictions and pickled model.

    :params X_train: array of training features
    :params y_train: array of training labels
    :params X_test: array of test features
    :params y_test: array of test labels
    :params model: Name of the pre-trained CNN used for extracting features
    :params layer: Name of the layer used for extracting features
    :params C: Optimized C value from grid search (must be positive; the
        original default of 0 would make LinearSVC raise an error)
    """
    svc = LinearSVC(C=C)
    svc.fit(X_train, y_train)
    predicted_labels = svc.predict(X_test)

    directory_name = "svm_" + model + "_" + "layer_" + layer + "_" + str(datetime.date.today()).replace("-", "_")
    directory_path = os.path.join("../models", directory_name)
    if not os.path.exists(directory_path):
        os.makedirs(directory_path)

    X_train.dump(os.path.join(directory_path, "train_data"))
    y_train.dump(os.path.join(directory_path, "train_labels"))
    X_test.dump(os.path.join(directory_path, "test_data"))
    y_test.dump(os.path.join(directory_path, "test_labels"))
    predicted_labels.dump(os.path.join(directory_path, "predicted_labels"))
    joblib.dump(svc, os.path.join(directory_path, "model.pkl"))
    return
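# ndarray.dump() writes a pickle of the array, so the artifacts saved by
# run_model can be reloaded later as sketched below (the directory name is
# hypothetical, standing in for whatever run_model generated):
import os
import numpy as np
import joblib

# directory_path = "../models/svm_vgg16_layer_fc7_2021_01_01"
# preds = np.load(os.path.join(directory_path, "predicted_labels"), allow_pickle=True)
# svc = joblib.load(os.path.join(directory_path, "model.pkl"))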
def svm_train(train_file):
    _, x, y = readFile(train_file)
    print 'reading done.'
    from sklearn.cross_validation import train_test_split
    tmp_array = np.arange(x.shape[0])
    train_i, test_i = train_test_split(tmp_array, train_size=0.8, random_state=500)
    # from sklearn import preprocessing as pp
    # scaler = pp.StandardScaler()
    # x = scaler.fit(x)
    # print 'scale done.'
    train_x = x[train_i]
    test_x = x[test_i]
    train_y = y[train_i]
    test_y = y[test_i]
    from sklearn.svm import LinearSVC
    classifier = LinearSVC()
    classifier.fit(train_x, train_y)
    print 'train done.'
    res = classifier.predict(test_x)
    print res.shape
    from sklearn.metrics import roc_auc_score
    score = roc_auc_score(test_y, res)
    print score
    return classifier
class MyComposition(object):
    def __init__(self, n_estimators=60):
        self.n_estimators = n_estimators
        self.lsvc = LinearSVC(penalty='l1', dual=False)
        self.estimators_ = None
        self.weights_ = None

    def fit(self, x_train, y_train, x_train2, y_train2):
        # svr = SVR(**svr_params)
        svr = LinearSVR(**svr_params2)
        gbr = GradientBoostingRegressor(n_estimators=self.n_estimators, learning_rate=0.1)
        abr = AdaBoostRegressor(n_estimators=self.n_estimators, learning_rate=0.01)
        estimators = [gbr, abr, svr]
        self.estimators_ = [e.fit(x_train, y_train) for e in estimators]
        x_pred = np.vstack([e.predict(x_train2) for e in self.estimators_]).T
        self.lsvc.fit(x_pred, y_train2)
        self.weights_ = np.array(self.lsvc.coef_ / np.sum(self.lsvc.coef_)).ravel()
        return self

    def predict(self, x):
        x_pred = np.vstack([e.predict(x) for e in self.estimators_]).T
        return np.sum(x_pred * self.weights_, axis=1).ravel()
def estimate_svm(textlines):
    svc = LinearSVC(C=10, random_state=1, class_weight={1: 0.35})
    data = []
    for line in textlines:
        dat = np.r_[line.in_word_distances, line.between_word_distances]
        if dat.shape[0] < 2:
            continue

        _, _, centroids = cv2.kmeans(data=np.asarray([dat]).transpose().astype(np.float32),
                                     K=2, bestLabels=None,
                                     criteria=(cv2.TERM_CRITERIA_EPS | cv2.TERM_CRITERIA_MAX_ITER, 100, 0.001),
                                     attempts=5, flags=cv2.KMEANS_PP_CENTERS)
        diff = abs(centroids[0] - centroids[1])
        if line.n_words == 1:
            # single word
            data.append([1] + [diff / np.mean(line.heights),
                               diff / (np.median(dat) + 1e-10)])
            continue

        # multi word
        data.append([-1] + [diff / np.mean(line.heights),
                            diff / (np.median(dat) + 1e-10)])
        if len(line.in_word_distances) < 2:
            continue

        # create an artificial single word
        _, _, centroids = cv2.kmeans(data=np.asarray([line.in_word_distances]).transpose().astype(np.float32),
                                     K=2, bestLabels=None,
                                     criteria=(cv2.TERM_CRITERIA_EPS | cv2.TERM_CRITERIA_MAX_ITER, 100, 0.001),
                                     attempts=5, flags=cv2.KMEANS_PP_CENTERS)
        diff = abs(centroids[0] - centroids[1])
        data.append([1] + [diff / np.mean(line.heights),
                           diff / (np.median(line.in_word_distances) + 1e-10)])

    data = np.array(data)
    svc.fit(data[:, 1:], data[:, 0])
    return svc
def svm_once(data):
    cumu_false_total = 0.0
    cumu_false_negative_total = 0.0
    total_count = 0
    x = data[0]['data']
    y = np.ravel(data[0]['labels'])
    svc = LinearSVC(C=100)
    svc.fit(x, y.tolist())
    file_false = open('svm_once_f.txt', 'w')
    file_false_negative = open('svm_once_fn.txt', 'w')
    for i in xrange(1, 100):
        print 'day(%d)' % i
        x = data[i]['data']
        y = np.ravel(data[i]['labels'])
        p_y = svc.predict(x)
        cumu_false = 0
        cumu_false_negative = 0
        for idx, f in enumerate(y == p_y):
            if not f:
                cumu_false += 1
                if y[idx] == 1:
                    cumu_false_negative += 1
        cumu_false_total += cumu_false
        cumu_false_negative_total += cumu_false_negative
        total_count += len(y)
        file_false.write("%f\n" % (cumu_false_total / total_count))
        file_false_negative.write("%f\n" % (cumu_false_negative_total / total_count))
    file_false.close()
    file_false_negative.close()
def _compute_svmnormalvector((cache_dir, images, control_images, normalization_name,
                              preprocess_file, rfe)):
    # try:
    import numpy as np
    import sys
    from cpf.profiling.cache import Cache
    from cpf.profiling.normalization import RobustLinearNormalization, normalizations
    from sklearn.svm import LinearSVC
    from cpf.profiling.profile_svmnormalvector import _compute_rfe

    cache = Cache(cache_dir)
    normalization = normalizations[normalization_name]
    normalizeddata, normalized_colnames, _ = cache.load(images, normalization=normalization)
    control_data, control_colnames, _ = cache.load(control_images, normalization=normalization)
    if preprocess_file:
        preprocessor = cpf.util.unpickle1(preprocess_file)
        normalizeddata = preprocessor(normalizeddata)
        control_data = preprocessor(control_data)
    assert len(control_data) >= len(normalizeddata)
    downsampled = control_data[np.random.randint(0, len(control_data), len(normalizeddata)), :]
    x = np.vstack((normalizeddata, downsampled))
    y = np.array([1] * len(normalizeddata) + [0] * len(downsampled))
    clf = LinearSVC(C=1.0)
    m = clf.fit(x, y)
    normal_vector = m.coef_[0]
    if rfe:
        # Copy because it is immutable (normal_vector.flags.writeable == False)
        normal_vector = np.array(normal_vector)
        normal_vector[~_compute_rfe(x, y)] = 0
    return normal_vector
def train(self, sourceClassifier, sourceTrainData, targetTrainData):
    targetClassifier = sourceClassifier
    self.checkSizes(sourceTrainData, targetTrainData)
    improvement = sys.maxsize
    i = 0
    unusedTargetData = targetTrainData
    targetTrainData = sourceTrainData  # use all the source train data as well
    targetTrainData[1] = targetTrainData[1].tolist()
    while not self.isStoppingConditionMet(self.stoppingCondition, i, improvement):
        # print("iteration number " + str(i))
        result = self.sampleSelector.selectSamples(targetClassifier, unusedTargetData, self.batch_size)
        selectedSamples = result[0]
        selectedIndices = result[1]
        # if firstIteration:
        #     print("in first iteration!")
        #     firstIteration = 0
        #     targetTrainData = [selectedSamples[0], selectedSamples[1]]
        # else:
        # print("type(selectedSamples) = %s" % type(selectedSamples[0]))
        # print("targetTrainData[0].shape[0] = %d" % targetTrainData[0].shape[0])
        targetTrainData[0] = self.robustAppend(targetTrainData[0], selectedSamples[0])
        targetTrainData[1] = targetTrainData[1] + selectedSamples[1]
        unusedTargetData = self.getNewUnusedData(unusedTargetData, selectedIndices)
        targetClassifier = LinearSVC()
        # print("targetTrainData[0].shape[0] = %d" % targetTrainData[0].shape[0])
        targetClassifier.fit(targetTrainData[0], targetTrainData[1])
        i += 1
    print("active learner was trained on {0} labeled instances.".format(self.batch_size * self.max_num_of_iterations))
    return targetClassifier
def SVC_on_fold(feature_sets, train, test, y, y_all, X, dim, dimsum, learn_options):
    y_bin = y_all[learn_options["binary target name"]].values[:, None]
    clf = LinearSVC(penalty="l2", dual=False)
    clf.fit(X[train], y_bin[train].flatten())
    # y_pred = clf.predict(X[test])[:, None]  # this returns 0/1
    # decision_function returns the signed margin instead of hard 0/1 labels,
    # which is more informative for ranking metrics such as AUC.
    y_pred = clf.decision_function(X[test])[:, None]
    return y_pred, clf
def retrain_models(username):
    train_x, train_y, body_x, body_y, head_x, head_y = model_retriever.retrieve_data_db(username)

    b_train_x = []
    b_train_y = numpy.concatenate([body_y, train_y])
    for msg in (body_x + train_x):
        b_train_x.append(extract_body_features(msg))
    body_vec = TfidfVectorizer(norm="l2")
    b_train_x = body_vec.fit_transform(b_train_x)

    h_train_x = []
    h_train_y = numpy.concatenate([head_y, train_y])
    for msg in (head_x + train_x):
        h_train_x.append(extract_header_features(msg))
    head_vec = DictVectorizer()
    h_train_x = head_vec.fit_transform(h_train_x)

    # loss='l2' is the pre-0.16 scikit-learn spelling of loss='squared_hinge'.
    body_model = LinearSVC(loss='l2', penalty="l2", dual=False, tol=1e-3)
    head_model = RidgeClassifier(tol=1e-2, solver="lsqr")

    body_model.fit(b_train_x, b_train_y)
    head_model.fit(h_train_x, h_train_y)

    print("Finished training models for " + username + "...")
    store_models(username, body_vec, body_model, head_vec, head_model)
def benchmark(k, epochs):
    print("*" * 80)
    print("k: %d, epochs: %d\n" % (k, epochs))

    # select = SelectKBest(score_func=chi2, k=k)
    select = TruncatedSVD(n_components=k)
    X_train_trunc = select.fit_transform(X_train, Y_train)
    X_test_trunc = select.transform(X_test)
    print('done truncating')

    parameters = {'C': [1, 10, 100, 1000, 10000],
                  'class_weight': ['auto', None],
                  'tol': [0.001, 0.0001]}
    clf = LinearSVC(C=100000)
    # clf = grid_search.GridSearchCV(svc, parameters)
    clf.fit(X_train_trunc, Y_train)
    pred = clf.predict(X_test_trunc)

    if CREATE_SUBMISSION:
        X_submit_trunc = select.transform(X_submit)
        pred_submit = clf.predict(X_submit_trunc)
        dump_csv(pred_submit, k, epochs)

    score = metrics.f1_score(Y_test, pred)
    print("f1-score: %0.3f" % score)
    print("classification report:")
    print(metrics.classification_report(Y_test, pred))
    print("confusion matrix:")
    print(metrics.confusion_matrix(Y_test, pred))
def main():
    dataset = load_cifar.load_cifar(n_train=N_TRAIN, n_test=N_TEST,
                                    grayscale=GRAYSCALE, shuffle=False)
    train_data = dataset['train_data']
    train_labels = dataset['train_labels']
    test_data = dataset['test_data']
    test_labels = dataset['test_labels']
    print train_data.shape, test_data.shape

    patch_extractor = image.PatchExtractor(patch_size=(PATCH_SIZE, PATCH_SIZE),
                                           max_patches=N_PATCHES / len(train_data))
    pp = preprocessing.Preprocessor(n_components=0.99)
    fl = feature_learner.FeatureLearner(pp, patch_extractor, n_clusters=N_CENTROIDS)
    fl.fit(train_data)

    train = fl.transform(train_data)
    m_train = mean(train, axis=0)
    train -= m_train
    v_train = sqrt(var(train, axis=0) + 0.01)
    train /= v_train

    test = fl.transform(test_data)
    test -= m_train
    test /= v_train

    classifier = SVC(C=10.0)  # , gamma=1e-3, verbose=False)
    classifier.fit(train, train_labels)
    print classifier.score(test, test_labels)
    return
def svm_for_multiclass():
    text_file = "/home/web_server/wangyuanfu/age/temp1"
    dataset = np.loadtxt(text_file, delimiter=" ")
    X = dataset[:, 1:]
    y = dataset[:, 0:1]
    min_max_scaler = preprocessing.MinMaxScaler()
    normalized_X = min_max_scaler.fit_transform(X)
    print len(normalized_X)

    X_train, X_test, y_train, y_test = train_test_split(normalized_X, y, test_size=0.1, random_state=7)
    clf = LinearSVC(random_state=0, C=1, multi_class='ovr', penalty='l2')
    clf = clf.fit(X_train, y_train.reshape(-1))

    # print the training score
    print("training score : %.3f " % (clf.score(X_train, y_train)))

    # make predictions
    predicted = clf.predict(X_test)
    length_predicted = len(predicted)
    print predicted.shape
    # for i in range(0, length_predicted):
    #     print predicted[i], y_test[i]
    #     print X_test[i, :], predicted[i], y_test[i], probability[i]

    # summarize the fit of the model
    print(metrics.classification_report(y_test, predicted))
    print(metrics.confusion_matrix(y_test, predicted))
    print(metrics.precision_score(y_test, predicted, average='micro'))
def svm_binary_svc_probability(X, Y, C):
    allp = np.sum(Y > 0)
    alln = len(Y) - allp
    nr_fold = 5
    perm = list(range(len(Y)))
    random.shuffle(perm)
    dec_values = np.zeros(len(Y), dtype=np.float32)
    for i in range(nr_fold):
        start = i * len(Y) // nr_fold
        end = (i + 1) * len(Y) // nr_fold
        trainL = [perm[j] for j in range(len(Y)) if j not in range(start, end)]
        testL = perm[start:end]
        trainX = X[trainL, :]
        trainY = Y[trainL]
        p_count = np.sum(trainY > 0)
        n_count = len(trainY) - p_count
        # Assign the degenerate-fold defaults to the held-out samples (testL);
        # the original wrote to positions start:end, which are the wrong rows.
        if p_count == 0 and n_count == 0:
            dec_values[testL] = 0.0
        elif p_count > 0 and n_count == 0:
            dec_values[testL] = 1.0
        elif p_count == 0 and n_count > 0:
            dec_values[testL] = -1.0
        else:
            subclf = LinearSVC(C=C, class_weight={1: allp, -1: alln})
            subclf.fit(trainX, trainY)
            dec_values[testL] = subclf.decision_function(X[testL, :]).ravel()
    return sigmoid_train(dec_values, Y)
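# `sigmoid_train` is not defined in this snippet. A minimal sketch of Platt
# scaling that could fill that role: fit P(y=1|f) = 1 / (1 + exp(A*f + B)) by
# minimizing the negative log-likelihood, using Platt's prior-smoothed targets.
# The function name and the (A, B) return convention are assumptions.
import numpy as np
from scipy.optimize import minimize

def sigmoid_train(dec_values, Y):
    n_pos = np.sum(Y > 0)
    n_neg = len(Y) - n_pos
    # Smoothed targets from Platt (1999) to avoid overfitting the sigmoid.
    t = np.where(Y > 0, (n_pos + 1.0) / (n_pos + 2.0), 1.0 / (n_neg + 2.0))

    def nll(params):
        A, B = params
        p = 1.0 / (1.0 + np.exp(A * dec_values + B))
        eps = 1e-12
        return -np.sum(t * np.log(p + eps) + (1 - t) * np.log(1 - p + eps))

    res = minimize(nll, x0=np.array([0.0, 0.0]), method="Nelder-Mead")
    return res.x  # (A, B) of the fitted sigmoid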
class LinearSVM:
    def __init__(self):
        self.clf = LinearSVC(penalty='l2', loss='l1', dual=True, tol=0.0001,
                             C=1.0, multi_class='ovr', fit_intercept=True,
                             intercept_scaling=1, class_weight=None, verbose=0,
                             random_state=None)
        self.pattern = '(?u)\\b[A-Za-z]{3,}'
        self.tfidf = TfidfVectorizer(sublinear_tf=False, use_idf=True,
                                     smooth_idf=True, stop_words='english',
                                     token_pattern=self.pattern,
                                     ngram_range=(1, 3))

    def train(self, fileName):
        print "LinearSVM Classifier is being trained"
        table = pandas.read_table(fileName, sep="\t", names=["cat", "message"])
        X_train = self.tfidf.fit_transform(table.message)
        Y_train = []
        for item in table.cat:
            Y_train.append(int(item))
        self.clf.fit(X_train, Y_train)
        print "LinearSVM Classifier has been trained"

    def classify(self, cFileName, rFileName):
        table = pandas.read_table(cFileName, names=["message"])
        X_test = self.tfidf.transform(table.message)
        print "Data have been classified"
        with open(rFileName, 'w') as f:
            for item in self.clf.predict(X_test).astype(str):
                f.write(item + '\n')

    def validate(self, fileName):
        table = pandas.read_table(fileName, sep="\t", names=["cat", "message"])
        X_validate = self.tfidf.transform(table.message)
        Y_validated = self.clf.predict(X_validate).astype(str)
        totalNum = len(table.cat)
        errorCount = 0
        for i in range(0, totalNum):
            if int(table.cat[i]) != int(Y_validated[i]):
                errorCount += 1
        # The reported figure is the overall accuracy, not precision in the strict sense.
        print "Data have been validated! Accuracy={}".format((totalNum - errorCount) / float(totalNum))
def do_SVM(x, y, xt, yt):
    Cs = [0.01, 0.1, 10, 100, 1000]
    for C in Cs:
        print "Testing C value: %f" % C
        model = LinearSVC(C=C)
        model = model.fit(x, y)
        score_the_model(model, x, y, xt, yt, "SVM")
def test_class_weights_rescale_C():
    # check that our crammer-singer implementation with class weights and
    # rescale_C=True is the same as LinearSVC's c-s class_weight implementation
    from sklearn.svm import LinearSVC
    X, Y = make_blobs(n_samples=210, centers=3, random_state=1, cluster_std=3,
                      shuffle=False)
    X = np.hstack([X, np.ones((X.shape[0], 1))])
    X, Y = X[:170], Y[:170]

    weights = 1. / np.bincount(Y)
    weights *= len(weights) / np.sum(weights)
    pbl_class_weight = MultiClassClf(n_features=3, n_classes=3,
                                     class_weight=weights, rescale_C=True)
    svm_class_weight = OneSlackSSVM(pbl_class_weight, C=10, tol=1e-5)
    svm_class_weight.fit(X, Y)

    try:
        linearsvm = LinearSVC(multi_class='crammer_singer',
                              fit_intercept=False, class_weight='auto', C=10)
        linearsvm.fit(X, Y)
        assert_array_almost_equal(svm_class_weight.w, linearsvm.coef_.ravel(), 3)
    except TypeError:
        # travis has a really old sklearn version that doesn't support
        # class_weight in LinearSVC
        pass
    print('Precision, recall and f1-score:')
    print(classification_report(y_test, y_pred))
    roc = roc_auc_score(y_test, y_pred)
    print('ROC AUC: {}'.format(roc))
    pr = average_precision_score(y_test, y_pred)
    print('Precision-recall: {}'.format(pr))
    print('-' * 10, 'End', model.__class__.__name__, '-' * 10)


fit_model(XGBClassifier())
fit_model(LogisticRegression())
fit_model(LinearSVC(random_state=0, tol=1e-5))
fit_model(KNeighborsClassifier(n_neighbors=6))

'''
PAIRWISE:
0: [0, 0, 0, 0, 1, 1, 1, 0]
1: [1, 1, 1, 1, 2, 2, 2, 0]
2: [2, 2, 2, 2, 3, 3, 3, 0]
3: [3, 3, 3, 3, 4, 4, 4, 0]
4: [4, 4, 4, 4, 5, 5, 5, 0]
5: [5, 5, 5, 5, 6, 6, 6, 0]
6: [6, 6, 6, 6, 7, 7, 7, 0]
7: [7, 7, 7, 7, 8, 8, 8, 0]
8: [8, 8, 8, 8, 9, 9, 9, 0]
9: [9, 9, 9, 9, 9, 8, 7, 1]
10: [9, 8, 7, 6, 6, 5, 4, 2]
11: [8, 7, 6, 5, 5, 4, 3, 2]
"""Prints features with the highest coefficient values, per class""" feature_names = vectorizer.get_feature_names() coefs_with_fns = sorted(zip(clf.coef_[0], feature_names)) topClass1 = coefs_with_fns[:N] topClass2 = coefs_with_fns[:-(N + 1):-1] print("Class 1 best: ") for feat in topClass1: print(feat) print("Class 2 best: ") for feat in topClass2: print(feat) # the vectorizer and classifer to use # note that I changed the tokenizer in CountVectorizer to use a custom function using spaCy's tokenizer vectorizer = CountVectorizer(tokenizer=tokenizeText, ngram_range=(1,1)) clf = LinearSVC() # the pipeline to clean, tokenize, vectorize, and classify pipe = Pipeline([('cleanText', CleanTextTransformer()), ('vectorizer', vectorizer), ('clf', clf)]) # data train = ["I love space. Space is great.", "Planets are cool. I am glad they exist in space", "lol @twitterdude that is gr8", "twitter & reddit are fun.", "Mars is a planet. It is red.", "@Microsoft: y u skip windows 9?", "Rockets launch from Earth and go to other planets.", "twitter social media > <", "@someguy @somegirl @twitter #hashtag", "Orbiting the sun is a little blue-green planet."] labelsTrain = ["space", "space", "twitter", "twitter", "space", "twitter", "space", "twitter", "twitter", "space"] test = ["i h8 riting comprehensibly #skoolsux", "planets and stars and rockets and stuff"] labelsTest = ["twitter", "space"] # train
# Fit a per-column scaler
X_scaler = StandardScaler().fit(X)
# Apply the scaler to X
# scaled_X = X_scaler.transform(X)
y = np.hstack((np.ones(len(car_features)), np.zeros(len(notcar_features))))

rand_state = np.random.randint(0, 100)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=rand_state)


def get_classifier_from_pickle():
    with open(f'classifiers/classifier{str(CONFIG)}.p', 'rb') as classifier_file:
        svc = pickle.load(classifier_file)
    return svc


if not __name__ == "__main__":
    svc = get_classifier_from_pickle()

if __name__ == "__main__":
    from sklearn.svm import LinearSVC

    # Use a linear SVC (support vector classifier)
    svc = LinearSVC()
    svc.fit(X_train, y_train)
    with open(f'classifiers/classifier{str(CONFIG)}.p', 'wb') as classifier_file:
        pickle.dump(svc, classifier_file)
    print('Test Accuracy of SVC = ', svc.score(X_test, y_test))
    print('My SVC predicts: ', svc.predict(X_test[0:10]))
    print('For labels: ', y_test[0:10])
def feature_selection(self, X, y, method):
    """
    purpose: select features
    input:
        X: train data
        y: labels
        method: the method to use
    return:
    """
    X_indices = np.arange(X.shape[-1])
    score = []

    # Removing features with low variance
    # correlation coefficient
    # SelectKBest(lambda X, Y: np.array(map(lambda x: pearsonr(x, Y), X.T)).T, k=2).fit_transform(data, target)
    # mutual information
    # SelectKBest(lambda X, Y: array(map(lambda x: mic(x, Y), X.T)).T, k=2).fit_transform(data, target)

    # Univariate feature selection (for classification)
    if method == 'chi-squared':
        skb = SelectKBest(chi2)
        skb.fit_transform(X, y)
        score = skb.scores_

    # Univariate feature selection (for regression)
    if method == 'f_regression':
        skb = SelectKBest(f_regression)
        skb.fit_transform(X, y)
        score = skb.scores_

    # L1-based feature selection (for classification)
    if method == 'LinearSVC':
        lsvc = LinearSVC(C=0.01, penalty="l1", dual=False).fit(X, y)
        # Use the coefficient magnitudes as the score; the original left
        # `score` empty here, which broke the ranking printout below.
        score = np.abs(lsvc.coef_).sum(axis=0)
        sfm = SelectFromModel(lsvc, prefit=True)
        X_new = sfm.transform(X)

    # L1-based feature selection (for regression)
    elif method == 'LassoCV':
        lasso = LassoCV().fit(X, y)
        score = lasso.coef_
        sfm = SelectFromModel(lasso, threshold=0.25, prefit=True)
        X_new = sfm.transform(X)

    # Tree-based feature selection (for classification)
    elif method == 'ExtraTreesClassifier':
        clf = ExtraTreesClassifier()
        clf = clf.fit(X, y)
        score = clf.feature_importances_  # was only printed before, never stored
        sfm = SelectFromModel(clf, threshold=0.25, prefit=True)
        X_new = sfm.transform(X)

    # Tree-based feature selection (for regression)
    elif method == 'ExtraTreesRegressor':
        clf = ExtraTreesRegressor()
        clf = clf.fit(X, y)
        score = clf.feature_importances_
        sfm = SelectFromModel(clf, threshold=0.25, prefit=True)
        X_new = sfm.transform(X)

    # Tree-based feature selection (for classification)
    elif method == 'GradientBoostingClassifier':
        clf = GradientBoostingClassifier(learning_rate=0.01)
        clf = clf.fit(X, y)
        score = clf.feature_importances_
        sfm = SelectFromModel(clf, threshold=0.25, prefit=True)
        X_new = sfm.transform(X)

    # Tree-based feature selection (for regression)
    elif method == 'GradientBoostingRegressor':
        clf = GradientBoostingRegressor(learning_rate=0.01)
        clf = clf.fit(X, y)
        score = clf.feature_importances_
        sfm = SelectFromModel(clf, threshold=0.25, prefit=True)
        X_new = sfm.transform(X)

    # Print the feature ranking
    indices = np.argsort(score)[::-1]
    print("Feature ranking:")
    for f in X_indices:
        print("feature %d: %s (%f)" % (indices[f], self.columns[indices[f]], score[indices[f]]))

    # draw plot
    plt.figure()
    # plt.bar(indices, score, width=0.2, color='r')
    plt.barh(indices, score, height=0.2, color='r')
    plt.title(method)
    plt.xlabel("score")
    plt.ylabel("feature")
    plt.grid(axis='x')
    plt.show()
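# A small driver sketch for feature_selection above. It assumes the function's
# own imports (SelectKBest, chi2, matplotlib, ...) are in scope, and uses a
# SimpleNamespace as a stand-in for the owning object, which only needs a
# `columns` attribute for the ranking printout.
from types import SimpleNamespace
import numpy as np

owner = SimpleNamespace(columns=['f0', 'f1', 'f2', 'f3'])
X_demo = np.abs(np.random.RandomState(0).randn(100, 4))  # chi2 requires non-negative X
y_demo = (X_demo[:, 0] > X_demo[:, 0].mean()).astype(int)
feature_selection(owner, X_demo, y_demo, method='chi-squared')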
for clf, name in (
        (RidgeClassifier(tol=1e-2, solver="sag"), "Ridge Classifier"),
        (Perceptron(max_iter=50), "Perceptron"),
        (PassiveAggressiveClassifier(max_iter=50), "Passive-Aggressive"),
        (KNeighborsClassifier(n_neighbors=10), "kNN"),
        (RandomForestClassifier(), "Random forest")):
    print("=" * 80)
    print(name)
    results.append(benchmark(clf))

for penalty in ["l2", "l1"]:
    print("=" * 80)
    print("%s penalty" % penalty.upper())
    # Train Liblinear model
    results.append(benchmark(LinearSVC(penalty=penalty, dual=False, tol=1e-3)))
    # Train SGD model
    results.append(benchmark(SGDClassifier(alpha=0.0001, max_iter=50, penalty=penalty)))

# Train SGD with Elastic Net penalty
print("=" * 80)
print("Elastic-Net penalty")
results.append(benchmark(SGDClassifier(alpha=0.0001, max_iter=50, penalty="elasticnet")))

# Train NearestCentroid without threshold
print("=" * 80)
print("NearestCentroid (aka Rocchio classifier)")
results.append(benchmark(NearestCentroid()))
ovr_clf.fit(train_x, train_y)
ovr_predicted = ovr_clf.predict(test_x)

from sklearn.metrics import confusion_matrix, precision_score, f1_score

ovr_confusion_matrix = confusion_matrix(test_y.values.argmax(axis=1),
                                        ovr_predicted.argmax(axis=1))
precision_ovr = precision_score(test_y, ovr_predicted, average='micro')  # gives 95.76 percent accuracy
precision_ovr_all = precision_score(test_y, ovr_predicted, average=None)  # gives 94.98, 96.52
f1_lgd = f1_score(test_y, ovr_predicted, average='micro')  # gives 95.76 percent accuracy

# Applying SVM for multiclass classification
from sklearn.svm import LinearSVC

svc = OneVsRestClassifier(LinearSVC(C=10, loss='hinge'))
svc.fit(train_x, train_y)
svc_predicted = svc.predict(test_x)

from sklearn.metrics import confusion_matrix, precision_score, f1_score

svc_confusion_matrix = confusion_matrix(test_y.values.argmax(axis=1),
                                        svc_predicted.argmax(axis=1))
precision_svc = precision_score(test_y, svc_predicted, average='micro')
precision_svc_all = precision_score(test_y, svc_predicted, average=None)
f1_sgd = f1_score(test_y, svc_predicted, average='micro')  # fixed: originally referenced the undefined test_y_label / sg_predicted

# Applying SVM for multiclass classification but with different kernels
from sklearn.svm import SVC

svm_poly = OneVsRestClassifier(SVC(kernel='poly', degree=4, C=10000))
svm_poly.fit(train_x, train_y)
svm_poly_predicted = svm_poly.predict(test_x)
###### downsample
# features_df = features_df.iloc[0:5000, :]
# features_df_val = features_df_val[0:1000, ]
# encoded_labels_df = encoded_labels_df.iloc[0:5000, :]

print("feat shape:", features_df.shape)
print("labels shape:", encoded_labels_df.shape)

X_train = np.array(features_df)
Y_train = np.array(encoded_labels_df)
x_val = np.array(features_df_val)
y_val = np.array(encoded_labels_df_val)

# Define model
linsvm = LinearSVC(loss='hinge')
                   # multi_class='ovr',
                   # verbose=True,
                   # max_iter=1000)
model = OneVsRestClassifier(linsvm, n_jobs=-1)

start = time.process_time()
model.fit(X_train, Y_train)
elapsed_fit = time.process_time() - start
print("Time to fit model (min):", elapsed_fit / 60)

start_predict = time.process_time()
### change
y_pred = model.decision_function(x_val)
elapsed_predict = time.process_time() - start_predict
# Support Vector Machines
from sklearn.svm import SVC

svc = SVC()
svc.fit(x_train, y_train)
y_pred = svc.predict(x_val)
acc_svc = round(accuracy_score(y_pred, y_val) * 100, 2)
print(acc_svc)

# Linear SVC
from sklearn.svm import LinearSVC

linear_svc = LinearSVC()
linear_svc.fit(x_train, y_train)
y_pred = linear_svc.predict(x_val)
acc_linear_svc = round(accuracy_score(y_pred, y_val) * 100, 2)
print(acc_linear_svc)

# Perceptron
from sklearn.linear_model import Perceptron

perceptron = Perceptron()
perceptron.fit(x_train, y_train)
y_pred = perceptron.predict(x_val)
acc_perceptron = round(accuracy_score(y_pred, y_val) * 100, 2)
print(acc_perceptron)
def return_model(mode, **kwargs):
    if inspect.isclass(mode):
        assert getattr(mode, 'fit', None) is not None, 'Custom model family should have a fit() method'
        model = mode(**kwargs)
    elif mode == 'logistic':
        solver = kwargs.get('solver', 'liblinear')
        n_jobs = kwargs.get('n_jobs', None)
        max_iter = kwargs.get('max_iter', 5000)
        model = LogisticRegression(solver=solver, n_jobs=n_jobs,
                                   max_iter=max_iter, random_state=666)
    elif mode == 'Tree':
        model = DecisionTreeClassifier(random_state=666)
    elif mode == 'RandomForest':
        n_estimators = kwargs.get('n_estimators', 50)
        model = RandomForestClassifier(n_estimators=n_estimators, random_state=666)
    elif mode == 'GB':
        n_estimators = kwargs.get('n_estimators', 50)
        model = GradientBoostingClassifier(n_estimators=n_estimators, random_state=666)
    elif mode == 'AdaBoost':
        n_estimators = kwargs.get('n_estimators', 50)
        model = AdaBoostClassifier(n_estimators=n_estimators, random_state=666)
    elif mode == 'SVC':
        kernel = kwargs.get('kernel', 'rbf')
        model = SVC(kernel=kernel, random_state=666)
    elif mode == 'LinearSVC':
        model = LinearSVC(loss='hinge', random_state=666)
    elif mode == 'GP':
        model = GaussianProcessClassifier(random_state=666)
    elif mode == 'KNN':
        n_neighbors = kwargs.get('n_neighbors', 5)
        model = KNeighborsClassifier(n_neighbors=n_neighbors)
    elif mode == 'NB':
        model = MultinomialNB()
    elif mode == 'linear':
        # LinearRegression does not accept a random_state argument.
        model = LinearRegression()
    elif mode == 'ridge':
        alpha = kwargs.get('alpha', 1.0)
        model = Ridge(alpha=alpha, random_state=666)
    elif 'conv' in mode:
        tf.reset_default_graph()
        address = kwargs.get('address', 'weights/conv')
        hidden_units = kwargs.get('hidden_layer_sizes', [20])
        activation = kwargs.get('activation', 'relu')
        weight_decay = kwargs.get('weight_decay', 1e-4)
        learning_rate = kwargs.get('learning_rate', 0.001)
        max_iter = kwargs.get('max_iter', 1000)
        early_stopping = kwargs.get('early_stopping', 10)
        warm_start = kwargs.get('warm_start', False)
        batch_size = kwargs.get('batch_size', 256)
        kernel_sizes = kwargs.get('kernel_sizes', [5])
        strides = kwargs.get('strides', [5])
        channels = kwargs.get('channels', [1])
        validation_fraction = kwargs.get('validation_fraction', 0.)
        global_averaging = kwargs.get('global_averaging', 0.)
        optimizer = kwargs.get('optimizer', 'sgd')
        if mode == 'conv':
            model = CShapNN(mode='classification', batch_size=batch_size, max_epochs=max_iter,
                            learning_rate=learning_rate, weight_decay=weight_decay,
                            validation_fraction=validation_fraction,
                            early_stopping=early_stopping, optimizer=optimizer,
                            warm_start=warm_start, address=address,
                            hidden_units=hidden_units, strides=strides,
                            global_averaging=global_averaging,
                            kernel_sizes=kernel_sizes, channels=channels, random_seed=666)
        elif mode == 'conv_reg':
            model = CShapNN(mode='regression', batch_size=batch_size, max_epochs=max_iter,
                            learning_rate=learning_rate, weight_decay=weight_decay,
                            validation_fraction=validation_fraction,
                            early_stopping=early_stopping, optimizer=optimizer,
                            warm_start=warm_start, address=address,
                            hidden_units=hidden_units, strides=strides,
                            global_averaging=global_averaging,
                            kernel_sizes=kernel_sizes, channels=channels, random_seed=666)
    elif 'NN' in mode:
        solver = kwargs.get('solver', 'adam')
        hidden_layer_sizes = kwargs.get('hidden_layer_sizes', (20,))
        if isinstance(hidden_layer_sizes, list):
            hidden_layer_sizes = list(hidden_layer_sizes)
        activation = kwargs.get('activation', 'relu')
        learning_rate_init = kwargs.get('learning_rate', 0.001)
        max_iter = kwargs.get('max_iter', 5000)
        early_stopping = kwargs.get('early_stopping', False)
        warm_start = kwargs.get('warm_start', False)
        if mode == 'NN':
            model = MLPClassifier(solver=solver, hidden_layer_sizes=hidden_layer_sizes,
                                  activation=activation, learning_rate_init=learning_rate_init,
                                  warm_start=warm_start, max_iter=max_iter,
                                  early_stopping=early_stopping)
        if mode == 'NN_reg':
            model = MLPRegressor(solver=solver, hidden_layer_sizes=hidden_layer_sizes,
                                 activation=activation, learning_rate_init=learning_rate_init,
                                 warm_start=warm_start, max_iter=max_iter,
                                 early_stopping=early_stopping)
    else:
        # The original attached this else to the inner `if mode=='NN_reg'`,
        # which would wrongly raise for mode='NN'; it belongs to the outer chain.
        raise ValueError("Invalid mode!")
    return model
y_pred = classifier.predict(X_test_mean)
y_pred = np.round(y_pred).flatten()
print(accuracy_score(y_test, y_pred))

classifier.fit(X_train_median, y_train)
y_pred = classifier.predict(X_test_median)
y_pred = np.round(y_pred).flatten()
print(accuracy_score(y_test, y_pred))

classifier.fit(X_train_mode, y_train)
y_pred = classifier.predict(X_test_mode)
y_pred = np.round(y_pred).flatten()
print(accuracy_score(y_test, y_pred))

from sklearn.svm import LinearSVC

classifier = LinearSVC()
classifier.fit(X_train_0, y_train)
y_pred = classifier.predict(X_test_0)
y_pred = np.round(y_pred).flatten()
print(accuracy_score(y_test, y_pred))

classifier.fit(X_train_mean, y_train)
y_pred = classifier.predict(X_test_mean)
y_pred = np.round(y_pred).flatten()
print(accuracy_score(y_test, y_pred))

classifier.fit(X_train_median, y_train)
y_pred = classifier.predict(X_test_median)
y_pred = np.round(y_pred).flatten()
print(accuracy_score(y_test, y_pred))
class AllClassificationModels:
    """
    Wrapper class around all supported classification models: LogisticRegression,
    MLPClassifier, RandomForest, SVC, NuSVC, LinearSVC, and XGBClassifier.
    AllClassificationModels runs every available classification algorithm on the
    given dataset and outputs the mean accuracy, ROC-AUC, and execution time of
    each successful model when all_classification_models() is run.
    """
    def __init__(self, attributes=None, labels=None, test_size=0.25, verbose=False):
        """
        Initializes an AllClassificationModels object.

        The following parameters are needed to use an AllClassificationModels object:

            – attributes: a numpy array of the desired independent variables (Default is None)
            – labels: a numpy array of the classes (Default is None)
            – test_size: the proportion of the dataset to be used for testing the model;
              the proportion of the dataset to be used for training will be the
              complement of test_size (Default is 0.25)
            – verbose: specifies whether or not to output any and all logging during
              model training (Default is False)

        Note: These are the only parameters allowed. All other parameters for each
        model will use their default values. For more granular control, please
        instantiate each model individually.

        The following instance data is found after running all_classification_models()
        successfully:

            – logistic_regression: a reference to the LogisticRegression model
            – MLP: a reference to the MLPClassifier model
            – random_forest: a reference to the RandomForest model
            – SVC: a reference to the SVC model
            – nu_SVC: a reference to the NuSVC model
            – linear_SVC: a reference to the LinearSVC model
            – XGB_classifier: a reference to the XGBClassifier model

        After running all_classification_models(), the mean accuracy, ROC-AUC
        (if available), and execution time for each model that ran successfully
        will be displayed in tabular form. Any models that failed to run will be listed.
        """
        self.attributes = attributes
        self.labels = labels
        self.test_size = test_size
        self.verbose = verbose

        self.logistic_regression = LogisticRegression(verbose=self.verbose)
        self.MLP = MLPClassifier(verbose=self.verbose)
        self.random_forest = RandomForestClassifier(verbose=self.verbose)
        self.SVC = SVC(verbose=self.verbose, probability=True)
        self.nu_SVC = NuSVC(verbose=self.verbose, probability=True)
        self.linear_SVC = LinearSVC(verbose=self.verbose)
        self.XGB_classifier = XGBClassifier(verbosity=int(self.verbose))

        self._classification_models = {"Model": ["Accuracy", "ROC-AUC", "Time"]}
        self._failures = []

    # Accessor methods

    def get_attributes(self):
        """
        Accessor method for attributes.

        If an AllClassificationModels object is initialized without specifying
        attributes, attributes will be None. all_classification_models() cannot
        be called until attributes is a populated numpy array of independent
        variables; call set_attributes(new_attributes) to fix this.
        """
        return self.attributes

    def get_labels(self):
        """
        Accessor method for labels.

        If an AllClassificationModels object is initialized without specifying
        labels, labels will be None. all_classification_models() cannot be called
        until labels is a populated numpy array of classes; call
        set_labels(new_labels) to fix this.
        """
        return self.labels

    def get_test_size(self):
        """
        Accessor method for test_size.

        Should return a number or None.
        """
        return self.test_size

    def get_verbose(self):
        """
        Accessor method for verbose.

        Will default to False if not set by the user.
        """
        return self.verbose

    def get_all_classification_models(self):
        """
        Accessor method that returns a list of all models.
        All models within the list will be None if all_classification_models()
        hasn't been called, yet.
        """
        return [
            self.logistic_regression, self.MLP, self.random_forest, self.SVC,
            self.nu_SVC, self.linear_SVC, self.XGB_classifier
        ]

    def get_logistic_regression(self):
        """
        Accessor method for logistic_regression.

        Will return None if all_classification_models() hasn't been called, yet.
        """
        return self.logistic_regression

    def get_MLP(self):
        """
        Accessor method for MLP.

        Will return None if all_classification_models() hasn't been called, yet.
        """
        return self.MLP

    def get_random_forest(self):
        """
        Accessor method for random_forest.

        Will return None if all_classification_models() hasn't been called, yet.
        """
        return self.random_forest

    def get_SVC(self):
        """
        Accessor method for SVC.

        Will return None if all_classification_models() hasn't been called, yet.
        """
        return self.SVC

    def get_nu_SVC(self):
        """
        Accessor method for nu_SVC.

        Will return None if all_classification_models() hasn't been called, yet.
        """
        return self.nu_SVC

    def get_linear_SVC(self):
        """
        Accessor method for linear_SVC.

        Will return None if all_classification_models() hasn't been called, yet.
        """
        return self.linear_SVC

    def get_XGB_classifier(self):
        """
        Accessor method for XGB_classifier.

        Will return None if all_classification_models() hasn't been called, yet.
        """
        return self.XGB_classifier

    # Modifier methods

    def set_attributes(self, new_attributes=None):
        """
        Modifier method for attributes.

        Input should be a numpy array of independent variables. Defaults to None.
        """
        self.attributes = new_attributes

    def set_labels(self, new_labels=None):
        """
        Modifier method for labels.

        Input should be a numpy array of classes. Defaults to None.
        """
        self.labels = new_labels

    def set_test_size(self, new_test_size=0.25):
        """
        Modifier method for test_size.

        Input should be a number or None. Defaults to 0.25.
        """
        self.test_size = new_test_size

    def set_verbose(self, new_verbose=False):
        """
        Modifier method for verbose.

        Input should be a truthy/falsy value. Defaults to False.
        """
        self.verbose = new_verbose

    # Classification functionality

    def all_classification_models(self):
        """
        Driver method for running all classification models with given attributes
        and labels. all_classification_models() first trains the models and
        determines their mean accuracy, ROC-AUC, and execution time via
        _all_classification_models_runner(). Then, all_classification_models()
        calls _print_results() to format and print each successful model's
        measurements, while also listing any failed models.

        If verbose is True, all verbose logging for each model will be enabled.
        If verbose is False, all logging to stdout and stderr will be suppressed.
        """
        # Call helper method for running all classification models; suppress output, if needed
        if not self.verbose:
            suppress_output = io.StringIO()
            with redirect_stderr(suppress_output), redirect_stdout(suppress_output):
                self._all_classification_models_runner()
        else:
            self._all_classification_models_runner()

        # Print results
        self._print_results()

    # Helper methods

    def _all_classification_models_runner(self):
        """
        Helper method that runs all models using the given dataset and all default
        parameters. After running all models, each model is determined to be either
        a success or failure, and relevant data (accuracy, ROC-AUC, execution time)
        is recorded.

        _all_classification_models_runner() may only be called by all_classification_models().
""" # Split dataset dataset_X_train, dataset_X_test, dataset_y_train, dataset_y_test =\ train_test_split(self.attributes, self.labels, test_size=self.test_size) # Run and time all models; identify each as success or failure try: start_time = time.time() self.logistic_regression.fit(dataset_X_train, dataset_y_train) end_time = time.time() self._classification_models["LogisticRegression"] =\ [self.logistic_regression.score(dataset_X_test, dataset_y_test), roc_auc_score(self.logistic_regression.predict(dataset_X_test), self.logistic_regression.predict_proba(dataset_X_test)[::, 1]), end_time - start_time] except: self._failures.append("LogisticRegression") try: start_time = time.time() self.MLP.fit(dataset_X_train, dataset_y_train) end_time = time.time() self._classification_models["MLPClassifier"] =\ [self.MLP.score(dataset_X_test, dataset_y_test), roc_auc_score(self.MLP.predict(dataset_X_test), self.MLP.predict_proba(dataset_X_test)[::, 1]), end_time - start_time] except: self._failures.append("MLPClassifier") try: start_time = time.time() self.random_forest.fit(dataset_X_train, dataset_y_train) end_time = time.time() self._classification_models["RandomForest"] =\ [self.random_forest.score(dataset_X_test, dataset_y_test), roc_auc_score(self.random_forest.predict(dataset_X_test), self.random_forest.predict_proba(dataset_X_test)[::, 1]), end_time - start_time] except: self._failures.append("RandomForest") try: start_time = time.time() self.SVC.fit(dataset_X_train, dataset_y_train) end_time = time.time() self._classification_models["SVC"] =\ [self.SVC.score(dataset_X_test, dataset_y_test), roc_auc_score(self.SVC.predict(dataset_X_test), self.SVC.predict_proba(dataset_X_test)[::, 1]), end_time - start_time] except: self._failures.append("SVC") try: start_time = time.time() self.nu_SVC.fit(dataset_X_train, dataset_y_train) end_time = time.time() self._classification_models["NuSVC"] =\ [self.nu_SVC.score(dataset_X_test, dataset_y_test), roc_auc_score(self.nu_SVC.predict(dataset_X_test), self.nu_SVC.predict_proba(dataset_X_test)[::, 1]), end_time - start_time] except: self._failures.append("NuSVC") try: start_time = time.time() self.linear_SVC.fit(dataset_X_train, dataset_y_train) end_time = time.time() self._classification_models["LinearSVC"] =\ [self.linear_SVC.score(dataset_X_test, dataset_y_test), "Not Available", end_time - start_time] except: self._failures.append("LinearSVC") try: start_time = time.time() self.XGB_classifier.fit(dataset_X_train, dataset_y_train) end_time = time.time() self._classification_models["XGBClassifier"] =\ [self.XGB_classifier.score(dataset_X_test, dataset_y_test), roc_auc_score(self.XGB_classifier.predict(dataset_X_test), self.XGB_classifier.predict_proba(dataset_X_test)[::, 1]), end_time - start_time] except: self._failures.append("XGBClassifier") def _print_results(self): """ Helper method that prints results of _all_classification_models_runner() in tabular form. _print_results() may only be called by all_classification_models() after all models have attempted to run. """ # Print models that didn't fail print("\nResults:\n") for model, data in self._classification_models.items(): print("{:<20} {:<20} {:<20} {:<20}".format(model, data[0], data[1], data[2])) print() # Print failures, if any if len(self._failures) > 0: print("The following models failed to run:\n") for entry in self._failures: print(entry) print()
# Scaling using the Standard Scaler
scaler = StandardScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train))
X_test = scaler.transform(X_test)

os = SMOTE(random_state=0)
columns = X_train.columns
os_data_X, os_data_y = os.fit_sample(X_train, y_train)
os_data_X = pd.DataFrame(data=os_data_X, columns=columns)
os_data_y = pd.DataFrame(data=os_data_y, columns=['target'])

models = [
    RandomForestClassifier(n_estimators=200, max_depth=3, random_state=0),
    LinearSVC(),
    # Note: MultinomialNB requires non-negative features, so it will fail on
    # standard-scaled (and therefore partly negative) inputs like these.
    MultinomialNB(),
    LogisticRegression(random_state=0),
]

CV = 5
cv_df = pd.DataFrame(index=range(CV * len(models)))
entries = []
for model in models:
    model_name = model.__class__.__name__
    print("Started processing model family: ", model_name)
    accuracies = cross_val_score(model, os_data_X, np.ravel(os_data_y),
                                 scoring='accuracy', cv=CV)
    for fold_idx, accuracy in enumerate(accuracies):
def select_from_l1_svc(C=0.1, tol=1e-3, threshold="0.5*mean"):
    return SelectFromModel(LinearSVC(C=C, penalty="l1", dual=False, tol=tol,
                                     class_weight='balanced'),
                           prefit=False, threshold=threshold)
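# A usage sketch for select_from_l1_svc inside a pipeline; the iris data and
# downstream LogisticRegression are placeholders for illustration.
from sklearn.datasets import load_iris
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression

X_iris, y_iris = load_iris(return_X_y=True)
pipe = make_pipeline(select_from_l1_svc(C=0.1), LogisticRegression(max_iter=1000))
pipe.fit(X_iris, y_iris)
print(pipe.score(X_iris, y_iris))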
    pred = clf.predict(X_test)
    test_time = time() - t0
    err = metrics.zero_one(y_test, pred) / float(pred.shape[0])
    return err, train_time, test_time


######################################################################
## Train Liblinear model
liblinear_parameters = {
    'loss': 'l2',
    'penalty': 'l2',
    'C': 1000,
    'dual': False,
    'tol': 1e-3,
}
liblinear_res = benchmark(LinearSVC(**liblinear_parameters))
liblinear_err, liblinear_train_time, liblinear_test_time = liblinear_res

######################################################################
## Train GaussianNB model
gnb_err, gnb_train_time, gnb_test_time = benchmark(GaussianNB())

######################################################################
## Train SGD model
sgd_parameters = {
    'alpha': 0.001,
    'n_iter': 2,
}
sgd_err, sgd_train_time, sgd_test_time = benchmark(SGDClassifier(**sgd_parameters))
################# matplotlib Korean font setup #####################
from matplotlib import font_manager, rc
font_name = font_manager.FontProperties(
    fname="c:/Windows/Fonts/malgun.ttf").get_name()
rc('font', family=font_name)
####################################################################

from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

X, y = mglearn.datasets.make_forge()

fig, axes = plt.subplots(1, 2, figsize=(10, 3))
for model, ax in zip([LinearSVC(), LogisticRegression()], axes):
    clf = model.fit(X, y)
    mglearn.plots.plot_2d_separator(clf, X, fill=False, eps=0.5, ax=ax, alpha=.7)
    mglearn.discrete_scatter(X[:, 0], X[:, 1], y, ax=ax)
    ax.set_title("{}".format(clf.__class__.__name__))
    ax.set_xlabel("Feature 0")
    ax.set_ylabel("Feature 1")
axes[0].legend()
plt.show()
"-------------------------------------------------------------------------------------------------------------------" "----------------------------------------------- SVM / RANDOM FOREST -----------------------------------------------" while True: input_model = int( input("Choose 1: Normal SVC || 2: Linear SVC || 3: Random Forest\n")) if input_model == 1 or input_model == 2 or input_model == 3: break # SVC(kernel = 'linear') if input_model == 1: model = SVC(kernel='linear') # LinearSVC elif input_model == 2: model = LinearSVC() # Random Forest Classifier else: model = RandomForestClassifier(n_estimators=250) "-------------------------------------------------------------------------------------------------------------------" accuracy_train = [] accuracy_test = [] precision_weight = [] recall_weight = [] f1_weight = [] # To count which fold the program is currently at
digits_train = pd.read_csv(
    'http://archive.ics.uci.edu/ml/machine-learning-databases/optdigits/optdigits.tra',
    header=None)
digits_test = pd.read_csv(
    'http://archive.ics.uci.edu/ml/machine-learning-databases/optdigits/optdigits.tes',
    header=None)

X_train = digits_train[np.arange(64)]
y_train = digits_train[64]
X_test = digits_test[np.arange(64)]
y_test = digits_test[64]

from sklearn.svm import LinearSVC

svc = LinearSVC()
svc.fit(X_train, y_train)
svc_y_predict = svc.predict(X_test)

from sklearn.decomposition import PCA

pca = PCA(n_components=30)
X_pca_train = pca.fit_transform(X_train)
X_pca_test = pca.transform(X_test)

pca_svc = LinearSVC()
pca_svc.fit(X_pca_train, y_train)
pca_svc_y_predict = pca_svc.predict(X_pca_test)

from sklearn.metrics import classification_report

print('Score SVC: ', svc.score(X_test, y_test))
print(classification_report(y_test, svc_y_predict))
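# For symmetry with the raw-feature results above, the PCA-compressed model can
# be evaluated the same way (names reused from the snippet above):
print('Score PCA + SVC: ', pca_svc.score(X_pca_test, y_test))
print(classification_report(y_test, pca_svc_y_predict))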
gNB_clf_bus = GaussianNB()
rf_clf = RandomForestClassifier()
rf_clf_bus = RandomForestClassifier()
knn_clf = KNeighborsClassifier()
knn_clf_bus = KNeighborsClassifier()
logReg_clf = LogisticRegression(random_state=0)
logReg_clf_bus = LogisticRegression(random_state=0)
mlp_clf = MLPClassifier()
mlp_clf_bus = MLPClassifier()
qda_clf = QuadraticDiscriminantAnalysis()
qda_clf_bus = QuadraticDiscriminantAnalysis()
lda_clf = LinearDiscriminantAnalysis()
lda_clf_bus = LinearDiscriminantAnalysis()
gb_clf = GradientBoostingClassifier()
gb_clf_bus = GradientBoostingClassifier()
lsvm_clf = LinearSVC()
lsvm_clf_bus = LinearSVC()

clfs = [
    xgb_clf, xgb_clf_bus, svm_clf, svm_clf_bus, gb_clf, gb_clf_bus,
    mlp_clf, mlp_clf_bus
]

# ================================================Cross-Validation======================================================
if __name__ == '__main__':
    y_hats = []
    y_bus_hats = []
    accs = []
    accs_bus = []
    reports = []
    reports_bus = []
    total_de_acertos = sum(acertos)
    total_de_elementos = len(teste_dados)
    taxa_de_acerto = 100.0 * total_de_acertos / total_de_elementos

    msg = "Accuracy of {0}: {1}".format(nome, taxa_de_acerto)
    print(msg)
    return taxa_de_acerto


resultados = {}

from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC

modeloOneVsRest = OneVsRestClassifier(LinearSVC(random_state=0))
resultadoOneVsRest = fit_and_predict("OneVsRest", modeloOneVsRest, treino_dados,
                                     treino_marcacoes, teste_dados, teste_marcacoes)
resultados[resultadoOneVsRest] = modeloOneVsRest

from sklearn.multiclass import OneVsOneClassifier

modeloOneVsOne = OneVsOneClassifier(LinearSVC(random_state=0))
resultadoOneVsOne = fit_and_predict("OneVsOne", modeloOneVsOne, treino_dados,
                                    treino_marcacoes, teste_dados, teste_marcacoes)
resultados[resultadoOneVsOne] = modeloOneVsOne

from sklearn.naive_bayes import MultinomialNB

modeloMultinomial = MultinomialNB()
resultadoMultinomial = fit_and_predict("MultinomialNB", modeloMultinomial,
])
X_train = pd.concat([
    X_HC_train, X_Br_train, X_CRC_train, X_GBM_train, X_HBC_train,
    X_lung_train, X_PAAD_train
])
y_test = pd.concat([
    y_HC_test, y_Br_test, y_CRC_test, y_GBM_test, y_HBC_test, y_lung_test,
    y_PAAD_test
])
X_test = pd.concat([
    X_HC_test, X_Br_test, X_CRC_test, X_GBM_test, X_HBC_test, X_lung_test,
    X_PAAD_test
])

y_pred = OneVsOneClassifier(LinearSVC(C=100.)).fit(X_train, y_train).predict(X_test)
acc = accuracy_score(y_test, y_pred)
rec = recall_score(y_test, y_pred, average='macro')
accuracy.append(acc)
recall.append(rec)

HC_pred = y_pred[0:len(y_HC_test)]
Br_pred = y_pred[len(y_HC_test):len(y_HC_test) + len(y_Br_test)]
CRC_pred = y_pred[len(y_HC_test) + len(y_Br_test):len(y_HC_test) + len(y_Br_test) + len(y_CRC_test)]
GBM_pred = y_pred[len(y_HC_test) + len(y_Br_test) + len(y_CRC_test):len(y_HC_test) + len(y_Br_test) +
def main(args):
    args = parser.parse_args()
    source = domain_map.get(args.source, args.source)
    target = domain_map.get(args.target, args.target)

    # algorithm steps:
    # 0) create or load a random data split
    print("Extracting data")
    source_train, source_train_labels, source_test, source_test_label, target_train, target_train_labels, target_test, target_test_labels = get_splits(source, target)
    all_unlabeled, source_unlabeled, target_unlabeled = XML2arrayRAW(
        "data/" + source + "/" + source + "UN.txt",
        "data/" + target + "/" + target + "UN.txt")

    # 1) extract pivot features for domain 0 based on MI, with frequency in domain 1 over some threshold.
    print("Computing pivot candidates for the source domain (printing 2*num_pivots)")
    pivot_candidates = get_pivot_candidates(source, target, source_train,
                                            source_train_labels,
                                            source_unlabeled, target_unlabeled)
    for pivot_ind, pivot_candidate in enumerate(pivot_candidates[:args.pivots * 2]):
        feature_name, feature_ind, feature_score = pivot_candidate
        print("%s : %f" % (feature_name, feature_score))

    # 2) extract a training problem for each domain in the domain pair, where we try to
    #    predict the pivot feature from the non-pivot features (tricky because there may be interactions).
    X_st_train, y_st_train, X_st_valid, y_st_valid = create_pivot_training_problem(
        source_train + source_unlabeled, target_unlabeled,
        pivot_candidates[:args.pivots])
    num_feats = X_st_train.shape[1] // 3
    parameters = {'C': [0.01, 0.1, 1, 10]}

    for pivot_num in range(args.pivots):
        print("Building a classifier for feature ind %d (%s)" %
              (pivot_num, pivot_candidates[pivot_num][0]))
        # build a classifier for predicting pivot_num
        labels = y_st_valid[:, pivot_num].toarray()
        # clf = LogisticRegression().fit(X_st_train, y_st_train[:, pivot_num])
        svr = LinearSVC()
        clf = GridSearchCV(svr, parameters, scoring='f1', n_jobs=1)
        clf.fit(X_st_train, np.ravel(y_st_train[:, pivot_num].toarray()))
        print("  Average score of best setting %s is %f" %
              (str(clf.best_params_), clf.best_score_))

        gen_coefs = clf.best_estimator_.coef_[0, :num_feats]
        src_coefs = clf.best_estimator_.coef_[0, num_feats:2 * num_feats]
        tgt_coefs = clf.best_estimator_.coef_[0, 2 * num_feats:3 * num_feats]
        src_diff = abs(src_coefs) - abs(gen_coefs)
        tgt_diff = abs(tgt_coefs) - abs(gen_coefs)
        src_power = (src_diff > 0).sum() / num_feats
        tgt_power = (tgt_diff > 0).sum() / num_feats
        gen_feat_sum = abs(gen_coefs).sum()
        src_feat_sum = abs(src_coefs).sum()
        tgt_feat_sum = abs(tgt_coefs).sum()
        assert abs(clf.best_estimator_.coef_).sum() - (gen_feat_sum + src_feat_sum + tgt_feat_sum) < 1, \
            'Something must be wrong with the summing of the feature weights! ' \
            'The weights of the three feature partitions do not sum to the total weight (within some tolerance).'

        preds = clf.predict(X_st_valid)
        tps = (labels[:, 0] * preds).sum()
        total_true = labels.sum()
        total_preds = preds.sum()
        prec = tps / total_preds
        rec = tps / total_true
        f1 = 2 * prec * rec / (prec + rec)
        # Do source- and target-specific predictions to see whether there are any where the
        # target F1 is much worse than the source (or combined) F1; that might mean there is
        # no way to predict that feature in the target domain.
        # X_src = X_st_train[s]
        print("  F1 is %f" % (f1,))
        print("  Feature weights are gen=%f, src=%f, tgt=%f" %
              (gen_feat_sum, src_feat_sum, tgt_feat_sum))
        print("  Src feature power is %0.3f, tgt feature power is %0.3f" %
              (src_power, tgt_power))
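# A hedged cross-check (not in the original) of the manual precision/recall/F1
# arithmetic above, using sklearn's implementation; `labels`, `preds`, and `f1`
# are assumed to still be in scope from the loop above.
from sklearn.metrics import f1_score
assert abs(f1_score(labels[:, 0], preds) - f1) < 1e-6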
def __init__(self, observations, groups, features, peaked, tail_prob=0.4,
             regressor=HuberRegressor(), classifier=LinearSVC(random_state=42)):
    super().__init__(observations, groups, features)
    if len(observations) != len(features) or len(observations) != len(peaked):
        raise ValueError("observations, features, and peaked must have the same length")
    self.peaked = peaked
    self.regressor = regressor
    self.classifier = classifier
    self.tail_prob = tail_prob
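# A minimal construction sketch; `PeakedFeatureModel` is a hypothetical name for
# the class this __init__ belongs to, and the toy arrays are placeholders.
import numpy as np
n = 6
model = PeakedFeatureModel(observations=np.arange(n), groups=np.zeros(n),
                           features=np.random.rand(n), peaked=[False] * n,
                           tail_prob=0.3)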
print(df.head())

# # Drop Scale 0
# drop_cols = [c for c in df.columns if '0_0_' in c]
# df.drop(drop_cols, axis=1, inplace=True)

classifiers = {
    'svm': SVC(gamma='auto', kernel='rbf', probability=True),
    'rf': RandomForestClassifier(),
    'lda': LinearDiscriminantAnalysis()
}

selectors = {
    'mrmr': MRMR(method='MID', k_features=10),
    'mrmr2': MutualInformationFeatureSelector(method='MRMR', n_features=50, n_jobs=n_cores),
    'svc': SelectFromModel(LinearSVC(penalty='l2')),
    'lasso': SelectFromModel(LassoCV(cv=5))
}

# Create a classification pipeline
pipeline = Pipeline([
    # ('scaler', RobustScaler()),
    ('selector', selectors['mrmr2']),
    ('clf', classifiers['rf'])
])

# Define conversion times in months
times = [24, 36, 60]

# Results dataframe
results = pd.DataFrame()
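# A hedged sketch of how this pipeline might be scored per conversion time;
# the label construction (a hypothetical 'months_to_event' column in df) is an
# assumption, not from the source.
from sklearn.model_selection import cross_val_score
for t in times:
    y_t = (df['months_to_event'] <= t).astype(int)   # hypothetical label column
    X_t = df.drop(columns=['months_to_event'])
    scores = cross_val_score(pipeline, X_t, y_t, cv=5)
    results = results.append({'time': t, 'cv_accuracy': scores.mean()},
                             ignore_index=True)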
def get_ten_fold_crossvalid_perfermance(self, fisher_mode, settings=None):
    analysis_scr = []
    predicted_score = False
    reduce_ratio = 1
    # for seq_no in range(1, self.ddi_obj.total_number_of_sequences + 1):
    # subset_size = math.floor(self.ddi_obj.total_number_of_sequences / 10.0)
    kf = KFold(self.ddi_obj.total_number_of_sequences, n_folds=10)
    # for subset_no in range(1, 11):
    for ((train_index, test_index), subset_no) in izip(kf, range(1, 11)):
        # for train_index, test_index in kf:
        print("Subset:", subset_no)
        print("Train index: ", train_index)
        print("Test index: ", test_index)
        # logger.info('subset number: ' + str(subset_no))
        if 1:
            print "SVM"
            # start_index = int((subset_no - 1) * subset_size + 1)
            # if subset_no == 10:
            #     end_index = int(max(start_index + subset_size, self.ddi_obj.total_number_of_sequences))
            # else:
            #     end_index = int(start_index + subset_size)
            # print start_index, end_index
            # (train_X_10fold, train_y_10fold), (train_X_reduced, train_y_reduced), (test_X, test_y) = \
            #     self.ddi_obj.get_ten_fold_crossvalid_one_subset(start_index, end_index, reduce_ratio=reduce_ratio)
            (train_X_10fold, train_y_10fold), (train_X_reduced, train_y_reduced), (test_X, test_y) = \
                self.ddi_obj.get_ten_fold_crossvalid_one_subset(train_index, test_index, reduce_ratio=reduce_ratio)
            standard_scaler = preprocessing.StandardScaler().fit(train_X_reduced)
            scaled_train_X = standard_scaler.transform(train_X_reduced)
            scaled_test_X = standard_scaler.transform(test_X)
            Linear_SVC = LinearSVC(C=1, penalty="l2")
            Linear_SVC.fit(scaled_train_X, train_y_reduced)
            predicted_test_y = Linear_SVC.predict(scaled_test_X)
            isTest = True  # new
            analysis_scr.append((self.ddi, subset_no, fisher_mode, 'SVM', isTest) +
                                tuple(performance_score(test_y, predicted_test_y).values()))  # new
            predicted_train_y = Linear_SVC.predict(scaled_train_X)
            isTest = False  # new
            analysis_scr.append((self.ddi, subset_no, fisher_mode, 'SVM', isTest) +
                                tuple(performance_score(train_y_reduced, predicted_train_y).values()))

        # direct deep learning
        min_max_scaler = Precessing_Scaler_0_9()
        X_train_pre_validation_minmax = min_max_scaler.fit(train_X_reduced)
        X_train_pre_validation_minmax = min_max_scaler.transform(train_X_reduced)
        x_test_minmax = min_max_scaler.transform(test_X)
        pretraining_X_minmax = min_max_scaler.transform(train_X_10fold)
        x_train_minmax, x_validation_minmax, y_train_minmax, y_validation_minmax = train_test_split(
            X_train_pre_validation_minmax, train_y_reduced, test_size=0.4, random_state=42)
        finetune_lr = 1
        batch_size = 100
        pretraining_epochs = cal_epochs(5000, x_train_minmax, batch_size=batch_size)
        # pretrain_lr = 0.001
        pretrain_lr = 0.001
        training_epochs = 1500
        hidden_layers_sizes = [100, 100]
        corruption_levels = [0.1, 0.1]

        if 1:
            print "direct deep learning"
            sda = trainSda(x_train_minmax, y_train_minmax,
                           x_validation_minmax, y_validation_minmax,
                           x_test_minmax, test_y,
                           hidden_layers_sizes=hidden_layers_sizes,
                           corruption_levels=corruption_levels,
                           batch_size=batch_size,
                           training_epochs=training_epochs,
                           pretraining_epochs=pretraining_epochs,
                           pretrain_lr=pretrain_lr, finetune_lr=finetune_lr)
            print 'hidden_layers_sizes:', hidden_layers_sizes
            print 'corruption_levels:', corruption_levels
            training_predicted = sda.predict(x_train_minmax)
            y_train = y_train_minmax
            isTest = False  # new
            analysis_scr.append((self.ddi, subset_no, fisher_mode, 'DL', isTest) +
                                tuple(performance_score(y_train, training_predicted).values()))
            test_predicted = sda.predict(x_test_minmax)
            y_test = test_y
            isTest = True  # new
            analysis_scr.append((self.ddi, subset_no, fisher_mode, 'DL', isTest) +
                                tuple(performance_score(y_test, test_predicted).values()))

        if 0:
            # deep learning using unlabeled data for pretraining
            print 'deep learning with unlabel data'
            pretraining_epochs = cal_epochs(5000, pretraining_X_minmax, batch_size=batch_size)
            sda_unlabel = trainSda(x_train_minmax, y_train_minmax,
                                   x_validation_minmax, y_validation_minmax,
                                   x_test_minmax, test_y,
                                   pretraining_X_minmax=pretraining_X_minmax,
                                   hidden_layers_sizes=hidden_layers_sizes,
                                   corruption_levels=corruption_levels,
                                   batch_size=batch_size,
                                   training_epochs=training_epochs,
                                   pretraining_epochs=pretraining_epochs,
                                   pretrain_lr=pretrain_lr, finetune_lr=finetune_lr)
            print 'hidden_layers_sizes:', hidden_layers_sizes
            print 'corruption_levels:', corruption_levels
            training_predicted = sda_unlabel.predict(x_train_minmax)
            y_train = y_train_minmax
            isTest = False  # new
            analysis_scr.append((self.ddi, subset_no, fisher_mode, 'DL_U', isTest) +
                                tuple(performance_score(y_train, training_predicted, predicted_score).values()))
            test_predicted = sda_unlabel.predict(x_test_minmax)
            y_test = test_y
            isTest = True  # new
            analysis_scr.append((self.ddi, subset_no, fisher_mode, 'DL_U', isTest) +
                                tuple(performance_score(y_test, test_predicted, predicted_score).values()))

        if 0:
            # deep learning using split network
            print 'deep learning using split network'
            # get the new representation for the A set (first half of the features)
            pretraining_epochs = 5000
            hidden_layers_sizes = [100, 100, 100]
            corruption_levels = [0, 0, 0]
            x = x_train_minmax[:, :x_train_minmax.shape[1] / 2]
            print "original shape for A", x.shape
            a_MAE_A = train_a_MultipleAEs(x, pretraining_epochs=pretraining_epochs,
                                          pretrain_lr=pretrain_lr, batch_size=batch_size,
                                          hidden_layers_sizes=hidden_layers_sizes,
                                          corruption_levels=corruption_levels)
            new_x_train_minmax_A = a_MAE_A.transform(x_train_minmax[:, :x_train_minmax.shape[1] / 2])
            x = x_train_minmax[:, x_train_minmax.shape[1] / 2:]
            print "original shape for B", x.shape
            a_MAE_B = train_a_MultipleAEs(x, pretraining_epochs=pretraining_epochs,
                                          pretrain_lr=pretrain_lr, batch_size=batch_size,
                                          hidden_layers_sizes=hidden_layers_sizes,
                                          corruption_levels=corruption_levels)
            new_x_train_minmax_B = a_MAE_B.transform(x_train_minmax[:, x_train_minmax.shape[1] / 2:])
            new_x_test_minmax_A = a_MAE_A.transform(x_test_minmax[:, :x_test_minmax.shape[1] / 2])
            new_x_test_minmax_B = a_MAE_B.transform(x_test_minmax[:, x_test_minmax.shape[1] / 2:])
            new_x_validation_minmax_A = a_MAE_A.transform(x_validation_minmax[:, :x_validation_minmax.shape[1] / 2])
            new_x_validation_minmax_B = a_MAE_B.transform(x_validation_minmax[:, x_validation_minmax.shape[1] / 2:])
            new_x_train_minmax_whole = np.hstack((new_x_train_minmax_A, new_x_train_minmax_B))
            new_x_test_minmax_whole = np.hstack((new_x_test_minmax_A, new_x_test_minmax_B))
            new_x_validationt_minmax_whole = np.hstack((new_x_validation_minmax_A, new_x_validation_minmax_B))
            finetune_lr = 1
            batch_size = 100
            pretraining_epochs = cal_epochs(5000, x_train_minmax, batch_size=batch_size)
            # pretrain_lr = 0.001
            pretrain_lr = 0.001
            training_epochs = 1500
            hidden_layers_sizes = [100, 100, 100]
            corruption_levels = [0, 0, 0]
            sda_transformed = trainSda(new_x_train_minmax_whole, y_train_minmax,
                                       new_x_validationt_minmax_whole, y_validation_minmax,
                                       new_x_test_minmax_whole, y_test,
                                       hidden_layers_sizes=hidden_layers_sizes,
                                       corruption_levels=corruption_levels,
                                       batch_size=batch_size,
                                       training_epochs=training_epochs,
                                       pretraining_epochs=pretraining_epochs,
                                       pretrain_lr=pretrain_lr, finetune_lr=finetune_lr)
            print 'hidden_layers_sizes:', hidden_layers_sizes
            print 'corruption_levels:', corruption_levels
            training_predicted = sda_transformed.predict(new_x_train_minmax_whole)
            y_train = y_train_minmax
            isTest = False  # new
            analysis_scr.append((self.ddi, subset_no, fisher_mode, 'DL_S', isTest) +
                                tuple(performance_score(y_train, training_predicted, predicted_score).values()))
            test_predicted = sda_transformed.predict(new_x_test_minmax_whole)
            y_test = test_y
            isTest = True  # new
            analysis_scr.append((self.ddi, subset_no, fisher_mode, 'DL_S', isTest) +
                                tuple(performance_score(y_test, test_predicted, predicted_score).values()))

    report_name = filename + '_' + '_test10fold_'.join(map(str, hidden_layers_sizes)) + '_' + \
        str(pretrain_lr) + '_' + str(finetune_lr) + '_' + str(reduce_ratio) + '_' + \
        str(training_epochs) + '_' + current_date
    saveAsCsv(predicted_score, report_name,
              performance_score(y_test, test_predicted, predicted_score), analysis_scr)
class Vehicle_Detection():
    def __init__(self, ystart=400, ystop=656, xstart=200, xstop=1280,
                 scale=1.5, scales=[1.5], threshold_factor=5, heat_threshold=1,
                 nb_frame_ave=5, window=64, cnn_predict=False):
        self.dist_pickle = pickle.load(open("model/dist_pickle.p", "rb"))
        self.svc = self.dist_pickle["clf"]
        self.X_scaler = self.dist_pickle["scaler"]
        self.orient = self.dist_pickle["orient"]
        self.pix_per_cell = self.dist_pickle["pix_per_cell"]
        self.pix_per_cell2 = np.int(self.pix_per_cell / 2)
        self.cell_per_block = self.dist_pickle["cell_per_block"]
        self.cell_per_block2 = self.cell_per_block * 2
        self.spatial_size = self.dist_pickle["spatial_size"]
        self.hist_bins = self.dist_pickle["hist_bins"]
        self.ystart = ystart
        self.ystop = ystop
        self.ystart2 = ystart
        self.ystop2 = np.int(ystart + (ystop - ystart) / 2)
        self.scale = scale
        self.threshold_factor = threshold_factor
        self.heatmaps = deque(maxlen=self.threshold_factor)
        self.heatmap = []
        self.nb_frame_ave = nb_frame_ave
        self.frames = deque(maxlen=self.threshold_factor)
        self.heat_threshold = heat_threshold
        self.window = window
        self.scales = scales
        self.cnn_model = load_model('./dl_detect/model.h5')
        self.cnn_predict = cnn_predict
        self.xstart = xstart
        self.xstop = xstop
        self.lane_line_tracker = Lane_Line_Tracker()

    def calibrate(self):
        # Define feature parameters
        self.color_space = 'YCrCb'  # can be RGB, HSV, LUV, HLS, YUV, YCrCb
        self.orient = 9
        self.pix_per_cell = 8
        self.cell_per_block = 2
        self.hog_channel = 'ALL'
        self.spatial_size = (16, 16)
        self.hist_bins = 16
        self.spatial_feat = True
        self.hist_feat = True
        self.hog_feat = True
        t = time.time()
        n_samples = 1000
        dirs = os.listdir("data/vehicles/")
        cars = []
        print(dirs)
        for image_type in dirs:
            cars.extend(glob.glob('data/vehicles/' + image_type + '/*.jpg'))
        print('Number of Vehicles Images found', len(cars))
        with open('data/vehicles/cars.txt', 'w') as f:
            for fn in cars:
                f.write(fn + '\n')
        dirs = os.listdir("data/non-vehicles/")
        notcars = []
        print(dirs)
        for image_type in dirs:
            notcars.extend(glob.glob('data/non-vehicles/' + image_type + '/*.jpg'))
        print('Number of Non-Vehicles Images found', len(notcars))
        with open('data/non-vehicles/notcars.txt', 'w') as f:
            for fn in notcars:
                f.write(fn + '\n')

        # Read in car / not-car images
        test_cars = cars  # np.array(cars)[car_indxs]
        test_notcars = notcars  # np.array(notcars)[notcar_indxs]
        car_features = self.extract_features(
            test_cars,
            color_space=self.color_space,
            spatial_size=self.spatial_size,
            hist_bins=self.hist_bins,
            orient=self.orient,
            pix_per_cell=self.pix_per_cell,
            cell_per_block=self.cell_per_block,
            hog_channel=self.hog_channel,
            spatial_feat=self.spatial_feat,
            hist_feat=self.hist_feat,
            hog_feat=self.hog_feat)
        notcar_features = self.extract_features(
            test_notcars,
            color_space=self.color_space,
            spatial_size=self.spatial_size,
            hist_bins=self.hist_bins,
            orient=self.orient,
            pix_per_cell=self.pix_per_cell,
            cell_per_block=self.cell_per_block,
            hog_channel=self.hog_channel,
            spatial_feat=self.spatial_feat,
            hist_feat=self.hist_feat,
            hog_feat=self.hog_feat)
        print(time.time() - t, ' Seconds to compute features...')

        X = np.vstack((car_features, notcar_features)).astype(np.float)
        # Fit a per-column scaler
        self.X_scaler = StandardScaler().fit(X)
        # Apply the scaler to X
        scaled_X = self.X_scaler.transform(X)
        # Define the labels vector
        y = np.hstack((np.ones(len(car_features)), np.zeros(len(notcar_features))))
        # Split up data into randomized training and test sets
        rand_state = np.random.randint(0, 100)
        X_train, X_test, y_train, y_test = train_test_split(
            scaled_X, y, test_size=0.1, random_state=rand_state)

        print('Using: ', self.orient, 'orientations,', self.pix_per_cell,
              'pixels per cell', self.cell_per_block, 'cells per block,',
              self.hist_bins, 'histogram bins, and', self.spatial_size,
              'spatial sampling')
        print('Feature vector length:', len(X_train[0]))
        # Use a linear SVC
        self.svc = LinearSVC()
        # Check the training time of the SVC
        t = time.time()
        self.svc.fit(X_train, y_train)
        # Save the trained classifier and feature parameters for later use
        self.dist_pickle = {}
        self.dist_pickle["clf"] = self.svc
        self.dist_pickle["scaler"] = self.X_scaler
        self.dist_pickle["color_space"] = self.color_space
        self.dist_pickle["orient"] = self.orient
        self.dist_pickle["pix_per_cell"] = self.pix_per_cell
        self.dist_pickle["cell_per_block"] = self.cell_per_block
        self.dist_pickle["spatial_size"] = self.spatial_size
        self.dist_pickle["hog_channel"] = self.hog_channel
        self.dist_pickle["hist_bins"] = self.hist_bins
        self.dist_pickle["spatial_feat"] = self.spatial_feat
        self.dist_pickle["hist_feat"] = self.hist_feat
        self.dist_pickle["hog_feat"] = self.hog_feat
        pickle.dump(self.dist_pickle, open("model/dist_pickle.p", "wb"))
        print(round(time.time() - t, 2), ' Seconds to train SVC...')
        # Check the score of the SVC
        print('Test accuracy of svc = ', round(self.svc.score(X_test, y_test), 4))

    # Define a function to compute color histogram features
    def color_hist(self, image, nbins=32):
        # Take histograms of the R, G, and B channels separately
        rhist = np.histogram(image[:, :, 0], bins=nbins)
        ghist = np.histogram(image[:, :, 1], bins=nbins)
        bhist = np.histogram(image[:, :, 2], bins=nbins)
        # Bin edges (the same for every channel)
        bin_edges = rhist[1]
        # Concatenate the histograms into a single feature vector
        hist_features = np.concatenate((rhist[0], ghist[0], bhist[0]))
        # Return the feature vector
        return hist_features

    def bin_spatial(self, img, size=(32, 32)):
        # Use cv2.resize().ravel() to create the feature vector
        features = cv2.resize(img, size).ravel()
        # Return the feature vector
        return features

    def data_look(self, car_list, notcar_list):
        data_dict = {}
        # Define a key in data_dict "n_cars" and store the number of car images
        data_dict["n_cars"] = len(car_list)
        # Define a key "n_notcars" and store the number of notcar images
        data_dict["n_notcars"] = len(notcar_list)
        # Read in a test image, either car or notcar
        example_img = mpimg.imread(car_list[0])
        # Define a key "image_shape" and store the test image shape 3-tuple
        data_dict["image_shape"] = example_img.shape
        # Define a key "data_type" and store the data type of the test image
        data_dict["data_type"] = example_img.dtype
        # Return data_dict
        return data_dict

    # Define a function to extract features from a list of images
    # Have this function call bin_spatial() and color_hist()
    def extract_features(self, imgs, color_space='RGB', spatial_size=(32, 32),
                         hist_bins=32, orient=9, pix_per_cell=8, cell_per_block=2,
                         hog_channel=0, spatial_feat=True, hist_feat=True,
                         hog_feat=True):
        # Create a list to append feature vectors to
        features = []
        # Iterate through the list of images
        for file in imgs:
            file_features = []
            # Read in each one by one
            image = mpimg.imread(file)
            # Apply color conversion if other than 'RGB'
            if color_space != 'RGB':
                if color_space == 'HSV':
                    feature_image = cv2.cvtColor(image, cv2.COLOR_RGB2HSV)
                elif color_space == 'LUV':
                    feature_image = cv2.cvtColor(image, cv2.COLOR_RGB2LUV)
                elif color_space == 'HLS':
                    feature_image = cv2.cvtColor(image, cv2.COLOR_RGB2HLS)
                elif color_space == 'YUV':
                    feature_image = cv2.cvtColor(image, cv2.COLOR_RGB2YUV)
                elif color_space == 'YCrCb':
                    feature_image = cv2.cvtColor(image, cv2.COLOR_RGB2YCrCb)
            else:
                feature_image = np.copy(image)
            if spatial_feat == True:
                spatial_features = self.bin_spatial(feature_image, size=spatial_size)
                file_features.append(spatial_features)
            if hist_feat == True:
                # Apply color_hist()
                hist_features = self.color_hist(feature_image, nbins=hist_bins)
                file_features.append(hist_features)
            if hog_feat == True:
                # Call get_hog_features() with vis=False, feature_vec=True
                if hog_channel == 'ALL':
                    hog_features = []
                    for channel in range(feature_image.shape[2]):
                        hog_features.append(
                            self.get_hog_features(feature_image[:, :, channel],
                                                  orient, pix_per_cell,
                                                  cell_per_block, vis=False,
                                                  feature_vec=True))
                    hog_features = np.ravel(hog_features)
                else:
                    hog_features = self.get_hog_features(
                        feature_image[:, :, hog_channel], orient, pix_per_cell,
                        cell_per_block, vis=False, feature_vec=True)
                # Append the new feature vector to the features list
                file_features.append(hog_features)
            features.append(np.concatenate(file_features))
        # Return the list of feature vectors
        return features

    # Define a function to return HOG features and (optionally) a visualization
    def get_hog_features(self, img, orient, pix_per_cell, cell_per_block,
                         vis=False, feature_vec=True):
        # Call with two outputs if vis==True
        if vis == True:
            features, hog_image = hog(img, orientations=orient,
                                      pixels_per_cell=(pix_per_cell, pix_per_cell),
                                      cells_per_block=(cell_per_block, cell_per_block),
                                      transform_sqrt=True,
                                      visualise=vis, feature_vector=feature_vec)
            return features, hog_image
        # Otherwise call with one output
        else:
            features = hog(img, orientations=orient,
                           pixels_per_cell=(pix_per_cell, pix_per_cell),
                           cells_per_block=(cell_per_block, cell_per_block),
                           transform_sqrt=True,
                           visualise=vis, feature_vector=feature_vec)
            return features

    # Define a function that takes an image, start and stop positions in both
    # x and y, window size (x and y dimensions), and overlap fraction (for both x and y)
    def slide_window(self, img, x_start_stop=[None, None],
                     y_start_stop=[None, None], xy_window=(64, 64),
                     xy_overlap=(0.5, 0.5)):
        # If x and/or y start/stop positions are not defined, set to image size
        if x_start_stop[0] == None:
            x_start_stop[0] = 0
        if x_start_stop[1] == None:
            x_start_stop[1] = img.shape[1]
        if y_start_stop[0] == None:
            y_start_stop[0] = 0
        if y_start_stop[1] == None:
            y_start_stop[1] = img.shape[0]
        # Compute the span of the region to be searched
        xspan = x_start_stop[1] - x_start_stop[0]
        yspan = y_start_stop[1] - y_start_stop[0]
        # Compute the number of pixels per step in x/y
        nx_pix_per_step = np.int(xy_window[0] * (1 - xy_overlap[0]))
        ny_pix_per_step = np.int(xy_window[1] * (1 - xy_overlap[1]))
        # Compute the number of windows in x/y
        nx_buffer = np.int(xy_window[0] * (xy_overlap[0]))
        ny_buffer = np.int(xy_window[1] * (xy_overlap[1]))
        nx_windows = np.int((xspan - nx_buffer) / nx_pix_per_step)
        ny_windows = np.int((yspan - ny_buffer) / ny_pix_per_step)
        # Initialize a list to append window positions to
        window_list = []
        # Loop through finding x and y window positions.
        # Note: you could vectorize this step, but in practice
        # you'll be considering windows one by one with your
        # classifier, so looping makes sense
        for ys in range(ny_windows):
            for xs in range(nx_windows):
                # Calculate window position
                startx = xs * nx_pix_per_step + x_start_stop[0]
                endx = startx + xy_window[0]
                starty = ys * ny_pix_per_step + y_start_stop[0]
                endy = starty + xy_window[1]
                # Append window position to list
                window_list.append(((startx, starty), (endx, endy)))
        # Return the list of windows
        return window_list

    # Define a function to extract features from a single image window.
    # This function is very similar to extract_features(), just for a single
    # image rather than a list of images
    def single_img_features(self, img, color_space='RGB', spatial_size=(32, 32),
                            hist_bins=32, orient=9, pix_per_cell=8,
                            cell_per_block=2, hog_channel=0, spatial_feat=True,
                            hist_feat=True, hog_feat=True):
        # 1) Define an empty list to receive features
        img_features = []
        # 2) Apply color conversion if other than 'RGB'
        if (color_space != 'RGB'):
            if (color_space == 'HSV'):
                feature_image = cv2.cvtColor(img, cv2.COLOR_RGB2HSV)
            elif (color_space == 'LUV'):
                feature_image = cv2.cvtColor(img, cv2.COLOR_RGB2LUV)
            elif (color_space == 'HLS'):
                feature_image = cv2.cvtColor(img, cv2.COLOR_RGB2HLS)
            elif (color_space == 'YUV'):
                feature_image = cv2.cvtColor(img, cv2.COLOR_RGB2YUV)
            elif (color_space == 'YCrCb'):
                feature_image = cv2.cvtColor(img, cv2.COLOR_RGB2YCrCb)
        else:
            # feature_image = np.copy(img)
            feature_image = img.copy()
        # 3) Compute spatial features if flag is set
        if (spatial_feat == True):
            spatial_features = self.bin_spatial(feature_image, size=spatial_size)
            # 4) Append features to list
            img_features.append(spatial_features)
        # 5) Compute histogram features if flag is set
        if (hist_feat == True):
            hist_features = self.color_hist(feature_image, nbins=hist_bins)
            # 6) Append features to list
            img_features.append(hist_features)
        # 7) Compute HOG features if flag is set
        if (hog_feat == True):
            if (hog_channel == 'ALL'):
                hog_features = []
                hog_image = None
                for channel in range(feature_image.shape[2]):
                    hog_features.extend(
                        self.get_hog_features(feature_image[:, :, channel],
                                              orient, pix_per_cell,
                                              cell_per_block, vis=False,
                                              feature_vec=True))
            else:
                hog_features = self.get_hog_features(
                    feature_image[:, :, hog_channel], orient, pix_per_cell,
                    cell_per_block, vis=False, feature_vec=True)
            # 8) Append features to list
            img_features.append(hog_features)
        # 9) Return concatenated array of features
        return np.concatenate(img_features)

    def single_img_features_train(self, img, color_space='RGB',
                                  spatial_size=(32, 32), hist_bins=32, orient=9,
                                  pix_per_cell=8, cell_per_block=2, hog_channel=0,
                                  spatial_feat=True, hist_feat=True, hog_feat=True):
        # 1) Define an empty list to receive features
        img_features = []
        # 2) Apply color conversion if other than 'RGB'
        if (color_space != 'RGB'):
            if (color_space == 'HSV'):
                feature_image = cv2.cvtColor(img, cv2.COLOR_RGB2HSV)
            elif (color_space == 'LUV'):
                feature_image = cv2.cvtColor(img, cv2.COLOR_RGB2LUV)
            elif (color_space == 'HLS'):
                feature_image = cv2.cvtColor(img, cv2.COLOR_RGB2HLS)
            elif (color_space == 'YUV'):
                feature_image = cv2.cvtColor(img, cv2.COLOR_RGB2YUV)
            elif (color_space == 'YCrCb'):
                feature_image = cv2.cvtColor(img, cv2.COLOR_RGB2YCrCb)
        else:
            # feature_image = np.copy(img)
            feature_image = img.copy()
        # 3) Compute spatial features if flag is set
        if (spatial_feat == True):
            spatial_features = self.bin_spatial(feature_image, size=spatial_size)
            # 4) Append features to list
            img_features.append(spatial_features)
        # 5) Compute histogram features if flag is set
        if (hist_feat == True):
            hist_features = self.color_hist(feature_image, nbins=hist_bins)
            # 6) Append features to list
            img_features.append(hist_features)
        # 7) Compute HOG features if flag is set
        if (hog_feat == True):
            if (hog_channel == 'ALL'):
                hog_features = []
                hog_image = None
                for channel in range(feature_image.shape[2]):
                    hog_features.extend(
                        self.get_hog_features(feature_image[:, :, channel],
                                              orient, pix_per_cell,
                                              cell_per_block, vis=False,
                                              feature_vec=True))
            else:
                hog_features, hog_image = self.get_hog_features(
                    feature_image[:, :, hog_channel], orient, pix_per_cell,
                    cell_per_block, vis=True, feature_vec=True)
            # 8) Append features to list
            img_features.append(hog_features)
        # 9) Return concatenated array of features and the HOG visualization
        return np.concatenate(img_features), hog_image

    # Define a function you will pass an image and the list of windows
    # to be searched (output of slide_window())
    def search_windows(self, img, windows, clf, scaler, color_space='RGB',
                       spatial_size=(32, 32), hist_bins=32, hist_range=(0, 256),
                       orient=9, pix_per_cell=8, cell_per_block=2, hog_channel=0,
                       spatial_feat=True, hist_feat=True, hog_feat=True):
        # 1) Create an empty list to receive positive detection windows
        on_windows = []
        # 2) Iterate over all windows in the list
        for window in windows:
            # 3) Extract the test window from the original image
            test_img = cv2.resize(
                img[window[0][1]:window[1][1], window[0][0]:window[1][0]],
                (64, 64))
            # 4) Extract features for that window using single_img_features()
            features = self.single_img_features(test_img, color_space=color_space,
                                                spatial_size=spatial_size,
                                                hist_bins=hist_bins, orient=orient,
                                                pix_per_cell=pix_per_cell,
                                                cell_per_block=cell_per_block,
                                                hog_channel=hog_channel,
                                                spatial_feat=spatial_feat,
                                                hist_feat=hist_feat,
                                                hog_feat=hog_feat)
            # 5) Scale extracted features to be fed to the classifier
            test_features = scaler.transform(np.array(features).reshape(1, -1))
            # 6) Predict using your classifier
            prediction = clf.predict(test_features)
            # 7) If positive (prediction == 1) then save the window
            if prediction == 1:
                on_windows.append(window)
        # 8) Return windows for positive detections
        return on_windows

    # Define a function to draw bounding boxes
    def draw_boxes(self, img, bboxes, color=(0, 0, 255), thick=6):
        # Make a copy of the image
        imcopy = np.copy(img)
        # Iterate through the bounding boxes
        for bbox in bboxes:
            # Draw a rectangle given bbox coordinates
            cv2.rectangle(imcopy, bbox[0], bbox[1], color, thick)
        # Return the image copy with boxes drawn
        return imcopy

    def convert_color(self, img, conv='RGB2YCrCb'):
        if conv == 'RGB2YCrCb':
            return cv2.cvtColor(img, cv2.COLOR_RGB2YCrCb)
        if conv == 'BGR2YCrCb':
            return cv2.cvtColor(img, cv2.COLOR_BGR2YCrCb)
        if conv == 'RGB2LUV':
            return cv2.cvtColor(img, cv2.COLOR_RGB2LUV)

    # Define a single function that can extract features using HOG sub-sampling
    # and make predictions
    def find_cars(self, img, ystart, ystop, scale, svc, X_scaler, orient,
                  pix_per_cell, cell_per_block, spatial_size, hist_bins,
                  window=64, subsample=(64, 64)):
        img_boxes = []
        count = 0
        draw_img = np.copy(img)
        # Make a heatmap of zeros
        heatmap = np.zeros_like(img[:, :, 0])
        img2 = img
        img = img.astype(np.float32) / 255
        img_tosearch = img[ystart:ystop, :, :]
        img_tosearch2 = img2[ystart:ystop, :, :]
        ctrans_tosearch = self.convert_color(img_tosearch, conv='RGB2YCrCb')
        if scale != 1:
            imshape = ctrans_tosearch.shape
            ctrans_tosearch = cv2.resize(
                ctrans_tosearch,
                (np.int(imshape[1] / scale), np.int(imshape[0] / scale)))
            img_tosearch2 = cv2.resize(
                img_tosearch2,
                (np.int(imshape[1] / scale), np.int(imshape[0] / scale)))
        ch1 = ctrans_tosearch[:, :, 0]
        ch2 = ctrans_tosearch[:, :, 1]
        ch3 = ctrans_tosearch[:, :, 2]

        # Compute individual channel HOG features for the entire image
        hog1 = self.get_hog_features(ch1, orient, pix_per_cell, cell_per_block, feature_vec=False)
        hog2 = self.get_hog_features(ch2, orient, pix_per_cell, cell_per_block, feature_vec=False)
        hog3 = self.get_hog_features(ch3, orient, pix_per_cell, cell_per_block, feature_vec=False)

        # Define blocks and steps as above
        nxblocks = (ch1.shape[1] // pix_per_cell) - cell_per_block + 1
        nyblocks = (ch1.shape[0] // pix_per_cell) - cell_per_block + 1
        nfeat_per_block = orient * cell_per_block ** 2
        # 64 was the original sampling rate, with 8 cells and 8 pix per cell
        nblocks_per_window = (window // pix_per_cell) - cell_per_block + 1
        cells_per_step = 2  # Instead of overlap, define how many cells to step
        nxsteps = (nxblocks - nblocks_per_window) // cells_per_step
        nysteps = (nyblocks - nblocks_per_window) // cells_per_step

        for xb in range(nxsteps):
            for yb in range(nysteps):
                ypos = yb * cells_per_step
                xpos = xb * cells_per_step
                xleft = xpos * pix_per_cell
                ytop = ypos * pix_per_cell
                # Extract the image patch
                subimg = cv2.resize(
                    ctrans_tosearch[ytop:ytop + window, xleft:xleft + window],
                    subsample)
                # Extract HOG for this patch
                hog_feat1 = hog1[ypos:ypos + nblocks_per_window,
                                 xpos:xpos + nblocks_per_window].ravel()
                hog_feat2 = hog2[ypos:ypos + nblocks_per_window,
                                 xpos:xpos + nblocks_per_window].ravel()
                hog_feat3 = hog3[ypos:ypos + nblocks_per_window,
                                 xpos:xpos + nblocks_per_window].ravel()
                hog_features = np.hstack((hog_feat1, hog_feat2, hog_feat3))
                # Get color features
                spatial_features = self.bin_spatial(subimg, size=spatial_size)
                hist_features = self.color_hist(subimg, nbins=hist_bins)
                # Scale features and make a prediction
                test_features = X_scaler.transform(
                    np.hstack((spatial_features, hist_features,
                               hog_features)).reshape(1, -1))
                test_prediction = svc.predict(test_features)
                if test_prediction == 1:
                    xbox_left = np.int(xleft * scale)
                    ytop_draw = np.int(ytop * scale)
                    win_draw = np.int(window * scale)
                    cv2.rectangle(
                        draw_img, (xbox_left, ytop_draw + ystart),
                        (xbox_left + win_draw, ytop_draw + win_draw + ystart),
                        (0, 0, 255), 6)
                    img_boxes.append(((xbox_left, ytop_draw + ystart),
                                      (xbox_left + win_draw,
                                       ytop_draw + win_draw + ystart)))
                    heatmap[ytop_draw + ystart:ytop_draw + win_draw + ystart + 1,
                            xbox_left:xbox_left + win_draw + 1] += 1
        return draw_img, heatmap

    def find_cars_nn(self, img, ystart, ystop, scale, pix_per_cell,
                     cell_per_block, window=64, subsample=(64, 64)):
        img_boxes = []
        count = 0
        draw_img = np.copy(img)
        # Make a heatmap of zeros
        heatmap = np.zeros_like(img[:, :, 0])
        img_tosearch = img[ystart:ystop, self.xstart:self.xstop, :]
        imshape = img_tosearch.shape
        if scale != 1:
            img_tosearch = cv2.resize(
                img_tosearch,
                (np.int(imshape[1] / scale), np.int(imshape[0] / scale)))
        # Define blocks and steps as above
        nxblocks = (img_tosearch.shape[1] // pix_per_cell) - cell_per_block + 1
        nyblocks = (img_tosearch.shape[0] // pix_per_cell) - cell_per_block + 1
        # 64 was the original sampling rate, with 8 cells and 8 pix per cell
        # window = 64
        nblocks_per_window = (window // pix_per_cell) - cell_per_block + 1
        cells_per_step = 2  # Instead of overlap, define how many cells to step
        nxsteps = (nxblocks - nblocks_per_window) // cells_per_step
        nysteps = (nyblocks - nblocks_per_window) // cells_per_step

        for xb in range(nxsteps):
            for yb in range(nysteps):
                ypos = yb * cells_per_step
                xpos = xb * cells_per_step
                xleft = xpos * pix_per_cell
                ytop = ypos * pix_per_cell
                # Extract the image patch
                subimg = cv2.resize(
                    img_tosearch[ytop:ytop + window, xleft:xleft + window],
                    subsample)
                count += 1
                test_prediction = self.cnn_model.predict(subimg[None, :, :, :], batch_size=1)
                if (test_prediction[0][0] > 0.5):
                    test_prediction = 1
                else:
                    test_prediction = 0
                if (test_prediction == 1):
                    xbox_left = np.int(xleft * scale)
                    ytop_draw = np.int(ytop * scale)
                    win_draw = np.int(window * scale)
                    cv2.rectangle(
                        draw_img,
                        (xbox_left + self.xstart, ytop_draw + ystart),
                        (xbox_left + win_draw + self.xstart,
                         ytop_draw + win_draw + ystart), (0, 0, 255), 6)
                    heatmap[ytop_draw + ystart:ytop_draw + win_draw + ystart + 1,
                            xbox_left + self.xstart:xbox_left + win_draw + self.xstart + 1] += 1
        return draw_img, heatmap

    def add_heat(self, heatmap, bbox_list):
        # Iterate through list of bboxes
        for box in bbox_list:
            # Add += 1 for all pixels inside each bbox
            # Assuming each "box" takes the form ((x1, y1), (x2, y2))
            heatmap[box[0][1]:box[1][1], box[0][0]:box[1][0]] += 1
        # Return updated heatmap
        return heatmap

    def apply_threshold(self, heatmap, threshold):
        # Zero out pixels below the threshold
        heatmap[heatmap <= threshold] = 0
        # Return the thresholded map
        return heatmap

    def draw_labeled_bboxes(self, img, labels):
        # Iterate through all detected cars
        for car_number in range(1, labels[1] + 1):
            # Find pixels with each car_number label value
            nonzero = (labels[0] == car_number).nonzero()
            # Identify x and y values of those pixels
            nonzeroy = np.array(nonzero[0])
            nonzerox = np.array(nonzero[1])
            # Define a bounding box based on min/max x and y
            bbox = ((np.min(nonzerox), np.min(nonzeroy)),
                    (np.max(nonzerox), np.max(nonzeroy)))
            # Draw the box on the image
            cv2.rectangle(img, bbox[0], bbox[1], (0, 0, 255), 6)
        # Return the image
        return img

    def visualise(self, fig, rows, cols, imgs, titles):
        for i, img in enumerate(imgs):
            plt.subplot(rows, cols, i + 1)
            plt.title(i + 1)
            img_dims = len(img.shape)
            if (img_dims < 3):
                plt.imshow(img, cmap='hot')
                plt.title(titles[i])
            else:
                plt.imshow(img)
                plt.title(titles[i])

    def visualisex(self, fig, rows, cols, imgs, titles):
        for i, img in enumerate(imgs):
            plt.subplot(rows, cols, i + 1)
            plt.title(i + 1)
            img_dims = len(img.shape)
            if (img_dims < 3):
                plt.imshow(img, cmap='gray')
                plt.title(titles[i])
            else:
                plt.imshow(img)
                plt.title(titles[i])

    def process_image(self, img):
        for scale in self.scales:
            if (self.cnn_predict):
                out_image, heatmap = self.find_cars_nn(
                    img, self.ystart, self.ystop, scale, self.pix_per_cell,
                    self.cell_per_block)
            else:
                out_image, heatmap = self.find_cars(
                    img, self.ystart, self.ystop, scale, self.svc,
                    self.X_scaler, self.orient, self.pix_per_cell,
                    self.cell_per_block, self.spatial_size, self.hist_bins)
            self.heatmaps.append(heatmap)
        integrated_heat_maps = np.sum(self.heatmaps, axis=0)
        threshold_heat_map = self.apply_threshold(integrated_heat_maps,
                                                  self.heat_threshold)
        labels = label(threshold_heat_map)
        # Draw bounding boxes on a copy of the image
        draw_image = self.draw_labeled_bboxes(np.copy(img), labels)
        draw_image = self.lane_line_tracker.process_image(draw_image)
        return draw_image
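# A hedged end-to-end sketch (not in the original): run the detector on a
# single frame; the image path is a placeholder.
detector = Vehicle_Detection(scales=[1.0, 1.5], heat_threshold=2)
frame = mpimg.imread('test_images/test1.jpg')   # hypothetical test frame
annotated = detector.process_image(frame)
plt.imshow(annotated)
plt.show()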
acc_log = round(logreg.score(X_train, Y_train) * 100, 2)

# In[87]:

acc_log

# In[88]:

from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC, LinearSVC

# In[89]:

linear_svm = LinearSVC()
linear_svm.fit(X_train, Y_train)
Y_pred = linear_svm.predict(X_test)
acc_linear_svm = round(linear_svm.score(X_train, Y_train) * 100, 2)

# In[98]:

acc_linear_svm

# In[99]:

random_forest = RandomForestClassifier(n_estimators=150)
random_forest.fit(X_train, Y_train)
Y_prediction = random_forest.predict(X_test)
random_forest.score(X_train, Y_train)
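# A hedged summary cell in the style of this notebook (not in the original),
# collecting the training-set scores computed above into one ranked table.
import pandas as pd
models = pd.DataFrame({
    'Model': ['Logistic Regression', 'Linear SVM', 'Random Forest'],
    'Score': [acc_log, acc_linear_svm,
              round(random_forest.score(X_train, Y_train) * 100, 2)]})
models.sort_values(by='Score', ascending=False)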
# GET THE TRAINING AND TESTING DATA.
X_train_vids, X_test_vids = classify_library.limited_input(training_dict, testing_dict, 30, 24)
X_train, Y_train = classify_library.make_FV_matrix(X_train_vids, training_output, class_index)
X_test, Y_test = classify_library.make_FV_matrix(X_test_vids, testing_output, class_index)
training_PCA = classify_library.limited_input1(training_dict, 1)

# Experiments with PCA
pca_dim = 500
pca = PCA(n_components=pca_dim)
pca.fit(X_train)
X_train_PCA = pca.transform(X_train)
X_test_PCA = pca.transform(X_test)

estimator = OneVsRestClassifier(LinearSVC(penalty='l2', random_state=0, C=100, loss='hinge'))
classifier = estimator.fit(X_train_PCA, Y_train)
metrics = classify_library.metric_scores(classifier, X_test_PCA, Y_test, verbose=True)
logging.info("mAP, accuracy_score, avg_Precision, avg_Recall")
logging.info(metrics)
logging.info("Complete Evaluation")

do_learning_curve = True
if do_learning_curve:
    X_full = np.vstack([X_train_PCA, X_test_PCA])
    Y_full = np.hstack([Y_train, Y_test])
    title = "Learning Curves (Linear SVM, C: %d, loss: %s, penalty: %s, PCA dim: %d)" % (100, 'hinge', 'l2', pca_dim)
    # cv = cross_validation.ShuffleSplit(X_full.shape[0], n_iter=4, test_size=0.2, random_state=0)
    cv = ShuffleSplit(n_splits=5, test_size=0.5, random_state=0)
    estimator = OneVsRestClassifier(LinearSVC(random_state=0, C=100, loss='hinge', penalty='l2'))
    classify_library.plot_learning_curve(estimator, title, X_full, Y_full, (0.7, 1.01), cv=cv, n_jobs=1)
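# A hedged aside (not in the original): one way to sanity-check pca_dim=500 is
# the cumulative explained variance of the PCA fitted above.
cum_var = np.cumsum(pca.explained_variance_ratio_)
logging.info("Variance kept by %d components: %.3f" % (pca_dim, cum_var[-1]))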
from sklearn.metrics import accuracy_score, cohen_kappa_score
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC, SVC

import util
from feature_extraction import create_fbcsp

args = util.parse_args()
loader = util.Loader(args.d, dataset_dir=args.d_path, labels_dir=args.l_path)
logger = util.get_logger("TEST__%s" % args.d)

# Configuration parameters
win_len = 2          # Window size
C = 0.04
estimator = LinearSVC(C=C)
n_components = 2     # Number of components in CSP
start_time = 3.5     # Window start time for training
train_sessions = [[1, 3], [1, 2, 3], [1, 2, 3], [3], [3], [3], [3], [3], [3]]
test_sessions = [4, 5]
freq_bands = [[10, 14, 20, 24], [10, 14, 20, 24], [8, 12, 18, 22],
              [8, 12, 10, 14], [22, 26, 26, 30], [10, 14, 12, 16],
              [12, 16, 18, 22], [8, 12, 10, 14], [18, 22, 22, 26]]

logger.info("CLASSIFICATION RESULTS")
logger.info("------------")
logger.info("- Dataset = BCI Competition IV - %s" % args.d)
logger.info("- Dataset dir = %s" % args.d_path)
logger.info("- Estimator = SVM (C=%.2f)" % C)
logger.info("- Window start time for training (at sec) = %s" % start_time)
logger.info("- Window length (secs) = %s" % win_len)
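# A hedged sketch of the per-subject loop this configuration implies; the
# random labels/predictions are placeholders for what the FBCSP + LinearSVC
# pipeline would actually produce on each subject's test sessions.
import numpy as np
rng = np.random.RandomState(0)
for subject, bands in enumerate(freq_bands, start=1):
    y_true = rng.randint(0, 2, 100)   # placeholder labels
    y_pred = rng.randint(0, 2, 100)   # placeholder pipeline predictions
    logger.info("Subject %d | bands=%s | acc=%.3f | kappa=%.3f" %
                (subject, bands, accuracy_score(y_true, y_pred),
                 cohen_kappa_score(y_true, y_pred)))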
def get_LOO_perfermance(self, fisher_mode, settings=None):
    analysis_scr = []
    predicted_score = False
    reduce_ratio = 1
    for seq_no in range(1, self.ddi_obj.total_number_of_sequences + 1):
        print seq_no
        logger.info('sequence number: ' + str(seq_no))
        if 1:
            print "SVM"
            (train_X_LOO, train_y_LOO), (train_X_reduced, train_y_reduced), (test_X, test_y) = \
                self.ddi_obj.get_LOO_training_and_reduced_traing(seq_no, reduce_ratio=reduce_ratio)
            standard_scaler = preprocessing.StandardScaler().fit(train_X_reduced)
            scaled_train_X = standard_scaler.transform(train_X_reduced)
            scaled_test_X = standard_scaler.transform(test_X)
            Linear_SVC = LinearSVC(C=1, penalty="l2")
            Linear_SVC.fit(scaled_train_X, train_y_reduced)
            predicted_test_y = Linear_SVC.predict(scaled_test_X)
            isTest = True  # new
            analysis_scr.append((self.ddi, seq_no, fisher_mode, 'SVM', isTest) +
                                tuple(performance_score(test_y, predicted_test_y).values()))  # new
            predicted_train_y = Linear_SVC.predict(scaled_train_X)
            isTest = False  # new
            analysis_scr.append((self.ddi, seq_no, fisher_mode, 'SVM', isTest) +
                                tuple(performance_score(train_y_reduced, predicted_train_y).values()))

        # Deep learning part
        min_max_scaler = Precessing_Scaler_0_9()
        X_train_pre_validation_minmax = min_max_scaler.fit(train_X_reduced)
        X_train_pre_validation_minmax = min_max_scaler.transform(train_X_reduced)
        x_test_minmax = min_max_scaler.transform(test_X)
        pretraining_X_minmax = min_max_scaler.transform(train_X_LOO)
        x_train_minmax, x_validation_minmax, y_train_minmax, y_validation_minmax = train_test_split(
            X_train_pre_validation_minmax, train_y_reduced, test_size=0.4, random_state=42)
        finetune_lr = 1
        batch_size = 100
        pretraining_epochs = cal_epochs(1500, x_train_minmax, batch_size=batch_size)
        # pretrain_lr = 0.001
        pretrain_lr = 0.001
        training_epochs = 1500
        hidden_layers_sizes = [100, 100]
        corruption_levels = [0, 0]

        if 1:
            print "direct deep learning"
            # direct deep learning
            sda = trainSda(x_train_minmax, y_train_minmax,
                           x_validation_minmax, y_validation_minmax,
                           x_test_minmax, test_y,
                           hidden_layers_sizes=hidden_layers_sizes,
                           corruption_levels=corruption_levels,
                           batch_size=batch_size,
                           training_epochs=training_epochs,
                           pretraining_epochs=pretraining_epochs,
                           pretrain_lr=pretrain_lr, finetune_lr=finetune_lr)
            print 'hidden_layers_sizes:', hidden_layers_sizes
            print 'corruption_levels:', corruption_levels
            training_predicted = sda.predict(x_train_minmax)
            y_train = y_train_minmax
            isTest = False  # new
            analysis_scr.append((self.ddi, seq_no, fisher_mode, 'DL', isTest) +
                                tuple(performance_score(y_train, training_predicted).values()))
            test_predicted = sda.predict(x_test_minmax)
            y_test = test_y
            isTest = True  # new
            analysis_scr.append((self.ddi, seq_no, fisher_mode, 'DL', isTest) +
                                tuple(performance_score(y_test, test_predicted).values()))

        if 0:
            # deep learning using unlabeled data for pretraining
            print 'deep learning with unlabel data'
            pretraining_epochs_for_reduced = cal_epochs(1500, pretraining_X_minmax, batch_size=batch_size)
            sda_unlabel = trainSda(x_train_minmax, y_train_minmax,
                                   x_validation_minmax, y_validation_minmax,
                                   x_test_minmax, test_y,
                                   pretraining_X_minmax=pretraining_X_minmax,
                                   hidden_layers_sizes=hidden_layers_sizes,
                                   corruption_levels=corruption_levels,
                                   batch_size=batch_size,
                                   training_epochs=training_epochs,
                                   pretraining_epochs=pretraining_epochs_for_reduced,
                                   pretrain_lr=pretrain_lr, finetune_lr=finetune_lr)
            print 'hidden_layers_sizes:', hidden_layers_sizes
            print 'corruption_levels:', corruption_levels
            training_predicted = sda_unlabel.predict(x_train_minmax)
            y_train = y_train_minmax
            isTest = False  # new
            analysis_scr.append((self.ddi, seq_no, fisher_mode, 'DL_U', isTest) +
                                tuple(performance_score(y_train, training_predicted, predicted_score).values()))
            test_predicted = sda_unlabel.predict(x_test_minmax)
            y_test = test_y
            isTest = True  # new
            analysis_scr.append((self.ddi, seq_no, fisher_mode, 'DL_U', isTest) +
                                tuple(performance_score(y_test, test_predicted, predicted_score).values()))

        if 0:
            # deep learning using split network
            print 'deep learning using split network'
            # get the new representation for the A set (first half of the features)
            pretraining_epochs = 1500
            hidden_layers_sizes = [50, 50]
            corruption_levels = [0, 0]
            x = x_train_minmax[:, :x_train_minmax.shape[1] / 2]
            print "original shape for A", x.shape
            a_MAE_A = train_a_MultipleAEs(x, pretraining_epochs=pretraining_epochs,
                                          pretrain_lr=pretrain_lr, batch_size=batch_size,
                                          hidden_layers_sizes=hidden_layers_sizes,
                                          corruption_levels=corruption_levels)
            new_x_train_minmax_A = a_MAE_A.transform(x_train_minmax[:, :x_train_minmax.shape[1] / 2])
            x = x_train_minmax[:, x_train_minmax.shape[1] / 2:]
            print "original shape for B", x.shape
            a_MAE_B = train_a_MultipleAEs(x, pretraining_epochs=pretraining_epochs,
                                          pretrain_lr=pretrain_lr, batch_size=batch_size,
                                          hidden_layers_sizes=hidden_layers_sizes,
                                          corruption_levels=corruption_levels)
            new_x_train_minmax_B = a_MAE_B.transform(x_train_minmax[:, x_train_minmax.shape[1] / 2:])
            new_x_test_minmax_A = a_MAE_A.transform(x_test_minmax[:, :x_test_minmax.shape[1] / 2])
            new_x_test_minmax_B = a_MAE_B.transform(x_test_minmax[:, x_test_minmax.shape[1] / 2:])
            new_x_validation_minmax_A = a_MAE_A.transform(x_validation_minmax[:, :x_validation_minmax.shape[1] / 2])
            new_x_validation_minmax_B = a_MAE_B.transform(x_validation_minmax[:, x_validation_minmax.shape[1] / 2:])
            new_x_train_minmax_whole = np.hstack((new_x_train_minmax_A, new_x_train_minmax_B))
            new_x_test_minmax_whole = np.hstack((new_x_test_minmax_A, new_x_test_minmax_B))
            new_x_validationt_minmax_whole = np.hstack((new_x_validation_minmax_A, new_x_validation_minmax_B))
            finetune_lr = 1
            batch_size = 100
            pretraining_epochs = cal_epochs(1500, x_train_minmax, batch_size=batch_size)
            # pretrain_lr = 0.001
            pretrain_lr = 0.001
            training_epochs = 1500
            hidden_layers_sizes = [100, 100]
            corruption_levels = [0, 0]
            sda_transformed = trainSda(new_x_train_minmax_whole, y_train_minmax,
                                       new_x_validationt_minmax_whole, y_validation_minmax,
                                       new_x_test_minmax_whole, y_test,
                                       hidden_layers_sizes=hidden_layers_sizes,
                                       corruption_levels=corruption_levels,
                                       batch_size=batch_size,
                                       training_epochs=training_epochs,
                                       pretraining_epochs=pretraining_epochs,
                                       pretrain_lr=pretrain_lr, finetune_lr=finetune_lr)
            print 'hidden_layers_sizes:', hidden_layers_sizes
            print 'corruption_levels:', corruption_levels
            training_predicted = sda_transformed.predict(new_x_train_minmax_whole)
            y_train = y_train_minmax
            isTest = False  # new
            analysis_scr.append((self.ddi, seq_no, fisher_mode, 'DL_S', isTest) +
                                tuple(performance_score(y_train, training_predicted, predicted_score).values()))
            test_predicted = sda_transformed.predict(new_x_test_minmax_whole)
            y_test = test_y
            isTest = True  # new
            analysis_scr.append((self.ddi, seq_no, fisher_mode, 'DL_S', isTest) +
                                tuple(performance_score(y_test, test_predicted, predicted_score).values()))

    report_name = filename + '_' + '_'.join(map(str, hidden_layers_sizes)) + '_' + \
        str(pretrain_lr) + '_' + str(finetune_lr) + '_' + str(reduce_ratio) + '_' + \
        str(training_epochs) + '_' + current_date
    saveAsCsv(predicted_score, report_name,
              performance_score(y_test, test_predicted, predicted_score), analysis_scr)
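# A hedged post-hoc sketch (not in the original): load the CSV report written
# by saveAsCsv; its exact file naming and column layout are internal to that
# helper, so both the '.csv' suffix and the 'method' column are assumptions.
import pandas as pd
report = pd.read_csv(report_name + '.csv')
print(report.groupby('method').mean())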