Example #1
import joblib  # on older scikit-learn versions: from sklearn.externals import joblib
from sklearn.exceptions import NotFittedError
from sklearn.svm import LinearSVC


class Classifier():
    def __init__(self, model_filename=None):
        if model_filename is not None:
            self.load_model(model_filename)
        else:
            self.linSVC_obj = LinearSVC()


    def save_model(self, filename):
        # save the LinearSVC object to file; compress so the model is stored in a single file
        joblib.dump(self.linSVC_obj, filename, compress=3)


    def load_model(self, filename):
        # load linSVC Object from file
        self.linSVC_obj = joblib.load(filename)


    def train(self, X, Y):
        assert X.ndim == 2, "Classifier training data X.ndim is %d instead of 2" %X.ndim
        assert Y.ndim == 1, "Classifier training data Y.ndim is %d instead of 1" %Y.ndim

        # train the model
        self.linSVC_obj.fit(X,Y)


    def get_predictions(self, X):
        assert X.ndim == 2, "Classifier prediction data X.ndim is %d instead of 2" %X.ndim

        # get classes
        try:
            return self.linSVC_obj.predict(X)
        except NotFittedError:
            raise NotFittedError("Classification model cannot predict without being trained first. " \
                                 + "Train the classification model at least once to prevent this error.")
Example #2
    def run_ratio(self, dataset, set_size):
        '''
        Compare several competing methods while varying the ratio of the positive
        class in the dataset. We use a binary-class dataset for ease of
        interpretation.
        '''
        X_train_full, y_train_full, X_test, y_test = dataset
        X_train, y_train = self.get_sub_set_with_size([X_train_full, y_train_full], set_size)
        test_set_original = (X_test, y_test)

        large = ENMLT(LinearSVC)
        large.fit(X_train, y_train)

        simple = LinearSVC()
        simple.fit(X_train, y_train)

        for r in numpy.arange(0.05, 1.0, 0.05):
            # Generate a new test set with desired positive proportions.
            X_test_new, y_test_new = SetGen.with_pos_ratio(test_set_original, r, pos_label=1)

            y_pred = large.predict(X_test_new)
            cm = confusion_matrix(y_test_new, y_pred)
            acc1 = self.accuracy(cm)

            y_pred = simple.predict(X_test_new)
            cm = confusion_matrix(y_test_new, y_pred)
            acc2 = self.accuracy(cm)

            print "%.2f, %f, %f" % (r, acc1, acc2)
Example #3
 def fit(self, X, y):
     # The smaller C, the stronger the regularization.
     # The more regularization, the more sparsity.
     self.transformer_ = LinearSVC(C=1000, penalty="l1",
                                   dual=False, tol=1e-3)
     X = self.transformer_.fit_transform(X, y)
     return LinearSVC.fit(self, X, y)
Example #4
def train(dataset):
    print "Reading dataset ..."

    features = np.array(dataset.data, "int16")
    labels = np.array(dataset.target, "int")
    nExamples = features.shape[0]

    # Compute HOGs for each image in the database
    print "Extracting features for " + str(nExamples) + " training examples ... ",
    sys.stdout.flush()
    startTime = time.clock()
    list_hog_fd = []
    for feature in features:
        fd = hog(
            feature.reshape((28, 28)), orientations=9, pixels_per_cell=(14, 14), cells_per_block=(1, 1), visualise=False
        )
        list_hog_fd.append(fd)
    hog_features = np.array(list_hog_fd, "float64")
    elapsedTime = time.clock() - startTime
    print "{0:.3f}s ({1:.4f}s/example)".format(elapsedTime, elapsedTime / nExamples)

    print "Training ... ",
    sys.stdout.flush()
    startTime = time.clock()
    clf = LinearSVC()
    clf.fit(hog_features, labels)
    elapsedTime = time.clock() - startTime
    print "{0:.3f}s".format(elapsedTime)

    print "Saving model to " + MODEL_FILE
    joblib.dump(clf, MODEL_FILE, compress=3)

    print "Training finished ..."
Example #5
def svm_vecteur():
    "Interprétation des images comme vecteurs de pixels et classification via le SVM"
    best=np.zeros(4)
    
    for npix in range(50,200,50):
        _, data, target, _ = utils.chargementVecteursImages(mer,ailleurs,1,-1,npix)
        X_train,X_test,Y_train,Y_test=train_test_split(data,target,test_size=0.3,random_state=random.seed())
        
        for iterations in range(250,1000,250):
            start_time = time.time()
            svc = LinearSVC(random_state=random.seed(), max_iter=iterations)
            
            x1=np.array(X_train)
            x1 = np.reshape(x1, (x1.shape[0],x1.shape[2]))
            x2=np.array(X_test)
            x2 = np.reshape(x2, (x2.shape[0],x2.shape[2]))
                
            svc.fit(X=x1, y=Y_train)
            score = svc.score(x2,Y_test)
                
            end_time = time.time()
            if score>best[0]:
                best[0] = score
                best[1] = iterations
                best[2] = end_time-start_time
                best[3] = npix
    
    print("| SVM linéaire                   | V.Pix {:4.0f} | iterations={:1.0f}                       | {:10.3f}ms | {:1.3f} |".format(best[3],best[1],best[3]*1000,best[0]))
Example #6
def linear_svc(train_bow,train_labels,test_bow,test_labels,bow_indexes):
    print("Training linear svc")
    svc_classifier=LinearSVC()

    svc_classifier.fit(train_bow,train_labels)
    print("Testing linear svc")
    test(svc_classifier,"svc",test_bow,test_labels,bow_indexes)
Example #7
def processMBHval():
    for featType in ['MBH']:
        names = getnames()
        gtlabels = readpkl('{}data/labels.pkl'.format(baseDir))
        indexs = readpkl('{}data/indexs.pkl'.format(baseDir))
        actionIDs,taxonomy,database = readannos()
        print 'getting training data.... ',
        xtrain,ytrain = getdataVal(database,indexs,gtlabels,'training',featType)
        print 'got it!! and shape is ',np.shape(xtrain)
        #print 'getting validation data.... ',
        #xval,yval = getdata(database,indexs,gtlabels,'validation',featType)
        #print 'got it!! and shape is ',np.shape(xval)
        
    
        if featType == 'IMS':
            jobs = 16
            c = 0.01
        else:
            jobs = 16
            c = 10
    
        clf = LinearSVC(C = c)
        clf = clf.fit(xtrain, ytrain)
        
        saveName = '{}data/train-valSVM-{}.pkl'.format(baseDir,featType)
        with open(saveName,'w') as f:
            pickle.dump(clf,f)
Example #8
def train_classifier():
    pos_feat_path = positive_features_path
    neg_feat_path = negative_features_path

    model_path = classifier_model_path

    feature_vectors = []
    labels = []

    for feat_path in glob.glob(os.path.join(pos_feat_path, "*.feat")):
        fd = joblib.load(feat_path)
        print len(fd)
        if len(fd):
            fd = fd.astype(numpy.object)
            feature_vectors.append(fd)
            labels.append(1)

    for feat_path in glob.glob(os.path.join(neg_feat_path, "*.feat")):
        fd = joblib.load(feat_path)
        print len(fd)
        if len(fd):
            fd = fd.astype(numpy.object)
            feature_vectors.append(fd)
            labels.append(0)

    classifier = LinearSVC()
    print "Training classifier"
    classifier.fit(feature_vectors, labels)
    print "Classifier successfully trained"
    if not os.path.isdir(os.path.split(model_path)[0]):
        os.makedirs(os.path.split(model_path)[0])
    joblib.dump(classifier, model_path)
Example #9
 def fit(self, X, Y, W):
     clf = LinearSVC(penalty=self.penalty, loss=self.loss, dual=self.dual,
                     tol=self.tol, C=self.C, multi_class=self.multi_class,
                     fit_intercept=self.fit_intercept,
                     intercept_scaling=self.intercept_scaling,
                     random_state=self.random_state)
     return LinearSVMClassifier(clf.fit(X, Y.reshape(-1)))
Example #10
def train(train_input, train_output, test_input, test_output):
    # Training module

    # Choose a model
    # model = MultinomialNB()
    # model = GaussianNB()
    # model = SGDClassifier()
    # model = SVC(kernel='linear') # this one is slow
    model = LinearSVC()
    # model = RandomForestClassifier(max_depth=2, n_estimators=500)
    # model = AdaBoostClassifier(n_estimators=500,base_estimator=DecisionTreeClassifier(max_depth=10))

    # Train & evaluate
    model.fit(train_input,train_output)
    pred_train = model.predict(train_input)
    pred_test = model.predict(test_input)

    label_size = max(train_output)+1
    train_ratio = cal_accuracy(pred_train, train_output)
    train_recal = cal_recall(pred_train, train_output, label_size)
    # print(test_output)
    print(list(pred_test))
    test_ratio = cal_accuracy(pred_test, test_output)
    test_recal = cal_recall(pred_test, test_output, label_size)
    print('%f\t%f'%(train_ratio, test_ratio))
    print('%f\t%f'%(train_recal, test_recal))
Example #11
def stump(X, y):
    score = cross_val_score(LinearSVC(), X, y, cv = 5, n_jobs=5, scoring = 'average_precision')
    clf = LinearSVC()
    clf.fit(X, y)
    coef = clf.coef_[0,0]
    inter = clf.intercept_[0]
    return np.mean(score), np.sign(coef), inter / np.abs(coef)
Example #12
    def linearSVM(self):
        '''
            Two-class classification using a linear SVM
            args :      -> 
            dst  :      -> 
            param:      -> 
        '''
        # Training data
        data_training_tmp = np.loadtxt('../../../data/statistical_data/CodeIQ_auth.txt', delimiter=' ')
        data_training = [[x[0], x[1]] for x in data_training_tmp]
        label_training = [int(x[2]) for x in data_training_tmp]

        # Test data
        data_test = np.loadtxt('../../../data/statistical_data/CodeIQ_auth.txt', delimiter=' ')

        print np.array(data_test).shape

        # Training
        estimator = LinearSVC(C=1.0)
        estimator.fit(data_training, label_training)

        # Prediction
        label_prediction = estimator.predict(data_test[:,0:2])
        print(label_prediction)

        print 
Example #13
class TrainingTesting() :
	def __init__(self) :
		self.y = []
		self.noa = 0
		self.author_names = []
		self.train_data = []
		self.author_files = os.listdir(path+"/generated_files")
		#print author_names
		for author in self.author_files :
			self.author_names.append(author[:-4])
			text1 = open(path+"/generated_files/"+author,"r").read().split("\n")
			#print text1[1:-1]
			for txt in text1[1:-1] :
				t = []
				self.y.append(self.noa)
				#t.append(self.noa) 
				for i in txt.split(",")[1:-1] :
					t.append(float(i))
				self.train_data.append(t)
			self.noa += 1
		#print self.y
		#print self.train_data
	def train(self) :
		self.clfr = LinearSVC()
		self.clfr.fit(self.train_data,self.y)
		#print self.author_names[clfr.predict(self.train_data[0])[0]]
	def test(self,test_data) :
		self.correct_author_name = self.author_names[self.clfr.predict(test_data)[0]]
Example #14
class Classifier:
  def __init__(self, ctype):
    self.ctype = ctype

  def train(self, data, labels):
    if self.ctype == "SVM":
      self.model = LinearSVC()
      self.model.fit(data, labels)
    elif self.ctype == "Decision":
      print "Unsupported"
    elif self.ctype == "Chi-Squared":
      self.model_data = data
      self.model_labels = labels

  def predict(self, data):
    if self.ctype == "SVM":
      return self.model.predict(data)
    elif self.ctype == "Decision":
      print "Unsupported"
    elif self.ctype == "Chi-Squared":
      predictions = []
      for sample, test_hist in enumerate(data):
        #Storing the first distance by default
        lowest_score = cv2.compareHist(np.array(self.model_data[0], dtype = np.float32),
                np.array(test_hist, dtype = np.float32), method = 1)
        predictions.append(self.model_labels[0])
        #Going through the rest of data
        for index, train_hist in enumerate(self.model_data):
          score = cv2.compareHist(np.array(train_hist, dtype = np.float32),
                np.array(test_hist, dtype = np.float32), method = 1)
          if score < lowest_score:
            lowest_score = score
            predictions[sample] = self.model_labels[index]
      return predictions
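A minimal usage sketch for this wrapper; the histogram arrays and labels are illustrative only:

svm_clf = Classifier("SVM")
svm_clf.train(train_histograms, train_labels)
svm_predictions = svm_clf.predict(test_histograms)

chi2_clf = Classifier("Chi-Squared")
chi2_clf.train(train_histograms, train_labels)
chi2_predictions = chi2_clf.predict(test_histograms)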
Example #15
def run_model(X_train, y_train, X_test, y_test, model, layer, C=0):
    """
    Implement sklearn LinearSVC model and fit to training data.
    Predict labels of test data.
    Dump the training data, training labels, test features, test labels, predictions and pickled model.
    :params X_train: array of training features
    :params y_train: array of training labels
    :params X_test: array of test features
    :params y_test: array of test labels
    :params model: Name of the pre-trained CNN used for extracting features
    :params layer: Name of the layer used for extracting features
    :params C: Optimized C value from grid search
    """
    svc = LinearSVC(C=C)
    svc.fit(X_train, y_train)
    predicted_labels = svc.predict(X_test)
    directory_name = "svm_" + model + "_" + "layer_" + layer + "_" + str(datetime.date.today()).replace("-","_")
    directory_path = os.path.join("../models", directory_name)
    if not os.path.exists(directory_path):
        os.makedirs(directory_path)
    X_train.dump(os.path.join(directory_path, "train_data"))
    y_train.dump(os.path.join(directory_path, "train_labels"))
    X_test.dump(os.path.join(directory_path, "test_data"))
    y_test.dump(os.path.join(directory_path, "test_labels"))
    predicted_labels.dump(os.path.join(directory_path, "predicted_labels"))
    joblib.dump(svc, os.path.join(directory_path, "model.pkl"))
    return
Example #16
def svm_train(train_file):
    _,x,y = readFile(train_file)
    print 'reading done.'
    
    from sklearn.cross_validation import train_test_split
    tmp_array = np.arange(x.shape[0])
    train_i, test_i = train_test_split(tmp_array, train_size = 0.8, random_state = 500)
    
    #from sklearn import preprocessing as pp
    #scaler = pp.StandardScaler()
    #x = scaler.fit(x)
    #print 'scale done.'

    train_x = x[train_i]
    test_x = x[test_i]
    train_y = y[train_i]
    test_y = y[test_i]
    from sklearn.svm import LinearSVC
    classifier = LinearSVC()
    classifier.fit(train_x,train_y)
    print 'train done.'    

    res = classifier.predict(test_x)
    print res.shape
    from sklearn.metrics import roc_auc_score
    score = roc_auc_score(test_y,res)
    print score
    return classifier
    
Example #17
class MyComposition(object):
    def __init__(self, n_estimators=60):
        self.n_estimators = n_estimators
        self.lsvc = LinearSVC(penalty='l1', dual=False)

        self.estimators_ = None
        self.weights_ = None

    def fit(self, x_train, y_train, x_train2, y_train2):
        # svr = SVR(**svr_params)
        svr = LinearSVR(**svr_params2)
        gbr = GradientBoostingRegressor(n_estimators=self.n_estimators,
                                        learning_rate=0.1)
        abr = AdaBoostRegressor(n_estimators=self.n_estimators,
                                learning_rate=0.01)

        estimators = [gbr, abr, svr]
        self.estimators_ = [e.fit(x_train, y_train) for e in estimators]

        x_pred = np.vstack([e.predict(x_train2) for e in self.estimators_]).T

        self.lsvc.fit(x_pred, y_train2)
        self.weights_ = np.array(self.lsvc.coef_ / np.sum(self.lsvc.coef_)).ravel()

        return self

    def predict(self, x):
        x_pred = np.vstack([e.predict(x) for e in self.estimators_]).T
        return np.sum(x_pred * self.weights_, axis=1).ravel()
Example #18
def estimate_svm(textlines):
    svc = LinearSVC(C=10, random_state=1, class_weight={1:0.35})
    
    data = []
    for line in textlines:

        dat = np.r_[line.in_word_distances, line.between_word_distances]
        if dat.shape[0] < 2:
            continue

        _, _, centroids = cv2.kmeans(data=np.asarray([dat]).transpose().astype(np.float32), K=2, bestLabels=None,
            criteria=(cv2.TERM_CRITERIA_EPS | cv2.TERM_CRITERIA_MAX_ITER, 100, 0.001), attempts=5, 
            flags=cv2.KMEANS_PP_CENTERS) 

        diff = abs(centroids[0] - centroids[1])

        if line.n_words == 1:
            # single word
            data.append([1] + [diff / np.mean(line.heights), diff / (np.median(dat) + 1e-10)])
            continue

        #multi word
        data.append([-1] + [diff / np.mean(line.heights), diff / (np.median(dat) + 1e-10)])

        if len(line.in_word_distances) < 2:
            continue
        # create an artificial single word
        _, _, centroids = cv2.kmeans(data=np.asarray([line.in_word_distances]).transpose().astype(np.float32), K=2, bestLabels=None,
            criteria=(cv2.TERM_CRITERIA_EPS | cv2.TERM_CRITERIA_MAX_ITER, 100, 0.001), attempts=5, 
            flags=cv2.KMEANS_PP_CENTERS) 
        diff = abs(centroids[0] - centroids[1])
        data.append([1] + [diff / np.mean(line.heights), diff / (np.median(line.in_word_distances) + 1e-10)])
    data = np.array(data)
    svc.fit(data[:,1:], data[:,0])
    return svc
Example #19
def svm_once(data):
    cumu_false_total = 0.0
    cumu_false_negative_total = 0.0
    total_count = 0
    x = data[0]['data']
    y = np.ravel(data[0]['labels'])
    svc = LinearSVC(C=100)
    svc.fit(x, y.tolist())
    file_false = open('svm_once_f.txt', 'w')
    file_false_negative = open('svm_once_fn.txt', 'w')
    for i in xrange(1, 100):
        print 'day(%d)' % i
        x = data[i]['data']
        y = np.ravel(data[i]['labels'])
        p_y = svc.predict(x)
        cumu_false = 0
        cumu_false_negative = 0
        for idx, f in enumerate(y == p_y):
            if not f:
                cumu_false += 1
                if y[idx] == 1:
                    cumu_false_negative += 1
        cumu_false_total += cumu_false
        cumu_false_negative_total += cumu_false_negative
        total_count += len(y)
        file_false.write("%f\n" % (cumu_false_total / total_count))
        file_false_negative.write("%f\n" %
                                  (cumu_false_negative_total / total_count))
    file_false.close()
    file_false_negative.close()
Example #20
def _compute_svmnormalvector((cache_dir, images, control_images, 
                              normalization_name, preprocess_file, rfe)):
    #try:
        import numpy as np 
        import sys
        from cpf.profiling.cache import Cache
        from cpf.profiling.normalization import RobustLinearNormalization, normalizations
        from sklearn.svm import LinearSVC
        from cpf.profiling.profile_svmnormalvector import _compute_rfe
        import cpf.util  # needed for cpf.util.unpickle1 below

        cache = Cache(cache_dir)
        normalization = normalizations[normalization_name]
        normalizeddata, normalized_colnames, _ = cache.load(images, normalization=normalization)
        control_data, control_colnames, _ = cache.load(control_images, normalization=normalization)
        if preprocess_file:
            preprocessor = cpf.util.unpickle1(preprocess_file)
            normalizeddata = preprocessor(normalizeddata)
            control_data = preprocessor(control_data)
        assert len(control_data) >= len(normalizeddata)
        downsampled = control_data[np.random.randint(0, len(control_data), len(normalizeddata)), :]
        x = np.vstack((normalizeddata, downsampled))
        y = np.array([1] * len(normalizeddata) + [0] * len(downsampled))
        clf = LinearSVC(C=1.0)
        m = clf.fit(x, y)
        normal_vector = m.coef_[0]
        if rfe:
            # Copy because it is immutable (normal_vector.flags.writeable == False)
            normal_vector = np.array(normal_vector)
            normal_vector[~_compute_rfe(x, y)] = 0
        return normal_vector
Example #21
    def train(self, sourceClassifier, sourceTrainData, targetTrainData):        
        targetClassifier = sourceClassifier
        self.checkSizes(sourceTrainData, targetTrainData)
        improvement = sys.maxsize
        i = 0
        unusedTargetData = targetTrainData
        targetTrainData = sourceTrainData #use all the source train data as well
        targetTrainData[1] = targetTrainData[1].tolist()
        while not self.isStoppingConditionMet(self.stoppingCondition, i, improvement):
            #print("iteration number "+str(i))
            result = self.sampleSelector.selectSamples(targetClassifier,unusedTargetData,self.batch_size)
            selectedSamples = result[0]
            selectedIndices = result[1]
 #           if firstIteration:
 #               print("in first iteration!")
 #               firstIteration = 0
 #               targetTrainData = [selectedSamples[0], selectedSamples[1]]
 #           else:
            
            #print("type(selectedSamples) = %s" % type(selectedSamples[0]))
            #print("targetTrainData[0].shape[0] = %d" % targetTrainData[0].shape[0])
            targetTrainData[0] = self.robustAppend(targetTrainData[0], selectedSamples[0])
            targetTrainData[1] = targetTrainData[1] + selectedSamples[1]
            unusedTargetData = self.getNewUnusedData(unusedTargetData,selectedIndices)
            targetClassifier = LinearSVC()
            #print("targetTrainData[0].shape[0] = %d" % targetTrainData[0].shape[0])
            targetClassifier.fit(targetTrainData[0],targetTrainData[1])
            i += 1
        
        print("active learner was trained on {0} labeled instances.".format(self.batch_size * self.max_num_of_iterations))
        return targetClassifier
Example #22
def SVC_on_fold(feature_sets, train, test, y, y_all, X, dim, dimsum, learn_options):
    y_bin = y_all[learn_options["binary target name"]].values[:, None]
    clf = LinearSVC(penalty="l2", dual=False)
    clf.fit(X[train], y_bin[train].flatten())
    # y_pred = clf.predict(X[test])[:, None] # this returns 0/1
    y_pred = clf.decision_function(X[test])[:, None]
    return y_pred, clf
Example #23
def retrain_models(username):
	train_x, train_y, body_x, body_y, head_x, head_y = model_retriever.retrieve_data_db(username)

	b_train_x = []
	b_train_y = numpy.concatenate([body_y, train_y])

	for msg in (body_x + train_x):
		b_train_x.append(extract_body_features(msg))

	body_vec = TfidfVectorizer(norm="l2")
	b_train_x = body_vec.fit_transform(b_train_x)

	h_train_x = []
	h_train_y = numpy.concatenate([head_y, train_y])

	for msg in (head_x + train_x):
		h_train_x.append(extract_header_features(msg))

	head_vec = DictVectorizer()
	h_train_x = head_vec.fit_transform(h_train_x)

	body_model = LinearSVC(loss='l2', penalty="l2", dual=False, tol=1e-3)
	head_model = RidgeClassifier(tol=1e-2, solver="lsqr")

	body_model.fit(b_train_x, b_train_y)
	head_model.fit(h_train_x, h_train_y)

        print("Finished training models for "+username+"...")

	store_models(username, body_vec, body_model, head_vec, head_model)
Example #24
def benchmark(k, epochs):
  print("*" * 80)
  print("k: %d, epochs: %d\n" % (k, epochs))

  #select = SelectKBest(score_func=chi2, k=k)
  select = TruncatedSVD(n_components=k)
  X_train_trunc = select.fit_transform(X_train, Y_train)
  X_test_trunc = select.transform(X_test)

  print('done truncating')

  parameters = {'C': [1, 10, 100, 1000, 10000],  'class_weight': ['auto', None], 'tol':[0.001,0.0001]}
  clf = LinearSVC(C=100000)
  #clf = grid_search.GridSearchCV(svc, parameters)
  clf.fit(X_train_trunc, Y_train)
  pred = clf.predict(X_test_trunc)

  if CREATE_SUBMISSION:
    X_submit_trunc = select.transform(X_submit)
    pred_submit = clf.predict(X_submit_trunc)
    dump_csv(pred_submit, k, epochs)

  score = metrics.f1_score(Y_test, pred)
  print("f1-score:   %0.3f" % score)

  print("classification report:")
  print(metrics.classification_report(Y_test, pred))

  print("confusion matrix:")
  print(metrics.confusion_matrix(Y_test, pred))
Example #25
def main():
    dataset = load_cifar.load_cifar(n_train=N_TRAIN, n_test=N_TEST,
                                    grayscale=GRAYSCALE, shuffle=False)

    train_data = dataset['train_data']
    train_labels = dataset['train_labels']
    test_data = dataset['test_data']
    test_labels = dataset['test_labels']

    print train_data.shape, test_data.shape

    patch_extractor = image.PatchExtractor(patch_size=(PATCH_SIZE, PATCH_SIZE),
                                           max_patches = N_PATCHES/
                                           len(train_data))

    pp = preprocessing.Preprocessor(n_components=0.99)

    fl = feature_learner.FeatureLearner(pp, patch_extractor, n_clusters=N_CENTROIDS)
    fl.fit(train_data)
    train = fl.transform(train_data)
    m_train = mean(train, axis=0)
    train -= m_train
    v_train = sqrt(var(train, axis=0) + 0.01)
    train /= v_train

    test = fl.transform(test_data)
    test -= m_train
    test /= v_train

    classifier = SVC(C=10.0)#, gamma=1e-3, verbose=False)
    classifier.fit(train, train_labels)
    print classifier.score(test, test_labels)

    return
Example #26
def svm_for_multiclass():
    text_file = "/home/web_server/wangyuanfu/age/temp1"
    dataset = np.loadtxt(text_file, delimiter=" ")
    X = dataset[:,1:]
    y = dataset[:,0:1]
    min_max_scaler = preprocessing.MinMaxScaler()
    normalized_X = min_max_scaler.fit_transform(X)
    print len(normalized_X)

    X_train, X_test, y_train, y_test = train_test_split(normalized_X, y, test_size=0.1, random_state=7)


    clf = LinearSVC(random_state=0, C=1, multi_class='ovr', penalty='l2')
    clf = clf.fit(X_train, y_train.reshape(-1))
    # print the training scores
    print("training score : %.3f " % (clf.score(X_train, y_train)))

    # make predictions
    predicted = clf.predict(X_test)
    length_predicted = len(predicted)
    print predicted.shape
    #for i in range(0,length_predicted):
    #    print predicted[i],y_test[i]
        #print X_test[i,:],predicted[i],y_test[i],probability[i]
    # summarize the fit of the model
    print(metrics.classification_report(y_test, predicted))
    print(metrics.confusion_matrix(y_test, predicted))
    print(metrics.precision_score(y_test, predicted, average='micro'))
Example #27
def svm_binary_svc_probability(X, Y, C):
    allp = np.sum(Y>0)
    alln = len(Y) - allp
    nr_fold = 5
    perm = list(range(len(Y)))
    random.shuffle(perm)
    dec_values = np.zeros(len(Y), dtype=np.float32)
    for i in range(nr_fold):
        start = i * len(Y) // nr_fold
        end   = (i+1) * len(Y) // nr_fold
        trainL = [perm[j] for j in range(len(Y)) if j not in range(start, end)]
        testL  = perm[start:end]
        trainX = X[trainL,:]
        trainY = Y[trainL]
        p_count = np.sum(trainY>0)
        n_count = len(trainY) - p_count
        if p_count==0 and n_count==0:
            dec_values[start:end] = 0.0
        elif p_count > 0 and n_count == 0:
            dec_values[start:end] = 1.0
        elif p_count == 0 and n_count > 0:
            dec_values[start:end] = -1.0
        else:
            subclf = LinearSVC(C=C, class_weight={1:allp, -1:alln})
            subclf.fit(trainX, trainY)
            dec_values[testL] = subclf.decision_function(X[testL,:]).ravel()
    return sigmoid_train(dec_values, Y)
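The sigmoid_train helper used above is not shown in this example; it presumably fits Platt-scaling parameters (A, B) to the cross-validated decision values so that P(y=1|f) = 1 / (1 + exp(A*f + B)). A minimal sketch of such a routine under that assumption:

import numpy as np
from scipy.optimize import minimize

def sigmoid_train(dec_values, Y):
    # Fit (A, B) by minimizing the negative log-likelihood of the labels
    # under P(y=1|f) = 1 / (1 + exp(A*f + B)), i.e. Platt scaling.
    t = (Y > 0).astype(np.float64)

    def nll(params):
        A, B = params
        z = A * dec_values + B
        # log(1 + exp(z)) computed stably, minus the log-likelihood term of the negatives
        return np.sum(np.logaddexp(0.0, z) - (1.0 - t) * z)

    n_pos, n_neg = t.sum(), len(t) - t.sum()
    x0 = [0.0, np.log((n_neg + 1.0) / (n_pos + 1.0))]   # Platt's suggested starting point
    res = minimize(nll, x0, method="Nelder-Mead")
    return res.x[0], res.x[1]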
Example #28
class LinearSVM:
	def __init__(self):
		self.clf = LinearSVC(penalty='l2', loss='l1', dual=True, tol=0.0001, C=1.0, multi_class='ovr', fit_intercept=True, intercept_scaling=1, class_weight=None, verbose=0, random_state=None)
		self.pattern ='(?u)\\b[A-Za-z]{3,}'
		self.tfidf = TfidfVectorizer(sublinear_tf=False, use_idf=True, smooth_idf=True, stop_words='english', token_pattern=self.pattern, ngram_range=(1, 3))
	def train(self,fileName):
		print "LinearSVM Classifier is being trained"
		table = pandas.read_table(fileName, sep="\t", names=["cat", "message"])
		X_train = self.tfidf.fit_transform(table.message)
		Y_train = []
		for item in table.cat:
			Y_train.append(int(item)) 
		self.clf.fit(X_train, Y_train)
		print "LinearSVM Classifier has been trained"

	def classify(self,cFileName, rFileName):
		table = pandas.read_table(cFileName, names=["message"])
		X_test = self.tfidf.transform(table.message)
		print "Data have been classified"
		with open(rFileName,'w') as f:
			for item in self.clf.predict(X_test).astype(str):
				f.write(item+'\n')

	def validate(self,fileName):
		table = pandas.read_table(fileName, sep="\t", names=["cat", "message"])
		X_validate = self.tfidf.transform(table.message)
		Y_validated = self.clf.predict(X_validate).astype(str)
		totalNum = len(table.cat)
		errorCount = 0
		for i in range(0,totalNum):
			if int(table.cat[i])!=int(Y_validated[i]):
				errorCount += 1
		print "Data have been validated! Precision={}".format((totalNum-errorCount)/float(totalNum))
Example #29
def do_SVM(x,y,xt,yt):
	Cs = [0.01,0.1,10,100,1000]
	for C in Cs:
		print "El valor de C que se esta probando: %f"%C
		model = LinearSVC(C=C)
		model = model.fit(x, y)
		score_the_model(model,x,y,xt,yt,"SVM")
Example #30
def test_class_weights_rescale_C():
    # check that our crammer-singer implementation with class weights and
    # rescale_C=True is the same as LinearSVC's c-s class_weight implementation
    from sklearn.svm import LinearSVC
    X, Y = make_blobs(n_samples=210, centers=3, random_state=1, cluster_std=3,
                      shuffle=False)
    X = np.hstack([X, np.ones((X.shape[0], 1))])
    X, Y = X[:170], Y[:170]

    weights = 1. / np.bincount(Y)
    weights *= len(weights) / np.sum(weights)
    pbl_class_weight = MultiClassClf(n_features=3, n_classes=3,
                                     class_weight=weights, rescale_C=True)
    svm_class_weight = OneSlackSSVM(pbl_class_weight, C=10, tol=1e-5)
    svm_class_weight.fit(X, Y)

    try:
        linearsvm = LinearSVC(multi_class='crammer_singer',
                              fit_intercept=False, class_weight='auto', C=10)
        linearsvm.fit(X, Y)

        assert_array_almost_equal(svm_class_weight.w, linearsvm.coef_.ravel(),
                                  3)
    except TypeError:
        # travis has a really old sklearn version that doesn't support
        # class_weight in LinearSVC
        pass
Example #31
    print('Precision, recall and f1-score:')
    print(classification_report(y_test, y_pred))

    roc = roc_auc_score(y_test, y_pred)
    print('ROC AUC: {}'.format(roc))

    pr = average_precision_score(y_test, y_pred)
    print('Precision-recall: {}'.format(pr))

    print('-' * 10, 'End', model.__class__.__name__, '-' * 10)


fit_model(XGBClassifier())
fit_model(LogisticRegression())
fit_model(LinearSVC(random_state=0, tol=1e-5))
fit_model(KNeighborsClassifier(n_neighbors=6))
'''
PAIRWISE:
  0: [0, 0, 0, 0, 1, 1, 1, 0]
  1: [1, 1, 1, 1, 2, 2, 2, 0]
  2: [2, 2, 2, 2, 3, 3, 3, 0]
  3: [3, 3, 3, 3, 4, 4, 4, 0]
  4: [4, 4, 4, 4, 5, 5, 5, 0]
  5: [5, 5, 5, 5, 6, 6, 6, 0]
  6: [6, 6, 6, 6, 7, 7, 7, 0]
  7: [7, 7, 7, 7, 8, 8, 8, 0]
  8: [8, 8, 8, 8, 9, 9, 9, 0]
  9: [9, 9, 9, 9, 9, 8, 7, 1]
 10: [9, 8, 7, 6, 6, 5, 4, 2]
 11: [8, 7, 6, 5, 5, 4, 3, 2]
Example #32
    """Prints features with the highest coefficient values, per class"""
    feature_names = vectorizer.get_feature_names()
    coefs_with_fns = sorted(zip(clf.coef_[0], feature_names))
    topClass1 = coefs_with_fns[:N]
    topClass2 = coefs_with_fns[:-(N + 1):-1]
    print("Class 1 best: ")
    for feat in topClass1:
        print(feat)
    print("Class 2 best: ")
    for feat in topClass2:
        print(feat)
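# CleanTextTransformer and tokenizeText are custom helpers that are not defined in
# this snippet; the code below is only a rough, hypothetical sketch of what they
# might look like (it assumes spaCy and an English model are installed).
import spacy
from sklearn.base import BaseEstimator, TransformerMixin

nlp = spacy.load("en_core_web_sm")

class CleanTextTransformer(BaseEstimator, TransformerMixin):
    # Pass-through cleaner: lower-case and strip whitespace before vectorizing.
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return [text.lower().strip() for text in X]

def tokenizeText(sample):
    # Tokenize with spaCy and return plain token strings for CountVectorizer.
    return [tok.text for tok in nlp(sample)]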
    
# the vectorizer and classifier to use
# note that I changed the tokenizer in CountVectorizer to use a custom function using spaCy's tokenizer
vectorizer = CountVectorizer(tokenizer=tokenizeText, ngram_range=(1,1))
clf = LinearSVC()
# the pipeline to clean, tokenize, vectorize, and classify
pipe = Pipeline([('cleanText', CleanTextTransformer()), ('vectorizer', vectorizer), ('clf', clf)])

# data
train = ["I love space. Space is great.", "Planets are cool. I am glad they exist in space", 
        "lol @twitterdude that is gr8", "twitter &amp; reddit are fun.", 
        "Mars is a planet. It is red.", "@Microsoft: y u skip windows 9?", 
        "Rockets launch from Earth and go to other planets.", "twitter social media &gt; &lt;", 
        "@someguy @somegirl @twitter #hashtag", "Orbiting the sun is a little blue-green planet."]
labelsTrain = ["space", "space", "twitter", "twitter", "space", "twitter", "space", "twitter", "twitter", "space"]

test = ["i h8 riting comprehensibly #skoolsux", "planets and stars and rockets and stuff"]
labelsTest = ["twitter", "space"]

# train
pipe.fit(train, labelsTrain)
Example #33
# Fit a per-column scaler
X_scaler = StandardScaler().fit(X)
# Apply the scaler to X
# scaled_X = X_scaler.transform(X)
y = np.hstack((np.ones(len(car_features)), np.zeros(len(notcar_features))))

rand_state = np.random.randint(0, 100)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=rand_state)


def get_classifier_from_pickle():
    with open(f'classifiers/classifier{str(CONFIG)}.p', 'rb') as classifier_file:
        svc = pickle.load(classifier_file)
    return svc

if not __name__ == "__main__":
    svc = get_classifier_from_pickle()

if __name__ == "__main__":
    from sklearn.svm import LinearSVC
    # Use a linear SVC (support vector classifier)
    svc = LinearSVC()
    svc.fit(X_train, y_train)
    with open(f'classifiers/classifier{str(CONFIG)}.p', 'wb') as classifier_file:
        pickle.dump(svc, classifier_file)
    print('Test Accuracy of SVC = ', svc.score(X_test, y_test))
    print('My SVC predicts: ', svc.predict(X_test[0:10]))
    print('For labels:      ', y_test[0:10])
Example #34
    def feature_selection(self, X, y, method):
        """
        purpose:    select features
        input:  X: train data
                y: label
                method: method to use
        return:
        """
        X_indices = np.arange(X.shape[-1])

        score = []

        # Removing features with low variance

        # correlation coefficient
        # SelectKBest(lambda X,Y: np.array(map(lambda x: pearsonr(x, Y), X.T)).T, k=2).fit_transform(data, target)

        # mutual information
        # SelectKBest(lambda X, Y: array(map(lambda x: mic(x, Y), X.T)).T, k=2).fit_transform(data, target)

        # Univariate feature selection (for classification)
        if method == 'chi-squared':
            skb = SelectKBest(chi2)
            skb.fit_transform(X, y)
            score = skb.scores_

        # Univariate feature selection (for regression)
        if method == 'f_regression':
            skb = SelectKBest(f_regression)
            skb.fit_transform(X, y)
            score = skb.scores_

        # L1-based feature selection (for classification)
        if method == 'LinearSVC':
            lsvc = LinearSVC(C=0.01, penalty="l1", dual=False).fit(X, y)
            sfm = SelectFromModel(lsvc, prefit=True)
            X_new = sfm.transform(X)

        # L1-based feature selection (for regression)
        elif method == 'LassoCV':
            lasso = LassoCV().fit(X, y)
            score = lasso.coef_
            sfm = SelectFromModel(lasso, threshold=0.25, prefit=True)
            X_new = sfm.transform(X)

        # Tree-based feature selection (for classification)
        elif method == 'ExtraTreesClassifier':
            clf = ExtraTreesClassifier()
            clf = clf.fit(X, y)
            print clf.feature_importances_
            sfm = SelectFromModel(clf, threshold=0.25, prefit=True)
            X_new = sfm.transform(X)

        # Tree-based feature selection (for regression)
        elif method == 'ExtraTreesRegressor':
            clf = ExtraTreesRegressor()
            clf = clf.fit(X, y)
            score = clf.feature_importances_
            sfm = SelectFromModel(clf, threshold=0.25, prefit=True)
            X_new = sfm.transform(X)

        # Tree-based feature selection (for classifier)
        elif method == 'GradientBoostingClassifier':
            clf = GradientBoostingClassifier(learning_rate=0.01)
            clf = clf.fit(X, y)
            score = clf.feature_importances_
            sfm = SelectFromModel(clf, threshold=0.25, prefit=True)
            X_new = sfm.transform(X)

        # Tree-based feature selection (for regression)
        elif method == 'GradientBoostingRegressor':
            clf = GradientBoostingRegressor(learning_rate=0.01)
            clf = clf.fit(X, y)
            score = clf.feature_importances_
            sfm = SelectFromModel(clf, threshold=0.25, prefit=True)
            X_new = sfm.transform(X)

        # Print the feature ranking
        indices = np.argsort(score)[::-1]
        print("Feature ranking:")
        for f in X_indices:
            print("feature %d: %s  (%f)" %
                  (indices[f], self.columns[indices[f]], score[indices[f]]))

        #draw plot
        plt.figure()
        # plt.bar(indices, score, width=0.2, color='r')
        plt.barh(indices, score, height=0.2, color='r')
        plt.title(method)
        plt.xlabel("score")
        plt.ylabel("feature")
        plt.grid(axis='x')
        plt.show()

        pass
Example #35
for clf, name in (
    (RidgeClassifier(tol=1e-2, solver="sag"), "Ridge Classifier"),
    (Perceptron(max_iter=50), "Perceptron"),
    (PassiveAggressiveClassifier(max_iter=50), "Passive-Aggressive"),
    (KNeighborsClassifier(n_neighbors=10), "kNN"),
    (RandomForestClassifier(), "Random forest"),
):
    print("=" * 80)
    print(name)
    results.append(benchmark(clf))

for penalty in ["l2", "l1"]:
    print("=" * 80)
    print("%s penalty" % penalty.upper())
    # Train Liblinear model
    results.append(benchmark(LinearSVC(penalty=penalty, dual=False, tol=1e-3)))

    # Train SGD model
    results.append(
        benchmark(SGDClassifier(alpha=0.0001, max_iter=50, penalty=penalty)))

# Train SGD with Elastic Net penalty
print("=" * 80)
print("Elastic-Net penalty")
results.append(
    benchmark(SGDClassifier(alpha=0.0001, max_iter=50, penalty="elasticnet")))

# Train NearestCentroid without threshold
print("=" * 80)
print("NearestCentroid (aka Rocchio classifier)")
results.append(benchmark(NearestCentroid()))
Example #36
ovr_clf.fit(train_x, train_y)
ovr_predicted = ovr_clf.predict(test_x)

from sklearn.metrics import confusion_matrix, precision_score, f1_score
ovr_confusion_matrix = confusion_matrix(test_y.values.argmax(axis=1),
                                        ovr_predicted.argmax(axis=1))
precision_ovr = precision_score(
    test_y, ovr_predicted, average='micro')  #gives 95.76 percent of accuracy
precision_ovr_all = precision_score(test_y, ovr_predicted,
                                    average=None)  #gives 94.98,96.52
f1_lgd = f1_score(test_y, ovr_predicted,
                  average='micro')  #gives 95.76 percent of accuracy

#Applying SVM for multiclass classification
from sklearn.svm import LinearSVC
svc = OneVsRestClassifier(LinearSVC(C=10, loss='hinge'))
svc.fit(train_x, train_y)
svc_predicted = svc.predict(test_x)

from sklearn.metrics import confusion_matrix, precision_score, f1_score
svc_confusion_matrix = confusion_matrix(test_y.values.argmax(axis=1),
                                        svc_predicted.argmax(axis=1))
precision_svc = precision_score(test_y, svc_predicted, average='micro')
precision_svc_all = precision_score(test_y, svc_predicted, average=None)
f1_svc = f1_score(test_y, svc_predicted, average='micro')

#Appying SVM  for multiclass classification but with different kernels
from sklearn.svm import SVC
svm_poly = OneVsRestClassifier(SVC(kernel='poly', degree=4, C=10000))
svm_poly.fit(train_x, train_y)
svm_poly_predicted = svm_poly.predict(test_x)
Example #37
###### downsample
#features_df = features_df.iloc[0:5000,:]
#features_df_val = features_df_val[0:1000,]
#encoded_labels_df = encoded_labels_df.iloc[0:5000,:]

print("feat shape:", features_df.shape)
print("labels shape:", encoded_labels_df.shape)

X_train = np.array(features_df)
Y_train = np.array(encoded_labels_df)
x_val = np.array(features_df_val)
y_val = np.array(encoded_labels_df_val)

# Define model
linsvm = LinearSVC(loss='hinge')
#multi_class='ovr',
#verbose=True,
#max_iter=1000)
model = OneVsRestClassifier(linsvm, n_jobs=-1)

start = time.process_time()
model.fit(X_train, Y_train)
elapsed_fit = time.process_time() - start

print("Time to fit model (min):", elapsed_fit / 60)

start_predict = time.process_time()
### change
y_pred = model.decision_function(x_val)
elapsed_predict = time.process_time() - start_predict
Example #38
# Support Vector Machines
from sklearn.svm import SVC

svc = SVC()
svc.fit(x_train, y_train)
y_pred = svc.predict(x_val)
acc_svc = round(accuracy_score(y_pred, y_val) * 100, 2)
print(acc_svc)

# In[ ]:

# Linear SVC
from sklearn.svm import LinearSVC

linear_svc = LinearSVC()
linear_svc.fit(x_train, y_train)
y_pred = linear_svc.predict(x_val)
acc_linear_svc = round(accuracy_score(y_pred, y_val) * 100, 2)
print(acc_linear_svc)

# In[ ]:

# Perceptron
from sklearn.linear_model import Perceptron

perceptron = Perceptron()
perceptron.fit(x_train, y_train)
y_pred = perceptron.predict(x_val)
acc_perceptron = round(accuracy_score(y_pred, y_val) * 100, 2)
print(acc_perceptron)
Example #39
def return_model(mode, **kwargs):
    
    
    if inspect.isclass(mode):
        assert getattr(mode, 'fit', None) is not None, 'Custom model family should have a fit() method'
        model = mode(**kwargs)
    elif mode=='logistic':
        solver = kwargs.get('solver', 'liblinear')
        n_jobs = kwargs.get('n_jobs', None)
        max_iter = kwargs.get('max_iter', 5000)
        model = LogisticRegression(solver=solver, n_jobs=n_jobs, 
                                 max_iter=max_iter, random_state=666)
    elif mode=='Tree':
        model = DecisionTreeClassifier(random_state=666)
    elif mode=='RandomForest':
        n_estimators = kwargs.get('n_estimators', 50)
        model = RandomForestClassifier(n_estimators=n_estimators, random_state=666)
    elif mode=='GB':
        n_estimators = kwargs.get('n_estimators', 50)
        model = GradientBoostingClassifier(n_estimators=n_estimators, random_state=666)
    elif mode=='AdaBoost':
        n_estimators = kwargs.get('n_estimators', 50)
        model = AdaBoostClassifier(n_estimators=n_estimators, random_state=666)
    elif mode=='SVC':
        kernel = kwargs.get('kernel', 'rbf')
        model = SVC(kernel=kernel, random_state=666)
    elif mode=='LinearSVC':
        model = LinearSVC(loss='hinge', random_state=666)
    elif mode=='GP':
        model = GaussianProcessClassifier(random_state=666)
    elif mode=='KNN':
        n_neighbors = kwargs.get('n_neighbors', 5)
        model = KNeighborsClassifier(n_neighbors=n_neighbors)
    elif mode=='NB':
        model = MultinomialNB()
    elif mode=='linear':
        model = LinearRegression()  # LinearRegression does not accept a random_state argument
    elif mode=='ridge':
        alpha = kwargs.get('alpha', 1.0)
        model = Ridge(alpha=alpha, random_state=666)
    elif 'conv' in mode:
        tf.reset_default_graph()
        address = kwargs.get('address', 'weights/conv')
        hidden_units = kwargs.get('hidden_layer_sizes', [20])
        activation = kwargs.get('activation', 'relu')
        weight_decay = kwargs.get('weight_decay', 1e-4)
        learning_rate = kwargs.get('learning_rate', 0.001)
        max_iter = kwargs.get('max_iter', 1000)
        early_stopping= kwargs.get('early_stopping', 10)
        warm_start = kwargs.get('warm_start', False)
        batch_size = kwargs.get('batch_size', 256)
        kernel_sizes = kwargs.get('kernel_sizes', [5])
        strides = kwargs.get('strides', [5])
        channels = kwargs.get('channels', [1])
        validation_fraction = kwargs.get('validation_fraction', 0.)
        global_averaging = kwargs.get('global_averaging', 0.)
        optimizer = kwargs.get('optimizer', 'sgd')
        if mode=='conv':
            model = CShapNN(mode='classification', batch_size=batch_size, max_epochs=max_iter,
                          learning_rate=learning_rate, 
                          weight_decay=weight_decay, validation_fraction=validation_fraction,
                          early_stopping=early_stopping,
                         optimizer=optimizer, warm_start=warm_start, address=address,
                          hidden_units=hidden_units,
                          strides=strides, global_averaging=global_averaging,
                         kernel_sizes=kernel_sizes, channels=channels, random_seed=666)
        elif mode=='conv_reg':
            model = CShapNN(mode='regression', batch_size=batch_size, max_epochs=max_iter,
                          learning_rate=learning_rate, 
                          weight_decay=weight_decay, validation_fraction=validation_fraction,
                          early_stopping=early_stopping,
                         optimizer=optimizer, warm_start=warm_start, address=address,
                          hidden_units=hidden_units,
                          strides=strides, global_averaging=global_averaging,
                         kernel_sizes=kernel_sizes, channels=channels, random_seed=666)
    elif 'NN' in mode:
        solver = kwargs.get('solver', 'adam')
        hidden_layer_sizes = kwargs.get('hidden_layer_sizes', (20,))
        if isinstance(hidden_layer_sizes, list):
            hidden_layer_sizes = list(hidden_layer_sizes)
        activation = kwargs.get('activation', 'relu')
        learning_rate_init = kwargs.get('learning_rate', 0.001)
        max_iter = kwargs.get('max_iter', 5000)
        early_stopping= kwargs.get('early_stopping', False)
        warm_start = kwargs.get('warm_start', False)
        if mode=='NN':
            model = MLPClassifier(solver=solver, hidden_layer_sizes=hidden_layer_sizes,
                                activation=activation, learning_rate_init=learning_rate_init,
                                warm_start = warm_start, max_iter=max_iter,
                                early_stopping=early_stopping)
        if mode=='NN_reg':
            model = MLPRegressor(solver=solver, hidden_layer_sizes=hidden_layer_sizes,
                                activation=activation, learning_rate_init=learning_rate_init,
                                warm_start = warm_start, max_iter=max_iter, early_stopping=early_stopping)
    else:
        raise ValueError("Invalid mode!")
    return model
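A minimal usage sketch for the factory above; X_train and y_train stand for any training arrays:

clf = return_model('LinearSVC')                                    # hinge-loss LinearSVC, seeded
clf.fit(X_train, y_train)

nn = return_model('NN', hidden_layer_sizes=(50,), max_iter=2000)   # kwargs override the defaults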
Example #40
y_pred = classifier.predict(X_test_mean)
y_pred = np.round(y_pred).flatten()
print(accuracy_score(y_test, y_pred))

classifier.fit(X_train_median, y_train)
y_pred = classifier.predict(X_test_median)
y_pred = np.round(y_pred).flatten()
print(accuracy_score(y_test, y_pred))

classifier.fit(X_train_mode, y_train)
y_pred = classifier.predict(X_test_mode)
y_pred = np.round(y_pred).flatten()
print(accuracy_score(y_test, y_pred))

from sklearn.svm import LinearSVC
classifier = LinearSVC()
classifier.fit(X_train_0, y_train)
y_pred = classifier.predict(X_test_0)
y_pred = np.round(y_pred).flatten()
print(accuracy_score(y_test, y_pred))

classifier.fit(X_train_mean, y_train)
y_pred = classifier.predict(X_test_mean)
y_pred = np.round(y_pred).flatten()
print(accuracy_score(y_test, y_pred))

classifier.fit(X_train_median, y_train)
y_pred = classifier.predict(X_test_median)
y_pred = np.round(y_pred).flatten()
print(accuracy_score(y_test, y_pred))
Example #41
class AllClassificationModels:
    """
    Wrapper class around all supported classification models: LogisticRegression, MLPClassifier, RandomForest, SVC,
    NuSVC, LinearSVC, and XGBClassifier.
    AllClassificationModels runs every available classification algorithm on the given dataset and outputs the mean
    accuracy, ROC-AUC, and execution time of each successful model when all_classification_models() is run.
    """
    def __init__(self,
                 attributes=None,
                 labels=None,
                 test_size=0.25,
                 verbose=False):
        """
        Initializes an AllClassificationModels object.

        The following parameters are needed to use an AllClassificationModels object:

            – attributes: a numpy array of the desired independent variables (Default is None)
            – labels: a numpy array of the classes (Default is None)
            – test_size: the proportion of the dataset to be used for testing the model;
            the proportion of the dataset to be used for training will be the complement of test_size (Default is 0.25)
            – verbose: specifies whether or not to output any and all logging during model training (Default is False)

            Note: These are the only parameters allowed. All other parameters for each model will use their default
            values. For more granular control, please instantiate each model individually.

        The following instance data is found after running all_classification_models() successfully:

            – logistic_regression: a reference to the LogisticRegression model
            – MLP: a reference to the MLPClassifier model
            – random_forest: a reference to the RandomForest model
            – SVC: a reference to the SVC model
            – nu_SVC: a reference to the NuSVC model
            – linear_SVC: a reference to the LinearSVC model
            – XGB_classifier: a reference to the XGBClassifier model

        After running all_classification_models(), the mean accuracy, ROC-AUC (if available), and execution time for
        each model that ran successfully will be displayed in tabular form. Any models that failed to run will be listed.
        """
        self.attributes = attributes
        self.labels = labels
        self.test_size = test_size
        self.verbose = verbose

        self.logistic_regression = LogisticRegression(verbose=self.verbose)
        self.MLP = MLPClassifier(verbose=self.verbose)
        self.random_forest = RandomForestClassifier(verbose=self.verbose)
        self.SVC = SVC(verbose=self.verbose, probability=True)
        self.nu_SVC = NuSVC(verbose=self.verbose, probability=True)
        self.linear_SVC = LinearSVC(verbose=self.verbose)
        self.XGB_classifier = XGBClassifier(verbosity=int(self.verbose))

        self._classification_models = {
            "Model": ["Accuracy", "ROC-AUC", "Time"]
        }
        self._failures = []

    # Accessor methods

    def get_attributes(self):
        """
        Accessor method for attributes.

        If an AllClassificationModels object is initialized without specifying attributes, attributes will be None.
        all_classification_models() cannot be called until attributes is a populated numpy array of independent variables;
        call set_attributes(new_attributes) to fix this.
        """
        return self.attributes

    def get_labels(self):
        """
        Accessor method for labels.

        If an AllClassificationModels object is initialized without specifying labels, labels will be None.
        all_classification_models() cannot be called until labels is a populated numpy array of classes;
        call set_labels(new_labels) to fix this.
        """
        return self.labels

    def get_test_size(self):
        """
        Accessor method for test_size.

        Should return a number or None.
        """
        return self.test_size

    def get_verbose(self):
        """
        Accessor method for verbose.

        Will default to False if not set by the user.
        """
        return self.verbose

    def get_all_classification_models(self):
        """
        Accessor method that returns a list of all models.

        All models within the list will be None if all_classification_models() hasn't been called, yet.
        """
        return [
            self.logistic_regression, self.MLP, self.random_forest, self.SVC,
            self.nu_SVC, self.linear_SVC, self.XGB_classifier
        ]

    def get_logistic_regression(self):
        """
        Accessor method for logistic_regression.

        Will return None if all_classification_models() hasn't been called, yet.
        """
        return self.logistic_regression

    def get_MLP(self):
        """
        Accessor method for MLP.

        Will return None if all_classification_models() hasn't been called, yet.
        """
        return self.MLP

    def get_random_forest(self):
        """
        Accessor method for random_forest.

        Will return None if all_classification_models() hasn't been called, yet.
        """
        return self.random_forest

    def get_SVC(self):
        """
        Accessor method for SVC.

        Will return None if all_classification_models() hasn't been called, yet.
        """
        return self.SVC

    def get_nu_SVC(self):
        """
        Accessor method for nu_SVC.

        Will return None if all_classification_models() hasn't been called, yet.
        """
        return self.nu_SVC

    def get_linear_SVC(self):
        """
        Accessor method for linear_SVC.

        Will return None if all_classification_models() hasn't been called, yet.
        """
        return self.linear_SVC

    def get_XGB_classifier(self):
        """
        Accessor method for XGB_classifier.

        Will return None if all_classification_models() hasn't been called, yet.
        """
        return self.XGB_classifier

    # Modifier methods

    def set_attributes(self, new_attributes=None):
        """
        Modifier method for attributes.

        Input should be a numpy array of independent variables. Defaults to None.
        """
        self.attributes = new_attributes

    def set_labels(self, new_labels=None):
        """
        Modifier method for labels.

        Input should be a numpy array of classes. Defaults to None.
        """
        self.labels = new_labels

    def set_test_size(self, new_test_size=0.25):
        """
        Modifier method for test_size.

        Input should be a number or None. Defaults to 0.25.
        """
        self.test_size = new_test_size

    def set_verbose(self, new_verbose=False):
        """
        Modifier method for verbose.

        Input should be a truthy/falsy value. Defaults to False.
        """
        self.verbose = new_verbose

    # Classification functionality

    def all_classification_models(self):
        """
        Driver method for running all classification models with given attributes and labels.
        all_classification_models() first trains the models and determines their mean accuracy, ROC-AUC, and execution
        time via _all_classification_models_runner(). Then, all_classification_models() calls _print_results() to
        format and print each successful model's measurements, while also listing any failed models.

        If verbose is True, all verbose logging for each model will be enabled.
        If verbose is False, all logging to stdout and stderr will be suppressed.
        """

        # Call helper method for running all classification models; suppress output, if needed
        if not self.verbose:
            suppress_output = io.StringIO()
            with redirect_stderr(suppress_output), redirect_stdout(
                    suppress_output):
                self._all_classification_models_runner()
        else:
            self._all_classification_models_runner()

        # Print results
        self._print_results()

    # Helper methods

    def _all_classification_models_runner(self):
        """
        Helper method that runs all models using the given dataset and all default parameters.
        After running all models, each model is determined to be either a success or failure, and relevant data
        (accuracy, ROC-AUC, execution time) is recorded.

        _all_classification_models_runner() may only be called by all_classification_models().
        """

        # Split dataset
        dataset_X_train, dataset_X_test, dataset_y_train, dataset_y_test =\
            train_test_split(self.attributes, self.labels, test_size=self.test_size)

        # Run and time all models; identify each as success or failure
        try:
            start_time = time.time()
            self.logistic_regression.fit(dataset_X_train, dataset_y_train)
            end_time = time.time()
            self._classification_models["LogisticRegression"] =\
                [self.logistic_regression.score(dataset_X_test, dataset_y_test),
                roc_auc_score(self.logistic_regression.predict(dataset_X_test),
                              self.logistic_regression.predict_proba(dataset_X_test)[::, 1]),
                end_time - start_time]
        except:
            self._failures.append("LogisticRegression")

        try:
            start_time = time.time()
            self.MLP.fit(dataset_X_train, dataset_y_train)
            end_time = time.time()
            self._classification_models["MLPClassifier"] =\
                [self.MLP.score(dataset_X_test, dataset_y_test),
                    roc_auc_score(self.MLP.predict(dataset_X_test), self.MLP.predict_proba(dataset_X_test)[::, 1]),
                    end_time - start_time]
        except:
            self._failures.append("MLPClassifier")

        try:
            start_time = time.time()
            self.random_forest.fit(dataset_X_train, dataset_y_train)
            end_time = time.time()
            self._classification_models["RandomForest"] =\
                [self.random_forest.score(dataset_X_test, dataset_y_test),
                    roc_auc_score(dataset_y_test,
                                  self.random_forest.predict_proba(dataset_X_test)[:, 1]),
                    end_time - start_time]
        except:
            self._failures.append("RandomForest")

        try:
            start_time = time.time()
            self.SVC.fit(dataset_X_train, dataset_y_train)
            end_time = time.time()
            self._classification_models["SVC"] =\
                [self.SVC.score(dataset_X_test, dataset_y_test),
                    roc_auc_score(dataset_y_test, self.SVC.predict_proba(dataset_X_test)[:, 1]),
                    end_time - start_time]
        except:
            self._failures.append("SVC")

        try:
            start_time = time.time()
            self.nu_SVC.fit(dataset_X_train, dataset_y_train)
            end_time = time.time()
            self._classification_models["NuSVC"] =\
                [self.nu_SVC.score(dataset_X_test, dataset_y_test),
                    roc_auc_score(dataset_y_test, self.nu_SVC.predict_proba(dataset_X_test)[:, 1]),
                    end_time - start_time]
        except:
            self._failures.append("NuSVC")

        try:
            start_time = time.time()
            self.linear_SVC.fit(dataset_X_train, dataset_y_train)
            end_time = time.time()
            self._classification_models["LinearSVC"] =\
                [self.linear_SVC.score(dataset_X_test, dataset_y_test), "Not Available", end_time - start_time]
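            # ROC-AUC is listed as "Not Available" because LinearSVC exposes no
            # predict_proba; it could instead be estimated from decision_function scores.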
        except:
            self._failures.append("LinearSVC")

        try:
            start_time = time.time()
            self.XGB_classifier.fit(dataset_X_train, dataset_y_train)
            end_time = time.time()
            self._classification_models["XGBClassifier"] =\
                [self.XGB_classifier.score(dataset_X_test, dataset_y_test),
                    roc_auc_score(dataset_y_test,
                                  self.XGB_classifier.predict_proba(dataset_X_test)[:, 1]),
                    end_time - start_time]
        except:
            self._failures.append("XGBClassifier")

    def _print_results(self):
        """
        Helper method that prints results of _all_classification_models_runner() in tabular form.

        _print_results() may only be called by all_classification_models() after all models have attempted to run.
        """

        # Print models that didn't fail
        print("\nResults:\n")

        for model, data in self._classification_models.items():
            print("{:<20} {:<20} {:<20} {:<20}".format(model, data[0], data[1],
                                                       data[2]))

        print()

        # Print failures, if any
        if len(self._failures) > 0:
            print("The following models failed to run:\n")

            for entry in self._failures:
                print(entry)

        print()
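
A minimal usage sketch for the classification driver above. The enclosing class name and its feature-matrix setter are not visible in this snippet, so `ModelRunner` and `set_attributes` below are hypothetical placeholders; only `set_test_size`, `set_verbose`, and `all_classification_models` come from the code shown.

import numpy as np
from sklearn.datasets import load_breast_cancer

# Hypothetical usage sketch: `ModelRunner` stands in for the enclosing class and
# `set_attributes` for its (unshown) feature-matrix setter; adjust to the real names.
data = load_breast_cancer()
runner = ModelRunner()                          # hypothetical class name
runner.set_attributes(np.asarray(data.data))    # hypothetical setter for the feature matrix
runner.set_labels(np.asarray(data.target))      # label setter (name assumed from the method above)
runner.set_test_size(0.25)
runner.set_verbose(False)                       # suppress per-model logging
runner.all_classification_models()              # train, score, and print the results table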
# Scaling using the Standard Scaler
scaler = StandardScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)  # keep original column names
X_test = scaler.transform(X_test)

os = SMOTE(random_state=0)

columns = X_train.columns
os_data_X, os_data_y = os.fit_resample(X_train, y_train)  # fit_sample in older imbalanced-learn versions
os_data_X = pd.DataFrame(data=os_data_X, columns=columns)
os_data_y = pd.DataFrame(data=os_data_y, columns=['target'])

models = [
    RandomForestClassifier(n_estimators=200, max_depth=3, random_state=0),
    LinearSVC(),
    MultinomialNB(),
    LogisticRegression(random_state=0),
]
CV = 5
cv_df = pd.DataFrame(index=range(CV * len(models)))
entries = []
for model in models:
    model_name = model.__class__.__name__
    print("Started processing model family: ", model_name)
    accuracies = cross_val_score(model,
                                 os_data_X,
                                 np.ravel(os_data_y),
                                 scoring='accuracy',
                                 cv=CV)
    for fold_idx, accuracy in enumerate(accuracies):
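
Note: the snippet above oversamples the entire training set with SMOTE and then cross-validates, which lets synthetic samples derived from one fold leak into another. A minimal alternative sketch, assuming imbalanced-learn is available, resamples inside each fold by putting SMOTE into an imblearn pipeline:

# Sketch: apply SMOTE inside each cross-validation fold instead of before splitting.
# Assumes the imbalanced-learn package; X_train / y_train are the variables defined above.
import numpy as np
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

pipe = ImbPipeline([
    ('smote', SMOTE(random_state=0)),             # fitted on the training folds only
    ('clf', LogisticRegression(random_state=0)),
])
fold_accuracies = cross_val_score(pipe, X_train, np.ravel(y_train),
                                  scoring='accuracy', cv=5)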
Example #43
0
def select_from_l1_svc(C=0.1, tol=1e-3, threshold="0.5*mean"):
    return SelectFromModel(LinearSVC(C=C, penalty="l1", dual=False, tol=tol,
                                     class_weight='balanced'),
                           prefit=False, threshold=threshold)
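
A brief usage sketch for the selector factory above, combining it with a downstream classifier in a scikit-learn pipeline; the dataset and classifier are illustrative choices, not part of the original example.

# Illustrative usage of select_from_l1_svc() inside a scikit-learn Pipeline.
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

X, y = load_breast_cancer(return_X_y=True)
pipe = Pipeline([
    ('select', select_from_l1_svc(C=0.1)),    # sparse L1-based feature selection
    ('clf', LinearSVC(dual=False)),           # classifier trained on the selected features
])
scores = cross_val_score(pipe, X, y, cv=5)
print("mean CV accuracy: %.3f" % scores.mean())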
Example #44
0
    pred = clf.predict(X_test)
    test_time = time() - t0
    err = metrics.zero_one(y_test, pred) / float(pred.shape[0])
    return err, train_time, test_time


######################################################################
## Train Liblinear model
liblinear_parameters = {
    'loss': 'l2',
    'penalty': 'l2',
    'C': 1000,
    'dual': False,
    'tol': 1e-3,
}
liblinear_res = benchmark(LinearSVC(**liblinear_parameters))
liblinear_err, liblinear_train_time, liblinear_test_time = liblinear_res

######################################################################
## Train GaussianNB model
gnb_err, gnb_train_time, gnb_test_time = benchmark(GaussianNB())

######################################################################
## Train SGD model
sgd_parameters = {
    'alpha': 0.001,
    'n_iter': 2,
}
sgd_err, sgd_train_time, sgd_test_time = benchmark(
    SGDClassifier(**sgd_parameters))
################# matplotlib Korean (Hangul) font setup ############################
from matplotlib import font_manager, rc
font_name = font_manager.FontProperties(
    fname="c:/Windows/Fonts/malgun.ttf").get_name()
rc('font', family=font_name)
####################################################################

from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

X, y = mglearn.datasets.make_forge()

fig, axes = plt.subplots(1, 2, figsize=(10, 3))

for model, ax in zip([LinearSVC(), LogisticRegression()], axes):
    clf = model.fit(X, y)
    mglearn.plots.plot_2d_separator(clf,
                                    X,
                                    fill=False,
                                    eps=0.5,
                                    ax=ax,
                                    alpha=.7)
    mglearn.discrete_scatter(X[:, 0], X[:, 1], y, ax=ax)
    ax.set_title("{}".format(clf.__class__.__name__))
    ax.set_xlabel("특성 0")
    ax.set_ylabel("특성 1")
axes[0].legend()

plt.show()
Example #46
0
"-------------------------------------------------------------------------------------------------------------------"

"----------------------------------------------- SVM / RANDOM FOREST -----------------------------------------------"
while True:
    input_model = int(
        input("Choose 1: Normal SVC || 2: Linear SVC || 3: Random Forest\n"))
    if input_model == 1 or input_model == 2 or input_model == 3:
        break

# SVC(kernel = 'linear')
if input_model == 1:
    model = SVC(kernel='linear')

# LinearSVC
elif input_model == 2:
    model = LinearSVC()

# Random Forest Classifier
else:
    model = RandomForestClassifier(n_estimators=250)

"-------------------------------------------------------------------------------------------------------------------"

accuracy_train = []
accuracy_test = []

precision_weight = []
recall_weight = []
f1_weight = []

# To count which fold the program is currently at
Example #47
0
digits_train = pd.read_csv(
    'http://archive.ics.uci.edu/ml/machine-learning-databases/optdigits/optdigits.tra',
    header=None)
digits_test = pd.read_csv(
    'http://archive.ics.uci.edu/ml/machine-learning-databases/optdigits/optdigits.tes',
    header=None)

X_train = digits_train[np.arange(64)]
y_train = digits_train[64]

X_test = digits_test[np.arange(64)]
y_test = digits_test[64]

from sklearn.svm import LinearSVC
svc = LinearSVC()
svc.fit(X_train, y_train)
svc_y_predict = svc.predict(X_test)

from sklearn.decomposition import PCA
pca = PCA(n_components=30)
X_pca_train = pca.fit_transform(X_train)
X_pca_test = pca.transform(X_test)

pca_svc = LinearSVC()
pca_svc.fit(X_pca_train, y_train)
pca_svc_y_predict = pca_svc.predict(X_pca_test)

from sklearn.metrics import classification_report
print('Score SVC: ', svc.score(X_test, y_test))
print(classification_report(y_test, svc_y_predict))
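
The example above computes predictions for the PCA-compressed model but is cut off before reporting them; a minimal continuation using the same variables might be:

# Sketch: report the PCA-compressed LinearSVC alongside the plain one.
print('Score PCA + SVC: ', pca_svc.score(X_pca_test, y_test))
print(classification_report(y_test, pca_svc_y_predict))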
gNB_clf_bus = GaussianNB()
rf_clf = RandomForestClassifier()
rf_clf_bus = RandomForestClassifier()
knn_clf = KNeighborsClassifier()
knn_clf_bus = KNeighborsClassifier()
logReg_clf = LogisticRegression(random_state=0)
logReg_clf_bus = LogisticRegression(random_state=0)
mlp_clf = MLPClassifier()
mlp_clf_bus = MLPClassifier()
qda_clf = QuadraticDiscriminantAnalysis()
qda_clf_bus = QuadraticDiscriminantAnalysis()
lda_clf = LinearDiscriminantAnalysis()
lda_clf_bus = LinearDiscriminantAnalysis()
gb_clf = GradientBoostingClassifier()
gb_clf_bus = GradientBoostingClassifier()
lsvm_clf = LinearSVC()
lsvm_clf_bus = LinearSVC()

clfs = [
    xgb_clf, xgb_clf_bus, svm_clf, svm_clf_bus, gb_clf, gb_clf_bus, mlp_clf,
    mlp_clf_bus
]

# ================================================Cross-Validation======================================================
if __name__ == '__main__':
    y_hats = []
    y_bus_hats = []
    accs = []
    accs_bus = []
    reports = []
    reports_bus = []
    # Portuguese identifiers: acertos = correct predictions, dados = data,
    # marcacoes = labels, treino = training, teste = test.
    total_de_acertos = sum(acertos)        # number of correct predictions
    total_de_elementos = len(teste_dados)  # number of test examples

    taxa_de_acerto = 100.0 * total_de_acertos / total_de_elementos  # accuracy in percent

    msg = "Taxa de acerto do {0}: {1}".format(nome, taxa_de_acerto)  # "Accuracy of {model}: {rate}"
    print(msg)
    return taxa_de_acerto


resultados = {}

from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
modeloOneVsRest = OneVsRestClassifier(LinearSVC(random_state=0))
resultadoOneVsRest = fit_and_predict("OneVsRest", modeloOneVsRest,
                                     treino_dados, treino_marcacoes,
                                     teste_dados, teste_marcacoes)
resultados[resultadoOneVsRest] = modeloOneVsRest

from sklearn.multiclass import OneVsOneClassifier
modeloOneVsOne = OneVsOneClassifier(LinearSVC(random_state=0))
resultadoOneVsOne = fit_and_predict("OneVsOne", modeloOneVsOne, treino_dados,
                                    treino_marcacoes, teste_dados,
                                    teste_marcacoes)
resultados[resultadoOneVsOne] = modeloOneVsOne

from sklearn.naive_bayes import MultinomialNB
modeloMultinomial = MultinomialNB()
resultadoMultinomial = fit_and_predict("MultinomialNB", modeloMultinomial,
Example #50
0
    ])
    X_train = pd.concat([
        X_HC_train, X_Br_train, X_CRC_train, X_GBM_train, X_HBC_train,
        X_lung_train, X_PAAD_train
    ])

    y_test = pd.concat([
        y_HC_test, y_Br_test, y_CRC_test, y_GBM_test, y_HBC_test, y_lung_test,
        y_PAAD_test
    ])
    X_test = pd.concat([
        X_HC_test, X_Br_test, X_CRC_test, X_GBM_test, X_HBC_test, X_lung_test,
        X_PAAD_test
    ])

    y_pred = OneVsOneClassifier(LinearSVC(C=100.)).fit(X_train,
                                                       y_train).predict(X_test)

    acc = accuracy_score(y_test, y_pred)

    rec = recall_score(y_test, y_pred, average='macro')

    accuracy.append(acc)
    recall.append(rec)

    HC_pred = y_pred[0:len(y_HC_test)]
    Br_pred = y_pred[len(y_HC_test):len(y_HC_test) + len(y_Br_test)]
    CRC_pred = y_pred[len(y_HC_test) + len(y_Br_test):len(y_HC_test) +
                      len(y_Br_test) + len(y_CRC_test)]
    GBM_pred = y_pred[len(y_HC_test) + len(y_Br_test) +
                      len(y_CRC_test):len(y_HC_test) + len(y_Br_test) +
Example #51
0
    def calibrate(self):
        # Define feature parameters
        self.color_space = 'YCrCb'  # #ANY OTHER
        self.orient = 9
        self.pix_per_cell = 8
        self.cell_per_block = 2
        self.hog_channel = 'ALL'
        self.spatial_size = (16, 16)
        self.hist_bins = 16
        self.spatial_feat = True
        self.hist_feat = True
        self.hog_feat = True

        t = time.time()
        n_samples = 1000

        dirs = os.listdir("data/vehicles/")
        cars = []
        print(dirs)
        for image_type in dirs:
            cars.extend(glob.glob('data/vehicles/' + image_type + '/*.jpg'))

        print('Number of Vehicles Images found', len(cars))

        with open('data/vehicles/cars.txt', 'w') as f:
            for fn in cars:
                f.write(fn + '\n')

        dirs = os.listdir("data/non-vehicles/")
        notcars = []
        print(dirs)
        for image_type in dirs:
            notcars.extend(
                glob.glob('data/non-vehicles/' + image_type + '/*.jpg'))

        print('Number of Non-Vehicles Images found', len(notcars))

        with open('data/non-vehicles/notcars.txt', 'w') as f:
            for fn in notcars:
                f.write(fn + '\n')

        # Read in car / not-car image
        test_cars = cars  #np.array(cars)[car_indxs]
        test_notcars = notcars  #np.array(notcars)[notcar_indxs]

        car_features = self.extract_features(
            test_cars,
            color_space=self.color_space,
            spatial_size=self.spatial_size,
            hist_bins=self.hist_bins,
            orient=self.orient,
            pix_per_cell=self.pix_per_cell,
            cell_per_block=self.cell_per_block,
            hog_channel=self.hog_channel,
            spatial_feat=self.spatial_feat,
            hist_feat=self.hist_feat,
            hog_feat=self.hog_feat)

        notcar_features = self.extract_features(
            test_notcars,
            color_space=self.color_space,
            spatial_size=self.spatial_size,
            hist_bins=self.hist_bins,
            orient=self.orient,
            pix_per_cell=self.pix_per_cell,
            cell_per_block=self.cell_per_block,
            hog_channel=self.hog_channel,
            spatial_feat=self.spatial_feat,
            hist_feat=self.hist_feat,
            hog_feat=self.hog_feat)
        print(time.time() - t, ' Seconds to compute features...')
        X = np.vstack((car_features, notcar_features)).astype(np.float)
        # Fit a per column scaler
        self.X_scaler = StandardScaler().fit(X)
        # Apply the scaler to X
        scaled_X = self.X_scaler.transform(X)

        # Define the labels vector
        y = np.hstack(
            (np.ones(len(car_features)), np.zeros(len(notcar_features))))

        # Split up data into randomized training and test sets
        rand_state = np.random.randint(0, 100)
        X_train, X_test, y_train, y_test = train_test_split(
            scaled_X, y, test_size=0.1, random_state=rand_state)

        print('Using: ', self.orient, 'orientations,', self.pix_per_cell,
              'pixels per cell', self.cell_per_block, 'cells per block,',
              self.hist_bins, 'histogram bins, and', self.spatial_size,
              'spatial sampling')
        print('Feature vector length:', len(X_train[0]))
        # Use a linear SVC
        self.svc = LinearSVC()
        # Check the learning time of the SVC
        t = time.time()
        self.svc.fit(X_train, y_train)
        # Save the trained classifier, scaler, and feature parameters for later use
        self.dist_pickle = {}
        self.dist_pickle["clf"] = self.svc
        self.dist_pickle["scaler"] = self.X_scaler
        self.dist_pickle["color_space"] = self.color_space
        self.dist_pickle["orient"] = self.orient
        self.dist_pickle["pix_per_cell"] = self.pix_per_cell
        self.dist_pickle["cell_per_block"] = self.cell_per_block
        self.dist_pickle["spatial_size"] = self.spatial_size
        self.dist_pickle["hog_channel"] = self.hog_channel
        self.dist_pickle["hist_bins"] = self.hist_bins
        self.dist_pickle["spatial_feat"] = self.spatial_feat
        self.dist_pickle["hist_feat"] = self.hist_feat
        self.dist_pickle["hog_feat"] = self.hog_feat
        pickle.dump(self.dist_pickle, open("model/dist_pickle.p", "wb"))

        print(round(time.time() - t, 2), ' Seconds to train SVC...')
        # Check the score of the SVC
        print('Test accuracy of svc = ',
              round(self.svc.score(X_test, y_test), 4))
def main(args):
    args = parser.parse_args()

    source = domain_map.get( args.source, args.source)
    target = domain_map.get( args.target, args.target)

    # algorithm steps:
    # 0) create or load a random data split
    print("Extracting data")
    source_train, source_train_labels, source_test, source_test_label, target_train, target_train_labels, target_test, target_test_labels = get_splits(source, target)
    all_unlabeled,source_unlabeled,target_unlabeled=XML2arrayRAW("data/"+source+"/"+source+"UN.txt",
                                                                 "data/"+target+"/"+target+"UN.txt")

    # 1) extract pivot features for domain 0 based on MI and frequency in domain 1 being over some threshold.
    print("Computing pivot candidates for the source domain (printing 2*num_pivots)")
    pivot_candidates = get_pivot_candidates(source, target, source_train, source_train_labels, source_unlabeled, target_unlabeled)
    for pivot_ind, pivot_candidate in enumerate(pivot_candidates[:args.pivots*2]):
        feature_name, feature_ind, feature_score = pivot_candidate
        print("%s : %f" % (feature_name, feature_score))
    
    # 2) extract a training problem for each domain in the domain pair where we try to predict the pivot feature from the non-pivot features. (this is tricky because there may be interactions?)
    X_st_train, y_st_train, X_st_valid, y_st_valid = create_pivot_training_problem(source_train+source_unlabeled, target_unlabeled, pivot_candidates[:args.pivots])
    num_feats = X_st_train.shape[1] // 3
    parameters = {'C':[0.01, 0.1, 1, 10]}
    for pivot_num in range(args.pivots):
        print("Building a classifier for feature ind %d (%s)" % (pivot_num, pivot_candidates[pivot_num][0]) )
        # build a classifier for predicting pivot_num
        labels = y_st_valid[:,pivot_num].toarray()
        # clf = LogisticRegression().fit(X_st_train, y_st_train[:, pivot_num])
        svr = LinearSVC()
        clf = GridSearchCV(svr, parameters, scoring='f1', n_jobs=1)
        clf.fit(X_st_train, np.ravel(y_st_train[:, pivot_num].toarray()))

        print("  Average score of best setting %s is %f" % (str(clf.best_params_), clf.best_score_))

        gen_coefs = clf.best_estimator_.coef_[0,:num_feats]
        src_coefs = clf.best_estimator_.coef_[0,num_feats:2*num_feats]
        tgt_coefs = clf.best_estimator_.coef_[0,2*num_feats:3*num_feats]
        src_diff = abs(src_coefs) - abs(gen_coefs)
        tgt_diff = abs(tgt_coefs) - abs(gen_coefs)
        src_power = (src_diff > 0).sum() / num_feats
        tgt_power = (tgt_diff > 0).sum() / num_feats

        gen_feat_sum = abs(gen_coefs).sum()
        src_feat_sum = abs(src_coefs).sum()
        tgt_feat_sum = abs(tgt_coefs).sum()

        assert abs(clf.best_estimator_.coef_).sum() - (gen_feat_sum + src_feat_sum + tgt_feat_sum) < 1, 'Something must be wrong with the summing of the feature weights! Weight of three feature partitions does not sum to total weight (within some tolerance).'

        preds = clf.predict(X_st_valid)  
        tps = (labels[:,0] * preds).sum()
        total_true = labels.sum()
        total_preds = preds.sum()
        prec = tps / total_preds
        rec = tps / total_true
        f1 = 2* prec * rec / (prec + rec)

        # Do source and target-specific predictions to see whether there are any where target
        # f1 is much worse than source or both f1. might mean there is no way to predict that feature
        # in the target domain.
        # X_src = X_st_train[s]

        print("   F1 is %f" % (f1,))
        print("   Feature weights are gen=%f, src=%f, tgt=%f" % (gen_feat_sum, src_feat_sum, tgt_feat_sum))
        print("   Src feature power is %0.3f, tgt feature power is %0.3f" % (src_power, tgt_power))
Example #53
0
 def __init__(self, observations, groups, features, peaked, tail_prob=0.4, regressor=HuberRegressor(), classifier=LinearSVC(random_state=42)):
     super().__init__(observations, groups, features)
     if len(observations) != len(features) or len(observations) != len(peaked):
         raise ValueError()
     self.peaked = peaked 
     self.regressor = regressor
     self.classifier = classifier
     self.tail_prob = tail_prob
    print(df.head())

    # # Drop Scale 0
    # drop_cols = [c for c in df.columns if '0_0_' in c]
    # df.drop(drop_cols, axis=1, inplace=True)

    classifiers = {
        'svm': SVC(gamma='auto', kernel='rbf', probability=True),
        'rf': RandomForestClassifier(),
        'lda': LinearDiscriminantAnalysis()
    }

    selectors = {
        'mrmr': MRMR(method='MID', k_features=10),
        'mrmr2': MutualInformationFeatureSelector(method='MRMR', n_features=50, n_jobs=n_cores),
        'svc': SelectFromModel(LinearSVC(penalty='l2')),
        'lasso': SelectFromModel(LassoCV(cv=5))
    }

    # Create a classification pipeline
    pipeline = Pipeline([
        # ('scaler', RobustScaler()),
        ('selector', selectors['mrmr2']),
        ('clf', classifiers['rf'])
    ])

    # Define conversion times in months
    times = [24, 36, 60]

    # Results dataframe
    results = pd.DataFrame()
    def get_ten_fold_crossvalid_perfermance(self, fisher_mode, settings=None):
        analysis_scr = []
        predicted_score = False
        reduce_ratio = 1
        #for seq_no in range(1, self.ddi_obj.total_number_of_sequences+1):
        #subset_size = math.floor(self.ddi_obj.total_number_of_sequences / 10.0)
        kf = KFold(self.ddi_obj.total_number_of_sequences, n_folds=10)
        #for subset_no in range(1, 11):
        for ((train_index, test_index), subset_no) in izip(kf, range(1, 11)):
            #for train_index, test_index in kf;
            print("Subset:", subset_no)
            print("Train index: ", train_index)
            print("Test index: ", test_index)
            #logger.info('subset number: ' + str(subset_no))
            if 1:
                print "SVM"
                #start_index = int((subset_no - 1) * subset_size + 1)
                #if subset_no == 10:
                #    end_index  = int(max(start_index + subset_size, self.ddi_obj.total_number_of_sequences))
                #else:
                #    end_index  = int(start_index + subset_size)
                #print  start_index, end_index
                #(train_X_10fold, train_y_10fold),(train_X_reduced, train_y_reduced), (test_X, test_y) = self.ddi_obj.get_ten_fold_crossvalid_one_subset(start_index, end_index, reduce_ratio = reduce_ratio)
                (train_X_10fold,
                 train_y_10fold), (train_X_reduced, train_y_reduced), (
                     test_X,
                     test_y) = self.ddi_obj.get_ten_fold_crossvalid_one_subset(
                         train_index, test_index, reduce_ratio=reduce_ratio)
                standard_scaler = preprocessing.StandardScaler().fit(
                    train_X_reduced)
                scaled_train_X = standard_scaler.transform(train_X_reduced)
                scaled_test_X = standard_scaler.transform(test_X)
                Linear_SVC = LinearSVC(C=1, penalty="l2")
                Linear_SVC.fit(scaled_train_X, train_y_reduced)
                predicted_test_y = Linear_SVC.predict(scaled_test_X)
                isTest = True
                #new
                analysis_scr.append(
                    (self.ddi, subset_no, fisher_mode, 'SVM', isTest) + tuple(
                        performance_score(test_y,
                                          predicted_test_y).values()))  #new

                predicted_train_y = Linear_SVC.predict(scaled_train_X)
                isTest = False
                #new
                analysis_scr.append(
                    (self.ddi, subset_no, fisher_mode, 'SVM', isTest) + tuple(
                        performance_score(train_y_reduced,
                                          predicted_train_y).values()))

            # direct deep learning
            min_max_scaler = Precessing_Scaler_0_9()
            X_train_pre_validation_minmax = min_max_scaler.fit(train_X_reduced)
            X_train_pre_validation_minmax = min_max_scaler.transform(
                train_X_reduced)
            x_test_minmax = min_max_scaler.transform(test_X)
            pretraining_X_minmax = min_max_scaler.transform(train_X_10fold)
            x_train_minmax, x_validation_minmax, y_train_minmax, y_validation_minmax = train_test_split(
                X_train_pre_validation_minmax,
                train_y_reduced,
                test_size=0.4,
                random_state=42)
            finetune_lr = 1
            batch_size = 100
            pretraining_epochs = cal_epochs(5000,
                                            x_train_minmax,
                                            batch_size=batch_size)
            #pretrain_lr=0.001
            pretrain_lr = 0.001
            training_epochs = 1500
            hidden_layers_sizes = [100, 100]
            corruption_levels = [0.1, 0.1]
            if 1:
                print "direct deep learning"
                sda = trainSda(x_train_minmax, y_train_minmax,
                             x_validation_minmax, y_validation_minmax ,
                             x_test_minmax, test_y,
                             hidden_layers_sizes = hidden_layers_sizes, corruption_levels = corruption_levels, batch_size = batch_size , \
                             training_epochs = training_epochs, pretraining_epochs = pretraining_epochs,
                             pretrain_lr = pretrain_lr, finetune_lr=finetune_lr
                 )
                print 'hidden_layers_sizes:', hidden_layers_sizes
                print 'corruption_levels:', corruption_levels
                training_predicted = sda.predict(x_train_minmax)
                y_train = y_train_minmax
                isTest = False
                #new
                analysis_scr.append((
                    self.ddi, subset_no, fisher_mode, 'DL', isTest
                ) + tuple(
                    performance_score(y_train, training_predicted).values()))

                test_predicted = sda.predict(x_test_minmax)
                y_test = test_y
                isTest = True
                #new
                analysis_scr.append(
                    (self.ddi, subset_no, fisher_mode, 'DL', isTest) +
                    tuple(performance_score(y_test, test_predicted).values()))

            if 0:
                # deep learning using unlabeled data for pretraining
                print 'deep learning with unlabel data'
                pretraining_epochs = cal_epochs(5000,
                                                pretraining_X_minmax,
                                                batch_size=batch_size)
                sda_unlabel = trainSda(x_train_minmax, y_train_minmax,
                             x_validation_minmax, y_validation_minmax ,
                             x_test_minmax, test_y,
                             pretraining_X_minmax = pretraining_X_minmax,
                             hidden_layers_sizes = hidden_layers_sizes, corruption_levels = corruption_levels, batch_size = batch_size , \
                             training_epochs = training_epochs, pretraining_epochs = pretraining_epochs,
                             pretrain_lr = pretrain_lr, finetune_lr=finetune_lr
                 )
                print 'hidden_layers_sizes:', hidden_layers_sizes
                print 'corruption_levels:', corruption_levels
                training_predicted = sda_unlabel.predict(x_train_minmax)
                y_train = y_train_minmax
                isTest = False
                #new
                analysis_scr.append(
                    (self.ddi, subset_no, fisher_mode, 'DL_U', isTest) + tuple(
                        performance_score(y_train, training_predicted,
                                          predicted_score).values()))

                test_predicted = sda_unlabel.predict(x_test_minmax)
                y_test = test_y

                isTest = True
                #new
                analysis_scr.append(
                    (self.ddi, subset_no, fisher_mode, 'DL_U', isTest) + tuple(
                        performance_score(y_test, test_predicted,
                                          predicted_score).values()))
            if 0:
                # deep learning using split network
                print 'deep learning using split network'
                # get the new representation for A set. first 784-D
                pretraining_epochs = 5000
                hidden_layers_sizes = [100, 100, 100]
                corruption_levels = [0, 0, 0]

                x = x_train_minmax[:, :x_train_minmax.shape[1] / 2]
                print "original shape for A", x.shape
                a_MAE_A = train_a_MultipleAEs(
                    x,
                    pretraining_epochs=pretraining_epochs,
                    pretrain_lr=pretrain_lr,
                    batch_size=batch_size,
                    hidden_layers_sizes=hidden_layers_sizes,
                    corruption_levels=corruption_levels)
                new_x_train_minmax_A = a_MAE_A.transform(
                    x_train_minmax[:, :x_train_minmax.shape[1] / 2])
                x = x_train_minmax[:, x_train_minmax.shape[1] / 2:]

                print "original shape for B", x.shape
                a_MAE_B = train_a_MultipleAEs(
                    x,
                    pretraining_epochs=pretraining_epochs,
                    pretrain_lr=pretrain_lr,
                    batch_size=batch_size,
                    hidden_layers_sizes=hidden_layers_sizes,
                    corruption_levels=corruption_levels)
                new_x_train_minmax_B = a_MAE_B.transform(
                    x_train_minmax[:, x_train_minmax.shape[1] / 2:])

                new_x_test_minmax_A = a_MAE_A.transform(
                    x_test_minmax[:, :x_test_minmax.shape[1] / 2])
                new_x_test_minmax_B = a_MAE_B.transform(
                    x_test_minmax[:, x_test_minmax.shape[1] / 2:])
                new_x_validation_minmax_A = a_MAE_A.transform(
                    x_validation_minmax[:, :x_validation_minmax.shape[1] / 2])
                new_x_validation_minmax_B = a_MAE_B.transform(
                    x_validation_minmax[:, x_validation_minmax.shape[1] / 2:])
                new_x_train_minmax_whole = np.hstack(
                    (new_x_train_minmax_A, new_x_train_minmax_B))
                new_x_test_minmax_whole = np.hstack(
                    (new_x_test_minmax_A, new_x_test_minmax_B))
                new_x_validationt_minmax_whole = np.hstack(
                    (new_x_validation_minmax_A, new_x_validation_minmax_B))

                finetune_lr = 1
                batch_size = 100
                pretraining_epochs = cal_epochs(5000,
                                                x_train_minmax,
                                                batch_size=batch_size)
                #pretrain_lr=0.001
                pretrain_lr = 0.001
                training_epochs = 1500
                hidden_layers_sizes = [100, 100, 100]
                corruption_levels = [0, 0, 0]

                sda_transformed = trainSda(new_x_train_minmax_whole, y_train_minmax,
                     new_x_validationt_minmax_whole, y_validation_minmax ,
                     new_x_test_minmax_whole, y_test,
                     hidden_layers_sizes = hidden_layers_sizes, corruption_levels = corruption_levels, batch_size = batch_size , \
                     training_epochs = training_epochs, pretraining_epochs = pretraining_epochs,
                     pretrain_lr = pretrain_lr, finetune_lr=finetune_lr
                     )

                print 'hidden_layers_sizes:', hidden_layers_sizes
                print 'corruption_levels:', corruption_levels
                training_predicted = sda_transformed.predict(
                    new_x_train_minmax_whole)
                y_train = y_train_minmax

                isTest = False
                #new
                analysis_scr.append(
                    (self.ddi, subset_no, fisher_mode, 'DL_S', isTest) + tuple(
                        performance_score(y_train, training_predicted,
                                          predicted_score).values()))

                test_predicted = sda_transformed.predict(
                    new_x_test_minmax_whole)
                y_test = test_y

                isTest = True
                #new
                analysis_scr.append(
                    (self.ddi, subset_no, fisher_mode, 'DL_S', isTest) + tuple(
                        performance_score(y_test, test_predicted,
                                          predicted_score).values()))

        report_name = filename + '_' + '_test10fold_'.join(
            map(str, hidden_layers_sizes)
        ) + '_' + str(pretrain_lr) + '_' + str(finetune_lr) + '_' + str(
            reduce_ratio) + '_' + str(training_epochs) + '_' + current_date
        saveAsCsv(predicted_score, report_name,
                  performance_score(y_test, test_predicted, predicted_score),
                  analysis_scr)
Example #56
0
class Vehicle_Detection():
    def __init__(self,
                 ystart=400,
                 ystop=656,
                 xstart=200,
                 xstop=1280,
                 scale=1.5,
                 scales=[1.5],
                 threshold_factor=5,
                 heat_threshold=1,
                 nb_frame_ave=5,
                 window=64,
                 cnn_predict=False):
        self.dist_pickle = pickle.load(open("model/dist_pickle.p", "rb"))
        self.svc = self.dist_pickle["clf"]
        self.X_scaler = self.dist_pickle["scaler"]
        self.orient = self.dist_pickle["orient"]
        self.pix_per_cell = self.dist_pickle["pix_per_cell"]
        self.pix_per_cell2 = np.int(self.pix_per_cell / 2)
        self.cell_per_block = self.dist_pickle["cell_per_block"]
        self.cell_per_block2 = self.cell_per_block * 2
        self.spatial_size = self.dist_pickle["spatial_size"]
        self.hist_bins = self.dist_pickle["hist_bins"]
        self.ystart = ystart
        self.ystop = ystop
        self.ystart2 = ystart
        self.ystop2 = np.int(ystart + (ystop - ystart) / 2)
        self.scale = scale
        self.threshold_factor = threshold_factor
        self.heatmaps = deque(maxlen=self.threshold_factor)
        self.heatmap = []
        self.nb_frame_ave = nb_frame_ave
        self.frames = deque(maxlen=self.threshold_factor)
        self.heat_threshold = heat_threshold
        self.window = window
        self.scales = scales
        self.cnn_model = load_model('./dl_detect/model.h5')
        self.cnn_predict = cnn_predict
        self.xstart = xstart
        self.xstop = xstop
        self.lane_line_tracker = Lane_Line_Tracker()

    def calibrate(self):
        # Define feature parameters
        self.color_space = 'YCrCb'  # #ANY OTHER
        self.orient = 9
        self.pix_per_cell = 8
        self.cell_per_block = 2
        self.hog_channel = 'ALL'
        self.spatial_size = (16, 16)
        self.hist_bins = 16
        self.spatial_feat = True
        self.hist_feat = True
        self.hog_feat = True

        t = time.time()
        n_samples = 1000

        dirs = os.listdir("data/vehicles/")
        cars = []
        print(dirs)
        for image_type in dirs:
            cars.extend(glob.glob('data/vehicles/' + image_type + '/*.jpg'))

        print('Number of Vehicles Images found', len(cars))

        with open('data/vehicles/cars.txt', 'w') as f:
            for fn in cars:
                f.write(fn + '\n')

        dirs = os.listdir("data/non-vehicles/")
        notcars = []
        print(dirs)
        for image_type in dirs:
            notcars.extend(
                glob.glob('data/non-vehicles/' + image_type + '/*.jpg'))

        print('Number of Non-Vehicles Images found', len(notcars))

        with open('data/non-vehicles/notcars.txt', 'w') as f:
            for fn in notcars:
                f.write(fn + '\n')

        # Read in car / not-car image
        test_cars = cars  #np.array(cars)[car_indxs]
        test_notcars = notcars  #np.array(notcars)[notcar_indxs]

        car_features = self.extract_features(
            test_cars,
            color_space=self.color_space,
            spatial_size=self.spatial_size,
            hist_bins=self.hist_bins,
            orient=self.orient,
            pix_per_cell=self.pix_per_cell,
            cell_per_block=self.cell_per_block,
            hog_channel=self.hog_channel,
            spatial_feat=self.spatial_feat,
            hist_feat=self.hist_feat,
            hog_feat=self.hog_feat)

        notcar_features = self.extract_features(
            test_notcars,
            color_space=self.color_space,
            spatial_size=self.spatial_size,
            hist_bins=self.hist_bins,
            orient=self.orient,
            pix_per_cell=self.pix_per_cell,
            cell_per_block=self.cell_per_block,
            hog_channel=self.hog_channel,
            spatial_feat=self.spatial_feat,
            hist_feat=self.hist_feat,
            hog_feat=self.hog_feat)
        print(time.time() - t, ' Seconds to compute features...')
        X = np.vstack((car_features, notcar_features)).astype(np.float)
        # Fit a per column scaler
        self.X_scaler = StandardScaler().fit(X)
        # Apply the scaler to X
        scaled_X = self.X_scaler.transform(X)

        # Define the labels vector
        y = np.hstack(
            (np.ones(len(car_features)), np.zeros(len(notcar_features))))

        # Split up data into randomized training and test sets
        rand_state = np.random.randint(0, 100)
        X_train, X_test, y_train, y_test = train_test_split(
            scaled_X, y, test_size=0.1, random_state=rand_state)

        print('Using: ', self.orient, 'orientations,', self.pix_per_cell,
              'pixels per cell', self.cell_per_block, 'cells per block,',
              self.hist_bins, 'histogram bins, and', self.spatial_size,
              'spatial sampling')
        print('Feature vector length:', len(X_train[0]))
        # Use a linear SVC
        self.svc = LinearSVC()
        # Check the learning time of the SVC
        t = time.time()
        self.svc.fit(X_train, y_train)
        # Save the trained classifier, scaler, and feature parameters for later use
        self.dist_pickle = {}
        self.dist_pickle["clf"] = self.svc
        self.dist_pickle["scaler"] = self.X_scaler
        self.dist_pickle["color_space"] = self.color_space
        self.dist_pickle["orient"] = self.orient
        self.dist_pickle["pix_per_cell"] = self.pix_per_cell
        self.dist_pickle["cell_per_block"] = self.cell_per_block
        self.dist_pickle["spatial_size"] = self.spatial_size
        self.dist_pickle["hog_channel"] = self.hog_channel
        self.dist_pickle["hist_bins"] = self.hist_bins
        self.dist_pickle["spatial_feat"] = self.spatial_feat
        self.dist_pickle["hist_feat"] = self.hist_feat
        self.dist_pickle["hog_feat"] = self.hog_feat
        pickle.dump(self.dist_pickle, open("model/dist_pickle.p", "wb"))

        print(round(time.time() - t, 2), ' Seconds to train SVC...')
        # Check the score of the SVC
        print('Test accuracy of svc = ',
              round(self.svc.score(X_test, y_test), 4))

    # Define a function to compute color histogram features
    def color_hist(self, image, nbins=32):
        # Compute the histogram of the RGB channels separately
        # Concatenate the histograms into a single feature vector
        # Return the feature vector
        # Take histograms in R, G, and B
        rhist = np.histogram(image[:, :, 0], bins=32)
        ghist = np.histogram(image[:, :, 1], bins=32)
        bhist = np.histogram(image[:, :, 2], bins=32)
        # Generating bin centers
        bin_edges = rhist[1]
        # Concatenate the histograms into a single feature vector
        hist_features = np.concatenate((rhist[0], ghist[0], bhist[0]))
        # Return the individual histograms, bin_centers and feature vector
        return hist_features

    def bin_spatial(self, img, size=(32, 32)):
        # Use cv2.resize().ravel() to create the feature vector
        features = cv2.resize(img, size).ravel()
        # Return the feature vector
        return features

    def data_look(self, car_list, notcar_list):
        data_dict = {}
        # Define a key in data_dict "n_cars" and store the number of car images
        data_dict["n_cars"] = len(car_list)
        # Define a key "n_notcars" and store the number of notcar images
        data_dict["n_notcars"] = len(notcar_list)
        # Read in a test image, either car or notcar
        example_img = mpimg.imread(car_list[0])
        # Define a key "image_shape" and store the test image shape 3-tuple
        data_dict["image_shape"] = example_img.shape
        # Define a key "data_type" and store the data type of the test image.
        data_dict["data_type"] = example_img.dtype
        # Return data_dict
        return data_dict

    # Define a function to return HOG features and visualization

    def get_hog_features(self,
                         img,
                         orient,
                         pix_per_cell,
                         cell_per_block,
                         vis=False,
                         feature_vec=True):
        if vis == True:
            # Use skimage.hog() to get both features and a visualization
            features, hog_image = hog(img,
                                      orientations=orient,
                                      pixels_per_cell=(pix_per_cell,
                                                       pix_per_cell),
                                      cells_per_block=(cell_per_block,
                                                       cell_per_block),
                                      visualise=vis,
                                      feature_vector=feature_vec)

            return features, hog_image
        else:
            # Use skimage.hog() to get features only
            features = hog(
                img,
                orientations=orient,
                pixels_per_cell=(pix_per_cell, pix_per_cell),
                cells_per_block=(cell_per_block, cell_per_block),
                visualise=vis,
                feature_vector=feature_vec)

            return features

    # Define a function to extract features from a list of images
    # Have this function call bin_spatial() and color_hist()

    def extract_features(self,
                         imgs,
                         color_space='RGB',
                         spatial_size=(32, 32),
                         hist_bins=32,
                         orient=9,
                         pix_per_cell=8,
                         cell_per_block=2,
                         hog_channel=0,
                         spatial_feat=True,
                         hist_feat=True,
                         hog_feat=True):
        # Create a list to append feature vectors to
        features = []
        # Iterate through the list of images
        for file in imgs:
            file_features = []
            # Read in each one by one
            image = mpimg.imread(file)
            # apply color conversion if other than 'RGB'
            if color_space != 'RGB':
                if color_space == 'HSV':
                    feature_image = cv2.cvtColor(image, cv2.COLOR_RGB2HSV)
                elif color_space == 'LUV':
                    feature_image = cv2.cvtColor(image, cv2.COLOR_RGB2LUV)
                elif color_space == 'HLS':
                    feature_image = cv2.cvtColor(image, cv2.COLOR_RGB2HLS)
                elif color_space == 'YUV':
                    feature_image = cv2.cvtColor(image, cv2.COLOR_RGB2YUV)
                elif color_space == 'YCrCb':
                    feature_image = cv2.cvtColor(image, cv2.COLOR_RGB2YCrCb)
            else:
                feature_image = np.copy(image)

            if spatial_feat == True:
                spatial_features = self.bin_spatial(feature_image,
                                                    size=spatial_size)
                file_features.append(spatial_features)
            if hist_feat == True:
                # Apply color_hist()
                hist_features = self.color_hist(feature_image, nbins=hist_bins)
                file_features.append(hist_features)
            if hog_feat == True:
                # Call get_hog_features() with vis=False, feature_vec=True
                if hog_channel == 'ALL':
                    hog_features = []
                    for channel in range(feature_image.shape[2]):
                        hog_features.append(
                            self.get_hog_features(feature_image[:, :, channel],
                                                  orient,
                                                  pix_per_cell,
                                                  cell_per_block,
                                                  vis=False,
                                                  feature_vec=True))
                    hog_features = np.ravel(hog_features)
                else:
                    hog_features = self.get_hog_features(
                        feature_image[:, :, hog_channel],
                        orient,
                        pix_per_cell,
                        cell_per_block,
                        vis=False,
                        feature_vec=True)
                # Append the new feature vector to the features list
                file_features.append(hog_features)
            features.append(np.concatenate(file_features))
        # Return list of feature vectors
        return features

    # Define a function to return HOG features and visualization
    def get_hog_features(self,
                         img,
                         orient,
                         pix_per_cell,
                         cell_per_block,
                         vis=False,
                         feature_vec=True):
        # Call with two outputs if vis==True
        if vis == True:
            features, hog_image = hog(img,
                                      orientations=orient,
                                      pixels_per_cell=(pix_per_cell,
                                                       pix_per_cell),
                                      cells_per_block=(cell_per_block,
                                                       cell_per_block),
                                      transform_sqrt=True,
                                      visualise=vis,
                                      feature_vector=feature_vec)
            return features, hog_image
        # Otherwise call with one output
        else:
            features = hog(img,
                           orientations=orient,
                           pixels_per_cell=(pix_per_cell, pix_per_cell),
                           cells_per_block=(cell_per_block, cell_per_block),
                           transform_sqrt=True,
                           visualise=vis,
                           feature_vector=feature_vec)
            return features

    # Define a function that takes an image,
    # start and stop positions in both x and y,
    # window size (x and y dimensions),
    # and overlap fraction (for both x and y)
    def slide_window(self,
                     img,
                     x_start_stop=[None, None],
                     y_start_stop=[None, None],
                     xy_window=(64, 64),
                     xy_overlap=(0.5, 0.5)):
        # If x and/or y start/stop positions not defined, set to image size
        if x_start_stop[0] == None:
            x_start_stop[0] = 0
        if x_start_stop[1] == None:
            x_start_stop[1] = img.shape[1]
        if y_start_stop[0] == None:
            y_start_stop[0] = 0
        if y_start_stop[1] == None:
            y_start_stop[1] = img.shape[0]
        # Compute the span of the region to be searched
        xspan = x_start_stop[1] - x_start_stop[0]
        yspan = y_start_stop[1] - y_start_stop[0]
        # Compute the number of pixels per step in x/y
        nx_pix_per_step = np.int(xy_window[0] * (1 - xy_overlap[0]))
        ny_pix_per_step = np.int(xy_window[1] * (1 - xy_overlap[1]))
        # Compute the number of windows in x/y
        nx_buffer = np.int(xy_window[0] * (xy_overlap[0]))
        ny_buffer = np.int(xy_window[1] * (xy_overlap[1]))
        nx_windows = np.int((xspan - nx_buffer) / nx_pix_per_step)
        ny_windows = np.int((yspan - ny_buffer) / ny_pix_per_step)
        # Initialize a list to append window positions to
        window_list = []
        # Loop through finding x and y window positions
        # Note: you could vectorize this step, but in practice
        # you'll be considering windows one by one with your
        # classifier, so looping makes sense
        for ys in range(ny_windows):
            for xs in range(nx_windows):
                # Calculate window position
                startx = xs * nx_pix_per_step + x_start_stop[0]
                endx = startx + xy_window[0]
                starty = ys * ny_pix_per_step + y_start_stop[0]
                endy = starty + xy_window[1]
                # Append window position to list
                window_list.append(((startx, starty), (endx, endy)))
        # Return the list of windows
        return window_list

    # Define a function to extract features from a single image window
    # This function is very similar to extract_features()
    # just for a single image rather than list of images
    def single_img_features(self,
                            img,
                            color_space='RGB',
                            spatial_size=(32, 32),
                            hist_bins=32,
                            orient=9,
                            pix_per_cell=8,
                            cell_per_block=2,
                            hog_channel=0,
                            spatial_feat=True,
                            hist_feat=True,
                            hog_feat=True):
        # 1) Define an empty list to receive features
        img_features = []
        # 2) Apply color conversion if other than 'RGB'
        if (color_space != 'RGB'):
            if (color_space == 'HSV'):
                feature_image = cv2.cvtColor(img, cv2.COLOR_RGB2HSV)
            elif (color_space == 'LUV'):
                feature_image = cv2.cvtColor(img, cv2.COLOR_RGB2LUV)
            elif (color_space == 'HLS'):
                feature_image = cv2.cvtColor(img, cv2.COLOR_RGB2HLS)
            elif (color_space == 'YUV'):
                feature_image = cv2.cvtColor(img, cv2.COLOR_RGB2YUV)
            elif (color_space == 'YCrCb'):
                feature_image = cv2.cvtColor(img, cv2.COLOR_RGB2YCrCb)
        else:
            # feature_image = np.copy(img)
            feature_image = img.copy()

        # 3) Compute spatial features if flag is set
        if (spatial_feat == True):
            spatial_features = self.bin_spatial(feature_image,
                                                size=spatial_size)
            # 4) Append features to list
            img_features.append(spatial_features)
        # 5) Compute histogram features if flag is set
        if (hist_feat == True):
            hist_features = self.color_hist(feature_image, nbins=hist_bins)
            # 6) Append features to list
            img_features.append(hist_features)
        # 7) Compute HOG features if flag is set
        if (hog_feat == True):
            if (hog_channel == 'ALL'):
                hog_features = []
                hog_image = None
                for channel in range(feature_image.shape[2]):
                    hog_features.extend(
                        self.get_hog_features(feature_image[:, :, channel],
                                              orient,
                                              pix_per_cell,
                                              cell_per_block,
                                              vis=False,
                                              feature_vec=True))
            else:
                hog_features = self.get_hog_features(
                    feature_image[:, :, hog_channel],
                    orient,
                    pix_per_cell,
                    cell_per_block,
                    vis=False,
                    feature_vec=True)
            # 8) Append features to list
            img_features.append(hog_features)

        # 9) Return concatenated array of features
        return np.concatenate(img_features)

    def single_img_features_train(self,
                                  img,
                                  color_space='RGB',
                                  spatial_size=(32, 32),
                                  hist_bins=32,
                                  orient=9,
                                  pix_per_cell=8,
                                  cell_per_block=2,
                                  hog_channel=0,
                                  spatial_feat=True,
                                  hist_feat=True,
                                  hog_feat=True):
        # 1) Define an empty list to receive features
        img_features = []
        # 2) Apply color conversion if other than 'RGB'
        if (color_space != 'RGB'):
            if (color_space == 'HSV'):
                feature_image = cv2.cvtColor(img, cv2.COLOR_RGB2HSV)
            elif (color_space == 'LUV'):
                feature_image = cv2.cvtColor(img, cv2.COLOR_RGB2LUV)
            elif (color_space == 'HLS'):
                feature_image = cv2.cvtColor(img, cv2.COLOR_RGB2HLS)
            elif (color_space == 'YUV'):
                feature_image = cv2.cvtColor(img, cv2.COLOR_RGB2YUV)
            elif (color_space == 'YCrCb'):
                feature_image = cv2.cvtColor(img, cv2.COLOR_RGB2YCrCb)
        else:
            # feature_image = np.copy(img)
            feature_image = img.copy()

        # 3) Compute spatial features if flag is set
        if (spatial_feat == True):
            spatial_features = self.bin_spatial(feature_image,
                                                size=spatial_size)
            # 4) Append features to list
            img_features.append(spatial_features)
        # 5) Compute histogram features if flag is set
        if (hist_feat == True):
            hist_features = self.color_hist(feature_image, nbins=hist_bins)
            # 6) Append features to list
            img_features.append(hist_features)
        # 7) Compute HOG features if flag is set
        if (hog_feat == True):
            if (hog_channel == 'ALL'):
                hog_features = []
                hog_image = None
                for channel in range(feature_image.shape[2]):
                    hog_features.extend(
                        self.get_hog_features(feature_image[:, :, channel],
                                              orient,
                                              pix_per_cell,
                                              cell_per_block,
                                              vis=False,
                                              feature_vec=True))
            else:
                hog_features, hog_image = self.get_hog_features(
                    feature_image[:, :, hog_channel],
                    orient,
                    pix_per_cell,
                    cell_per_block,
                    vis=True,
                    feature_vec=True)
            # 8) Append features to list
            img_features.append(hog_features)

        # 9) Return concatenated array of features
        return np.concatenate(img_features), hog_image

    # Define a function you will pass an image
    # and the list of windows to be searched (output of slide_windows())
    def search_windows(self,
                       img,
                       windows,
                       clf,
                       scaler,
                       color_space='RGB',
                       spatial_size=(32, 32),
                       hist_bins=32,
                       hist_range=(0, 256),
                       orient=9,
                       pix_per_cell=8,
                       cell_per_block=2,
                       hog_channel=0,
                       spatial_feat=True,
                       hist_feat=True,
                       hog_feat=True):

        # 1) Create an empty list to receive positive detection windows
        on_windows = []
        # 2) Iterate over all windows in the list
        for window in windows:
            # 3) Extract the test window from original image
            test_img = cv2.resize(
                img[window[0][1]:window[1][1], window[0][0]:window[1][0]],
                (64, 64))
            # 4) Extract features for that window using single_img_features();
            #    it returns (features, hog_image) and the HOG visualisation is
            #    not needed here, so discard it
            features, _ = self.single_img_features(test_img,
                                                color_space=color_space,
                                                spatial_size=spatial_size,
                                                hist_bins=hist_bins,
                                                orient=orient,
                                                pix_per_cell=pix_per_cell,
                                                cell_per_block=cell_per_block,
                                                hog_channel=hog_channel,
                                                spatial_feat=spatial_feat,
                                                hist_feat=hist_feat,
                                                hog_feat=hog_feat)
            # 5) Scale extracted features to be fed to classifier
            test_features = scaler.transform(np.array(features).reshape(1, -1))
            # 6) Predict using your classifier
            prediction = clf.predict(test_features)
            # 7) If positive (prediction == 1) then save the window
            if prediction == 1:
                on_windows.append(window)
        # 8) Return windows for positive detections
        return on_windows

    # Define a function to draw bounding boxes
    def draw_boxes(self, img, bboxes, color=(0, 0, 255), thick=6):
        # Make a copy of the image
        imcopy = np.copy(img)
        # Iterate through the bounding boxes
        for bbox in bboxes:
            # Draw a rectangle given bbox coordinates
            cv2.rectangle(imcopy, bbox[0], bbox[1], color, thick)
        # Return the image copy with boxes drawn
        return imcopy
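
    # Usage sketch (added for illustration): `slide_windows()` referenced above is
    # assumed to return a list of ((x1, y1), (x2, y2)) window corners, `clf` to be a
    # trained LinearSVC, `scaler` the StandardScaler fitted on the training
    # features, and `detector` an instance of this class:
    #
    #     windows = detector.slide_windows(img)
    #     hot_windows = detector.search_windows(img, windows, clf, scaler,
    #                                           color_space='YCrCb',
    #                                           hog_channel='ALL')
    #     boxed = detector.draw_boxes(np.copy(img), hot_windows)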

    def convert_color(self, img, conv='RGB2YCrCb'):
        if conv == 'RGB2YCrCb':
            return cv2.cvtColor(img, cv2.COLOR_RGB2YCrCb)
        if conv == 'BGR2YCrCb':
            return cv2.cvtColor(img, cv2.COLOR_BGR2YCrCb)
        if conv == 'RGB2LUV':
            return cv2.cvtColor(img, cv2.COLOR_RGB2LUV)

    # Define a single function that can extract features using hog sub-sampling and make predictions
    def find_cars(self,
                  img,
                  ystart,
                  ystop,
                  scale,
                  svc,
                  X_scaler,
                  orient,
                  pix_per_cell,
                  cell_per_block,
                  spatial_size,
                  hist_bins,
                  window=64,
                  subsample=(64, 64)):
        img_boxes = []
        count = 0

        draw_img = np.copy(img)
        #Make a heatmap of zeros
        heatmap = np.zeros_like(img[:, :, 0])
        # Scale pixel values to [0, 1] before feature extraction
        img = img.astype(np.float32) / 255

        img_tosearch = img[ystart:ystop, :, :]
        ctrans_tosearch = self.convert_color(img_tosearch, conv='RGB2YCrCb')
        if scale != 1:
            imshape = ctrans_tosearch.shape
            ctrans_tosearch = cv2.resize(
                ctrans_tosearch,
                (np.int(imshape[1] / scale), np.int(imshape[0] / scale)))

        ch1 = ctrans_tosearch[:, :, 0]
        ch2 = ctrans_tosearch[:, :, 1]
        ch3 = ctrans_tosearch[:, :, 2]

        # Compute individual channel HOG features for the entire image
        hog1 = self.get_hog_features(ch1,
                                     orient,
                                     pix_per_cell,
                                     cell_per_block,
                                     feature_vec=False)
        hog2 = self.get_hog_features(ch2,
                                     orient,
                                     pix_per_cell,
                                     cell_per_block,
                                     feature_vec=False)
        hog3 = self.get_hog_features(ch3,
                                     orient,
                                     pix_per_cell,
                                     cell_per_block,
                                     feature_vec=False)
        # Define blocks and steps as above
        nxblocks = (ch1.shape[1] // pix_per_cell) - cell_per_block + 1
        nyblocks = (ch1.shape[0] // pix_per_cell) - cell_per_block + 1
        nfeat_per_block = orient * cell_per_block**2

        # 64 was the original sampling rate, with 8 cells and 8 pix per cell
        nblocks_per_window = (window // pix_per_cell) - cell_per_block + 1
        cells_per_step = 2  # Instead of overlap, define how many cells to step
        nxsteps = (nxblocks - nblocks_per_window) // cells_per_step
        nysteps = (nyblocks - nblocks_per_window) // cells_per_step

        for xb in range(nxsteps):
            for yb in range(nysteps):
                ypos = yb * cells_per_step
                xpos = xb * cells_per_step
                xleft = xpos * pix_per_cell
                ytop = ypos * pix_per_cell

                # Extract the image patch
                subimg = cv2.resize(
                    ctrans_tosearch[ytop:ytop + window, xleft:xleft + window],
                    subsample)

                # Extract HOG for this patch
                hog_feat1 = hog1[ypos:ypos + nblocks_per_window,
                                 xpos:xpos + nblocks_per_window].ravel()
                hog_feat2 = hog2[ypos:ypos + nblocks_per_window,
                                 xpos:xpos + nblocks_per_window].ravel()
                hog_feat3 = hog3[ypos:ypos + nblocks_per_window,
                                 xpos:xpos + nblocks_per_window].ravel()
                hog_features = np.hstack((hog_feat1, hog_feat2, hog_feat3))

                # Get color features
                spatial_features = self.bin_spatial(subimg, size=spatial_size)
                hist_features = self.color_hist(subimg, nbins=hist_bins)

                # Scale features and make a prediction
                test_features = X_scaler.transform(
                    np.hstack((spatial_features, hist_features,
                               hog_features)).reshape(1, -1))
                test_prediction = svc.predict(test_features)
                if test_prediction == 1:
                    xbox_left = np.int(xleft * scale)
                    ytop_draw = np.int(ytop * scale)
                    win_draw = np.int(window * scale)
                    cv2.rectangle(
                        draw_img, (xbox_left, ytop_draw + ystart),
                        (xbox_left + win_draw, ytop_draw + win_draw + ystart),
                        (0, 0, 255), 6)
                    img_boxes.append(((xbox_left, ytop_draw + ystart),
                                      (xbox_left + win_draw,
                                       ytop_draw + win_draw + ystart)))
                    heatmap[ytop_draw + ystart:ytop_draw + win_draw + ystart +
                            1, xbox_left:xbox_left + win_draw + 1] += 1

        return draw_img, heatmap

    def find_cars_nn(self,
                     img,
                     ystart,
                     ystop,
                     scale,
                     pix_per_cell,
                     cell_per_block,
                     window=64,
                     subsample=(64, 64)):
        img_boxes = []
        count = 0
        draw_img = np.copy(img)
        #Make a heatmap of zeros
        heatmap = np.zeros_like(img[:, :, 0])

        img_tosearch = img[ystart:ystop, self.xstart:self.xstop, :]

        imshape = img_tosearch.shape
        if scale != 1:
            img_tosearch = cv2.resize(
                img_tosearch,
                (np.int(imshape[1] / scale), np.int(imshape[0] / scale)))

        # Define blocks and steps as above
        nxblocks = (img_tosearch.shape[1] // pix_per_cell) - cell_per_block + 1
        nyblocks = (img_tosearch.shape[0] // pix_per_cell) - cell_per_block + 1

        # 64 was the original sampling rate, with 8 cells and 8 pix per cell
        # (the window size is now passed in as a parameter)
        nblocks_per_window = (window // pix_per_cell) - cell_per_block + 1
        cells_per_step = 2  # Instead of overlap, define how many cells to step
        nxsteps = (nxblocks - nblocks_per_window) // cells_per_step
        nysteps = (nyblocks - nblocks_per_window) // cells_per_step

        for xb in range(nxsteps):
            for yb in range(nysteps):
                ypos = yb * cells_per_step
                xpos = xb * cells_per_step
                xleft = xpos * pix_per_cell
                ytop = ypos * pix_per_cell

                # Extract the image patch
                subimg = cv2.resize(
                    img_tosearch[ytop:ytop + window, xleft:xleft + window],
                    subsample)
                count += 1

                test_prediction = self.cnn_model.predict(subimg[None, :, :, :],
                                                         batch_size=1)
                if (test_prediction[0][0] > 0.5):
                    test_prediction = 1
                else:
                    test_prediction = 0

                if (test_prediction == 1):
                    xbox_left = np.int(xleft * scale)
                    ytop_draw = np.int(ytop * scale)
                    win_draw = np.int(window * scale)
                    cv2.rectangle(
                        draw_img,
                        (xbox_left + self.xstart, ytop_draw + ystart),
                        (xbox_left + win_draw + self.xstart,
                         ytop_draw + win_draw + ystart), (0, 0, 255), 6)
                    heatmap[ytop_draw + ystart:ytop_draw + win_draw + ystart +
                            1, xbox_left + self.xstart:xbox_left + win_draw +
                            self.xstart + 1] += 1

        return draw_img, heatmap

    def add_heat(self, heatmap, bbox_list):
        # Iterate through list of bboxes
        for box in bbox_list:
            # Add += 1 for all pixels inside each bbox
            # Assuming each "box" takes the form ((x1, y1), (x2, y2))
            heatmap[box[0][1]:box[1][1], box[0][0]:box[1][0]] += 1

        # Return updated heatmap
        return heatmap

    def apply_threshold(self, heatmap, threshold):
        # Zero out pixels below the threshold
        heatmap[heatmap <= threshold] = 0
        # Return thresholded map
        return heatmap

    def draw_labeled_bboxes(self, img, labels):
        # Iterate through all detected cars
        for car_number in range(1, labels[1] + 1):
            # Find pixels with each car_number label value
            nonzero = (labels[0] == car_number).nonzero()
            # Identify x and y values of those pixels
            nonzeroy = np.array(nonzero[0])
            nonzerox = np.array(nonzero[1])
            # Define a bounding box based on min/max x and y
            bbox = ((np.min(nonzerox), np.min(nonzeroy)), (np.max(nonzerox),
                                                           np.max(nonzeroy)))
            # Draw the box on the image
            cv2.rectangle(img, bbox[0], bbox[1], (0, 0, 255), 6)
        # Return the image
        return img

    def visualise(self, fig, rows, cols, imgs, titles):
        for i, img in enumerate(imgs):
            plt.subplot(rows, cols, i + 1)
            img_dims = len(img.shape)
            if (img_dims < 3):
                plt.imshow(img, cmap='hot')
                plt.title(titles[i])
            else:
                plt.imshow(img)
                plt.title(titles[i])

    def visualisex(self, fig, rows, cols, imgs, titles):
        for i, img in enumerate(imgs):
            plt.subplot(rows, cols, i + 1)
            img_dims = len(img.shape)
            if (img_dims < 3):
                plt.imshow(img, cmap='gray')
                plt.title(titles[i])
            else:
                plt.imshow(img)
                plt.title(titles[i])

    def process_image(self, img):
        for scale in self.scales:
            if (self.cnn_predict):
                out_image, heatmap = self.find_cars_nn(
                    img,
                    self.ystart,
                    self.ystop,
                    scale,
                    self.pix_per_cell,
                    self.cell_per_block,
                )
            else:
                out_image, heatmap = self.find_cars(
                    img, self.ystart, self.ystop, scale, self.svc,
                    self.X_scaler, self.orient, self.pix_per_cell,
                    self.cell_per_block, self.spatial_size, self.hist_bins)
            self.heatmaps.append(heatmap)
        integrated_heat_maps = np.sum(self.heatmaps, axis=0)
        threshold_heat_map = self.apply_threshold(integrated_heat_maps,
                                                  self.heat_threshold)
        labels = label(threshold_heat_map)
        # Draw bounding boxes on a copy of the image
        draw_image = self.draw_labeled_bboxes(np.copy(img), labels)
        draw_image = self.lane_line_tracker.process_image(draw_image)
        return draw_image
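
# A minimal driver sketch (added for illustration, not part of the class above).
# It assumes the detector class is instantiated as `detector`, that `svc` is a
# trained LinearSVC, `X_scaler` a fitted StandardScaler, and `frame` an RGB image
# with pixel values in 0-255; the crop (400, 656) and the scales are illustrative
# values, and `label` is scipy.ndimage's connected-component labelling, as used
# in process_image() above.
import numpy as np
from scipy.ndimage import label

heat = np.zeros_like(frame[:, :, 0]).astype(np.float64)
for scale in (1.0, 1.5, 2.0):
    _, heatmap = detector.find_cars(frame, 400, 656, scale, svc, X_scaler,
                                    orient=9, pix_per_cell=8, cell_per_block=2,
                                    spatial_size=(32, 32), hist_bins=32)
    heat += heatmap
heat = detector.apply_threshold(heat, threshold=2)
labels = label(heat)
result = detector.draw_labeled_bboxes(np.copy(frame), labels)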
acc_log = round(logreg.score(X_train, Y_train) * 100, 2)

# In[87]:

acc_log

# In[88]:

from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC, LinearSVC

# In[89]:

linear_svm = LinearSVC()
linear_svm.fit(X_train, Y_train)
Y_pred = linear_svm.predict(X_test)
acc_linear_svm = round(linear_svm.score(X_train, Y_train) * 100, 2)

# In[98]:

acc_linear_svm

# In[99]:

random_forest = RandomForestClassifier(n_estimators=150)
random_forest.fit(X_train, Y_train)

Y_prediction = random_forest.predict(X_test)
random_forest.score(X_train, Y_train)
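
# Training-set accuracy (as computed above) is an optimistic estimate. As an added
# sanity check (not part of the original notebook cells), the same model can be
# cross-validated on X_train / Y_train:
from sklearn.model_selection import cross_val_score

cv_scores = cross_val_score(random_forest, X_train, Y_train, cv=5, scoring='accuracy')
print("CV accuracy: %.3f +/- %.3f" % (cv_scores.mean(), cv_scores.std()))
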
# Get the training and testing data.

X_train_vids, X_test_vids = classify_library.limited_input(training_dict, testing_dict, 30, 24)
X_train, Y_train = classify_library.make_FV_matrix(X_train_vids, training_output, class_index)
X_test, Y_test = classify_library.make_FV_matrix(X_test_vids, testing_output, class_index)
training_PCA = classify_library.limited_input1(training_dict, 1)


#Experiments with PCA
pca_dim = 500
pca = PCA(n_components=pca_dim)
pca.fit(X_train)
X_train_PCA = pca.transform(X_train)
X_test_PCA = pca.transform(X_test)
estimator = OneVsRestClassifier(LinearSVC(penalty='l2', random_state=0, C=100, loss='hinge'))
classifier = estimator.fit(X_train_PCA, Y_train)
metrics = classify_library.metric_scores(classifier, X_test_PCA, Y_test, verbose=True)
logging.info("mAP, accuracy_score, avg_Precision, avg_Recall")
logging.info(metrics)
logging.info("Complete Evaluation")

do_learning_curve = True
if do_learning_curve:
    X_full = np.vstack([X_train_PCA, X_test_PCA])
    Y_full = np.hstack([Y_train, Y_test])
    title= "Learning Curves (Linear SVM, C: %d, loss: %s, penalty: %s, PCA dim: %d)" % (100,'hinge','l2',pca_dim)
    #cv = cross_validation.ShuffleSplit(X_full.shape[0], n_iter=4,test_size=0.2, random_state=0)
    cv = ShuffleSplit(n_splits=5,test_size=0.5, random_state=0)
    estimator = OneVsRestClassifier(LinearSVC(random_state=0, C=100, loss='hinge', penalty='l2'))
    classify_library.plot_learning_curve(estimator, title, X_full, Y_full, (0.7, 1.01), cv=cv, n_jobs=1)
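
# classify_library.plot_learning_curve is project code not shown here. A minimal
# stand-in built on sklearn.model_selection.learning_curve might look like the
# sketch below (an assumption about its behaviour, not the project's implementation):
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import learning_curve

def plot_learning_curve_sketch(estimator, title, X, y, ylim=None, cv=None, n_jobs=1):
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=np.linspace(0.1, 1.0, 5))
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    plt.plot(train_sizes, train_scores.mean(axis=1), 'o-', label="Training score")
    plt.plot(train_sizes, test_scores.mean(axis=1), 'o-', label="Cross-validation score")
    plt.legend(loc="best")
    return plt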
Example #59
0
from sklearn.metrics import accuracy_score, cohen_kappa_score
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC, SVC

import util
from feature_extraction import create_fbcsp

args = util.parse_args()
loader = util.Loader(args.d, dataset_dir=args.d_path, labels_dir=args.l_path)
logger = util.get_logger("TEST__%s" % args.d)

# Configuration parameters
win_len = 2  # Window size
C = 0.04

estimator = LinearSVC(C=C)
n_components = 2  # Number of components in CSP
start_time = 3.5  # Window start time for training
train_sessions = [[1, 3], [1, 2, 3], [1, 2, 3], [3], [3], [3], [3], [3], [3]]
test_sessions = [4, 5]
freq_bands = [[10, 14, 20, 24], [10, 14, 20, 24], [8, 12, 18, 22],
              [8, 12, 10, 14], [22, 26, 26, 30], [10, 14, 12, 16],
              [12, 16, 18, 22], [8, 12, 10, 14], [18, 22, 22, 26]]

logger.info("CLASSIFICATION RESULTS")
logger.info("------------")
logger.info("- Dataset = BCI Competition IV - %s" % args.d)
logger.info("- Dataset dir = %s" % args.d_path)
logger.info("- Estimator = SVM (C=%.2f)" % C)
logger.info("- Window start time for training (at sec) = %s" % start_time)
logger.info("- Window length (secs) = %s" % win_len)
    def get_LOO_perfermance(self, fisher_mode, settings=None):
        analysis_scr = []
        predicted_score = False
        reduce_ratio = 1
        for seq_no in range(1, self.ddi_obj.total_number_of_sequences + 1):
            print seq_no
            logger.info('sequence number: ' + str(seq_no))
            if 1:
                print "SVM"
                (train_X_LOO,
                 train_y_LOO), (train_X_reduced, train_y_reduced), (
                     test_X, test_y
                 ) = self.ddi_obj.get_LOO_training_and_reduced_traing(
                     seq_no, reduce_ratio=reduce_ratio)
                standard_scaler = preprocessing.StandardScaler().fit(
                    train_X_reduced)
                scaled_train_X = standard_scaler.transform(train_X_reduced)
                scaled_test_X = standard_scaler.transform(test_X)
                Linear_SVC = LinearSVC(C=1, penalty="l2")
                Linear_SVC.fit(scaled_train_X, train_y_reduced)
                predicted_test_y = Linear_SVC.predict(scaled_test_X)
                isTest = True
                #new
                analysis_scr.append(
                    (self.ddi, seq_no, fisher_mode, 'SVM', isTest) + tuple(
                        performance_score(test_y,
                                          predicted_test_y).values()))  #new

                predicted_train_y = Linear_SVC.predict(scaled_train_X)
                isTest = False
                #new
                analysis_scr.append(
                    (self.ddi, seq_no, fisher_mode, 'SVM', isTest) + tuple(
                        performance_score(train_y_reduced,
                                          predicted_train_y).values()))
            # Deep learning part
            min_max_scaler = Precessing_Scaler_0_9()
            min_max_scaler.fit(train_X_reduced)
            X_train_pre_validation_minmax = min_max_scaler.transform(
                train_X_reduced)
            x_test_minmax = min_max_scaler.transform(test_X)
            pretraining_X_minmax = min_max_scaler.transform(train_X_LOO)
            x_train_minmax, x_validation_minmax, y_train_minmax, y_validation_minmax = train_test_split(
                X_train_pre_validation_minmax,
                train_y_reduced,
                test_size=0.4,
                random_state=42)
            finetune_lr = 1
            batch_size = 100
            pretraining_epochs = cal_epochs(1500,
                                            x_train_minmax,
                                            batch_size=batch_size)
            #pretrain_lr=0.001
            pretrain_lr = 0.001
            training_epochs = 1500
            hidden_layers_sizes = [100, 100]
            corruption_levels = [0, 0]
            if 1:
                print "direct deep learning"
                # direct deep learning
                sda = trainSda(x_train_minmax, y_train_minmax,
                             x_validation_minmax, y_validation_minmax ,
                             x_test_minmax, test_y,
                             hidden_layers_sizes = hidden_layers_sizes, corruption_levels = corruption_levels, batch_size = batch_size , \
                             training_epochs = training_epochs, pretraining_epochs = pretraining_epochs,
                             pretrain_lr = pretrain_lr, finetune_lr=finetune_lr
                 )
                print 'hidden_layers_sizes:', hidden_layers_sizes
                print 'corruption_levels:', corruption_levels
                training_predicted = sda.predict(x_train_minmax)
                y_train = y_train_minmax
                isTest = False
                #new
                analysis_scr.append((
                    self.ddi, seq_no, fisher_mode, 'DL', isTest
                ) + tuple(
                    performance_score(y_train, training_predicted).values()))

                test_predicted = sda.predict(x_test_minmax)
                y_test = test_y
                isTest = True
                #new
                analysis_scr.append(
                    (self.ddi, seq_no, fisher_mode, 'DL', isTest) +
                    tuple(performance_score(y_test, test_predicted).values()))

            if 0:
                # deep learning using unlabeled data for pretraining
                print 'deep learning with unlabeled data'
                pretraining_epochs_for_reduced = cal_epochs(
                    1500, pretraining_X_minmax, batch_size=batch_size)
                sda_unlabel = trainSda(x_train_minmax, y_train_minmax,
                             x_validation_minmax, y_validation_minmax ,
                             x_test_minmax, test_y,
                             pretraining_X_minmax = pretraining_X_minmax,
                             hidden_layers_sizes = hidden_layers_sizes, corruption_levels = corruption_levels, batch_size = batch_size , \
                             training_epochs = training_epochs, pretraining_epochs = pretraining_epochs_for_reduced,
                             pretrain_lr = pretrain_lr, finetune_lr=finetune_lr
                 )
                print 'hidden_layers_sizes:', hidden_layers_sizes
                print 'corruption_levels:', corruption_levels
                training_predicted = sda_unlabel.predict(x_train_minmax)
                y_train = y_train_minmax
                isTest = False
                #new
                analysis_scr.append(
                    (self.ddi, seq_no, fisher_mode, 'DL_U', isTest) + tuple(
                        performance_score(y_train, training_predicted,
                                          predicted_score).values()))

                test_predicted = sda_unlabel.predict(x_test_minmax)
                y_test = test_y

                isTest = True
                #new
                analysis_scr.append(
                    (self.ddi, seq_no, fisher_mode, 'DL_U', isTest) + tuple(
                        performance_score(y_test, test_predicted,
                                          predicted_score).values()))
            if 0:
                # deep learning using split network
                print 'deep learning using split network'
                # get the new representation for A set. first 784-D
                pretraining_epochs = 1500
                hidden_layers_sizes = [50, 50]
                corruption_levels = [0, 0]

                x = x_train_minmax[:, :x_train_minmax.shape[1] / 2]
                print "original shape for A", x.shape
                a_MAE_A = train_a_MultipleAEs(
                    x,
                    pretraining_epochs=pretraining_epochs,
                    pretrain_lr=pretrain_lr,
                    batch_size=batch_size,
                    hidden_layers_sizes=hidden_layers_sizes,
                    corruption_levels=corruption_levels)
                new_x_train_minmax_A = a_MAE_A.transform(
                    x_train_minmax[:, :x_train_minmax.shape[1] / 2])
                x = x_train_minmax[:, x_train_minmax.shape[1] / 2:]

                print "original shape for B", x.shape
                a_MAE_B = train_a_MultipleAEs(
                    x,
                    pretraining_epochs=pretraining_epochs,
                    pretrain_lr=pretrain_lr,
                    batch_size=batch_size,
                    hidden_layers_sizes=hidden_layers_sizes,
                    corruption_levels=corruption_levels)
                new_x_train_minmax_B = a_MAE_B.transform(
                    x_train_minmax[:, x_train_minmax.shape[1] / 2:])

                new_x_test_minmax_A = a_MAE_A.transform(
                    x_test_minmax[:, :x_test_minmax.shape[1] / 2])
                new_x_test_minmax_B = a_MAE_B.transform(
                    x_test_minmax[:, x_test_minmax.shape[1] / 2:])
                new_x_validation_minmax_A = a_MAE_A.transform(
                    x_validation_minmax[:, :x_validation_minmax.shape[1] / 2])
                new_x_validation_minmax_B = a_MAE_B.transform(
                    x_validation_minmax[:, x_validation_minmax.shape[1] / 2:])
                new_x_train_minmax_whole = np.hstack(
                    (new_x_train_minmax_A, new_x_train_minmax_B))
                new_x_test_minmax_whole = np.hstack(
                    (new_x_test_minmax_A, new_x_test_minmax_B))
                new_x_validationt_minmax_whole = np.hstack(
                    (new_x_validation_minmax_A, new_x_validation_minmax_B))

                finetune_lr = 1
                batch_size = 100
                pretraining_epochs = cal_epochs(1500,
                                                x_train_minmax,
                                                batch_size=batch_size)
                #pretrain_lr=0.001
                pretrain_lr = 0.001
                training_epochs = 1500
                hidden_layers_sizes = [100, 100]
                corruption_levels = [0, 0]

                sda_transformed = trainSda(new_x_train_minmax_whole, y_train_minmax,
                     new_x_validationt_minmax_whole, y_validation_minmax ,
                     new_x_test_minmax_whole, y_test,
                     hidden_layers_sizes = hidden_layers_sizes, corruption_levels = corruption_levels, batch_size = batch_size , \
                     training_epochs = training_epochs, pretraining_epochs = pretraining_epochs,
                     pretrain_lr = pretrain_lr, finetune_lr=finetune_lr
                     )

                print 'hidden_layers_sizes:', hidden_layers_sizes
                print 'corruption_levels:', corruption_levels
                training_predicted = sda_transformed.predict(
                    new_x_train_minmax_whole)
                y_train = y_train_minmax

                isTest = False
                #new
                analysis_scr.append(
                    (self.ddi, seq_no, fisher_mode, 'DL_S', isTest) + tuple(
                        performance_score(y_train, training_predicted,
                                          predicted_score).values()))

                test_predicted = sda_transformed.predict(
                    new_x_test_minmax_whole)
                y_test = test_y

                isTest = True
                #new
                analysis_scr.append(
                    (self.ddi, seq_no, fisher_mode, 'DL_S', isTest) + tuple(
                        performance_score(y_test, test_predicted,
                                          predicted_score).values()))

        report_name = filename + '_' + '_'.join(map(
            str, hidden_layers_sizes)) + '_' + str(pretrain_lr) + '_' + str(
                finetune_lr) + '_' + str(reduce_ratio) + '_' + str(
                    training_epochs) + '_' + current_date
        saveAsCsv(predicted_score, report_name,
                  performance_score(y_test, test_predicted, predicted_score),
                  analysis_scr)
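
# performance_score() is defined elsewhere in this project. Judging from how it is
# used above (called as performance_score(y_true, y_pred[, predicted_score]) and
# unpacked with .values()), it returns a dict of metrics. A plausible minimal
# stand-in, given as an assumption rather than the original helper:
from collections import OrderedDict
from sklearn.metrics import (accuracy_score, f1_score, matthews_corrcoef,
                             precision_score, recall_score)

def performance_score_sketch(y_true, y_pred, predicted_score=False):
    # predicted_score is accepted for signature compatibility but ignored here
    scores = OrderedDict()
    scores['accuracy'] = accuracy_score(y_true, y_pred)
    scores['precision'] = precision_score(y_true, y_pred)
    scores['recall'] = recall_score(y_true, y_pred)
    scores['f1'] = f1_score(y_true, y_pred)
    scores['mcc'] = matthews_corrcoef(y_true, y_pred)
    return scores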