Code Example #1
class svm():
    def __init__(self):
        # self.clf = SVC(kernel='rbf')
        self.clf = NuSVC()

    def train(self, inputs):
        # Parameters:
        #     inputs: An array of Input objects containing input vectors along with their corresponding labels.

        # Creates lists to use for fitting model
        X = []
        Y = []
        for data in inputs:
            X.append((data.x/np.linalg.norm(data.x)))
            Y.append(data.y)
        # Fit model
        self.clf.fit(X, Y)

    def predict(self, input):
        # Parameters:
        #     input: An Input object containing an input vector to be used for predicting a label.

        # Normalize the input vector before predicting; accept either an
        # Input object or a raw vector
        if isinstance(input, Input):
            x = input.x/np.linalg.norm(input.x)
        else:
            x = input/np.linalg.norm(input)
        return self.clf.predict(x)
Code Example #2
File: imgpred.py Project: cameronphchen/pHA
def predict_loo(transformed_data, args, trn_label ,tst_label):
  print 'imgpred loo',
  print args.loo,
  sys.stdout.flush()

  (ndim, nsample , nsubjs) = transformed_data.shape

  loo = args.loo
  loo_idx = range(nsubjs)
  loo_idx.remove(loo)

  #tst_data = np.zeros(shape = (ndim,nsample))
  trn_data = np.zeros(shape = (ndim,(nsubjs-1)*nsample))
  # image stimulus prediction
  # tst_data : ndim x nsample
  tst_data = transformed_data[:,:,loo]

  for m in range(len(loo_idx)):
    trn_data[:,m*nsample:(m+1)*nsample] = transformed_data[:,:,loo_idx[m]]
  
  # scikit-learn svm for classification
  clf = NuSVC(nu=0.5, kernel = 'linear')
  clf.fit(trn_data.T, trn_label)
  pred_label = clf.predict(tst_data.T)
      
  accu = sum(pred_label == tst_label)/float(len(pred_label))

  return accu
Code Example #3
File: lib.py Project: richelite/classify
class RbfSVM:
	def __init__(self):
		self.clf = NuSVC(nu=0.7, kernel='rbf', degree=3, gamma=0.0, coef0=0.0, shrinking=True, probability=False, tol=0.001, cache_size=200, verbose=False, max_iter=-1)
		self.pattern ='(?u)\\b[A-Za-z]{3,}'
		self.tfidf = TfidfVectorizer(sublinear_tf=False, use_idf=True, smooth_idf=True, stop_words='english', token_pattern=self.pattern, ngram_range=(1, 3))
	def train(self,fileName):
		print "RbfSVM Classifier is being trained"
		table = pandas.read_table(fileName, sep="\t", names=["cat", "message"])
		X_train = self.tfidf.fit_transform(table.message)
		Y_train = []
		for item in table.cat:
			Y_train.append(int(item)) 
		self.clf.fit(X_train, Y_train)
		print "RbfSVM Classifier has been trained"

	def classify(self,cFileName, rFileName):
		table = pandas.read_table(cFileName, names=["message"])
		X_test = self.tfidf.transform(table.message)
		print "Data have been classified"
		with open(rFileName,'w') as f:
			for item in self.clf.predict(X_test).astype(str):
				f.write(item+'\n')

	def validate(self,fileName):
		table = pandas.read_table(fileName, sep="\t", names=["cat", "message"])
		X_validate = self.tfidf.transform(table.message)
		Y_validated = self.clf.predict(X_validate).astype(str)
		totalNum = len(table.cat)
		errorCount = 0
		for i in range(0,totalNum):
			if int(table.cat[i])!=int(Y_validated[i]):
				errorCount += 1
		print "Data have been validated! Precision={}".format((totalNum-errorCount)/float(totalNum))
Code Example #4
File: imgpred.py Project: cameronphchen/pHA
def predict(transformed_data, args, trn_label ,tst_label):
  print 'imgpred',
  sys.stdout.flush()
  
  (ndim, nsample , nsubjs) = transformed_data.shape
  accu = np.zeros(shape=nsubjs)

  tst_data = np.zeros(shape = (ndim,nsample))
  trn_data = np.zeros(shape = (ndim,(nsubjs-1)*nsample))
  # image stimulus prediction 
  for tst_subj in range(nsubjs):
    tst_data = transformed_data[:,:,tst_subj]

    trn_subj = range(nsubjs)
    trn_subj.remove(tst_subj)

    for m in range(nsubjs-1):
      trn_data[:,m*nsample:(m+1)*nsample] = transformed_data[:,:,trn_subj[m]]

    # scikit-learn svm for classification
    #clf = NuSVC(nu=0.5, kernel = 'linear')
    clf = NuSVC(nu=0.5, kernel = 'linear')
    clf.fit(trn_data.T, trn_label)

    pred_label = clf.predict(tst_data.T)
      
    accu[tst_subj] = sum(pred_label == tst_label)/float(len(pred_label))

  return accu
Code Example #5
File: svm.py Project: vishnu-locket/orange3
 def fit(self, X, Y, W):
     clf = NuSVC(nu=self.nu, kernel=self.kernel, degree=self.degree,
                 gamma=self.gamma, coef0=self.coef0, shrinking=self.shrinking,
                 probability=self.probability, tol=self.tol, cache_size=self.cache_size,
                 max_iter=self.max_iter)
     if W is not None:
         return NuSVMClassifier(clf.fit(X, Y.reshape(-1), W.reshape(-1)))
     return NuSVMClassifier(clf.fit(X, Y.reshape(-1)))
Code Example #6
File: Classifier.py Project: yqji/MySK
 def SVM_nuSVC(self):
     clf = NuSVC(nu=0.5, kernel=b'rbf', degree=3, gamma='auto', coef0=0.0,
                 shrinking=True, probability=False, tol=0.001,
                 cache_size=200, class_weight=None, verbose=False,
                 max_iter=-1, decision_function_shape=None,
                 random_state=None)
     print('nuSVC Classifier is fitting...')
     clf.fit(self.X_train, self.y_train)
     return clf
Code Example #7
def svc_nu(X_train, categories,X_test, test_categories):
    from sklearn.svm import NuSVC

    svm_nu_classifier = NuSVC().fit(X_train, categories)
    y_svm_predicted = svm_nu_classifier.predict(X_test)
    print '\n Here is the classification report for support vector machine classifier:'
    print metrics.classification_report(test_categories, y_svm_predicted)



    ''''
Code Example #8
File: Classify.py Project: tbs1980/Kaggle_DecMeg2014
def NonLinearSupportVectorMachine(x_train, y_train, x_cv, y_cv):
	"""
	Non Linear Support Vector Machine
	"""
	#print "Classifier: Support Vector Machine"
	clfr = NuSVC(probability=False)
	clfr.fit(x_train, y_train)
	#print 'Accuracy in training set: %f' % clfr.score(x_train, y_train)
	#if y_cv != None:
		#print 'Accuracy in cv set: %f' % clfr.score(x_cv, y_cv)
	
	return clfr
Code Example #9
File: days_work.py Project: JakeMick/kaggle
    def fit_model_7(self,toWrite=False):
        model = NuSVC(probability=True,kernel='linear')

        for data in self.cv_data:
            X_train, X_test, Y_train, Y_test = data
            model.fit(X_train,Y_train)
            pred = model.predict_proba(X_test)[:,1]
            print("Model 7 score %f" % (logloss(Y_test,pred),))

        if toWrite:
            f2 = open('model7/model.pkl','w')
            pickle.dump(model,f2)
            f2.close()
Code Example #10
def testing():
    plot_x = range(1, 10)
    plot_y = []
    for i in xrange(1,10):
        vals = []
        for _ in xrange(20):
            train_data, validation_data, train_labels, validation_labels = split_data()
            clf = NuSVC(**get_kwargs(i))
            clf.fit(train_data, train_labels)
            vals.append(check_fit(clf.predict(validation_data), validation_labels))
        plot_y.append(np.mean(vals))

    plot_results(plot_x, plot_y)
Code Example #11
File: NLSVM.py Project: Cypher42/AutoBuffett
 def __init__(self,adaption = 0.5,transactionCost = 0.001, recurrence=35, realy_recurrent=False, w_size=20,label_par='r'):
     self.learner = NuSVC()
     self.transactionCost = transactionCost
     self.adaption = adaption
     
     #size of each training batch
     self.batch_size = 200 * (recurrence)
     #size of the sliding window for the Sharpe ratio
     self.window_size = w_size * self.batch_size
     
     # the data matrix of a single batch
     # Data-Vector = r_1, ... r_n, prediction_t-1
     # with r_n := r_n - r_n-1
     self.returns = list()
     self.labels = list()
     self.decisions = [0]
     self.weighted_returns = list()
     
     #self.rng = rnj.Learner()
     
     self.recurrence = recurrence
     self.last_decision = 0
     self.ready = False
     self.tstep = 0
     self.recurrent = realy_recurrent
     self.prices = list()
     self.label_par = label_par
     
     self.sharpeA_old = 1
     self.sharpeB_old = 1
     return
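The comments in this constructor describe the per-batch layout: each data vector is a window of one-step differences r_1 ... r_n followed by the previous prediction. A minimal sketch of that construction, assuming r_n means the difference of consecutive prices (the toy numbers and names below are illustrative, not taken from the AutoBuffett project):

import numpy as np

prices = np.array([100.0, 101.5, 101.0, 102.2])
returns = np.diff(prices)                    # r_n := price_n - price_{n-1}
prev_decision = 1                            # prediction_{t-1}, as described above
data_vector = np.append(returns, prev_decision)
print(data_vector)                           # [ 1.5 -0.5  1.2  1. ]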
Code Example #12
File: SVM.py Project: Raphael-De-Wang/Semestre02
def test_nusvc():    
    # print '==== NuSVC ===='
    # print 'Training...'
    clf = NuSVC()
    clf = clf.fit( train_data, train_labels )
    
    # print 'Predicting...'
    output = clf.predict(test_data).astype(int)
    
    predictions_file = open("CLF.csv", "wb")
    open_file_object = csv.writer(predictions_file)
    open_file_object.writerow(["PassengerId","Survived"])
    open_file_object.writerows(zip(test_id, output))
    predictions_file.close()
    # print 'Done.'
    print 'NuSVC : '
Code Example #13
File: NLSVC.py Project: Cypher42/AutoBuffett
 def __init__(self, recurrence=30, w_size=20,hybrid = False):
     self.learner = NuSVC()
     
     #size of each training batch
     self.batch_size = w_size * (recurrence)
     #size of the sliding window for the Sharpe ratio
     self.window_size = 5 * self.batch_size
     
     #true if part of a hybrid learner
     self.hybrid = hybrid
     
     # the data matrix of a single batch
     # Data-Vector = r_1, ... r_n
     # with r_n := r_n - r_n-1
     self.returns = list()
     #training data for experimental approach
     self.train_dat = list()
     self.labels = list()
     self.decisions = list()
     self.recurrence = recurrence
     
     self.last_decision = 0
     self.ready = False
     self.tstep = 0
     self.prices = list()
     return
Code Example #14
def nu_support_vector_machines(corpus, documents_training, documents_test, words_features, kernel, nu):
    """
    Another implementation of Support Vector Machines algorithm.
    :param corpus:
    :param documents_training:
    :param documents_test:
    :param words_features:
    :param kernel:
    :param nu:
    :return:
    """

    print
    print "----- nu-Support Vector Machines algorithm ------"
    print "Creating Training Vectors..."
    categories = util_classify.get_categories(corpus)  

    array_vector_training = []
    array_categories = []
    for (id, original_category, annotations) in documents_training:
        array_vector_training.append(util_classify.transform_document_in_vector(annotations, words_features, corpus))
        array_categories.append(util_classify.get_categories(corpus).index(original_category))    
        
    print "Training the algorithm..."
    classifier = NuSVC(nu=nu, kernel=kernel)

    X_train_features = []
    y_train_categories = []
    # Train all
    for (id, original_category, annotations) in documents_training:
        X_train_features.append(util_classify.transform_document_in_vector(annotations, words_features, corpus))
        y_train_categories.append(original_category)

    classifier.fit(np.array(X_train_features), np.array(y_train_categories))    

    print "Calculating metrics..."
    estimated_categories = []
    original_categories = []

    for (id, cat_original, annotations) in documents_test:
        cat_estimated = classifier.predict(np.array((util_classify.transform_document_in_vector(annotations, words_features, corpus))))
        estimated_categories.append(categories.index(cat_estimated))
        original_categories.append(categories.index(cat_original))

    return original_categories, estimated_categories
Code Example #15
class Classifier:
	def __init__(self, objective_data, subjective_data):
		OBJECTIVE = 0
		SUBJECTIVE = 1

		self.objective_data = objective_data
		self.subjective_data = subjective_data

		self.text = objective_data + subjective_data

		self.labels = [OBJECTIVE for i in objective_data] + [SUBJECTIVE for i in subjective_data]

		tuple_list = zip(self.text, self.labels)

		random.shuffle(tuple_list)

		self.text = [x for x,y in tuple_list]
		self.labels = [y for x,y in tuple_list]  # keep labels aligned with the shuffled text

		self.count_vectorizer = CountVectorizer(stop_words="english", min_df=3)

		# count vectorizer and specific classifier that will be used

		self.counts = self.count_vectorizer.fit_transform(self.text)
		self.classifier = None

		self.tf_transformer = TfidfTransformer(use_idf=True)
		self.frequencies = self.tf_transformer.fit_transform(self.counts)

	def multinomialNB(self):
		self.classifier = MultinomialNB(alpha=.001)
		self.classifier.fit(self.frequencies, self.labels)

	def predict(self, examples):
		example_counts = self.count_vectorizer.transform(examples)
		example_tf = self.tf_transformer.transform(example_counts)
		predictions = self.classifier.predict(example_tf)
		return predictions

	def linearSVC(self):
		self.classifier = LinearSVC()
		self.classifier.fit(self.frequencies, self.labels)

	def nuSVC(self):
		self.classifier = NuSVC()
		self.classifier.fit(self.frequencies, self.labels)

	def accurracy(self, text, labels):
		prediction = self.predict(text)
		accurracy = 0
		for i in range(len(prediction)):
			if prediction[i] == labels[i]:
				accurracy += 1
		return accurracy / float(len(prediction))

	def f1(self, text, actual):
		prediction = self.predict(text)
		return f1_score(actual, prediction)
Code Example #16
def sigmoidNuSVC():
    maxRandomPerformance = []
    for gamma in xrange(1,200):
        clf = NuSVC(kernel="sigmoid",gamma=gamma)
        clf.fit(trainData, trainLabel)
        maxRandomPerformance.append(clf.score(validationData, validationLabel))

    gammaValue = maxRandomPerformance.index(max(maxRandomPerformance)) + 1
    clfFinal = NuSVC(kernel='sigmoid', gamma=gammaValue)
    clfFinal.fit(trainData,trainLabel)
    score = clfFinal.score(testData,testLabel)

    guideToGraph['Sigmoid Nu-SVC'] = score
Code Example #17
def polyNuSVC():
    maxRandomPerformance = []

    for deg in xrange(1,200):
        clf = NuSVC(kernel="poly",degree=deg)
        clf.fit(trainData, trainLabel)
        maxRandomPerformance.append(clf.score(validationData, validationLabel))

    degreeValue = maxRandomPerformance.index(max(maxRandomPerformance)) + 1
    clfFinal = NuSVC(kernel='poly', degree=degreeValue)
    clfFinal.fit(trainData,trainLabel)
    score = clfFinal.score(testData,testLabel)

    guideToGraph['Polynomial Nu-SVC'] = score
Code Example #18
File: multiclass2.py Project: d-giles/KeplerML
                #print i, y_pred[i], y_test[i]
                    n = n+1
                    accuracy[m] = accuracy[m]+1
                box[y_test[i]-1,y_pred[i]-1] = box[y_test[i]-1,y_pred[i]-1] + 1
                #posterior[m] =  knc.predict_proba(X_test)
    
        print "Error-Correcting Output Code: ", np.mean(accuracy)/0.72, np.std(accuracy)/0.72
        print k
        for i in range(0,6):
            for j in range(0,6):
                print '{:5.2f} '.format(box[i,j]/100.0),
            print
    
    #end GBC(n_estimators=60, max_depth=3)

    nusvc = NuSVC(nu=0.66,degree=1)

    n=0
    box = np.zeros([6,6])
    y_pred = nusvc.fit(X_train, y_train).predict(X_test)        
    for i in range(0,len(y_pred)):
        if y_pred[i] == y_test[i]:
            n = n+1
        box[y_test[i]-1,y_pred[i]-1] = box[y_test[i]-1,y_pred[i]-1] + 1
    print "NuSVC, nu=0.66, degree=1: ",n/0.72
    for i in range(0,6):
        for j in range(0,6):
            print '{:5.0f} '.format(box[i,j]),
        print

    n=0
Code Example #19
File: lib.py Project: richelite/classify
	def __init__(self):
		self.clf = NuSVC(nu=0.7, kernel='rbf', degree=3, gamma=0.0, coef0=0.0, shrinking=True, probability=False, tol=0.001, cache_size=200, verbose=False, max_iter=-1)
		self.pattern ='(?u)\\b[A-Za-z]{3,}'
		self.tfidf = TfidfVectorizer(sublinear_tf=False, use_idf=True, smooth_idf=True, stop_words='english', token_pattern=self.pattern, ngram_range=(1, 3))
Code Example #20
def final_run():
    clf = NuSVC()
    clf.fit(train, trainLabels)
    predict_labels = clf.predict(test)
    write_out(predict_labels)
Code Example #21
def execute_trade(context, data):
    """
    Execute orders according to our schedule_function() timing. 
    """
    prices = data.history(assets=context.stocks,
                          bar_count=context.historical_bars,
                          frequency='1d',
                          fields='price')

    for stock in context.stocks:

        try:

            price_hist = data.history(stock, 'price', 50, '1d')
            ma1 = price_hist.mean()
            price_hist = data.history(stock, 'price', 200, '1d')
            ma2 = price_hist.mean()

            start_bar = context.feature_window
            price_list = prices[stock].tolist()

            X = []  # list of feature sets
            y = []  # list of labels, one for each feature set

            bar = start_bar

            # feature creation
            while bar < len(price_list) - 1:
                try:

                    end_price = price_list[bar + 1]  # tomorrow's price
                    begin_price = price_list[bar]  # today's price

                    pricing_list = []
                    xx = 0
                    for _ in range(context.feature_window):
                        price = price_list[bar - (context.feature_window - xx)]
                        pricing_list.append(price)
                        xx += 1

                    # get the % change in daily prices of last 10 days
                    features = np.around(
                        np.diff(pricing_list) / pricing_list[:-1] * 100.0, 1)

                    # if tomorrow's price is more than today's price
                    # label the feature set (% change in last 10 days)
                    # a 1 (strong outlook, buy) else -1 (weak outlook, sell)
                    if end_price > begin_price:
                        label = 1
                    else:
                        label = -1

                    bar += 1
                    X.append(features)
                    y.append(label)
                    # print(features)

                except Exception as e:
                    bar += 1
                    print(('feature creation', str(e)))

            clf1 = RandomForestClassifier()
            clf2 = LinearSVC()
            clf3 = NuSVC()
            clf4 = LogisticRegression()

            # now we get the prices and features for the last 10 days
            last_prices = price_list[-context.feature_window:]
            current_features = np.around(
                np.diff(last_prices) / last_prices[:-1] * 100.0, 1)

            # append the last 10 days feature set
            # scale the data (mean becomes zero, SD becomes 1), necessary for the ML algos to work well
            X.append(current_features)
            X = preprocessing.scale(X)

            # the current feature will be the last SCALED feature set
            # X will be all the feature sets, excluding the most recent one,
            # this is the feature set which we will be using to predict
            current_features = X[-1]
            X = X[:-1]

            # this is where the magic happens:
            # we will be training our algorithm here to see the correlation between
            # the features and the labels (this feature set, was a buy etc.)
            # the Most CPU intensive part of the program
            # sklearn documentation says its time complexity is quadratic in the number of samples,
            # which means it is difficult to scale to datasets beyond a couple of 10,000 samples
            # Bonus: How the documentation describes this function: Build a forest of trees from the training set (X, y).
            # we can also provide a sample_weight, if some samples are more important than others
            clf1.fit(X, y)
            clf2.fit(X, y)
            clf3.fit(X, y)
            clf4.fit(X, y)

            # then each classifier predicts what our current feature set should be
            # labelled: 1 (buy) or -1 (sell); [0] picks the actual prediction out of
            # the array of labels that predict() returns, one per sample
            p1 = clf1.predict(current_features)[0]
            p2 = clf2.predict(current_features)[0]
            p3 = clf3.predict(current_features)[0]
            p4 = clf4.predict(current_features)[0]

            # Counter('abracadabra').most_common(3)
            #   >>[('a', 5), ('r', 2), ('b', 2)]
            # if all the classifiers agree on the same prediction we will either buy or sell the stock
            #if there is no consensus, we do nothing

            if Counter([p1, p2, p3, p4]).most_common(1)[0][1] >= 4:
                p = Counter([p1, p2, p3, p4]).most_common(1)[0][0]

            else:
                p = 0

            print(('ma1_d: ', ma1))
            print(('ma2_d :', ma2))
            print(('p1 :', p1))
            print(('p2 :', p2))
            print(('p3 :', p3))
            print(('p4 :', p4))
            print(('Prediction', p))

            # Based on the voted prediction and the momentum of the moving averages
            if p == 1 and ma1 > ma2:
                order_target_percent(stock, 0.11)
            elif p == -1 and ma1 < ma2:
                order_target_percent(stock, -0.11)
            # alternatively we could just do:
            # order_target_percent(stock,(p*0.11))

        except Exception as e:
            print(str(e))

    record('ma1', ma1)
    record('ma2', ma2)
    record('Leverage', context.account.leverage)
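Two pieces of the reasoning in the example above are easy to check in isolation: the features are the rounded day-over-day percentage changes of the price window, and a trade is only placed when every classifier votes the same way. A minimal, self-contained sketch of both ideas (toy numbers, not from the original strategy):

from collections import Counter
import numpy as np

# % change between consecutive days, rounded to one decimal as in the code above
price_window = [10.0, 10.5, 10.2, 10.8]
features = np.around(np.diff(price_window) / np.array(price_window[:-1]) * 100.0, 1)
print(features)                      # [ 5.  -2.9  5.9]

def consensus(predictions, required=4):
    # hypothetical helper: return the majority label only if enough classifiers agree,
    # otherwise 0 (do nothing), mirroring the Counter logic above
    label, count = Counter(predictions).most_common(1)[0]
    return label if count >= required else 0

print(consensus([1, 1, 1, 1]))       # 1  -> unanimous buy signal
print(consensus([1, -1, 1, 1]))      # 0  -> no consensus, stay out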
Code Example #22
    # NuSVC has some _xfail_checks. They should be skipped regardless of
    # strict_mode
    with pytest.warns(SkipTestWarning,
                      match='fails for the decision_function method'):
        check_estimator(NuSVC(), strict_mode=True)
    # When strict mode is off, check_n_features_in is skipped along with the
    # rest of the xfail_checks
    with pytest.warns(SkipTestWarning, match=msg_check_n_features_in):
        check_estimator(NuSVC(), strict_mode=False)

    # MyNMF will fail check_fit_non_negative() in strict mode because it yields
    # a bad error message
    with pytest.raises(AssertionError,
                       match="The error message should contain"):
        check_estimator(MyNMFWithBadErrorMessage(), strict_mode=True)
    # However, it should pass the test suite in non-strict mode because when
    # strict mode is off, check_fit_non_negative() will not check the exact
    # error message. (We still assert that the warning from
    # check_n_features_in is raised)
    with pytest.warns(SkipTestWarning, match=msg_check_n_features_in):
        check_estimator(MyNMFWithBadErrorMessage(), strict_mode=False)


@parametrize_with_checks(
    [LogisticRegression(),
     NuSVC(), MyNMFWithBadErrorMessage()],
    strict_mode=False)
def test_strict_mode_parametrize_with_checks(estimator, check):
    # Ideally we should assert that the strict checks are Xfailed...
    check(estimator)
Code Example #23
File: nusvc.py Project: d-giles/KeplerML
     y_train = labels[100:172,i]
     X_test = sample2
     y_test = labels[272:,i]
 else:
     X_train = training
     y_train = labels[:172,i]
     X_test = sampletest
     y_test = labels[172:,i]
 #best case: 67, 1
 posterior = np.empty([100,72,6])
 for j in range(1,67):
     for k in range(1,2):
         box = np.zeros([6,6])
         accuracy = np.zeros(72)
         for m in range(0,10):
             nsvc = NuSVC(nu=j/100.0, degree=k)
             nsvc.fit(X_train, y_train)
             y_pred = nsvc.predict(X_test)
             
             n=0
             for i in range(0,len(y_pred)):
                 if y_pred[i] == y_test[i]:
             #print i, y_pred[i], y_test[i]
                     n = n+1
                     accuracy[i] = accuracy[i]+1
                 box[y_test[i]-1,y_pred[i]-1] = box[y_test[i]-1,y_pred[i]-1] + 1
             #posterior[m] =  knc.predict_proba(X_test)
         #print j, k, np.mean(accuracy)/0.72, np.std(accuracy)/0.72
         print j, k, sum(accuracy[0:8])/8.0, sum(accuracy[8:18])/10.0, sum(accuracy[18:30])/12.0, sum(accuracy[56:72])/16.0, sum(accuracy[30:43])/13.0, sum(accuracy[43:56])/13.0, sum(accuracy)/72.0
     '''
 means = np.empty([72,6])
Code Example #24
File: NuSVC_5_sigmoid.py Project: zashin-AI/hyunmin
print(y.shape)  # (4536,)

# Preprocessing
x_train, x_test, y_train, y_test = train_test_split(x, y, shuffle=True, test_size=0.2, random_state=42)
print(x_train.shape)    # (3628, 110336)
print(x_test.shape)     # (908, 110336)
print(y_train.shape)    # (3628,)
print(y_test.shape)     # (908,)

scaler = StandardScaler()
scaler.fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

# Build the model
model = NuSVC(verbose=1, random_state=42, kernel='sigmoid')
# model.fit(x_train, y_train)

# model & weight save
# pickle.dump(model, open('E:\\nmb\\nmb_data\\cp\\5s_last_0510_ml\\NuSVC_5_sigmoid.data', 'wb')) # wb : write
# print("== save complete ==")
 
# model load
model = pickle.load(open('E:\\nmb\\nmb_data\\cp\\5s_last_0510_ml\\NuSVC_5_sigmoid.data', 'rb'))  # rb : read
# time >>  

# evaluate
y_pred = model.predict(x_test)
# print(y_pred[:100])
# print(y_pred[100:])
Code Example #25
linear_model.LinearRegression(),
linear_model.Lasso(alpha = 0.1),
linear_model.Lasso(alpha = 0.5),
tree.DecisionTreeClassifier(),
tree.DecisionTreeRegressor(),
linear_model.ElasticNet(alpha=0.1, l1_ratio=0.7),
linear_model.ElasticNet(alpha=0.5, l1_ratio=0.7),
linear_model.ElasticNet(alpha=0.1, l1_ratio=0.2),
linear_model.ElasticNet(alpha=0.5, l1_ratio=0.2),
linear_model.RidgeCV(alphas=[0.1, 1.0, 10.0]),
linear_model.LassoLars(alpha=0.1),
linear_model.LassoLars(alpha=0.5)]
models = [
svm.SVC(kernel='linear'),
svm.SVC(kernel='rbf'),
NuSVC()
]
models = [
  MLPClassifier(solver='lbfgs', alpha=1e-5,hidden_layer_sizes=(5, 2), random_state=1),
  MLPClassifier(solver='lbfgs', alpha=1e-5,hidden_layer_sizes=(10, 2), random_state=1),
  MLPClassifier(solver='lbfgs', alpha=1e-5,hidden_layer_sizes=(15, 2), random_state=1),
  MLPClassifier(solver='lbfgs', alpha=1e-5,hidden_layer_sizes=(20, 2), random_state=1),
  MLPClassifier(solver='lbfgs', alpha=1e-5,hidden_layer_sizes=(25, 2), random_state=1),
  MLPClassifier(solver='lbfgs', alpha=1e-5,hidden_layer_sizes=(30, 2), random_state=1),
  MLPClassifier(solver='lbfgs', alpha=1e-5,hidden_layer_sizes=(5, 2), random_state=1),
  MLPClassifier(solver='lbfgs', alpha=1e-5,hidden_layer_sizes=(10, 4), random_state=1),
  MLPClassifier(solver='lbfgs', alpha=1e-5,hidden_layer_sizes=(15, 6), random_state=1),
  MLPClassifier(solver='lbfgs', alpha=1e-5,hidden_layer_sizes=(20, 8), random_state=1),
  MLPClassifier(solver='lbfgs', alpha=1e-5,hidden_layer_sizes=(25, 10), random_state=1),
  MLPClassifier(solver='lbfgs', alpha=1e-5,hidden_layer_sizes=(30, 12), random_state=1),
  MLPClassifier(solver='lbfgs', alpha=1e-5,hidden_layer_sizes=(5, 2, 2), random_state=1),
Code Example #26
y = train['Type']
print(y.head())
print("训练集尺寸:", X.shape)

## Split into training and test sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X.values,
                                                    y.values,
                                                    test_size=0.2,
                                                    random_state=42)
print("训练集数据尺寸", X_train.shape)
print("测试集数据尺寸", X_test.shape)
print("训练集目标尺寸", y_train.shape)
print("测试集目标尺寸", y_test.shape)

clf = NuSVC(probability=True)
clf.fit(X_train, y_train)

res = clf.predict(Test)
print("预测结果尺寸为:", res.shape)
Test_label_dic = {}
for i in range(99):
    Test_label_dic[i] = np.where(res == i)[0].tolist()

Train_label_dic = {}
for i in range(99):
    Train_label_dic[i] = np.where(y.values == i)[0].tolist()

print(Test_label_dic)

DImage = []  ### Build the dataset of all images
Code Example #27
dados = load_breast_cancer()

# Split the dataset
train, test, train_labels, test_labels = train_test_split(dados['data'], dados['target'], test_size=0.3, random_state=0)

#Alg 1 - GNB
gnb = GaussianNB()  
gnb.fit(train, train_labels)
predicted = gnb.predict(test)

print("Relatório do classificador GaussianNB \n  %s:\n%s\n" % (gnb, metrics.classification_report(test_labels, predicted)))

#Alg 2 - KMeans
kmeans = KMeans(n_clusters=2, random_state=0)
kmeans.fit(train, train_labels)
predicted1 = kmeans.predict(test)
print("Relatório do classificador KMeans \n  %s:\n%s\n" % (KMeans, metrics.classification_report(test_labels, predicted1)))

#Alg 3 - SVC
clf = NuSVC(gamma='scale')
clf.fit(train, train_labels)
predicted2 = clf.predict(test)
print("Relatório do classificador NuSVC \n  %s:\n%s\n" % (clf, metrics.classification_report(test_labels, predicted2)))

#Alg 4 - RFC
classifier = RandomForestClassifier(n_estimators=10)
classifier.fit(train, train_labels)
predicted3 = classifier.predict(test)
print("Relatório do RandomForest \n  %s:\n%s\n" % (classifier, metrics.classification_report(test_labels, predicted3)))
print(dados['DESCR'])
Code Example #28
def test_xfail_ignored_in_check_estimator():
    # Make sure checks marked as xfail are just ignored and not run by
    # check_estimator(), but still raise a warning.
    assert_warns(SkipTestWarning, check_estimator, NuSVC())
Code Example #29
File: titanic.py Project: makixi/kaggle
    LogisticRegressionCV(),
    PassiveAggressiveClassifier(),
    RidgeClassifierCV(),
    SGDClassifier(),
    Perceptron(),

    # Naive Bayes
    BernoulliNB(),
    GaussianNB(),

    # Nearest Neighbor
    KNeighborsClassifier(),

    # SVM
    SVC(probability=True),
    NuSVC(probability=True),
    LinearSVC(),

    # Trees
    DecisionTreeClassifier(),
    ExtraTreeClassifier(),

    # Discriminant Analysis
    LinearDiscriminantAnalysis(),
    QuadraticDiscriminantAnalysis(),

    # xgboost: http://xgboost.readthedocs.io/en/latest/model.html
    XGBClassifier()
]

vote_est = [
Code Example #30
def clf_score(classifier):
    classifier = SklearnClassifier(classifier)
    classifier.train(train_set)

    predict = classifier.classify_many(test)
    return accuracy_score(tag_test, predict)


print 'BernoulliNB`s accuracy is %f' % clf_score(BernoulliNB())
#print 'GaussianNB`s accuracy is %f' %clf_score(GaussianNB())
print 'MultinomiaNB`s accuracy is %f' % clf_score(MultinomialNB())
print 'LogisticRegression`s accuracy is %f' % clf_score(LogisticRegression())
print 'SVC`s accuracy is %f' % clf_score(
    SVC(gamma=0.001, C=100., kernel='linear'))
print 'LinearSVC`s accuracy is %f' % clf_score(LinearSVC())
print 'NuSVC`s accuracy is %f' % clf_score(NuSVC())


# 5. After finding the best classifier, then check different dimension classification accuracy
def score(classifier):
    classifier = SklearnClassifier(classifier)
    classifier.train(trainset)

    #pred = classifier.batch_classify(test)
    pred = classifier.classify_many(test)
    return accuracy_score(tag_test, pred)


#dimention = ['500','1000','1500','2000','2500','3000']
dimention = ['500', '1000']
Code Example #31
    #clfb.fit(Train_data,Train_target)

    #print(Test_target)
    #print(clf.predict(Test_data))
    """
	#(KnearestNeighbors)
	from sklearn.neighbors import KNeighborsClassifier
	clf  = KNeighborsClassifier(n_neighbors=50)
	clf.fit(Train_data,Train_target)
	print(Test_target)
	print(clf.predict(Test_data))
	"""

    #highest accuracy (svm classifier :: robust against outliers)
    from sklearn.svm import NuSVC
    my_classifier = NuSVC(kernel='linear')
    my_classifier.fit(Train_data, Train_target)
    prediction = my_classifier.predict(Test_data)
    """#MLPClassifier
	#from sklearn.neural_network import MLPClassifier
	#my_classifier2 = MLPClassifier()
	#my_classifier2.fit(Train_data,Train_target)
	
	#SVRClassifier
	#from sklearn.svm import SVR
	#my_classifier1 = SVR(C=1.0, epsilon=0.2)
	#my_classifier1.fit(Train_data,Train_target)
	
	#RandomForestClassifier
	from sklearn.ensemble import RandomForestClassifier
	rf = RandomForestClassifier(n_estimators=300)
Code Example #32
def linearNuSVC():
    clf = NuSVC(kernel="linear")
    clf.fit(trainData, trainLabel)
    guideToGraph['Linear Nu-SVC'] = clf.score(validationData, validationLabel)
Code Example #33
File: syn_0_1.py Project: kirill-pugachov/card
def classifiers_evaluation(df_res, y):

    classifiers = [
        LinearSVC(),
        LinearSVR(),
        KNeighborsClassifier(3),
        SVC(probability=True),
        NuSVC(),
        DecisionTreeClassifier(),
        RandomForestClassifier(),
        AdaBoostClassifier(),
        GradientBoostingClassifier(),
        BernoulliNB(),
        GaussianNB(),
        LinearDiscriminantAnalysis(),
        QuadraticDiscriminantAnalysis(),
        LogisticRegression(),
        MLPClassifier(max_iter=600),
        SGDClassifier(max_iter=600),
        LogisticRegressionCV(max_iter=600)
    ]

    res = list()

    preprocess = [
        preprocessing.QuantileTransformer(),
        preprocessing.MinMaxScaler(),
        preprocessing.Normalizer(),
        preprocessing.StandardScaler(),
        preprocessing.RobustScaler(),
        preprocessing.MaxAbsScaler()
    ]
    for processor in preprocess:
        X = processor.fit_transform(df_res)

        log_cols = ["Classifier", "ROC_AUC score"]
        log = pd.DataFrame(columns=log_cols)

        sss = StratifiedShuffleSplit(n_splits=10,
                                     test_size=0.1,
                                     random_state=0)

        acc_dict = {}

        for train_index, test_index in sss.split(X, y):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]

            for clf in classifiers:
                name = clf.__class__.__name__
                clf.fit(X_train, y_train)
                train_predictions = clf.predict(X_test)
                #            acc = accuracy_score(y_test, train_predictions)
                acc = roc_auc_score(y_test, train_predictions)

                if name in acc_dict:
                    acc_dict[name] += acc
                else:
                    acc_dict[name] = acc

        for clf in acc_dict:
            acc_dict[clf] = acc_dict[clf] / 10.0
            log_entry = pd.DataFrame([[clf, acc_dict[clf]]], columns=log_cols)
            log = log.append(log_entry)

        print(processor.__class__.__name__)
        print(log)
        res.append([processor.__class__.__name__, log])

    return res
Code Example #34
File: ens_model.py Project: y-bai/bgi-aneuploidy
def ens_model_train(x_train, y_train, f_est, cls_weight=None):
    """
    This uses sklearn.ensemble.StackingClassifier,
    which is not used in the current project.
    :param x_train:
    :param y_train:
    :param f_est:
    :param cls_weight:
    :return:
    """
    # from sklearn.ensemble import StackingClassifier
    # if cls_weight is None:
    #     cls = np.unique(y_train)
    #     cls_wei = np.ones(len(cls)) * 0.5
    #     cls_weight = dict(zip(cls, cls_wei))
    # https://towardsdatascience.com/stacking-classifiers-for-higher-predictive-performance-566f963e4840
    clfs = [
        # ('knn', KNeighborsClassifier(n_neighbors=10, algorithm='kd_tree', n_jobs=-1)),
        ('svc',
         SVC(C=50,
             degree=1,
             gamma="auto",
             kernel="rbf",
             probability=True,
             random_state=10,
             class_weight=cls_weight)),
        ('nusvc',
         NuSVC(degree=1,
               kernel="rbf",
               nu=0.25,
               probability=True,
               random_state=10,
               class_weight=cls_weight)),
        ('nb', GaussianNB()),
        # ('gp', GaussianProcessClassifier(1.0 * RBF(1.0), n_jobs=-1, random_state=10)),
        # ('etc', ExtraTreesClassifier(n_estimators=40, max_depth=5, max_features='auto', class_weight=cls_weight)),
        ('rf',
         RandomForestClassifier(n_estimators=100,
                                criterion="gini",
                                max_depth=5,
                                max_features="auto",
                                min_samples_leaf=1,
                                min_samples_split=2,
                                n_jobs=-1,
                                random_state=5,
                                class_weight=cls_weight)),
        # ('gbm', GradientBoostingClassifier(n_estimators=100, max_depth=10,
        #                                    max_features='auto', learning_rate=0.01, random_state=10)),
        ('xgb',
         XGBClassifier(n_estimators=100,
                       min_child_weight=1,
                       gamma=0.0,
                       colsample_bytree=0.8,
                       subsample=0.7,
                       reg_alpha=0.01,
                       max_depth=5,
                       learning_rate=0.05,
                       n_jobs=-1))
    ]

    # f_est = LogisticRegression(class_weight=cls_weight)

    # f_est = ExtraTreesClassifier(
    #     n_estimators=100, max_depth=10, max_features='auto', class_weight=cls_weight
    # )

    f_clf = StackingClassifier(estimators=clfs,
                               final_estimator=f_est,
                               stack_method='predict_proba',
                               n_jobs=-1)
    # f_clf = StackingClassifier(estimators=clfs,
    #                            final_estimator=ExtraTreesClassifier(
    #                                n_estimators=100, max_depth=10, max_features='auto', class_weight=cls_weight
    #                            ),
    #                            ,
    #                            n_jobs=-1)
    f_clf.fit(x_train, y_train)
    return f_clf
Code Example #35
 test_labels = labels
 accuracy = np.zeros((subjects,))
 cm = [None] * subjects
 for subject in range(subjects):
     # Concatenate the subjects' data for training into one matrix
     train_subjects = list(range(subjects))
     train_subjects.remove(subject)
     TRs = image_data_shared[0].shape[1]
     train_data = np.zeros((image_data_shared[0].shape[0], len(train_labels)))
     for train_subject in range(len(train_subjects)):
         start_index = train_subject*TRs
         end_index = start_index+TRs
         train_data[:, start_index:end_index] = image_data_shared[train_subjects[train_subject]]
 
     # Train a Nu-SVM classifier using scikit learn
     classifier = NuSVC(nu=0.5, kernel='linear')
     classifier = classifier.fit(train_data.T, train_labels)
 
     # Predict on the test data
     predicted_labels = classifier.predict(image_data_shared[subject].T)
     accuracy[subject] = sum(predicted_labels == test_labels)/float(len(predicted_labels))
 
     # Create a confusion matrix to see the accuracy of each class
     cm[subject] = confusion_matrix(test_labels, predicted_labels)
 
     # Normalize the confusion matrix
     cm[subject] = cm[subject].astype('float') / cm[subject].sum(axis=1)[:, np.newaxis]
 
 
 # Plot and print the results
 plot_confusion_matrix(cm, title="Confusion matrices for different test subjects with Probabilistic SRM")
Code Example #36
File: classifier_save.py Project: bromjiri/Presto
def train(trainfeats, testfeats):
    # print('train on %d instances, test on %d instances' % (len(trainfeats), len(testfeats)))

    my_classifier = NaiveBayesClassifier.train(trainfeats)
    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)

    for i, (feats, label) in enumerate(testfeats):
        refsets[label].add(i)
        observed = my_classifier.classify(feats)
        testsets[observed].add(i)

    # precision and recall
    accuracy = nltk.classify.util.accuracy(my_classifier, testfeats) * 100
    pos_prec = precision(refsets['pos'], testsets['pos']) * 100
    pos_rec = recall(refsets['pos'], testsets['pos']) * 100
    neg_prec = precision(refsets['neg'], testsets['neg']) * 100
    neg_rec = recall(refsets['neg'], testsets['neg']) * 100

    # round
    accuracy = round(accuracy, 1)
    pos_prec = round(pos_prec, 1)
    pos_rec = round(pos_rec, 1)
    neg_prec = round(neg_prec, 1)
    neg_rec = round(neg_rec, 1)

    print(accuracy)
    # print('pos F-measure:', f_measure(refsets['pos'], testsets['pos']))
    # print('neg F-measure:', f_measure(refsets['neg'], testsets['neg']))
    my_classifier.show_most_informative_features(50)

    MNB_classifier = SklearnClassifier(MultinomialNB())
    MNB_classifier._vectorizer.sort = False
    MNB_classifier.train(trainfeats)
    MNB_classifier_f = open("pickled/MNB_classifier.pickle", "wb")
    pickle.dump(MNB_classifier, MNB_classifier_f)
    MNB_classifier_f.close()
    mnb = (nltk.classify.accuracy(MNB_classifier, testfeats)) * 100
    mnb = round(mnb, 1)
    print(mnb)

    BernoulliNB_classifier = SklearnClassifier(BernoulliNB())
    BernoulliNB_classifier._vectorizer.sort = False
    BernoulliNB_classifier.train(trainfeats)
    BernoulliNB_classifier_f = open("pickled/BernoulliNB_classifier.pickle",
                                    "wb")
    pickle.dump(BernoulliNB_classifier, BernoulliNB_classifier_f)
    BernoulliNB_classifier_f.close()
    bnb = (nltk.classify.accuracy(BernoulliNB_classifier, testfeats)) * 100
    bnb = round(bnb, 1)
    print(bnb)

    LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
    LogisticRegression_classifier._vectorizer.sort = False
    LogisticRegression_classifier.train(trainfeats)
    LogisticRegression_classifier_f = open(
        "pickled/LogisticRegression_classifier.pickle", "wb")
    pickle.dump(LogisticRegression_classifier, LogisticRegression_classifier_f)
    LogisticRegression_classifier_f.close()
    lr = (nltk.classify.accuracy(LogisticRegression_classifier,
                                 testfeats)) * 100
    lr = round(lr, 1)
    print(lr)

    LinearSVC_classifier = SklearnClassifier(LinearSVC())
    LinearSVC_classifier._vectorizer.sort = False
    LinearSVC_classifier.train(trainfeats)
    LinearSVC_classifier_f = open("pickled/LinearSVC_classifier.pickle", "wb")
    pickle.dump(LinearSVC_classifier, LinearSVC_classifier_f)
    LinearSVC_classifier_f.close()
    lsvc = (nltk.classify.accuracy(LinearSVC_classifier, testfeats)) * 100
    lsvc = round(lsvc, 1)
    print(lsvc)

    NuSVC_classifier = SklearnClassifier(NuSVC())
    NuSVC_classifier._vectorizer.sort = False
    NuSVC_classifier.train(trainfeats)
    NuSVC_classifier_f = open("pickled/NuSVC_classifier.pickle", "wb")
    pickle.dump(NuSVC_classifier, NuSVC_classifier_f)
    NuSVC_classifier_f.close()
    nsvc = (nltk.classify.accuracy(NuSVC_classifier, testfeats)) * 100
    nsvc = round(nsvc, 1)
    print(nsvc)

    voted_classifier = VoteClassifier(NuSVC_classifier, LinearSVC_classifier,
                                      MNB_classifier, BernoulliNB_classifier,
                                      LogisticRegression_classifier)
    voted = (nltk.classify.accuracy(voted_classifier, testfeats)) * 100
    voted = round(voted, 1)
    print(voted)

    nltk_output = "nlt, " + str(accuracy) + ", " + str(pos_prec) + ", " + str(
        neg_prec) + ", " + str(pos_rec) + ", " + str(neg_rec) + "\n"
    sklearn_output = "skl, " + str(mnb) + ", " + str(bnb) + ", " + str(
        lr) + ", " + str(lsvc) + ", " + str(nsvc) + ", " + str(voted) + "\n"

    return (nltk_output, sklearn_output)
Code Example #37
            y[index, 0] = 0
        else:
            y[index, 0] = 1

    kf = KFold(n_splits=20)
    kf.get_n_splits(X)
    print(kf)

    KFold(n_splits=20, random_state=None, shuffle=False)
    ii = 0
    dum = np.zeros(len(X))
    for train_index, test_index in kf.split(X):
        #print("TRAIN:", train_index, "TEST:", test_index)
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        clf = NuSVC()
        clf.fit(X_train, y_train)
        NuSVC(cache_size=200,
              class_weight=None,
              coef0=0.0,
              decision_function_shape=None,
              degree=1,
              gamma='auto',
              kernel='rbf',
              max_iter=-1,
              nu=0.5,
              probability=False,
              random_state=None,
              shrinking=True,
              tol=0.001,
              verbose=False)
Code Example #38
df.Label = df.Label.astype('category')
data, labels = df.ix[:, :-1], df.ix[:, -1]

# let's normalize the features before doing anything
data_norm = pd.DataFrame(StandardScaler().fit_transform(data),
                         columns=data.columns)

# split the data to train and test
X_train, X_test, y_train, y_test = \
  train_test_split(data_norm, labels, test_size=0.2, random_state=18)

# just trying a bunch of classifiers to see which one is better
classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="rbf", C=0.025, probability=True),
    NuSVC(probability=True),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    AdaBoostClassifier(),
    GradientBoostingClassifier(),
    GaussianNB(),
    LinearDiscriminantAnalysis(),
    QuadraticDiscriminantAnalysis()
]

# Logging for Visual Comparison
log_cols = ["Classifier", "Accuracy", "Log Loss"]
log = pd.DataFrame(columns=log_cols)

for clf in classifiers:
    clf.fit(X_train, y_train)
Code Example #39
Ytrain[250:500] = 2
Ytrain[500:750] = 3

Yval = np.ones((valshape[0]))
Yval_predict = np.ones((valshape[0]))
Yval[0:150] = 1
Yval[150:300] = 2
Yval[300:450] = 3

Ytest = np.ones((testshape[0]))
Ytest_predict = np.ones((testshape[0]))
Ytest[0:100] = 1
Ytest[100:200] = 2
Ytest[200:300] = 3

clf = NuSVC()
clf.fit(Xtrain, Ytrain)
NuSVC(cache_size=2000, class_weight=None, coef0=0.0,
      decision_function_shape=None, degree=3, gamma='auto', kernel='polynomial',
      max_iter=-1, nu=0.5, probability=False, random_state=None,
      shrinking=True, tol=0.00001, verbose=False)

for i in range(trainshape[0]):
    Ytrain_predict[i] = clf.predict([Xtrain[i, :]])

for i in range(valshape[0]):
    Yval_predict[i] = clf.predict([Xval[i, :]])

for i in range(testshape[0]):
    Ytest_predict[i] = clf.predict([Xtest[i, :]])
Code Example #40
classifiers = {}
classifiers.update({"LDA": LinearDiscriminantAnalysis()})
classifiers.update({"QDA": QuadraticDiscriminantAnalysis()})
classifiers.update({"AdaBoost": AdaBoostClassifier()})
classifiers.update({"Bagging": BaggingClassifier()})
classifiers.update({"Extra Trees Ensemble": ExtraTreesClassifier()})
classifiers.update({"Gradient Boosting": GradientBoostingClassifier()})
classifiers.update({"Random Forest": RandomForestClassifier()})
classifiers.update({"Ridge": RidgeClassifier()})
classifiers.update({"SGD": SGDClassifier()})
classifiers.update({"BNB": BernoulliNB()})
classifiers.update({"GNB": GaussianNB()})
classifiers.update({"KNN": KNeighborsClassifier()})
classifiers.update({"MLP": MLPClassifier()})
classifiers.update({"LSVC": LinearSVC()})
classifiers.update({"NuSVC": NuSVC()})
classifiers.update({"SVC": SVC()})
classifiers.update({"DTC": DecisionTreeClassifier()})
classifiers.update({"ETC": ExtraTreeClassifier()})

# Create set of classifier labels that use decision_function
DECISION_FUNCTIONS = {"Ridge", "SGD", "LSVC", "NuSVC", "SVC"}

# Create set of classifiers with feature_importances_ attribute
FEATURE_IMPORTANCE = {
    "Gradient Boosting", "Extra Trees Ensemble", "Random Forest"
}

###############################################################################
#                             5. Hyper-parameters                             #
###############################################################################
Code Example #41
def test_full():
    clf = NuSVC()
    clf.fit(train, trainLabels)
    print check_fit(clf.predict(train), trainLabels)
Code Example #42
plt.show()

matrix = confusion_matrix(Newy_test, prediction1)
plt.figure(figsize=[10, 10])
plt.imshow(matrix, cmap='hot', interpolation='nearest', vmin=0, vmax=200)
plt.colorbar()
plt.title('Random Forest Confusion Map', fontsize=18)
plt.ylabel('Actual', fontsize=18)
plt.xlabel('Predicted', fontsize=18)
plt.grid(b=False)
plt.yticks(range(10), class_label, fontsize=14)
plt.xticks(range(10), class_label, fontsize=14, rotation='vertical')
plt.show()

svm = OneVsRestClassifier(
    NuSVC(nu=.008, gamma='scale', kernel='poly',
          decision_function_shape='ovr'))
svmmodel = svm.fit(NewX_train, Newy_train)

svc_prediction = svmmodel.predict(NewX_test)

print('fit to train new: ', svm.score(NewX_train, Newy_train))
print('fit to test: ', svm.score(NewX_test, Newy_test))

matrix = confusion_matrix(Newy_test, svc_prediction)
plt.figure(figsize=[10, 10])
plt.imshow(matrix, cmap='hot', interpolation='nearest', vmin=0, vmax=200)
plt.colorbar()
plt.title('Support Vector Machine Confusion Map', fontsize=18)
plt.ylabel('Actual', fontsize=18)
plt.xlabel('Predicted', fontsize=18)
plt.grid(b=False)
Code Example #43
# Documentation: http://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVC.html

linear_svc_classifier = SklearnClassifier(LinearSVC())
linear_svc_classifier.train(training_set)
print("LinearSVC_classifier accuracy %:",
      (nltk.classify.accuracy(linear_svc_classifier, testing_set)) * 100)

pickle_classifiers(linear_svc_classifier, "linear_svc_classifier")

# Use NuSVC classifier
# Nu-Support Vector Classification.
# Similar to SVC but uses a parameter to control the number of support vectors.
# The implementation is based on libsvm.
# Documentation: http://scikit-learn.org/stable/modules/generated/sklearn.svm.NuSVC.html

nu_svc_classifier = SklearnClassifier(NuSVC())
nu_svc_classifier.train(training_set)
print("NuSVC_classifier accuracy %:",
      (nltk.classify.accuracy(nu_svc_classifier, testing_set)) * 100)

pickle_classifiers(nu_svc_classifier, "nu_svc_classifier")

# Create a classifier based on equal votes.
voted_classifier = VoteClassifier(bayes_classifier, mnb_classifier,
                                  bernoulli_nb_classifier,
                                  logistic_regression_classifier,
                                  sgdc_classifier, linear_svc_classifier,
                                  nu_svc_classifier)

print("Voted_classifier accuracy %:",
      (nltk.classify.accuracy(voted_classifier, testing_set)) * 100)
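The comment block in this example summarizes what NuSVC is: an SVC variant whose nu parameter, rather than C, controls the number of support vectors. As a minimal, hedged sketch on synthetic data (not part of the snippet above), nu acts as an upper bound on the fraction of margin errors and a lower bound on the fraction of support vectors:

from sklearn.datasets import make_classification
from sklearn.svm import NuSVC

X, y = make_classification(n_samples=200, n_features=10, random_state=0)
for nu in (0.1, 0.5):
    clf = NuSVC(nu=nu, kernel='rbf').fit(X, y)
    print(nu, len(clf.support_), "support vectors out of", len(X))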
Code Example #44
def test_xfail_ignored_in_check_estimator():
    # Make sure checks marked as xfail are just ignored and not run by
    # check_estimator(), but still raise a warning.
    with warnings.catch_warnings(record=True) as records:
        check_estimator(NuSVC())
    assert SkipTestWarning in [rec.category for rec in records]
Code Example #45
File: plot_nusvm.py Project: Raz0r/lightning
def fit_nusvc(X_train, y_train, nu, kernel, gamma=0.1, degree=4, coef0=1):
    print "Training, nu = ", nu
    start = time.time()
    clf = NuSVC(nu=nu, kernel=kernel, degree=degree, coef0=coef0)
    clf.fit(X_train, y_train)
    return clf, time.time() - start
Code Example #46
def evaluateAlgorithm(train_X, test_X, train_Y, test_Y):
    # ---------------------------------------------------------------------    
    # Machine Learning Algorithm, Parameter settings
    # ---------------------------------------------------------------------    
    model_List = []

    # ---------------------------------------------------------------------
    # Algorithms are applied with different parameter settings 
    # - manual parameter tuning.
    # [Note: Grid Search and Random Search for
    # parameter tuning will be introduced later; see the sketch after this example]
    # ---------------------------------------------------------------------
    # Support Vector Machine (SVM) with manual parameter settings
    
    # 1. Support Vector Machine - SVC : SVC()
    # kernel = ['poly', 'rbf', 'sigmoid']    
    
    # 1.1 SVC : SVC(); 
    SVC_1 = SVC(C=1.0, kernel='rbf', 
                degree=3, gamma='auto', coef0=0.0, shrinking=True, 
                probability=False, tol=0.001, cache_size=200, 
                class_weight=None, verbose=False, max_iter=-1, 
                decision_function_shape='ovr', random_state=None)
    model_List.append(('SVC-1', 
               'Support Vector Machine: SVC - PS-1', SVC_1))

    # 1.2 SVC : SVC(); 
    SVC_2 = SVC(C=1.0, kernel='poly', 
                degree=3, gamma='auto', coef0=0.0, shrinking=True, 
                probability=False, tol=0.001, cache_size=200, 
                class_weight=None, verbose=False, max_iter=-1, 
                decision_function_shape='ovr', random_state=None)
    model_List.append(('SVC-2', 
               'Support Vector Machine: SVC - PS-2', SVC_2))

    # 1.3 SVC : SVC(); 
    SVC_3 = SVC(C=1.0, kernel='sigmoid', degree=3, gamma='auto', 
                coef0=0.0, shrinking=True, probability=False, 
                tol=0.001, cache_size=200, class_weight=None, 
                verbose=False, max_iter=-1, 
                decision_function_shape='ovr', random_state=None)
    model_List.append(('SVC-3', 
               'Support Vector Machine: SVC - PS-3', SVC_3))    
    
    # 2. Support Vector Machine - NuSVC : NuSVC(); 
    # kernel = ['poly', 'rbf', 'sigmoid']   
    
    # 2.1 NuSVC : NuSVC(); 
    NuSVC_1 = NuSVC(nu=0.5, kernel='rbf', degree=3, gamma='auto', 
                    coef0=0.0, shrinking=True, probability=False, 
                    tol=0.001, cache_size=200, 
                    class_weight=None, verbose=False, 
                    max_iter=-1, decision_function_shape='ovr', 
                    random_state=None)
    model_List.append(('NuSVC-1', 
          'Support Vector Machine: NuSVC - PS-1', NuSVC_1))

    # 2.2 NuSVC : NuSVC(); 
    NuSVC_2 = NuSVC(nu=0.5, kernel='poly', degree=3, gamma='auto', 
                    coef0=0.0, 
                    shrinking=True, probability=False, 
                    tol=0.001, cache_size=200, 
                    class_weight=None, verbose=False, 
                    max_iter=-1, 
                    decision_function_shape='ovr', 
                    random_state=None)
    model_List.append(('NuSVC-2', 
          'Support Vector Machine: NuSVC - PS-2', NuSVC_2))

    # 2.3 NuSVC : NuSVC(); 
    NuSVC_3 = NuSVC(nu=0.5, kernel='sigmoid', degree=3, gamma='auto',
                    coef0=0.0, 
                    shrinking=True, probability=False, tol=0.001,
                    cache_size=200, 
                    class_weight=None, verbose=False, max_iter=-1, 
                    decision_function_shape='ovr', 
                    random_state=None)
    model_List.append(('NuSVC-3', 
          'Support Vector Machine: NuSVC - PS-3', NuSVC_3))

    # 3. Support Vector Machine - LinearSVC : LinearSVC(); 
    # loss = ['squared_hinge', 'hinge'] ; penalty = ['l2', 'l1']   
    
    # 3.1 LinearSVC : LinearSVC(); 
    LSVC_1 = LinearSVC(penalty='l2', loss='squared_hinge', 
                       dual=True, tol=0.0001, 
                       C=1.0, multi_class='ovr', 
                       fit_intercept=True, 
                       intercept_scaling=1, 
                       class_weight=None, verbose=0, 
                       random_state=None, max_iter=1000)
    model_List.append(('LSVC-1', 
          'Support Vector Machine: LinearSVC - PS-1', LSVC_1))

    # 3.2 LinearSVC : LinearSVC(); 
    LSVC_2 = LinearSVC(penalty='l2', loss='hinge', 
                       dual=True, tol=0.0001, 
                       C=1.0, multi_class='ovr', 
                       fit_intercept=True, 
                       intercept_scaling=1, 
                       class_weight=None, verbose=0, 
                       random_state=None, max_iter=1000)
    model_List.append(('LSVC-2', 
          'Support Vector Machine: LinearSVC - PS-2', LSVC_2))

    # ---------------------------------------------------------------------
    # Cross Validation ----------------------------------------------------
    # ---------------------------------------------------------------------
    print("Cross Validation Results ")
    outcomes = []
    description = []
    shortDescription = []
    for shtDes, des, model in model_List:
        cv_results = cross_val_score(model, train_X, train_Y, 
                                     cv = 5, scoring='accuracy', 
                                     n_jobs = 4, verbose = 0)
        outcomes.append(cv_results)
        description.append(des)
        shortDescription.append(shtDes)
        prt_string = "\n %s:\n \tMean Accuracy: %f (Std: %f)" % (des, 
                                                cv_results.mean(), 
                                                cv_results.std())
        print(prt_string)

    # ---------------------------------------------------------------------
    # Visualise the outcomes / results from Cross Validation
    # ---------------------------------------------------------------------
    fig = pyplot.figure()
    fig.suptitle('Cross Validation Results (Algorithm Comparison)')
    ax = fig.add_subplot(111)
    #pyplot.boxplot(outcomes)
    #ax.set_xticklabels(shortDescription)
    pyplot.boxplot(outcomes, vert = False)
    ax.set_yticklabels(shortDescription)

    pyplot.show()

    # ---------------------------------------------------------------------
    # Training & Fitting of each Algorithm with training Dataset
    # ---------------------------------------------------------------------
    print('\nEvaluate Algorithms ... ... ... ')
    for shtDes, des, model in model_List:   
        trained_Model = model.fit(train_X, train_Y)
        
    # ---------------------------------------------------------------------
    # Evaluation of trained Algorithm (or Model) and result
    # ---------------------------------------------------------------------    
        pred_Class    = trained_Model.predict(test_X)
        acc           = accuracy_score(test_Y, pred_Class)
        classReport   = classification_report(test_Y, pred_Class)
        confMatrix    = confusion_matrix(test_Y, pred_Class) 
        kappa_score   = cohen_kappa_score(test_Y, pred_Class) 

        print("\n%s: " % (des))
        print('The accuracy: {}'.format(round(acc,2)))
        print('The kappa score: {}'.format(round(kappa_score,2)))        
        print('The Classification Report:\n {}'.format(classReport))
        print('The Confusion Matrix:\n {}'.format(confMatrix))
    
    # ---------------------------------------------------------------------    
    # Save the trained Model
    # ---------------------------------------------------------------------    
        with open('model_'+shtDes+'.pickle', 'wb') as f:
                pk.dump(trained_Model, f)
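    # --- Hedged usage sketch (added; not part of the original snippet) ---
    # A quick sanity check is to reload one of the pickled models and re-score
    # it on the held-out split. This assumes the pickle alias `pk`, the
    # test_X / test_Y arrays, and the 'model_' + shtDes + '.pickle' naming
    # used above; 'NuSVC-3' is just one of the short tags defined earlier.
    with open('model_NuSVC-3.pickle', 'rb') as f:
        reloaded_model = pk.load(f)
    reloaded_pred = reloaded_model.predict(test_X)
    print('Reloaded NuSVC-3 accuracy: {}'.format(
        round(accuracy_score(test_Y, reloaded_pred), 2)))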
Code example #47
     'model': DBSCAN(algorithm="brute", n_jobs=-1),
     'methods': [],
     'dataset': 'blobs',
 },
 {
     'model': SVC(kernel='rbf'),
     'methods': ['predict', 'decision_function'],
     'dataset': 'classifier',
 },
 {
     'model': SVC(kernel='rbf'),
     'methods': ['predict', 'decision_function'],
     'dataset': 'sparse',
 },
 {
     'model': NuSVC(kernel='rbf'),
     'methods': ['predict', 'decision_function'],
     'dataset': 'classifier',
 },
 {
     'model': SVR(kernel='rbf'),
     'methods': ['predict'],
     'dataset': 'regression',
 },
 {
     'model': NuSVR(kernel='rbf'),
     'methods': ['predict'],
     'dataset': 'regression',
 },
 {
     'model': TSNE(random_state=0),
Code example #48
File: eda.py  Project: pdikang/kaggle
    pred = lr.predict_proba(model_mat_train[27000:])
    auc_score(ACTION[27000:], pred[:, 1])

    lr.fit(model_mat_train[:, np.where(rfe.support_)[0]], ACTION)
    pred = lr.predict_proba(model_mat_test[:, np.where(rfe.support_)[0]])
    pd.DataFrame({"Id": test_data.index, "Action": pred[:, 1]}).to_csv(
        "../lr2_submission.csv", header=True, index=False
    )

    ## svms
    svc = SVC(C=1.0, kernel="rbf", probability=True, class_weight="auto", verbose=2)
    svc.fit(model_mat_train[:27000, np.where(rfe.support_)[0]], ACTION[:27000])
    pred = svc.predict_proba(model_mat_train[27000:, np.where(rfe.support_)[0]])
    auc_score(ACTION[27000:], pred[:, 1])

    nusvc = NuSVC(nu=0.11, kernel="rbf", degree=3, probability=True, cache_size=1024, verbose=2)
    nusvc.fit(model_mat_train[:27000, np.where(rfe.support_)[0]], ACTION[:27000])
    svc_pred = nusvc.predict_proba(model_mat_train[27000:, np.where(rfe.support_)[0]])
    auc_score(ACTION[27000:], svc_pred[:, 1])

    nusvc = NuSVC(nu=0.11, kernel="rbf", degree=3, probability=True, cache_size=1024, verbose=2)
    nusvc.fit(model_mat_train[:27000], ACTION[:27000])
    svc_pred = nusvc.predict_proba(model_mat_train[27000:])
    auc_score(ACTION[27000:], svc_pred[:, 1])
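    # --- Hedged note (added): `auc_score` comes from a very old scikit-learn
    # release and has since been removed; on current versions the equivalent
    # hold-out evaluation of the probability-enabled NuSVC above would be:
    from sklearn.metrics import roc_auc_score
    roc_auc_score(ACTION[27000:], svc_pred[:, 1])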

    nusvc.fit(model_mat_train[:, np.where(rfe.support_)[0]], ACTION)
    svc_pred = nusvc.predict_proba(model_mat_test[:, np.where(rfe.support_)[0]])
    pd.DataFrame({"Id": test_data.index, "Action": svc_pred[:, 1]}).to_csv(
        "../nusvc_submission.csv", header=True, index=False
    )
Code example #49
    [SGDClassifier(random_state=42)],
    [SGDClassifier(random_state=42, loss='log')],
    [PassiveAggressiveClassifier(random_state=42)],
    [Perceptron(random_state=42)],
    [LinearSVC(random_state=42)],
    [OneVsRestClassifier(SGDClassifier(random_state=42))],
])
def test_explain_linear(newsgroups_train, clf):
    assert_explained_weights_linear_classifier(newsgroups_train, clf)


@pytest.mark.parametrize(['clf'], [
    [LogisticRegression(random_state=42)],
    [SGDClassifier(random_state=42)],
    [SVC(kernel='linear', random_state=42)],
    [NuSVC(kernel='linear', random_state=42)],
])
def test_explain_linear_binary(newsgroups_train_binary, clf):
    assert_explained_weights_linear_classifier(newsgroups_train_binary,
                                               clf,
                                               binary=True)


@pytest.mark.parametrize(['clf'], [
    [SVC()],
    [NuSVC()],
    [SVR()],
    [NuSVR()],
])
def test_explain_linear_unsupported_kernels(clf):
    res = explain_weights(clf)
Code example #50
        enc[features_list[j]] += 1
    to_df.append(enc)

# The model will be trained on this frequency matrix
X = pd.DataFrame(to_df,
                 columns=[
                     'A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M',
                     'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y'
                 ])

# Test which model performs best on the roc_auc metric
models = []
models.append(('KNN', KNeighborsClassifier(n_neighbors=5)))
models.append(('NB', GaussianNB()))
models.append(('SVMR', SVC(kernel='rbf', gamma=0.01, C=5)))
models.append(('nuSVMR', NuSVC(kernel='rbf', gamma=0.01)))
models.append(('Random', RandomForestClassifier(n_estimators=130)))
rfc = RandomForestClassifier(n_estimators=100)
models.append(("Adaboost",
               AdaBoostClassifier(base_estimator=rfc,
                                  n_estimators=rfc.n_estimators)))

results = []
names = []
for name, model in models:
    kfold = model_selection.KFold(n_splits=5, shuffle=True)
    res = model_selection.cross_val_score(model,
                                          X,
                                          Y,
                                          cv=kfold,
                                          scoring='roc_auc')
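    # --- Hedged continuation (added): the visible fragment stops before the
    # scores are collected; a minimal completion, assuming the `results` and
    # `names` lists created above, would record and report each model's score:
    results.append(res)
    names.append(name)
    print('%s: mean roc_auc %.3f (std %.3f)' % (name, res.mean(), res.std()))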
Code example #51
#LogisticRegression Classifier
clf_LogisticRegression = LogisticRegression(n_jobs=-1).fit(X_train, y_train)
predicted_LogisticRegression = clf_LogisticRegression.predict(X_test)
accuracy_LogisticRegression = np.mean(predicted_LogisticRegression == y_test)

clf_LogisticRegression_f = open("pickled_algos/clf_LogisticRegression.pickle", "wb")
pickle.dump(clf_LogisticRegression, clf_LogisticRegression_f)
clf_LogisticRegression_f.close()

print('LogisticRegression accuracy: %s' %accuracy_LogisticRegression)
print(metrics.classification_report(predicted_LogisticRegression, y_test))



#NuSVC Classifier
clf_NuSVC = NuSVC().fit(X_train, y_train)
predicted_NuSVC = clf_NuSVC.predict(X_test)
accuracy_NuSVC = np.mean(predicted_NuSVC == y_test)

clf_NuSVC_f = open("pickled_algos/clf_NuSVC.pickle", "wb")
pickle.dump(clf_NuSVC, clf_NuSVC_f)
clf_NuSVC_f.close()

print('NuSVC accuracy: %s' %accuracy_NuSVC)
print(metrics.classification_report(predicted_NuSVC, y_test))



#LinearSVC Classifier
clf_LinearSVC = LinearSVC().fit(X_train, y_train)
predicted_LinearSVC = clf_LinearSVC.predict(X_test)
Code example #52
#GaussianNB.train(training_set)
#print("GaussianNB accuracy percent:", (nltk.classify.accuracy(GaussianNB, testing_set))*100)

#BernoulliNB = SklearnClassifier(BernoulliNB())
#BernoulliNB.train(training_set)
#print("BernoulliNB accuracy percent:", (nltk.classify.accuracy(BernoulliNB, testing_set))*100)

# LogisticRegression, SGDClassifier
# SVC, LinearSVC, NuSVC

LogisticRegression = SklearnClassifier(LogisticRegression())
LogisticRegression.train(training_set)
print("LogisticRegression accuracy percent:", (nltk.classify.accuracy(LogisticRegression, testing_set))*100)

SGDClassifier = SklearnClassifier(SGDClassifier())
SGDClassifier.train(training_set)
print("SGDClassifier accuracy percent:", (nltk.classify.accuracy(SGDClassifier, testing_set))*100)

SVC = SklearnClassifier(SVC())
SVC.train(training_set)
print("SVC accuracy percent:", (nltk.classify.accuracy(SVC, testing_set))*100)

LinearSVC = SklearnClassifier(LinearSVC())
LinearSVC.train(training_set)
print("LinearSVC accuracy percent:", (nltk.classify.accuracy(LinearSVC, testing_set))*100)

NuSVC = SklearnClassifier(NuSVC())
NuSVC.train(training_set)
print("NuSVC accuracy percent:", (nltk.classify.accuracy(NuSVC, testing_set))*100)

Code example #53
File: NLSVC.py  Project: Cypher42/AutoBuffett
class Learner:
       
       #@input recurrence: Dimensionality of the feature-space
       #        with dimension corresponding to the last n returns
       #        this is a positive integer
       #@input realy_recurrent: default=False
       #        if true: the last decision is also a dimension in the
       #        feature space
       #@input label_par: parameter used for labeling, 'r' for returns
       #                                            'p' for prices
    def __init__(self, recurrence=30, w_size=20,hybrid = False):
        self.learner = NuSVC()
        
        #size of each training batch
        self.batch_size = w_size * (recurrence)
        #size of the sliding window for the Sharpe ratio
        self.window_size = 5 * self.batch_size
        
        #true if part of a hybrid learner
        self.hybrid = hybrid
        
        # the data matrix of a single batch
        # Data-Vector = r_1, ... r_n
        # with r_n := r_n - r_n-1
        self.returns = list()
        #training data for the experimental approach
        self.train_dat = list()
        self.labels = list()
        self.decisions = list()
        self.recurrence = recurrence
        
        self.last_decision = 0
        self.ready = False
        self.tstep = 0
        self.prices = list()
        return
       
    def predict(self,new_price,old_price,tstep = 0):
        #default decision value        
        decision = 0
        #Add prices to sliding window
        self.prices.append(new_price)
        if(len(self.prices) > self.window_size):
            self.prices.pop(0)
        latest_return = new_price - old_price
        #add next label
        if(self.tstep > self.recurrence):
            self.labels.append(self.label_returns(latest_return))
        #increment timer
        self.tstep += 1
        #add latest return to history
        self.returns.append(latest_return)
        if(self.tstep > self.window_size):
            if(len(self.returns) > self.window_size):
                self.returns.pop(0)
        #if batch is full, start training
        if(self.tstep%self.batch_size == 0 and self.tstep != 0):
            self.train()
            #this branch is disabled: it was intended for predicting prices,
            #but performance was worse, so it is effectively dead code
        #setup x-vector
        if(self.tstep > self.recurrence):
            x = self.returns[len(self.returns)-self.recurrence-1:len(self.returns)-1]
            #set up training matrix            
            x = np.array(x)
            x = x.reshape((len(x),1))
            self.train_dat.append(x)
            x = np.transpose(x)
            #create decision only if svm is trained
            if(self.ready):
                decision = np.tanh(self.learner.decision_function(x))
                decision = decision[0]
        #if the system is truly recurrent (uses the last decision as part of the input vector)
        #append the decision
        self.last_decision = decision
        return  decision
    
    #refits the SVM on the accumulated training batch to adjust its internal model
    def train(self):
        #setup training matrix
        train_dat = np.zeros((len(self.labels),self.recurrence))
        for i in range(len(train_dat)):
            train_dat[i][:] = np.transpose(self.train_dat[i])
        #np.transpose(train_dat)
        self.learner.fit(train_dat, self.labels)
        #clear the training-related structures
        self.labels = list()
        self.train_dat = list()
        self.ready = True
        return
        
    #labeling function using the complete vector
    #very simple, since it only detects trends relative to the overall mean
    def label_set(self,return_list):
        mu_current = np.mean(return_list)
        mu_total = np.mean(self.returns)
        if(mu_current >= mu_total):
            return 1
        else:
            return -1
            
    def label_returns(self,next_return):
        if next_return > 0:
            return 1
        else:
            return -1
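
# --- Hedged usage sketch (added; not part of the original project): driving
# the Learner with a synthetic oscillating price series. Parameters are
# shrunk so the first training batch fills quickly; decisions stay 0 until
# the internal NuSVC has been fitted once. Assumes numpy is imported as np in
# this module, as the class code itself requires.
if __name__ == '__main__':
    demo_prices = 100 + 5 * np.sin(np.arange(400) / 2.0)
    demo_learner = Learner(recurrence=5, w_size=4)
    decision = 0
    for t in range(1, len(demo_prices)):
        decision = demo_learner.predict(demo_prices[t], demo_prices[t - 1])
    print('last decision:', decision)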
Code example #54
class SVM:
    """
    Wrapper class around scikit-learn's support vector machine functionality.
    This class supports binary and multi-class classification on a dataset, along with regression via Support Vector
    Regression (SVR).
    Per scikit-learn's documentation:

    Support vector machines (SVMs) are a set of supervised learning methods used for classification, regression and
    outliers detection.

    The advantages of support vector machines are:

        – Effective in high dimensional spaces.
        – Still effective in cases where number of dimensions is greater than the number of samples.
        – Uses a subset of training points in the decision function (called support vectors), so it is also memory
        efficient.
        – Versatile: different Kernel functions can be specified for the decision function. Common kernels are provided,
        but it is also possible to specify custom kernels.

    The disadvantages of support vector machines include:

        – If the number of features is much greater than the number of samples, avoid over-fitting in choosing Kernel
        functions and regularization term is crucial.
        – SVMs do not directly provide probability estimates, these are calculated using an expensive five-fold
        cross-validation.
    """
    def __init__(self, attributes=None, labels=None, test_size=0.25):
        """
        Initializes a SVM object.

        The following parameters are needed to use a SVM:

            – attributes: a numpy array of the independent variables
            – labels: a numpy array of the classes (for classification) or dependent variables (for regression)
            – test_size: the proportion of the dataset to be used for testing the model (defaults to 0.25);
            the proportion of the dataset to be used for training will be the complement of test_size

        After successfully running one of the classifier methods (SVC(), nu_SVC(), or linear_SVC()), the corresponding
        classifier below will be trained:

            – classifier_SVC: a classifier trained using scikit-learn's SVC implementation
            – accuracy_SVC: the accuracy of the SVC model, based on its predictions for dataset_X_test
            – roc_auc_SVC: the area under the ROC curve for the SVC model
            – classifier_nu_SVC: a classifier trained using scikit-learn's NuSVC implementation
            – accuracy_nu_SVC: the accuracy of the NuSVC model, based on its predictions for dataset_X_test
            – roc_auc_nu_SVC: the area under the ROC curve for the NuSVC model
            – classifier_linear_SVC: a classifier trained using scikit-learn's LinearSVC implementation
            – accuracy_linear_SVC: the accuracy of the LinearSVC model, based on its predictions for dataset_X_test

        After successfully running one of the regression methods (SVR(), nu_SVR(), or linear_SVR()), the corresponding
        regression model below will be trained:

            – regression_SVR: a regression model trained using scikit-learn's SVR implementation
            – r2_score_SVR: the coefficient of determination for the SVR model
            – r_score_SVR: the correlation coefficient for the SVR model
            – regression_nu_SVR: a regression model trained using scikit-learn's NuSVR implementation
            – r2_score_nu_SVR: the coefficient of determination for the NuSVR model
            – r_score_nu_SVR: the correlation coefficient for the NuSVR model
            – regression_linear_SVR: a regression model trained using scikit-learn's LinearSVR implementation
            – r2_score_linear_SVR: the coefficient of determination for the LinearSVR model
            – r_score_linear_SVR: the correlation coefficient for the LinearSVR model
        """
        self.attributes = attributes
        self.labels = labels
        self.test_size = test_size

        self.classifier_SVC = None
        self.accuracy_SVC = None
        self.roc_auc_SVC = None
        self.classifier_nu_SVC = None
        self.accuracy_nu_SVC = None
        self.roc_auc_nu_SVC = None
        self.classifier_linear_SVC = None
        self.accuracy_linear_SVC = None

        self.regression_SVR = None
        self.r2_score_SVR = None
        self.r_score_SVR = None
        self.regression_nu_SVR = None
        self.r2_score_nu_SVR = None
        self.r_score_nu_SVR = None
        self.regression_linear_SVR = None
        self.r2_score_linear_SVR = None
        self.r_score_linear_SVR = None

        # References to training and testing subsets of dataset; instance data for re-use purposes
        self.dataset_X_train = None
        self.dataset_y_train = None
        self.dataset_X_test = None
        self.dataset_y_test = None

    # Accessor Methods

    def get_attributes(self):
        """
        Accessor method for attributes.

        If a SVM object is initialized without specifying attributes, attributes will be None. No SVM functionality can
        be used until attributes is a populated numpy array. Call set_attributes(new_attributes) to fix this.
        """
        return self.attributes

    def get_labels(self):
        """
        Accessor method for labels.

        If a SVM object is initialized without specifying labels, labels will be None. No SVM functionality can be used
        until labels is a populated numpy array. Call set_labels(new_labels) to fix this.
        """
        return self.labels

    def get_test_size(self):
        """
        Accessor method for test_size.

        Should return a number or None.
        """
        return self.test_size

    def get_classifier_SVC(self):
        """
        Accessor method for classifier_SVC.

        Will return None if SVC() hasn't successfully run, yet.
        """
        return self.classifier_SVC

    def get_accuracy_SVC(self):
        """
        Accessor method for accuracy_SVC.

        Will return None if SVC() hasn't successfully run, yet.
        """
        return self.accuracy_SVC

    def get_roc_auc_SVC(self):
        """
        Accessor method for roc_auc_SVC.

        Will return None if SVC() hasn't successfully run, yet.
        """
        return self.roc_auc_SVC

    def get_classifier_nu_SVC(self):
        """
        Accessor method for classifier_nu_SVC.

        Will return None if nu_SVC() hasn't successfully run, yet.
        """
        return self.classifier_nu_SVC

    def get_accuracy_nu_SVC(self):
        """
        Accessor method for accuracy_nu_SVC.

        Will return None if nu_SVC() hasn't successfully run, yet.
        """
        return self.accuracy_nu_SVC

    def get_roc_auc_nu_SVC(self):
        """
        Accessor method for roc_auc_nu_SVC.

        Will return None if nu_SVC() hasn't successfully run, yet.
        """
        return self.roc_auc_nu_SVC

    def get_classifier_linear_SVC(self):
        """
        Accessor method for classifier_linear_SVC.

        Will return None if linear_SVC() hasn't successfully run, yet.
        """
        return self.classifier_linear_SVC

    def get_accuracy_linear_SVC(self):
        """
        Accessor method for accuracy_linear_SVC.

        Will return None if linear_SVC() hasn't successfully run, yet.
        """
        return self.accuracy_linear_SVC

    def get_regression_SVR(self):
        """
        Accessor method for regression_SVR.

        Will return None if SVR() hasn't successfully run, yet.
        """
        return self.regression_SVR

    def get_r2_score_SVR(self):
        """
        Accessor method for r2_score_SVR.

        Will return None if SVR() hasn't successfully run, yet.
        """
        return self.r2_score_SVR

    def get_r_score_SVR(self):
        """
        Accessor method for r_score_SVR.

        Will return None if SVR() hasn't successfully run, yet.
        """
        return self.r_score_SVR

    def get_regression_nu_SVR(self):
        """
        Accessor method for regression_nu_SVR.

        Will return None if nu_SVR() hasn't successfully run, yet.
        """
        return self.regression_nu_SVR

    def get_r2_score_nu_SVR(self):
        """
        Accessor method for r2_score_nu_SVR.

        Will return None if nu_SVR() hasn't successfully run, yet.
        """
        return self.r2_score_nu_SVR

    def get_r_score_nu_SVR(self):
        """
        Accessor method for r_score_nu_SVR.

        Will return None if nu_SVR() hasn't successfully run, yet.
        """
        return self.r_score_nu_SVR

    def get_regression_linear_SVR(self):
        """
        Accessor method for regression_linear_SVR.

        Will return None if linear_SVR() hasn't successfully run, yet.
        """
        return self.regression_linear_SVR

    def get_r2_score_linear_SVR(self):
        """
        Accessor method for r2_score_linear_SVR.

        Will return None if linear_SVR() hasn't successfully run, yet.
        """
        return self.r2_score_linear_SVR

    def get_r_score_linear_SVR(self):
        """
        Accessor method for r_score_linear_SVR.

        Will return None if linear_SVR() hasn't successfully run, yet.
        """
        return self.r_score_linear_SVR

    # Modifier Methods

    def set_attributes(self, new_attributes=None):
        """
        Modifier method for attributes.

        Input should be a populated numpy array. Defaults to None.
        """
        self.attributes = new_attributes

    def set_labels(self, new_labels=None):
        """
        Modifier method for labels.

        Input should be a populated numpy array. Defaults to None.
        """
        self.labels = new_labels

    def set_test_size(self, new_test_size=0.25):
        """
        Modifier method for test_size.

        Input should be a float between 0.0 and 1.0 or None. Defaults to 0.25. The training size will be set to the
        complement of test_size.
        """
        self.test_size = new_test_size

    # Wrappers for SVM classification classes

    def SVC(self,
            C=1.0,
            kernel="rbf",
            degree=3,
            gamma="scale",
            coef0=0.0,
            shrinking=True,
            probability=False,
            tol=0.001,
            cache_size=200,
            class_weight=None,
            verbose=False,
            max_iter=-1,
            decision_function_shape="ovr",
            break_ties=False,
            random_state=None):
        """
        Wrapper for scikit-learn's C-Support Vector Classification implementation.
        Parameters per scikit-learn's documentation:

            – C: Regularization parameter. The strength of the regularization is inversely proportional to C.
            Must be strictly positive. The penalty is a squared l2 penalty. (Default is 1.0)

            – kernel: Specifies the kernel type to be used in the algorithm. It must be one of ‘linear’, ‘poly’, ‘rbf’,
            ‘sigmoid’, ‘precomputed’ or a callable. If none is given, ‘rbf’ will be used. If a callable is given it is
            used to pre-compute the kernel matrix from data matrices; that matrix should be an array of shape
            (n_samples, n_samples). (Default is "rbf")
            
            – degree: Degree of the polynomial kernel function ("poly"). Ignored by all other kernels. (Default is 3)
            
            – gamma: Kernel coefficient for "rbf", "poly", and "sigmoid". If gamma="scale", then it uses
            1 / (n_features * training_samples.var()) as the value of gamma. If gamma="auto", it uses 1 / n_features.
            (Default is "scale")
            
            – coef0: Independent term in kernel function. It is only significant in "poly" and "sigmoid". (Default is 0.0)
            
            – shrinking: Whether to use the shrinking heuristic. (Default is True)
            
            – probability: Whether to enable probability estimates. This must be enabled prior to calling fit, will slow
            down that method as it internally uses 5-fold cross-validation, and predict_proba may be inconsistent with
            predict. (Default is False)
            
            – tol: Tolerance for stopping criterion. (Default is 1e-3, or 0.001)
            
            – cache_size: Specify the size of the kernel cache in MB. (Default is 200)
            
            – class_weight: Set the parameter C of class i to class_weight[i]*C for SVC. If not given, all classes are
            supposed to have weight one. The “balanced” mode uses the values of y to automatically adjust weights
            inversely proportional to class frequencies in the input data as n_samples / (n_classes * np.bincount(y)).
            (Default is None)
            
            – verbose: Enable verbose output. Note that this setting takes advantage of a per-process runtime setting in
            libsvm that, if enabled, may not work properly in a multithreaded context. (Default is False)
            
            – max_iter: Hard limit on iterations within solver, or -1 for no limit. (Default is -1)
            
            – decision_function_shape: Whether to return a one-vs-rest (‘ovr’) decision function of shape
            (n_samples, n_classes) as all other classifiers, or the original one-vs-one (‘ovo’) decision function of
            libsvm which has shape (n_samples, n_classes * (n_classes - 1) / 2). However, one-vs-one (‘ovo’) is always
            used as multi-class strategy. The parameter is ignored for binary classification. (Default is "ovr")
            
            – break_ties: If true, decision_function_shape='ovr', and number of classes > 2, predict will break ties
            according to the confidence values of decision_function; otherwise the first class among the tied classes is
            returned. Please note that breaking ties comes at a relatively high computational cost compared to a simple
            predict. (Default is False)
            
            – random_state: Controls the pseudo random number generation for shuffling the data for probability
            estimates. Ignored when probability is False. Pass an int for reproducible output across multiple function
            calls. (Default is None)

        The implementation is based on libsvm. The fit time scales at least quadratically with the number of samples
        and may be impractical beyond tens of thousands of samples.
        """
        if self._check_inputs():
            # Initialize classifier
            self.classifier_SVC =\
                SVC(C=C, kernel=kernel, degree=degree, gamma=gamma, coef0=coef0, shrinking=shrinking,
                    probability=probability, tol=tol, cache_size=cache_size, class_weight=class_weight, verbose=verbose,
                    max_iter=max_iter, decision_function_shape=decision_function_shape, break_ties=break_ties,
                    random_state=random_state)

            # Split data, if needed; if testing/training sets are still None, call _split_data()
            if self.dataset_X_test is None:
                self._split_data()

            # Train classifier; handle exception if arguments are incorrect
            try:
                self.classifier_SVC.fit(self.dataset_X_train,
                                        self.dataset_y_train)
            except Exception as e:
                print(
                    "An exception occurred while training the SVC model. Check your arguments and try again."
                )
                print("Here is the exception message:")
                print(e)
                self.classifier_SVC = None
                return

            # Evaluate accuracy and ROC-AUC of model using testing set and actual classification
            self.accuracy_SVC = self.classifier_SVC.score(
                self.dataset_X_test, self.dataset_y_test)

            if probability:
                self.roc_auc_SVC = roc_auc_score(
                    self.classifier_SVC.predict(self.dataset_X_test),
                    self.classifier_SVC.predict_proba(self.dataset_X_test)[::,
                                                                           1])

    def nu_SVC(self,
               nu=0.5,
               kernel="rbf",
               degree=3,
               gamma="scale",
               coef0=0.0,
               shrinking=True,
               probability=False,
               tol=0.001,
               cache_size=200,
               class_weight=None,
               verbose=False,
               max_iter=-1,
               decision_function_shape="ovr",
               break_ties=False,
               random_state=None):
        """
        Wrapper for scikit-learn's Nu-Support Vector Classification implementation.
        Per scikit-learn's documentation, NuSVC is similar to SVC, but uses a parameter, nu, to set the number of
        support vectors.
        Parameters per scikit-learn's documentation:

            – nu: An upper bound on the fraction of margin errors and a lower bound of the fraction of support vectors.
            Should be in the interval (0, 1]. (Default is 0.5)
            
            – kernel: Specifies the kernel type to be used in the algorithm. It must be one of ‘linear’, ‘poly’, ‘rbf’,
            ‘sigmoid’, ‘precomputed’ or a callable. If none is given, ‘rbf’ will be used. If a callable is given it is
            used to pre-compute the kernel matrix from data matrices; that matrix should be an array of shape
            (n_samples, n_samples). (Default is "rbf")
            
            – degree: Degree of the polynomial kernel function ("poly"). Ignored by all other kernels. (Default is 3)
            
            – gamma: Kernel coefficient for "rbf", "poly", and "sigmoid". If gamma="scale", then it uses
            1 / (n_features * training_samples.var()) as the value of gamma. If gamma="auto", it uses 1 / n_features.
            (Default is "scale")
            
            – coef0: Independent term in kernel function. It is only significant in "poly" and "sigmoid". (Default is 0.0)
            
            – shrinking: Whether to use the shrinking heuristic. (Default is True)
            
            – probability: Whether to enable probability estimates. This must be enabled prior to calling fit, will slow
            down that method as it internally uses 5-fold cross-validation, and predict_proba may be inconsistent with
            predict. (Default is False)
            
            – tol: Tolerance for stopping criterion. (Default is 1e-3, or 0.001)
            
            – cache_size: Specify the size of the kernel cache in MB. (Default is 200)
            
            – class_weight: Set the parameter C of class i to class_weight[i]*C for SVC. If not given, all classes are
            supposed to have weight one. The “balanced” mode uses the values of y to automatically adjust weights
            inversely proportional to class frequencies in the input data as n_samples / (n_classes * np.bincount(y)).
            (Default is None)
            
            – verbose: Enable verbose output. Note that this setting takes advantage of a per-process runtime setting in
            libsvm that, if enabled, may not work properly in a multithreaded context. (Default is False)
            
            – max_iter: Hard limit on iterations within solver, or -1 for no limit. (Default is -1)
            
            – decision_function_shape: Whether to return a one-vs-rest (‘ovr’) decision function of shape
            (n_samples, n_classes) as all other classifiers, or the original one-vs-one (‘ovo’) decision function of
            libsvm which has shape (n_samples, n_classes * (n_classes - 1) / 2). However, one-vs-one (‘ovo’) is always
            used as multi-class strategy. The parameter is ignored for binary classification. (Default is "ovr")
            
            – break_ties: If true, decision_function_shape='ovr', and number of classes > 2, predict will break ties
            according to the confidence values of decision_function; otherwise the first class among the tied classes is
            returned. Please note that breaking ties comes at a relatively high computational cost compared to a simple
            predict. (Default is False)
            
            – random_state: Controls the pseudo random number generation for shuffling the data for probability
            estimates. Ignored when probability is False. Pass an int for reproducible output across multiple function
            calls. (Default is None)

        The implementation is based on libsvm.
        """
        if self._check_inputs():
            # Initialize classifier
            self.classifier_nu_SVC =\
                NuSVC(nu=nu, kernel=kernel, degree=degree, gamma=gamma, coef0=coef0, shrinking=shrinking,
                      probability=probability, tol=tol, cache_size=cache_size, class_weight=class_weight,
                      verbose=verbose, max_iter=max_iter, decision_function_shape=decision_function_shape,
                      break_ties=break_ties, random_state=random_state)

            # Split data, if needed; if testing/training sets are still None, call _split_data()
            if self.dataset_X_test is None:
                self._split_data()

            # Train classifier; handle exception if arguments are incorrect
            try:
                self.classifier_nu_SVC.fit(self.dataset_X_train,
                                           self.dataset_y_train)
            except Exception as e:
                print(
                    "An exception occurred while training the NuSVC model. Check your arguments and try again."
                )
                print("Here is the exception message:")
                print(e)
                self.classifier_nu_SVC = None
                return

            # Evaluate accuracy and ROC-AUC of model using testing set and actual classification
            self.accuracy_nu_SVC = self.classifier_nu_SVC.score(
                self.dataset_X_test, self.dataset_y_test)

            if probability:
                self.roc_auc_nu_SVC = roc_auc_score(
                    self.classifier_nu_SVC.predict(self.dataset_X_test),
                    self.classifier_nu_SVC.predict_proba(
                        self.dataset_X_test)[::, 1])

    def linear_SVC(self,
                   penalty="l2",
                   loss="squared_hinge",
                   dual=True,
                   tol=0.0001,
                   C=1.0,
                   multi_class='ovr',
                   fit_intercept=True,
                   intercept_scaling=1,
                   class_weight=None,
                   verbose=0,
                   random_state=None,
                   max_iter=1000):
        """
        Wrapper for scikit-learn's Linear Support Vector Classification implementation. Per scikit-learn's documentation,
        LinearSVC is similar to SVC with a linear kernel, but implemented with liblinear instead of libsvm, providing
        more flexibility in choice of penalties and loss functions. LinearSVC should also scale better to large sample
        sizes. LinearSVC supports both dense and sparse input, and the multiclass support is handled according to a
        one-vs-the-rest scheme.
        Parameters per scikit-learn's documentation:

            – penalty: Specifies the norm used in the penalization. The ‘l2’ penalty is the standard used in SVC. The
            ‘l1’ leads to coef_ vectors that are sparse. (Default is "l2")

            – loss: Specifies the loss function. ‘hinge’ is the standard SVM loss (used e.g. by the SVC class) while
            ‘squared_hinge’ is the square of the hinge loss. (Default is "squared_hinge")

            – dual: Select the algorithm to either solve the dual or primal optimization problem.
            Prefer dual=False when n_samples > n_features. (Default is True)
            
            – tol: Tolerance for stopping criteria. (Default is 1e-4, or 0.0001)
            
            – C: Regularization parameter. The strength of the regularization is inversely proportional to C. Must be
            strictly positive. (Default is 1.0)
            
            – multi_class: Determines the multi-class strategy if y contains more than two classes. "ovr" trains
            n_classes one-vs-rest classifiers, while "crammer_singer" optimizes a joint objective over all classes.
            While crammer_singer is interesting from a theoretical perspective as it is consistent, it is seldom used
            in practice as it rarely leads to better accuracy and is more expensive to compute. If "crammer_singer" is
            chosen, the options loss, penalty and dual will be ignored. (Default is "ovr")
            
            – fit_intercept: Whether to calculate the intercept for this model. If set to false, no intercept will be
            used in calculations (i.e. data is expected to be already centered). (Default is True)
            
            – intercept_scaling: When self.fit_intercept is True, instance vector x becomes [x, self.intercept_scaling],
            i.e. a “synthetic” feature with constant value equals to intercept_scaling is appended to the instance
            vector. The intercept becomes intercept_scaling * synthetic feature weight Note! the synthetic feature
            weight is subject to l1/l2 regularization as all other features. To lessen the effect of regularization on
            synthetic feature weight (and therefore on the intercept) intercept_scaling has to be increased.
            (Default is 1)
            
            – class_weight: Set the parameter C of class i to class_weight[i]*C for SVC. If not given, all classes are
            supposed to have weight one. The “balanced” mode uses the values of y to automatically adjust weights
            inversely proportional to class frequencies in the input data as n_samples / (n_classes * np.bincount(y)).
            (Default is None)
            
            – verbose: Enable verbose output. Note that this setting takes advantage of a per-process runtime setting
            in liblinear that, if enabled, may not work properly in a multithreaded context. (Default is 0)
            
            – random_state: Controls the pseudo random number generation for shuffling the data for the dual coordinate
            descent (if dual=True). When dual=False the underlying implementation of LinearSVC is not random and
            random_state has no effect on the results. Pass an int for reproducible output across multiple function
            calls. (Default is None)
            
            – max_iter: The maximum number of iterations to be run. (Default is 1000)
        """
        if self._check_inputs():
            # Initialize classifier
            self.classifier_linear_SVC =\
                LinearSVC(penalty=penalty, loss=loss, dual=dual, tol=tol, C=C, multi_class=multi_class,
                          fit_intercept=fit_intercept, intercept_scaling=intercept_scaling, class_weight=class_weight,
                          verbose=verbose, random_state=random_state, max_iter=max_iter)

            # Split data, if needed; if testing/training sets are still None, call _split_data()
            if self.dataset_X_test is None:
                self._split_data()

            # Train classifier; handle exception if arguments are incorrect
            try:
                self.classifier_linear_SVC.fit(self.dataset_X_train,
                                               self.dataset_y_train)
            except Exception as e:
                print(
                    "An exception occurred while training the LinearSVC model. Check your arguments and try again."
                )
                print("Here is the exception message:")
                print(e)
                self.classifier_linear_SVC = None
                return

            # Evaluate accuracy of model using testing set and actual classification
            self.accuracy_linear_SVC = self.classifier_linear_SVC.score(
                self.dataset_X_test, self.dataset_y_test)

    # Wrappers for SVM regression classes

    def SVR(self,
            kernel='rbf',
            degree=3,
            gamma='scale',
            coef0=0.0,
            tol=0.001,
            C=1.0,
            epsilon=0.1,
            shrinking=True,
            cache_size=200,
            verbose=False,
            max_iter=-1):
        """
        Wrapper for scikit-learn's Epsilon-Support Vector Regression implementation. Per scikit-learn's documentation,
        this implementation is based on libsvm. Scaling to tens of thousands of samples is difficult, as the fit time
        complexity is more than quadratic with the number of samples. For large datasets, consider using LinearSVR by
        calling linear_SVR().
        Parameters per scikit-learn's documentation:

            – kernel: Specifies the kernel type to be used in the algorithm. It must be one of ‘linear’, ‘poly’, ‘rbf’,
            ‘sigmoid’, ‘precomputed’ or a callable. If none is given, ‘rbf’ will be used. If a callable is given it is
            used to pre-compute the kernel matrix from data matrices; that matrix should be an array of shape
            (n_samples, n_samples). (Default is "rbf")
            
            – degree: Degree of the polynomial kernel function ("poly"). Ignored by all other kernels. (Default is 3)
            
            – gamma: Kernel coefficient for "rbf", "poly", and "sigmoid". If gamma="scale", then it uses
            1 / (n_features * training_samples.var()) as the value of gamma. If gamma="auto", it uses 1 / n_features.
            (Default is "scale")
            
            – coef0: Independent term in kernel function. It is only significant in "poly" and "sigmoid". (Default is 0.0)

            – tol: Tolerance for stopping criterion. (Default is 1e-3, or 0.001)

            – C: Regularization parameter. The strength of the regularization is inversely proportional to C.
            Must be strictly positive. The penalty is a squared l2 penalty. (Default is 1.0)

            – epsilon: Epsilon in the epsilon-SVR model. It specifies the epsilon-tube within which no penalty is
            associated in the training loss function with points predicted within a distance epsilon from the actual
            value. (Default is 0.1)

            – shrinking: Whether to use the shrinking heuristic. (Default is True)

            – cache_size: Specify the size of the kernel cache in MB. (Default is 200)

            – verbose: Enable verbose output. Note that this setting takes advantage of a per-process runtime setting in
            libsvm that, if enabled, may not work properly in a multithreaded context. (Default is False)

            – max_iter: Hard limit on iterations within solver, or -1 for no limit. (Default is -1)
        """
        if self._check_inputs():
            # Initialize regression model
            self.regression_SVR =\
                SVR(kernel=kernel, degree=degree, gamma=gamma, coef0=coef0, tol=tol, C=C, epsilon=epsilon,
                    shrinking=shrinking, cache_size=cache_size, verbose=verbose, max_iter=max_iter)

            # Split data, if needed; if testing/training sets are still None, call _split_data()
            if self.dataset_X_test is None:
                self._split_data()

            # Train regression model; handle exception if arguments are incorrect and/or if labels isn't
            # quantitative data
            try:
                self.regression_SVR.fit(self.dataset_X_train,
                                        self.dataset_y_train)
            except Exception as e:
                print(
                    "An exception occurred while training the SVR model. Check you arguments and try again."
                )
                print("Does labels only contain quantitative data?")
                print("Here is the exception message:")
                print(e)
                self.regression_SVR = None
                return

            # Get coefficient of determination for model
            self.r2_score_SVR = self.regression_SVR.score(
                self.dataset_X_test, self.dataset_y_test)
            self.r_score_SVR = sqrt(self.r2_score_SVR)

    def nu_SVR(self,
               nu=0.5,
               C=1.0,
               kernel='rbf',
               degree=3,
               gamma='scale',
               coef0=0.0,
               shrinking=True,
               tol=0.001,
               cache_size=200,
               verbose=False,
               max_iter=-1):
        """
        Wrapper for scikit-learn's Nu Support Vector Regression implementation. Per scikit-learn's documentation,
        NuSVR uses the parameter nu to control the number of support vectors, similar to NuSVC. Yet unlike NuSVC,
        nu replaces the parameter epsilon of epsilon-SVR, not C. This implementation is based on libsvm.
        Parameters per scikit-learn's documentation:

            – nu: An upper bound on the fraction of margin errors and a lower bound of the fraction of support vectors.
            Should be in the interval (0, 1]. (Default is 0.5)
            
            – C: Regularization parameter. The strength of the regularization is inversely proportional to C.
            Must be strictly positive. The penalty is a squared l2 penalty. (Default is 1.0)

            – kernel: Specifies the kernel type to be used in the algorithm. It must be one of ‘linear’, ‘poly’, ‘rbf’,
            ‘sigmoid’, ‘precomputed’ or a callable. If none is given, ‘rbf’ will be used. If a callable is given it is
            used to pre-compute the kernel matrix from data matrices; that matrix should be an array of shape
            (n_samples, n_samples). (Default is "rbf")
            
            – degree: Degree of the polynomial kernel function ("poly"). Ignored by all other kernels. (Default is 3)
            
            – gamma: Kernel coefficient for "rbf", "poly", and "sigmoid". If gamma="scale", then it uses
            1 / (n_features * training_samples.var()) as the value of gamma. If gamma="auto", it uses 1 / n_features.
            (Default is "scale")
            
            – coef0: Independent term in kernel function. It is only significant in "poly" and "sigmoid". (Default is 0.0)
            
            – shrinking: Whether to use the shrinking heuristic. (Default is True)
                        
            – tol: Tolerance for stopping criterion. (Default is 1e-3, or 0.001)
            
            – cache_size: Specify the size of the kernel cache in MB. (Default is 200)
            
            – verbose: Enable verbose output. Note that this setting takes advantage of a per-process runtime setting in
            libsvm that, if enabled, may not work properly in a multithreaded context. (Default is False)
            
            – max_iter: Hard limit on iterations within solver, or -1 for no limit. (Default is -1)
        """
        if self._check_inputs():
            # Initialize regression model
            self.regression_nu_SVR =\
                NuSVR(nu=nu, C=C, kernel=kernel, degree=degree, gamma=gamma, coef0=coef0, shrinking=shrinking, tol=tol,
                      cache_size=cache_size, verbose=verbose, max_iter=max_iter)

            # Split data, if needed; if testing/training sets are still None, call _split_data()
            if self.dataset_X_test is None:
                self._split_data()

            # Train regression model; handle exception if arguments are incorrect and/or if labels isn't
            # quantitative data
            try:
                self.regression_nu_SVR.fit(self.dataset_X_train,
                                           self.dataset_y_train)
            except Exception as e:
                print(
                    "An exception occurred while training the NuSVR model. Check you arguments and try again."
                )
                print("Does labels only contain quantitative data?")
                print("Here is the exception message:")
                print(e)
                self.regression_nu_SVR = None
                return

            # Get coefficient of determination for model
            self.r2_score_nu_SVR = self.regression_nu_SVR.score(
                self.dataset_X_test, self.dataset_y_test)
            self.r_score_nu_SVR = sqrt(self.r2_score_nu_SVR)

    def linear_SVR(self,
                   epsilon=0.0,
                   tol=0.0001,
                   C=1.0,
                   loss='epsilon_insensitive',
                   fit_intercept=True,
                   intercept_scaling=1.0,
                   dual=True,
                   verbose=0,
                   random_state=None,
                   max_iter=1000):
        """
        Wrapper for scikit-learn's Linear Support Vector Regression implementation. Per scikit-learn's documentation,
        LinearSVR is similar to SVR with a linear kernel, but is implemented with liblinear instead of libsvm. This
        provides greater flexibility in choice of penalties and loss functions, and should scale better to large sample
        sizes. LinearSVR supports both dense and sparse input.
        Parameters per scikit-learn's documentation:

            – epsilon: Epsilon in the epsilon-SVR model. It specifies the epsilon-tube within which no penalty is
            associated in the training loss function with points predicted within a distance epsilon from the actual
            value. (Default is 0.0)

            – tol: Tolerance for stopping criterion. (Default is 1e-4, or 0.0001)

            – C: Regularization parameter. The strength of the regularization is inversely proportional to C.
            Must be strictly positive. The penalty is a squared l2 penalty. (Default is 1.0)

            – loss: Specifies the loss function. The epsilon-insensitive loss (standard SVR) is the L1 loss, while the
            squared epsilon-insensitive loss (‘squared_epsilon_insensitive’) is the L2 loss.
            (Default is "epsilon_insensitive")

            – fit_intercept: Whether to calculate the intercept for this model. If set to false, no intercept will be
            used in calculations (i.e. data is expected to be already centered). (Default is True)
            
            – intercept_scaling: When self.fit_intercept is True, instance vector x becomes [x, self.intercept_scaling],
            i.e. a “synthetic” feature with constant value equals to intercept_scaling is appended to the instance
            vector. The intercept becomes intercept_scaling * synthetic feature weight Note! the synthetic feature
            weight is subject to l1/l2 regularization as all other features. To lessen the effect of regularization on
            synthetic feature weight (and therefore on the intercept) intercept_scaling has to be increased.
            (Default is 1)

            – dual: Select the algorithm to either solve the dual or primal optimization problem.
            Prefer dual=False when n_samples > n_features. (Default is True)

            – verbose: Enable verbose output. Note that this setting takes advantage of a per-process runtime setting
            in liblinear that, if enabled, may not work properly in a multithreaded context. (Default is 0)
            
            – random_state: Controls the pseudo random number generation for shuffling the data for the dual coordinate
            descent (if dual=True). When dual=False the underlying implementation of LinearSVR is not random and
            random_state has no effect on the results. Pass an int for reproducible output across multiple function
            calls. (Default is None)
            
            – max_iter: The maximum number of iterations to be run. (Default is 1000)
        """
        if self._check_inputs():
            # Initialize regression model
            self.regression_linear_SVR =\
                LinearSVR(epsilon=epsilon, tol=tol, C=C, loss=loss, fit_intercept=fit_intercept,
                          intercept_scaling=intercept_scaling, dual=dual, verbose=verbose, random_state=random_state,
                          max_iter=max_iter)

            # Split data, if needed; if testing/training sets are still None, call _split_data()
            if self.dataset_X_test is None:
                self._split_data()

            # Train regression model; handle exception if arguments are incorrect and/or labels isn't
            # quantitative data
            try:
                self.regression_linear_SVR.fit(self.dataset_X_train,
                                               self.dataset_y_train)
            except Exception as e:
                print(
                    "An exception occurred while training the LinearSVR model. Check you arguments and try again."
                )
                print("Does labels only contain quantitative data?")
                print("Here is the exception message:")
                print(e)
                self.regression_linear_SVR = None
                return

            # Get coefficient of determination and correlation coefficient for model
            self.r2_score_linear_SVR = self.regression_linear_SVR.score(
                self.dataset_X_test, self.dataset_y_test)
            self.r_score_linear_SVR = sqrt(self.r2_score_linear_SVR)

    # Helper methods

    def _split_data(self):
        """
        Helper method for splitting attributes and labels into training and testing sets.

        This method runs under the assumption that all relevant instance data has been checked for correctness.
        """

        self.dataset_X_train, self.dataset_X_test, self.dataset_y_train, self.dataset_y_test =\
            train_test_split(self.attributes, self.labels, test_size=self.test_size)

    def _check_inputs(self):
        """
        Verifies if instance data is ready for use in SVM model.
        """

        # Check if attributes exists
        if self.attributes is None:
            print(
                "attributes is missing; call set_attributes(new_attributes) to fix this! new_attributes should be a",
                "populated dataset of independent variables.")
            return False

        # Check if labels exists
        if self.labels is None:
            print(
                "labels is missing; call set_labels(new_labels) to fix this! new_labels should be a populated dataset",
                "of classes.")
            return False

        # Check if attributes and labels have same number of rows (samples)
        if self.attributes.shape[0] != self.labels.shape[0]:
            print(
                "attributes and labels don't have the same number of rows. Make sure the number of samples in each",
                "dataset matches!")
            return False

        # Check if test_size is a number
        if self.test_size is not None and not isinstance(
                self.test_size, (int, float)):
            print(
                "test_size must be None or a number; call set_test_size(new_test_size) to fix this!"
            )
            return False

        return True
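
# --- Hedged usage sketch (added; not part of the original module): exercising
# the classification wrappers on scikit-learn's iris data. Assumes the imports
# the class itself relies on (SVC, NuSVC, LinearSVC, train_test_split, ...)
# are present at the top of the module.
if __name__ == '__main__':
    from sklearn.datasets import load_iris
    iris = load_iris()
    svm_demo = SVM(attributes=iris.data, labels=iris.target, test_size=0.3)
    svm_demo.SVC()
    svm_demo.nu_SVC()
    svm_demo.linear_SVC()
    print('SVC accuracy:      ', svm_demo.get_accuracy_SVC())
    print('NuSVC accuracy:    ', svm_demo.get_accuracy_nu_SVC())
    print('LinearSVC accuracy:', svm_demo.get_accuracy_linear_SVC())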
Code example #55
def SklearnNuSVC():
    return NltkClassifierWrapper(SklearnClassifier(NuSVC(probability=True)))
Code example #56
    def nu_SVC(self,
               nu=0.5,
               kernel="rbf",
               degree=3,
               gamma="scale",
               coef0=0.0,
               shrinking=True,
               probability=False,
               tol=0.001,
               cache_size=200,
               class_weight=None,
               verbose=False,
               max_iter=-1,
               decision_function_shape="ovr",
               break_ties=False,
               random_state=None):
        """
        Wrapper for scikit-learn's Nu-Support Vector Classification implementation.
        Per scikit-learn's documentation, NuSVC is similar to SVC, but uses a parameter, nu, to set the number of
        support vectors.
        Parameters per scikit-learn's documentation:

            – nu: An upper bound on the fraction of margin errors and a lower bound of the fraction of support vectors.
            Should be in the interval (0, 1]. (Default is 0.5)
            
            – kernel: Specifies the kernel type to be used in the algorithm. It must be one of ‘linear’, ‘poly’, ‘rbf’,
            ‘sigmoid’, ‘precomputed’ or a callable. If none is given, ‘rbf’ will be used. If a callable is given it is
            used to pre-compute the kernel matrix from data matrices; that matrix should be an array of shape
            (n_samples, n_samples). (Default is "rbf")
            
            – degree: Degree of the polynomial kernel function ("poly"). Ignored by all other kernels. (Default is 3)
            
            – gamma: Kernel coefficient for "rbf", "poly", and "sigmoid". If gamma="scale", then it uses
            1 / (n_features * training_samples.var()) as the value of gamma. If gamma="auto", it uses 1 / n_features.
            (Default is "scale")
            
            – coef0: Independent term in kernel function. It is only significant in "poly" and "sigmoid". (Default is 0.0)
            
            – shrinking: Whether to use the shrinking heuristic. (Default is True)
            
            – probability: Whether to enable probability estimates. This must be enabled prior to calling fit, will slow
            down that method as it internally uses 5-fold cross-validation, and predict_proba may be inconsistent with
            predict. (Default is False)
            
            – tol: Tolerance for stopping criterion. (Default is 1e-3, or 0.001)
            
            – cache_size: Specify the size of the kernel cache in MB. (Default is 200)
            
            – class_weight: Set the parameter C of class i to class_weight[i]*C for SVC. If not given, all classes are
            supposed to have weight one. The “balanced” mode uses the values of y to automatically adjust weights
            inversely proportional to class frequencies in the input data as n_samples / (n_classes * np.bincount(y)).
            (Default is None)
            
            – verbose: Enable verbose output. Note that this setting takes advantage of a per-process runtime setting in
            libsvm that, if enabled, may not work properly in a multithreaded context. (Default is False)
            
            – max_iter: Hard limit on iterations within solver, or -1 for no limit. (Default is -1)
            
            – decision_function_shape: Whether to return a one-vs-rest (‘ovr’) decision function of shape
            (n_samples, n_classes) as all other classifiers, or the original one-vs-one (‘ovo’) decision function of
            libsvm which has shape (n_samples, n_classes * (n_classes - 1) / 2). However, one-vs-one (‘ovo’) is always
            used as multi-class strategy. The parameter is ignored for binary classification. (Default is "ovr")
            
            – break_ties: If true, decision_function_shape='ovr', and number of classes > 2, predict will break ties
            according to the confidence values of decision_function; otherwise the first class among the tied classes is
            returned. Please note that breaking ties comes at a relatively high computational cost compared to a simple
            predict. (Default is False)
            
            – random_state: Controls the pseudo random number generation for shuffling the data for probability
            estimates. Ignored when probability is False. Pass an int for reproducible output across multiple function
            calls. (Default is None)

        The implementation is based on libsvm.
        """
        if self._check_inputs():
            # Initialize classifier
            self.classifier_nu_SVC =\
                NuSVC(nu=nu, kernel=kernel, degree=degree, gamma=gamma, coef0=coef0, shrinking=shrinking,
                      probability=probability, tol=tol, cache_size=cache_size, class_weight=class_weight,
                      verbose=verbose, max_iter=max_iter, decision_function_shape=decision_function_shape,
                      break_ties=break_ties, random_state=random_state)

            # Split data, if needed; if testing/training sets are still None, call _split_data()
            if self.dataset_X_test is None:
                self._split_data()

            # Train classifier; handle exception if arguments are incorrect
            try:
                self.classifier_nu_SVC.fit(self.dataset_X_train,
                                           self.dataset_y_train)
            except Exception as e:
                print(
                    "An exception occurred while training the NuSVC model. Check your arguments and try again."
                )
                print("Here is the exception message:")
                print(e)
                self.classifier_nu_SVC = None
                return

            # Evaluate accuracy and ROC-AUC of model using testing set and actual classification
            self.accuracy_nu_SVC = self.classifier_nu_SVC.score(
                self.dataset_X_test, self.dataset_y_test)

            if probability:
                # ROC-AUC is computed from the true test labels and the positive-class probabilities
                self.roc_auc_nu_SVC = roc_auc_score(
                    self.dataset_y_test,
                    self.classifier_nu_SVC.predict_proba(
                        self.dataset_X_test)[:, 1])
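
A standalone sketch of the same flow with plain scikit-learn, shown mainly to illustrate how the ROC-AUC is computed against the held-out labels (the breast-cancer toy dataset and variable names here are assumptions for illustration, not part of the wrapper above):

from sklearn.datasets import load_breast_cancer
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.svm import NuSVC

X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

clf = NuSVC(nu=0.5, kernel="rbf", gamma="scale", probability=True)
clf.fit(X_train, y_train)

print("accuracy:", clf.score(X_test, y_test))
# ROC-AUC uses the true held-out labels and the probability of the positive class
print("roc auc:", roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1]))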
コード例 #57
0
svc_new = SVC(probability=True, C=.000001, kernel='poly', gamma=4,
                  degree=4)
svc_new.fit(train_x_reduced, train_y_practice)
print svc_new.score(test_x_reduced, test_y_practice)
"""
"""
parameters = {'degree':(1, 3, 6)}
svclass = NuSVC(kernel='poly', probability=True, gamma=0, nu=.5852, tol=.00001)
clf = GridSearchCV(svclass, parameters, cv=10)
clf.fit(train_x_reduced, train_y_practice)
print "SVC"
print clf.best_estimator_
print clf.best_score_
print clf.best_params_
"""
svc_new = NuSVC(kernel='poly', probability=True, gamma=0, nu=.5852, tol=.00001)
svc_new.fit(train_x_reduced, train_y_practice)
print svc_new.score(test_x_reduced, test_y_practice)


print 'Predicting'
estimator = SelectKBest(score_func=f_classif, k=components)
estimator.fit(train_x, train_y_leaderboard)
train_x_reduced = estimator.transform(train_x)
test_x_reduced = estimator.transform(test_x)
print train_x.shape
print train_x_reduced.shape

#svc_new = SVC(probability=True, C=.000001, kernel='poly', gamma=4,
#                  degree=4)
svc_new = NuSVC(kernel='poly', probability=True, gamma=0, nu=.5852, tol=.00001)
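
Newer scikit-learn releases no longer accept a literal gamma=0 (gamma must be "scale", "auto", or a positive float), so a modernized sketch of the commented-out grid search above might look like the following; the synthetic data stands in for train_x_reduced and train_y_practice, which are not defined in this excerpt:

from sklearn.datasets import make_classification
from sklearn.model_selection import GridSearchCV
from sklearn.svm import NuSVC

X, y = make_classification(n_samples=200, n_features=10, random_state=0)

# Search over the polynomial degree, keeping the other NuSVC settings fixed
param_grid = {"degree": (1, 3, 6)}
svclass = NuSVC(kernel="poly", probability=True, gamma="scale", nu=0.5852, tol=1e-5)
clf = GridSearchCV(svclass, param_grid, cv=10)
clf.fit(X, y)
print(clf.best_params_, clf.best_score_)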
コード例 #58
0
    standard_scaler = StandardScaler()
    scaled_data = pca_scaler.fit_transform(train_data[cols])
    scaled_data = standard_scaler.fit_transform(scaled_data)
    scaled_data = np.append(scaled_data, partition_column, 1)

    train_models_data, train_stack_data, train_models_y, train_stack_y = train_test_split(
        scaled_data, train_y, test_size=0.1, random_state=42)
    train_models_y = train_models_y.reset_index(drop=True)

    clf_svnu, clf_knn, clf_lr, clf_mlp, clf_svc = [], [], [], [], []
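    # One classifier of each type is created per data partition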
    for _ in range(NUM_PARTITIONS):
        clf_svnu.append(
            NuSVC(probability=True,
                  kernel='poly',
                  degree=4,
                  gamma='auto',
                  random_state=4,
                  nu=0.59,
                  coef0=0.053))
        clf_knn.append(neighbors.KNeighborsClassifier(n_neighbors=17, p=2.9))
        clf_lr.append(
            linear_model.LogisticRegression(solver='saga', penalty='l1',
                                            C=0.1))
        clf_mlp.append(
            neural_network.MLPClassifier(random_state=3,
                                         activation='relu',
                                         solver='lbfgs',
                                         tol=1e-06,
                                         hidden_layer_sizes=(250, )))
        clf_svc.append(
            svm.SVC(probability=True,
コード例 #59
0
SGDClassifier_classifier = SklearnClassifier(SGDClassifier())
SGDClassifier_classifier.train(training_set)
print("SGDClassifier_classifier accuracy percent:",
      (nltk.classify.accuracy(SGDClassifier_classifier, testing_set)) * 100)

SVC_classifier = SklearnClassifier(SVC())
SVC_classifier.train(training_set)
print("SVC_classifier accuracy percent:",
      (nltk.classify.accuracy(SVC_classifier, testing_set)) * 100)

LinearSVC_classifier = SklearnClassifier(LinearSVC())
LinearSVC_classifier.train(training_set)
print("LinearSVC_classifier accuracy percent:",
      (nltk.classify.accuracy(LinearSVC_classifier, testing_set)) * 100)

NuSVC_classifier = SklearnClassifier(NuSVC())
NuSVC_classifier.train(training_set)
print("NuSVC_classifier accuracy percent:",
      (nltk.classify.accuracy(NuSVC_classifier, testing_set)) * 100)

# In[39]:

rev.info()

# In[76]:

rev_new = rev[:981166]
rev_new.head()

# In[77]:
コード例 #60
0
def fd_svm(train, test, ytrain, ytest):
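    # Train a NuSVC with default hyperparameters and return its mean accuracy on the held-out split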
    clf = NuSVC()
    clf.fit(train, ytrain)
    return clf.score(test, ytest)