Example #1
class NormalSVCTrainer(AbstractLearner):
    def __init__(self, kernel='linear', gamma='auto', penalty=1.0, cache=200, scale=True, scheme='ovr', class_w='balanced'):
        self.learner = SVC(C=penalty, kernel=kernel, gamma=gamma, probability=True, cache_size=cache, decision_function_shape=scheme,
                           class_weight=class_w)
        self.kernel = kernel
        self.gamma = gamma
        self.penalty = penalty
        self.scheme = scheme
        self.scale = scale

    def _train(self, x_train, y_train):
        if self.scale:
            self.scaler = preprocessing.StandardScaler().fit(x_train)
            x_scaled = self.scaler.transform(x_train)
            self.learner = self.learner.fit(x_scaled, y_train)
        else:
            self.learner = self.learner.fit(x_train, y_train)

    def _predict(self, x):
        if self.scale:
            x_scaled = self.scaler.transform(x)
            return self.learner.predict(x_scaled)
        else:
            return self.learner.predict(x)

    def _predict_proba(self, x):
        if self.scale:
            x_scaled = self.scaler.transform(x)
            return self.learner.predict_proba(x_scaled)
        else:
            return self.learner.predict_proba(x)

    def __str__(self):
        return 'SVC (kernel=%s, penalty=%f, scheme=%s, gamma=%s)' % \
               (self.kernel, self.penalty, self.scheme, self.gamma)
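A minimal usage sketch for the trainer above (hypothetical data; assumes the AbstractLearner base requires nothing beyond the hooks shown):

import numpy as np

trainer = NormalSVCTrainer(kernel='rbf', gamma='auto', penalty=10.0)
X = np.random.rand(100, 5)             # 100 samples, 5 features
y = np.random.randint(0, 2, size=100)  # binary labels
trainer._train(X, y)                   # fits the scaler, then the SVC
probs = trainer._predict_proba(X)      # (100, 2) class-probability matrix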
Example #2
def pipeline(iteration, C, gamma, random_seed):
    x_train, _x, y_train, _y = train_test_split(train_x, train_y, test_size=0.4, random_state=random_seed)
    print(x_train.shape)
    clf = SVC(
        C=C,
        kernel="rbf",
        gamma=gamma,
        probability=True,
        cache_size=7000,
        class_weight="balanced",
        verbose=True,
        random_state=random_seed,
    )
    clf.fit(x_train, y_train)
    # predict test set
    pred = clf.predict_proba(test_x)
    test_result = pd.DataFrame(columns=["Idx", "score"])
    test_result.Idx = test_Idx
    test_result.score = pred[:, 1]
    test_result.to_csv("./test/svm_{0}.csv".format(iteration), index=None)
    # predict val set
    pred = clf.predict_proba(val_x)
    val_result = pd.DataFrame(columns=["Idx", "score"])
    val_result.Idx = val_Idx
    val_result.score = pred[:, 1]
    val_result.to_csv("./val/svm_{0}.csv".format(iteration), index=None)
Example #3
def svm_grid_search():

	#get data
	training_input,training_target,validation_input,validation_target = prepare_input()

	#set up scorer for grid search. log-loss is error, not score, so set greater_is_better to false,
	#and log-loss requires a probability
	log_loss_scorer = make_scorer(log_loss,greater_is_better=False,needs_proba=True)

	training_input = training_input[:100000]
	training_target = training_target[:100000]

	print(training_input.shape[0])
	print(training_target.shape[0])

	start = time.time()
	svm = SVC(random_state=31,probability=True)
	
	
	svm_parameters = {'C':[.001,.01,.1,1,10,100],'kernel':["rbf","sigmoid"]}
	svm_grid_obj = GridSearchCV(svm,svm_parameters,log_loss_scorer,verbose=2,n_jobs=-1)
	svm_grid_obj = svm_grid_obj.fit(training_input,training_target)
	svm = svm_grid_obj.best_estimator_
	print "Best params: " + str(svm_grid_obj.best_params_)	
	svm_train_error = log_loss(training_target,svm.predict_proba(training_input))
	svm_validation_error = log_loss(validation_target,svm.predict_proba(validation_input))
	print "Best SVM training error: {:02.4f}".format(svm_train_error)
	print "Best SVM validation error: {:02.4f}".format(svm_validation_error)
	end = time.time()
	print "RF grid search took {:02.4f} seconds".format(end-start)

	return svm
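For reference, recent scikit-learn releases ship a built-in 'neg_log_loss' scoring string, so the custom scorer above can likely be replaced with a one-liner (a sketch, not verified against every version):

svm_grid_obj = GridSearchCV(svm, svm_parameters, scoring='neg_log_loss', verbose=2, n_jobs=-1)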
Example #4
def svm_solver(train_data, train_label, validation, test, dimreduce, convertbinary):
    """Train an SVM classifier and return positive-class probabilities for the test set."""
    logging.info ('begin to train the svm classifier')

    # train_data = train_data[:100,:]
    # validation = validation[:100,:]
    # test = test[:100,:]
    # train_label = train_label[:100]
    train_data, validation, test = dimreduce(train_data, train_label, validation, test)
    # print new_train_data.shape
    train_data, validation, test = convertbinary(train_data, validation, test)

    """
    svc = SVC ()
    params_rbf = {"kernel": ['rbf'],
             "class_weight": ['auto'],
             "C": [0.1 ,0.2 ,0.3 ,0.5 ,1, 2, 3, 5, 10],
             "gamma": [0.01, 0.03,  0.05, 0.1, 0.2, 0.3, 0.5],
             "tol": 10.0** -np.arange(1, 5),
             "random_state": [1000000007]}
    logging.info ("Hyperparameter opimization using RandomizedSearchCV...")
    rand_search_result = RandomizedSearchCV (svc, param_distributions = params_rbf, n_jobs = -1, cv = 3, n_iter = 30)
    # rand_search_result = GridSearchCV (svc , param_grid = params_rbf , n_jobs = 8  , cv = 3)
    rand_search_result.fit (train_data , train_label)
    params = tools.report (rand_search_result.grid_scores_)
    """
    params = {'kernel': 'poly', 'C': 0.1, 'random_state': 1000000007, 'tol': 0.001, 'gamma': 0.1, 'class_weight': 'balanced'}  # 'auto' was removed from scikit-learn
    svc = SVC (probability = True, **params)

    svc.fit (train_data , train_label)
    evaluate.get_auc (svc.predict_proba (validation)[:,1])
    return svc.predict_proba (test)[:,1]
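The commented-out search above targets a long-removed scikit-learn API (grid_scores_, class_weight='auto'). A sketch of the same idea against the modern API (parameter space copied from above; everything else is an assumption):

import numpy as np
from sklearn.model_selection import RandomizedSearchCV
from sklearn.svm import SVC

params_rbf = {"kernel": ["rbf"],
              "class_weight": ["balanced"],
              "C": [0.1, 0.2, 0.3, 0.5, 1, 2, 3, 5, 10],
              "gamma": [0.01, 0.03, 0.05, 0.1, 0.2, 0.3, 0.5],
              "tol": (10.0 ** -np.arange(1, 5)).tolist()}
search = RandomizedSearchCV(SVC(), param_distributions=params_rbf,
                            n_jobs=-1, cv=3, n_iter=30, random_state=1000000007)
# search.fit(train_data, train_label)
# params = search.best_params_   # replaces tools.report(rand_search_result.grid_scores_)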
Example #5
def main():
    ap = argparse.ArgumentParser()
    ap.add_argument("-i", "--image", required = True, help = "Path to the image")
    args = vars(ap.parse_args())

    image = cv2.imread(args["image"])
    rects, img = detect(image)

    cropped = []

    for idx, (x1, y1, x2, y2) in enumerate(rects):
        crop_img = image[y1:y1 + (y2 - y1), x1:x1 + (x2 - x1)]
        crop_img = cv2.resize(crop_img, (100,100), interpolation = cv2.INTER_AREA)
        cv2.imshow("image" + str(idx), crop_img)
        new_img = crop_img.reshape(crop_img.shape[0] * crop_img.shape[1], 3)
        cropped.append(new_img.flatten())

    # reduce feature size
    cropped_pca = []
    pca = PCA(n_components=100, svd_solver='randomized')  # RandomizedPCA was removed from scikit-learn
    cropped_pca = pca.fit_transform(cropped)

    # training (hardcoded for now)
    clf   = SVC(probability=True)
    train = cropped_pca[:7]
    test  = cropped_pca[7:13]
    # clf.fit([[0,0],[1,1]], [1, 2])
    clf.fit(train, [1,2,2,1,2,1,1])

    for item in test:
        print(clf.predict_proba(item.reshape(1, -1)))  # sklearn expects a 2-D array
        print(clf.predict(item.reshape(1, -1)))

    cv2.waitKey(0)
Example #6
    def go_by_category_2(category):
        input, targets, scaler = TrainingFactory.get_training_data_by_category(category,10000)
        input_train, input_test, target_train, target_test = train_test_split(input, targets, test_size=0.1)

        test_data_sparse = TestingFactory.get_test_data(limit=1000)
        test_data_scaled = scaler.transform(test_data_sparse)
        test_data = csr_matrix(test_data_scaled)

        classif = SVC(kernel='rbf',C=0.1, tol=0.001, probability=True)
        classif.fit(input_train, target_train)

        output_targets_proba = classif.predict_proba(input_test)

        outputs_predicted_proba = [item[1] for item in output_targets_proba]
        output_targets = classif.predict(input_test)

        # print output_targets.tolist()
        # print outputs_predicted_proba
        # print target_test

        print(log_loss(target_test, output_targets))
        accuracy = accuracy_score(target_test, output_targets)
        print(accuracy)
        print(confusion_matrix(target_test, output_targets))


        testing_output = classif.predict_proba(test_data)
        testing_output_proba = [item[1] for item in testing_output]
        print(testing_output_proba)

        return accuracy, output_targets, testing_output_proba
Example #7
def svm_classify(threshold):
    
    data=pd.DataFrame()
    i=0
    xprev=0
    xprev2=0
    for x in cot.columns[:-1]:
        data[x]=cot[x]/cot[x].rolling(5).mean()  # pd.rolling_mean was removed from pandas
        data[x+'_polynomial2']=data[x]*data[x]
        data[x+'_polynomial3']=data[x]*data[x]*data[x]
        if (xprev!=0):
            data[x+'_polynomial_x_2']=data[x]*data[xprev]
        if (xprev2!=0):
            data[x+'_polynomial_x_3']=data[x]*data[xprev2]*data[xprev]
        i=i+1
        xprev2=xprev  # record the second-previous column before overwriting xprev
        xprev=x
    
    data['return']=((brent.shift(-5).Rate/brent.shift(-1).Rate)-1)>0
    data=data[8:].dropna(axis=1)
    x_train, x_test, y_train, y_test = train_test_split(data.iloc[:-1,:-1], data.iloc[:-1,-1], test_size=0.5)
    gbc=SVC (kernel='rbf',probability=True,C=1)
    gbc.fit(x_train,y_train)
    # scale the maximum class probability so it can be used as a confidence score
    min_max_scaler=MinMaxScaler()
    mms=min_max_scaler.fit([[max(a)] for a in gbc.predict_proba(x_train)])  # fit() needs a 2-D array
    pr=[max(a) for a in gbc.predict_proba(x_test)]
    Y=pd.DataFrame()
    Y['actual']=y_test
    Y['predicted']=gbc.predict(x_test)
    Y['P']=mms.transform([[p] for p in pr]).ravel()
    Y_filtered=Y[Y.P>threshold]
    cm=confusion_matrix(Y_filtered.actual,Y_filtered.predicted)
    return [gbc.score(x_test,y_test,[p>threshold for p in pr]),cm,
            'Prediction of UP is %s; P = %s' %(gbc.predict(data.iloc[-1:,:-1])[0],
             max(gbc.predict_proba(data.iloc[-1:,:-1])[0])
             ),brent]
Example #8
    def grid_searcher(self):
        X_train, X_test, Y_train, Y_test = self.cv_data[-1]
        X_train = np.vstack((X_train, X_test))
        Y_train = np.concatenate((Y_train, Y_test))
        stratifiedCV = list(StratifiedKFold(n_splits=10).split(X_train, Y_train))  # modern API; folds materialized so they can be reused across the grid

        ansDict = {}
        ansDict["train"] = {}
        ansDict["test"] = {}

        C_range = 10.0 ** np.arange(-4, 9)
        gamma_range = 10.0 ** np.arange(-5, 4)
        for ind, i in enumerate(C_range):
            for jnd, j in enumerate(gamma_range):
                # Cantor's pairs
                dictInd = ((ind + jnd + 2) ** 2 + (ind + 1) - (jnd + 1)) // 2  # integer division keeps the keys integral
                ansDict["train"][dictInd] = []
                ansDict["test"][dictInd] = []
                for train, test in stratifiedCV:
                    X_trainT, X_testT, Y_trainT, Y_testT = (
                        X_train[train, :],
                        X_train[test, :],
                        Y_train[train],   # Y_train is 1-D after np.concatenate
                        Y_train[test],
                    )
                    svc = SVC(kernel="rbf", C=i, gamma=j, probability=True, class_weight="auto")
                    svc.fit(X_trainT, Y_trainT)
                    ansDict["train"][dictInd].append(logloss(Y_trainT, svc.predict_proba(X_trainT)[:, 1]))
                    ansDict["test"][dictInd].append(svc.predict_proba(self.testMat)[:, 1])

        meanScores = []
        for i, j in ansDict["train"].items():
            wut = np.array(j)
            meanScores.append(wut.mean())

        meanScores = np.array(meanScores)
        meanScores[meanScores < 0] = 1.0
        print(meanScores.min())
        paramGood = np.where(meanScores == meanScores.min())[0][0]
        testPred = ansDict["test"][paramGood]
        finalPred = np.vstack(testPred).mean(axis=0)

        def write_prediction(f):
            g = open("sc_prediction.csv", "w")
            for i in f:
                g.write(str(i) + "\n")
            g.close()

        write_prediction(finalPred)
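A quick standalone check that the pairing used for dictInd is injective over the 13x9 (C, gamma) grid (illustrative only):

keys = {((i + j + 2) ** 2 + (i + 1) - (j + 1)) // 2
        for i in range(13) for j in range(9)}
assert len(keys) == 13 * 9  # every (C, gamma) combination gets a unique key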
Example #9
def svcmodel(d,X_2,y_2,X_3,y_3,X_test,y_test):
    X_3_copy = X_3.copy(deep=True)
    X_3_copy['chance']=0
    index = 0    
    
########## k-fold cross-validation ###########################
    scores = cross_val_score(SVC(), X_2, y_2, cv=5, scoring='accuracy')
    score_mean =scores.mean()
    print(d+' 5-fold CV accuracy: '+str(score_mean))
#################################################
    
    svc = SVC(probability=True).fit(X_2,y_2)

################ predict on the test set ################
    answer_svc = svc.predict(X_test)
    accuracy = metrics.accuracy_score(y_test,answer_svc)
    print(d+' test accuracy: '+str(accuracy))
###############################################
    
    chance = svc.predict_proba(X_3)[:,1]
    for c in chance:
        X_3_copy.iloc[index,len(X_3_copy.columns)-1]=c
        index += 1
    chance_que = X_3_copy.iloc[:,len(X_3_copy.columns)-1]
    return chance_que
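The row-by-row fill of the 'chance' column above can likely be collapsed into a single vectorized assignment (a sketch with the same semantics):

X_3_copy['chance'] = svc.predict_proba(X_3)[:, 1]
chance_que = X_3_copy['chance']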
Example #10
class LinearSVMPredictor(PredictorBase):
    '''
    Linear SVM
    '''

    def __init__(self, animal_type):
        self.animal_type = animal_type
        self.clf = SVC(
            kernel="linear", C=1.0, probability=True, random_state=0)

    def fit(self, X_train, y_train):
        self.clf.fit(X_train, y_train)

    def predict(self, X_test):
        predictions = self.clf.predict_proba(X_test)
        predictions_df = self.bundle_predictions(predictions)

        return predictions_df

    def find_best_params(self):
        parameters = {'kernel': ["linear"], 'C': [0.025, 1.0]}
        svc = SVC()
        clf = GridSearchCV(svc, parameters)  # sklearn.grid_search was removed; use sklearn.model_selection.GridSearchCV
        train_data = get_data('../data/train.csv')
        train_data = select_features(train_data, self.animal_type)
        X = train_data.drop(['OutcomeType'], axis=1)
        y = train_data['OutcomeType']
        clf.fit(X, y)
        print(clf.best_params_)
Example #11
def svc(n_components=10):
  """
  Train a support vector classifier after dimensionality reduction
  with PCA.

  Each fold takes ~10 min. First fold gave log loss: 0.684875244651
  """

  train = pandas.read_csv('train.csv')
  y = train['target'].values
  X = raw_scaled_features(train)

  folds = StratifiedKFold(n_splits=10).split(X, y)  # modern API; StratifiedKFold(labels, n_folds) was removed
  
  for train_indices, test_indices in folds:
    #print train_indices, test_indices
    X_train = X[train_indices]
    y_train = y[train_indices]
    X_test = X[test_indices]
    y_test = y[test_indices]

    pca = PCA(n_components=n_components)
    X_train = pca.fit_transform(X_train)
    X_test = pca.transform(X_test)
    #print X_train.shape

    svc = SVC(probability=True, verbose=False)
    svc.fit(X_train, y_train)
    y_prob = svc.predict_proba(X_test)

    print(log_loss(y_test, y_prob, labels=svc.classes_))  # classes_ must go to labels=; the third positional argument is eps
Example #12
def support_vector(XTrain, yTrain, XTest):
    svm = SVC(kernel='linear',probability = True)
    svm.fit(XTrain, yTrain)
    scores = svm.predict_proba(XTest)
    labels = svm.predict(XTest)

    return (labels, scores)
Example #13
class SVMPredictor(object):
    """
    A simple application of SVM classifier

    @author: Shaun
    """

    def __init__(self):
        self.clf = SVC(probability=True)

    @abstractmethod
    def fit(self, X, y):
        """
        Method to fit the model.

        Parameters:
        X - 2d numpy array of training data
        y - 1d numpy array of training labels
        """
        self.clf = self.clf.fit(X, y)

    @abstractmethod
    def predict(self, X):
        """
        Method to apply the model data

        Parameters:
        X - 2d numpy array of test data
        """
        return self.clf.predict_proba(X)[:, 1]
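Note that the @abstractmethod decorators above have no effect, because SVMPredictor does not use abc.ABCMeta. If an abstract base class was intended, a minimal sketch (BasePredictor is a hypothetical name):

from abc import ABC, abstractmethod

class BasePredictor(ABC):
    @abstractmethod
    def fit(self, X, y):
        """Fit the model to training data."""

    @abstractmethod
    def predict(self, X):
        """Return positive-class probabilities."""

# SVMPredictor would then subclass BasePredictor and override both methods.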
Example #14
    def test(self):
        X, y = self.dataMat, self.labelMat
        X_test = self.testData
        clf = SVC(kernel='linear', C=0.001, probability=True)
        clf.fit(X, y)
        y_pred = clf.predict(X_test[1:2, :])           # keep the slice 2-D for sklearn
        y_predprob = clf.predict_proba(X_test[1:2, :])
Example #15
def predict_svc(X_train, y_train, X_test, sample_weight):
    clf = SVC(degree=3, gamma='auto',  # gamma=0.0 is no longer accepted; 'auto' restores the old 1/n_features default
              kernel='rbf', probability=True)
    clf.fit(X_train, y_train, sample_weight=sample_weight)

    predictions = clf.predict_proba(X_test)
    return predictions
Example #16
def svc(params):
    C, gamma = params  # Python 3 removed tuple parameters in function signatures
    s = SVC(C=C, gamma=gamma, probability=True)
    start = time.time()
    s.fit(X[:border], y[:border])
    train_time = time.time() - start
    pred = s.predict_proba(X[border:])[:, 0]
    test_time = (time.time() - start) - train_time

    # This is the literal is-it-the-right-answer  binary score.
    # This measure is what we try to maximize but its relation to question
    # accuracy is complicated
    accu = np.sum((pred > 0.5) == y[border:]) / len(y[border:])  # compare against the held-out labels, not all of y

    ###  This is the actual question prediction error, in bits
    # First, find the probabilities
    pred_y = pred * y[border:] # These are the probabilities for right answers
    pred_y = pred_y[pred_y.nonzero()]   # the same, stripped of 0's
    mean_bits = np.mean(-np.log(pred_y) / np.log(2))  # measured in mean bits

    ### This is the literal accuracy - it gets complicated
    # Sort the answers by probability, descending (only getting the indices)
    confidence_order = np.argsort(pred)
    # This indexing trick always takes the last assignment for each index
    # This will hold the index of the best answer for each question
    best_answer = np.zeros(np.max(q.astype(int))+1)
    best_answer[q[confidence_order].astype(int)] = confidence_order
    # Take the average correctness of the best answer
    accu_by_q = y[border:][best_answer.astype(int)].mean()

    return [C, gamma, accu, mean_bits, accu_by_q, train_time, test_time]
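The best_answer construction above relies on NumPy's "last write wins" behavior when the same index appears repeatedly in a fancy-indexed assignment; a tiny standalone demo (hypothetical data):

import numpy as np

conf = np.array([0.2, 0.9, 0.5, 0.7])  # per-answer confidence
qid = np.array([0, 0, 1, 1])           # question id of each answer
order = np.argsort(conf)               # ascending confidence
best = np.zeros(qid.max() + 1, dtype=int)
best[qid[order]] = order               # later (more confident) writes win
# best -> [1, 3]: answer 1 for question 0, answer 3 for question 1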
Example #17
def train_validate_test(parameter_dict, X_train, X_validate, y_train, y_validate, scaler):

    classifier = SVC(C=parameter_dict["C"],
                     kernel=parameter_dict["kernel"],
                     degree=parameter_dict["degree"],
                     gamma=parameter_dict["gamma"],
                     coef0=parameter_dict["coef0"],
                     probability=parameter_dict["probability"],
                     shrinking=parameter_dict["shrinking"],
                     tol=parameter_dict["tol"],
                     cache_size=parameter_dict["cache_size"],
                     class_weight=parameter_dict["class_weight"],
                     verbose=parameter_dict["verbose"],
                     max_iter=parameter_dict["max_iter"],
                     random_state=parameter_dict["random_state"])
    
    print "training ..."
    classifier.fit(X_train, y_train)
    print "trained."
    
    print "testing ..."
    y_train_proba = classifier.predict_proba(X_train)
    y_validate_proba = classifier.predict_proba(X_validate)
    print "tested."

    acc_train = classifier.score(X_train, y_train)
    acc_validate = classifier.score(X_validate, y_validate)
    print "mean accuracy on training set:   %s" % str(acc_train)
    print "mean accuracy on validation set: %s" % str(acc_validate)

    encoder = LabelEncoder()
    
    logloss_train = logloss_mc(encoder.fit_transform(y_train), y_train_proba)
    print "logarithmic loss on training set:    %s" % str(logloss_train)

    logloss_validate = logloss_mc(encoder.fit_transform(y_validate), y_validate_proba)
    print "logarithmic loss on validateion set: %s" % str(logloss_validate)

    info_dict = parameter_dict.copy()
    info_dict["acc_train"] = acc_train
    info_dict["acc_validate"] = acc_validate
    info_dict["logloss_train"] = logloss_train
    info_dict["logloss_validate"] = logloss_validate
    
    make_submission(classifier, encoder, scaler, info_dict)

    return info_dict
Example #18
def svc(X,y,Z,test_data):
    from sklearn.svm import SVC
    svc = SVC(probability=True)
    svc.fit(X, y) 
    test_probs_svc = svc.predict_proba(Z)[:, 1]  # column 1 holds the positive-class probability
    sub = pd.DataFrame({'enrollment_id':test_data["enrollment_id"],
                        'truth':test_probs_svc}).set_index("enrollment_id")
    sub.to_csv('data\\result\\seventh_svc.csv')
Example #19
    def __metric_pipeline(self, metric, params={}, in_data=None):

        X_in, y_in = self.__process_in_data(in_data)

        metric_stage = wrap_and_make_instance(metric, **params)
        in_keys = metric_stage.input_keys
        out_keys = metric_stage.output_keys

        p = Pipeline()

        node_X_in = p.add(NumpyRead(X_in))
        node_y_in = p.add(NumpyRead(y_in))

        node_split = p.add(SplitTrainTest(2, random_state=0))
        node_X_in["output"] > node_split["input0"]
        node_y_in["output"] > node_split["input1"]

        ctrl_X_train, ctrl_X_test, ctrl_y_train, ctrl_y_test = train_test_split(X_in, y_in, random_state=0)

        node_clf = p.add(wrap_and_make_instance(SVC, random_state=0))
        node_split["train0"] > node_clf["X_train"]
        node_split["train1"] > node_clf["y_train"]
        node_split["test0"] > node_clf["X_test"]

        ctrl_clf = SVC(random_state=0, probability=True)
        ctrl_clf.fit(ctrl_X_train, ctrl_y_train)

        node_proba_1 = p.add(SplitY(1))
        node_clf["pred_proba"] > node_proba_1["input"]

        ctrl_y_score = ctrl_clf.predict_proba(ctrl_X_test)[:, 1]

        node_metric = p.add(metric_stage)

        ctrl_metric_args = {}
        if "y_true" in in_keys:
            node_split["test1"] > node_metric["y_true"]
            ctrl_metric_args["y_true"] = ctrl_y_test
        if "y_score" in in_keys:
            node_proba_1["y"] > node_metric["y_score"]
            ctrl_metric_args["y_score"] = ctrl_y_score
        if "probas_pred" in in_keys:
            node_proba_1["y"] > node_metric["probas_pred"]
            ctrl_metric_args["probas_pred"] = ctrl_y_score

        out_nodes = [p.add(CSVWrite(self._tmp_files("out_{}.csv".format(out_key)))) for out_key in out_keys]
        [node_metric[out_key] > out_nodes[i]["input"] for i, out_key in enumerate(out_keys)]

        self.run_pipeline(p)

        ctrl_returns = metric(**ctrl_metric_args)
        if len(out_keys) == 1:
            ctrl_returns = (ctrl_returns,)

        for i, out_key in enumerate(out_keys):
            control = ctrl_returns[i]
            result = self._tmp_files.csv_read("out_{}.csv".format(out_key), as_nd=True)
            self.assertTrue(result.shape == control.shape and np.allclose(result, control))
Example #20
def inter_svm(train_data, test_data, num_fold):
    tmp = inter_kernel(test_data[:, 1:], train_data[:, 1:])
    test_data = numpy.hstack([test_data[:, :1], tmp])
    tmp = inter_kernel(train_data[:, 1:], train_data[:, 1:])
    train_data = numpy.hstack([train_data[:, :1], tmp])

    best_valid_ACC = 0
    # Tune parameters
    for z in range(-8, 8):
        C = pow(2, z)
        result_train = list()
        result_valid = list()

        # Do cross-validation
        # sklearn.cross_validation was removed; use sklearn.model_selection.StratifiedKFold
        skf = StratifiedKFold(n_splits=num_fold).split(
                                   train_data[:, 1:],
                                   train_data[:, 0]
                               )
        clf = SVC(C, kernel='precomputed', probability=True)
        for train_index, valid_index in skf:
            clf.fit(train_data[train_index, :][:, train_index+1], train_data[train_index, 0])
            train_pred = clf.predict_proba(train_data[train_index, :][:, train_index+1])
            valid_pred = clf.predict_proba(train_data[valid_index, :][:, train_index+1])
            train_acc = roc_auc_score(train_data[train_index, 0], train_pred[:, 1])
            valid_acc = roc_auc_score(train_data[valid_index, 0], valid_pred[:, 1])
            #train_pred = clf.predict(train_data[train_index, :][:, train_index+1])
            #valid_pred = clf.predict(train_data[valid_index, :][:, train_index+1])

            #train_acc = accuracy_score(train_data[train_index, 0], train_pred)
            #valid_acc = accuracy_score(train_data[valid_index, 0], valid_pred)
            result_train.append(train_acc)
            result_valid.append(valid_acc)
        # If mean accuracy greater than best accuracy, then record it
        if sum(result_valid)/num_fold > best_valid_ACC:
            best_valid_ACC = sum(result_valid)/num_fold
            best_train_ACC = sum(result_train)/num_fold
            best_C = C

    # Predict test data with best C
    clf = SVC(best_C, kernel='precomputed', probability=True)
    clf.fit(train_data[:, 1:], train_data[:, 0])
    test_pred = clf.predict_proba(test_data[:, 1:])
    test_ACC = roc_auc_score(test_data[:, 0], test_pred[:, 1])
    
    return best_train_ACC, best_valid_ACC, test_ACC, best_C
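For reference, the shape contract behind kernel='precomputed' (a minimal sketch with a plain linear kernel standing in for inter_kernel):

import numpy as np
from sklearn.svm import SVC

X_tr = np.random.rand(20, 5)
y_tr = np.random.randint(0, 2, 20)
X_te = np.random.rand(8, 5)
K_tr = X_tr @ X_tr.T  # (n_train, n_train) gram matrix for fit()
K_te = X_te @ X_tr.T  # (n_test, n_train): test rows against training columns
clf = SVC(kernel='precomputed', probability=True).fit(K_tr, y_tr)
proba = clf.predict_proba(K_te)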
Example #21
def main():

    # Load the data
    print('Reading data...')
    main_data = pd.read_csv('../data/main_data.csv')
    targets = pd.read_csv('../data/target.csv')
    big_array = pd.concat([main_data, targets], axis=1)
    big_array = big_array.sample(frac=0.010)
    print(len(main_data.index))
    print(len(big_array.index))


    # Split the Data
    print('Splitting...')
    X_train, X_test, y_train, y_test = train_test_split(main_data, targets, test_size=0.25, random_state=42)

    # Train tree
    print('Training Tree...')
    tree = DecisionTreeClassifier()
    tree.fit(X_train, y_train)
    print('Predicting Tree...')
    tree_pred = tree.predict_proba(X_test)[:, 1]
    tree_fpr, tree_tpr, _ = roc_curve(y_test, tree_pred)

    # train random forest
    print('Training Random Forest...')
    rf = RandomForestClassifier()
    rf.fit(X_train, y_train)
    print('Predicting Random Forest...')
    rf_pred = rf.predict_proba(X_test)[:, 1]
    rf_fpr, rf_tpr, _ = roc_curve(y_test, rf_pred)

    # train svm
    # Had to split it to a subset, way too much data, too long to run
    svm_train, svm_test, svm_y_train, svm_y_test = train_test_split(big_array.drop('TARGET', axis=1),
                                                                    big_array['TARGET'],
                                                                    test_size=0.75)
    print('Training SVM...')
    svm_c = SVC(kernel='linear', probability=True)
    svm_c.fit(svm_train, svm_y_train)
    print('Predicting SVM...')
    svm_pred = svm_c.predict_proba(X_test)[:, 1]
    svm_fpr, svm_tpr, _ = roc_curve(y_test, svm_pred)

    # plot model comparison
    print('Creating Plot...')
    plt.figure(1)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.plot(tree_fpr, tree_tpr, label='Tree')
    plt.plot(rf_fpr, rf_tpr, label='RF')
    plt.plot(svm_fpr, svm_tpr, label='SVM')
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.title('ROC curve')
    plt.legend(loc='best')
    print('Saving Plot...')
    plt.savefig('rocCurve.png')
Example #22
def predictSVC(train, labels, test):
    print('start SVC')
    clf = SVC(probability=True)
    clf.fit(train, labels)
    svc_predictions = clf.predict(test)
    svc_probs = clf.predict_proba(test)
    svc_bestProbs = svc_probs.max(axis=1)
    print('svc done!')
    return svc_predictions, svc_bestProbs
Example #23
def predict_SVM():
    X,y,test_x,test_uid = loadData()
    model = SVC(C=2.5,gamma=0.03,kernel='rbf',probability=True,random_state=23333)
    model.fit(X,y)
    test_y = model.predict_proba(test_x)
    result = pd.DataFrame(columns=["uid","score"])
    result.uid=test_uid
    result.score = test_y[:,1]
    result.to_csv('../result/result_SVM_rank.csv',index=None,encoding='utf-8')
Example #24
def pipeLine(iteration,C,gamma,random_seed):
    X,y,test_x,test_uid = loadData()
    model = SVC(C=C,kernel='rbf',gamma=gamma,probability=True,random_state=random_seed)
    model.fit(X,y)
    pred = model.predict_proba(test_x)
    test_result = pd.DataFrame(columns=["uid","score"])
    test_result.uid=test_uid
    test_result.score = pred[:,1]
    test_result.to_csv('../result/svm_pred{0}.csv'.format(iteration),index=None,encoding='utf-8')
Example #25
def SVM(X, Y, XTest, YTest):
    print('-----------------------------------------------------')
    # grid search over these to find parameters
    CList = [.001, .003, .01, .03, .1, .3, 1, 3, 6, 10, 15, 30, 40]
    gammaList = [.001, .003, .01, .03, .1, .3, 1, 2, 3, 4, 5, 6, 7]
    param_grid = [{'C': CList,
                   'gamma': gammaList,
                   'kernel': ['rbf', 'sigmoid', 'linear']}]
    # grid search over these to find parameters
    # rbf_grid = GridSearchCV(SVC(probability=True), param_grid=param_grid)
    rbf_grid = SVC(C=500, gamma=0.1, probability=True)
    # fit the models
    rbf_grid.fit(X, Y)

    # print("The best parameters are %s with a score of %0.2f"
    #       % (rbf_grid.best_params_, rbf_grid.best_score_))

    print "Computing training statistics"
    rbf_predict_time_training = time.time()
    Ypred_rbf_training = rbf_grid.predict(X)
    rbf_predict_time_training = time.time() - rbf_predict_time_training

    rbf_accuracy_training = metrics.accuracy_score(Y, Ypred_rbf_training)
    rbf_precision_training = metrics.precision_score(Y, Ypred_rbf_training,
                                                     average='binary')
    rbf_recall_training = metrics.recall_score(Y, Ypred_rbf_training,
                                               average='binary')

    print "SVM RBF training prediction time: " + str(rbf_predict_time_training)
    print "SVM RBF training accuracy Score: " + str(rbf_accuracy_training)
    print "SVM RBF training precision Score: " + str(rbf_precision_training)
    print "SVM RBF training recall Score: " + str(rbf_recall_training)

    print "Computing testing statistics"
    rbf_predict_time_test = time.time()
    Ypred_rbf_test = rbf_grid.predict(XTest)
    rbf_predict_time_test = time.time() - rbf_predict_time_test

    rbf_accuracy_test = metrics.accuracy_score(YTest, Ypred_rbf_test)
    rbf_precision_test = metrics.precision_score(YTest, Ypred_rbf_test,
                                                 average='binary')
    rbf_recall_test = metrics.recall_score(YTest, Ypred_rbf_test,
                                           average='binary')

    print "SVM RBF test prediction time: " + str(rbf_predict_time_test)
    print "SVM RBF test accuracy Score: " + str(rbf_accuracy_test)
    print "SVM RBF test precision Score: " + str(rbf_precision_test)
    print "SVM RBF test recall Score: " + str(rbf_recall_test)

    print "Creating ROC curve"
    y_true = YTest
    y_score = rbf_grid.predict_proba(XTest)
    fprSVM, trpSVM, _ = metrics.roc_curve(y_true=y_true,
                                          y_score=y_score[:, 0],
                                          pos_label=0)
    plt.plot(fprSVM, trpSVM, 'b-', label='SVM')
Example #26
def classify_SVC(train, test, kernel = 'rbf', verbose = False):
	from sklearn.svm import SVC

	x, y = train
	clf = SVC(kernel = kernel, probability = True, verbose = verbose)  # the kernel must be chosen at construction time
	clf.fit(x, y)
	
	x, y = test
	proba = clf.predict_proba(x)  # predict_proba takes no kernel argument
	return proba
Example #27
def pipeline(iteration,C,gamma,random_seed):
    clf = SVC(C=C,kernel='rbf',gamma=gamma,probability=True,cache_size=7000,class_weight='balanced',verbose=True,random_state=random_seed)
    clf.fit(X,y)
    joblib.dump(clf, './model/svm{0}.pkl'.format(iteration))

    pred = clf.predict_proba(test_x)
    test_result = pd.DataFrame(columns=["uid","score"])
    test_result.uid = test_uid
    test_result.score = pred[:,1]
    test_result.to_csv('./preds/svm_pred{0}.csv'.format(iteration),index=None)
Example #28
def svm_train(train_file,test_file):
    _,x,y = readFile(train_file)
    id, tx = readFile(test_file)
    #feature selection
    from sklearn.feature_selection import SelectKBest,chi2
    fselect = SelectKBest(chi2, k =5000)
    x = fselect.fit_transform(x,y)
    tx = fselect.transform(tx)

    print(x.shape)
    print(tx.shape)

    hehe = np.concatenate((x,tx))
    from sklearn.preprocessing import scale
    hehe = scale(hehe,with_mean=False)
    x = hehe[0:x.shape[0]]
    tx = hehe[x.shape[0]:]
    
    from sklearn.model_selection import train_test_split  # sklearn.cross_validation was removed
    tmp_array = np.arange(x.shape[0])
    train_i,test_i = train_test_split(tmp_array, train_size = 0.8, random_state = 1024)
    train_x = x[train_i]
    train_y = y[train_i]
    test_x = x[test_i]
    test_y = y[test_i]

    from sklearn.svm import SVC
    model = SVC(probability=True)
    model.fit(x,y)
    
    res1 = model.predict_proba(train_x)
    res2 = model.predict_proba(test_x)
    from sklearn.metrics import roc_auc_score
    score1 = roc_auc_score(train_y, res1[:,1])
    score2 = roc_auc_score(test_y, res2[:,1])
    print(score1)
    print(score2)

    res = model.predict_proba(tx)
    output = pd.DataFrame( data={"id":id, "sentiment":res[:,1]} )
    output.to_csv( "/home/chuangxin/SVM_result.csv", index=False, quoting=3 )
    
    return model
Example #29
    def fit_model_22(self, lol = 2, toWrite = False):
        model = SVC(probability = True, kernel = 'sigmoid', tol = 1e-3, coef0 = lol)

        for data in self.cv_data:
            X_train, X_test, Y_train, Y_test = data
            model.fit(X_train,Y_train)
            pred = model.predict_proba(X_test)[:,1]
            print("Model 22 score: %f" % (logloss(Y_test,pred),))
        if toWrite:
            f2 = open('model22/model.pkl','wb')  # pickle requires a binary-mode file
            pickle.dump(model,f2)
Example #30
    def fit_model_20(self, lol = 0.0025, toWrite = False):
        model = SVC(probability = True, kernel = 'linear', class_weight = 'balanced', tol = 1e-3)  # 'auto' was removed from scikit-learn

        for data in self.cv_data:
            X_train, X_test, Y_train, Y_test = data
            model.fit(X_train,Y_train)
            pred = model.predict_proba(X_test)[:,1]
            print("Model 20 score: %f" % (logloss(Y_test,pred),))
        if toWrite:
            f2 = open('model20/model.pkl','wb')  # pickle requires a binary-mode file
            pickle.dump(model,f2)
Example #31
class PseudoRelevanceClassifierReranker:
    def __init__(self, lucene_index: str, vectorizer_class: str, clf_type: List[ClassifierType], r=10, n=100, alpha=0.5):
        self.r = r
        self.n = n
        self.alpha = alpha
        self.clf_type = clf_type

        # get vectorizer
        module = importlib.import_module("pyserini.vectorizer")
        VectorizerClass = getattr(module, vectorizer_class)
        self.vectorizer = VectorizerClass(lucene_index, min_df=5)

        if len(clf_type) > 2:
            raise Exception('Re-ranker takes at most two classifiers')

    def _set_classifier(self, clf_type: ClassifierType):
        if clf_type == ClassifierType.LR:
            self.clf = LogisticRegression(random_state=42)
        elif clf_type == ClassifierType.SVM:
            self.clf = SVC(kernel='linear', probability=True, random_state=42)
        else:
            raise Exception("Invalid classifier type")

    def _get_prf_vectors(self, doc_ids: List[str]):
        train_docs = doc_ids[:self.r] + doc_ids[-self.n:]
        train_labels = [1] * self.r + [0] * self.n

        train_vecs = self.vectorizer.get_vectors(train_docs)
        test_vecs = self.vectorizer.get_vectors(doc_ids)

        return train_vecs, train_labels, test_vecs

    def _rerank_with_classifier(self, doc_ids: List[str], search_scores: List[float]):
        train_vecs, train_labels, test_vecs = self._get_prf_vectors(doc_ids)

        # classification
        self.clf.fit(train_vecs, train_labels)
        pred = self.clf.predict_proba(test_vecs)
        classifier_scores = self._normalize([p[1] for p in pred])
        search_scores = self._normalize(search_scores)

        # interpolation
        interpolated_scores = [a * self.alpha + b * (1-self.alpha) for a, b in zip(classifier_scores, search_scores)]

        return self._sort_dual_list(interpolated_scores, doc_ids)

    def rerank(self, doc_ids: List[str], search_scores: List[float]):
        # one classifier
        if len(self.clf_type) == 1:
            self._set_classifier(self.clf_type[0])
            return self._rerank_with_classifier(doc_ids, search_scores)

        # two classifiers, combined with FusionMethod.AVG
        doc_score_dict = {}
        for i in range(2):
            self._set_classifier(self.clf_type[i])
            i_scores, i_doc_ids = self._rerank_with_classifier(doc_ids, search_scores)

            for score, doc_id in zip(i_scores, i_doc_ids):
                if doc_id not in doc_score_dict:
                    doc_score_dict[doc_id] = set()
                doc_score_dict[doc_id].add(score)

        r_scores, r_doc_ids = [], []
        for doc_id, score in doc_score_dict.items():
            avg = sum(score) / len(score)
            r_doc_ids.append(doc_id)
            r_scores.append(avg)

        return r_scores, r_doc_ids

    def _normalize(self, scores: List[float]):
        low = min(scores)
        high = max(scores)
        width = high - low
        if width == 0:
            return [0.0 for _ in scores]  # all scores equal; avoid dividing by zero

        return [(s-low)/width for s in scores]

    # sort both lists in decreasing order, comparing on list1
    def _sort_dual_list(self, list1, list2):
        sorted_pairs = sorted(zip(list1, list2), reverse=True)
        list1, list2 = (list(t) for t in zip(*sorted_pairs))
        return list1, list2
Example #32
class SVM(RecognitionModel):
    """This is the Sci-Kit version of SVM.

    .. seealso:: `sklearn.svm.SVC <http://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html>`_
    """

    def __init__(self, C=100, gamma=0.000001):
        """ Initialize a SVM classifier.
        The default kernel is RBF(Radial Basis Function).

        :param C: The **C** parameter trades off misclassification of training examples against simplicity
                of the decision surface
        :param gamma:  the gamma parameter defines how far the influence of a single training example reaches,
                with low values meaning ‘far’ and high values meaning ‘close’

        .. seealso:: see more about **C** and **gamma** :
                    `RBF SVM parameters <http://scikit-learn.org/stable/auto_examples/svm/plot_rbf_parameters.html>`_
        """
        super().__init__()
        self.model = SVC()
        self.parameters.append(('C', C))
        self.parameters.append(('gamma', gamma))
        self.model.C = C
        self.model.gamma = gamma
        self.model.probability = True
        self.model.cache_size = 1024
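        # note: scikit-learn reads estimator parameters at fit() time, so setting
        # C / gamma / probability as attributes here is equivalent to passing them
        # to the SVC constructor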

    @staticmethod
    def grid_search_c_gamma(samples, labels):
        """ Grid Search for best combination of **C** and **gamma**.

        :param samples: the samples
        :param labels: the corresponding labels
        :return: the best combination of **C** and **gamma**
        :rtype: Dictionary

        .. note:: This method uses SVC's default RBF kernel (no kernel argument is passed).
        """

        # GridSearch for gamma and C
        C_range = np.logspace(-2, 10, 13)
        gamma_range = np.logspace(-9, 3, 13)
        param_grid = dict(gamma=gamma_range, C=C_range)
        cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42)
        grid = GridSearchCV(SVC(), param_grid=param_grid, cv=cv)
        grid.fit(samples, labels)

        return grid.best_params_

    @staticmethod
    def reduce_class_3_4(samples, labels):

        new_samples = list()
        new_labels = list()

        for index in range(len(labels)):
            if labels[index] < 3:
                new_labels.append(labels[index])
                new_samples.append(samples[index])

            if labels[index] == 5:
                new_labels.append(3)
                new_samples.append(samples[index])

        return new_samples, new_labels

    @staticmethod
    def reduce_class_1_5(samples, labels):

        new_samples = list()
        new_labels = list()

        for index in range(len(labels)):
            if labels[index] == 0:
                new_labels.append(0)
                new_samples.append(samples[index])

            if 1 < labels[index] < 5:
                new_labels.append(labels[index] - 1)
                new_samples.append(samples[index])

        return new_samples, new_labels

    @staticmethod
    def mixed_demo(samples, labels):
        """ This is a demo trying to combine Feature Selection and Grid Search.
        But it's too slow to produce a good result.
        To be improved.

        :param samples: the samples
        :param labels: the corresponding labels
        """

        X, y = samples, labels

        # This dataset is way too high-dimensional. Better do PCA:
        pca = PCA(n_components=2)

        # Maybe some original features where good, too?
        selection = SelectKBest(k=1)

        # Build estimator from PCA and Univariate selection:

        combined_features = FeatureUnion([("pca", pca), ("univ_select", selection)])

        # Use combined features to transform dataset:
        X_features = combined_features.fit(X, y).transform(X)

        svm = SVC(kernel="linear")

        # Do grid search over k, n_components and C:

        pipeline = Pipeline([("features", combined_features), ("svm", svm)])

        param_grid = dict(features__pca__n_components=[1, 2, 3],
                          features__univ_select__k=[1, 2],
                          svm__C=[0.1, 1, 10])

        grid_search = GridSearchCV(pipeline, param_grid=param_grid, verbose=10)
        # the pipeline applies combined_features itself, so fit on the raw X
        # rather than on the already-transformed X_features
        grid_search.fit(X, y)
        logging.info(grid_search.best_estimator_)

    def train(self, samples, labels):
        """ Train the model.

        :param samples: the samples
        :type samples: [float]
        :param labels: the corresponding labels
        :type labels: [str]
        """
        self.model.fit(samples, labels)

    def predict(self, samples):
        """ Predict the samples.

        :param samples: the samples
        :return: a list of label-result
        """
        return self.model.predict(samples)

    def predict_proba(self, samples):
        # Seems unused
        """ Predict the probabilities of each given sample to each label.
        :param samples: the samples
        :return: a list of probas-result
        """
        return self.model.predict_proba(samples)

    def evaluate_model(self, samples, labels):
        """ Evaluate the model using the given samples and labels.

        :param samples: the samples
        :type samples: [float]
        :param labels: the corresponding labels
        :type labels: [str]
        :return: The Evaluation of the model
        :rtype: Evaluation
        """
        resp = self.predict(samples)
        reco = (labels == resp).mean()

        class_names = sorted(list(set(labels)))
        nb_classes = len(class_names)
        confusion = np.zeros((nb_classes, nb_classes), np.int32)

        # the vertical i is the correct answer,
        # while the horizon j is the prediction result
        for i, j in zip(labels, resp):
            confusion[class_names.index(i), class_names.index(j)] += 1
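
        # for reference, this hand-built matrix should match scikit-learn's helper:
        #   from sklearn.metrics import confusion_matrix
        #   confusion_matrix(labels, resp, labels=class_names)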

        return Evaluation(reco, confusion)

    def train_and_evaluate(self, samples_train, labels_train, samples_test, labels_test):
        """ A combination of train() and evaluate_model()

        :param samples_train: the samples to train the model
        :param labels_train:  the corresponding labels to train samples
        :param samples_test: the samples to evaluate the model
        :param labels_test: the corresponding labels to evaluate samples
        :return: the error rate
        """
        self.model.fit(samples_train, labels_train)
        return self.evaluate_model(samples_test, labels_test)

    def auto_predict(self, samples):
        """This function will use the model to predict the samples, generate probability map and the final result. """
        probs = self.model.predict_proba(samples)
        labels = list()
        labels.append(self.model.predict(samples)[0])

        return probs, labels

    def set_parameters(self, parameters):
        # expects a dict of the form {'C': float, 'gamma': float}
        self.parameters = list()
        self.model.C = parameters['C']
        self.model.gamma = parameters['gamma']
        self.parameters.append(('C', self.model.C))
        self.parameters.append(('gamma', self.model.gamma))

    def auto_config(self, samples, labels):
        best_parameters = self.grid_search_c_gamma(samples, labels)
        self.set_parameters(best_parameters)
Example #33
# reduce the dimension
# X_train_redu = tsne.fit_transform(X_train_std)

# perform classification
svm = SVC(kernel='rbf',
          random_state=0,
          gamma=GAMMA,
          C=C,
          verbose=True,
          probability=True)
svm.fit(X_train_redu, y_train)

# performance metrics
y_train_pred = svm.predict(X_train_redu)  # the training set predictions
scores = svm.predict_proba(X_train_redu)
print(np.shape(scores))
ACCURACY_SCORE = svm.score(X_train_redu, y_train, sample_weight=None)
NMI = metrics.adjusted_mutual_info_score(y_train, y_train_pred)  # adjusted (not normalized) mutual information

# print the summary
print('\n')
print('Time Points Used : ', DIV)
print('Total Datapoints used : ', np.size(X_train_redu, 0))
print('Remove Low Variance Variables : ', REMOVE_LOW_VARIANCE_PARAMS)
if (REMOVE_LOW_VARIANCE_PARAMS):
    print('Low Variance Threshold : ', LOW_VARIANCE_THRESHOLD)
print('Remove Outliers : ', REMOVE_OUTLIERS)
if (REMOVE_OUTLIERS):
    print('Absolute distance to the median : ', M)
print('Perplexity of TSNE : ', PERPLEXITY)
Example #34
    fig = plt.figure()
    sns.scatterplot(data=dfe,
                    x='r_mag',
                    y='v_mag',
                    hue='errors',
                    markers=['o', 's', 'v', 'D'])
    plt.savefig('./plots/mag_errors.png', bbox_inches="tight")

    plt.figure()
    sns.histplot(dfe, x='r_mag', y='v_mag', hue='errors')
    plt.savefig('./plots/error_hist_mag.png', bbox_inches="tight")

# %%
run_proba = True
if run_proba:
    probs = best_model.predict_proba(dfe.drop(columns=['errors']))
    probs = probs[:, 0:2]
    prob_diff = probs[:, 0] - probs[:, 1]
    dfp = pd.DataFrame()
    dfp['probability'] = prob_diff
    dfp['error_type'] = dfe.errors
    sns.catplot(x='error_type', y='probability', data=dfp)
    plt.xticks(rotation=45)
    plt.savefig('./plots/error_probs.png', bbox_inches="tight")
# %%
run_tsne = True
if run_tsne:
    cl_1, cl_2 = 0, 1
    x_11 = X_test[(y_test == cl_1) & (y_pred == cl_1)]
    x_12 = X_test[(y_test == cl_1) & (y_pred == cl_2)]
    x_21 = X_test[(y_test == cl_2) & (y_pred == cl_1)]
Example #35
def Co_KNN_SVM(train_Y, train_X, test_Y, test_X, savepath=None):
    model_max = None
    accuracy_max = 0
    # number of samples added to the other classifier's training set per iteration
    temp_num_svm = 44
    temp_num_knn = 44

    # number of co-training iterations
    loop_num = 10

    # K for the KNN classifier
    K = 4

    # fixed test samples and labels shared by KNN and SVM (never modified)
    fixed_test_X = test_X.copy()
    fixed_test_Y = test_Y.copy()

    # accuracy history for KNN
    accuracy_knn_list = []
    # accuracy history for SVM
    accuracy_svm_list = []

    # list of (label, features) tuples forming the KNN training set
    train_knn_Y_X_tuple_list = utilities.get_Y_X_tuple_list(
        train_Y.copy(), train_X.copy())
    # list of (label, features) tuples forming the KNN test set
    test_knn_Y_X_tuple_list = utilities.get_Y_X_tuple_list(
        test_Y.copy(), test_X.copy())
    # list of (label, features) tuples forming the SVM training set
    train_svm_Y_X_tuple_list = utilities.get_Y_X_tuple_list(
        train_Y.copy(), train_X.copy())
    # list of (label, features) tuples forming the SVM test set
    test_svm_Y_X_tuple_list = utilities.get_Y_X_tuple_list(
        test_Y.copy(), test_X.copy())
    # co-training loop
    for h in range(1, loop_num + 1):
        print(len(train_knn_Y_X_tuple_list))
        print(len(test_knn_Y_X_tuple_list))
        print(len(train_svm_Y_X_tuple_list))
        print(len(test_svm_Y_X_tuple_list))
        # SVM training labels and features
        train_Y_svm_from_tuple, train_X_svm_from_tuple = utilities.get_Y_and_X_list_from_tuple(
            train_svm_Y_X_tuple_list.copy())
        # SVM test labels and features
        test_Y_svm_from_tuple, test_X_svm_from_tuple = utilities.get_Y_and_X_list_from_tuple(
            test_svm_Y_X_tuple_list.copy())
        # KNN training labels and features
        train_Y_knn_from_tuple, train_X_knn_from_tuple = utilities.get_Y_and_X_list_from_tuple(
            train_knn_Y_X_tuple_list)
        # KNN test labels and features
        test_Y_knn_from_tuple, test_X_knn_from_tuple = utilities.get_Y_and_X_list_from_tuple(
            test_knn_Y_X_tuple_list)

        # KNN accuracy
        knn = KNeighborsClassifier(n_neighbors=K, weights='distance')
        # train
        knn.fit(train_X_knn_from_tuple, train_Y_knn_from_tuple)
        # score on the fixed test set
        accuracy_knn = knn.score(fixed_test_X, fixed_test_Y)
        accuracy_knn_list.append(accuracy_knn * 100)

        print("Prediction result (KNN)")
        print(h)
        print(accuracy_knn)

        # SVM accuracy
        svc = SVC(C=15, kernel='rbf', degree=3, gamma=2, probability=True)
        # train
        svc.fit(train_X_svm_from_tuple, train_Y_svm_from_tuple)
        # score on the fixed test set
        accuracy_svm = svc.score(fixed_test_X, fixed_test_Y)
        accuracy_svm_list.append(accuracy_svm * 100)

        print("Prediction result (SVM)")
        print(h)
        print(accuracy_svm)
        if accuracy_svm > accuracy_max:
            accuracy_max = accuracy_svm
            model_max = svc
        if h == loop_num:
            break

        # semi-supervised co-training step for KNN and SVM
        # --------------------------- KNN: predict the test samples and compute confidences ---------------------------
        # predict with the current model
        # class-membership probabilities
        probility_knn = knn.predict_proba(test_X_knn_from_tuple)
        # KNN confidence list
        confidence_knn_list = []
        for i in range(0, probility_knn.shape[0]):
            probility_knn_temp = probility_knn[i]
            confidence_knn_list.append(
                utilities.get_confidence_knn(probility_knn_temp.copy()))

        # predicted labels
        predict_Y_knn = knn.predict(test_X_knn_from_tuple)

        # --------------------------- SVM: predict the test samples and compute confidences ---------------------------
        # predict with the current model
        # class-membership probabilities
        probility_svm = svc.predict_proba(test_X_svm_from_tuple)

        # SVM confidence list
        confidence_svm_list = []
        for i in range(0, probility_svm.shape[0]):
            probility_svm_temp = probility_svm[i]
            confidence_svm_list.append(
                utilities.get_confidence_svm(probility_svm_temp.copy()))

        # predicted labels
        predict_Y_svm = svc.predict(test_X_svm_from_tuple)

        # pseudo-label exchange between KNN and SVM
        # --------------------------------------- KNN ---------------------------------------------
        index_svm_label_high_confidence = utilities.get_confidence_svm_index(
            confidence_svm_list.copy(), predict_Y_svm.copy(),
            predict_Y_knn.copy(), temp_num_svm)

        temp_test_X_svm = []
        temp_test_Y_svm = []

        for i in index_svm_label_high_confidence:
            temp_test_X_svm.append(test_X_svm_from_tuple[i])
            temp_test_Y_svm.append(predict_Y_svm[i])

        temp_test_svm_Y_X_tuple_list = utilities.get_Y_X_tuple_list(
            temp_test_Y_svm.copy(), temp_test_X_svm.copy())
        # add SVM's high-confidence samples to KNN's training set
        train_knn_Y_X_tuple_list.extend(temp_test_svm_Y_X_tuple_list)

        # build the remaining test set
        index_all_test_svm_Y_X_tuple_list = np.arange(
            0, len(test_svm_Y_X_tuple_list))
        diff_index_test_svm_Y_X_tuple_list = np.setdiff1d(
            index_all_test_svm_Y_X_tuple_list,
            np.array(index_svm_label_high_confidence))
        diff_test_svm_Y_X_tuple_list = []
        for i in diff_index_test_svm_Y_X_tuple_list:
            diff_test_svm_Y_X_tuple_list.append(test_svm_Y_X_tuple_list[i])
        test_svm_Y_X_tuple_list = diff_test_svm_Y_X_tuple_list

        # ---------------------------------------SVM---------------------------------------------
        index_knn_label_high_confidence = utilities.get_confidence_knn_index(
            confidence_knn_list.copy(), predict_Y_svm.copy(),
            predict_Y_knn.copy(), temp_num_knn)

        temp_test_X_knn = []
        temp_test_Y_knn = []

        for i in index_knn_label_high_confidence:
            temp_test_X_knn.append(test_X_knn_from_tuple[i])
            temp_test_Y_knn.append(predict_Y_knn[i])

        temp_test_knn_Y_X_tuple_list = utilities.get_Y_X_tuple_list(
            temp_test_Y_knn.copy(), temp_test_X_knn.copy())
        # add KNN's high-confidence samples to SVM's training set
        train_svm_Y_X_tuple_list.extend(temp_test_knn_Y_X_tuple_list)
        # build the remaining test set
        index_all_test_knn_Y_X_tuple_list = np.arange(
            0, len(test_knn_Y_X_tuple_list))
        diff_index_test_knn_Y_X_tuple_list = np.setdiff1d(
            index_all_test_knn_Y_X_tuple_list,
            np.array(index_knn_label_high_confidence))
        diff_test_knn_Y_X_tuple_list = []
        for i in diff_index_test_knn_Y_X_tuple_list:
            diff_test_knn_Y_X_tuple_list.append(test_knn_Y_X_tuple_list[i])
        test_knn_Y_X_tuple_list = diff_test_knn_Y_X_tuple_list
    if model_max is not None:
        print(accuracy_max * 100)
        joblib.dump(model_max, savepath)

    print("KNN的准确率:")
    print(accuracy_knn_list)
    print("SVM的准确率:")
    print(accuracy_svm_list)
Example #36
w2vec_model = Word2Vec.load('data/raw/amazon/Electronics.bin')
alx = pd.read_csv('data/processed/lexicon_table_asp_raw_09.csv',
                  index_col=['WORD', 'ASP'])

_w = w2vec_model['killer'].reshape(1, -1)
_a = w2vec_model['performance'].reshape(1, -1)
if variant == 'avg':
    _x = (_w + _a) / 2
elif variant == 'add':
    _x = _w + _a
else:
    raise NotImplementedError

svc.predict(_x)

svc.predict_proba(_x)

# SCORE
# -----
conf_threshold = 0.7

w2vec_model = Word2Vec.load('data/raw/amazon/Electronics.bin')
lx_df = pd.read_csv('data/processed/lexicon_table_asp_raw_09.csv',
                    index_col=['WORD', 'ASP'])
lx_words = lx_df.index.tolist()  # 119673 (13297x9)
vocabs = set(w2vec_model.wv.index2entity)  # 43750
score_words = [(w, a) for w, a in lx_words if w in vocabs]  # 51183 (5687x9)
prediction = []

for w, a in score_words:
    _w = w2vec_model[w].reshape(1, -1)
Example #37
                                      activation='relu')
        net = tflearn.fully_connected(net, 2, activation='softmax')
        net = tflearn.regression(net)
        # Training
        clf = tflearn.DNN(net, tensorboard_verbose=0)

    clf.fit(X_train, labels_train)

    fprs = []
    fnrs = []
    thresholds = []
    if THRESHOLD:
        if classifier == 'mlp':
            pred = clf.predict(X_test)
        else:
            pred = clf.predict_proba(X_test)
        for t in np.arange(0, 1, 0.0005):
            pred_t = prob_to_class_threshold(pred, t)
            print("Threshold:", t)
            # print "Accuracy:", acc
            print("Accuracy:", accuracy_score(labels_test, pred_t))
            print("F1Score:", f1_score(labels_test, pred_t))
            print("Recall:", recall_score(labels_test, pred_t))
            print("Precision:", precision_score(labels_test, pred_t))
            cm = confusion_matrix(labels_test, pred_t)
            print(cm)
            tn = cm[0][0]
            fn = cm[1][0]
            tp = cm[1][1]
            fp = cm[0][1]
            f1 = f1_score(labels_test, pred_t)
Example #38
def classifier(args,args_mode,dataset,sess):
    # Check that there are at least one training image per class
    for cls in dataset:
        #print(cls.name,'@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@')
        if(len(cls.image_paths)<1):
            print(cls.image_paths,"@@@@@@@@@@@@@@@@@@@@@@@")
        assert len(cls.image_paths) > 0, 'There must be at least one image for each class in the dataset'  # asserting a tuple was always truthy

    paths, labels,class_labels = get_image_paths_and_labels(dataset)

    print('Number of classes: %d' % len(dataset))
    print('Number of images: %d' % len(paths))



    # Get input and output tensors
    images_placeholder = tf.get_default_graph().get_tensor_by_name("input:0")
    embeddings = tf.get_default_graph().get_tensor_by_name("embeddings:0")
    phase_train_placeholder = tf.get_default_graph().get_tensor_by_name("phase_train:0")
    embedding_size = embeddings.get_shape()[1]

    # Run forward pass to calculate embeddings
    print('Calculating features for images')
    nrof_images = len(paths)
    nrof_batches_per_epoch = int(math.ceil(1.0 * nrof_images / args.batch_size))
    emb_array = np.zeros((nrof_images, embedding_size))
    for i in range(nrof_batches_per_epoch):
        start_index = i * args.batch_size
        end_index = min((i + 1) * args.batch_size, nrof_images)
        paths_batch = paths[start_index:end_index]
        images = facenet.load_data(paths_batch, False, False, args.image_size)
        feed_dict = {images_placeholder: images, phase_train_placeholder: False}
        emb_array[start_index:end_index, :] = sess.run(embeddings, feed_dict=feed_dict)

    classifier_filename_exp = os.path.expanduser(args.classifier_filename)

    if (args_mode == 'TRAIN'):
        # Train classifier
        print('Training classifier+++++++++++++++++++++++++',args.classifier)
        if args.classifier == 'LinearSvm':
            # clf = SVC(C=1, kernel='linear', probability=True)
            model = SVC(kernel='linear', probability=True)
        elif args.classifier == 'GridSearchSvm':
            print("""
                            Warning: In our experiences, using a grid search over SVM hyper-parameters only
                            gives marginally better performance than a linear SVM with C=1 and
                            is not worth the extra computations of performing a grid search.
                            """)
            param_grid = [
                {'C': [1, 10, 100, 1000],
                 'kernel': ['linear']},
                {'C': [1, 10, 100, 1000],
                 'gamma': [0.001, 0.0001],
                 'kernel': ['rbf']}
            ]
            model = GridSearchCV(SVC(C=1, probability=True), param_grid, cv=5)
        elif args.classifier == 'GMM':  # Doesn't work best
            model = GMM(n_components=nClasses)

        # ref:
        # http://scikit-learn.org/stable/auto_examples/classification/plot_classifier_comparison.html#example-classification-plot-classifier-comparison-py
        elif args.classifier == 'RadialSvm':  # Radial Basis Function kernel
            # works better with C = 1 and gamma = 2
            model = SVC(C=1, kernel='rbf', probability=True, gamma=2)
        elif args.classifier == 'DecisionTree':  # Doesn't perform well
            model = DecisionTreeClassifier(max_depth=20)
        elif args.classifier == 'GaussianNB':
            model = GaussianNB()

        # ref: https://jessesw.com/Deep-Learning/
        elif args.classifier == 'DBN':
            from nolearn.dbn import DBN
            model = DBN([embeddings.shape[1], 500, labelsNum[-1:][0] + 1],  # i/p nodes, hidden nodes, o/p nodes
                        learn_rates=0.3,
                        # Smaller steps mean a possibly more accurate result, but the
                        # training will take longer
                        learn_rate_decays=0.9,
                        # a factor the initial learning rate will be multiplied by
                        # after each iteration of the training
                        epochs=300,  # number of training iterations
                        # dropouts = 0.25, # Express the percentage of nodes that
                        # will be randomly dropped as a decimal.
                        verbose=1)
        elif args.classifier == 'KNN':
            model = KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                                         metric_params=None, n_jobs=1, n_neighbors=5, p=2,
                                         weights='uniform')

        model.fit(emb_array, labels)

        # Create a list of class names
        class_names = [cls.name.replace('_', ' ') for cls in dataset]

        # Saving classifier model
        with open(classifier_filename_exp, 'wb') as outfile:
            pickle.dump((model, class_names), outfile)
        print('Saved classifier model to file "%s"' % classifier_filename_exp)

    elif (args_mode == 'CLASSIFY'):
        # Classify images
        print('Testing classifier')
        with open(classifier_filename_exp, 'rb') as infile:
            (model, class_names) = pickle.load(infile)
        predictions = np.zeros((nrof_images, len(class_names)))
        print('Loaded classifier model from file "%s"' % classifier_filename_exp)
        correctPrediction = 0
        inCorrectPrediction = 0
        sumConfidence = 0.0
        correctConfidence = 0.0
        inCorrectConfidence = 0.0
        '''
         batch_size =args.batch_size
        #batch_size = 1
        for i in range(nrof_batches_per_epoch):
            start_index = i * batch_size
            end_index = min((i + 1) * batch_size, nrof_images)
            starttime = time.time()
            mini_emb_array = emb_array[start_index:end_index, :]
            predictions[start_index:end_index, :] = model.predict_proba(mini_emb_array)
            print("start_index:{} end_index:{} time:{}".format(start_index, end_index, time.time() - starttime))
      
        '''
        predictions = model.predict_proba(emb_array)
        best_class_indices = np.argmax(predictions, axis=1)
        best_class_probabilities = predictions[np.arange(len(best_class_indices)), best_class_indices]
        results = {'name': [], 'bestname': [], 'probabilities': []}
        for i in range(len(best_class_indices)):
            #print(len(class_names))
            #print(i,len(labels),labels[i])
            #print(i,len(best_class_indices),best_class_indices[i])
            print('%4d  %s:%s: %.3f' % (
            i, class_labels[i], class_names[best_class_indices[i]], best_class_probabilities[i]))
            results['name'].append(class_labels[i])
            results['bestname'].append(class_names[best_class_indices[i]])
            results['probabilities'].append(best_class_probabilities[i])
            sumConfidence += best_class_probabilities[i]
            if (class_labels[i] == class_names[best_class_indices[i]]):
                correctPrediction += 1
                correctConfidence += best_class_probabilities[i]
            else:
                inCorrectPrediction += 1
                inCorrectConfidence += best_class_probabilities[i]

        #accuracy = np.mean(np.equal(best_class_indices, labels))
        total = correctPrediction + inCorrectPrediction
        accuracy = float(correctPrediction) / total
        Avg_Confidence = float(sumConfidence) / total
        # Guard the per-bucket averages against division by zero
        Avg_correctConfidence = float(correctConfidence) / correctPrediction if correctPrediction else 0.0
        Avg_inCorrectConfidence = float(inCorrectConfidence) / inCorrectPrediction if inCorrectPrediction else 0.0
        results['name'].append('Accuracy:')
        results['bestname'].append('Accuracy:')
        results['probabilities'].append(accuracy)
        dataname = datetime.strftime(datetime.now(), '%Y%m%d-%H%M%S')
        data_frame = pd.DataFrame(
            data={'name': results['name'], 'bestname': results['bestname'], 'probabilities': results['probabilities']})
        data_frame.to_csv(args.data_dir + '/results_' + dataname + '.csv')

        print("Correct Prediction :" + str(correctPrediction))
        print("In-correct Prediction: " + str(inCorrectPrediction))
        print('Accuracy: %.3f' % accuracy)
        print("Avg Confidence: " + str(Avg_Confidence))
        print("Avg CorrectConfidence: " + str(Avg_correctConfidence))
        print("Avg inCorrectConfidence: " + str(Avg_inCorrectConfidence))
Example #39
clf_svm = SVC(probability=True)
clf_rf = RandomForestClassifier(n_estimators=100)

y_test_list = []
y_pred_list_dtcr = []
y_pred_list_svm = []
y_pred_list_rf = []

for i in range(100):
    DTCRS.Get_Train_Valid_Test()
    DTCRS.Train(use_only_seq=True, num_fc_layers=0, units_fc=256)
    y_pred_list_dtcr.append(DTCRS.y_pred)

    #Kmer
    clf_svm.fit(kmer_features[DTCRS.train[6]], np.argmax(DTCRS.train[-1], -1))
    svm_pred = clf_svm.predict_proba(kmer_features[DTCRS.test[6]])
    y_pred_list_svm.append(svm_pred)

    #RF
    clf_rf.fit(kmer_features[DTCRS.train[6]], np.argmax(DTCRS.train[-1], -1))
    rf_pred = clf_rf.predict_proba(kmer_features[DTCRS.test[6]])
    y_pred_list_rf.append(rf_pred)

    y_test_list.append(DTCRS.test[-1])

auc = []
method = []
antigen = []
for y_test, y_pred_svm, y_pred_dtcr, y_pred_rf in zip(y_test_list,
                                                      y_pred_list_svm,
                                                      y_pred_list_dtcr,
Example #40
#%%
#%%
clf_svc = SVC(probability=True)

#%%
clf_lgbm = LGBMClassifier()

#%%
clf_svc.fit(prepedX, y)

#%%
clf_lgbm.fit(prepedX, y)

#%%
y_svc = clf_svc.predict_proba(prepedX)

#%%
y_lgbm = clf_lgbm.predict_proba(prepedX)

#%%
eclf = VotingClassifier(estimators=[
    ('svc', clf_svc),
    ('lgbm', clf_lgbm),
],
                        voting='soft')

#%%
eclf.fit(prepedX, y)

#%%
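For reference, with voting='soft' the ensemble averages the estimators' predicted probabilities and takes the argmax; a minimal sketch of that combination rule, assuming equal weights (note that VotingClassifier fits its own clones of the estimators inside eclf.fit):

import numpy as np

avg_proba = (clf_svc.predict_proba(prepedX) + clf_lgbm.predict_proba(prepedX)) / 2
pred = clf_svc.classes_[np.argmax(avg_proba, axis=1)]  # map column index back to class label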
Example #41
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, f1_score, roc_curve, roc_auc_score
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
import matplotlib.pyplot as plt

data = load_iris()

X_train, X_test, Y_train, Y_test = train_test_split(data.data,
                                                    data.target,
                                                    test_size=0.3)
svm_model = SVC(kernel='rbf', random_state=0, probability=True)
svm_model.fit(X_train, Y_train)
y_pre = svm_model.predict(X_test)
y_pre_ = svm_model.predict_proba(X_test)

FPR, RECALL, thresholds = roc_curve(y_true=Y_test,
                                    y_score=y_pre_[:, 1],
                                    pos_label=1)
# y_score=y_pre_[:, 1]; alternatively, svm_model.decision_function(X_test) can be used as the score, classifying by distance to the decision boundary

auc = roc_auc_score(Y_test,
                    y_score=y_pre_,
                    multi_class='ovo',
                    labels=[0, 1, 2],
                    max_fpr=1.0)  # multi-class problem, so the multi_class parameter is required
print(auc)

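The snippet above stops before drawing the curve; a minimal sketch of plotting the ROC from the FPR/RECALL arrays computed by roc_curve:

import matplotlib.pyplot as plt

plt.plot(FPR, RECALL, label='class 1 vs rest')
plt.plot([0, 1], [0, 1], 'k--')  # chance line
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate (Recall)')
plt.legend(loc='lower right')
plt.show()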
Example #42
def classificationResults(feature, results, featureDescription):

    #  The two lines below convert the lists passed into the function to arrays.
    X = np.array(feature)
    y = np.array(results)

    #  Splits the data into training and testing sets with a stratified 5-fold split.
    #  Note: the loop below overwrites the split on each iteration, so only the
    #  final fold's train/test partition is actually used.
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=12345)
    X_train = []
    X_test = []
    y_train = []
    y_test = []
    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

    # Fits the data to a model. The model defaults to an SVC; the 'if' statements
    # below replace it when 'classifier' (defined elsewhere in the module) names
    # a different algorithm.
    model = SVC(gamma='scale', kernel='linear',
                probability=True).fit(X_train, y_train)
    if classifier == "Logistic Regression":
        model = LogisticRegression(random_state=0,
                                   solver='lbfgs',
                                   max_iter=1000).fit(X_train, y_train)
    if classifier == "Multinomial Bayes":
        model = MultinomialNB().fit(X_train, y_train)
    if classifier == "Random Forest":
        model = RandomForestClassifier().fit(X_train, y_train)

    # Generates a prediction for each sentence, and stores them in a list called 'predictions'.
    predictions = model.predict(np.array(X_test))

    # Calculates true positives, true negatives, false positives and false negatives:
    truePositives = 0
    trueNegatives = 0
    falsePositives = 0
    falseNegatives = 0
    numberInList = 0
    for prediction in predictions:
        # Is this a formal sentence which was predicted to be formal?
        if y_test[numberInList] and prediction:
            truePositives = truePositives + 1

        # Is this an informal sentence which was predicted to be informal?
        if not y_test[numberInList] and not prediction:
            trueNegatives = trueNegatives + 1

        # Is this an informal sentence which was predicted to be formal?
        if not y_test[numberInList] and prediction:
            falsePositives = falsePositives + 1

        # Is this a formal sentence which was predicted to be informal?
        if y_test[numberInList] and not prediction:
            falseNegatives = falseNegatives + 1
        numberInList = numberInList + 1

    # Performance metrics
    if (truePositives + trueNegatives + falsePositives + falseNegatives) > 0:
        accuracy = (truePositives + trueNegatives) / (
            truePositives + trueNegatives + falsePositives + falseNegatives)
    else:
        accuracy = 0
    if (truePositives + falsePositives) > 0:
        precision = truePositives / (truePositives + falsePositives)
    else:
        precision = 0
    if (truePositives + falseNegatives) > 0:
        recall = truePositives / (truePositives + falseNegatives)
    else:
        recall = 0
    if (trueNegatives + falsePositives) > 0:
        fallout = falsePositives / (
            trueNegatives + falsePositives
        )  # 'Fallout' is the same as the false positive rate.
    else:
        fallout = 0
    balAccuracy = balanced_accuracy_score(y_test, predictions)

    # Area under roc curve calculations
    y_scores = model.predict_proba(X_test)
    y_scores = y_scores[:, 1]
    rocAreaUnderCurve = roc_auc_score(y_test, y_scores)

    # Console output
    print("\nRESULTS SUMMARY\n" + "---------------\n")
    print("Feature(s) tested: ", featureDescription)
    print("Classifier: " + classifier, "\n")
    print("Total predictions: ", numberInList)
    print("TRUE POSITIVES: ", truePositives)
    print("FALSE POSITIVES: ", falsePositives)
    print("TRUE NEGATIVES: ", trueNegatives)
    print("FALSE NEGATIVES: ", falseNegatives)

    # If a metric could not be computed (a zero denominator yields 0 above), 'N/A' is printed instead; note this also masks genuine zero scores.
    if accuracy > 0:
        print("Accuracy: %3.2f" % accuracy)
    else:
        print("Accuracy: N/A")
    if precision > 0:
        print("Precision: %3.2f" % precision)
    else:
        print("Precision: N/A")
    if recall > 0:
        print("Recall: %3.2f" % recall)
    else:
        print("Recall: N/A")
    if fallout > 0:
        print("False positive rate: %3.2f" % fallout)
    else:
        print("False positive rate: N/A")
    print("AUC: %3.2f" % rocAreaUnderCurve)
    print("Balanced accuracy: %3.2f" % balAccuracy)
Example #43
def active_most_proba_svm(difficulty='EASY', num_init_label=500):
    random.seed(0)
    num_init_label_copy = num_init_label
    current_model = None

    # This function repeatedly queries the unlabeled point that the current
    # model assigns the highest positive-class probability.
    # Input:  difficulty - the difficulty as a string, 'EASY' or 'MODERATE'

    # Additionally, a random learner performing the same task is run so the
    # performance of the two query strategies can be compared.

    # Load the data.
    #   X_train is a num_samples by num_features matrix of feature values.
    #   y_train is a num_samples by 1 vector of labels (either 0 or 1).
    X_train, y_train = read_train_test('{}_TRAIN.csv'.format(
        difficulty.upper()))
    X_test, y_test = read_train_test('{}_TEST.csv'.format(difficulty.upper()))

    num_samples = X_train.shape[0]
    num_test = X_test.shape[0]
    num_features = X_train.shape[1]
    assert y_train.shape == (num_samples, 1)
    assert X_test.shape == (num_test, num_features)
    assert y_test.shape == (num_test, 1)

    selected_label = np.full((num_samples, 1), -1, dtype=int)
    selected_mask = np.full((num_samples, 1), 0, dtype=int)

    # fill a base number of samples to selected
    for _ in range(num_init_label):
        x = select_random_unlabeled_point(selected_mask)
        selected_mask[x, 0] = 1
        selected_label[x, 0] = y_train[x, 0]

    # continue to fill until has at least a 1 and a 0
    while not (np.any(selected_label == 0) and np.any(selected_label == 1)):
        x = select_random_unlabeled_point(selected_mask)
        selected_mask[x, 0] = 1
        selected_label[x, 0] = y_train[x, 0]

    selector = SelectKBest(chi2, k=25)
    selector.fit(select(X_train, selected_mask),
                 select(selected_label, selected_mask))

    current_model = None
    r_label = np.full((num_samples, 1), -1, dtype=int)
    r_mask = np.full((num_samples, 1), 0, dtype=int)

    for _ in range(np.sum(selected_mask)):
        x = select_random_unlabeled_point(r_mask)
        r_mask[x, 0] = 1
        r_label[x, 0] = y_train[x, 0]

    r_selector = SelectKBest(chi2, k=25)
    r_selector.fit(select(X_train, r_mask), select(r_label, r_mask))

    hB = DefaultModel()
    B_predictions = hB.predict(X_test)

    # metrics needs to be recorded
    svm_errors = []
    random_errors = []
    blank_errors = []

    svm_f1s = []
    random_f1s = []
    blank_f1s = []
    t = np.sum(selected_mask)
    while np.sum(selected_mask) < 2500:
        t = np.sum(selected_mask)

        model = SVC(class_weight='balanced', probability=True)
        labels_ = select(selected_label, selected_mask)
        model.fit(selector.transform(select(X_train, selected_mask)),
                  np.reshape(labels_, labels_.size))
        current_model = model

        predictions_with_proba = model.predict_proba(
            selector.transform(X_train))
        assert predictions_with_proba.shape == (num_samples, 2)

        classes = model.classes_
        assert classes.shape == (2, )
        pos_class_idx = np.where(classes == 1)[0][0]
        assert pos_class_idx == 0 or pos_class_idx == 1

        max_proba = 0
        max_idx = 0
        for i in range(num_samples):
            if selected_mask[i, 0] == 0:  # only consider unlabeled points
                if predictions_with_proba[i, pos_class_idx] > max_proba:
                    max_proba = predictions_with_proba[i, pos_class_idx]
                    max_idx = i

        selected_mask[max_idx, 0] = 1
        selected_label[max_idx, 0] = y_train[max_idx, 0]

        predictions = model.predict(selector.transform(X_test))
        if len(predictions.shape) == 1:
            predictions = np.reshape(predictions, (predictions.size, 1))
        assert predictions.shape == (num_test, 1)

        svm_error = np.sum(np.absolute(np.subtract(predictions,
                                                   y_test))) / y_test.size
        print('SVM error after {} queries is {}'.format(t, svm_error))
        svm_errors.append(svm_error)
        svm_f1_score = f1_score(y_test, predictions)
        print('SVM F1 after {} queries is {}'.format(t, svm_f1_score))
        svm_f1s.append(svm_f1_score)

        # Random selection Model
        xr = select_random_unlabeled_point(r_mask)
        r_mask[xr, 0] = 1
        r_label[xr, 0] = y_train[xr, 0]
        r = np.sum(r_mask)
        t = np.sum(selected_mask)
        if r != t:
            print("r = {}, t = {}".format(r, t))

        train_r = select(X_train, r_mask)
        train_r_label = select(y_train, r_mask)
        assert train_r.shape == (r, num_features)
        assert train_r_label.shape == (r, 1)

        model_r = SVC(class_weight='balanced')
        labels_ = select(r_label, r_mask)
        model_r.fit(r_selector.transform(select(X_train, r_mask)),
                    np.reshape(labels_, labels_.size))
        assert model_r.classes_.size == 2
        predictions = model_r.predict(r_selector.transform(X_test))  # use the random model's own selector
        if len(predictions.shape) == 1:
            predictions = np.reshape(predictions, (predictions.size, 1))
        assert predictions.shape == (num_test, 1)
        random_error = np.sum(np.absolute(np.subtract(predictions,
                                                      y_test))) / y_test.size
        print('Random error after {} queries is {}'.format(r, random_error))
        random_errors.append(random_error)
        random_f1_score = f1_score(y_test, predictions)
        print('Random F1 after {} queries is {}'.format(t, random_f1_score))
        random_f1s.append(random_f1_score)

        # Blank Model (prediction all negative from the start)
        blank_error = np.sum(np.absolute(np.subtract(B_predictions,
                                                     y_test))) / y_test.size
        print('Blank learner error queries is {}'.format(blank_error))
        blank_errors.append(blank_error)
        blank_f1_score = f1_score(y_test, B_predictions)
        print('Blank F1 after {} queries is {}'.format(t, blank_f1_score))
        blank_f1s.append(blank_f1_score)

    # Final writings
    predictions = current_model.predict(selector.transform(X_test))
    if len(predictions.shape) == 1:
        predictions = np.reshape(predictions, (predictions.size, 1))
    final_error = np.sum(np.absolute(np.subtract(predictions,
                                                 y_test))) / y_test.size
    print('final SVM error is {}'.format(final_error))
    final_f1_score = f1_score(y_test, predictions)
    print('final SVM F1 is {}'.format(final_f1_score))
    print('final number of queries is {}'.format(t))

    feature_matrix, id_vector = read_blind('{}_BLINDED.csv'.format(
        difficulty.upper()))
    blinded_predictions = current_model.predict(
        selector.transform(feature_matrix))
    blinded_predictions = np.reshape(blinded_predictions,
                                     blinded_predictions.size)
    write_prediction(
        'FS_AMP_{}_BLINDED_PREDICTION_{}.csv'.format(difficulty.upper(),
                                                     num_init_label_copy),
        id_vector, blinded_predictions)

    with open('output/FS_AMP_{}_metrics_{}.txt'.format(difficulty.upper(),
                                                       num_init_label_copy),
              mode='w') as f:
        f.write('SVM errors\n')
        f.write(svm_errors.__str__())
        f.write('\n')
        f.write('Random errors\n')
        f.write(random_errors.__str__())
        f.write('\n')
        f.write('Blank errors\n')
        f.write(blank_errors.__str__())
        f.write('\n')

        f.write('SVM F1 scores\n')
        f.write(svm_f1s.__str__())
        f.write('\n')
        f.write('Random F1 scores\n')
        f.write(random_f1s.__str__())
        f.write('\n')
        f.write('Blank F1 scores\n')
        f.write(blank_f1s.__str__())
        f.write('\n')

        f.flush()
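The core query strategy of the loop above, isolated for clarity; a minimal sketch (the function name is illustrative) of "query the unlabeled point with the highest positive-class probability":

import numpy as np

def most_probable_positive(proba, mask, pos_idx):
    """proba: (n, 2) predict_proba output; mask: (n, 1), 1 where already labeled."""
    scores = proba[:, pos_idx].copy()
    scores[mask.ravel() == 1] = -np.inf  # never re-query labeled points
    return int(np.argmax(scores))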
Example #44
if __name__ == "__main__":

    print "training..."

    C = 0.19
    gamma = 0.0028
    shrinking = True
    #auto_class_weights = False

    probability = True
    verbose = True

    svc = SVC(C=C,
              gamma=gamma,
              shrinking=shrinking,
              probability=probability,
              verbose=verbose)
    svc.fit(x_train, y_train)
    p = svc.predict_proba(x_test)

    p = p[:, 1]

    # make sure both y and p are of shape (n,1) and not (n,)
    ids_and_p = np.hstack((ids.reshape((-1, 1)), p.reshape((-1, 1))))
    np.savetxt(output_file,
               ids_and_p,
               fmt=['%d', '%.10f'],
               delimiter=',',
               header='UserID,Probability1',
               comments='')
Example #45
    def onnx_test_svm_single_classreg(self,
                                      dtype,
                                      n_targets=1,
                                      debug=False,
                                      add_noise=False,
                                      runtime='python',
                                      target_opset=None,
                                      kind='reg',
                                      level=1,
                                      **kwargs):
        iris = load_iris()
        X, y = iris.data, iris.target
        if add_noise:
            X += numpy.random.randn(X.shape[0], X.shape[1]) * 10
        if kind == 'reg':
            y = y.astype(dtype)
        elif kind == 'bin':
            y = (y % 2).astype(numpy.int64)
        elif kind == 'mcl':
            y = y.astype(numpy.int64)
        else:
            raise AssertionError("unknown '{}'".format(kind))

        if n_targets != 1:
            yn = numpy.empty((y.shape[0], n_targets), dtype=dtype)
            for i in range(n_targets):
                yn[:, i] = y + i
            y = yn
        X_train, X_test, y_train, _ = train_test_split(X, y, random_state=11)
        X_test = X_test.astype(dtype)
        if kind in ('bin', 'mcl'):
            clr = SVC(**kwargs)
        elif kind == 'reg':
            clr = SVR(**kwargs)
        clr.fit(X_train, y_train)

        model_def = to_onnx(clr,
                            X_train.astype(dtype),
                            rewrite_ops=True,
                            target_opset=target_opset)
        if 'onnxruntime' in runtime:
            model_def.ir_version = get_ir_version_from_onnx()
        try:
            oinf = OnnxInference(model_def, runtime=runtime)
        except RuntimeError as e:
            if debug:
                raise RuntimeError(
                    "Unable to create a model\n{}".format(model_def)) from e
            raise e

        if debug:
            y = oinf.run({'X': X_test}, verbose=level, fLOG=print)
        else:
            y = oinf.run({'X': X_test})

        lexp = clr.predict(X_test)
        if kind == 'reg':
            self.assertEqual(list(sorted(y)), ['variable'])
            if dtype == numpy.float32:
                self.assertEqualArray(lexp.ravel(),
                                      y['variable'].ravel(),
                                      decimal=5)
            else:
                self.assertEqualArray(lexp, y['variable'], decimal=5)
        else:
            self.assertEqual(list(sorted(y)),
                             ['output_label', 'output_probability'])
            self.assertEqualArray(lexp, y['output_label'])
            lprob = clr.predict_proba(X_test)
            self.assertEqualArray(lprob,
                                  DataFrame(y['output_probability']).values,
                                  decimal=5)
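For context, a minimal sketch of the conversion/inference round trip the test above exercises, written against skl2onnx and onnxruntime directly rather than the mlprodict wrappers; clr and the arrays are the ones fitted above:

import numpy
import onnxruntime
from skl2onnx import to_onnx

onx = to_onnx(clr, X_train.astype(numpy.float32))
sess = onnxruntime.InferenceSession(onx.SerializeToString(),
                                    providers=["CPUExecutionProvider"])
input_name = sess.get_inputs()[0].name
outputs = sess.run(None, {input_name: X_test.astype(numpy.float32)})
# For classifiers the graph yields two outputs (predicted labels and
# per-class probabilities); regressors yield a single 'variable' output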
Example #46
    if auroc is not None:
        arq.write("{metric:<18}{value:.4f}\n".format(metric="AUROC:",
                                                     value=auroc))
    if aupr is not None:
        arq.write("{metric:<18}{value:.4f}\n".format(metric="AUPR:",
                                                     value=aupr))


# %%
for n in range(5):

    with gzip.open(caminho + ' ' + str(n), 'rb') as arquivo:
        treino, validacao, teste = pickle.load(arquivo)

    svc_clf = SVC(probability=True, verbose=True,
                  random_state=n)  # Modify the hyperparameters here
    svc_clf.fit(treino.iloc[:, :-2], treino.iloc[:, -2])
    svc_pred_class = svc_clf.predict(validacao.iloc[:, :-2])
    svc_pred_scores = svc_clf.predict_proba(validacao.iloc[:, :-2])

    accuracy, recall, precision, f1, auroc, aupr = compute_performance_metrics(
        validacao.iloc[:, -2], svc_pred_class, svc_pred_scores)
    print('Performance on the validation set:')
    print_metrics_summary(accuracy, recall, precision, f1, auroc, aupr)

    with open('resultados.txt', 'a+') as arq_resul:
        arq_resul.write('Result using database ' + str(n) + ':\n')
        print_metrics_summary2(accuracy, recall, precision, f1, arq_resul,
                               auroc, aupr)
        arq_resul.write('\n')
Example #47
# print(X_train)
# print(X_test)

# Training the Kernel SVM model on the Training set

classifier = SVC(C=20, kernel='rbf', random_state=0, probability=True)
classifier.fit(X_train, y_train)



# Predicting the Test set results
y_pred = classifier.predict(X_test)


# ROC Curve plot
prob = classifier.predict_proba(X_test)
prob = prob[:, 1]
auc = metrics.roc_auc_score(y_test, prob)
print('AUC: {}\n'.format(auc))
fpr, tpr, thresholds = metrics.roc_curve(y_test, prob)

plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label='AUC = {}'.format(auc))
plt.legend(loc='lower right')
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
Example #48
shu = data
shu = scale(shu)
label1 = np.ones((150, 1))  #Value can be changed
label2 = np.zeros((150, 1))
label = np.append(label1, label2)
X = shu
y = label
sepscores = []
ytest = np.ones((1, 2)) * 0.5
yscore = np.ones((1, 2)) * 0.5
cv_clf = SVC(probability=True)
skf = StratifiedKFold(n_splits=5)
for train, test in skf.split(X, y):
    y_train = utils.to_categorical(y[train])
    hist = cv_clf.fit(X[train], y[train])
    y_score = cv_clf.predict_proba(X[test])
    yscore = np.vstack((yscore, y_score))
    y_test = utils.to_categorical(y[test])
    ytest = np.vstack((ytest, y_test))
    fpr, tpr, _ = roc_curve(y_test[:, 0], y_score[:, 0])
    roc_auc = auc(fpr, tpr)
    y_class = utils.categorical_probas_to_classes(y_score)
    y_test_tmp = y[test]
    acc, precision, npv, sensitivity, specificity, mcc, f1 = utils.calculate_performace(
        len(y_class), y_class, y_test_tmp)
    sepscores.append(
        [acc, precision, npv, sensitivity, specificity, mcc, f1, roc_auc])
    print(
        'SVC:acc=%f,precision=%f,npv=%f,sensitivity=%f,specificity=%f,mcc=%f,f1=%f,roc_auc=%f'
        % (acc, precision, npv, sensitivity, specificity, mcc, f1, roc_auc))
scores = np.array(sepscores)
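A small follow-up to the loop above: the per-fold metrics collected in scores can be summarized with a mean and standard deviation per column; a minimal sketch:

import numpy as np

# Columns: acc, precision, npv, sensitivity, specificity, mcc, f1, roc_auc
print('mean per metric:', np.mean(scores, axis=0))
print('std per metric: ', np.std(scores, axis=0))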
Example #49
# ==================================

# TESTING PART:
# ------------

# Extracting Testing Data:
(X_test, y_test) = ef.fetch_test()
x_m, y_sd, X_test = ef.normalize(
    X_test, xm=x_m, ysd=y_sd
)  # Normalizes the testing data with the mean and SD of the training set.

# Fitting the classifier:
print '\nUsing %s estimators of depth %s.\n' % (str(num_est), str(tree_depth))

# Deriving the ROC curve:
y_check = clf.predict(X_test)
y_hat = clf.predict_proba(X_test)
y_hat = array([entry[1] for entry in y_hat])
fpr, tpr, thresholds = roc_curve(y_test, y_hat, pos_label=1)

# Plotting the result:
plt.plot(fpr, tpr)
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.show()

# Evaluating the AUC:
area = trapz(tpr, fpr)
print 'Area under the ROC curve:', area
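The trapezoidal integration above should match sklearn's helpers; a minimal cross-check sketch using the arrays already computed:

from sklearn.metrics import auc, roc_auc_score

print('sklearn auc: %f' % auc(fpr, tpr))
print('roc_auc_score: %f' % roc_auc_score(y_test, y_hat))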
Example #50
    print("getting accuracies %s" % i)  #Use score() function to get accuracy
    npar_pred = np.array(prediction_data)
    pred_lin = clf.score(npar_pred, prediction_labels)
    print "linear: ", pred_lin
    accur_lin.append(pred_lin)  #Store accuracy in a list
    #print("Mean value lin svm: %s" %np.mean(accur_lin)) #FGet mean accuracy of the 10 runs

    test_image = sys.argv[1]
    image = cv2.imread(test_image)  #open image
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)  #convert to grayscale
    clahe_image = clahe.apply(gray)
    get_landmarks(clahe_image)
    if data['landmarks_vectorised'] == "error":
        print("no face detected on this one")
    else:
        val = clf.predict_proba([data['landmarks_vectorised']])
        anger.append(val[0][0])
        fear.append(val[0][1])
        happy.append(val[0][2])
        sadness.append(val[0][3])
    '''
    if val==0:
        print("anger")
    elif val==1:
        print("contempt")
    elif val==2:
        print("disgust")
    elif val==3:
        print("fear")
    elif val==4:
        print("happy")
Example #51
def main(args):

    with tf.Graph().as_default():

        with tf.Session() as sess:

            np.random.seed(seed=args.seed)

            if args.use_split_dataset:
                dataset_tmp = facenet.get_dataset(args.data_dir)
                train_set, test_set = split_dataset(dataset_tmp, args.min_nrof_images_per_class, args.nrof_train_images_per_class)
                if (args.mode=='TRAIN'):
                    dataset = train_set
                elif (args.mode=='CLASSIFY'):
                    dataset = test_set
            else:
                dataset = facenet.get_dataset(args.data_dir)

            # Check that there is at least one training image per class
            for cls in dataset:
                assert len(cls.image_paths) > 0, 'There must be at least one image for each class in the dataset'


            paths, labels = facenet.get_image_paths_and_labels(dataset)

            print('Number of classes: %d' % len(dataset))
            print('Number of images: %d' % len(paths))

            # Load the model
            print('Loading feature extraction model')
            facenet.load_model(args.model)

            # Get input and output tensors
            images_placeholder = tf.get_default_graph().get_tensor_by_name("input:0")
            embeddings = tf.get_default_graph().get_tensor_by_name("embeddings:0")
            phase_train_placeholder = tf.get_default_graph().get_tensor_by_name("phase_train:0")
            embedding_size = embeddings.get_shape()[1]

            # Run forward pass to calculate embeddings
            print('Calculating features for images')
            nrof_images = len(paths)
            nrof_batches_per_epoch = int(math.ceil(1.0*nrof_images / args.batch_size))
            emb_array = np.zeros((nrof_images, embedding_size))
            for i in range(nrof_batches_per_epoch):
                start_index = i*args.batch_size
                end_index = min((i+1)*args.batch_size, nrof_images)
                paths_batch = paths[start_index:end_index]
                images = facenet.load_data(paths_batch, False, False, args.image_size)
                feed_dict = { images_placeholder:images, phase_train_placeholder:False }
                emb_array[start_index:end_index,:] = sess.run(embeddings, feed_dict=feed_dict)

            classifier_filename_exp = os.path.expanduser(args.classifier_filename)

            if (args.mode=='TRAIN'):
                # Train classifier
                print('Training classifier')
                model = SVC(kernel='linear', probability=True)
                model.fit(emb_array, labels)

                # Create a list of class names
                class_names = [ cls.name.replace('_', ' ') for cls in dataset]

                # Saving classifier model
                with open(classifier_filename_exp, 'wb') as outfile:
                    pickle.dump((model, class_names), outfile)
                print('Saved classifier model to file "%s"' % classifier_filename_exp)

            elif (args.mode=='CLASSIFY'):
                # Classify images
                print('Testing classifier')
                with open(classifier_filename_exp, 'rb') as infile:
                    (model, class_names) = pickle.load(infile)

                print('Loaded classifier model from file "%s"' % classifier_filename_exp)

                predictions = model.predict_proba(emb_array)
                best_class_indices = np.argmax(predictions, axis=1)
                best_class_probabilities = predictions[np.arange(len(best_class_indices)), best_class_indices]

                for i in range(len(best_class_indices)):
                    print('%4d  %s: %.3f' % (i, class_names[best_class_indices[i]], best_class_probabilities[i]))

                accuracy = np.mean(np.equal(best_class_indices, labels))
                print('Accuracy: %.3f' % accuracy)
Example #52
# In[10]:

y_test['target'].values

# In[11]:

error = 0
for i, v in enumerate(svm.predict(X_test_std)):
    if v != y_test['target'].values[i]:
        error += 1
print(error)

# In[12]:

svm.predict_proba(X_test_std)

# In[13]:

from matplotlib.colors import ListedColormap


def plot_decision_regions(X, y, classifier, test_idx=None, resolution=0.02):

    # setup marker generator and color map
    markers = ('s', 'x', 'o', '^', 'v')
    colors = ('red', 'blue', 'lightgreen', 'gray', 'cyan')
    cmap = ListedColormap(colors[:len(np.unique(y))])

    # plot the decision surface
    x1_min, x1_max = X[:, 0].min() - 1, X[:, 0].max() + 1
Example #53
## At this point, the new code begins
## I run the SVC code on the cleaned data
## I run the clf.predict to get a binary output and call that array 'bout'
## And then to generate the ROC curve I run the clf.predict_proba to get
## probabilities for each prediction

t = .001  # tolerance parameter
kp = 'rbf'  # kernel parameter
print('\n\nSupport Vector Machine classifier\n')
clf = SVC(kernel=kp, probability=True, tol=t)
clf.fit(train_features, train_targets)
print("predictions for test set:")
bout = clf.predict(test_features)
## and now the probability version
prob_out = clf.predict_proba(test_features)
print(bout)
print('actual class values:')
target_array = np.array(test_targets)
print(target_array)
print('The number of predictions that differ from actual')
print(sum(bout != target_array))

## And now I just re-use the code from the L09-AccuracyMeasures tutorial

CM = confusion_matrix(target_array, bout)
print("\n\nConfusion matrix:\n", CM)
tn, fp, fn, tp = CM.ravel()
print("\nTP, TN, FP, FN:", tp, ",", tn, ",", fp, ",", fn)
AR = accuracy_score(target_array, bout)
print("\nAccuracy rate:", AR)
Example #54
def mipsMain(thresh):
    print thresh
    tex = loadtxt('/home/aditya/Project/csvFiles/trainNLP.csv', delimiter=',')
    tex_dev = loadtxt('/home/aditya/Project/csvFiles/devNLP.csv',
                      delimiter=',')
    vals = tex.shape
    vals_dev = tex_dev.shape
    ty = tex[:, vals[1] - 1]
    tx = tex[:, 0:vals[1] - 2]
    ty_dev = tex_dev[:, vals_dev[1] - 1]
    tx_dev = tex_dev[:, 0:vals_dev[1] - 2]
    clf = SVC(C=4.0,
              cache_size=200,
              coef0=0.0,
              degree=3,
              gamma=9.0,
              kernel='linear',
              probability=True,
              shrinking=True,
              tol=0.0001)  # note: the old scale_C argument was dropped; it no longer exists in sklearn's SVC
    clf.fit(tx, ty)
    #prev_pred=clf.predict(tx_dev[:,0:vals[1]-2])
    #temp = clf.predict_proba(tx_dev[:,0:vals[1]-2])
    prev_pred = clf.predict(tx[:, 0:vals[1] - 2])
    temp = clf.predict_proba(tx[:, 0:vals[1] - 2])
    (ninst, nclass) = temp.shape
    print temp.shape
    obj_coeff = temp.tolist()
    # Mixed Integer Programming
    m = Model()
    # declare Variables
    vars = {}
    for i in range(ninst):
        for j in range(nclass):
            vars[i, j] = m.addVar(obj=-1 * math.log(obj_coeff[i][j]),
                                  vtype=GRB.BINARY,
                                  name='x' + str(i) + '_' + str(j))
    m.update()
    # Add Constraints
    # everything is a Regulation:1,
    # Activation:2 and Inhibition:3 do not occur together
    # Requirement:4 participates in Activation and Inhibition
    # Binding:5 may or may not coexist Transcription:6
    for i in range(ninst):
        # constraint on number of labels predicted
        num_labels = sum(
            np.array([obj_coeff[i][jo] for jo in range(nclass)]) > thresh)
        if num_labels > 0:
            # making sure Regulation always exist
            # m.addConstr(vars[i,0]==1)
            m.addConstr(
                quicksum(vars[i, j] for j in range(nclass)) == num_labels)
            # making sure Activation and Inhibition do not occur together
            m.addConstr(vars[i, 1] + vars[i, 2] <= 1)
            m.addConstr((vars[i, 1] > 0 and vars[i, 0] >= 0)
                        or (vars[i, 1] <= 0))
            # adding a constraint restricting the co-existance of Inhibition and Transcription
            # m.addConstr(vars[i,2]+vars[i,5] <= 1)
            # m.addConstr(vars[i,1]+vars[i,3] == 2)
    m.update()
    # Optimize Model
    m._vars = vars
    m.optimize()
    final = []
    for i in range(ninst):
        final.append(
            [float(j + 1) for j in range(nclass) if m._vars[i, j].x == 1.0])
    return [final, prev_pred]
Example #55
def nestedCrossValidation(data, cVals, kFolds=10, mFolds=5):
    '''
    Nested 10x5 cross-validation by default. Tests the given cVals to determine
    the best hyperparameter for each one-vs-rest classifier.
    :param data: Data being trained on and tested on
    :param cVals: C values being tested
    :param kFolds: number of outer folds
    :param mFolds: number of inner folds
    '''
    target = 0
    data = data.sample(frac=1).reset_index(drop=True)

    wineData1 = data.copy()
    wineData1.loc[wineData1[target] != 1, target] = 0
    wineData2 = data.copy()
    wineData2.loc[wineData2[target] != 2, target] = 0
    wineData3 = data.copy()
    wineData3.loc[wineData3[target] != 3, target] = 0

    numRows = len(data)
    oneKthRows = round(numRows / kFolds)

    resultsTable1 = pd.DataFrame(columns=["OuterFold", "cVal", "TrainAccuracy", \
            "TrainPrecision", "TrainRecall", "TestAccuracy", "TestPrecision", "TestRecall"])
    resultsTable2 = pd.DataFrame(columns=["OuterFold", "cVal", "TrainAccuracy", \
            "TrainPrecision", "TrainRecall", "TestAccuracy", "TestPrecision", "TestRecall"])
    resultsTable3 = pd.DataFrame(columns=["OuterFold", "cVal", "TrainAccuracy", \
            "TrainPrecision", "TrainRecall", "TestAccuracy", "TestPrecision", "TestRecall"])
    combinedResults = pd.DataFrame(columns=["OuterFold", "Combined TestAccuracy", \
                                            "Combined TestPrecision", "Combined TestRecall"])

    _, ax1 = plt.subplots(1)
    _, ax2 = plt.subplots(1)
    _, ax3 = plt.subplots(1)
    ax1.set_title("ROC-AUC Curve for Class 1")
    ax2.set_title("ROC-AUC Curve for Class 2")
    ax3.set_title("ROC-AUC Curve for Class 3")

    for k in range(kFolds):

        testDataK1 = wineData1[(k * oneKthRows):oneKthRows * (k + 1)]
        trainDataK1 = wineData1.drop(
            wineData1.index[(k * oneKthRows):oneKthRows * (k + 1)])
        testDataK1 = testDataK1.reset_index(drop=True)
        trainDataK1 = trainDataK1.reset_index(drop=True)

        testDataK2 = wineData2[(k * oneKthRows):oneKthRows * (k + 1)]
        trainDataK2 = wineData2.drop(
            wineData2.index[(k * oneKthRows):oneKthRows * (k + 1)])
        testDataK2 = testDataK2.reset_index(drop=True)
        trainDataK2 = trainDataK2.reset_index(drop=True)

        testDataK3 = wineData3[(k * oneKthRows):oneKthRows * (k + 1)]
        trainDataK3 = wineData3.drop(
            wineData3.index[(k * oneKthRows):oneKthRows * (k + 1)])
        testDataK3 = testDataK3.reset_index(drop=True)
        trainDataK3 = trainDataK3.reset_index(drop=True)

        oneMthRows = round(len(trainDataK1) / mFolds)

        bestAcc1 = 0
        bestC1 = 0

        bestAcc2 = 0
        bestC2 = 0

        bestAcc3 = 0
        bestC3 = 0

        for m in range(mFolds):

            testDataM1 = trainDataK1[(m * oneMthRows):oneMthRows * (m + 1)]
            trainDataM1 = trainDataK1.drop(
                trainDataK1.index[(m * oneMthRows):oneMthRows * (m + 1)])

            testDataM2 = trainDataK2[(m * oneMthRows):oneMthRows * (m + 1)]
            trainDataM2 = trainDataK2.drop(
                trainDataK2.index[(m * oneMthRows):oneMthRows * (m + 1)])

            testDataM3 = trainDataK3[(m * oneMthRows):oneMthRows * (m + 1)]
            trainDataM3 = trainDataK3.drop(
                trainDataK3.index[(m * oneMthRows):oneMthRows * (m + 1)])

            # normalize training and test data with zscore normalization
            trainDataM1, normParams, _ = pf.zScoreNormalization(
                trainDataM1, target)
            testDataM1, _, _ = pf.zScoreNormalization(testDataM1,
                                                      target,
                                                      normParams=normParams)
            trainDataM2, normParams, _ = pf.zScoreNormalization(
                trainDataM2, target)
            testDataM2, _, _ = pf.zScoreNormalization(testDataM2,
                                                      target,
                                                      normParams=normParams)
            trainDataM3, normParams, _ = pf.zScoreNormalization(
                trainDataM3, target)
            testDataM3, _, _ = pf.zScoreNormalization(testDataM3,
                                                      target,
                                                      normParams=normParams)

            # Try each C value from the list given at each inner fold
            for c in cVals:

                x_train1 = trainDataM1.drop(target, axis=1)
                y_train1 = trainDataM1[target]
                x_test1 = testDataM1.drop(target, axis=1)
                y_test1 = testDataM1[target]

                x_train2 = trainDataM2.drop(target, axis=1)
                y_train2 = trainDataM2[target]
                x_test2 = testDataM2.drop(target, axis=1)
                y_test2 = testDataM2[target]

                x_train3 = trainDataM3.drop(target, axis=1)
                y_train3 = trainDataM3[target]
                x_test3 = testDataM3.drop(target, axis=1)
                y_test3 = testDataM3[target]

                # linear kernel
                classifier1 = SVC(kernel='linear', C=c, probability=True)
                classifier2 = SVC(kernel='linear', C=c, probability=True)
                classifier3 = SVC(kernel='linear', C=c, probability=True)

                # Train model
                classifier1.fit(x_train1, y_train1)
                classifier2.fit(x_train2, y_train2)
                classifier3.fit(x_train3, y_train3)

                # TESTING model
                y_pred1 = classifier1.predict(x_test1)
                y_pred2 = classifier2.predict(x_test2)
                y_pred3 = classifier3.predict(x_test3)

                y_prob1 = classifier1.predict_proba(x_test1)
                y_prob1 = y_prob1[:, [1]]

                y_prob2 = classifier2.predict_proba(x_test2)
                y_prob2 = y_prob2[:, [1]]

                y_prob3 = classifier3.predict_proba(x_test3)
                y_prob3 = y_prob3[:, [1]]

                acc1 = metrics.accuracy_score(y_test1, y_pred1)
                acc2 = metrics.accuracy_score(y_test2, y_pred2)
                acc3 = metrics.accuracy_score(y_test3, y_pred3)

                if acc1 > bestAcc1:
                    bestAcc1 = acc1
                    bestC1 = c
                if acc2 > bestAcc2:
                    bestAcc2 = acc2
                    bestC2 = c
                if acc3 > bestAcc3:
                    bestAcc3 = acc3
                    bestC3 = c

            print("Kfold:", k + 1, "Mfold:", m + 1, "Class1BestC:", bestC1, \
                   "Class2BestC:", bestC2, "Class3BestC:", bestC3)

        # normalize training and test data with zscore normalization
        trainDataK1, normParams, _ = pf.zScoreNormalization(
            trainDataK1, target)
        testDataK1, _, _ = pf.zScoreNormalization(testDataK1,
                                                  target,
                                                  normParams=normParams)
        trainDataK2, normParams, _ = pf.zScoreNormalization(
            trainDataK2, target)
        testDataK2, _, _ = pf.zScoreNormalization(testDataK2,
                                                  target,
                                                  normParams=normParams)
        trainDataK3, normParams, _ = pf.zScoreNormalization(
            trainDataK3, target)
        testDataK3, _, _ = pf.zScoreNormalization(testDataK3,
                                                  target,
                                                  normParams=normParams)

        x_train1 = trainDataK1.drop(target, axis=1)
        y_train1 = trainDataK1[target]
        x_test1 = testDataK1.drop(target, axis=1)
        y_test1 = testDataK1[target]

        x_train2 = trainDataK2.drop(target, axis=1)
        y_train2 = trainDataK2[target]
        x_test2 = testDataK2.drop(target, axis=1)
        y_test2 = testDataK2[target]

        x_train3 = trainDataK3.drop(target, axis=1)
        y_train3 = trainDataK3[target]
        x_test3 = testDataK3.drop(target, axis=1)
        y_test3 = testDataK3[target]

        # linear kernel
        classifier1 = SVC(kernel='linear', C=bestC1, probability=True)
        classifier2 = SVC(kernel='linear', C=bestC2, probability=True)
        classifier3 = SVC(kernel='linear', C=bestC3, probability=True)

        # Train model
        classifier1.fit(x_train1, y_train1)
        classifier2.fit(x_train2, y_train2)
        classifier3.fit(x_train3, y_train3)

        # TESTING model
        y_pred1 = classifier1.predict(x_test1)
        y_pred2 = classifier2.predict(x_test2)
        y_pred3 = classifier3.predict(x_test3)

        y_prob1 = classifier1.predict_proba(x_test1)
        y_prob1 = y_prob1[:, [1]]

        y_prob2 = classifier2.predict_proba(x_test2)
        y_prob2 = y_prob2[:, [1]]

        y_prob3 = classifier3.predict_proba(x_test3)
        y_prob3 = y_prob3[:, [1]]

        y_preds = multiclassPredict(y_prob1, y_prob2, y_prob3)
        y_tests = multiclassPredict(np.transpose([np.array(y_test1)]), np.transpose([np.array(y_test2)])\
                               , np.transpose([np.array(y_test3)]))

        acc1 = metrics.accuracy_score(y_test1, y_pred1)
        acc2 = metrics.accuracy_score(y_test2, y_pred2)
        acc3 = metrics.accuracy_score(y_test3, y_pred3)

        prec1 = metrics.precision_score(y_test1, y_pred1)
        prec2 = metrics.precision_score(y_test2, y_pred2, average=None)
        prec2 = prec2[1]
        prec3 = metrics.precision_score(y_test3, y_pred3, average=None)
        prec3 = prec3[1]

        rec1 = metrics.recall_score(y_test1, y_pred1)
        rec2 = metrics.recall_score(y_test2, y_pred2, average=None)
        rec2 = rec2[1]
        rec3 = metrics.recall_score(y_test3, y_pred3, average=None)
        rec3 = rec3[1]

        y_pred1Tr = classifier1.predict(x_train1)
        y_pred2Tr = classifier2.predict(x_train2)
        y_pred3Tr = classifier3.predict(x_train3)

        acc1Tr = metrics.accuracy_score(y_train1, y_pred1Tr)
        acc2Tr = metrics.accuracy_score(y_train2, y_pred2Tr)
        acc3Tr = metrics.accuracy_score(y_train3, y_pred3Tr)

        prec1Tr = metrics.precision_score(y_train1, y_pred1Tr)
        prec2Tr = metrics.precision_score(y_train2, y_pred2Tr, average=None)
        prec2Tr = prec2Tr[1]
        prec3Tr = metrics.precision_score(y_train3, y_pred3Tr, average=None)
        prec3Tr = prec3Tr[1]

        rec1Tr = metrics.recall_score(y_train1, y_pred1Tr)
        rec2Tr = metrics.recall_score(y_train2, y_pred2Tr, average=None)
        rec2Tr = rec2Tr[1]
        rec3Tr = metrics.recall_score(y_train3, y_pred3Tr, average=None)
        rec3Tr = rec3Tr[1]

        testAcc = metrics.accuracy_score(y_tests, y_preds)
        testPrec = metrics.precision_score(y_tests,
                                           y_preds,
                                           average='weighted')
        testRec = metrics.recall_score(y_tests, y_preds, average='weighted')

        resultsTable1.loc[len(resultsTable1)] = [
            k + 1, bestC1, acc1Tr, prec1Tr, rec1Tr, acc1, prec1, rec1
        ]
        resultsTable2.loc[len(resultsTable2)] = [
            k + 1, bestC2, acc2Tr, prec2Tr, rec2Tr, acc2, prec2, rec2
        ]
        resultsTable3.loc[len(resultsTable3)] = [
            k + 1, bestC3, acc3Tr, prec3Tr, rec3Tr, acc3, prec3, rec3
        ]
        combinedResults.loc[len(combinedResults)] = [
            k + 1, testAcc, testPrec, testRec
        ]

        metrics.plot_roc_curve(classifier1,
                               x_test1,
                               y_test1,
                               name="AUC fold#" + str(k + 1),
                               ax=ax1)
        metrics.plot_roc_curve(classifier2,
                               x_test2,
                               y_test2,
                               name="AUC fold#" + str(k + 1),
                               ax=ax2)
        metrics.plot_roc_curve(classifier3,
                               x_test3,
                               y_test3,
                               name="AUC fold#" + str(k + 1),
                               ax=ax3)

    cList = resultsTable1["cVal"].tolist()
    accListTr = resultsTable1["TrainAccuracy"].tolist()
    precListTr = resultsTable1["TrainPrecision"].tolist()
    recListTr = resultsTable1["TrainRecall"].tolist()
    accList = resultsTable1["TestAccuracy"].tolist()
    precList = resultsTable1["TestPrecision"].tolist()
    recList = resultsTable1["TestRecall"].tolist()
    resultsTable1.loc[len(resultsTable1)] = ["Avg", st.mean(cList), \
        st.mean(accListTr), st.mean(precListTr), st.mean(recListTr), st.mean(accList), st.mean(precList), st.mean(recList)]
    resultsTable1.loc[len(resultsTable1)] = ["StdDev", st.stdev(cList), \
        st.stdev(accListTr), st.stdev(precListTr), st.stdev(recListTr), st.stdev(accList), st.stdev(precList), st.stdev(recList)]

    cList = resultsTable2["cVal"].tolist()
    accListTr = resultsTable2["TrainAccuracy"].tolist()
    precListTr = resultsTable2["TrainPrecision"].tolist()
    recListTr = resultsTable2["TrainRecall"].tolist()
    accList = resultsTable2["TestAccuracy"].tolist()
    precList = resultsTable2["TestPrecision"].tolist()
    recList = resultsTable2["TestRecall"].tolist()
    resultsTable2.loc[len(resultsTable2)] = ["Avg", st.mean(cList), \
        st.mean(accListTr), st.mean(precListTr), st.mean(recListTr), st.mean(accList), st.mean(precList), st.mean(recList)]
    resultsTable2.loc[len(resultsTable2)] = ["StdDev", st.stdev(cList), \
        st.stdev(accListTr), st.stdev(precListTr), st.stdev(recListTr), st.stdev(accList), st.stdev(precList), st.stdev(recList)]

    cList = resultsTable3["cVal"].tolist()
    accListTr = resultsTable3["TrainAccuracy"].tolist()
    precListTr = resultsTable3["TrainPrecision"].tolist()
    recListTr = resultsTable3["TrainRecall"].tolist()
    accList = resultsTable3["TestAccuracy"].tolist()
    precList = resultsTable3["TestPrecision"].tolist()
    recList = resultsTable3["TestRecall"].tolist()
    resultsTable3.loc[len(resultsTable3)] = ["Avg", st.mean(cList), \
        st.mean(accListTr), st.mean(precListTr), st.mean(recListTr), st.mean(accList), st.mean(precList), st.mean(recList)]
    resultsTable3.loc[len(resultsTable3)] = ["StdDev", st.stdev(cList), \
        st.stdev(accListTr), st.stdev(precListTr), st.stdev(recListTr), st.stdev(accList), st.stdev(precList), st.stdev(recList)]

    accList = combinedResults["Combined TestAccuracy"].tolist()
    precList = combinedResults["Combined TestPrecision"].tolist()
    recList = combinedResults["Combined TestRecall"].tolist()
    combinedResults.loc[len(combinedResults)] = [
        "Avg", st.mean(accList),
        st.mean(precList),
        st.mean(recList)
    ]
    combinedResults.loc[len(combinedResults)] = [
        "StdDev",
        st.stdev(accList),
        st.stdev(precList),
        st.stdev(recList)
    ]

    return resultsTable1, resultsTable2, resultsTable3, combinedResults
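A minimal usage sketch for the function above, assuming data is the wine DataFrame with the class label in column 0; the cVals grid is illustrative:

rt1, rt2, rt3, combined = nestedCrossValidation(data, cVals=[0.01, 0.1, 1, 10])
print(combined)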
Example #56
class SVCT(BaseEstimator):
    """
    2-phase SVC for SASSE ERA5 polygon classification
    """
    def __init__(self,
                 args1={
                     'kernel': 'rbf',
                     'probability': True
                 },
                 args2={
                     'kernel': DotProduct(),
                     'probability': True
                 },
                 verbose=False):
        """
        ...
        """
        self.model1 = SVC(**args1)
        self.model2 = SVC(**args2)

        self.verbose = verbose

    def fit(self, X, y):
        """
        Fit in two phases
        """

        X1 = X
        y1 = y.copy()
        y1[(y1 > 0)] = 1

        X2 = X.copy()
        y2 = y.copy()
        X2 = X2[(y > 0)]
        y2 = y2[(y2 > 0)]

        if self.verbose:
            logging.info('Fitting model 1...')
        self.model1.fit(X1.values, y1)

        if self.verbose:
            logging.info('Fitting model 2...')
        self.model2.fit(X2.values, y2)

        return self

    def predict(self, X):
        """
        Predict
        """

        #y_pred_proba = self.predict_proba(X)
        #return np.argmax(y_pred_proba, axis=1)

        # Alternative, more straightforward method
        y1_ = self.model1.predict(X)
        X2_ = X[(y1_ > 0)]
        y2_ = self.model2.predict(X2_)
        y1_[(y1_ > 0)] = y2_

        return y1_

    def predict_proba(self, X):
        """
        Predict with probabilities
        """

        yp1_ = self.model1.predict_proba(X)
        yp2_ = self.model2.predict_proba(X)

        y_pred_proba = np.zeros((len(X), 3))

        y_pred_proba[:, 0] = yp1_[:, 0]
        y_pred_proba[:, 1] = np.where(yp1_[:, 1] >= .5, yp2_[:, 0], yp1_[:, 1])
        y_pred_proba[:, 2] = np.where(yp1_[:, 1] >= .5, yp2_[:, 1], yp1_[:, 1])

        return y_pred_proba
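A minimal usage sketch for the two-phase classifier above; iris stands in for the SASSE ERA5 polygons here, and a DataFrame is used because fit calls X.values:

import pandas as pd
from sklearn.datasets import load_iris

iris = load_iris()
X = pd.DataFrame(iris.data)
y = iris.target  # class 0 = background; classes 1 and 2 are separated by model 2

clf = SVCT(verbose=True).fit(X, y)
proba = clf.predict_proba(X)  # (n, 3) array assembled from both models' outputs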
Example #57
def Datasets():
    """
    just wants to split the dataset into 2 sets : train_set, test_set
        train_set : to build model
        test_set : for testing model
        valid_set : optional
        
    """
    # READ CSV
    diabetes = pd.read_csv(filename)
    #print(diabetes) : to see the table

    # INDEPENDENT VARs (features)
    X = diabetes.iloc[:, [1, 2]].values
    # DEPENDENT VAR : Outcome
    y = diabetes['Outcome'].values

    # Visualize the dataset before scaling
    sns.lmplot(x='Glucose',
               y='BloodPressure',
               data=diabetes,
               hue='Outcome',
               palette='Set1',
               fit_reg=False,
               scatter_kws={'s': 70})

    # Splitting the data into training, and testing set
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=0)
    print("training set: ", X_train.shape)
    print("testing set: ", X_test.shape)

    # Feature scaling
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    #scaler = MinMaxScaler()
    #X_train = scaler.fit_transform(X_train)
    #X_test = scaler.transform(X_test)

    # Build model
    svm = SVC(kernel='rbf', probability=True)
    svm.fit(X_train, y_train)

    # Visualize support vectors of training set
    support_vectors = svm.support_vectors_

    plt.scatter(X_train[:, 0], X_train[:, 1])
    plt.scatter(support_vectors[:, 0], support_vectors[:, 1], color='blue')
    plt.title('Support Vectors')
    plt.xlabel('X')
    plt.ylabel('y')
    plt.show()

    # Visualize training & Testing set
    # Replace X_set, y_set according to the visualization
    X_set, y_set = X_train, y_train
    X1, X2 = np.meshgrid(np.arange(start = X_set[:, 0].min() - 1, stop = X_set[:, 0].max() + 1, step = 0.01),\
                         np.arange(start = X_set[:, 1].min() - 1, stop = X_set[:, 1].max() + 1, step = 0.01))
    plt.contourf(X1,
                 X2,
                 svm.predict(np.array([X1.ravel(),
                                       X2.ravel()]).T).reshape(X1.shape),
                 alpha=0.75,
                 cmap=ListedColormap(('red', 'green')))
    plt.xlim(X1.min(), X1.max())
    plt.ylim(X2.min(), X2.max())

    for i, j in enumerate(np.unique(y_set)):
        plt.scatter(X_set[y_set == j, 0],
                    X_set[y_set == j, 1],
                    c=ListedColormap(('red', 'green'))(i),
                    label=j)

    plt.title('SVM (training)')
    plt.xlabel('Glucose')
    plt.ylabel('Blood Pressure')
    plt.legend()
    plt.show()

    print("\n")

    # Predict
    y_pred = svm.predict(X_test)
    # Accuracy
    test_accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy of prediction: ", test_accuracy)

    # Compute the confusion matrix (from which recall, precision, etc. can be derived)
    confusionmatrix = confusion_matrix(y_test, y_pred)
    print("The confusion matrix: ", confusionmatrix)

    print("\n")
    # Evaluate with cross-validation on the testing set
    test_scores = cross_val_score(svm,
                                  X_test,
                                  y_test,
                                  scoring='accuracy',
                                  cv=5)
    print("Accuracy of testing set by using cross val : ", test_scores)
    print("Mean of test scores: ", test_scores.mean())

    print("\n")

    # Cross-check: inspect the probabilities behind each predicted class
    predict_proba_value = svm.predict_proba(X_test)
    y_pred_valid = svm.predict(X_test)

    print("Prediction probabilities used to decide each sample's class: ",
          predict_proba_value)
    print("SVM predictions on the test set: ", y_pred_valid)
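A minimal usage sketch; filename is the module-level variable the function reads, and the CSV path here is an assumption:

filename = 'diabetes.csv'  # assumed path to the Pima diabetes dataset
Datasets()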