コード例 #1
0
def build_and_run_rvc(X_train_scaled, y_train, X_test_scaled, y_test):
    """Fit an RVC on the training data, predict the test data and report.

    Prints the wall-clock duration of the fit and of the prediction, the
    estimator's parameters, a confusion matrix and a classification report.

    Returns:
        A ``(fit_seconds, predict_seconds)`` tuple.
    """
    classifier = RVC()

    # Time the training step.
    print("fitting RVM:")
    fit_began = time.time()
    classifier.fit(X_train_scaled, y_train)
    fit_seconds = time.time() - fit_began
    print("time to fit RVM: ", fit_seconds)

    # Time the prediction step.
    predict_began = time.time()
    test_predictions = classifier.predict(X_test_scaled)
    predict_seconds = time.time() - predict_began
    print("time to predict with RVM: ", predict_seconds)

    # Report the estimator configuration.
    print("RVM hyperparameters:")
    print(classifier.get_params())

    # Report prediction quality against the true test labels.
    print(helpers.confusion_matrix(y_test, test_predictions))
    print(classification_report(y_test, test_predictions))

    return fit_seconds, predict_seconds
コード例 #2
0
ファイル: test_rvm.py プロジェクト: zhaobeile/scikit-rvm
    def test_fit_two_classes(self):
        """Fitting on exactly two labels exposes both of them in ``classes_``."""
        samples = np.array([[1, 2], [2, 1]])
        labels = np.array(['A', 'B'])

        model = RVC()
        model.fit(samples, labels)

        expected_classes = np.array(['A', 'B'])
        np.testing.assert_array_equal(model.classes_, expected_classes)
コード例 #3
0
ファイル: test_rvm.py プロジェクト: zhaobeile/scikit-rvm
    def test_fit_three_classes(self):
        """Fitting on three labels stores a OneVsOneClassifier in ``multi_``."""
        samples = np.array([[1, 2], [2, 1], [2, 2]])
        labels = np.array(['A', 'B', 'C'])

        model = RVC()
        model.fit(samples, labels)

        self.assertIsInstance(model.multi_, OneVsOneClassifier)
        expected_classes = np.array(['A', 'B', 'C'])
        np.testing.assert_array_equal(model.classes_, expected_classes)
コード例 #4
0
ファイル: test_rvm.py プロジェクト: zhaobeile/scikit-rvm
    def test_classification_three_classes(self):
        """Accuracy on the iris data the model was fitted on exceeds 0.95."""
        dataset = load_iris()
        features, targets = dataset.data, dataset.target

        model = RVC()
        model.fit(features, targets)

        accuracy = model.score(features, targets)
        self.assertGreater(accuracy, 0.95)
コード例 #5
0
ファイル: test_rvm.py プロジェクト: zhaobeile/scikit-rvm
    def test_predict_three_classes(self):
        """A linear-kernel model fitted on three labelled points predicts 'A' for (10, 10)."""
        train_points = np.array([[5, 5], [5, -5], [-5, 0]])
        train_labels = np.array(['A', 'B', 'C'])

        model = RVC(kernel='linear')
        model.fit(train_points, train_labels)

        query = np.array([[10, 10]])
        predicted = model.predict(query)
        np.testing.assert_array_equal(predicted, np.array(['A']))
コード例 #6
0
ファイル: test_rvm.py プロジェクト: zhaobeile/scikit-rvm
    def test_fit_one_class(self):
        """Check that fitting with only one class raises a ValueError.

        The error message must be exactly "Need 2 or more classes.".
        """
        clf = RVC()

        X = np.array([[1, 2], [2, 1]])

        y = np.array(['A', 'A'])

        # assertRaises fails the test with a descriptive message when no
        # ValueError is raised, replacing the manual try/except/else + fail().
        with self.assertRaises(ValueError) as caught:
            clf.fit(X, y)

        self.assertEqual(str(caught.exception), "Need 2 or more classes.")
コード例 #7
0
ファイル: test_rvm.py プロジェクト: zhaobeile/scikit-rvm
    def test_predict_two_classes(self):
        """A linear-kernel model fitted on two labelled points predicts 'A' for (0, 3)."""
        train_points = np.array([[2, 1], [1, 2]])
        train_labels = np.array(['A', 'B'])

        model = RVC(kernel='linear')
        model.fit(train_points, train_labels)

        query = np.array([[0, 3]])
        predicted = model.predict(query)
        np.testing.assert_array_equal(predicted, np.array(['A']))
コード例 #8
0
ファイル: test_rvm.py プロジェクト: zhaobeile/scikit-rvm
    def test_fit_two_classes_imbalanced(self):
        """Fitting two classes with unequal sample counts still yields both labels."""
        # Two samples of 'A' followed by three samples of 'B'.
        samples = np.array([[1, 2], [1, 4], [4, 2], [2, 1], [3, 1.5]])
        labels = np.array(['A', 'A', 'B', 'B', 'B'])

        model = RVC()
        model.fit(samples, labels)

        expected_classes = np.array(['A', 'B'])
        np.testing.assert_array_equal(model.classes_, expected_classes)
コード例 #9
0
def rvc_analysis(random_seed, save_path):
    """Fit a linear-kernel RVC on a pickled data split and evaluate it.

    The split is read from
    ``<save_path>/random_seed_<NNN>/splitted_dataset_<dataset>.pickle`` and
    the two confusion-matrix figures are written to the same directory.
    Relies on the module-level names ``debug`` and ``dataset``.

    Args:
        random_seed: Integer used to build the ``random_seed_%03d``
            sub-directory name.
        save_path: Parent directory that contains the per-seed sub-directory.

    Returns:
        ``(cm_test, cm_validation, accuracy_test, accuracy_val)``: the values
        returned by ``plot_confusion_matrix`` and ``accuracy_score`` for the
        test and validation data.
    """
    # Load the data
    # TODO: change the path
    save_path = os.path.join(save_path, 'random_seed_%03d' % random_seed)
    print('Random seed: %03d' % random_seed)

    # The returned paths are not used below; the call is kept unchanged.
    project_ukbio_wd, project_data_ukbio, _ = get_paths(debug, dataset)

    # Load the saved validation dataset.
    # NOTE(review): pickle.load can execute arbitrary code; only point this
    # at files produced by a trusted pipeline.
    dataset_file = os.path.join(save_path,
                                'splitted_dataset_%s.pickle' % dataset)
    with open(dataset_file, 'rb') as handle:
        splitted_dataset = pickle.load(handle)

    # Train the model
    model = RVC(kernel='linear')
    model.fit(splitted_dataset['Xtrain_scaled'], splitted_dataset['Ytrain'])

    # Predict the test and validation data
    print('Perform prediction in test data')
    y_prediction_test = model.predict(splitted_dataset['Xtest_scaled'])
    y_prediction_validation = model.predict(
        splitted_dataset['Xvalidate_scaled'])

    # -------------------------------------------------------------------------
    # Do some statistics. Calculate the confusion matrix
    class_name = np.array(['young', 'old', 'adult'], dtype='U10')

    # Test dataset: confusion matrix, figure and accuracy
    ax, cm_test = plot_confusion_matrix(splitted_dataset['Ytest'],
                                        y_prediction_test,
                                        classes=class_name,
                                        normalize=True)
    accuracy_test = accuracy_score(splitted_dataset['Ytest'],
                                   y_prediction_test)
    plt.savefig(os.path.join(save_path, 'confusion_matrix_test_rvc.eps'))

    # Validation dataset: confusion matrix, figure and accuracy
    ax, cm_validation = plot_confusion_matrix(splitted_dataset['Yvalidate'],
                                              y_prediction_validation,
                                              classes=class_name,
                                              normalize=True)
    plt.savefig(os.path.join(save_path, 'confusion_matrix_validation_rvc.eps'))
    accuracy_val = accuracy_score(splitted_dataset['Yvalidate'],
                                  y_prediction_validation)
    # The original saved 'confusion_matrix_test_rvc.eps' a second time here,
    # after the validation plot had been drawn; that duplicate save was
    # removed so the test figure is not overwritten.
    return cm_test, cm_validation, accuracy_test, accuracy_val
コード例 #10
0
ファイル: test_rvm.py プロジェクト: zhaobeile/scikit-rvm
    def test_classification_two_classes(self):
        """Check classification works with two classes.

        Uses iris without its first feature column and without class 0,
        checks the accuracy on the fitted data and the predicted
        probabilities of the first sample.
        """
        iris = load_iris()

        X = iris.data[:, 1:]
        y = iris.target

        # Only 2 classes needed
        X = X[y != 0]
        y = y[y != 0]

        clf = RVC()

        clf.fit(X, y)

        self.assertGreater(clf.score(X, y), 0.95)

        # Slice instead of indexing so the single sample keeps its 2-D
        # (1, n_features) shape, matching the (1, 2) shape of p_target.
        prob = clf.predict_proba(X[:1, :])
        p_target = np.array([[0.999, 5.538e-4]])
        np.testing.assert_allclose(prob, p_target, rtol=1e-2, atol=1e-2)
コード例 #11
0
ファイル: MyRVM.py プロジェクト: wyfunique/ML18SP-Project-2
def TrainMyRVM(XEstimate,
               XValidate,
               ClassLabelsEstimate,
               ClassLabelsValidate,
               Parameters=None):
    """Fit an RBF-kernel RVC on a random subsample and predict a validation subsample.

    Each row of ``ClassLabelsEstimate`` is converted to the index of its
    single entry equal to 1. At most 4000 training rows and at most 1000
    validation rows are drawn without replacement.

    Args:
        XEstimate: Training features, indexable by an integer index array.
        XValidate: Validation features, indexable by an integer index array.
        ClassLabelsEstimate: Training labels, one row per sample, with
            exactly one entry equal to 1 per row.
        ClassLabelsValidate: Validation labels, one row per sample.
        Parameters: Unused; kept for interface compatibility.

    Returns:
        ``(Yvalidate, EstParameters, Rvectors, ClassLabelsValidate_sampled,
        idx_validate)``: the predicted classes for the sampled validation
        rows, the fitted RVC, the constant 1, the sampled validation label
        rows and the indices of the sampled validation rows.

    Raises:
        ValueError: If a training label row does not contain exactly one 1.
    """
    n_estimate = ClassLabelsEstimate.shape[0]
    n_validate = ClassLabelsValidate.shape[0]

    # Convert each training label row to the position of its single 1.
    # .item() insists on exactly one match instead of assigning a length-1
    # array to a scalar slot, which newer NumPy versions deprecate.
    training_labels = np.zeros(n_estimate, dtype=np.int8)
    for i in range(n_estimate):
        training_labels[i] = np.where(ClassLabelsEstimate[i] == 1)[0].item()

    # Sample up to 4000 training rows; the same indices select both the
    # features and the labels. The size is clamped so that smaller data
    # sets no longer make np.random.choice raise.
    idx_training = np.random.choice(np.arange(n_estimate),
                                    min(4000, n_estimate),
                                    replace=False)
    training_labels_sampled = training_labels[idx_training]
    XEstimate_sampled = XEstimate[idx_training]

    # Sample up to 1000 validation rows; the same indices select both the
    # features and the labels.
    idx_validate = np.random.choice(np.arange(n_validate),
                                    min(1000, n_validate),
                                    replace=False)
    XValidate_sampled = XValidate[idx_validate]
    ClassLabelsValidate_sampled = ClassLabelsValidate[idx_validate]

    # Initialize the RVM for classification (RVC class) and fit it.
    rvm = RVC(kernel='rbf', n_iter=1, alpha=1.e-6, beta=1.e-6, verbose=True)
    rvm.fit(XEstimate_sampled, training_labels_sampled)

    # Predict a class for each sampled validation row.
    Yvalidate = rvm.predict(XValidate_sampled)
    EstParameters = rvm
    Rvectors = 1

    # The original returned the undefined name `idx2` (NameError).
    # NOTE(review): idx_validate is assumed to be the intended value, since
    # it identifies the rows behind Yvalidate - confirm against callers.
    return (Yvalidate, EstParameters, Rvectors, ClassLabelsValidate_sampled,
            idx_validate)
コード例 #12
0
def RVM(X_hyper, Y_hyper, X_train, Y_train, X_validate, Y_validate, params):
    """Train (or load) an RVC and predict the head of the validation data.

    Args:
        X_hyper, Y_hyper: Unused; kept for interface compatibility.
        X_train, Y_train: Training features and labels; only the first
            ``params['train_size']`` rows are used for fitting.
        X_validate: Validation features; only the first
            ``params['test_size']`` rows are predicted.
        Y_validate: Unused; kept for interface compatibility.
        params: Mapping with the keys ``'train_size'``, ``'test_size'`` and
            ``'train'``. When ``params['train']`` is truthy a new model is
            fitted and stored with ``writeObj('rvm_model.pkl', ...)``;
            otherwise the model is loaded with ``readObj('rvm_model.pkl')``.

    Returns:
        ``(Y_pred, clf)``: the predictions and the model that produced them.
    """
    train_size = params['train_size']
    test_size = params['test_size']
    train = params['train']

    # time.clock() was removed in Python 3.8; time.time() works everywhere.
    start = time.time()

    if train:
        clf = RVC(n_iter=100, tol=0.1)
        clf.fit(X_train[:train_size, :], Y_train[:train_size])
        writeObj('rvm_model.pkl', clf)
    else:
        clf = readObj('rvm_model.pkl')

    Y_pred = clf.predict(X_validate[:test_size])

    # In the original this report sat after both `return` statements and was
    # never reached. The single-argument form prints the same on Python 2 and 3.
    print("training took %s s" % (time.time() - start))
    return Y_pred, clf
コード例 #13
0
ファイル: rvm.py プロジェクト: abhven/ML-Project
"""
Authors: Mrunmayee Deshpande, Lu Gan, Bruce Huang, Abhishek Venkataraman 

"""
import timeit
from skrvm import RVC
import numpy as np
import os.path
import scipy.io

from import_data import import_data

## Set data path
parsed_data_path = 'parsed_data/'
X, Y, valX, valY, testX, testY = import_data(parsed_data_path)

# Export the three splits as .mat files.
scipy.io.savemat('train.mat', dict(X=X, Y=Y))
scipy.io.savemat('val.mat', dict(valX=valX, valY=valY))
scipy.io.savemat('test.mat', dict(testX=testX, testY=testY))

## Train a RVM
clf = RVC(verbose=True)
print(clf)
# NOTE(review): the model is fitted on the validation split (valX, valY),
# not on (X, Y) - confirm this is intentional.
clf.fit(valX, valY)
# The score used to be computed and discarded; print it so that running the
# script actually reports the result.
print(clf.score(testX, testY))
コード例 #14
0
#Reintegrate the validation set into the train/test sets for RVC

RXTrainHvM = HCvMCI[train_inds,]
RYTrainHvM = YHvM[train_inds]
RXTestHvM = HCvMCI[test_inds,]
RYTestHvM = YHvM[test_inds]

#resampling w/ SMOTE to account for uneven sampling
# NOTE(review): the test split is resampled as well - confirm that
# evaluating on synthetic test samples is intended.

[XTrainResHvM,YTrainResHvM] = SMOTE(random_state = 100,k_neighbors = 3).fit_resample(RXTrainHvM,RYTrainHvM)
[XTestResHvM,YTestResHvM] = SMOTE(random_state = 100,k_neighbors = 3).fit_resample(RXTestHvM,RYTestHvM)

from skrvm import RVC
RVCMod = RVC(kernel = 'linear',
             verbose = True)
# Fixed the misspelled name `XTrainResHvm` (lower-case m), which raised a
# NameError; the resampled training features are `XTrainResHvM`.
RVCMod.fit(XTrainResHvM,YTrainResHvM)

#create feature importance evaluation function

def RVMFeatImp(RVs):
    """Accumulate row / (sum of the other rows) ratios over ``RVs``.

    For every ``RVNum`` in ``1 .. NumRVs - 1`` the row ``RVs[RVNum - 1]`` is
    divided by the sum of all elements of ``RVs`` outside row ``RVNum``; the
    accumulated ratios are finally divided by ``NumRVs``.

    Args:
        RVs: Array whose first axis enumerates the rows to combine.

    Returns:
        The accumulated ratios divided by ``NumRVs``. For an array with at
        most one row the loop does not run and the result is ``0 / NumRVs``.
    """
    NumRVs = RVs.shape[0]
    SumD = 0
    for RVNum in range(1, NumRVs):
        # NOTE(review): the numerator is row RVNum - 1 while the row left
        # out of the denominator is RVNum, and the last row is never a
        # numerator - kept as in the original; confirm the intended pairing.
        d1 = RVs[RVNum - 1, ]
        # numpy.delete drops row RVNum without round-tripping the indices
        # through int8, which overflowed for arrays with more than 127 rows.
        d2 = sum(numpy.delete(RVs, RVNum, axis=0).flatten())
        SumD = SumD + (d1 / d2)
    SumD = SumD / NumRVs
    return SumD
コード例 #15
0
def TrainMyClassifier(XEstimate, ClassLabels, XValidate, Parameters):
    """Train the classifier named by ``Parameters['algorithm']`` and score ``XValidate``.

    Supported values of ``Parameters['algorithm']``:

    * ``'RVM'`` - fits an ``RVC`` built from ``Parameters['parameters']``
      (``alpha``, ``beta``, ``n_iter``).
    * ``'SVM'`` - delegates to the ``get_svc`` / ``svc_train`` /
      ``svc_probability`` / ``svc_get_para`` helpers and appends one extra
      probability column derived from the per-row standard deviation.
    * ``'GPR'`` - fits one ``GaussianProcessClassifier`` per pair of classes,
      accumulates the pairwise probabilities per class and appends one extra
      column derived from the per-row standard deviation.

    Args:
        XEstimate: Training features.
        ClassLabels: Training labels.
        XValidate: Features to compute probabilities for.
        Parameters: Mapping with the key ``'algorithm'`` plus the
            algorithm-specific settings described above.

    Returns:
        ``(Yvalidate, EstParameters)`` for the selected algorithm. For any
        other ``'algorithm'`` value the function falls through and returns
        ``None``.
    """
    algorithm = Parameters['algorithm']

    # RVM
    if algorithm == 'RVM':
        rvm_params = Parameters['parameters']

        clf = RVC(alpha=rvm_params.get('alpha'),
                  beta=rvm_params.get('beta'),
                  n_iter=rvm_params.get('n_iter'))
        clf.fit(XEstimate, ClassLabels)
        # Two-class models use the estimator's own predict_proba; otherwise
        # the module-level predict_proba helper is used.
        if np.shape(clf.classes_)[0] == 2:
            Yvalidate = clf.predict_proba(XValidate)
        else:
            Yvalidate = predict_proba(clf, XValidate)
        EstParameters = get_params(clf)

        return Yvalidate, EstParameters

    # SVM
    elif algorithm == 'SVM':
        svc = get_svc(Parameters)
        svc_train(svc, XEstimate, ClassLabels)
        prob = svc_probability(svc, XValidate)
        EstParameters = svc_get_para(svc)

        # Extra column computed from the spread of each row's probabilities;
        # every column is then divided by (extra column + 1).
        prob_std = np.ndarray.std(prob, axis=1)[:, np.newaxis]
        sigmoid = 1 - expit(prob_std)
        Yvalidate = np.concatenate([prob, sigmoid], axis=1)
        Yvalidate = Yvalidate / np.repeat(
            (sigmoid + 1), axis=1, repeats=len(svc.classes_) + 1)

        return Yvalidate, EstParameters

    # GPR
    elif algorithm == "GPR":
        gpr_params = Parameters["parameters"]

        # Get the classes from the labels. The original also called
        # sorted(classes, reverse=True) and discarded the result, so it never
        # influenced the order; that no-op statement was removed.
        classes = np.unique(ClassLabels, axis=0)
        num_class = len(classes)

        # Split data and labels by class.
        data = [XEstimate[ClassLabels == cla] for cla in classes]
        target = [ClassLabels[ClassLabels == cla] for cla in classes]

        # Slot (i, j) with j >= i pairs class i with class j + 1. The pairs
        # are visited in the same order as the original nested loops.
        pairs = [(i, j)
                 for i in range(num_class - 1)
                 for j in range(i, num_class - 1)]

        # Fit one binary classifier per class pair.
        classifiers = {}
        for i, j in pairs:
            pair_data = np.concatenate([data[i], data[j + 1]], axis=0)
            pair_target = np.concatenate([target[i], target[j + 1]], axis=0)
            gpc_classifier = GaussianProcessClassifier(
                kernel=gpr_params["kernel"],
                optimizer=gpr_params["optimizer"],
                n_restarts_optimizer=gpr_params["n_restarts_optimizer"],
                max_iter_predict=gpr_params["max_iter_predict"],
                warm_start=gpr_params["warm_start"],
                copy_X_train=gpr_params["copy_X_train"],
                random_state=gpr_params["random_state"],
                multi_class="one_vs_rest",
                n_jobs=gpr_params["n_jobs"])
            gpc_classifier.fit(pair_data, pair_target)
            classifiers[(i, j)] = gpc_classifier

        # Pairwise probabilities for the validation data, computed after all
        # classifiers have been fitted (as in the original).
        pair_probs = {}
        for pair in pairs:
            pair_probs[pair] = classifiers[pair].predict_proba(XValidate)

        # Accumulate the pairwise probabilities per class.
        val_shape = XValidate.shape[0]
        predict_prob_list = [
            np.zeros(shape=[val_shape, 1]) for _ in range(num_class)
        ]
        for i, j in pairs:
            probs = pair_probs[(i, j)]
            predict_prob_list[i] += probs[:, 0][:, np.newaxis] / (num_class * 2)
            predict_prob_list[j + 1] += (probs[:, 1][:, np.newaxis] /
                                         (num_class * 2))

        # One column per class.
        result = np.concatenate(predict_prob_list, axis=1)

        # Append the column for the additional class and divide every
        # column by (additional column + 1).
        std = np.std(result, axis=1)[:, np.newaxis]
        other_prob = np.exp(-std) / (1 + np.exp(std * 5))
        result = np.concatenate([result, other_prob], axis=1)
        result = result / np.repeat(
            (other_prob + 1), axis=1, repeats=num_class + 1)

        # Collect the fitted attributes of every pairwise classifier.
        estParameters = {}
        estParameters["class_num"] = num_class
        estParameters["parameters"] = []
        for pair in pairs:
            fitted = classifiers[pair]
            estParameters["parameters"].append({
                "log_marginal_likelihood_value_":
                fitted.log_marginal_likelihood_value_,
                "classes_":
                fitted.classes_,
                "n_classes_":
                fitted.n_classes_,
                "base_estimator_":
                fitted.base_estimator_
            })

        return result, estParameters
コード例 #16
0
ファイル: rvm.py プロジェクト: TrinaZ/MixGenotype
from skrvm import RVC
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split


def load_data(path='F:/importantfilecopy/tandomrepeat/2019.121test/5results'):
    """Read a tab-separated feature/label file.

    Each non-blank line holds space-separated feature tokens, a tab, and the
    label. Tokens and labels are returned as strings.

    Args:
        path: File to read. Defaults to the location the original function
            had hard-coded, so calling ``load_data()`` is unchanged.
            (Alternative previously kept in a comment:
            'F:/importantfilecopy/tandomrepeat/3/1x/total.txt'.)

    Returns:
        ``(data_set, label_set)``: a list of token lists and the list of
        labels, in file order.

    Raises:
        IndexError: If a non-blank line contains no tab-separated label.
    """
    data_set = []
    label_set = []
    # The context manager closes the file, which the original never did.
    with open(path, 'r') as f:
        for line in f:
            fields = line.strip().split("\t")
            if fields == ['']:
                # Skip blank lines (e.g. a trailing newline at end of file)
                # instead of failing on the missing label column.
                continue
            data_set.append(fields[0].strip().split(" "))
            label_set.append(fields[1])
    return data_set, label_set


# Load the samples and hold out 30% of them.
data, label = load_data()
X_train, X_test, y_train, y_test = train_test_split(data,
                                                    label,
                                                    test_size=0.3,
                                                    random_state=0)
clf = RVC()
# clf.fit(rvm_data, rvm_target)
# print(clf.score(rvm_data, rvm_target))
clf.fit(X_train, y_train)
scoring = 'accuracy'
# `scoring` used to be assigned but never passed on; hand it to
# cross_val_score so the metric is explicit.
# NOTE(review): cross-validation runs on the held-out split only, so the
# fit above does not influence the printed score - confirm intent.
scores = cross_val_score(clf, X_test, y_test, cv=7, scoring=scoring)
print(scores.mean())
コード例 #17
0
p1_classes = full_class_array[0:690]

# Normalize the features.
p1_normal_data = preprocessing.scale(p1_data)

# PCA: fit on the normalized data and project it onto 10 components.
pca = PCA(10, svd_solver='auto')
pca.fit(p1_normal_data)
p1_pca_data = pca.transform(p1_normal_data)

p1_pca_data
#classes_numbered, class_numbers_names = class2numbers(p1_classes)

## RVM classification
clf1 = RVC()
clf1.fit(p1_pca_data, p1_classes)

# Predict the same samples the model was fitted on.
pred = clf1.predict(p1_pca_data)

# Count the predictions that match the true classes.
correct = 0
for i in range(np.size(pred, 0)):
    if pred[i] == p1_classes[i]:
        correct += 1

clf1

# get_params must be called; the original bound the method itself and then
# read `.alpha_` from it, which raised AttributeError. The call returns a
# dict, so the alpha setting is read by key.
params = clf1.get_params()
params['alpha']

clf1.predict_proba
コード例 #18
0
from skrvm import RVC
from sklearn.datasets import load_iris

if __name__ == "__main__":
    # Fit an RVC on the full iris data and print what fit() returns.
    data = load_iris()
    trainData, trainTargets = data.data, data.target

    # verbose=True: print iteration, alpha, beta, gamma, m, Relevance vectors
    clf = RVC(verbose=True)
    fitted = clf.fit(trainData, trainTargets)
    print(fitted)
    #print(clf.score(trainData, trainTargets))
コード例 #19
0
# Labels/features for the train, validation and test index sets.
YTrainHvM = YHvM[TrainInds]
XValHvM, YValHvM = HCvMCI[ValInds, ], YHvM[ValInds]
XTestHvM, YTestHvM = HCvMCI[TestInds, ], YHvM[TestInds]

#Running RVC - HCvMCI

# Train/test data selected with the `train_inds` / `test_inds` index sets.
RXTrainHvM, RYTrainHvM = HCvMCI[train_inds, ], YHvM[train_inds]
RXTestHvM, RYTestHvM = HCvMCI[test_inds, ], YHvM[test_inds]

from skrvm import RVC

# Linear-kernel RVC fitted on the R* training data.
RVCMod = RVC(kernel='linear', verbose=True)
RVCMod.fit(RXTrainHvM, RYTrainHvM)


def RVMFeatImp(RVs):
    """Accumulate row / (sum of the other rows) ratios over ``RVs``.

    For every ``RVNum`` in ``1 .. NumRVs - 1`` the row ``RVs[RVNum - 1]`` is
    divided by the sum of all elements of ``RVs`` outside row ``RVNum``; the
    accumulated ratios are finally divided by ``NumRVs``.

    Args:
        RVs: Array whose first axis enumerates the rows to combine.

    Returns:
        The accumulated ratios divided by ``NumRVs``. For an array with at
        most one row the loop does not run and the result is ``0 / NumRVs``.
    """
    NumRVs = RVs.shape[0]
    SumD = 0
    for RVNum in range(1, NumRVs):
        # NOTE(review): the numerator is row RVNum - 1 while the row left
        # out of the denominator is RVNum, and the last row is never a
        # numerator - kept as in the original; confirm the intended pairing.
        d1 = RVs[RVNum - 1, ]
        # np.delete drops row RVNum without round-tripping the indices
        # through int8, which overflowed for arrays with more than 127 rows.
        d2 = sum(np.delete(RVs, RVNum, axis=0).flatten())
        SumD = SumD + (d1 / d2)
    SumD = SumD / NumRVs
    return SumD

コード例 #20
0
    New[i, :] = n.params
#mdl = smt.AR(d_tot[100,:]).fit(maxlag=30, ic='aic', trend='nc')
#est_order=smt.AR(d_tot[1,:]).select_order(maxlag=30, ic='aic', trend='nc')
#print(est_order)
#p_orders=np.zeros([1380,1])
#for i in range(1380):
#   X=AR(d_tot[i,:])
#  n=X.fit(maxlag=4,ic='aic')
# p_orders[i,0]=len(n.params)

#np.mean(p_orders)

#plt.scatter(New[:,1],New[:,3])

An_lab = Animal_label(tot)

#shuffle data to take them in random order
# NOTE(review): 10350 is hard-coded; presumably the number of rows in
# `New` - confirm.
indx = random.sample(range(10350), 10350)
a = New[indx, :]
b = An_lab[indx]
#define training and test set: first 8000 shuffled rows train, the rest test
x_train = a[0:8000, :]
y_train = b[0:8000]
# The test slice used to start at 8001, silently dropping row 8000.
x_test = a[8000:, :]
y_test = b[8000:]
#RVM (RBF kernel) fitted on the training rows
from skrvm import RVC
clf1 = RVC(kernel='rbf')
clf1.fit(x_train, y_train)
# The score used to be computed and discarded; print it.
print(clf1.score(x_test, y_test))
コード例 #21
0
# Scale the features: fit the scaler on the training data, reuse it on the test data.
feature_scaler = StandardScaler()
X_train = feature_scaler.fit_transform(X_train)
X_test = feature_scaler.transform(X_test)

# ---------------------------------------------------------------------------
# Cross-validation split into K folds
# ---------------------------------------------------------------------------
kf = KFold(n_splits=5)
kf.get_n_splits(X_train, X_test)
print(kf)

# Candidate widths for the grid search: 10**-5 ... 10**4
width1 = np.linspace(-5, 4, 10)
width = 10**width1

# One row per candidate: column 0 = summed fold score, column 1 = width
score_width = np.ones([len(width), 2])

# Outer loop walks the candidate widths, inner loop runs the K-fold validation.
for i, candidate_width in enumerate(width):
    score = 0
    for train_index, test_index in kf.split(X_train):
        print("TRAIN:", train_index, "TEST:", test_index)
        X_train1 = X_train[train_index]
        X_test1 = X_train[test_index]
        y_train1 = y_train[train_index]
        y_test1 = y_train[test_index]
        clf1 = RVC(kernel='rbf', coef1=candidate_width)
        clf1.fit(X_train1, y_train1)
        score = score + clf1.score(X_test1, y_test1)
    score_width[i, 0] = score
    score_width[i, 1] = candidate_width

# Pick the width with the highest summed score.
idx = np.argmax(score_width[:, 0])
best_width = score_width[idx, 1]
コード例 #22
0
ファイル: untitled0.py プロジェクト: notepi/mywork
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Sep 20 21:26:07 2018

@author: pan
"""

from skrvm import RVC
from sklearn.datasets import load_iris

# Fit an RVC with default settings on the full iris data.
clf = RVC()
iris = load_iris()
clf.fit(iris.data, iris.target)
# Estimator repr recorded by the original author:
#RVC(alpha=1e-06, beta=1e-06, beta_fixed=False, bias_used=True, coef0=0.0,
#coef1=None, degree=3, kernel='rbf', n_iter=3000, n_iter_posterior=50,
#threshold_alpha=1000000000.0, tol=0.001, verbose=False)
# The score used to be computed and discarded; print it so that running the
# file as a script reports the result.
print(clf.score(iris.data, iris.target))
コード例 #23
0
# Training features and labels used for the parameter selection.
X = full_normPCA123_array[train_indicies]
Y = full_isAnimal_array[train_indicies]

params = rvc_param_selection(X, Y, 5)
#
#params = rvc_param_selection2(X,Y,5)

####################TO TEST
# One slot per candidate coef1 value; both arrays hold clf.score() results.
test_err = np.zeros((15))
train_err = np.zeros((15))
coef = np.multiply(
    [1000, 500, 200, 150, 100, 70, 60, 50, 40, 30, 20, 10, 5, 2, 1], 1e-6)
for i in range(15):
    #coef[i] = 5e-5-(i+1)*1e-5
    current_coef = coef[i]
    clf = RVC(kernel="rbf", coef1=current_coef)  # coef1:  1=46 0.1same

    # Fit on the training rows of the sub-class labels.
    clf.fit(full_normPCA123_array[train_indicies],
            full_subClass_array[train_indicies])

    # Score on the training rows, then on the test rows.
    train_score = clf.score(full_normPCA123_array[train_indicies],
                            full_subClass_array[train_indicies])
    train_err[i] = train_score
    test_score = clf.score(full_normPCA123_array[test_indicies],
                           full_subClass_array[test_indicies])
    test_err[i] = test_score

    print(coef[i])
    print(train_err[i])
    print(test_err[i])
    print("\n\n")

#####################################################################################################################
# Reload results stored by earlier runs.
rvm_tested_coef1 = np.load("rvm_tested_coef1.npy")
rvm_tested_coef1_again = np.load("rvm_tested_coef1_again.npy")
train1 = np.load("rvm_tested_coef1_trainerr.npy")
train2 = np.load("rvm_tested_coef1_trainerr_again.npy")
コード例 #24
0
ファイル: rvm.py プロジェクト: ckbjimmy/nlp
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("poster")

# RVM
from skrvm import RVC
clf = RVC()

### read data from single csv file and transform to a list
# NOTE(review): `dir` shadows the builtin of the same name; the name is kept
# because code outside this excerpt may read it.
dir = '/Users/weng/Downloads/'
# raw = pd.read_csv(os.path.join(dir, 'test.txt'), header=0, delimiter=';')
raw_list = pd.read_csv(os.path.join(dir, 'test.txt'), sep="\t", header=None)
# array format is needed for further processing (df -> list -> matrix(array) )
raw_X = raw_list[0].values.tolist()
raw_X = np.asarray(raw_X)

raw_y = raw_list[1].values.tolist()
raw_y = np.asarray(raw_y)

# `raw` is only assigned in the commented-out line above, so len(raw) raised
# a NameError; report the rows of the frame that was actually read. The
# single-argument print form behaves the same on Python 2 and 3.
print("Read %d rows of data\n" % len(raw_list))

clf.fit(raw_X, raw_y)

# Refit the same estimator on iris and report its accuracy on that data.
from sklearn.datasets import load_iris
iris = load_iris()
iris.data
iris.target
clf.fit(iris.data, iris.target)
# The score used to be computed and discarded; print it.
print(clf.score(iris.data, iris.target))