Example #1
def main():
    tweets_fname = 'your_tweets.txt'
    labels_fname = 'your_labels.txt'
    # fill in with your own tweets and labels
    
    dictionary = get_words(tweets_fname)
    feature_vectors = get_features(tweets_fname, dictionary)
    labels = get_vectors(labels_fname)
    first_few_features = feature_vectors[:560]
    first_few_labels = labels[:560]
    last_few_features = feature_vectors[560:]
    last_few_labels = labels[560:]
    k = 5
    
    # this evaluates the tuned RBF-kernel SVM on each performance metric
    # metrics = [ "accuracy", "f1_score", "auroc", "precision",
    #             "sensitivity", "specificity" ]
    # for metric in metrics:
    #     print(str(metric) + ": ")
    #     c, gamma = get_rbf(first_few_features, first_few_labels,
    #                        k, metric)
    #     print("best c: " + str(c) + ", best gamma: " + str(gamma))
    #     rbf_clf = SVC(kernel='rbf', C=c, gamma=gamma)
    #     rbf_clf.fit(first_few_features, first_few_labels)
    #     perf, lower, upper = get_confidence_interval(rbf_clf, last_few_features,
    #                                                  last_few_labels, metric)
    #     print("performance: " + str(perf) + ", lower: " + str(lower) + ", upper: " + str(upper))
        
    X_test = get_features('your_other_tweets.txt', dictionary)
    clf = SVC(kernel='rbf', gamma=0.01, C=100, probability=True)
    clf.fit(feature_vectors,labels)
    y_pred = np.sign(clf.decision_function(X_test))
    print(y_pred)
Example #2
def condition_on_grades(user="******"):
	c = new_conn.cursor()
	models = [None, None, None, None, None, None]
	for i in range(6):
		c.execute('SELECT easiness, ret_reps, ret_reps_since_lapse, lapses, pred_grade, acq_reps from discrete_log where user_id="%s" and grade=%d' % (user, i))
		x_train = np.array(c.fetchall())
		c.execute('SELECT interval_bucket from discrete_log where user_id="%s" and grade=%d' % (user, i))
		y_train = np.array(c.fetchall())[:,0]
		clf = SVC()
		clf.fit(x_train, y_train)
		print clf.score(x_train, y_train)
		models[i] = clf
	print "====================="
	c.execute('SELECT user_id from (select user_id, count(distinct grade) as cnt from discrete_log group by user_id) where cnt = 6 limit 5')
	users = [row[0] for row in c.fetchall()]
	scores = [0, 0, 0, 0, 0, 0]
	for user in users:
		for i in range(6):
			c.execute('SELECT easiness, ret_reps, ret_reps_since_lapse, lapses, pred_grade, acq_reps from discrete_log where user_id="%s" and grade=%d' % (user, i))
			x_train = np.array(c.fetchall())
			c.execute('SELECT interval_bucket from discrete_log where user_id="%s" and grade=%d' % (user, i))
			y_train = np.array(c.fetchall())[:,0]
			scores[i] += models[i].score(x_train, y_train)
	for i in range(6):
		scores[i] /= len(users);
		print scores[i]
Example #3
def test_rfe():
    generator = check_random_state(0)
    iris = load_iris()
    X = np.c_[iris.data, generator.normal(size=(len(iris.data), 6))]
    X_sparse = sparse.csr_matrix(X)
    y = iris.target

    # dense model
    clf = SVC(kernel="linear")
    rfe = RFE(estimator=clf, n_features_to_select=4, step=0.1)
    rfe.fit(X, y)
    X_r = rfe.transform(X)
    clf.fit(X_r, y)
    assert_equal(len(rfe.ranking_), X.shape[1])

    # sparse model
    clf_sparse = SVC(kernel="linear")
    rfe_sparse = RFE(estimator=clf_sparse, n_features_to_select=4, step=0.1)
    rfe_sparse.fit(X_sparse, y)
    X_r_sparse = rfe_sparse.transform(X_sparse)

    assert_equal(X_r.shape, iris.data.shape)
    assert_array_almost_equal(X_r[:10], iris.data[:10])

    assert_array_almost_equal(rfe.predict(X), clf.predict(iris.data))
    assert_equal(rfe.score(X, y), clf.score(iris.data, iris.target))
    assert_array_almost_equal(X_r, X_r_sparse.toarray())
def multi_SVM(needcv = False):
	NeedReFetch = NEED_REFETCH
	OnlyNeedReGenerate = ONLY_NEED_REGENERATE
	allGenreSongsTrain,allGenreSongsTest = fetchData_CM(NUM_NEED_PER_GENRE,GENRES,NeedReFetch,OnlyNeedReGenerate,USED_GENRES)
	# allGenreSongsTrain,allGenreSongsTest = featureSelection (allGenreSongsTrain,allGenreSongsTest,method = 'mean',testmode = True,n_features_to_select = 4)


	# assert(len(allGenreSongsTrain[0][0]) == 106)

	TrainX = []
	TrainY = []
	TestX = []
	TestY = []
	for i in range(sum(USED_GENRES)):
		for j in allGenreSongsTrain[i]:
			TrainX.append(j)
			TrainY.append(i)
		for k in allGenreSongsTest[i]:
			TestX.append(k)
			TestY.append(i)
	confuseMat = [[0 for i in range(sum(USED_GENRES))] for j in range(sum(USED_GENRES))];
	if not needcv:
		print "Start SVM training ... "
		model = SVC(probability=True,decision_function_shape='ovo',kernel = 'rbf',gamma = 0.0078125, C = 8)
		model.fit(TrainX,TrainY)
		print "Start SVM predicting ... "
		PredY = model.predict(TestX)
		for i in range(len(TestY)):
			confuseMat[TestY[i]][PredY[i]] += 1
		print(clfr(TestY, PredY))
	else:
		tuned_parameters = [															## parameters left to experiment with
							{'kernel': ['rbf'], 'gamma': [2**i for i in range(-8,8)], 'C': [2**i for i in range(-8,8)]},
		 					# {'kernel': ['linear'], 'C': [2**i for i in range(-8,9,2)]},
		 					# {'kernel': ['poly'], 'gamma': [2**i for i in range(-8,9,2)], 'C': [2**i for i in range(-8,9,2)], 'degree':[2,3,4]},
		 					]
		print "Start SVM CV ... "
		clf = GSCV(SVC(decision_function_shape='ovo'), tuned_parameters, cv=5)
		clf.fit(TrainX, TrainY)


		print("Best parameters set found on development set:")
		print(clf.best_params_)
		# print("Grid scores on development set:")
		# print()
		# for params, mean_score, scores in clf.grid_scores_:
		# 	print("%0.4f (+/-%0.03f) for %r" % (mean_score, scores.std(), params))
		# print()

		print "Start SVM predicting ... "

		PredY = clf.predict(TestX)


		print(clfr(TestY, PredY))

		for i in range(len(TestY)):
			confuseMat[TestY[i]][PredY[i]] += 1

	return confuseMat
Example #5
def buildAndEvaluateSvm(p_kernel):
    model = SVC(kernel=p_kernel)

    model.fit(features, target)
    expected = target
    predicted = model.predict(features)
    print("SCORE: %f" % metrics.roc_auc_score(expected,predicted))
class SVMPredictor(object):
    """"
    A simple application of SVM classifier

    @author: Shaun
    """

    def __init__(self):
        self.clf = SVC(probability=True)

    @abstractmethod
    def fit(self, X, y):
        """
        Method to fit the model.

        Parameters:
        X - 2d numpy array of training data
        y - 1d numpy array of training labels
        """
        self.clf = self.clf.fit(X, y)

    @abstractmethod
    def predict(self, X):
        """
        Method to apply the model data

        Parameters:
        X - 2d numpy array of test data
        """
        return self.clf.predict_proba(X)[:, 1]
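A minimal usage sketch for the SVMPredictor wrapper above, assuming a small synthetic binary dataset (the arrays and names below are illustrative, not part of the original code):

import numpy as np

X_demo = np.random.RandomState(0).rand(20, 4)   # 2d array of training data
y_demo = np.array([0, 1] * 10)                  # 1d array of binary labels
predictor = SVMPredictor()
predictor.fit(X_demo, y_demo)
probs = predictor.predict(X_demo)               # probability of the positive class per row
print(probs)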
Example #7
def main():
    print 'MIFS'
    filename = ['../data/arcene.mat', '../data/gisette.mat', '../data/madelon.mat']
    for f_num in range(len(filename)):
        print filename[f_num]
        mat = scipy.io.loadmat(filename[f_num])
        X = mat['X']    # data
        y = mat['Y']    # label
        y = y[:, 0]
        X = X.astype(float)
        n_sample, n_features = X.shape
        # split data
        ss = cross_validation.KFold(n_sample, n_folds=10, shuffle=True)
        # choose SVM as the classifier
        clf = SVC()
        num_fea = np.linspace(5, 300, 60).astype(int)   # integer feature counts, usable as slice bounds
        correct = np.zeros(len(num_fea))
        for train, test in ss:
            # select features
            F = MIFS.mifs(X[train], y[train], n_selected_features=300)
            for n in range(len(num_fea)):
                fea_idx = F[0:num_fea[n]]
                features = X[:, fea_idx]
                clf.fit(features[train], y[train])
                y_predict = clf.predict(features[test])
                acc = accuracy_score(y[test], y_predict)
                correct[n] += acc
        correct = correct.astype(float) / 10   # average accuracy over the 10 folds
        for i in range(len(num_fea)):
            print num_fea[i], correct[i]
def train_svm(X, y):
    """
    Create and train the Support Vector Machine.
    """
    svm = SVC(C=1000000.0, gamma=0.0, kernel='rbf')
    svm.fit(X, y)
    return svm
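A hedged usage sketch for train_svm above, on synthetic placeholder data. Note that gamma=0.0 was shorthand for 1/n_features in older scikit-learn releases, while current releases reject 0.0, so the sketch swaps in gamma='auto' for the equivalent behaviour:

import numpy as np
from sklearn.svm import SVC

rng = np.random.RandomState(42)
X_demo = rng.rand(100, 5)                            # placeholder feature matrix
y_demo = (X_demo[:, 0] > 0.5).astype(int)            # placeholder binary labels
svm = SVC(C=1000000.0, gamma='auto', kernel='rbf')   # gamma='auto' stands in for the original gamma=0.0
svm.fit(X_demo, y_demo)
print(svm.score(X_demo, y_demo))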
Example #9
def cvalidate():
    from sklearn import cross_validation

    trainset = np.genfromtxt(open('train','r'), delimiter=' ')
    targetset = np.genfromtxt(open('target','r'), delimiter=' ')
    X = np.array([x[0:64] for x in trainset])
    y = np.array([x for x in targetset])
    #print X,y
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size = 0.3, random_state = 0)

    #X_train, X_test = decomposition_pca(X_train, X_test)
    

    X_train, X_test = decomposition_pca(X_train, X_test)
    c_range = 10.0 ** np.arange(6.5,7.5,1)
    gamma_range = 10.0 ** np.arange(-2.5,0.5,1)
    #parameters = {'kernel':['rbf'], 'C':c_range} 
    parameters = {'kernel':['rbf'], 'C':c_range,  'gamma':gamma_range} 
    svr = SVC(kernel = 'rbf', C = 0.72, gamma = 0.299)

    #clf = grid_search.GridSearchCV(svr, parameters)

    #print clf.estimator
    ##clf = Pipeline([('scale', Scaler()), ('svm', SVC())])

    svr.fit(X_train, y_train)
    print svr.score(X_test, y_test)
Example #10
def get_classes_accury(data, target, test_times = 10, test_size=0.1):
    target_list = list(set(target))
    target_list.sort()
    scores = np.zeros((test_times,len(target_list)))
    for t in range(test_times):
        clf = SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,   decision_function_shape=None, degree=3, gamma='auto', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,   tol=0.001, verbose=False)
        Xtrain, Xtest, ytrain, ytest = train_test_split(data, target, test_size=test_size,
                                                    random_state=t)
        clf.fit(Xtrain, ytrain)
        print(t, clf.score(Xtest, ytest))
        pre = clf.predict(Xtest)
        for i,c in enumerate(target_list):
            s = np.logical_and(pre==c, ytest==c).sum()/ (ytest==c).sum()
            scores[t, i] = s

    ##### Generate a Markdown table of per-class accuracies
    print('|'+'class'+'|'+'|'.join([str(i) for i in target_list]) +'|')
    print('|' + '---|' * (len(target_list) + 1))   # separator row, one cell per column
    for i,score in enumerate(scores):
        print( '|'+str(i)+'|'+ '|'.join(['{:.4f}'.format(_) for _ in score])+ '|' )
    print( '|'+'max'+ '|'+ '|'.join(['{:.4f}'.format(_) for _ in scores.max(axis=0)])+ '|' )
    print( '|'+'min'+ '|'+ '|'.join(['{:.4f}'.format(_) for _ in scores.min(axis=0)])+ '|' )
    print( '|'+'mean'+'|'+  '|'.join(['{:.4f}'.format(_) for _ in scores.mean(axis=0)])+ '|' )

    return scores
Example #11
class PcaGmm(BaseEstimator):
    def __init__(self, X_all,
                 pca_components = 12, gmm_components = 4,
                 covariance_type = "full", min_covar = 0.1,
                 gamma = 0, C = 1.0):
        self.pca_components = pca_components
        self.gmm_components = gmm_components
        self.covariance_type = covariance_type
        self.min_covar = min_covar
        self.gamma = gamma
        self.C = C
        self.X_all = X_all
        X_all = X_all[:, :pca_components]
        self.gmm = GMM(n_components = gmm_components,
                       covariance_type = covariance_type,
                       min_covar = min_covar)
        self.gmm.fit(X_all)
    def fit(self, X, y):
        X = X[:, :self.pca_components]
        X = self.gmm.predict_proba(X)
        self.svm = SVC(C = self.C, gamma = self.gamma)
        self.svm.fit(X, y)
    def predict(self, X):
        X = X[:, :self.pca_components]
        return self.svm.predict(self.gmm.predict_proba(X))
    def score(self, X, y):
        y_pred = self.predict(X)
        return accuracy_score(y, y_pred)
    def transform(self, X, y = None):
        X = X[:, :self.pca_components]
        return self.gmm.predict_proba(X)
    def __str__(self):
        return "PCA(%d)-GMM(%d, %s, %f)-SVM(C=%f, gamma=%f)" % (self.pca_components, self.gmm_components,self.covariance_type, self.min_covar,self.C, self.gamma)
Example #12
    def predict(self):
        if self.stage != Stage.SIMILARITY:
            print 'Similarity has not been computed yet'
        import numpy as np
        
        utiles.clearList(self.predictPC)
        utiles.clearList(self.evaluation)
        for i in range(len(self.forest)):
            similarityPCList = []
            for sim in self.similarity[i]:
                similarityPCList.append(int(sim.powerConsume*100000))
            similarity_X = np.array(self.similarityList[i])
            similarity_y = np.array(similarityPCList)
#            print similarity_y
            predictModel = SVC()
            predictModel.fit(similarity_X, similarity_y)
            
            forest_X = np.array(self.forestList[i])
            forest_X = forest_X.reshape(1,-1)
            predicted = predictModel.predict(forest_X)
            
            self.forest[i].powerConsume = float(predicted[0]/100000.0)
            self.predictPC.append(float(predicted[0]/100000.0))
        
        for i in range(len(self.expect)):
            self.evaluation.append((self.expect[i]-self.predictPC[i])/self.expect[i])
Example #13
def get_optimize_result(training_data, validation_data, important_cols_result):
    """ Get the number of cols that gets the best score """
    last_score = 0.0
    new_score = 0.0
    number_of_cols = 1
    decreases = 0
    optimal_result = {'score': 0.0, 'number_of_cols': 1}
    # Extract labels from data frames
    training_data_label, training_data = separate_labels(training_data)
    validation_data_label, validation_data = separate_labels(validation_data)
    while True:
        cols = important_cols_result.index[0: number_of_cols]
        # Fit models and test
        clf = SVC()
        clf.fit(training_data.iloc[:, cols], training_data_label)
        predictions = clf.predict(validation_data.iloc[:, cols])
        new_score = accuracy_score(validation_data_label, predictions)
        print(new_score)
        if new_score > optimal_result['score']:
            optimal_result['score'] = new_score
            optimal_result['number_of_cols'] = number_of_cols
            print(optimal_result)
        if last_score > new_score:
            decreases += 1
            if decreases > 5:
                break
        last_score = new_score
        number_of_cols += 5
    cols = important_cols_result.index[0: number_of_cols]
    print(optimal_result)
    export_optimal_result(training_data.iloc[:, cols].columns)
Example #14
    def test_cv(self):
        X, y = datasets.make_classification(n_samples=20, n_features=5, n_informative=2)
        n_folds = 2

        # = With EPAC
        wf = CV(SVC(kernel="linear"), n_folds=n_folds, reducer=ClassificationReport(keep=True))
        r_epac = wf.top_down(X=X, y=y)

        # = With SKLEARN
        clf = SVC(kernel="linear")
        r_sklearn = list()
        for idx_train, idx_test in StratifiedKFold(y=y, n_folds=n_folds):
            # idx_train, idx_test  = cv.__iter__().next()
            X_train = X[idx_train, :]
            X_test = X[idx_test, :]
            y_train = y[idx_train]
            clf.fit(X_train, y_train)
            r_sklearn.append(clf.predict(X_test))

        # = Comparison
        key2cmp = "y" + conf.SEP + conf.TEST + conf.SEP + conf.PREDICTION
        for icv in range(n_folds):
            comp = np.all(np.asarray(r_epac[0][key2cmp]) == np.asarray(r_sklearn[0]))
            self.assertTrue(comp, u"Diff CV: EPAC vs sklearn")

        # test reduce
        r_epac_reduce = wf.reduce().values()[0][key2cmp]
        comp = np.all(np.asarray(r_epac_reduce) == np.asarray(r_sklearn))
        self.assertTrue(comp, u"Diff CV: EPAC reduce")
Example #15
    def test_perm(self):
        X, y = datasets.make_classification(n_samples=20, n_features=5, n_informative=2)
        n_perms = 2
        rnd = 0
        # = With EPAC
        wf = Perms(SVC(kernel="linear"), n_perms=n_perms, permute="y", random_state=rnd, reducer=None)
        r_epac = wf.top_down(X=X, y=y)
        # = With SKLEARN
        clf = SVC(kernel="linear")
        r_sklearn = list()
        for perm in Permutations(n=y.shape[0], n_perms=n_perms, random_state=rnd):
            y_p = y[perm]
            clf.fit(X, y_p)
            r_sklearn.append(clf.predict(X))
        key2cmp = "y" + conf.SEP + conf.PREDICTION

        # = Comparison
        for iperm in range(n_perms):
            comp = np.all(np.asarray(r_epac[iperm][key2cmp]) == np.asarray(r_sklearn[iperm]))
            self.assertTrue(comp, u"Diff Perm: EPAC vs sklearn")
        # test reduce
        for iperm in range(n_perms):
            r_epac_reduce = wf.reduce().values()[iperm][key2cmp]
            comp = np.all(np.asarray(r_epac_reduce) == np.asarray(r_sklearn[iperm]))
            self.assertTrue(comp, u"Diff Perm: EPAC reduce")
def main():
    images, labels = load_labeled_training(flatten=True)
    images = standardize(images)
    unl = load_unlabeled_training(flatten=True)
    unl = standardize(unl)
    test = load_public_test(flatten=True)
    test = standardize(test)
    shuffle_in_unison(images, labels)
    #d = DictionaryLearning().fit(images)
    d = MiniBatchDictionaryLearning(n_components=500, n_iter=500, verbose=True).fit(images)
    s = SparseCoder(d.components_)
    proj_test = s.transform(images)
    pt = s.transform(test)
    #kpca = KernelPCA(kernel="rbf")
    #kpca.fit(unl)
    #test_proj = kpca.transform(images)
    #pt = kpca.transform(test)
    #spca = SparsePCA().fit(unl)
    #test_proj = spca.transform(images)
    #pt = spca.transform(test)
    svc = SVC()
    scores = cross_validation.cross_val_score(svc, proj_test, labels, cv=10)
    print scores
    print np.mean(scores)
    print np.var(scores)
    svc.fit(proj_test, labels)
    pred = svc.predict(pt)
    write_results(pred, '../svm_res.csv')
Example #17
def svm_solver(train_data, train_label, validation, test, dimreduce, convertbinary) :
    """
    """
    logging.info ('begin to train the svm classifier')

    # train_data = train_data[:100,:]
    # validation = validation[:100,:]
    # test = test[:100,:]
    # train_label = train_label[:100]
    train_data, validation, test = dimreduce(train_data, train_label, validation, test)
    # print new_train_data.shape
    train_data, validation, test = convertbinary(train_data, validation, test)

    """
    svc = SVC ()
    params_rbf = {"kernel": ['rbf'],
             "class_weight": ['auto'],
             "C": [0.1 ,0.2 ,0.3 ,0.5 ,1, 2, 3, 5, 10],
             "gamma": [0.01, 0.03,  0.05, 0.1, 0.2, 0.3, 0.5],
             "tol": 10.0** -np.arange(1, 5),
             "random_state": [1000000007]}
    logging.info ("Hyperparameter opimization using RandomizedSearchCV...")
    rand_search_result = RandomizedSearchCV (svc, param_distributions = params_rbf, n_jobs = -1, cv = 3, n_iter = 30)
    # rand_search_result = GridSearchCV (svc , param_grid = params_rbf , n_jobs = 8  , cv = 3)
    rand_search_result.fit (train_data , train_label)
    params = tools.report (rand_search_result.grid_scores_)
    """
    params = {'kernel': 'poly', 'C': 0.1, 'random_state': 1000000007, 'tol': 0.001, 'gamma': 0.1, 'class_weight': 'auto'}
    svc = SVC (probability = True, **params)

    svc.fit (train_data , train_label)
    evaluate.get_auc (svc.predict_proba (validation)[:,1])
    return svc.predict_proba (test)[:,1]
def classify_chushikoku():
    u"""中四国の学生に悩み事検出アルゴリズムを適用する

    """
    import pandas as pd
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.svm import SVC

    data=pd.read_csv("no_taigaku_train4.csv",sep=",",header=0)
    train=data.values
    x=train[:,2:]
    y=train[:,1]

    data_t=pd.read_csv("no_taigaku_chushikoku.csv",sep=",",header=0)
    test=data_t.values
    x_test=test[:,1:]
    name_test=test[:,0]
    print name_test
    print len(x_test)

    best_forest=RandomForestClassifier(bootstrap=True,class_weight=None,criterion="gini",
                                  max_depth=5,max_features="sqrt",max_leaf_nodes=None,
                                  min_samples_leaf=1,min_samples_split=3,
                                  min_weight_fraction_leaf=0.001,n_estimators=30,n_jobs=-1,
                                  oob_score=False,random_state=None,verbose=0,warm_start=False)
    best_svm=SVC(C=100,kernel="linear",gamma=0.001)

    clf=best_svm.fit(x,y)
    label_predict=clf.predict(x_test)

    print name_test[label_predict=="chutai"]
    print len(name_test[label_predict=="chutai"])
Example #19
def train_model(data, seed = None):
    '''
        Train the model on the given sample.
        Arguments:
            data - DataFrame with the training sample
            seed - seed for the pseudo-random number generator
        Returns: the trained model
    '''

    # Shuffle the sample:
    if seed is not None:
        np.random.seed(seed)
        data = data.iloc[np.random.permutation(len(data))]

    # Select all records whose target value is 1,
    # and exactly as many records whose target value is 0:
    pos_data = data[data["similar"] == 1.0]
    neg_data = data[data["similar"] == 0.0][:len(pos_data)]
    data = ps.concat([pos_data, neg_data])
    print("Размер учебной выборки: {}".format(len(data)))

    # Separate the features from the target values:
    X = data.drop(["id1", "id2", "similar"], axis=1)
    y = data["similar"]
    
    # Train the model:
    #model = LogisticRegression(penalty='l1', tol=0.28, random_state=seed)
    #model = RandomForestClassifier(60, 'entropy', 7, random_state=seed)
    #model = KNeighborsClassifier(28, "uniform")
    model = SVC(C=0.1, kernel='poly', random_state=seed)
    model.fit(X, y)
    #print(X.columns, '\n', model.feature_importances_)
    return model
Example #20
def columnSvm(data, columns):
	loo = cross_validation.LeaveOneOut(len(data))
	totalAccurateNum = 0
	singleExpression = [0] * 7
	singleExpressionAccurate = [0] * 7
	eachAccuracy = [0.0] * 7
	for i in range(1, 8):
		singleExpression[i - 1] = sum(data[:, 43] == i)
	
	for trainIndex, testIndex in loo:
		xTrain, yTrain = data[trainIndex, 0 : 43], data[trainIndex, 43]
		xTest, yTest = data[testIndex, 0 : 43], data[testIndex, 43]
		xTrain = xTrain[:, columns]
		xTest = xTest[:, columns]
		svm = SVC(kernel = "rbf", C = 4.0, gamma = 0.01)
		svm.fit(xTrain, yTrain)
		yPredict = svm.predict(xTest)
		
		if yTest == yPredict :
			totalAccurateNum += 1
			singleExpressionAccurate[yTest[0] - 1] += 1
		else :
			print yTest, yPredict
	print "total accuracy: ", float(totalAccurateNum) / len(data)
	for i in range(0, 7):
		eachAccuracy[i] = float(singleExpressionAccurate[i]) / singleExpression[i]
		print i+1, eachAccuracy[i]
	
	return totalAccurateNum, eachAccuracy
Example #21
def svm_grid_search():

	#get data
	training_input,training_target,validation_input,validation_target = prepare_input()

	#set up scorer for grid search. log-loss is an error, not a score, so set greater_is_better to False,
	#and log-loss requires predicted probabilities (a standalone sanity-check sketch follows this function)
	log_loss_scorer = make_scorer(log_loss,greater_is_better=False,needs_proba=True)

	training_input = training_input[:100000]
	training_target = training_target[:100000]

	print training_input.shape[0]
	print training_target.shape[0]

	start = time.time()
	svm = SVC(random_state=31,probability=True)
	
	
	svm_parameters = {'C':[.001,.01,.1,1,10,100],'kernel':["rbf","sigmoid"]}
	svm_grid_obj = GridSearchCV(svm,svm_parameters,log_loss_scorer,verbose=2,n_jobs=-1)
	svm_grid_obj = svm_grid_obj.fit(training_input,training_target)
	svm = svm_grid_obj.best_estimator_
	print "Best params: " + str(svm_grid_obj.best_params_)	
	svm_train_error = log_loss(training_target,svm.predict_proba(training_input))
	svm_validation_error = log_loss(validation_target,svm.predict_proba(validation_input))
	print "Best SVM training error: {:02.4f}".format(svm_train_error)
	print "Best SVM validation error: {:02.4f}".format(svm_validation_error)
	end = time.time()
	print "RF grid search took {:02.4f} seconds".format(end-start)

	return svm
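A small, self-contained sanity check of the negated log-loss scorer pattern used above; the toy arrays and the LogisticRegression stand-in are illustrative only, not part of the original code:

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer, log_loss

log_loss_scorer = make_scorer(log_loss, greater_is_better=False, needs_proba=True)
X_toy = np.array([[0.0], [1.0], [2.0], [3.0]])
y_toy = np.array([0, 0, 1, 1])
toy_clf = LogisticRegression().fit(X_toy, y_toy)
# make_scorer negates the loss, so a better model yields a larger (less negative) value
print(log_loss_scorer(toy_clf, X_toy, y_toy))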
def main():
    print '[INFO, time: %s] Getting Data....' % (time.strftime('%H:%M:%S'))
    testing_file = file('test.p', 'r')
    training_file = file('train.p', 'r')

    train = pickle.load(training_file)
    test = pickle.load(testing_file)

    testing_file.close()
    training_file.close()
    
    trainX = train[:,:-1]
    trainy = train[:,-1]
    
    testX = test[:,:-1]
    testy = test[:,-1]

    print '[INFO, time: %s] Downsampling ...' % (time.strftime('%H:%M:%S'))
    trainX = downsample_features(trainX)
    testX = downsample_features(testX)

    print '[INFO, time: %s] Fitting %s ...' % (time.strftime('%H:%M:%S'), 'SVM - rbf kernel (i.e. gaussian) with default parameters')
    clf = SVC()
    clf.fit(trainX, trainy)

    print '[INFO, time: %s] Making Predictions...' % (time.strftime('%H:%M:%S'))
    prediction = clf.predict(testX)
    print '[RESULT, time: %s] accuracy = %f' % (time.strftime('%H:%M:%S'),accuracy_score(testy, prediction))
Example #23
def main() :
    np.random.seed(1234)

    # read the tweets and its labels
    dictionary = extract_dictionary('../data/tweets.txt')
    X = extract_feature_vectors('../data/tweets.txt', dictionary)
    y = read_vector_file('../data/labels.txt')

    metric_list = ["accuracy", "f1_score", "auroc"]

    ### ========== TODO : START ========== ###
    # part 1: split data into training (training + cross-validation) and testing set
    s1 = slice(0, 560, 1)
    s2 = slice(560, 630, 1)
    training_data = X[s1]
    test_data = X[s2]

    training_label = y[s1]
    test_label = y[s2]

    # part 2: create stratified folds (5-fold CV)
    kf = StratifiedKFold(training_label, n_folds=5)
    # part 2: for each metric, select optimal hyperparameter for linear-kernel SVM using CV
    for metric in metric_list:
        val = select_param_linear(training_data, training_label, kf, metric)
        print("max c: " + str(val))

    # part 3: train linear-kernel SVMs with selected hyperparameters
    Model = SVC(kernel='linear', C=10)
    Model.fit(training_data, training_label)
    # part 3: report performance on test data
    for metric in metric_list:
        perf = performance_test(Model, test_data, test_label, metric)
        print 'Performance of ' + str(metric) + ': ' + str(perf)
Example #24
def classify_rbf_tf(train_vector, train_label):
    classifier_RBFSvc_tf = SVC(kernel='rbf', gamma=2)
    rbf_clf_tf = classifier_RBFSvc_tf.fit(train_vector, train_label)
    save_clf2 = open("RBFSvcTf.pickle","wb")
    pickle.dump(rbf_clf_tf,save_clf2)
    save_clf2.close()
    return rbf_clf_tf
Example #25
def classify_poly_tf(train_vector, train_label):
    classifier_polySvc_tf = SVC(kernel='poly', degree=2, gamma=2)
    poly_clf_tf = classifier_polySvc_tf.fit(train_vector, train_label)
    save_clf3 = open("polySvcTf.pickle","wb")
    pickle.dump(poly_clf_tf,save_clf3)
    save_clf3.close()
    return poly_clf_tf
Example #26
def experiment_three_feature_selection(svm,train_set,test_set,c_star):

    # Find the weight vectors
    weight_vectors = svm.coef_
    accuracies = np.zeros(58)

    # Sort the indexes of the weight vectors, then flip to largest to smallest.
    sorted_index = np.argsort(weight_vectors)
    reverse_index = np.fliplr(sorted_index)[0]

    # Randomizing the indices for the experiment
    np.random.shuffle(reverse_index)
    for i in range(2,len(reverse_index)+1):
        selector = reverse_index[:i]
        split_train = train_set[:,selector]
        split_test = test_set[:,selector]

        # Create the new SVM
        SVM_one = SVC(kernel='linear', C=c_star, probability=True)
        SVM_one.fit(split_train, train_set[:, -1])
        new_predictions = SVM_one.predict(split_test)

        # Calculate the accuracy, precision, and recall of the SVM
        accuracy = metrics.accuracy_score(test_set[:,-1], new_predictions)
        accuracies[i] = accuracy

        # Was not entirely sure if this part was needed.
        # Reshuffled this way every time it was a randomized set of m features.
        # np.random.shuffle(reverse_index)
    return accuracies
Example #27
def classify_linear_tf(train_vector, train_label):
    classifier_linearSvc_tf = SVC(kernel='linear')
    linear_clf_tf = classifier_linearSvc_tf.fit(train_vector, train_label)
    save_clf1 = open("linearSvcTf.pickle","wb")
    pickle.dump(linear_clf_tf,save_clf1)
    save_clf1.close()
    return linear_clf_tf
def fitall(X,y):
    m1 =  RandomForestClassifier(n_estimators=500)
    m2 =  LogisticRegression()
    m3 =  SVC(probability=True)
    lib = [m1, m2, m3]
    m4 = sl.SuperLearner(lib, loss = "nloglik")
    return m1.fit(X,y), m2.fit(X,y), m3.fit(X,y), m4.fit(X, y)
Example #29
def experiment_two_feature_selection(svm,train_set,test_set,c_star):

    # Find the weight vectors
    weight_vectors = svm.coef_
    # Absolute these for argsort to accurately find the max to min vector
    weight_vectors = np.absolute(weight_vectors)
    accuracies = np.zeros(58)

    # Sort the indexes of the weight vectors, then flip to largest to smallest.
    sorted_index = np.argsort(weight_vectors)
    reverse_index = np.fliplr(sorted_index)[0]

    # Print for the Experiment 2 to find best 5 features.
    print("Top 5 weight vectors for Experiment 2: ", reverse_index[:5])
    for i in range(2,len(reverse_index)+1):
        selector = reverse_index[:i]
        split_train = train_set[:,selector]
        split_test = test_set[:,selector]

        # Create the new SVM
        SVM_one = SVC(kernel='linear', C=c_star, probability=True)
        SVM_one.fit(split_train, train_set[:, -1])
        new_predictions = SVM_one.predict(split_test)

        # Calculate the accuracy, precision, and recall of the SVM
        accuracy = metrics.accuracy_score(test_set[:,-1], new_predictions)
        accuracies[i] = accuracy
    return accuracies
def svc_classifier(X_train, categories,X_test, test_categories):
    from sklearn.svm import SVC
    svm_classifier = SVC(C=100, gamma=0.1).fit(X_train, categories)
    y_svm_predicted = svm_classifier.predict(X_test)
    print '\n Here is the classification report for support vector machine classifier:'
    print metrics.classification_report(test_categories, y_svm_predicted)
    to_latex(test_categories, y_svm_predicted)  
Example #31
def train(args):
    print("train call")
    print("Loading embeddings.")
    fname = "{}/labels.csv".format(args.workDir)
    labels = pd.read_csv(fname, header=None).as_matrix()[:, 1]
    labels = map(itemgetter(1),
                 map(os.path.split,
                     map(os.path.dirname, labels)))  # Get the directory.
    fname = "{}/reps.csv".format(args.workDir)
    embeddings = pd.read_csv(fname, header=None).as_matrix()
    le = LabelEncoder().fit(labels)
    labelsNum = le.transform(labels)
    nClasses = len(le.classes_)
    print("Training for {} classes.".format(nClasses))

    if args.classifier == 'LinearSvm':
        clf = SVC(C=1, kernel='linear', probability=True)
    elif args.classifier == 'GridSearchSvm':
        print("""
        Warning: In our experiences, using a grid search over SVM hyper-parameters only
        gives marginally better performance than a linear SVM with C=1 and
        is not worth the extra computations of performing a grid search.
        """)
        param_grid = [
            {'C': [1, 10, 100, 1000],
             'kernel': ['linear']},
            {'C': [1, 10, 100, 1000],
             'gamma': [0.001, 0.0001],
             'kernel': ['rbf']}
        ]
        clf = GridSearchCV(SVC(C=1, probability=True), param_grid, cv=5)
    elif args.classifier == 'GMM':  # Doesn't perform as well
        clf = GMM(n_components=nClasses)

    # ref:
    # http://scikit-learn.org/stable/auto_examples/classification/plot_classifier_comparison.html#example-classification-plot-classifier-comparison-py
    elif args.classifier == 'RadialSvm':  # Radial Basis Function kernel
        # works better with C = 1 and gamma = 2
        clf = SVC(C=1, kernel='rbf', probability=True, gamma=2)
    elif args.classifier == 'DecisionTree':  # Doesn't perform as well
        clf = DecisionTreeClassifier(max_depth=20)
    elif args.classifier == 'GaussianNB':
        clf = GaussianNB()

    # ref: https://jessesw.com/Deep-Learning/
    elif args.classifier == 'DBN':
        from nolearn.dbn import DBN
        clf = DBN([embeddings.shape[1], 500, labelsNum[-1:][0] + 1],  # i/p nodes, hidden nodes, o/p nodes
                  learn_rates=0.3,
                  # Smaller steps mean a possibly more accurate result, but the
                  # training will take longer
                  learn_rate_decays=0.9,
                  # a factor the initial learning rate will be multiplied by
                  # after each iteration of the training
                  epochs=300,  # number of iterations
                  # dropouts = 0.25, # Express the percentage of nodes that
                  # will be randomly dropped as a decimal.
                  verbose=1)

    if args.ldaDim > 0:
        clf_final = clf
        clf = Pipeline([('lda', LDA(n_components=args.ldaDim)),
                        ('clf', clf_final)])

    clf.fit(embeddings, labelsNum)

    fName = "{}/classifier.pkl".format(args.workDir)
    print("Saving classifier to '{}'".format(fName))
    with open(fName, 'w') as f:
        pickle.dump((le, clf), f)
#summary of the model prediction
print(classification_report(y_test,y_pred))
print('Confusion Matrix:\n',confusion_matrix(y_test,y_pred))

#accuracy score of the model
from sklearn.metrics import accuracy_score
print('accuracy score :',accuracy_score(y_pred,y_test))

"""### **Support Vector Machine(SVM)**"""

#Support Vector Machine(SVM)
#importing the library
from sklearn.svm import SVC
#creating local variable classifier
classifier = SVC(kernel='linear',random_state=0)
#Training the model
classifier.fit(X_train,y_train)

#predicting the value of Y
y_pred = classifier.predict(X_test)

#importing metrics for evaluation
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

#summary of the model prediction
print(classification_report(y_test,y_pred))
print('Confusion Matrix:\n',confusion_matrix(y_test,y_pred))

#accuracy score of the model
Example #33
t0 = time()
x_train_pca = pca.transform(X_train)
X_test_pca = pca.transform(X_test)

print("done in %0.3fs" % (time() - t0))


###############################################################################
# Train an SVM classification model

print("Fitting the classifier to the training set")
t0 = time()
param_grid = {'C': [1e3, 5e3, 1e4, 5e4, 1e5],
              'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1], }
#clf = GridSearchCV(SVC(kernel='rbf', class_weight='balanced'), param_grid)
clf = SVC(C=1000.0, cache_size=200, class_weight='balanced', coef0=0.0,
          decision_function_shape=None, degree=3, gamma=0.0001, kernel='rbf',
          max_iter=-1, probability=False, random_state=None, shrinking=True,
          tol=0.001, verbose=False)
clf = clf.fit(x_train_pca, y_train)
#clf = cv2.createFisherFaceRecognizer()
#clf.train(x_train_pca, y_train)
print("done in %0.3fs" % (time() - t0))
print("Best estimator found by grid search:")
#print(clf.best_estimator_)

# Save the classifier
joblib.dump(clf, "recognition_clf.pkl", compress=3)




###############################################################################
Example #34
X = dataset.iloc[:, [2,3]].values
y = dataset.iloc[:, 4].values

# Splitting the dataset into the Training set and Test set
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)

# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)

# fitting classifier to the Training set
from sklearn.svm import SVC
classifier = SVC(kernel ='rbf', random_state = 0)
classifier.fit(X_train, y_train)

# Predicting the Test set results
y_pred = classifier.predict(X_test)

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)

# Visualising the Training set results
from matplotlib.colors import ListedColormap
X_set, y_set = X_train, y_train
X1, X2 = np.meshgrid(np.arange(start = X_set[:, 0].min() - 1, stop = X_set[:, 0].max() + 1, step = 0.01),
                     np.arange(start = X_set[:, 1].min() - 1, stop = X_set[:, 1].max() + 1, step = 0.01))
plt.contourf(X1, X2, classifier.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape),
Example #35
                                                    y,
                                                    test_size=0.2,
                                                    random_state=0)

## Data without information about depth
X_train_ND, X_test_ND = np.delete(arr=X_train, obj=[0, 4, 6],
                                  axis=1), np.delete(arr=X_test,
                                                     obj=[0, 4, 6],
                                                     axis=1)

####### II: Classification #######

# Define Classifiers
nb = GaussianNB()
knn = KNeighborsClassifier()
svc = SVC(probability=True)

## Fit Classifiers without depth:
fit_nb_ND = nb.fit(X_train_ND, y_train)
fit_knn_ND = knn.fit(X_train_ND, y_train)
fit_svc_ND = svc.fit(X_train_ND, y_train)

# Predict with Classifiers
## Save methods in dict to iterate over them.
methods = {"Naive Bayes": nb, "KNN": knn, "SVM": svc}

## With Depth:
accuracies = []
precisions = []

for method_name, method in methods.items():
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(max_depth=6, max_features=7)
rfc.fit(X_train, y_train)
pred_rfc = rfc.predict(X_test)
print(confusion_matrix(y_test, pred_rfc))
print(classification_report(y_test, pred_rfc))
print(accuracy_score(y_test, pred_rfc))
rfc.fit(X_train_all, y_train_all)
pred_all_rfc = rfc.predict(X_test_all)
sub_rfc = pd.DataFrame()
sub_rfc['PassengerId'] = df_test['PassengerId']
sub_rfc['Survived'] = pred_all_rfc
#sub_rfc.to_csv('randforest.csv',index=False)

from sklearn.svm import SVC
svc = SVC(gamma = 0.01, C = 100)#, probability=True)
svc.fit(X_train_sc, y_train_sc)
pred_svc = svc.predict(X_test_sc)
print(confusion_matrix(y_test_sc, pred_svc))
print(classification_report(y_test_sc, pred_svc))
print(accuracy_score(y_test_sc, pred_svc))

svc.fit(X_train_all_sc, y_train_all_sc)
pred_all_svc = svc.predict(X_test_all_sc)

sub_svc = pd.DataFrame()
sub_svc['PassengerId'] = df_test['PassengerId']
sub_svc['Survived'] = pred_all_svc
sub_svc.to_csv('svc.csv',index=False)

Example #37
##### splitting data into train and test set
x_train, x_test, y_train, y_test = train_test_split(data['cleaned_text'],
                                                    data['labels'],
                                                    test_size=0.2,
                                                    random_state=10)

############### fit frequency based word embeddings into our data set to turn text into wordvectors

vectorizer = TfidfVectorizer(lowercase=True, stop_words=STOPWORDS)
vectorizer.fit(x_train)
x_train_vect = vectorizer.transform(x_train)
x_test_vect = vectorizer.transform(x_test)

############# Build our classifier with Linear Support vector machine

model = SVC(C=1, kernel='linear', class_weight='balanced')
model.fit(x_train_vect, y_train)

y_pred = model.predict(x_test_vect)

cm = confusion_matrix(y_test, y_pred)  ########## confusion matrix for test set

pipeline = make_pipeline(
    vectorizer,
    model)  #### save our model with pipeline function for future analysis


def predict(text):

    score = pipeline.predict([clean_text(text)])
Example #38
X_train, X_test, Y_train, Y_test = train_test_split(X,
                                                    Y,
                                                    test_size=0.25,
                                                    random_state=0)

# Feature scaling
from sklearn.preprocessing import StandardScaler

sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)

# Fitting the SVM to the Training Set
from sklearn.svm import SVC

cl = SVC(kernel='linear', random_state=0)
cl.fit(X_train, Y_train)

# Predicting the test set results
y_pred = cl.predict(X_test)

# Making the confusion matrix
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(Y_test, y_pred)

# Visualising the Training set results
from matplotlib.colors import ListedColormap

X_set, Y_set = X_train, Y_train
X1, X2 = np.meshgrid(
Example #39
feats=[]
humor = []
for key in dict.keys():
    value = dict[key]
    feats.append(value[0].tolist())
    humor.append(value[1].tolist())
feats = np.array(feats)
humor = np.array(humor)



if options.clf == 'GaussianProc':
    clf = GaussianProcessClassifier()
elif options.clf == "SVC":
    clf = SVC()
elif options.clf == "LinearSVC":
    clf = LinearSVC(max_iter=10000,dual=False)
elif options.clf == "DecisionTree":
    clf = DecisionTreeClassifier()
elif options.clf == "RandomForest":
    clf = RandomForestClassifier()
elif options.clf == "AdaBoost":
    clf = AdaBoostClassifier(n_estimators=100)
elif options.clf == "XGBoost":
    clf = XGBClassifier()
elif options.clf == "KNN":
    clf = KNeighborsClassifier(n_neighbors=5)
elif options.clf == "GaussianNB":
    clf = GaussianNB()
elif options.clf == "RBF":
    from sklearn.naive_bayes import BernoulliNB
    from sklearn.svm import SVC

    print ' '
    print '============================='
    print 'Bernoulli SVC Classifier:'
    classifierBi = SklearnClassifier(BernoulliNB()).train(train_set)
    classifierBi.classify_many(test)

    for pdist in classifierBi.prob_classify_many(test):
        print pdist.prob('human'), pdist.prob('auto')

    for i in range(len(classifierBi.classify_many(test))):
        print classifierBi.classify_many(test)[i]

    classifierSVC = SklearnClassifier(SVC(), sparse=True).train(train_set)
    classifierSVC.classify_many(test)

    # svc = nltk.classify.accuracy(classifierSVC, test_set)
    # print 'accuracy is %.2f' %round(svc*100,4), '%'
    def SVC():
        classifierBi = SklearnClassifier(BernoulliNB()).train(train_set)
        return classifierSVC.classify_many(test)

    print "Performance of running Bernoulli SVC Classifier on test set: ", timeit.timeit(
        "SVC", setup="from __main__ import SVC", number=1)

    print ' '
    print '============================='
    print 'Linear SVC Classifier:'
    classifierLinSVC = SklearnClassifier(LinearSVC(),
# Provided to give you a starting point. Try a variety of classifiers.
# Stratified ShuffleSplit cross-validator.
# Provides train/test indices to split data in train/test sets.
# This cross-validation object is a merge of StratifiedKFold and ShuffleSplit,
# which returns stratified randomized folds. The folds are made by preserving the percentage of samples for each class.
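A hedged sketch of the stratified shuffle-split evaluation described above, written against the modern sklearn.model_selection API; the data arrays here are synthetic placeholders, not part of the original project:

import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.naive_bayes import GaussianNB

X_demo = np.random.RandomState(0).rand(50, 3)
y_demo = np.array([0, 1] * 25)
sss = StratifiedShuffleSplit(n_splits=5, test_size=0.3, random_state=42)
for train_idx, test_idx in sss.split(X_demo, y_demo):
    # each split preserves the class proportions of y_demo
    clf = GaussianNB().fit(X_demo[train_idx], y_demo[train_idx])
    print(clf.score(X_demo[test_idx], y_demo[test_idx]))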

# NaiveBayes
from sklearn.naive_bayes import GaussianNB

nb_clf = GaussianNB()

# SVM
from sklearn.svm import SVC

svm_clf = SVC()

# DecisionTree
from sklearn.tree import DecisionTreeClassifier

dt_clf = DecisionTreeClassifier()

# RandomForest
from sklearn.ensemble import RandomForestClassifier

rf_clf = RandomForestClassifier(n_estimators=25)

# AdaBoost
from sklearn.ensemble import AdaBoostClassifier

ab_clf = AdaBoostClassifier()
Example #42
def get_res(x_train, y_train, x_test, y_test):

    knn = KNeighborsClassifier()
    knn.fit(x_train, y_train)

    lg = LogisticRegression(penalty='l2')
    lg.fit(x_train, y_train)

    dtc = DecisionTreeClassifier()
    dtc.fit(x_train, y_train)

    gb = GradientBoostingClassifier(n_estimators=200)
    gb.fit(x_train, y_train)

    ab = AdaBoostClassifier()
    ab.fit(x_train, y_train)

    gnb = GaussianNB()
    gnb.fit(x_train, y_train)

    svm = SVC()
    svm.fit(x_train, y_train)

    mnb = MultinomialNB(alpha=0.01)
    mnb.fit(x_train, y_train)

    bnb = BernoulliNB(alpha=1.0,
                      binarize=0.31,
                      fit_prior=True,
                      class_prior=None)
    bnb.fit(x_train, y_train)

    rtc = RandomForestClassifier(n_estimators=10,
                                 max_depth=20,
                                 random_state=47)
    rtc.fit(x_train, y_train)

    num_list = [
        knn.score(x_test, y_test),
        lg.score(x_test, y_test),
        dtc.score(x_test, y_test),
        gb.score(x_test, y_test),
        ab.score(x_test, y_test),
        gnb.score(x_test, y_test),
        svm.score(x_test, y_test),
        mnb.score(x_test, y_test),
        bnb.score(x_test, y_test),
        rtc.score(x_test, y_test)
    ]
    name_list = [
        'KNN', 'Logistic', 'DecisionTree', 'GradientBoosting', 'AdaBoost',
        'GaussianNB', 'SVC', 'MultinomialNB', 'BernoulliNB', 'RandomForest'
    ]
    plt.title('title')
    num_list = np.around(num_list, decimals=3)
    autolabel(
        plt.bar(range(len(num_list)),
                num_list,
                color=['r', 'b'],  # alternate red/blue bars
                tick_label=name_list,
                width=0.4))
    plt.show()
Example #43
nifti_masker = NiftiMasker(mask_img=mask_filename, sessions=session,
                           smoothing_fwhm=4, standardize=True,
                           memory="nilearn_cache", memory_level=1)
func_filename = haxby_dataset.func[0]
X = nifti_masker.fit_transform(func_filename)
# Restrict to non rest data
X = X[condition_mask]
session = session[condition_mask]

###########################################################################
# Build the decoder that we will use

# Define the prediction function to be used.
# Here we use a Support Vector Classification, with a linear kernel
from sklearn.svm import SVC
svc = SVC(kernel='linear')


# Define the dimension reduction to be used.
# Here we use a classical univariate feature selection based on F-test,
# namely Anova. We set the number of features to be selected to 500
from sklearn.feature_selection import SelectKBest, f_classif
feature_selection = SelectKBest(f_classif, k=500)

# We have our classifier (SVC), our feature selection (SelectKBest), and now,
# we can plug them together in a *pipeline* that performs the two operations
# successively:
from sklearn.pipeline import Pipeline
anova_svc = Pipeline([('anova', feature_selection), ('svc', svc)])
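A hedged, self-contained sketch of the same SelectKBest-then-SVC pipeline pattern on synthetic data; the array shapes below are arbitrary stand-ins for the masked fMRI matrix X used above:

import numpy as np
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

rng = np.random.RandomState(0)
X_demo = rng.rand(40, 1000)                 # 40 samples, 1000 voxel-like features
y_demo = np.array([0, 1] * 20)              # stand-in condition labels
demo_anova_svc = Pipeline([('anova', SelectKBest(f_classif, k=500)),
                           ('svc', SVC(kernel='linear'))])
demo_anova_svc.fit(X_demo, y_demo)          # feature selection, then SVC, fit in sequence
print(demo_anova_svc.score(X_demo, y_demo))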

###########################################################################
# In[ ]:

from sklearn.linear_model import LogisticRegression
logmodel = LogisticRegression(max_iter=100)
logmodel.fit(X_train, y_train)
ypred = logmodel.predict(X_test)
print(logmodel.score(X_train, y_train))
print(confusion_matrix(y_test, ypred))
print(classification_report(y_test, ypred))

# *4. SVM*

# In[ ]:

from sklearn.svm import SVC
modelsvc = SVC(probability=True, gamma='auto')
modelsvc.fit(X_train, y_train)
ypred = modelsvc.predict(X_test)
print(modelsvc.score(X_train, y_train))
print(confusion_matrix(y_test, ypred))
print(classification_report(y_test, ypred))

# *6. Decision Tree*

# In[ ]:

from sklearn.tree import DecisionTreeClassifier
dmodel = DecisionTreeClassifier()
dmodel.fit(X_train, y_train)
ypred = dmodel.predict(X_test)
print(dmodel.score(X_train, y_train))
#from sklearn import preprocessing
#le = preprocessing.LabelEncoder()
#bankdata = bankdata.apply(le.fit_transform)

droplist = ['class']
X = bankdata.drop(droplist, axis=1)
y = bankdata['class']

# The actual algorithm starts here; the code above handles the input CSV data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30)
#labels = np.unique(X); print(labels)

from sklearn.svm import SVC
clf = SVC()  #kernel='rbf'
#clf = SVC(kernel='poly',degree=4)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

from sklearn.metrics import classification_report, confusion_matrix
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

#y_pred = svclassifier.predict(X_test)
#
#from sklearn.metrics import classification_report, confusion_matrix
#print(confusion_matrix(y_test,y_pred))
#print(classification_report(y_test,y_pred)) 
#
#
def main():
    st.title("Binary Classification Web App")
    st.sidebar.title("Binary Classification Web App")
    st.markdown("Are your mushrooms edible or poisonous? 🍄")
    st.sidebar.markdown("Are your mushrooms edible or poisonous? 🍄")

    #st.cache:
    #unless the function name or its arguments change, the data stays cached
    #and the cached result is reused on rerun

    #Label Encoding :
    #refers to converting the labels into numeric form
    #so as to convert it into the machine-readable form. Machine learning algorithms
    #can then decide in a better way on how those labels must be operated.
    #It is an important pre-processing step for the structured dataset in supervised learning.

    @st.cache(persist=True)
    def load_data():
        data = pd.read_csv("mushrooms.csv")
        labelencoder = LabelEncoder()
        for col in data.columns:
            data[col] = labelencoder.fit_transform(data[col])
        #st.write(data)  #to check the dataset after label encoding
        return data

    @st.cache(persist=True)
    def split(df):
        y = df.type
        x = df.drop(columns=['type'])
        x_train, x_test, y_train, y_test = train_test_split(x,
                                                            y,
                                                            test_size=0.3,
                                                            random_state=0)
        return x_train, x_test, y_train, y_test

    def plot_metrics(metrics_list):
        if 'Confusion Matrix' in metrics_list:
            st.subheader("Confusion Matrix")
            plot_confusion_matrix(model,
                                  x_test,
                                  y_test,
                                  display_labels=class_names)
            st.pyplot()

        if 'ROC Curve' in metrics_list:
            st.subheader("ROC Curve")
            plot_roc_curve(model, x_test, y_test)
            st.pyplot()

        if 'Precision-Recall Curve' in metrics_list:
            st.subheader('Precision-Recall Curve')
            plot_precision_recall_curve(model, x_test, y_test)
            st.pyplot()

    df = load_data()
    class_names = ['edible', 'poisonous']  #for confusion matrix

    x_train, x_test, y_train, y_test = split(df)

    #take user input of hyperparameters
    st.sidebar.subheader("Choose Classifier")
    classifier = st.sidebar.selectbox("Classifier",
                                      ("Support Vector Machine (SVM)",
                                       "Logistic Regression", "Random Forest"))

    if classifier == 'Support Vector Machine (SVM)':
        st.sidebar.subheader("Model Hyperparameters")
        #choose parameters
        C = st.sidebar.number_input("C (Regularization parameter)",
                                    0.01,
                                    10.0,
                                    step=0.01,
                                    key='C_SVM')
        kernel = st.sidebar.radio("Kernel", ("rbf", "linear"), key='kernel')
        gamma = st.sidebar.radio("Gamma (Kernel Coefficient)",
                                 ("scale", "auto"),
                                 key='gamma')

        metrics = st.sidebar.multiselect(
            "What metrics to plot?",
            ('Confusion Matrix', 'ROC Curve', 'Precision-Recall Curve'))

        if st.sidebar.button("Classify", key='classify'):
            st.subheader("Support Vector Machine (SVM) Results")
            model = SVC(C=C, kernel=kernel, gamma=gamma)
            model.fit(x_train, y_train)
            accuracy = model.score(x_test, y_test)
            y_pred = model.predict(x_test)
            st.write("Accuracy: ", accuracy.round(2))
            st.write(
                "Precision: ",
                precision_score(y_test, y_pred, labels=class_names).round(2))
            st.write("Recall: ",
                     recall_score(y_test, y_pred, labels=class_names).round(2))
            plot_metrics(metrics)

    if classifier == 'Logistic Regression':
        st.sidebar.subheader("Model Hyperparameters")
        C = st.sidebar.number_input("C (Regularization parameter)",
                                    0.01,
                                    10.0,
                                    step=0.01,
                                    key='C_LR')
        max_iter = st.sidebar.slider("Maximum number of iterations",
                                     100,
                                     500,
                                     key='max_iter')

        metrics = st.sidebar.multiselect(
            "What metrics to plot?",
            ('Confusion Matrix', 'ROC Curve', 'Precision-Recall Curve'))

        if st.sidebar.button("Classify", key='classify'):
            st.subheader("Logistic Regression Results")
            model = LogisticRegression(C=C, penalty='l2', max_iter=max_iter)
            model.fit(x_train, y_train)
            accuracy = model.score(x_test, y_test)
            y_pred = model.predict(x_test)
            st.write("Accuracy: ", accuracy.round(2))
            st.write(
                "Precision: ",
                precision_score(y_test, y_pred, labels=class_names).round(2))
            st.write("Recall: ",
                     recall_score(y_test, y_pred, labels=class_names).round(2))
            plot_metrics(metrics)

    if classifier == 'Random Forest':
        st.sidebar.subheader("Model Hyperparameters")
        n_estimators = st.sidebar.number_input(
            "The number of trees in the forest",
            100,
            5000,
            step=10,
            key='n_estimators')
        max_depth = st.sidebar.number_input("The maximum depth of the tree",
                                            1,
                                            20,
                                            step=1,
                                            key='max_depth')
        bootstrap = st.sidebar.radio("Bootstrap samples when building trees",
                                     ('True', 'False'),
                                     key='bootstrap') == 'True'  # radio returns a string; convert to bool
        metrics = st.sidebar.multiselect(
            "What metrics to plot?",
            ('Confusion Matrix', 'ROC Curve', 'Precision-Recall Curve'))

        if st.sidebar.button("Classify", key='classify'):
            st.subheader("Random Forest Results")
            model = RandomForestClassifier(n_estimators=n_estimators,
                                           max_depth=max_depth,
                                           bootstrap=bootstrap,
                                           n_jobs=-1)
            model.fit(x_train, y_train)
            accuracy = model.score(x_test, y_test)
            y_pred = model.predict(x_test)
            st.write("Accuracy: ", accuracy.round(2))
            st.write(
                "Precision: ",
                precision_score(y_test, y_pred, labels=class_names).round(2))
            st.write("Recall: ",
                     recall_score(y_test, y_pred, labels=class_names).round(2))
            plot_metrics(metrics)

    if st.sidebar.checkbox("Show raw data", False):
        st.subheader("Mushroom Data Set (Classification)")
        st.write(df)
        st.markdown(
            "This [data set](https://archive.ics.uci.edu/ml/datasets/Mushroom) includes descriptions of hypothetical samples corresponding to 23 species of gilled mushrooms "
            "in the Agaricus and Lepiota Family (pp. 500-525). Each species is identified as definitely edible, definitely poisonous, "
            "or of unknown edibility and not recommended. This latter class was combined with the poisonous one."
        )
    # Hyperparameter search over all possible dimensions for PCA reduction
    # 'pca__n_components': np.arange(1, 17),

    # 'svm__gamma': np.arange(0.001, 0.1, 0.001)
}

svm_classification_pipeline = Pipeline(
        [
            # Apply PCA to SVM Classification
            #('pca', PCA()),

            # Apply scaling to SVM Classification
            #('scale', StandardScaler()),

            ('svm', SVC())
        ]
    )

_accuracy_grid_search(values_train, hdi_class_train,
                        svm_classification_pipeline,
                        classification_svm_parameters)
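Without the project's _accuracy_grid_search helper, a hedged approximation of the same pipeline-based grid search can be written with plain GridSearchCV; the 'svm__' prefix matches the pipeline step name above, and the data arrays below are synthetic placeholders:

import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

demo_pipeline = Pipeline([('svm', SVC())])
demo_parameters = {'svm__C': [0.1, 1, 10],
                   'svm__gamma': np.arange(0.001, 0.1, 0.01)}
X_demo = np.random.RandomState(0).rand(60, 8)
y_demo = np.array([0, 1, 2] * 20)
search = GridSearchCV(demo_pipeline, demo_parameters, cv=3)
search.fit(X_demo, y_demo)
print(search.best_params_, search.best_score_)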


# ## u)

# In[17]:


classification_svm_parameters = {
    # Use linear kernel for SVM Classification
y = dataset.iloc[:, 4].values

#Splitting the dataset into the Training set and Test set
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)

#Feature Scaling (z-score; it standardizes the data)
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)

#Fitting the classification model to the Training set
#Create the classifier
from sklearn.svm import SVC
classifier = SVC(kernel = 'linear', random_state = 0)
classifier.fit(X_train, y_train)

#Predicting the Test set results
y_pred = classifier.predict(X_test)

#Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)

#Visualizing the Training set results (use this to see test set results by changing the variable)
from matplotlib.colors import ListedColormap
X_set, y_set = X_train, y_train
X1, X2 = np.meshgrid(np.arange(start = X_set[:, 0].min() - 1, stop = X_set[:, 0].max() + 1, step = 0.01),
                     np.arange(start = X_set[:, 1].min() - 1, stop = X_set[:, 1].max() + 1, step = 0.01))
plt.contourf(X1, X2, classifier.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape),
             alpha = 0.75, cmap = ListedColormap(('red', 'green')))
plt.xlim(X1.min(), X1.max())
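# The plotting block is cut off here. The usual continuation of this template
# (an assumption, not the original lines) scatters the two classes on top of
# the decision regions and shows the figure:
plt.ylim(X2.min(), X2.max())
for i, j in enumerate(np.unique(y_set)):
    plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1],
                color=ListedColormap(('red', 'green'))(i), label=j)
plt.title('SVM (Training set)')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.legend()
plt.show()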
Example #49
0
second_pc = pca.components_[1]

#print var, sum(var), eigenfaces.shape, ei_mean.shape, X_train_pca.shape

###############################################################################
# Train a SVM classification model
print "Fitting the classifier to the training set"
t0 = time()
param_grid = {
    'C': [1e3, 5e3, 1e4, 5e4, 1e5],
    'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1],
}
# for sklearn version 0.16 or prior, the class_weight parameter value is 'auto'
#Grid search finds the best C and gamma parameters to use with the RBF kernel
clf = GridSearchCV(
    SVC(kernel='rbf', class_weight='balanced', probability=True), param_grid)
clf = clf.fit(X_train_pca, y_train)
print "done in %0.3fs" % (time() - t0)
print "Best estimator found by grid search:"
print clf.best_estimator_

###############################################################################
# Quantitative evaluation of the model quality on the test set
print "Predicting the people names on the testing set"
t0 = time()
y_pred = clf.predict(X_test_pca)
y_proba = clf.predict_proba(X_test_pca)
print "done in %0.3fs" % (time() - t0)

#Save the variables of the already-trained model
with open('Clasificador.pkl', 'w') as f:  # Python 3: open(..., 'wb')
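# The save block above is truncated in this excerpt. A minimal sketch of how
# such a dump is usually completed (an assumption, not the original code):
import pickle
with open('Clasificador.pkl', 'wb') as f:   # binary mode for Python 3
    pickle.dump(clf, f)
# ...and reload it later:
with open('Clasificador.pkl', 'rb') as f:
    clf_loaded = pickle.load(f)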
Example #50
0
X = data[:, 0:4]
Y = data[:, 4]

val_size = 0.2
scoring = "accuracy"

(X_train, X_val, Y_train,
 Y_val) = model_selection.train_test_split(X, Y, test_size=val_size)

models = {
    "LR": LogisticRegression(solver="lbfgs", multi_class="auto"),
    "LDA": LinearDiscriminantAnalysis(solver='lsqr'),
    "KNN": KNeighborsClassifier(),
    "DTC": DecisionTreeClassifier(),
    "NB": GaussianNB(),
    "SVC": SVC(),
    "MLP": MLPClassifier(),
}

results = []
for name, model in models.items():
    kfold = model_selection.KFold(n_splits=10)
    cross_res = model_selection.cross_val_score(model,
                                                X_train,
                                                Y_train,
                                                cv=kfold,
                                                scoring=scoring)
    results.append((name, cross_res))

for name, res in results:
    print("{:6} {:2.4} {:2.4}").format(name, res.mean(), res.std())
for alp in np.linspace(0.0001,1,alp_count):
    ridge = lm.SGDClassifier(loss='squared_loss', penalty='l2', max_iter=maxIter, tol=tolerance, alpha=alp)
    ridge.fit(trainXCon, trainY)
    print('ALPHA: ', alp, 'ACC: ', ridge.score(testXCon, testY))
'''

#%% 4. Construct CLassifiers
# Choose model
# from sklearn import gaussian_process
# Gaussian = gaussian_process.GaussianProcess(theta0=1e-2, thetaL=1e-4, thetaU=1e-1)
# GaussianProcessRegressor
# from sklearn import metrics
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import SVC
svm = SVC(kernel='linear')
from sklearn.svm import LinearSVC
svmLinear = LinearSVC()
from sklearn import tree
cartTree = tree.DecisionTreeClassifier()

linear_square = lm.SGDClassifier(loss='squared_loss',
                                 penalty='none',
                                 max_iter=maxIter,
                                 tol=tolerance)
ridge = lm.SGDClassifier(loss='squared_loss',
                         penalty='l2',
                         max_iter=maxIter,
                         tol=tolerance,
                         alpha=0.5)
# ridgel1 = lm.SGDClassifier(loss='squared_loss', penalty='l2', max_iter=maxIter, tol=tolerance)
validation_size = 0.20
seed = 7
X_train, X_validation, Y_train, Y_validation = model_selection.train_test_split(X, Y, test_size=validation_size, random_state=seed)

# Test options and evaluation metric
seed = 7
scoring = 'accuracy'

# Spot Check Algorithms
models = []
models.append(('LR', LogisticRegression()))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
models.append(('SVM', SVC()))

print "--------different model accuray evaluation--------"
# evaluate each model in turn
results = []
names = []
for name, model in models:
	kfold = model_selection.KFold(n_splits=10, random_state=seed)
	cv_results = model_selection.cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
	results.append(cv_results)
	names.append(name)
	model.fit(X_train, Y_train)
	predictions = model.predict(X_validation)
	msg = "%s: %f (%f), accuracy score: %f" % (name, cv_results.mean(), cv_results.std(), accuracy_score(Y_validation, predictions))
	print(msg)
Example #53
0
         ngram_range=(1, 5), use_idf=1,smooth_idf=1,sublinear_tf=1,
         stop_words = 'english')
 
 # Fit TFIDF
 tfv.fit(traindata)
 X =  tfv.transform(traindata) 
 X_test = tfv.transform(testdata)
 
 # Initialize SVD
 svd = TruncatedSVD()
 
 # Initialize the standard scaler 
 scl = StandardScaler()
 
 # We will use SVM here..
 svm_model = SVC()
 
 # Create the pipeline 
 clf = pipeline.Pipeline([('svd', svd),
 						 ('scl', scl),
                 	     ('svm', svm_model)])
 
 # Create a parameter grid to search for best parameters for everything in the pipeline
 param_grid = {'svd__n_components' : [200, 400],
               'svm__C': [10, 12]}
 
 # Kappa Scorer 
 kappa_scorer = metrics.make_scorer(quadratic_weighted_kappa, greater_is_better = True)
 
 # Initialize Grid Search Model
 model = grid_search.GridSearchCV(estimator = clf, param_grid=param_grid, scoring=kappa_scorer,
### and testing datasets, respectively
### labels_train and labels_test are the corresponding item labels
features_train, features_test, labels_train, labels_test = preprocess()

# features_train = features_train[:len(features_train)/100]
# labels_train = labels_train[:len(labels_train)/100]

#########################################################
### your code goes here ###
print()
print("Classifier: SVC")
# import the sklearn SVC module
from sklearn.svm import SVC

# create classifier
clf = SVC(gamma='auto', C=10000.0, kernel='rbf') #TODO

# fit the classifier on the training features and labels
t0 = time()
clf.fit(features_train, labels_train) #TODO
print "training time:", round(time()-t0, 3), "s"

# predict labels for the test features
t1 = time()
pred = clf.predict(features_test) #TODO
print "prediction time:", round(time()-t1, 3), "s"

import collections
counter = collections.Counter(pred)
print "no. of emails predicted Chris': "+str(counter[1])
print "no. of emails predicted to be Sara's:"+str(counter[0])
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier, RandomForestClassifier

# Loading some example data
iris = datasets.load_iris()
X = iris.data[:, [0, 2]]
y = iris.target

print y

# Training classifiers
clf1 = DecisionTreeClassifier(max_depth=1)
clf2 = DecisionTreeClassifier(max_depth=4)
clf3 = SVC(kernel='rbf',random_state=0, gamma=1.0, C=1.0)
eclf = SVC(kernel='rbf',random_state=0, gamma=100.0, C=1.0)

"""
# Training classifiers
clf1 = RandomForestClassifier(max_depth=1)
clf2 = DecisionTreeClassifier(max_depth=2)
clf3 = DecisionTreeClassifier(max_depth=5)
eclf = DecisionTreeClassifier(max_depth=10)

# Training classifiers
clf1 = DecisionTreeClassifier(max_depth=4)
clf2 = KNeighborsClassifier(n_neighbors=7)
clf3 = SVC(kernel='rbf', probability=True)
eclf = VotingClassifier(estimators=[('dt', clf1), ('knn', clf2),
                                    ('svc', clf3)],
X = stdata.drop('eligible', axis=1)
y = stdata['eligible']

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.3,
                                                    random_state=1)

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

from sklearn.svm import SVC

clf = make_pipeline(StandardScaler(), SVC(gamma='auto'))

clf.fit(X_train, y_train)

prediction = clf.predict(X_test)

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.metrics import f1_score

print(classification_report(y_test, prediction))

f = f1_score(y_test, prediction, average=None)
print("F1 Score ::", f[0])
Example #57
0
import pandas as pd

dataset=pd.read_csv('tenureTime.csv').values

data=dataset[:,0:11]
target=dataset[:,11]

from sklearn.model_selection import train_test_split
#dataset splitting function

train_data,test_data,train_target,test_target=train_test_split(data,target,test_size=0.3)

#from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

algorithm=SVC(kernel='poly',degree=2)
#loading the SVM algorithm into "algorithm"

algorithm.fit(train_data,train_target)
#training

result=algorithm.predict(test_data)
#testing

#print('Actual Target:',test_target)
#print('Predicted Target:',result)

from sklearn.metrics import accuracy_score

acc=accuracy_score(test_target,result)
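# The excerpt ends here; printing the score is the natural next step
# (an assumption, not an original line).
print('Accuracy:', acc)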
Example #58
0
# Added
emotion_dataset = load_emotion_dataset()
emotion_dataset.features = scale_clf.transform(emotion_dataset.features)
emotion_dataset.features = emotion_dataset.features[
    :, emotion_dataset.features_label_list.isin(selected_features)]

#-------------------------------
# Prediction
#-------------------------------
# Model setup
#clf = load(model)
clf = SVC(decision_function_shape='ovo',
          kernel="linear",
          C=0.9545484566618342,
          probability=True,
          max_iter=-1,
          random_state=1)
predictions = clf.fit(emotion_dataset.features,
                      emotion_dataset.targets).predict(selected_test)
print("predictions: {}".format(predictions))

digit_score = clf.predict_proba(selected_test)
print(digit_score)
np.savetxt(r"C:\Users\akito\Desktop\shibata_2.csv", digit_score, delimiter=",")

# Plotting
time_record = pd.read_excel(time_record_path, header=0, index_col=0)
end_time = (time_record.loc["Amusement", "FinishDatetime"] -
            time_record.loc["Neutral", "StartDatetime"]).total_seconds()
stress_start = (time_record.loc["Stress archimetic", "FinishDatetime"] -
    (train, test) = train_test(men)
    X = train.loc[:, best_columns[:-1]]
    Y = train['diagnosis']
    x = test.loc[:, best_columns[:-1]]
    y = test['diagnosis']
    knn = train_knn(X, Y, x, y)
    del X, Y, x, y

    X = men.loc[:, best_columns[:-1]]
    Y = men['diagnosis']

    skf = StratifiedKFold(Y, random_state=1, n_folds=10)

    lm = LogisticRegression(random_state=1)
    gnb = GaussianNB()
    svc = SVC()
    rfc = RandomForestClassifier(random_state=1)

    classifiers = [lm, knn, gnb, svc, rfc]
    plot_crossvalidated_roc(skf, X, Y, classifiers)

    # lm.fit(X, Y)
    # gnb.fit(X, Y)
    # svc.fit(X, Y)
    # rfc.fit(X, Y)

    # class_report(x, y, lm)
    # class_report(x, y, knn)
    # class_report(x, y, gnb)
    # class_report(x, y, svc)
    # class_report(x, y, rfc)
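# plot_crossvalidated_roc() is called above but not defined in this excerpt.
# A standalone sketch under the assumption that it overlays one mean ROC curve
# per classifier, averaged over the StratifiedKFold splits -- not the original
# helper.
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc

def plot_crossvalidated_roc(folds, X, Y, classifiers):
    mean_fpr = np.linspace(0, 1, 100)
    plt.figure()
    for clf in classifiers:
        tprs = []
        for train_idx, test_idx in folds:
            fitted = clf.fit(X.iloc[train_idx], Y.iloc[train_idx])
            if hasattr(fitted, "predict_proba"):
                scores = fitted.predict_proba(X.iloc[test_idx])[:, 1]
            else:
                scores = fitted.decision_function(X.iloc[test_idx])
            fpr, tpr, _ = roc_curve(Y.iloc[test_idx], scores)
            tprs.append(np.interp(mean_fpr, fpr, tpr))
        mean_tpr = np.mean(tprs, axis=0)
        plt.plot(mean_fpr, mean_tpr,
                 label='{} (AUC = {:.2f})'.format(type(clf).__name__,
                                                  auc(mean_fpr, mean_tpr)))
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.legend(loc='lower right')
    plt.show()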
Example #60
0
# SVM
from sklearn.svm import SVC

X = fruits_df[['width', 'height']]
y = fruits_df['fruit_label'].copy()

# Set labels that are not apple to 0
y[y != 1] = 0

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1/5, random_state=0)

c_values = [0.001, 1, 100]
for c_value in c_values:
    # Build the model
    svm_model = SVC(C=c_value)

    # Train the model
    svm_model.fit(X_train, y_train)

    # Evaluate the model
    y_pred = svm_model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    print('C={}, model accuracy: {}'.format(c_value, acc))
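# Suggested variation (not in the original): SVC is scale-sensitive, so a
# fairer C comparison on the raw width/height features would standardize them
# first.
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
scaled_svm = make_pipeline(StandardScaler(), SVC(C=1))
scaled_svm.fit(X_train, y_train)
print('scaled SVC accuracy: {:.3f}'.format(scaled_svm.score(X_test, y_test)))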


print('******************************* Decision Tree *******************************')
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, random_state=0)