print(X_valid_features.shape)
print(Y_train.shape)
print(Y_valid.shape)

from sklearn import svm
from sklearn.model_selection import GridSearchCV
import time

# Hyperparameters
# C: penalty parameter of the error term. Smaller values -> stronger regularization.
param_grid = {'C': [1e-1, 1e0], 'max_iter': [500, 1000]}

# Create model and fit to training data. 
# Do grid search CV to find the best hyperparameters
start_time = time.time()
svm_orig = svm.LinearSVC(max_iter=1000, dual=False)
svm_orig = GridSearchCV(svm_orig, param_grid)
Y_Train_Array = np.argmax(Y_train, axis=1)
print(Y_Train_Array.shape)

svm_orig.fit(X=X_train_features, y=Y_Train_Array)
print("--- %s seconds ---" % (time.time() - start_time))

# Print the fitted GridSearchCV object (chosen hyperparameters are in svm_orig.best_params_)
print(svm_orig)

# Predict on test data
svm_predict_orig = svm_orig.predict(X_valid_features)

# Get accuracy
svm_acc_orig = (svm_predict_orig == np.argmax(Y_valid, axis=1)).mean()
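A small hedged follow-up, using only the objects defined above, to report the grid-search winner and the resulting validation accuracy:

# Report the selected hyperparameters and the validation accuracy
print("Best hyperparameters:", svm_orig.best_params_)
print("Validation accuracy: %.4f" % svm_acc_orig)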
Example #2
from sklearn.model_selection import train_test_split
import os

#reading data
csv = pd.read_csv("data.csv")

#select the feature columns and the label
csv_data = csv[["temperature", "humidity"]]
csv_label = csv["label"]

#split data into train and test
data_train, data_test, label_train, label_test = \
    train_test_split(csv_data, csv_label)

#train the classifier
clf = svm.LinearSVC()
clf.fit(data_train, label_train)

#predict on the test data
predict = clf.predict(data_test)

#evaluate accuracy
ac_score = metrics.accuracy_score(label_test, predict)
# cl_report = metrics.classification_report(label_test, predict)
print("Model accuracy =", ac_score)

#get test_set.csv file from influxdb
os.system(
    'influx -database tstest -format csv -execute \'select * from table03\' > test_set.csv'
)
print("Finish querying")
Example #3
# annot=True shows the value in each cell
sns.heatmap(corr, annot=True)
plt.show()

# Feature selection
# features_remain = ['radius_mean','texture_mean', 'smoothness_mean','compactness_mean','symmetry_mean', 'fractal_dimension_mean']
features_remain = data.columns[1:31]
print(features_remain)
print('-' * 100)
# use 30% of the data as the test set and the rest as the training set
train, test = train_test_split(data, test_size=0.3)
# use the selected feature columns as the training and test data
train_X = train[features_remain]
train_y = train['diagnosis']
test_X = test[features_remain]
test_y = test['diagnosis']

# Z-score standardization: each feature is scaled to zero mean and unit variance
ss = StandardScaler()
train_X = ss.fit_transform(train_X)
test_X = ss.transform(test_X)

# create the SVM classifier
model = svm.LinearSVC()
# train on the training set
model.fit(train_X, train_y)
# predict on the test set
prediction = model.predict(test_X)
print('Accuracy: ', metrics.accuracy_score(prediction, test_y))
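If a per-class breakdown is also wanted, a short hedged addition using the same objects:

# Optional: per-class precision/recall/F1 for the diagnosis labels
print(metrics.classification_report(test_y, prediction))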
np.save(
    '/neurospin/brainomics/2016_classif_hallu_fmri/unsupervised_fmri/clustering_only_hallu/cluster_randomB/subject_clusterB.npy',
    subject)
np.save(
    '/neurospin/brainomics/2016_classif_hallu_fmri/unsupervised_fmri/clustering_only_hallu/cluster_randomB/y_clusterB.npy',
    y)

#SVM & Leave one subject-out - no feature selection - WITH IMA samples
###########################################################################

n = 0
list_predict = list()
list_true = list()
coef = np.zeros((23, 63966))
#coef=np.zeros((24,8028))
clf = svm.LinearSVC(C=1e-3, fit_intercept=True, class_weight='auto')

for i in range(1, 24):
    test_bool = (subject == i)
    train_bool = (subject != i)
    Xtest = T[test_bool, :]
    ytest = y[test_bool]
    Xtrain = np.vstack((T_IMA_diff, T[train_bool, :]))
    ytrain = np.hstack((y_IMA, y[train_bool]))
    list_true.append(ytest.ravel())
    scaler = preprocessing.StandardScaler().fit(Xtrain)
    Xtrain = scaler.transform(Xtrain)
    Xtest = scaler.transform(Xtest)
    clf.fit(Xtrain, ytrain.ravel())
    coef[n, :] = clf.coef_
    pred = (clf.predict(Xtest))
Example #5
    def __init__(self, parameters={}):
        self.weight = svm.LinearSVC()
        self.params = {'regwgt': 0.0}
        self.reset(parameters)
Example #6
# get the score of the model
score = logisticRegr.score(X_test, y_test)
# achieves score of 0.989090

## LINEAR DISCRIMINANT ANALYSIS
linearDA = LinearDiscriminantAnalysis()
# fit linear discriminant model
linearDA.fit(X_train, y_train)
# make predictions
lda_predictions = linearDA.predict(X_test)
# get the score of the model
score_lda = linearDA.score(X_test, y_test)
# achieves score of 0.974545

## SUPPORT VECTOR MACHINE
supportVecMach = svm.LinearSVC()
# fit support vector machine
supportVecMach.fit(X_train, y_train)
# make predictions
svm_predictions = supportVecMach.predict(X_test)
# get the score of the model
score_svm = supportVecMach.score(X_test, y_test)
# achieves score of 0.989009

## DECISION TREE
decisionTree = tree.DecisionTreeClassifier()
# fit decision tree
decisionTree.fit(X_train, y_train)
# make predictions
tree_predictions = decisionTree.predict(X_test)
# get the score of the model
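# Hedged completion (the excerpt is truncated here), mirroring the earlier classifiers:
score_tree = decisionTree.score(X_test, y_test)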
Example #7
plt.plot([0, 1], [0, 1], 'k--', lw=lw)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve of Naive Bayes Classifier')
plt.legend(loc="lower right")
plt.show()

# #### In conclusion, the Naïve Bayes Classifier works well for the price range 1

# ##### b) Support Vector Machine (SVM)

# SVM
svm_clf = svm.LinearSVC(C=5.0, max_iter=10000)
svm_clf.fit(train_x, train_y)
pred_y = svm_clf.predict(test_x)

#Calculate per-class accuracy
accuracy_svm_clf = []
class_correct = list(0. for i in range(num_classes))
class_total = list(0. for i in range(num_classes))
for i in range(len(test_y)):
    label = test_y[i]
    class_correct[label] += (test_y[i] == pred_y[i])
    class_total[label] += 1

for i in range(num_classes):
    accuracy_svm_clf.append(
        np.round(100 * class_correct[i] / class_total[i], 2))
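A small hedged addition to display the per-class accuracies computed above:

# Print the per-class accuracy of the LinearSVC
for i in range(num_classes):
    print('Accuracy of class %d: %s%%' % (i, accuracy_svm_clf[i]))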
Example #8
def getData(brand_num):

    modified_data = shapeCsv(brand_num,True)

    # number of rows in the data
    count_m = len(modified_data)

    # drop the last day's row
    successive_data = np.delete(modified_data, count_m - 1, axis=0)

    # normalize the data (min-max scaling)
    ms = MinMaxScaler()
    ms.fit(successive_data)
    successive_data = ms.transform(successive_data)

    # standardize the data (z-score)
    sc = StandardScaler()
    sc.fit(successive_data)
    successive_data = sc.transform(successive_data)

    # list of target values: price increase = 1, price decrease = 0
    answers = []

    # populate the targets
    for i in range(1, count_m):
        # append 1 if the rate of change is positive, otherwise 0
        if modified_data[i,2] > 0:
            answers.append(1)
        else:
            answers.append(0)

    # split the data: 80% for training, 20% for testing
    X_train, X_test, y_train, y_test = train_test_split(successive_data, answers, test_size=0.2, random_state=1)

    parameters = {'C':[1, 3, 5],'loss':('hinge', 'squared_hinge')}

    # run the grid search
    clf = GridSearchCV(svm.LinearSVC(), parameters)
    clf.fit(X_train, y_train) 
 
    # retrieve the best hyperparameters found by the grid search
    GS_C, GS_loss = clf.best_params_['C'], clf.best_params_['loss']

    # retrain with the best hyperparameters
    clf = svm.LinearSVC(loss=GS_loss, C=GS_C, random_state=1)
    clf.fit(X_train, y_train)
    
    # predict on the data up to 2/7
    target_data = shapeCsv(brand_num, False)

    # normalize the data (min-max scaling)
    ms = MinMaxScaler()
    ms.fit(target_data)
    target_data = ms.transform(target_data)

    # standardize the data (z-score)
    sc = StandardScaler()
    sc.fit(successive_data)
    target_data = sc.transform(target_data)

    target_len = len(target_data)
    target_predict = clf.predict(target_data)

    # return the prediction for 2/8 onward
    return target_predict[target_len-1]
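A hypothetical usage sketch of getData; the brand number below is a placeholder, not taken from the original code:

if __name__ == '__main__':
    # Hypothetical: 1234 stands in for a real brand number
    print("Predicted direction (1 = up, 0 = down):", getData(1234))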
Example #9
            X_devel = pd.read_csv(features_path + task_name + '.' + i + '.devel.csv', sep=sep, header=header, usecols=range(ind_off,num_feat+ind_off), dtype=np.float32)
#                 X_test  = pd.read_csv(features_path + task_name + '.' + x + '.test.csv',  sep=sep, header=header, usecols=range(ind_off,num_feat+ind_off), dtype=np.float32).values
        X_train_fused = pd.concat((X_train_fused,X_train),axis=1)
        X_devel_fused = pd.concat((X_devel_fused,X_devel),axis=1)


    # Feature normalisation
    scaler       = MinMaxScaler()
    X_train      = scaler.fit_transform(X_train_fused)
    X_devel      = scaler.transform(X_devel_fused)
    # Train SVM model with different complexities and evaluate
    uar_scores = []
    print(f'current features set is: {feat_fusion_4[i]}')
    for comp in complexities:
        print('\nComplexity {0:.6f}'.format(comp))
        clf = svm.LinearSVC(C=comp, random_state=0, max_iter=100000)
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_devel)
        uar_scores.append( recall_score(y_devel, y_pred, labels=classes, average='macro') )
        print('UAR on Devel {0:.1f}'.format(uar_scores[-1]*100))
        if show_confusion:
            print('Confusion matrix (Devel):')
            print(classes)
            print(confusion_matrix(y_devel, y_pred, labels=classes))

    # Train SVM model on the whole training data with optimum complexity and get predictions on test data
    optimum_complexity = complexities[np.argmax(uar_scores)]
    print('\nOptimum complexity: {0:.6f}, maximum UAR on Devel {1:.1f}\n'.format(optimum_complexity, np.max(uar_scores)*100))
    UAR.append(np.max(uar_scores)*100)
    
Example #10
labelIdx = 2
import handleClassLabels
print "Class Label Vector Y Extraction Started"
YLabels = handleClassLabels.extractClassLabels(filename, labelIdx)
print "Class Label Vector Y of size ", len(YLabels), " extracted"

#Setting up scaler for standardisation
from sklearn import preprocessing
scaler = preprocessing.StandardScaler()

# Training SVM
from sklearn import svm
from sklearn import linear_model
print "Declaring SVM"
#clf = svm.LinearSVC(); # linearsvc1
clf = svm.LinearSVC(C=1000.0, class_weight='auto', penalty='l1', dual=0)
# linearsvc2
#clf = svm.SVC(cache_size = 1000, class_weight='auto', kernel = 'poly'); # Predicts all as POSITIVE :((
#clf = linear_model.SGDClassifier();  # not tried yet
print "standardising training data"
XFeatures = scaler.fit_transform(XFeatures, YLabels)
print "Fitting Data To SVM"
clf.fit(XFeatures, YLabels)
print "SVM trained"

# Saving Trained Classifier
from sklearn.externals import joblib
print "Saving SVM"
fileToSave = "UnigramBigramSVMClassifier.joblib.pkl"
_ = joblib.dump(clf, fileToSave, compress=9)
print "Classifier SAVED!"
insertSql(sql)

"""
SVM

"""
clfSVC = svm.SVC()
clfSVC.fit(X_train, y_train)
predict_values=clfSVC.predict(X_test)
svm_score=r2_score(y_test,predict_values)
print "Accuracy of SVM", svm_score
sql = "INSERT INTO earthquakefour(Name,pydata) VALUES ('svm_score','"+str(svm_score)+"')"
"""print dt_score"""
insertSql(sql)

"""
svm.LinearSVC()
"""

clfLSVC = svm.LinearSVC()
clfLSVC.fit(X_train, y_train)
predict_values=clfLSVC.predict(X_test)
svmlc_score=r2_score(y_test,predict_values)
print "Accuracy of Linear " , svmlc_score
sql = "INSERT INTO earthquakefour(Name,pydata) VALUES ('svmlc_score','"+str(svmlc_score)+"')"
"""print dt_score"""
insertSql(sql)

"""
naive bayes
Example #12
def svm_with_rho_squared(X_train, Y_train, X_test, Y_test, upper_params_norm_sq, use_bias, 
                         weight_decay=None):
    """ Train Support Vector Machine

    Trains an SVM whose parameters have a squared norm roughly equal to (and no
    larger than) upper_params_norm_sq. It works by doing a binary search on the weight_decay.

    Parameters
    ----------
    X_train : np.ndarray of shape (instances, dimensions)
        Input training features
    Y_train : np.ndarray of shape (instances,)
        Input training labels
    X_test : np.ndarray of shape (instances, dimensions)
        Input testing features
    Y_test : np.ndarray of shape (instances)
        Input testing labels
    upper_params_norm_sq : ???
    use_bias : ???
    weight_decay : float

    Returns
    -------
    train_loss : float
        Training loss
    train_acc : float
        Training accuracy
    test_loss : float
        Testing loss
    test_acc : float
        Testing accuracy
    params_norm_sq : ???
    weight_decay : ???
    params : np.ndarray of shape (dimensions,)
        Fit coefficients
    bias : float
        Fit intercept
    svm_model : ???
        Trained Support Vector Machine model

    """
    rho_sq_tol = 0.01
    params_norm_sq = None

    if weight_decay is None:
        lower_wd_bound = 0.001
        upper_wd_bound = 256.0
    else:
        lower_wd_bound = 0.001
        upper_wd_bound = 2 * (weight_decay) - lower_wd_bound
        if upper_wd_bound < lower_wd_bound:
            upper_wd_bound = lower_wd_bound

    lower_weight_decay = lower_wd_bound
    upper_weight_decay = upper_wd_bound
    weight_decay = (upper_weight_decay + lower_weight_decay) / 2

    while (
      (params_norm_sq is None) or 
      (upper_params_norm_sq > params_norm_sq) or 
      (np.abs(upper_params_norm_sq - params_norm_sq) > rho_sq_tol)):

        print('Trying weight_decay %s..' % weight_decay)

        C = 1.0 / (X_train.shape[0] * weight_decay)        
        svm_model = svm.LinearSVC(
            C=C,
            tol=1e-6,
            loss='hinge',
            fit_intercept=use_bias,
            random_state=24,
            max_iter=100000,
            verbose=True)
        svm_model.fit(X_train, Y_train)

        params = np.reshape(svm_model.coef_, -1)
        bias = svm_model.intercept_[0]
        params_norm_sq = np.linalg.norm(params)**2 + bias**2

        if upper_params_norm_sq is None:
            break

        print('Current params norm sq = %s. Target = %s.' % (params_norm_sq, upper_params_norm_sq))
        # Current params are too small; need to make them bigger
        # So we should reduce weight_decay
        if upper_params_norm_sq > params_norm_sq:
            upper_weight_decay = weight_decay

            # And if we are too close to the lower bound, we give up
            if weight_decay < lower_wd_bound + 1e-5:
                print('Too close to lower bound, breaking')                
                break

        # Current params are too big; need to make them smaller
        # So we should increase weight_decay
        else:
            lower_weight_decay = weight_decay

            # And if we are already too close to the upper bound, we should bump up the upper bound
            if weight_decay > upper_wd_bound - 1e-5:
                upper_wd_bound *= 2
                upper_weight_decay *= 2       

        if (
          (upper_params_norm_sq > params_norm_sq) or 
          (np.abs(upper_params_norm_sq - params_norm_sq) > rho_sq_tol)):
            weight_decay = (upper_weight_decay + lower_weight_decay) / 2

    train_loss = hinge_loss(params, bias, X_train, Y_train)
    test_loss = hinge_loss(params, bias, X_test, Y_test)

    train_acc = svm_model.score(X_train, Y_train)
    test_acc = svm_model.score(X_test, Y_test)

    print('  Train loss             : ', train_loss)
    print('  Train acc              : ', train_acc)
    print('  Test loss              : ', test_loss)
    print('  Test acc               : ', test_acc)
    print('  Sq norm of params+bias : ', params_norm_sq)

    print('\n')

    return train_loss, train_acc, test_loss, test_acc, params_norm_sq, weight_decay, \
      params, bias, svm_model
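A hedged usage sketch of svm_with_rho_squared on random data; every name, shape, and value below is an assumption, made purely to illustrate the call signature:

# Hypothetical smoke test on synthetic data (shapes and labels are made up)
import numpy as np
rng = np.random.RandomState(0)
X_tr, X_te = rng.randn(200, 10), rng.randn(50, 10)
Y_tr, Y_te = np.sign(rng.randn(200)), np.sign(rng.randn(50))
results = svm_with_rho_squared(X_tr, Y_tr, X_te, Y_te,
                               upper_params_norm_sq=1.0, use_bias=True)
train_loss, train_acc, test_loss, test_acc = results[:4]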
from numpy import array

bestKvalue = 0
bestKValueAcc = 0
highestKAccuracy = 0
highestk = 0

for p in range(1, 96, 10):
    NUM_OF_ITERATIONS = 5000
    K = p  # num of points for uniform crossover
    print("K Value: " + str(K))

    errorRates = []
    accuracies = []
    labels = []
    lsvm = svm.LinearSVC()

    # Retrieve feature vectors
    featureVectors = FileUtil.createFeatureVectors(
        "../../Feature Vectors/outputNormalizedCAS.txt")

    # Retrieve training set of random 25 population
    originalTrainingSet = ElitistGeneticAlgorithm.determineStartingPopulation(
        featureVectors)

    # Separate labels and data
    currentDataSet, labels = ElitistGeneticAlgorithm.separateLabels(
        originalTrainingSet)

    # create initial population
    population = ElitistGeneticAlgorithm.createIndividuals()
Example #14
    neighbor_count_target, tfidf_cos
]).T
print(training_features)

# scale
training_features = preprocessing.scale(training_features)

# convert labels into integers then into column array
labels = [int(element[2]) for element in training_set]
labels = list(labels)
labels_array = np.array(labels)

print("evaluating")

# evaluation
kf = KFold(len(training_set), n_folds=10)
sumf1 = 0
for train_index, test_index in kf:
    X_train, X_test = training_features[train_index], training_features[
        test_index]
    y_train, y_test = labels_array[train_index], labels_array[test_index]
    # initialize basic SVM
    classifier = svm.LinearSVC()
    # train
    classifier.fit(X_train, y_train)
    pred = classifier.predict(X_test)
    sumf1 += f1_score(y_test, pred)

print("\n\n")
print(sumf1 / 10.0)
Example #15
    if zoom_out[i] == 1:
        count_7 = count_7 + 1

trainning_data = np.column_stack((gx, gy, ax, ay, az))
#print(trainning_data)
print(count)
print(count_1)
print(count_2)
print(count_3)
print(count_4)
print(count_5)
print('zoom in ', count_6)
print('zoom out', count_7)
print(len(trainning_data))
print(len(gx))
clf_0 = svm.LinearSVC(max_iter=100000)
clf_0.fit(trainning_data, scoll_down)

clf_1 = svm.LinearSVC(max_iter=100000)
clf_1.fit(trainning_data, scoll_up)

clf_2 = svm.LinearSVC(max_iter=100000)
clf_2.fit(trainning_data, zoom_in)

clf_3 = svm.LinearSVC(max_iter=100000)
clf_3.fit(trainning_data, zoom_out)

clf_4 = svm.NuSVC(nu=0.1)
clf_4.fit(trainning_data, scoll_down)
#print(clf.decision_function(trainning_data))
Example #16
        thisPCA = PCA(n_components=i)
        pcaTrainArr.append(thisPCA.fit_transform(trainData))
        pcaTestArr.append(thisPCA.transform(testData))

    return (pcaTrainArr, pcaTestArr)


# 0. calculate the PCA and LDA reductions of the dataset
pcaIterArr = np.arange(70, 171, 20)
ldaIterArr = np.arange(3, 10, 1)
ldaTrainArr, ldaTestArr = LDAreduct(ldaIterArr)
pcaTrainArr, pcaTestArr = PCAreduct(pcaIterArr)

# Section 1: Use the different SVM without any dimension reduction and get the results
# 1. Use the Linear SVM without any dimension reduction
thisSVM = svm.LinearSVC()
thisSVM.fit(trainData, trainLabels)
svmRaw = thisSVM.predict(testData)
print("After using Linear SVM for classification, we have: ")
printSummary(svmRaw, testLabels)

# 2. Use the polynomial kernel SVM without any dimension reduction
thisSVM = svm.SVC(kernel='poly')
thisSVM.fit(trainData, trainLabels)
svmRaw = thisSVM.predict(testData)
print("After using poly kernal SVM for classification, we have: ")
printSummary(svmRaw, testLabels)

# 3. Use the RBF kernel SVM without any dimension reduction
thisSVM = svm.SVC(kernel='rbf')
thisSVM.fit(trainData, trainLabels)
Example #17
 },
 "sgd": {
     "model_name":
     "SGD",
     "model_package":
     "sklearn.linear_model",
     "model":
     # Note: RandomForestClassifier is not in sklearn.linear_model; this entry uses SGDClassifier
     linear_model.SGDClassifier(n_jobs=-1, verbose=2),
     "param_grid": {}
 },
 "liner_svc": {
     "model_name": "LinearSVC",
     "model_package": "sklearn.svm",
     "model": svm.LinearSVC(),
     "param_grid": {
         "C": [0.1, 1, 10, 100, 1000],
         "gamma": [5, 1, 0.1, 0.01, 0.001, 0.0001],
         "kernel": ["rbf", 'poly', 'linear', 'sigmoid'],
     }
 },
 "svc": {
     "model_name": "SVM",
     "model_package": "sklearn.svm",
     "model": svm.SVC(),
     "param_grid": {
         "C": [0.1, 1, 10, 100, 1000],
         "gamma": [5, 1, 0.1, 0.01, 0.001, 0.0001],
         "kernel": ["rbf", 'poly', 'linear', 'sigmoid'],
     }
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn import svm
import numpy as np
iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(iris['data'],
                                                    iris['target'],
                                                    random_state=0)
C = 1.0  # SVM regularization parameter
# LinearSVC (linear kernel)
lin_svc = svm.LinearSVC(C=C).fit(X_train, y_train)
y_pred = lin_svc.predict(X_test)
print(y_pred)
print(y_test)
classifier_score = np.mean(y_pred == y_test)
print(classifier_score)
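The same number can also be obtained from the estimator's built-in scorer; a short hedged addition:

# Equivalent accuracy via LinearSVC.score
print(lin_svc.score(X_test, y_test))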
    def train(self, inputdir, cache, clusters, modelout):
        # First, we want to train the classifier
        training_gold = open(inputdir + '/training.gold.tsv')
        training_tokens = open(inputdir + '/training.tokens')
        dev_gold = open(inputdir + '/dev.gold.tsv')
        dev_tokens = open(inputdir + '/dev.tokens')
        test_input = open(inputdir + '/test.input.tsv')
        test_tokens = open(inputdir + '/test.tokens')

        gold_lines = [line.strip() for line in training_gold]
        token_lines = [line.strip() for line in training_tokens]
        gold_lines += [line.strip() for line in dev_gold]
        token_lines += [line.strip() for line in dev_tokens]
        test_input_lines = [line for line in test_input]
        test_token_lines = [line.strip() for line in test_tokens]
        assert (len(gold_lines) == len(token_lines))
        print "Loaded %s training examples." % len(gold_lines)

        label_to_int = {
            '"positive"': 0,
            '"neutral"': 1,
            '"objective-OR-neutral"': 1,
            '"negative"': 2
        }
        int_to_label = {0: 'positive', 1: 'neutral', 2: 'negative'}

        training_positive = []
        training_negative = []
        training_neutral = []

        training_corpus = map(lambda x: x.split('\t'), token_lines)
        word_ngrams, nonc_ngrams, char_ngrams = self._corpus_ngrams(
            training_corpus)
        print "Generated ngram encodings for training corpus."

        print "Contains %s @ mentions." % len(
            filter(lambda x: len(x) == 1 and x[0][0] == '@',
                   word_ngrams.keys()))
        #print "Contains %s used only once." % len(filter(lambda x: ngram_counts[x] == 1, word_ngrams.keys()))
        print "Contains %s URLs." % len(
            filter(lambda x: len(x) == 1 and x[0][:4] == 'http',
                   word_ngrams.keys()))

        lexicons = self._load_lexicons(cache)
        print "Loaded the lexicons."

        w2c, c2w, cids = self._load_clusters(clusters)
        print "Loaded the clusters."

        training_features = []
        training_classes = []

        for gold_line, tokenized_line in zip(gold_lines, token_lines):
            _, _, label, _ = gold_line.split('\t')
            tweet = tokenized_line.split('\t')[0]

            features = self.generate_features(tweet, w2c, cids, word_ngrams,
                                              nonc_ngrams, char_ngrams,
                                              lexicons)
            training_features.append(features)
            training_classes.append(label_to_int[label])

            if len(training_features) % 1000 == 0:
                print "Loaded %s feature vectors." % len(training_features)

        test_features = []
        for tokenized_line in test_token_lines:
            tweet = tokenized_line.split('\t')[3]
            features = self.generate_features(tweet, w2c, cids, word_ngrams,
                                              nonc_ngrams, char_ngrams,
                                              lexicons)
            test_features.append(features)

        classifier = svm.LinearSVC(C=0.005)
        print "Created classifier. Training..."
        classifier.fit(training_features, training_classes)
        print "Trained classifier."
        print "Predicting %s test cases." % len(test_features)
        test_predictions = classifier.predict(test_features)
        print "Finished prediction. Outputting now."

        with open('test_predictions.txt', 'w') as fout:
            for (prediction, line) in zip(test_predictions, test_input_lines):
                col1, col2, _, tweet = line.split('\t')
                label = int_to_label[prediction]
                fout.write('%s\t%s\t%s\t%s' % (col1, col2, label, tweet))
        print "Done outputting predictions."

        print "Saving model..."
        with open(modelout, 'wb') as savefile:
            model = {
                'label_to_int': label_to_int,
                'int_to_label': int_to_label,
                'word_ngrams': word_ngrams,
                'nonc_ngrams': nonc_ngrams,
                'char_ngrams': char_ngrams,
                'lexicons': lexicons,
                'w2c': w2c,
                'c2w': c2w,
                'cids': cids,
                'classifier': classifier
            }
            pickle.dump(model, savefile)
Example #20
def mymain(dataset):
    dict = {}
    np.random.seed(1337)
    #unigrams = utils.top_n_words(FREQ_DIST_FILE, UNIGRAM_SIZE)
    if USE_BIGRAMS:
        bigrams = utils.top_n_bigrams(BI_FREQ_DIST_FILE, BIGRAM_SIZE)
        #bigrams = utils.top_n_bigrams(dataset, BIGRAM_SIZE)
    tweets = process_tweets(TRAIN_PROCESSED_FILE, test_file=False)
    if TRAIN:
        train_tweets, val_tweets = utils.split_data(tweets)
    else:
        random.shuffle(tweets)
        train_tweets = tweets
    del tweets
    print ('Extracting features & training batches')
    clf = svm.LinearSVC(C=0.1)
    batch_size = len(train_tweets)
    i = 1
    n_train_batches = int(np.ceil(len(train_tweets) / float(batch_size)))
    for training_set_X, training_set_y in extract_features(train_tweets, test_file=False, feat_type=FEAT_TYPE, batch_size=batch_size):
        utils.write_status(i, n_train_batches)
        i += 1
        if FEAT_TYPE == 'frequency':
            tfidf = apply_tf_idf(training_set_X)
            training_set_X = tfidf.transform(training_set_X)
        clf.fit(training_set_X, training_set_y)
    print ('\n')
    print ('Testing')
    if TRAIN:
        correct, total = 0, len(val_tweets)
        i = 1
        batch_size = len(val_tweets)
        n_val_batches = int(np.ceil(len(val_tweets) / float(batch_size)))
        for val_set_X, val_set_y in extract_features(val_tweets, test_file=False, feat_type=FEAT_TYPE, batch_size=batch_size):
            if FEAT_TYPE == 'frequency':
                val_set_X = tfidf.transform(val_set_X)
            prediction = clf.predict(val_set_X)
            correct += np.sum(prediction == val_set_y)
            utils.write_status(i, n_val_batches)
            i += 1
        dict.update({'dataset': dataset})
        dict.update({'correct': correct})
        dict.update({'total': total})
        rslt = correct * 100. / total
        dict.update({'result': round(rslt, 2)})
        #print('Dictionary Result ',dict)
        print ('\nCorrect: %d/%d = %.4f %%' % (correct, total, correct * 100. / total))
        #return dict
    else:
        del train_tweets
        #test_tweets = process_tweets(TEST_PROCESSED_FILE, test_file=True)
        test_tweets = process_tweets(dataset, test_file=True)
        n_test_batches = int(np.ceil(len(test_tweets) / float(batch_size)))
        predictions = np.array([])
        print ('Predicting batches')
        i = 1
        for test_set_X, _ in extract_features(test_tweets, test_file=True, feat_type=FEAT_TYPE):
            if FEAT_TYPE == 'frequency':
                test_set_X = tfidf.transform(test_set_X)
            prediction = clf.predict(test_set_X)
            predictions = np.concatenate((predictions, prediction))
            utils.write_status(i, n_test_batches)
            i += 1
        predictions = [(str(j), int(predictions[j]))
                       for j in range(len(test_tweets))]
        utils.save_results_to_csv(predictions, 'svm.csv')
        print ('\nSaved to svm.csv')
    return dict
Example #21
    linear_model.PassiveAggressiveClassifier(),
    linear_model.RidgeClassifierCV(),
    linear_model.SGDClassifier(),
    linear_model.Perceptron(),

    #Naive Bayes
    naive_bayes.BernoulliNB(),
    naive_bayes.GaussianNB(),

    #Nearest Neighbor
    neighbors.KNeighborsClassifier(),

    #SVM
    svm.SVC(probability=True),
    svm.NuSVC(probability=True),
    svm.LinearSVC(),

    #Trees
    tree.DecisionTreeClassifier(),
    tree.ExtraTreeClassifier(),

    #Discriminant Analysis
    discriminant_analysis.LinearDiscriminantAnalysis(),
    discriminant_analysis.QuadraticDiscriminantAnalysis(),

    #xgboost: http://xgboost.readthedocs.io/en/latest/model.html
    XGBClassifier()
]
#note: this is an alternative to train_test_split
cv_split = model_selection.ShuffleSplit(
    n_splits=10, test_size=.3, train_size=.6, random_state=0
Example #22
# -*- coding: utf-8 -*-
from sklearn import svm, datasets, neighbors

iris = datasets.load_iris()

svc = svm.LinearSVC()
svc.fit(iris.data, iris.target)  # fit the model
print(svc.predict([[5.0, 3.0, 5.0, 2.0]]))

knn = neighbors.KNeighborsClassifier()
# learn from the existing data
knn.fit(iris.data, iris.target)
# use the fitted classifier to predict the label of an unseen sample
print(knn.predict([[5.0, 3.0, 5.0, 2.0]]))
Example #23
    def train_main(self):
        data = pd.DataFrame()
        model_dict = dict()
        train_data_path = self.train_data_path

        for i in train_data_path:
            data_tmp = pd.read_excel(i, header=0)
            data_tmp.columns = ["pid", "label", "context"]

            data = pd.concat([data, data_tmp])

        data = shuffle(data)

        data["context_ngram"] = data[["context"]].applymap(ngram_process)
        context = data["context_ngram"].values

        label = data[["label"]].applymap(fun_map).values

        data_test = pd.read_excel(self.test_data_path, header=0)
        data_test.columns = ["pid", "label", "context"]

        data_test["context_ngram"] = data_test[["context"]].applymap(ngram_process)

        test_context = data_test["context_ngram"].values
        test_label = data_test[["label"]].applymap(fun_map).values

        # tf idf
        tf_idf = TfidfVectorizer(analyzer=fun_1, min_df=50)
        tf_idf.fit(context)

        model_dict["model_1"] = pickle.dumps(tf_idf)

        feature_names = tf_idf.get_feature_names()
        model_dict["feature_names"] = pickle.dumps(feature_names)
        print("feature num", len(feature_names))

        x_train = tf_idf.transform(context)
        x_test = tf_idf.transform(test_context)

        # chi
        model = SelectKBest(chi2, k="all")
        model.fit(x_train, label)

        model_dict["model_2"] = pickle.dumps(model)

        x_train = model.transform(x_train)
        x_test = model.transform(x_test)

        classify = svm.LinearSVC(C=0.9)

        # param_grid = {'C':[1,10,100,1000],'gamma':[1,0.1,0.001,0.0001], 'kernel':['linear','rbf']}
        # grid = GridSearchCV(SVC(),param_grid,refit = True, verbose=2)
        # grid = xgb.XGBClassifier()
        # print(grid.best_params_)

        classify = calibration.CalibratedClassifierCV(classify, cv=10)

        classify.fit(x_train, label)
        y_predict = classify.predict(x_test)

        print(metrics.classification_report(test_label, y_predict))
        print("accuracy:", metrics.accuracy_score(test_label, y_predict))

        model_dict["model_3"] = pickle.dumps(classify)

        with open(self.model_path, mode='wb') as fm:
            joblib.dump(model_dict, fm)
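A hedged companion sketch showing how the saved bundle could be reloaded; the method name is an assumption, while the dictionary keys mirror the ones written above:

    def load_main(self):
        # Hedged counterpart to train_main: reload the saved bundle
        with open(self.model_path, mode='rb') as fm:
            model_dict = joblib.load(fm)
        tf_idf = pickle.loads(model_dict["model_1"])
        selector = pickle.loads(model_dict["model_2"])
        classify = pickle.loads(model_dict["model_3"])
        return tf_idf, selector, classify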
Example #24
    pass
elif FEATURE_EXTRACTION == 'pca':
    t0 = time()
    pca = decomposition.PCA(n_components=100)
    train_X = sp.sparse.coo_matrix(pca.fit_transform(train_X.todense()))
    test_X = sp.sparse.coo_matrix(pca.transform(test_X.todense()))
    print 'pca done in %0.3f' % (time() - t0)
elif FEATURE_EXTRACTION == 'ica':
    t0 = time()
    ica = decomposition.FastICA(n_components=100)
    train_X = sp.sparse.coo_matrix(ica.fit_transform(train_X.todense()))
    test_X = sp.sparse.coo_matrix(ica.transform(test_X.todense()))
    print 'ica done in %0.3f' % (time() - t0)
elif FEATURE_EXTRACTION == 'l1-svc':
    t0 = time()
    l1svc = svm.LinearSVC(C=1, penalty='l1', dual=False)
    l1svc.fit(train_X, train_y)
    train_X = l1svc.transform(train_X)
    test_X = l1svc.transform(test_X)
    print 'l1-svc feature selection done in %0.3f' % (time() - t0)
else:
    raise RuntimeError('unknown feature extraction method')

# <codecell>

## define feature names from tfidf vectorizer
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print train_X.shape, test_X.shape

# <codecell>
def feat_importance_firm(row_id_str, ds_id, hdfs_feat_dir, local_score_file
    , sp_master, spark_rdd_compress, spark_driver_maxResultSize, sp_exe_memory, sp_core_max
    , zipout_dir, zipcode_dir, zip_file_name
    , mongo_tuples
    , training_fraction, jobname, uploadtype, description_file): 

    
    # zip func in other files for Spark workers ================= ================
    zip_file_path = ml_util.ml_build_zip_file(zipout_dir, zipcode_dir, zip_file_name, prefix='zip_feature_util')
    print "INFO: zip_file_path=",zip_file_path

    # get_spark_context
    sc=ml_util.ml_get_spark_context(sp_master
        , spark_rdd_compress
        , spark_driver_maxResultSize
        , sp_exe_memory
        , sp_core_max
        , jobname
        , [zip_file_path])
   
    
    t0 = time()
    
    # get feature seq mapping from mongo
    if uploadtype == "MD5 List IN-dynamic":
        ### connect to database to get the column list which contains all column number of the corresponding feature
        key = "dict_dynamic"
        jstr_filter='{"rid":'+row_id_str+',"key":"'+key+'"}'
        jstr_proj='{"value":1}'

        # get parent dataset's data
        if ds_id != row_id_str:
            jstr_filter='{"rid":'+ds_id+',"key":"'+key+'"}'        
        
        doc=query_mongo.find_one_t(mongo_tuples, jstr_filter, jstr_proj)
        dic_list = doc['value']
        
        dic_all_columns = {}
        max_feature = 0
        # reverse dict{hashes:sequence number} ====== 
        for i in range(0, len(dic_list)):
            for key in dic_list[i]:
                dic_all_columns[eval(dic_list[i][key])] = key
                if eval(dic_list[i][key]) > max_feature:
                    max_feature = eval(dic_list[i][key])
        print "INFO: max_feature=",max_feature
        #print "dic_all_columns=",dic_all_columns # fid:numb,numb
    
    
    dirFile_loc = os.path.join(hdfs_feat_dir , "metadata")
    dirFolders = sc.textFile(dirFile_loc)
    
    hash_Folders = dirFolders.collect()
    #print "INFO: dirFile_loc=",dirFile_loc,", hash_Folders=",hash_Folders
    folder_list = [x.encode('UTF8') for x in hash_Folders]
    print "INFO: hdfs folder_list=",folder_list #['dirty/', 'clean/']
    
    # source libsvm filename  
    libsvm_data_file = os.path.join(hdfs_feat_dir , hdfs_file_name)
    print "INFO: libsvm_data_file=", libsvm_data_file
    
    # load feature count file
    #feat_count_file=libsvm_data_file+"_feat_count"
    #feature_count=zip_feature_util.get_feature_count(sc,feat_count_file)
    #print "INFO: feature_count=",feature_count

    # load sample RDD from text file   
    #samples_rdd, feature_count = zip_feature_util.get_sample_rdd(sc, libsvm_data_file, feature_count \
    #    , excluded_feat_cslist=None)
    samples_rdd=sc.textFile(libsvm_data_file).cache()

    # collect all data to local for processing ===============
    all_data = samples_rdd.collect()
    all_list = [ ln.split(' ') for ln in all_data ]
    sample_count=len(all_data)

    # label array
    #labels_list_all = [x.label for x,_ in all_data]
    #print "INFO: labels_list_all=",labels_list_all

    # get feature seq : ngram hash mapping ==================================
    key = "dic_seq_hashes"  #{"123":"136,345"}
    jstr_filter='{"rid":'+row_id_str+',"key":"'+key+'"}'

    jstr_proj='{"value":1}'
    
    # get parent dataset's data
    if ds_id != row_id_str:
        jstr_filter='{"rid":'+ds_id+',"key":"'+key+'"}'
            
    doc=query_mongo.find_one_t(mongo_tuples, jstr_filter, jstr_proj)
    dic_list = doc['value']
    
    dic_all_columns = dic_list
    feature_count = len(dic_list)

    # get hash : raw string mapping ==================================
    key = "dic_hash_str"  #{"123":"openFile"}
    jstr_filter='{"rid":'+row_id_str+',"key":"'+key+'"}'
    jstr_proj='{"value":1}'
    # get parent dataset's data
    if ds_id != row_id_str:
        jstr_filter='{"rid":'+ds_id+',"key":"'+key+'"}'
        
    doc=query_mongo.find_one_t(mongo_tuples, jstr_filter, jstr_proj)
    dic_hash_str = doc['value']
    
    
    
    features_training = []
    labels_training = []
    names_training = []
    row_training = []
    col_training = []
    max_feat_training = 0
    row_num_training = 0
    features_testing = []
    labels_testing = []
    names_testing = []
    row_testing = []
    col_testing = []
    max_feat_testing = 0
    row_num_testing = 0
    
    # loop through hdfs folders; TBD 
    for idx, folder in enumerate(folder_list):
        print "INFO: folder=", folder
        label = folder_list.index(folder) + 1
        print 'INFO: label=', label

        #logFile_name = os.path.join( hdfs_feat_dir, folder , mtx_name_list)
        #print "XXXXXXXXXXlogFile_name=",logFile_name
        #logFile_data = os.path.join( hdfs_feat_dir , folder , mtx_libsvm)
        #print "XXXXXXXXXXlogFile_data=",logFile_data

        '''
        logNames = sc.textFile(logFile_name).cache()
        logData = sc.textFile(logFile_data).cache()
        
        names = logNames.collect()
        data = logData.collect()
        
        name_l = [x.encode('UTF8') for x in names]
        feature_l = [x.encode('UTF8') for x in data]
        name_list = [names.strip() for names in name_l]
        feature_list = [features.strip() for features in feature_l]
        '''
        
        feature_list = [ l[2:] for l in all_list if int(l[1])==idx]
        # hash array
        name_list = [ l[2] for l in all_list if int(l[1])==idx ]
        #print "feature_list=",feature_list
        #print "name_list=",name_list
        
        ########## data separation ##########
        id_perm = data_seperation_random(name_list)
        
        
        num_names = len(name_list)
        print 'INFO: num of samples=', num_names
        num_train = int(training_portion * num_names)
        print 'INFO: num_train = ', num_train

        
        ########generate training data#########
        i = 0;
        #print "INFO: generate training data"
        #print "INFO: len(id_perm)=",len(id_perm)
        while i < num_train:
            #print i, id_perm[i]
            features = feature_list[id_perm[i]]
            
            #features = features.strip()
            #feature_array = features.split(' ')
            feature_array=features
            labels_training.append(label)
            
            length = len(feature_array)
            j = 0
            while j < length:
                feature = feature_array[j]
                feat, value = feature.split(':', 2)
                row_training.append(i + row_num_training)
                col_training.append(int(feat) - 1)
                features_training.append(int(value))
                max_feat_training = max(max_feat_training, int(feat))
                j = j+1
            i = i+1
        row_num_training = row_num_training + num_train
        i = num_train
        ########generate testing data#########
        while i < num_names:
            
            
            ####for generating testing data folder####
            test_file_name = name_list[id_perm[i]]
            
  
            features = feature_list[id_perm[i]]

            #features = features.strip()
            #feature_array = features.split(' ')
            feature_array=features
            labels_testing.append(label)
            
            length = len(feature_array)
            j = 0
            while j < length:
                feature = feature_array[j]
                feat, value = feature.split(':', 2)
                row_testing.append(i - num_train + row_num_testing)
                col_testing.append(int(feat) - 1)
                features_testing.append(int(value))
                max_feat_testing = max(max_feat_testing, int(feat))
                j = j+1
            i = i+1
        row_num_testing = row_num_testing + (num_names - num_train)
    
    # end for loop here ========================
        
    col_num = max(max_feat_training, max_feat_testing)
    if max_feat_training < col_num:
        for i in range (0, row_num_training):
            for j in range(max_feat_training, col_num):
                features_training.append(0)
                row_training.append(i)
                col_training.append(j)
    elif max_feat_testing < col_num:
        for i in range (0, row_num_testing):
            for j in range(max_feat_testing, col_num):
                features_testing.append(0)
                row_testing.append(i)
                col_testing.append(j)

    features_training = array(features_training)
    row_training = array(row_training)
    col_training = array(col_training)
    #print "row_training:", row_training
    #print "INFO: col_training:", col_training
    len_col = len(col_training)
    print "INFO: col_num:", col_num
    labels_training = array(labels_training)

    features_testing = array(features_testing)
    row_testing = array(row_testing)

    col_testing = array(col_testing)
    labels_testing = array(labels_testing)

    
    sparse_mtx = csc_matrix((features_training,(row_training,col_training)), shape=(row_num_training,col_num))
    #print "sparse_mtx.todense(), sparse_mtx.shape=",sparse_mtx.todense(), sparse_mtx.shape
    
    sparse_test = csc_matrix((features_testing,(row_testing,col_testing)), shape=(row_num_testing,col_num))
    #print " sparse_test.todense(), sparse_test.shape=",sparse_test.todense(), sparse_test.shape
    
    clf = svm.LinearSVC()
    #clf = svm.SVC(C=0.1, kernel='rbf', degree=3, gamma=0.05, coef0=0.0, shrinking=True, probability=False, tol=0.001, cache_size=200, class_weight=None, verbose=False, max_iter=-1, random_state=None)
    #clf = svm.NuSVC(nu=0.3, kernel='rbf', degree=3, gamma=0.05, coef0=0.0, shrinking=True, probability=False, tol=0.001, cache_size=200, verbose=False, max_iter=-1, random_state=None)
    #print "labels_training=",labels_training
    #print "sparse_mtx=",sparse_mtx
    clf.fit(sparse_mtx, labels_training)
    
    #print "INFO: model:intercept=",clf.intercept_
    #print "INFO: model:coef=",clf.coef_
    
    labels_pred = clf.predict(sparse_test)
    #print "labels_pred:", labels_pred
    
    accuracy = clf.score(sparse_test, labels_testing)
    #print "INFO: data folder=", hdfs_feat_dir
    print "INFO: accuracy=", accuracy
    
    #####################################################################
    ################## calculate feature importance with prediction labels #######################
    #####################################################################
    AA = sparse_mtx.todense()
    BB = sparse_test.todense()
    labels_train_pred = clf.predict(sparse_mtx)
    labels_test_pred = labels_pred
    
    
    #print "###################################################################################"
    print "INFO: ======= Calculate feature importance with predication labels =================="
    #print "###################################################################################"
    dic_importance_label = {}
    
    for j in range (0, col_num):  ###for all features in the loop
    

        ##############################
        #print "====new way with sparse matrix========="
        curr_col_train = sparse_mtx.getcol(j)
        sum_col = curr_col_train.sum(0)
        positive_feature_number = int(sum_col.tolist()[0][0])
        
        labels_value = 3 - labels_train_pred
        dot_product = csr_matrix(np.array(labels_value)).dot(curr_col_train)
        sum_product = dot_product.sum(1)
        labels_positive_sum = int(sum_product.tolist()[0][0])        
        
        sum_label_values = sum(labels_value)
        labels_negitive_sum = sum_label_values - labels_positive_sum
        

        ##############################
        #print "====new way with sparse matrix========="
        curr_col_test = sparse_test.getcol(j)
        sum_col = curr_col_test.sum(0)
        positive_feature_number = positive_feature_number + int(sum_col.tolist()[0][0])
        
        labels_value = 3 - labels_test_pred
        dot_product = csr_matrix(np.array(labels_value)).dot(curr_col_test)
        sum_product = dot_product.sum(1)
        labels_positive_sum = labels_positive_sum + int(sum_product.tolist()[0][0])
        
        sum_label_values = sum(labels_value)
        labels_negitive_sum = labels_negitive_sum + sum_label_values - int(sum_product.tolist()[0][0])
        
        
        
        
        n_total = row_num_training + row_num_testing
        negitive_feature_number  = n_total - positive_feature_number
        if positive_feature_number == 0:
            #print "feature ", j+1, "all 0s!" 
            dic_importance_label[j+1] = -100
        elif negitive_feature_number == 0:
            #print "feature ", j+1, "all 1s!" 
            dic_importance_label[j+1] = -200
        else:
            q_positive = float(labels_positive_sum)/positive_feature_number
            q_negitive = float(labels_negitive_sum)/negitive_feature_number
        
            
            Q = (q_positive - q_negitive)*sqrt(float(q_positive)*q_negitive/float(n_total)/float(n_total))
            dic_importance_label[j+1] = Q
            
    
    sorted_importance = sorted(dic_importance_label.items(), key=operator.itemgetter(1), reverse=True)   
    print "INFO: ======= Feature Importance(FIRM score) ================"
    
    
    if os.path.exists(local_score_file):
        try:
            os.remove(local_score_file)
        except OSError, e:
            print ("ERROR: %s - %s." % (e.local_score_file,e.strerror))
users = query_db('''select * from user''')
for u in users:
    print 'building an SVM for ' + u['username']
    uid = u['user_id']
    lib = query_db('''select * from library where user_id = ?''', [uid])
    pids = [x['paper_id'] for x in lib]  # raw pids without version
    posix = [xtoi[p] for p in pids]

    if not posix:
        break  # empty library for this user maybe?

    print posix

    y = np.zeros(X.shape[0])
    for p in pids:
        y[xtoi[p]] = 1

    #__init__(penalty='l2', loss='squared_hinge', dual=True, tol=0.0001, C=1.0, multi_class='ovr', fit_intercept=True, intercept_scaling=1, class_weight=None, verbose=0, random_state=None, max_iter=1000)[source]
    clf = svm.LinearSVC(class_weight='auto',
                        verbose=True,
                        max_iter=10000,
                        tol=1e-6)
    clf.fit(X, y)
    s = clf.decision_function(X)

    sortix = np.argsort(-s)
    user_sim[uid] = [strip_version(meta['pids'][ix]) for ix in list(sortix)]

print 'writing user_sim.p'
pickle.dump(user_sim, open("user_sim.p", "wb"))
Example #27
print 
print'''

# print "Test part"
# test_data =[]
# for s in test_sents:
# 	test_data.extend(sent2features(s))

# test_vectors = vec.transform(test_data)

# test_labels = []
# for s in test_sents:
# 	test_labels.extend(sent2labels(s))

#classifier_rbf = svm.SVC(kernel='linear')
classifier_rbf = svm.LinearSVC()
print "Fitting"
classifier_rbf.fit(train_vectors, train_labels)
print "Dumping"

# save the classifier
with open('my_dumped_SVMTimexTypeclassifier.pkl', 'wb') as fid:
    pickle.dump(classifier_rbf, fid)
    pickle.dump(vec, fid)
'''
# load it again
with open('my_dumped_classifier.pkl', 'rb') as fid:
    gnb_loaded = cPickle.load(fid)
prediction_rbf = classifier_rbf.predict(test_vectors)

prediction_rbf = list(prediction_rbf)
Example #28
import cv2
import os
import random
import argparse
import numpy as np
from sklearn import svm

########## Variables ##########

random_seed = 42
random.seed(random_seed)
target_img_size = (32, 32)
np.random.seed(random_seed)

classifiers = {'SVM': svm.LinearSVC(random_state=random_seed)}

########## Methods ##########


def extract_hog_features(img):
    img = cv2.resize(img, target_img_size)
    win_size = (32, 32)
    cell_size = (4, 4)
    block_size_in_cells = (2, 2)

    block_size = (block_size_in_cells[1] * cell_size[1],
                  block_size_in_cells[0] * cell_size[0])
    block_stride = (cell_size[1], cell_size[0])
    nbins = 9
    hog = cv2.HOGDescriptor(win_size, block_size, block_stride, cell_size,
                            nbins)
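    # Hedged completion (the excerpt is truncated here): compute and flatten the HOG descriptor
    return hog.compute(img).flatten()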
Example #29
iris = datasets.load_iris()

# fetch the first two features
x = iris.data[:, :2]
y = iris.target

# step size in the mesh
h = 0.02

# SVM regularization parameter
C = 1.0

svc = svm.SVC(kernel = 'linear', C = C).fit(x, y)
rbf_svc = svm.SVC(kernel = 'rbf', gamma = 0.7, C = C).fit(x, y)
poly_svc = svm.SVC(kernel = 'poly', degree = 3, C = C).fit(x, y)
lin_svc = svm.LinearSVC(C = C).fit(x, y)

# create a mesh to plot
x_min, x_max = x[:, 0].min() - 1, x[:, 0].max() + 1
y_min, y_max = x[:, 1].min() - 1, x[:, 1].max() + 1

xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

titles = ['SVC with linear kernel', 'LinearSVC (linear kernel)', 'SVC with RBF kernel', 'SVC with polynomial (degree = 3) kernel']


for i, clf in enumerate((svc, lin_svc, rbf_svc, poly_svc)):
	# plot the decision boundary
	plt.subplot(2, 2, i + 1)
	plt.subplots_adjust(wspace = 0.4, hspace = 0.4)
Example #30
TRAIN = True
C = 1
MAX_ITER = 1000

if TRAIN:
    X_train = np.load(TRAIN_FEATURES_FILE)
    y_train = loadtxt(TRAIN_LABELS_FILE, dtype=float).astype(int)

    X_train, X_val, y_train, y_val = train_test_split(X_train,
                                                      y_train,
                                                      test_size=0.1)

    print X_train.shape, y_train.shape, X_val.shape, y_val.shape

    if CLASSIFIER == 'SVM':
        model = svm.LinearSVC(C=C, verbose=1, max_iter=MAX_ITER)
        model.fit(X_train, y_train)

    print model
    del X_train
    del y_train
    with open(MODEL_FILE, 'wb') as mf:
        pickle.dump(model, mf)
    val_preds = model.predict(X_val)
    accuracy = accuracy_score(y_val, val_preds)
    print("Val Accuracy: %.2f%%" % (accuracy * 100.0))

else:
    with open(MODEL_FILE, 'rb') as mf:
        model = pickle.load(mf)
    X_test = np.load(TEST_FEATURES_FILE)