print(X_valid_features.shape)
print(Y_train.shape)
print(Y_valid.shape)

from sklearn import svm
from sklearn.model_selection import GridSearchCV
import numpy as np
import time

# Hyperparameters
# C: penalty parameter of the error term. Smaller values -> stronger regularization.
param_grid = {'C': [1e-1, 1e0], 'max_iter': [500, 1000]}

# Create the model and fit it to the training data.
# Grid-search CV finds the best hyperparameters (max_iter comes from the grid).
start_time = time.time()
svm_orig = svm.LinearSVC(dual=False)
svm_orig = GridSearchCV(svm_orig, param_grid)
Y_Train_Array = np.argmax(Y_train, axis=1)
print(Y_Train_Array.shape)
svm_orig.fit(X=X_train_features, y=Y_Train_Array)
print("--- %s seconds ---" % (time.time() - start_time))

# Print the fitted grid-search estimator
print(svm_orig)

# Predict on the validation data
svm_predict_orig = svm_orig.predict(X_valid_features)

# Get accuracy
svm_acc_orig = (svm_predict_orig == np.argmax(Y_valid, axis=1)).mean()
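# After fitting, GridSearchCV exposes the chosen hyperparameters and CV score.
# A minimal sketch, assuming the fitted `svm_orig` grid-search object above:
print(svm_orig.best_params_)           # e.g. {'C': 0.1, 'max_iter': 500}
print(svm_orig.best_score_)            # mean cross-validated accuracy of the best setting
best_model = svm_orig.best_estimator_  # the refitted LinearSVC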
from sklearn import svm, metrics
from sklearn.model_selection import train_test_split
import pandas as pd
import os

# Read the data
csv = pd.read_csv("data.csv")

# Choose feature columns and the label column
csv_data = csv[["temperature", "humidity"]]
csv_label = csv["label"]

# Split the data into train and test sets
data_train, data_test, label_train, label_test = \
    train_test_split(csv_data, csv_label)

# Train the model
clf = svm.LinearSVC()
clf.fit(data_train, label_train)

# Predict on the test set
predict = clf.predict(data_test)

# Evaluate
ac_score = metrics.accuracy_score(label_test, predict)
# cl_report = metrics.classification_report(label_test, predict)
print("Model accuracy =", ac_score)

# Get test_set.csv from InfluxDB
os.system(
    'influx -database tstest -format csv -execute \'select * from table03\' > test_set.csv'
)
print("Finished querying")
# annot=True shows the value in each cell
sns.heatmap(corr, annot=True)
plt.show()

# Feature selection
# features_remain = ['radius_mean', 'texture_mean', 'smoothness_mean', 'compactness_mean', 'symmetry_mean', 'fractal_dimension_mean']
features_remain = data.columns[1:31]
print(features_remain)
print('-' * 100)

# Hold out 30% of the data as the test set; the rest is the training set
train, test = train_test_split(data, test_size=0.3)

# Extract the selected features as training and test data
train_X = train[features_remain]
train_y = train['diagnosis']
test_X = test[features_remain]
test_y = test['diagnosis']

# Z-score standardization: each feature gets zero mean and unit variance
ss = StandardScaler()
train_X = ss.fit_transform(train_X)
test_X = ss.transform(test_X)

# Create the SVM classifier
model = svm.LinearSVC()
# Train on the training set
model.fit(train_X, train_y)
# Predict on the test set
prediction = model.predict(test_X)
print('Accuracy: ', metrics.accuracy_score(test_y, prediction))
np.save(
    '/neurospin/brainomics/2016_classif_hallu_fmri/unsupervised_fmri/clustering_only_hallu/cluster_randomB/subject_clusterB.npy',
    subject)
np.save(
    '/neurospin/brainomics/2016_classif_hallu_fmri/unsupervised_fmri/clustering_only_hallu/cluster_randomB/y_clusterB.npy',
    y)

# SVM & leave-one-subject-out - no feature selection - WITH IMA samples
###########################################################################
n = 0
list_predict = list()
list_true = list()
coef = np.zeros((23, 63966))
#coef = np.zeros((24, 8028))
# class_weight='auto' was deprecated in scikit-learn; 'balanced' is the current spelling.
clf = svm.LinearSVC(C=1e-3, fit_intercept=True, class_weight='balanced')
for i in range(1, 24):
    test_bool = (subject == i)
    train_bool = (subject != i)
    Xtest = T[test_bool, :]
    ytest = y[test_bool]
    Xtrain = np.vstack((T_IMA_diff, T[train_bool, :]))
    ytrain = np.hstack((y_IMA, y[train_bool]))
    list_true.append(ytest.ravel())
    scaler = preprocessing.StandardScaler().fit(Xtrain)
    Xtrain = scaler.transform(Xtrain)
    Xtest = scaler.transform(Xtest)
    clf.fit(Xtrain, ytrain.ravel())
    coef[n, :] = clf.coef_
    pred = clf.predict(Xtest)
def __init__(self, parameters=None):
    # Avoid a mutable default argument; fall back to an empty dict.
    if parameters is None:
        parameters = {}
    self.weight = svm.LinearSVC()
    self.params = {'regwgt': 0.0}
    self.reset(parameters)
# Get the score of the model
score = logisticRegr.score(X_test, y_test)
# achieves a score of 0.989090

## LINEAR DISCRIMINANT ANALYSIS
linearDA = LinearDiscriminantAnalysis()
# Fit the linear discriminant model
linearDA.fit(X_train, y_train)
# Make predictions
lda_predictions = linearDA.predict(X_test)
# Get the score of the model
score_lda = linearDA.score(X_test, y_test)
# achieves a score of 0.974545

## SUPPORT VECTOR MACHINE
supportVecMach = svm.LinearSVC()
# Fit the support vector machine
supportVecMach.fit(X_train, y_train)
# Make predictions
svm_predictions = supportVecMach.predict(X_test)
# Get the score of the model
score_svm = supportVecMach.score(X_test, y_test)
# achieves a score of 0.989009

## DECISION TREE
decisionTree = tree.DecisionTreeClassifier()
# Fit the decision tree
decisionTree.fit(X_train, y_train)
# Make predictions
tree_predictions = decisionTree.predict(X_test)
# Get the score of the model
plt.plot([0, 1], [0, 1], 'k--', lw=lw)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve of Naive Bayes Classifier')
plt.legend(loc="lower right")
plt.show()

# #### In conclusion, the Naïve Bayes classifier works well for price range 1

# ##### b) Support Vector Machine (SVM)

# SVM
svm_clf = svm.LinearSVC(C=5.0, max_iter=10000)
svm_clf.fit(train_x, train_y)
pred_y = svm_clf.predict(test_x)

# Calculate per-class accuracy
accuracy_svm_clf = []
class_correct = list(0. for i in range(num_classes))
class_total = list(0. for i in range(num_classes))
for i in range(len(test_y)):
    label = test_y[i]
    class_correct[label] += (test_y[i] == pred_y[i])
    class_total[label] += 1
for i in range(num_classes):
    accuracy_svm_clf.append(
        np.round(100 * class_correct[i] / class_total[i], 2))
def getData(brand_num):
    modified_data = shapeCsv(brand_num, True)
    # Number of rows
    count_m = len(modified_data)
    # Drop the last day's row
    successive_data = np.delete(modified_data, count_m - 1, axis=0)
    # Min-max normalization
    ms = MinMaxScaler()
    ms.fit(successive_data)
    successive_data = ms.transform(successive_data)
    # Standardization
    sc = StandardScaler()
    sc.fit(successive_data)
    successive_data = sc.transform(successive_data)
    # List of target values: price rise = 1, price fall = 0
    answers = []
    for i in range(1, count_m):
        # Store 1 if the rate of change is positive, 0 otherwise
        if modified_data[i, 2] > 0:
            answers.append(1)
        else:
            answers.append(0)
    # Split the data (80% for training, 20% for testing)
    X_train, X_test, y_train, y_test = train_test_split(
        successive_data, answers, test_size=0.2, random_state=1)
    parameters = {'C': [1, 3, 5], 'loss': ('hinge', 'squared_hinge')}
    # Run a grid search
    clf = GridSearchCV(svm.LinearSVC(), parameters)
    clf.fit(X_train, y_train)
    # Get the grid-search result (best parameters)
    GS_C, GS_loss = clf.best_params_.values()
    # Retrain with the best parameters
    clf = svm.LinearSVC(loss=GS_loss, C=GS_C, random_state=1)
    clf.fit(X_train, y_train)
    # Predict from the data up to 2/7
    target_data = shapeCsv(brand_num, False)
    # Reuse the scalers fitted above instead of refitting on the prediction data
    # (the original refit `ms` on target_data and `sc` on successive_data, which
    # leaks and mismatches the training transform).
    target_data = ms.transform(target_data)
    target_data = sc.transform(target_data)
    target_len = len(target_data)
    target_predict = clf.predict(target_data)
    # Return the prediction for 2/8 onward
    return target_predict[target_len - 1]
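# A pipeline avoids the refit/leakage pitfall above by fitting the scalers on the
# training split only and reusing them at predict time. A minimal sketch, assuming
# the same X_train/X_test/y_train/y_test split as in getData():
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.svm import LinearSVC

pipe = make_pipeline(MinMaxScaler(), StandardScaler(), LinearSVC(random_state=1))
pipe.fit(X_train, y_train)
print(pipe.score(X_test, y_test))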
X_devel = pd.read_csv(features_path + task_name + '.' + i + '.devel.csv',
                      sep=sep, header=header,
                      usecols=range(ind_off, num_feat + ind_off),
                      dtype=np.float32)
# X_test = pd.read_csv(features_path + task_name + '.' + x + '.test.csv', sep=sep, header=header, usecols=range(ind_off,num_feat+ind_off), dtype=np.float32).values
X_train_fused = pd.concat((X_train_fused, X_train), axis=1)
X_devel_fused = pd.concat((X_devel_fused, X_devel), axis=1)

# Feature normalisation
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train_fused)
X_devel = scaler.transform(X_devel_fused)

# Train SVM models with different complexities and evaluate
uar_scores = []
print(f'current feature set is: {feat_fusion_4[i]}')
for comp in complexities:
    print('\nComplexity {0:.6f}'.format(comp))
    clf = svm.LinearSVC(C=comp, random_state=0, max_iter=100000)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_devel)
    uar_scores.append(
        recall_score(y_devel, y_pred, labels=classes, average='macro'))
    print('UAR on Devel {0:.1f}'.format(uar_scores[-1] * 100))
    if show_confusion:
        print('Confusion matrix (Devel):')
        print(classes)
        print(confusion_matrix(y_devel, y_pred, labels=classes))

# Train the SVM on the whole training data with the optimum complexity and get predictions on test data
optimum_complexity = complexities[np.argmax(uar_scores)]
print('\nOptimum complexity: {0:.6f}, maximum UAR on Devel {1:.1f}\n'.format(
    optimum_complexity, np.max(uar_scores) * 100))
UAR.append(np.max(uar_scores) * 100)  # append a scalar; extend() would fail on a float
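# The comment above promises a final model at the optimum complexity, but that
# step is not shown. A minimal sketch under the names already in scope (refitting
# on train + devel is a common ComParE-style protocol, assumed rather than taken
# from the original):
X_traindevel = np.concatenate((X_train, X_devel))
y_traindevel = np.concatenate((y_train, y_devel))
clf = svm.LinearSVC(C=optimum_complexity, random_state=0, max_iter=100000)
clf.fit(X_traindevel, y_traindevel)
# y_test_pred = clf.predict(X_test)  # once the commented-out X_test block is enabled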
labelIdx = 2
import handleClassLabels
print "Class Label Vector Y Extraction Started"
YLabels = handleClassLabels.extractClassLabels(filename, labelIdx)
print "Class Label Vector Y of size ", len(YLabels), " extracted"

# Set up the scaler for standardisation
from sklearn import preprocessing
scaler = preprocessing.StandardScaler()

# Train the SVM
from sklearn import svm
from sklearn import linear_model
print "Declaring SVM"
#clf = svm.LinearSVC()  # linearsvc1
# Note: class_weight='auto' is the pre-0.17 scikit-learn spelling of 'balanced'.
clf = svm.LinearSVC(C=1000.0, class_weight='auto', penalty='l1', dual=False)  # linearsvc2
#clf = svm.SVC(cache_size=1000, class_weight='auto', kernel='poly')  # predicts everything as POSITIVE :(
#clf = linear_model.SGDClassifier()  # not tried yet
print "standardising training data"
XFeatures = scaler.fit_transform(XFeatures, YLabels)
print "Fitting Data To SVM"
clf.fit(XFeatures, YLabels)
print "SVM trained"

# Save the trained classifier
from sklearn.externals import joblib
print "Saving SVM"
fileToSave = "UnigramBigramSVMClassifier.joblib.pkl"
_ = joblib.dump(clf, fileToSave, compress=9)
print "Classifier SAVED!"
insertSql(sql)

""" SVM """
clfSVC = svm.SVC()
clfSVC.fit(X_train, y_train)
predict_values = clfSVC.predict(X_test)
# Note: r2_score is a regression metric; for classification accuracy use metrics.accuracy_score.
svm_score = r2_score(y_test, predict_values)
print "R^2 score of SVM", svm_score
sql = "INSERT INTO earthquakefour(Name,pydata) VALUES ('svm_score','" + str(svm_score) + "')"
"""print dt_score"""
insertSql(sql)

""" svm.LinearSVC() """
clfLSVC = svm.LinearSVC()
clfLSVC.fit(X_train, y_train)
predict_values = clfLSVC.predict(X_test)
svmlc_score = r2_score(y_test, predict_values)
print "R^2 score of LinearSVC", svmlc_score
sql = "INSERT INTO earthquakefour(Name,pydata) VALUES ('svmlc_score','" + str(svmlc_score) + "')"
"""print dt_score"""
insertSql(sql)

""" naive bayes
def svm_with_rho_squared(X_train, Y_train, X_test, Y_test,
                         upper_params_norm_sq, use_bias, weight_decay=None):
    """
    Train a Support Vector Machine

    Trains an SVM whose parameters have a squared norm roughly equal to
    (and no larger than) upper_params_norm_sq. It works by binary search
    on the weight decay.

    Parameters
    ----------
    X_train : np.ndarray of shape (instances, dimensions)
        Input training features
    Y_train : np.ndarray of shape (instances,)
        Input training labels
    X_test : np.ndarray of shape (instances, dimensions)
        Input testing features
    Y_test : np.ndarray of shape (instances,)
        Input testing labels
    upper_params_norm_sq : float or None
        Target upper bound on the squared norm of params + bias
    use_bias : bool
        Whether to fit an intercept
    weight_decay : float, optional
        Initial guess for the weight decay (sets the binary-search bounds)

    Returns
    -------
    train_loss : float
        Training loss
    train_acc : float
        Training accuracy
    test_loss : float
        Testing loss
    test_acc : float
        Testing accuracy
    params_norm_sq : float
        Squared norm of params + bias of the final model
    weight_decay : float
        Weight decay found by the binary search
    params : np.ndarray of shape (dimensions,)
        Fit coefficients
    bias : float
        Fit intercept
    svm_model : sklearn.svm.LinearSVC
        Trained Support Vector Machine model
    """
    rho_sq_tol = 0.01
    params_norm_sq = None

    if weight_decay is None:
        lower_wd_bound = 0.001
        upper_wd_bound = 256.0
    else:
        lower_wd_bound = 0.001
        upper_wd_bound = 2 * weight_decay - lower_wd_bound
        if upper_wd_bound < lower_wd_bound:
            upper_wd_bound = lower_wd_bound

    lower_weight_decay = lower_wd_bound
    upper_weight_decay = upper_wd_bound
    weight_decay = (upper_weight_decay + lower_weight_decay) / 2

    while ((params_norm_sq is None) or
           (upper_params_norm_sq > params_norm_sq) or
           (np.abs(upper_params_norm_sq - params_norm_sq) > rho_sq_tol)):

        print('Trying weight_decay %s..' % weight_decay)

        C = 1.0 / (X_train.shape[0] * weight_decay)
        svm_model = svm.LinearSVC(
            C=C,
            tol=1e-6,
            loss='hinge',
            fit_intercept=use_bias,
            random_state=24,
            max_iter=100000,
            verbose=True)
        svm_model.fit(X_train, Y_train)
        params = np.reshape(svm_model.coef_, -1)
        bias = svm_model.intercept_[0]
        params_norm_sq = np.linalg.norm(params)**2 + bias**2

        if upper_params_norm_sq is None:
            break

        print('Current params norm sq = %s. Target = %s.' % (params_norm_sq, upper_params_norm_sq))

        # Current params are too small; we need to make them bigger,
        # so reduce weight_decay.
        if upper_params_norm_sq > params_norm_sq:
            upper_weight_decay = weight_decay
            # If we are too close to the lower bound, give up.
            if weight_decay < lower_wd_bound + 1e-5:
                print('Too close to lower bound, breaking')
                break
        # Current params are too big; we need to make them smaller,
        # so increase weight_decay.
        else:
            lower_weight_decay = weight_decay
            # If we are already too close to the upper bound, bump up the upper bound.
            if weight_decay > upper_wd_bound - 1e-5:
                upper_wd_bound *= 2
                upper_weight_decay *= 2

        if ((upper_params_norm_sq > params_norm_sq) or
                (np.abs(upper_params_norm_sq - params_norm_sq) > rho_sq_tol)):
            weight_decay = (upper_weight_decay + lower_weight_decay) / 2

    train_loss = hinge_loss(params, bias, X_train, Y_train)
    test_loss = hinge_loss(params, bias, X_test, Y_test)
    train_acc = svm_model.score(X_train, Y_train)
    test_acc = svm_model.score(X_test, Y_test)

    print('  Train loss             : ', train_loss)
    print('  Train acc              : ', train_acc)
    print('  Test loss              : ', test_loss)
    print('  Test acc               : ', test_acc)
    print('  Sq norm of params+bias : ', params_norm_sq)
    print('\n')

    return train_loss, train_acc, test_loss, test_acc, params_norm_sq, weight_decay, \
        params, bias, svm_model
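# A minimal usage sketch with synthetic data (values are illustrative; assumes
# numpy as np, sklearn's svm, and the hinge_loss helper used above are in scope):
rng = np.random.RandomState(0)
X = rng.randn(200, 10)
y = np.where(X[:, 0] + 0.1 * rng.randn(200) > 0, 1, -1)  # labels in {-1, +1}

out = svm_with_rho_squared(X[:150], y[:150], X[150:], y[150:],
                           upper_params_norm_sq=4.0, use_bias=True)
train_loss, train_acc, test_loss, test_acc, rho_sq, wd, params, bias, model = out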
from numpy import array

bestKvalue = 0
bestKValueAcc = 0
highestKAccuracy = 0
highestk = 0
for p in range(1, 96, 10):
    NUM_OF_ITERATIONS = 5000
    K = p  # number of points for uniform crossover
    print("K Value: " + str(K))
    errorRates = []
    accuracies = []
    labels = []
    lsvm = svm.LinearSVC()

    # Retrieve feature vectors
    featureVectors = FileUtil.createFeatureVectors(
        "../../Feature Vectors/outputNormalizedCAS.txt")

    # Retrieve a training set from a random population of 25
    originalTrainingSet = ElitistGeneticAlgorithm.determineStartingPopulation(
        featureVectors)

    # Separate labels and data
    currentDataSet, labels = ElitistGeneticAlgorithm.separateLabels(
        originalTrainingSet)

    # Create the initial population
    population = ElitistGeneticAlgorithm.createIndividuals()
    neighbor_count_target, tfidf_cos
]).T
print(training_features)

# Scale the features
training_features = preprocessing.scale(training_features)

# Convert labels into integers, then into a column array
labels = [int(element[2]) for element in training_set]
labels = list(labels)
labels_array = np.array(labels)

print("evaluating")

# Evaluation
kf = KFold(len(training_set), n_folds=10)
sumf1 = 0
for train_index, test_index in kf:
    X_train, X_test = training_features[train_index], training_features[test_index]
    y_train, y_test = labels_array[train_index], labels_array[test_index]
    # Initialize a basic SVM
    classifier = svm.LinearSVC()
    # Train
    classifier.fit(X_train, y_train)
    pred = classifier.predict(X_test)
    sumf1 += f1_score(y_test, pred)  # y_true first, then y_pred
print("\n\n")
print(sumf1 / 10.0)
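# The manual fold loop can be replaced by cross_val_score. A minimal sketch with
# the current scikit-learn API (the KFold(n, n_folds=...) call above is the
# pre-0.18 signature); assumes binary labels for the 'f1' scorer:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(svm.LinearSVC(), training_features, labels_array,
                         cv=10, scoring='f1')
print(scores.mean())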
    if zoom_out[i] == 1:
        count_7 = count_7 + 1

trainning_data = np.column_stack((gx, gy, ax, ay, az))
#print(trainning_data)
print(count)
print(count_1)
print(count_2)
print(count_3)
print(count_4)
print(count_5)
print('zoom in ', count_6)
print('zoom out', count_7)
print(len(trainning_data))
print(len(gx))

# One binary LinearSVC per gesture label
clf_0 = svm.LinearSVC(max_iter=100000)
clf_0.fit(trainning_data, scoll_down)
clf_1 = svm.LinearSVC(max_iter=100000)
clf_1.fit(trainning_data, scoll_up)
clf_2 = svm.LinearSVC(max_iter=100000)
clf_2.fit(trainning_data, zoom_in)
clf_3 = svm.LinearSVC(max_iter=100000)
clf_3.fit(trainning_data, zoom_out)
clf_4 = svm.NuSVC(nu=0.1)
clf_4.fit(trainning_data, scoll_down)
#print(clf.decision_function(trainning_data))
        thisPCA = PCA(n_components=i)
        pcaTrainArr.append(thisPCA.fit_transform(trainData))
        pcaTestArr.append(thisPCA.transform(testData))
    return (pcaTrainArr, pcaTestArr)

# 0. Calculate the PCA and LDA reductions of the dataset
pcaIterArr = np.arange(70, 171, 20)
ldaIterArr = np.arange(3, 10, 1)
ldaTrainArr, ldaTestArr = LDAreduct(ldaIterArr)
pcaTrainArr, pcaTestArr = PCAreduct(pcaIterArr)

# Section 1: Use the different SVMs without any dimension reduction and get the results
# 1. Use the linear SVM without any dimension reduction
thisSVM = svm.LinearSVC()
thisSVM.fit(trainData, trainLabels)
svmRaw = thisSVM.predict(testData)
print("After using Linear SVM for classification, we have: ")
printSummary(svmRaw, testLabels)

# 2. Use the polynomial kernel SVM without any dimension reduction
thisSVM = svm.SVC(kernel='poly')
thisSVM.fit(trainData, trainLabels)
svmRaw = thisSVM.predict(testData)
print("After using poly kernel SVM for classification, we have: ")
printSummary(svmRaw, testLabels)

# 3. Use the RBF kernel SVM without any dimension reduction
thisSVM = svm.SVC(kernel='rbf')
thisSVM.fit(trainData, trainLabels)
    },
    "sgd": {
        "model_name": "SGD",
        "model_package": "sklearn.linear_model",
        # The original listed linear_model.RandomForestClassifier here, which does
        # not exist; SGDClassifier matches this entry's name and package.
        "model": linear_model.SGDClassifier(n_jobs=-1, verbose=2),
        "param_grid": {}
    },
    "liner_svc": {
        "model_name": "LinearSVC",
        "model_package": "sklearn.svm",
        "model": svm.LinearSVC(),
        # LinearSVC accepts neither 'gamma' nor 'kernel'; only C is tunable here.
        "param_grid": {
            "C": [0.1, 1, 10, 100, 1000],
        }
    },
    "svc": {
        "model_name": "SVM",
        "model_package": "sklearn.svm",
        "model": svm.SVC(),
        "param_grid": {
            "C": [0.1, 1, 10, 100, 1000],
            "gamma": [5, 1, 0.1, 0.01, 0.001, 0.0001],
            "kernel": ["rbf", 'poly', 'linear', 'sigmoid'],
        }
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn import svm
import numpy as np

iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(iris['data'],
                                                    iris['target'],
                                                    random_state=0)

C = 1.0  # SVM regularization parameter

# LinearSVC (linear kernel)
lin_svc = svm.LinearSVC(C=C).fit(X_train, y_train)
y_pred = lin_svc.predict(X_test)
print(y_pred)
print(y_test)
classifier_score = np.mean(y_pred == y_test)
print(classifier_score)
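# Equivalently, the estimator's built-in scorer computes the same mean accuracy:
print(lin_svc.score(X_test, y_test))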
def train(self, inputdir, cache, clusters, modelout):
    # First, we want to train the classifier
    training_gold = open(inputdir + '/training.gold.tsv')
    training_tokens = open(inputdir + '/training.tokens')
    dev_gold = open(inputdir + '/dev.gold.tsv')
    dev_tokens = open(inputdir + '/dev.tokens')
    test_input = open(inputdir + '/test.input.tsv')
    test_tokens = open(inputdir + '/test.tokens')

    gold_lines = [line.strip() for line in training_gold]
    token_lines = [line.strip() for line in training_tokens]
    gold_lines += [line.strip() for line in dev_gold]
    token_lines += [line.strip() for line in dev_tokens]
    test_input_lines = [line for line in test_input]
    test_token_lines = [line.strip() for line in test_tokens]

    assert (len(gold_lines) == len(token_lines))
    print "Loaded %s training examples." % len(gold_lines)

    label_to_int = {
        '"positive"': 0,
        '"neutral"': 1,
        '"objective-OR-neutral"': 1,
        '"negative"': 2
    }
    int_to_label = {0: 'positive', 1: 'neutral', 2: 'negative'}

    training_positive = []
    training_negative = []
    training_neutral = []

    training_corpus = map(lambda x: x.split('\t'), token_lines)
    word_ngrams, nonc_ngrams, char_ngrams = self._corpus_ngrams(training_corpus)
    print "Generated ngram encodings for training corpus."
    print "Contains %s @ mentions." % len(
        filter(lambda x: len(x) == 1 and x[0][0] == '@', word_ngrams.keys()))
    #print "Contains %s used only once." % len(filter(lambda x: ngram_counts[x] == 1, word_ngrams.keys()))
    print "Contains %s URLs." % len(
        filter(lambda x: len(x) == 1 and x[0][:4] == 'http', word_ngrams.keys()))

    lexicons = self._load_lexicons(cache)
    print "Loaded the lexicons."
    w2c, c2w, cids = self._load_clusters(clusters)
    print "Loaded the clusters."

    training_features = []
    training_classes = []
    for gold_line, tokenized_line in zip(gold_lines, token_lines):
        _, _, label, _ = gold_line.split('\t')
        tweet = tokenized_line.split('\t')[0]
        features = self.generate_features(tweet, w2c, cids, word_ngrams,
                                          nonc_ngrams, char_ngrams, lexicons)
        training_features.append(features)
        training_classes.append(label_to_int[label])
        if len(training_features) % 1000 == 0:
            print "Loaded %s feature vectors." % len(training_features)

    test_features = []
    for tokenized_line in test_token_lines:
        tweet = tokenized_line.split('\t')[3]
        features = self.generate_features(tweet, w2c, cids, word_ngrams,
                                          nonc_ngrams, char_ngrams, lexicons)
        test_features.append(features)

    classifier = svm.LinearSVC(C=0.005)
    print "Created classifier. Training..."
    classifier.fit(training_features, training_classes)
    print "Trained classifier."

    print "Predicting %s test cases." % len(test_features)
    test_predictions = classifier.predict(test_features)
    print "Finished prediction. Outputting now."
    with open('test_predictions.txt', 'w') as fout:
        for (prediction, line) in zip(test_predictions, test_input_lines):
            col1, col2, _, tweet = line.split('\t')
            label = int_to_label[prediction]
            fout.write('%s\t%s\t%s\t%s' % (col1, col2, label, tweet))
    print "Done outputting predictions."

    print "Saving model..."
    with open(modelout, 'wb') as savefile:
        model = {
            'label_to_int': label_to_int,
            'int_to_label': int_to_label,
            'word_ngrams': word_ngrams,
            'nonc_ngrams': nonc_ngrams,
            'char_ngrams': char_ngrams,
            'lexicons': lexicons,
            'w2c': w2c,
            'c2w': c2w,
            'cids': cids,
            'classifier': classifier
        }
        pickle.dump(model, savefile)
def mymain(dataset):
    results = {}  # renamed from `dict`, which shadowed the builtin
    np.random.seed(1337)
    #unigrams = utils.top_n_words(FREQ_DIST_FILE, UNIGRAM_SIZE)
    if USE_BIGRAMS:
        bigrams = utils.top_n_bigrams(BI_FREQ_DIST_FILE, BIGRAM_SIZE)
        #bigrams = utils.top_n_bigrams(dataset, BIGRAM_SIZE)
    tweets = process_tweets(TRAIN_PROCESSED_FILE, test_file=False)
    if TRAIN:
        train_tweets, val_tweets = utils.split_data(tweets)
    else:
        random.shuffle(tweets)
        train_tweets = tweets
    del tweets
    print('Extracting features & training batches')
    clf = svm.LinearSVC(C=0.1)
    batch_size = len(train_tweets)
    i = 1
    n_train_batches = int(np.ceil(len(train_tweets) / float(batch_size)))
    for training_set_X, training_set_y in extract_features(train_tweets,
                                                           test_file=False,
                                                           feat_type=FEAT_TYPE,
                                                           batch_size=batch_size):
        utils.write_status(i, n_train_batches)
        i += 1
        if FEAT_TYPE == 'frequency':
            tfidf = apply_tf_idf(training_set_X)
            training_set_X = tfidf.transform(training_set_X)
        clf.fit(training_set_X, training_set_y)
    print('\n')
    print('Testing')
    if TRAIN:
        correct, total = 0, len(val_tweets)
        i = 1
        batch_size = len(val_tweets)
        n_val_batches = int(np.ceil(len(val_tweets) / float(batch_size)))
        for val_set_X, val_set_y in extract_features(val_tweets,
                                                     test_file=False,
                                                     feat_type=FEAT_TYPE,
                                                     batch_size=batch_size):
            if FEAT_TYPE == 'frequency':
                val_set_X = tfidf.transform(val_set_X)
            prediction = clf.predict(val_set_X)
            correct += np.sum(prediction == val_set_y)
            utils.write_status(i, n_val_batches)
            i += 1
        results.update({'dataset': dataset})
        results.update({'correct': correct})
        results.update({'total': total})
        rslt = correct * 100. / total
        results.update({'result': round(rslt, 2)})
        #print('Dictionary Result ', results)
        print('\nCorrect: %d/%d = %.4f %%' %
              (correct, total, correct * 100. / total))
        #return results
    else:
        del train_tweets
        #test_tweets = process_tweets(TEST_PROCESSED_FILE, test_file=True)
        test_tweets = process_tweets(dataset, test_file=True)
        n_test_batches = int(np.ceil(len(test_tweets) / float(batch_size)))
        predictions = np.array([])
        print('Predicting batches')
        i = 1
        for test_set_X, _ in extract_features(test_tweets, test_file=True,
                                              feat_type=FEAT_TYPE):
            if FEAT_TYPE == 'frequency':
                test_set_X = tfidf.transform(test_set_X)
            prediction = clf.predict(test_set_X)
            predictions = np.concatenate((predictions, prediction))
            utils.write_status(i, n_test_batches)
            i += 1
        predictions = [(str(j), int(predictions[j]))
                       for j in range(len(test_tweets))]
        utils.save_results_to_csv(predictions, 'svm.csv')
        print('\nSaved to svm.csv')
    return results
    linear_model.PassiveAggressiveClassifier(),
    linear_model.RidgeClassifierCV(),
    linear_model.SGDClassifier(),
    linear_model.Perceptron(),

    # Naive Bayes
    naive_bayes.BernoulliNB(),
    naive_bayes.GaussianNB(),

    # Nearest Neighbors
    neighbors.KNeighborsClassifier(),

    # SVM
    svm.SVC(probability=True),
    svm.NuSVC(probability=True),
    svm.LinearSVC(),

    # Trees
    tree.DecisionTreeClassifier(),
    tree.ExtraTreeClassifier(),

    # Discriminant Analysis
    discriminant_analysis.LinearDiscriminantAnalysis(),
    discriminant_analysis.QuadraticDiscriminantAnalysis(),

    # xgboost: http://xgboost.readthedocs.io/en/latest/model.html
    XGBClassifier()
]

# Note: this is an alternative to train_test_split
cv_split = model_selection.ShuffleSplit(
    n_splits=10, test_size=.3, train_size=.6, random_state=0
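# A minimal sketch of how such a model list is typically consumed; `models` stands
# in for the (unnamed) list above, and X, y are assumed to be the feature matrix
# and labels:
for clf in models:
    cv_results = model_selection.cross_validate(clf, X, y, cv=cv_split)
    print(clf.__class__.__name__, cv_results['test_score'].mean())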
# -*- coding: utf-8 -*-
from sklearn import svm, datasets, neighbors

iris = datasets.load_iris()
svc = svm.LinearSVC()
svc.fit(iris.data, iris.target)  # learn from the data
print(svc.predict([[5.0, 3.0, 5.0, 2.0]]))

knn = neighbors.KNeighborsClassifier()
# Learn from the existing data
knn.fit(iris.data, iris.target)
# Use the classification model to predict (label) unseen data
print(knn.predict([[5.0, 3.0, 5.0, 2.0]]))
def train_main(self):
    data = pd.DataFrame()
    model_dict = dict()
    train_data_path = self.train_data_path
    for i in train_data_path:
        data_tmp = pd.read_excel(i, header=0)
        data_tmp.columns = ["pid", "label", "context"]
        data = pd.concat([data, data_tmp])
    data = shuffle(data)
    data["context_ngram"] = data[["context"]].applymap(ngram_process)
    context = data["context_ngram"].values
    label = data[["label"]].applymap(fun_map).values

    data_test = pd.read_excel(self.test_data_path, header=0)
    data_test.columns = ["pid", "label", "context"]
    data_test["context_ngram"] = data_test[["context"]].applymap(ngram_process)
    test_context = data_test["context_ngram"].values
    test_label = data_test[["label"]].applymap(fun_map).values

    # TF-IDF
    tf_idf = TfidfVectorizer(analyzer=fun_1, min_df=50)
    tf_idf.fit(context)
    model_dict["model_1"] = pickle.dumps(tf_idf)
    feature_names = tf_idf.get_feature_names()
    model_dict["feature_names"] = pickle.dumps(feature_names)
    print("feature num", len(feature_names))
    x_train = tf_idf.transform(context)
    x_test = tf_idf.transform(test_context)

    # Chi-squared feature selection
    model = SelectKBest(chi2, k="all")
    model.fit(x_train, label)
    model_dict["model_2"] = pickle.dumps(model)
    x_train = model.transform(x_train)
    x_test = model.transform(x_test)

    classify = svm.LinearSVC(C=0.9)
    # param_grid = {'C': [1, 10, 100, 1000], 'gamma': [1, 0.1, 0.001, 0.0001], 'kernel': ['linear', 'rbf']}
    # grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=2)
    # grid = xgb.XGBClassifier()
    # print(grid.best_params_)
    # Wrap LinearSVC so the model exposes predict_proba
    classify = calibration.CalibratedClassifierCV(classify, cv=10)
    classify.fit(x_train, label)
    y_predict = classify.predict(x_test)
    print(metrics.classification_report(test_label, y_predict))
    print("accuracy:", metrics.accuracy_score(test_label, y_predict))
    model_dict["model_3"] = pickle.dumps(classify)
    with open(self.model_path, mode='wb') as fm:
        joblib.dump(model_dict, fm)
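# CalibratedClassifierCV is what makes probability output possible here, since
# LinearSVC itself has no predict_proba. A minimal sketch, assuming the fitted
# `classify` and `x_test` above:
proba = classify.predict_proba(x_test)  # shape (n_samples, n_classes)
print(proba[:5])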
    pass
elif FEATURE_EXTRACTION == 'pca':
    t0 = time()
    pca = decomposition.PCA(n_components=100)
    train_X = sp.sparse.coo_matrix(pca.fit_transform(train_X.todense()))
    test_X = sp.sparse.coo_matrix(pca.transform(test_X.todense()))
    print 'pca done in %0.3f' % (time() - t0)
elif FEATURE_EXTRACTION == 'ica':
    t0 = time()
    ica = decomposition.FastICA(n_components=100)
    train_X = sp.sparse.coo_matrix(ica.fit_transform(train_X.todense()))
    test_X = sp.sparse.coo_matrix(ica.transform(test_X.todense()))
    print 'ica done in %0.3f' % (time() - t0)
elif FEATURE_EXTRACTION == 'l1-svc':
    t0 = time()
    l1svc = svm.LinearSVC(C=1, penalty='l1', dual=False)
    l1svc.fit(train_X, train_y)
    train_X = l1svc.transform(train_X)
    test_X = l1svc.transform(test_X)
    print 'l1-svc feature selection done in %0.3f' % (time() - t0)
else:
    raise RuntimeError('unknown feature extraction method')

# <codecell>

## Define feature names from the tf-idf vectorizer
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print train_X.shape, test_X.shape

# <codecell>
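# LinearSVC.transform was deprecated in scikit-learn 0.17 and later removed; in
# current versions the same L1-based selection goes through SelectFromModel.
# A minimal sketch:
from sklearn.feature_selection import SelectFromModel
from sklearn.svm import LinearSVC

l1svc = LinearSVC(C=1, penalty='l1', dual=False).fit(train_X, train_y)
selector = SelectFromModel(l1svc, prefit=True)
train_X = selector.transform(train_X)
test_X = selector.transform(test_X)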
def feat_importance_firm(row_id_str, ds_id, hdfs_feat_dir, local_score_file,
                         sp_master, spark_rdd_compress, spark_driver_maxResultSize,
                         sp_exe_memory, sp_core_max,
                         zipout_dir, zipcode_dir, zip_file_name,
                         mongo_tuples,
                         training_fraction, jobname, uploadtype, description_file):

    # Zip functions from other files for the Spark workers
    zip_file_path = ml_util.ml_build_zip_file(zipout_dir, zipcode_dir, zip_file_name,
                                              prefix='zip_feature_util')
    print "INFO: zip_file_path=", zip_file_path

    # get_spark_context
    sc = ml_util.ml_get_spark_context(sp_master, spark_rdd_compress,
                                      spark_driver_maxResultSize, sp_exe_memory,
                                      sp_core_max, jobname, [zip_file_path])

    t0 = time()

    # Get the feature seq mapping from mongo
    if uploadtype == "MD5 List IN-dynamic":
        # Connect to the database to get the column list, which contains all
        # column numbers of the corresponding features
        key = "dict_dynamic"
        jstr_filter = '{"rid":' + row_id_str + ',"key":"' + key + '"}'
        jstr_proj = '{"value":1}'
        # Get the parent dataset's data
        if ds_id != row_id_str:
            jstr_filter = '{"rid":' + ds_id + ',"key":"' + key + '"}'
        doc = query_mongo.find_one_t(mongo_tuples, jstr_filter, jstr_proj)
        dic_list = doc['value']

        dic_all_columns = {}
        max_feature = 0
        # Reverse the dict {hashes: sequence number}
        for i in range(0, len(dic_list)):
            for key in dic_list[i]:
                dic_all_columns[eval(dic_list[i][key])] = key
                if eval(dic_list[i][key]) > max_feature:
                    max_feature = eval(dic_list[i][key])
        print "INFO: max_feature=", max_feature
        #print "dic_all_columns=", dic_all_columns  # fid: numb, numb

    dirFile_loc = os.path.join(hdfs_feat_dir, "metadata")
    dirFolders = sc.textFile(dirFile_loc)
    hash_Folders = dirFolders.collect()
    #print "INFO: dirFile_loc=", dirFile_loc, ", hash_Folders=", hash_Folders
    folder_list = [x.encode('UTF8') for x in hash_Folders]
    print "INFO: hdfs folder_list=", folder_list  # ['dirty/', 'clean/']

    # Source libsvm filename
    libsvm_data_file = os.path.join(hdfs_feat_dir, hdfs_file_name)
    print "INFO: libsvm_data_file=", libsvm_data_file

    # Load feature count file
    #feat_count_file = libsvm_data_file + "_feat_count"
    #feature_count = zip_feature_util.get_feature_count(sc, feat_count_file)
    #print "INFO: feature_count=", feature_count

    # Load sample RDD from text file
    #samples_rdd, feature_count = zip_feature_util.get_sample_rdd(sc, libsvm_data_file, feature_count,
    #                                                             excluded_feat_cslist=None)
    samples_rdd = sc.textFile(libsvm_data_file).cache()

    # Collect all data locally for processing
    all_data = samples_rdd.collect()
    all_list = [ln.split(' ') for ln in all_data]
    sample_count = len(all_data)

    # Label array
    #labels_list_all = [x.label for x, _ in all_data]
    #print "INFO: labels_list_all=", labels_list_all

    # Get feature seq : ngram hash mapping
    key = "dic_seq_hashes"  # {"123": "136,345"}
    jstr_filter = '{"rid":' + row_id_str + ',"key":"' + key + '"}'
    jstr_proj = '{"value":1}'
    # Get the parent dataset's data
    if ds_id != row_id_str:
        jstr_filter = '{"rid":' + ds_id + ',"key":"' + key + '"}'
    doc = query_mongo.find_one_t(mongo_tuples, jstr_filter, jstr_proj)
    dic_list = doc['value']
    dic_all_columns = dic_list
    feature_count = len(dic_list)

    # Get hash : raw string mapping
    key = "dic_hash_str"  # {"123": "openFile"}
    jstr_filter = '{"rid":' + row_id_str + ',"key":"' + key + '"}'
    jstr_proj = '{"value":1}'
    # Get the parent dataset's data
    if ds_id != row_id_str:
        jstr_filter = '{"rid":' + ds_id + ',"key":"' + key + '"}'
    doc = query_mongo.find_one_t(mongo_tuples, jstr_filter, jstr_proj)
    dic_hash_str = doc['value']

    features_training = []
    labels_training = []
    names_training = []
    row_training = []
    col_training = []
    max_feat_training = 0
    row_num_training = 0
    features_testing = []
    labels_testing = []
    names_testing = []
    row_testing = []
    col_testing = []
    max_feat_testing = 0
    row_num_testing = 0

    # Loop through the hdfs folders; TBD
    for idx, folder in enumerate(folder_list):
        print "INFO: folder=", folder
        label = folder_list.index(folder) + 1
        print 'INFO: label=', label
        #logFile_name = os.path.join(hdfs_feat_dir, folder, mtx_name_list)
        #logFile_data = os.path.join(hdfs_feat_dir, folder, mtx_libsvm)
        '''
        logNames = sc.textFile(logFile_name).cache()
        logData = sc.textFile(logFile_data).cache()
        names = logNames.collect()
        data = logData.collect()
        name_l = [x.encode('UTF8') for x in names]
        feature_l = [x.encode('UTF8') for x in data]
        name_list = [names.strip() for names in name_l]
        feature_list = [features.strip() for features in feature_l]
        '''
        feature_list = [l[2:] for l in all_list if int(l[1]) == idx]  # hash array
        name_list = [l[2] for l in all_list if int(l[1]) == idx]
        #print "feature_list=", feature_list
        #print "name_list=", name_list

        ########## data separation ##########
        id_perm = data_seperation_random(name_list)
        num_names = len(name_list)
        print 'INFO: num of samples=', num_names
        num_train = int(training_portion * num_names)
        print 'INFO: num_train = ', num_train

        ######## generate training data #########
        i = 0
        #print "INFO: generate training data"
        #print "INFO: len(id_perm)=", len(id_perm)
        while i < num_train:
            #print i, id_perm[i]
            features = feature_list[id_perm[i]]
            #features = features.strip()
            #feature_array = features.split(' ')
            feature_array = features
            labels_training.append(label)
            length = len(feature_array)
            j = 0
            while j < length:
                feature = feature_array[j]
                feat, value = feature.split(':', 2)
                row_training.append(i + row_num_training)
                col_training.append(int(feat) - 1)
                features_training.append(int(value))
                max_feat_training = max(max_feat_training, int(feat))
                j = j + 1
            i = i + 1
        row_num_training = row_num_training + num_train

        i = num_train
        ######## generate testing data #########
        while i < num_names:
            # For generating the testing data folder
            test_file_name = name_list[id_perm[i]]
            features = feature_list[id_perm[i]]
            #features = features.strip()
            #feature_array = features.split(' ')
            feature_array = features
            labels_testing.append(label)
            length = len(feature_array)
            j = 0
            while j < length:
                feature = feature_array[j]
                feat, value = feature.split(':', 2)
                row_testing.append(i - num_train + row_num_testing)
                col_testing.append(int(feat) - 1)
                features_testing.append(int(value))
                max_feat_testing = max(max_feat_testing, int(feat))
                j = j + 1
            i = i + 1
        row_num_testing = row_num_testing + (num_names - num_train)
    # end of the folder loop

    col_num = max(max_feat_training, max_feat_testing)
    if max_feat_training < col_num:
        for i in range(0, row_num_training):
            for j in range(max_feat_training, col_num):
                features_training.append(0)
                row_training.append(i)
                col_training.append(j)
    elif max_feat_testing < col_num:
        for i in range(0, row_num_testing):
            for j in range(max_feat_testing, col_num):
                features_testing.append(0)
                row_testing.append(i)
                col_testing.append(j)

    features_training = array(features_training)
    row_training = array(row_training)
    col_training = array(col_training)
    #print "row_training:", row_training
    #print "INFO: col_training:", col_training
    len_col = len(col_training)
    print "INFO: col_num:", col_num
    labels_training = array(labels_training)
    features_testing = array(features_testing)
    row_testing = array(row_testing)
    col_testing = array(col_testing)
    labels_testing = array(labels_testing)

    sparse_mtx = csc_matrix((features_training, (row_training, col_training)),
                            shape=(row_num_training, col_num))
    #print "sparse_mtx.todense(), sparse_mtx.shape=", sparse_mtx.todense(), sparse_mtx.shape
    sparse_test = csc_matrix((features_testing, (row_testing, col_testing)),
                             shape=(row_num_testing, col_num))
    #print "sparse_test.todense(), sparse_test.shape=", sparse_test.todense(), sparse_test.shape

    clf = svm.LinearSVC()
    #clf = svm.SVC(C=0.1, kernel='rbf', degree=3, gamma=0.05, coef0=0.0, shrinking=True, probability=False, tol=0.001, cache_size=200, class_weight=None, verbose=False, max_iter=-1, random_state=None)
    #clf = svm.NuSVC(nu=0.3, kernel='rbf', degree=3, gamma=0.05, coef0=0.0, shrinking=True, probability=False, tol=0.001, cache_size=200, verbose=False, max_iter=-1, random_state=None)
    #print "labels_training=", labels_training
    #print "sparse_mtx=", sparse_mtx
    clf.fit(sparse_mtx, labels_training)
    #print "INFO: model:intercept=", clf.intercept_
    #print "INFO: model:coef=", clf.coef_

    labels_pred = clf.predict(sparse_test)
    #print "labels_pred:", labels_pred
    accuracy = clf.score(sparse_test, labels_testing)
    #print "INFO: data folder=", hdfs_feat_dir
    print "INFO: accuracy=", accuracy

    #####################################################################
    ####### calculate feature importance with predicted labels #########
    #####################################################################
    AA = sparse_mtx.todense()
    BB = sparse_test.todense()
    labels_train_pred = clf.predict(sparse_mtx)
    labels_test_pred = labels_pred

    print "INFO: ======= Calculate feature importance with predicted labels =================="
    dic_importance_label = {}
    for j in range(0, col_num):  # for all features in the loop
        # New way with the sparse matrix (training part)
        curr_col_train = sparse_mtx.getcol(j)
        sum_col = curr_col_train.sum(0)
        positive_feature_number = int(sum_col.tolist()[0][0])
        labels_value = 3 - labels_train_pred
        dot_product = csr_matrix(np.array(labels_value)).dot(curr_col_train)
        sum_product = dot_product.sum(1)
        labels_positive_sum = int(sum_product.tolist()[0][0])
        sum_label_values = sum(labels_value)
        labels_negitive_sum = sum_label_values - labels_positive_sum

        # New way with the sparse matrix (testing part)
        curr_col_test = sparse_test.getcol(j)
        sum_col = curr_col_test.sum(0)
        positive_feature_number = positive_feature_number + int(sum_col.tolist()[0][0])
        labels_value = 3 - labels_test_pred
        dot_product = csr_matrix(np.array(labels_value)).dot(curr_col_test)
        sum_product = dot_product.sum(1)
        labels_positive_sum = labels_positive_sum + int(sum_product.tolist()[0][0])
        sum_label_values = sum(labels_value)
        labels_negitive_sum = labels_negitive_sum + sum_label_values - int(sum_product.tolist()[0][0])

        n_total = row_num_training + row_num_testing
        negitive_feature_number = n_total - positive_feature_number

        if positive_feature_number == 0:
            #print "feature ", j + 1, "all 0s!"
            dic_importance_label[j + 1] = -100
        elif negitive_feature_number == 0:
            #print "feature ", j + 1, "all 1s!"
            dic_importance_label[j + 1] = -200
        else:
            q_positive = float(labels_positive_sum) / positive_feature_number
            q_negitive = float(labels_negitive_sum) / negitive_feature_number
            Q = (q_positive - q_negitive) * sqrt(float(q_positive) * q_negitive / float(n_total) / float(n_total))
            dic_importance_label[j + 1] = Q

    sorted_importance = sorted(dic_importance_label.items(),
                               key=operator.itemgetter(1), reverse=True)
    print "INFO: ======= Feature Importance (FIRM score) ================"

    if os.path.exists(local_score_file):
        try:
            os.remove(local_score_file)
        except OSError, e:
            # The original referenced e.local_score_file, which OSError has no attribute for.
            print ("ERROR: %s - %s." % (local_score_file, e.strerror))
users = query_db('''select * from user''')
for u in users:
    print 'building an SVM for ' + u['username']
    uid = u['user_id']
    lib = query_db('''select * from library where user_id = ?''', [uid])
    pids = [x['paper_id'] for x in lib]  # raw pids without version
    posix = [xtoi[p] for p in pids]
    if not posix:
        continue  # empty library for this user, maybe? (break would stop all users)
    print posix
    y = np.zeros(X.shape[0])
    for p in pids:
        y[xtoi[p]] = 1
    # LinearSVC(penalty='l2', loss='squared_hinge', dual=True, tol=0.0001, C=1.0, multi_class='ovr', fit_intercept=True, intercept_scaling=1, class_weight=None, verbose=0, random_state=None, max_iter=1000)
    # Note: class_weight='auto' is the pre-0.17 scikit-learn spelling of 'balanced'.
    clf = svm.LinearSVC(class_weight='auto', verbose=True, max_iter=10000, tol=1e-6)
    clf.fit(X, y)
    s = clf.decision_function(X)
    sortix = np.argsort(-s)
    user_sim[uid] = [strip_version(meta['pids'][ix]) for ix in list(sortix)]

print 'writing user_sim.p'
pickle.dump(user_sim, open("user_sim.p", "wb"))
print
# The block below was disabled by wrapping it in a printed triple-quoted string;
# it is kept here as an inert string for reference.
print '''
# print "Test part"
# test_data = []
# for s in test_sents:
#     test_data.extend(sent2features(s))
# test_vectors = vec.transform(test_data)
# test_labels = []
# for s in test_sents:
#     test_labels.extend(sent2labels(s))

#classifier_rbf = svm.SVC(kernel='linear')
classifier_rbf = svm.LinearSVC()
print "Fitting"
classifier_rbf.fit(train_vectors, train_labels)
print "Dumping"
# save the classifier
with open('my_dumped_SVMTimexTypeclassifier.pkl', 'wb') as fid:
    pickle.dump(classifier_rbf, fid)
    pickle.dump(vec, fid)
'''

# Load the classifier again. The original assigned the pickle to `gnb_loaded` but
# then called predict on `classifier_rbf`, which is only defined inside the
# disabled string above, so the loaded object is used directly here.
with open('my_dumped_classifier.pkl', 'rb') as fid:
    classifier_rbf = cPickle.load(fid)

prediction_rbf = classifier_rbf.predict(test_vectors)
prediction_rbf = list(prediction_rbf)
import cv2
import os
import random
import argparse
import numpy as np
from sklearn import svm

########## Variables ##########
random_seed = 42
random.seed(random_seed)
target_img_size = (32, 32)
np.random.seed(random_seed)

classifiers = {'SVM': svm.LinearSVC(random_state=random_seed)}


########## Methods ##########
def extract_hog_features(img):
    img = cv2.resize(img, target_img_size)
    win_size = (32, 32)
    cell_size = (4, 4)
    block_size_in_cells = (2, 2)
    block_size = (block_size_in_cells[1] * cell_size[1],
                  block_size_in_cells[0] * cell_size[0])
    block_stride = (cell_size[1], cell_size[0])
    nbins = 9
    hog = cv2.HOGDescriptor(win_size, block_size, block_stride, cell_size, nbins)
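    # Hedged completion (not in the original): cv2.HOGDescriptor.compute returns
    # the descriptor as a column vector; flatten it into a 1-D feature vector.
    h = hog.compute(img)
    return h.flatten()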
iris = datasets.load_iris()
# Fetch the first two features
x = iris.data[:, :2]
y = iris.target

# Step size in the mesh
h = 0.02

# SVM regularization parameter
C = 1.0
svc = svm.SVC(kernel='linear', C=C).fit(x, y)
rbf_svc = svm.SVC(kernel='rbf', gamma=0.7, C=C).fit(x, y)
poly_svc = svm.SVC(kernel='poly', degree=3, C=C).fit(x, y)
lin_svc = svm.LinearSVC(C=C).fit(x, y)

# Create a mesh to plot
x_min, x_max = x[:, 0].min() - 1, x[:, 0].max() + 1
y_min, y_max = x[:, 1].min() - 1, x[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

titles = ['SVC with linear kernel',
          'LinearSVC (linear kernel)',
          'SVC with RBF kernel',
          'SVC with polynomial (degree = 3) kernel']

for i, clf in enumerate((svc, lin_svc, rbf_svc, poly_svc)):
    # Plot the decision boundary
    plt.subplot(2, 2, i + 1)
    plt.subplots_adjust(wspace=0.4, hspace=0.4)
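    # The loop is cut off above; a plausible completion, modeled on the canonical
    # scikit-learn iris SVM plot example (assumes the names defined above):
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    plt.contourf(xx, yy, Z, cmap=plt.cm.coolwarm, alpha=0.8)
    plt.scatter(x[:, 0], x[:, 1], c=y, cmap=plt.cm.coolwarm)
    plt.title(titles[i])
plt.show()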
TRAIN = True
C = 1
MAX_ITER = 1000

if TRAIN:
    X_train = np.load(TRAIN_FEATURES_FILE)
    y_train = loadtxt(TRAIN_LABELS_FILE, dtype=float).astype(int)
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1)
    print X_train.shape, y_train.shape, X_val.shape, y_val.shape
    if CLASSIFIER == 'SVM':
        model = svm.LinearSVC(C=C, verbose=1, max_iter=MAX_ITER)
    model.fit(X_train, y_train)
    print model
    del X_train
    del y_train
    with open(MODEL_FILE, 'wb') as mf:
        pickle.dump(model, mf)
    val_preds = model.predict(X_val)
    accuracy = accuracy_score(y_val, val_preds)
    print("Val Accuracy: %.2f%%" % (accuracy * 100.0))
else:
    with open(MODEL_FILE, 'rb') as mf:
        model = pickle.load(mf)
    X_test = np.load(TEST_FEATURES_FILE)
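    # Hedged sketch of the natural next step after loading the test features;
    # the output filename is illustrative, not from the original:
    test_preds = model.predict(X_test)
    np.savetxt('test_preds.txt', test_preds, fmt='%d')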