def trainClassifier(trainData):
    print("Training Classifier...")
    pipeline = Pipeline([('svc', LinearSVC())])
    return SklearnClassifier(pipeline).train(trainData)
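# A minimal usage sketch for trainClassifier (the tiny feature dicts below are
# hypothetical; it assumes the same Pipeline/LinearSVC/SklearnClassifier
# imports used by the function):
# from nltk.classify.scikitlearn import SklearnClassifier
# from sklearn.pipeline import Pipeline
# from sklearn.svm import LinearSVC
train_data = [({'good': True, 'movie': True}, 'pos'),
              ({'bad': True, 'movie': True}, 'neg')]
clf = trainClassifier(train_data)
print(clf.classify({'good': True}))  # expected: 'pos'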
def evaluate_classifier(featx):
    negfeats = [(featx(f), 'neg') for f in word_split(negdata)]
    posfeats = [(featx(f), 'pos') for f in word_split(posdata)]
    negcutoff = int(len(negfeats) * 3 / 4)
    poscutoff = int(len(posfeats) * 3 / 4)
    trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
    #testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]
    classifierName = 'SVM'
    classifier = SklearnClassifier(LinearSVC(), sparse=False).train(trainfeats)
    newsdata = {}
    '''
    news_path = "./xa/"
    out_ = open('result.txt', 'w')
    for root, dirs, files in os.walk(news_path):
        for name in files:
            if name == ".DS_Store":
                continue
            fp = open(root + '/' + name, 'r')
            #print(name)
            date = ''
            text = []
            gotDate = False
            #print(root+'/'+name)
            for line in fp:
                if gotDate == False:
                    date = line.replace('\n', '')
                    gotDate = True
                    if date not in newsdata:
                        newsdata[date] = [0, 0]
                else:
                    if len(line.strip()) == 0:
                        gotDate = False
                        continue
                    text.append(line)
                    #print(text)
                    newsfeat = [(featx(f), date) for f in word_split(text)]
                    del text[:]
                    observed = classifier.classify(newsfeat[0][0])
                    if observed == 'neg':
                        newsdata[date][1] += 1
                        #print('------------------------------ ' + 'neg')
                    else:
                        newsdata[date][0] += 1
                        #print('------------------------------ ' + 'pos')
                        #print(root+'/'+name+': '+ 'pos')
            gotDate = False
            fp.close()
    for date in newsdata:
        #print(date+': '+str(newsdata[date][0])+', '+str(newsdata[date][1]))
        out_.write(date + '\n' + str(newsdata[date][0]) + ', ' + str(newsdata[date][1]) + '\n')
    out_.close()
    '''
    out_ = open('TEST_result.txt', 'w')
    fp = open('test_half_half.txt', 'r')
    #print(name)
    date = ''
    text = []
    gotDate = False
    #print(root+'/'+name)
    for line in fp:
        if gotDate == False:
            date = line.replace('\n', '')
            gotDate = True
            if date not in newsdata:
                newsdata[date] = [0, 0]
        else:
            if len(line.strip()) == 0:
                gotDate = False
                continue
            text.append(line)
            print(text)
            newsfeat = [(featx(f), date) for f in word_split(text)]
            del text[:]
            observed = classifier.classify(newsfeat[0][0])
            if observed == 'neg':
                newsdata[date][1] += 1
                print('------------------------------ ' + 'neg')
            else:
                newsdata[date][0] += 1
                print('------------------------------ ' + 'pos')
                #print(root+'/'+name+': '+ 'pos')
    gotDate = False
    fp.close()
    for date in newsdata:
        #print(date+': '+str(newsdata[date][0])+', '+str(newsdata[date][1]))
        out_.write(date + '\n' + str(newsdata[date][0]) + ', ' + str(newsdata[date][1]) + '\n')
    out_.close()
def main(params):
    dp = DataProvider(params)
    auth_to_ix = dp.create_author_idx()

    # Preprocess the training data
    train_docs = []
    targets = []
    model = {}

    # remove numbers
    bad_hombres = range(10)
    if params['nostop']:
        bad_hombres = bad_hombres + stopwords.words('english')
    if params['nopunct']:
        bad_hombres = bad_hombres + list(string.punctuation)
    bad_hombres = set(bad_hombres)

    all_words = Counter()
    for i, doc in enumerate(dp.data['docs']):
        no_num = re.sub(r'\d+', '', doc['text'].lower())
        curr_text = [w for w in wordpunct_tokenize(no_num) if w not in bad_hombres]
        dp.data['docs'][i]['tokenized'] = curr_text
        if doc['split'] == 'train':
            all_words.update(curr_text)

    short_vocab = {w: i for i, w in enumerate(
        [wrd for wrd in all_words if all_words[wrd] > params['vocab_threshold']])}

    docCounts_train, target_train = count(dp, short_vocab, auth_to_ix, split='train')
    bow_features_train, idf_train = bow_features(docCounts_train, params['tfidf'])
    docCounts_val, target_val = count(dp, short_vocab, auth_to_ix, split='val')
    bow_features_val, _ = bow_features(docCounts_val, params['tfidf'], idf=idf_train)

    # Do PCA?
    if params['pca'] > 0:
        pca_model = PCA(n_components=params['pca'])
        bow_features_train = pca_model.fit_transform(bow_features_train)
        print 'Explained variance is %.2f' % (sum(pca_model.explained_variance_ratio_))
        bow_features_val = pca_model.transform(bow_features_val)
        params['pca'] = bow_features_train.shape[-1]

    # Normalize the data
    bow_features_train, mean_tr, std_tr = normalize(bow_features_train)
    bow_features_val, _, _ = normalize(bow_features_val, mean_tr, std_tr)

    if params['mlp'] == False:
        if params['linearsvm']:
            # LinearSVC already implements one-vs-rest
            svm_model = LinearSVC()  # verbose=1)
            svm_model.fit(bow_features_train, target_train)
            # Time to evaluate now.
            confTr = svm_model.decision_function(bow_features_train)
            confVal = svm_model.decision_function(bow_features_val)
    else:
        params['num_output_layers'] = len(auth_to_ix)
        params['inp_size'] = params['pca']
        model = MLP_classifier(params)
        model.fit(bow_features_train, target_train, bow_features_val, target_val,
                  params['epochs'], params['lr'], params['l2'])
        confTr = model.decision_function(bow_features_train)
        confVal = model.decision_function(bow_features_val)

    mean_rank_train = np.where(
        confTr.argsort(axis=1)[:, ::-1] == target_train[:, None])[1].mean()
    topk_train = (np.where(confTr.argsort(axis=1)[:, ::-1] == target_train[:, None])[1]
                  <= params['topk']).sum() * 100. / len(target_train)
    train_accuracy = 100. * float(
        (confTr.argmax(axis=1) == target_train).sum()) / len(target_train)
    mean_rank_val = np.where(
        confVal.argsort(axis=1)[:, ::-1] == target_val[:, None])[1].mean()
    topk_val = (np.where(confVal.argsort(axis=1)[:, ::-1] == target_val[:, None])[1]
                <= params['topk']).sum() * 100. / len(target_val)
    val_accuracy = 100. * float(
        (confVal.argmax(axis=1) == target_val).sum()) / len(target_val)

    # Do the binary evaluation similar to Bagnall
    # confTr = confTr - confTr.mean(axis=1)[:,None]
    n_auths = len(auth_to_ix)
    n_train = confTr.shape[0]
    neg_auths_tr = np.random.randint(0, n_auths, n_train)
    adjusted_scores_tr = ((np.argsort(
        confTr[:, np.concatenate([target_train.astype(int), neg_auths_tr])], axis=0)
        == np.concatenate([np.arange(n_train), np.arange(n_train)])).argmax(axis=0)
        + 1) / float(n_train)
    auc_tr = roc_auc_score(
        np.concatenate([np.ones(int(n_train), dtype=int),
                        np.zeros(int(n_train), dtype=int)]),
        adjusted_scores_tr)

    n_val = confVal.shape[0]
    neg_auths_val = np.random.randint(0, n_auths, n_val)
    adjusted_scores_val = ((np.argsort(
        confVal[:, np.concatenate([target_val.astype(int), neg_auths_val])], axis=0)
        == np.concatenate([np.arange(n_val), np.arange(n_val)])).argmax(axis=0)
        + 1) / float(n_val)
    auc_val = roc_auc_score(
        np.concatenate([np.ones(int(n_val), dtype=int),
                        np.zeros(int(n_val), dtype=int)]),
        adjusted_scores_val)

    print '------------- Training set-------------------'
    print 'Accuracy is %.2f, Mean rank is %.2f / %d' % (
        train_accuracy, mean_rank_train, len(auth_to_ix))
    print 'Top-%d Accuracy is %.2f' % (params['topk'], topk_train)
    print 'Accuracy per adjusted scores %.3f' % (100. * (
        (adjusted_scores_tr[:n_train] >= 0.5).sum() +
        (adjusted_scores_tr[n_train:] < 0.5).sum()) / (2. * n_train))
    print 'AUC is %.2f' % (auc_tr)
    print '------------- Val set-------------------'
    print 'Accuracy is %.2f, Mean rank is %.2f / %d' % (
        val_accuracy, mean_rank_val, len(auth_to_ix))
    print 'Top-%d Accuracy is %.2f' % (params['topk'], topk_val)
    print 'Accuracy per adjusted scores %.3f' % (100. * (
        (adjusted_scores_val[:n_val] >= 0.5).sum() +
        (adjusted_scores_val[n_val:] < 0.5).sum()) / (2. * n_val))
    print 'AUC is %.2f' % (auc_val)
    print '--------------------------------------------------------------------------'
    print '--------------------------------------------------------------------------\n\n'
axi.axis('off')

from itertools import chain
X_train = np.array([feature.hog(im)
                    for im in chain(positive_patches, negative_patches)])
y_train = np.zeros(X_train.shape[0])
y_train[:positive_patches.shape[0]] = 1
X_train.shape

from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score  # sklearn.cross_validation was removed
cross_val_score(GaussianNB(), X_train, y_train)

from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV  # sklearn.grid_search was removed
grid = GridSearchCV(LinearSVC(), {'C': [1.0, 2.0, 4.0, 8.0]})
grid.fit(X_train, y_train)
grid.best_score_
grid.best_params_

model = grid.best_estimator_
model.fit(X_train, y_train)

import skimage
test_image = skimage.data.astronaut()
test_image = skimage.color.rgb2gray(test_image)
test_image = skimage.transform.rescale(test_image, 0.5)
test_image = test_image[:160, 40:180]

plt.imshow(test_image, cmap='gray')
plt.axis('off')
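# The natural next step (not shown above) is to slide a window across the test
# image and score every patch with the trained detector. A minimal sketch,
# assuming the same `feature` module (skimage.feature) and a (62, 47) patch
# size typical of the LFW face thumbnails this kind of example trains on:
def sliding_window(img, patch_size=(62, 47), istep=2, jstep=2):
    Ni, Nj = patch_size
    for i in range(0, img.shape[0] - Ni, istep):
        for j in range(0, img.shape[1] - Nj, jstep):
            yield (i, j), img[i:i + Ni, j:j + Nj]

indices, patches = zip(*sliding_window(test_image))
patches_hog = np.array([feature.hog(patch) for patch in patches])
labels = model.predict(patches_hog)  # 1 where a window looks like a positive patch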
from sklearn.svm import LinearSVC

# Initialize and fit the model
model = LinearSVC()
model.fit(X_train, y_train)

# Generate predictions and score them manually
predictions = model.predict(X_test)
print(sum(predictions == y_test.squeeze()) / len(y_test))
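# The manual fraction above is exactly what scikit-learn's accuracy metric
# computes; an equivalent check on the same fitted model:
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test.squeeze(), predictions))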
# initialize the HOG descriptor
hog = HOG(orientations=18, pixelsPerCell=(10, 10), cellsPerBlock=(1, 1),
          normalize=True)

# loop over the images
for image in digits:
    # deskew the image, center it
    image = dataset.deskew(image, 20)
    image = dataset.center_extent(image, (20, 20))

    # describe the image and update the data matrix
    hist = hog.describe(image)
    data.append(hist)

# train the model
model = LinearSVC(random_state=42)
model.fit(data, target)

# dump the model to file (binary mode, since the pickle payload is bytes)
f = open(args["model"], "wb")
f.write(cPickle.dumps(model))
f.close()

cv2.putText(canvas, 'Please select an image from the plates folder as filename:',
            (canvas.shape[1] - 600, canvas.shape[0] - 570),
            cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 2)
cv2.putText(canvas, 'a. Car1Plate.jpg',
            (canvas.shape[1] - 600, canvas.shape[0] - 550),
            cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 2)
cv2.putText(canvas, 'b. Car2Plate.jpg',
            (canvas.shape[1] - 600, canvas.shape[0] - 530),
            cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 2)
cv2.putText(canvas, 'c. Car3Plate.jpg',
            (canvas.shape[1] - 600, canvas.shape[0] - 510),
            cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 2)
cv2.putText(canvas, 'd. Car4Plate.jpg',
            (canvas.shape[1] - 600, canvas.shape[0] - 490),
            cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 2)
cv2.putText(canvas, 'e. Car5Plate.jpg',
            (canvas.shape[1] - 600, canvas.shape[0] - 470),
            cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 2)
cv2.putText(canvas, 'Enter a letter..',
            (canvas.shape[1] - 600, canvas.shape[0] - 450),
            cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 2)
cv2.imshow('CONSOLE', canvas)
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier

# Define the classifiers
classifiers = [LogisticRegression(), LinearSVC(), SVC(), KNeighborsClassifier()]

# Fit the classifiers
for c in classifiers:
    c.fit(X, y)

# Plot the classifiers
plot_4_classifiers(X, y, classifiers)
plt.show()
""" Estimators Base article: Benchmarking functional connectome-based predictive models for resting-state fMRI """ from sklearn.neighbors import KNeighborsClassifier from sklearn.naive_bayes import GaussianNB from sklearn.ensemble import RandomForestClassifier from sklearn.linear_model import LogisticRegression, Lasso, RidgeClassifier from sklearn.svm import LinearSVC from sklearn.pipeline import Pipeline from sklearn.feature_selection import SelectPercentile, f_classif from sklearn.neural_network import MLPClassifier from sklearn import linear_model feature_selection = SelectPercentile(f_classif, percentile=10) svc_l1 = LinearSVC(penalty='l1', dual=False, random_state=0) anova_svcl1 = Pipeline([('anova', feature_selection), ('svc', svc_l1)]) svc_l2 = LinearSVC(penalty='l2', random_state=0) anova_svcl2 = Pipeline([('anova', feature_selection), ('svc', svc_l2)]) gnb = GaussianNB() randomf = RandomForestClassifier(random_state=0) logregression_l1 = LogisticRegression(penalty='l1', dual=False, random_state=0) logregression_l2 = LogisticRegression(penalty='l2', random_state=0) lasso = Lasso(random_state=0) knn = KNeighborsClassifier(n_neighbors=1) ridge = RidgeClassifier() netn5 = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, ), random_state=1) netn5a = MLPClassifier(solver='adam',
# Create a color plot with the results
n_classes = len(np.unique(y))
contours = ax.contourf(xx, yy, Z, alpha=0.3,
                       levels=np.arange(n_classes + 1) - 0.5,
                       cmap=cmap, clim=(y.min(), y.max()),
                       zorder=1)
ax.set(xlim=xlim, ylim=ylim)

lsvc = LinearSVC(penalty='l2', loss='hinge', random_state=42, C=2)
lsvc.fit(X_train, y_train)
plot_decision_regions(X_train.values, y_train.values, clf=lsvc, res=0.001, legend=2)
plt.title('Decision Regions')
plt.xlabel('rank')
plt.ylabel('comments')

lsvc.predict(X_test)
print('score: {}'.format(lsvc.score(X_test, y_test)))
import numpy as np
import pandas as pd
from sklearn.datasets import load_wine
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.feature_selection import RFE
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import (cross_val_score, KFold, cross_validate,
                                     train_test_split)
from sklearn.ensemble import StackingClassifier
from sklearn.svm import LinearSVC, SVC

data = load_wine()
y = data.target
X = data.data
stc = StandardScaler()
lenc = LabelEncoder()
columns = data.feature_names
df = pd.DataFrame(data=np.hstack(tup=(X, y.reshape(-1, 1))),
                  columns=np.hstack(tup=(columns, ["Class"])))
X_std = stc.fit_transform(df[columns])

pipesvm = Pipeline([("stc", stc), ("selection", RFE(LinearSVC())),
                    ("svm", SVC(kernel="linear"))])
pipelda = Pipeline([("stc", stc), ("svm", LinearDiscriminantAnalysis())])
estimators = [("LDA", pipelda), ("SVM", pipesvm)]

# Using stacked classifiers pays off on multiclass problems, since it can
# substantially improve class prediction by exploiting the predictive power
# of the base estimators' forecasts for particular classes.
stacking_classifier = StackingClassifier(estimators=estimators,
                                         final_estimator=GaussianNB())
print("Stacking estimators")
print(cross_val_score(X=df[columns], y=y, estimator=stacking_classifier,
                      cv=KFold(5)))
print("Only SVM")
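# The "Only SVM" banner suggests a baseline comparison follows the cut; a
# minimal sketch under that assumption, scoring the plain SVM pipeline the
# same way as the stacked model above:
print(cross_val_score(X=df[columns], y=y, estimator=pipesvm, cv=KFold(5)))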
def plot_dataset(X, y, axes):
    plt.plot(X[:, 0][y == 0], X[:, 1][y == 0], "bs")
    plt.plot(X[:, 0][y == 1], X[:, 1][y == 1], "g^")
    plt.axis(axes)
    plt.grid(True, which='both')
    plt.xlabel(r"$x_1$", fontsize=20)
    plt.ylabel(r"$x_2$", fontsize=20, rotation=0)

plot_dataset(X, y, [-1.5, 2.5, -1, 1.5])
plt.show()

polynomial_svm_clf = Pipeline([("poly_features", PolynomialFeatures(degree=3)),
                               ("scaler", StandardScaler()),
                               ("svm_clf", LinearSVC(C=10, loss="hinge",
                                                     random_state=42))])
polynomial_svm_clf.fit(X, y)

def plot_predictions(clf, axes):
    x0s = np.linspace(axes[0], axes[1], 100)
    x1s = np.linspace(axes[2], axes[3], 100)
    x0, x1 = np.meshgrid(x0s, x1s)
    X = np.c_[x0.ravel(), x1.ravel()]
    y_pred = clf.predict(X).reshape(x0.shape)
    y_decision = clf.decision_function(X).reshape(x0.shape)
    plt.contourf(x0, x1, y_pred, cmap=plt.cm.brg, alpha=0.2)
    plt.contourf(x0, x1, y_decision, cmap=plt.cm.brg, alpha=0.1)
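# plot_predictions is defined but never called in this excerpt; the usual
# pairing (a sketch, assuming the same data and axis limits as above) is:
plot_predictions(polynomial_svm_clf, [-1.5, 2.5, -1, 1.5])
plot_dataset(X, y, [-1.5, 2.5, -1, 1.5])
plt.show()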
def get_ten_fold_crossvalid_perfermance(self, settings=None):
    fisher_mode = settings['fisher_mode']
    analysis_scr = []
    predicted_score = settings['predicted_score']
    reduce_ratio = settings['reduce_ratio']
    #for seq_no in range(1, self.ddi_obj.total_number_of_sequences+1):
    #subset_size = math.floor(self.ddi_obj.total_number_of_sequences / 10.0)
    kf = KFold(self.ddi_obj.total_number_of_sequences, n_folds=10, shuffle=True)
    #for subset_no in range(1, 11):
    for ((train_index, test_index), subset_no) in izip(kf, range(1, 11)):
        #for train_index, test_index in kf:
        print("Subset:", subset_no)
        print("Train index: ", train_index)
        print("Test index: ", test_index)
        #logger.info('subset number: ' + str(subset_no))
        if settings['SVM']:
            print "SVM"
            (train_X_10fold, train_y_10fold), (train_X_reduced, train_y_reduced), (test_X, test_y) = \
                self.ddi_obj.get_ten_fold_crossvalid_one_subset(train_index, test_index,
                                                                fisher_mode=fisher_mode,
                                                                reduce_ratio=reduce_ratio)
            standard_scaler = preprocessing.StandardScaler().fit(train_X_reduced)
            scaled_train_X = standard_scaler.transform(train_X_reduced)
            scaled_test_X = standard_scaler.transform(test_X)
            Linear_SVC = LinearSVC(C=1, penalty="l2")
            Linear_SVC.fit(scaled_train_X, train_y_reduced)
            predicted_test_y = Linear_SVC.predict(scaled_test_X)
            isTest = True  #new
            analysis_scr.append((self.ddi, subset_no, fisher_mode, 'SVM', isTest) +
                                tuple(performance_score(test_y, predicted_test_y).values()))  #new
            predicted_train_y = Linear_SVC.predict(scaled_train_X)
            isTest = False  #new
            analysis_scr.append((self.ddi, subset_no, fisher_mode, 'SVM', isTest) +
                                tuple(performance_score(train_y_reduced, predicted_train_y).values()))
        if settings['SVM_RBF']:
            print "SVM_RBF"
            standard_scaler = preprocessing.StandardScaler().fit(train_X_reduced)
            scaled_train_X = standard_scaler.transform(train_X_reduced)
            scaled_test_X = standard_scaler.transform(test_X)
            L1_SVC_RBF_Selector = SVC(C=1, gamma=0.01, kernel='rbf').fit(scaled_train_X, train_y_reduced)
            predicted_test_y = L1_SVC_RBF_Selector.predict(scaled_test_X)
            isTest = True  #new
            analysis_scr.append((self.ddi, subset_no, fisher_mode, 'SVM_RBF', isTest) +
                                tuple(performance_score(test_y, predicted_test_y).values()))  #new
            predicted_train_y = L1_SVC_RBF_Selector.predict(scaled_train_X)
            isTest = False  #new
            analysis_scr.append((self.ddi, subset_no, fisher_mode, 'SVM_RBF', isTest) +
                                tuple(performance_score(train_y_reduced, predicted_train_y).values()))

        # direct deep learning
        min_max_scaler = Preprocessing_Scaler_with_mean_point5()
        X_train_pre_validation_minmax = min_max_scaler.fit(train_X_reduced)
        X_train_pre_validation_minmax = min_max_scaler.transform(train_X_reduced)
        x_test_minmax = min_max_scaler.transform(test_X)
        pretraining_X_minmax = min_max_scaler.transform(train_X_10fold)
        x_train_minmax, x_validation_minmax, y_train_minmax, y_validation_minmax = \
            train_test_split(X_train_pre_validation_minmax, train_y_reduced,
                             test_size=0.4, random_state=42)
        finetune_lr = settings['finetune_lr']
        batch_size = settings['batch_size']
        pretraining_epochs = cal_epochs(settings['pretraining_interations'],
                                        x_train_minmax, batch_size=batch_size)
        #pretrain_lr=0.001
        pretrain_lr = settings['pretrain_lr']
        training_epochs = settings['training_epochs']
        hidden_layers_sizes = settings['hidden_layers_sizes']
        corruption_levels = settings['corruption_levels']

        if settings['SAE_SVM']:
            # SAE_SVM
            print 'SAE followed by SVM'
            x = X_train_pre_validation_minmax
            a_MAE_A = train_a_MultipleAEs(x, pretraining_epochs=pretraining_epochs,
                                          pretrain_lr=pretrain_lr, batch_size=batch_size,
                                          hidden_layers_sizes=hidden_layers_sizes,
                                          corruption_levels=corruption_levels)
            new_x_train_minmax_A = a_MAE_A.transform(X_train_pre_validation_minmax)
            new_x_test_minmax_A = a_MAE_A.transform(x_test_minmax)
            Linear_SVC = LinearSVC(C=1, penalty="l2")
            Linear_SVC.fit(new_x_train_minmax_A, train_y_reduced)
            predicted_test_y = Linear_SVC.predict(new_x_test_minmax_A)
            isTest = True  #new
            analysis_scr.append((self.ddi, subset_no, fisher_mode, 'SAE_SVM', isTest) +
                                tuple(performance_score(test_y, predicted_test_y).values()))  #new
            predicted_train_y = Linear_SVC.predict(new_x_train_minmax_A)
            isTest = False  #new
            analysis_scr.append((self.ddi, subset_no, fisher_mode, 'SAE_SVM', isTest) +
                                tuple(performance_score(train_y_reduced, predicted_train_y).values()))

        if settings['DL']:
            print "direct deep learning"
            sda = trainSda(x_train_minmax, y_train_minmax,
                           x_validation_minmax, y_validation_minmax,
                           x_test_minmax, test_y,
                           hidden_layers_sizes=hidden_layers_sizes,
                           corruption_levels=corruption_levels,
                           batch_size=batch_size,
                           training_epochs=training_epochs,
                           pretraining_epochs=pretraining_epochs,
                           pretrain_lr=pretrain_lr, finetune_lr=finetune_lr)
            print 'hidden_layers_sizes:', hidden_layers_sizes
            print 'corruption_levels:', corruption_levels
            training_predicted = sda.predict(x_train_minmax)
            y_train = y_train_minmax
            isTest = False  #new
            analysis_scr.append((self.ddi, subset_no, fisher_mode, 'DL', isTest) +
                                tuple(performance_score(y_train, training_predicted).values()))
            test_predicted = sda.predict(x_test_minmax)
            y_test = test_y
            isTest = True  #new
            analysis_scr.append((self.ddi, subset_no, fisher_mode, 'DL', isTest) +
                                tuple(performance_score(y_test, test_predicted).values()))

        if settings['DL_U']:
            # deep learning using unlabeled data for pretraining
            print 'deep learning with unlabel data'
            pretraining_epochs = cal_epochs(settings['pretraining_interations'],
                                            x_train_minmax, batch_size=batch_size)
            sda_unlabel = trainSda(x_train_minmax, y_train_minmax,
                                   x_validation_minmax, y_validation_minmax,
                                   x_test_minmax, test_y,
                                   pretraining_X_minmax=pretraining_X_minmax,
                                   hidden_layers_sizes=hidden_layers_sizes,
                                   corruption_levels=corruption_levels,
                                   batch_size=batch_size,
                                   training_epochs=training_epochs,
                                   pretraining_epochs=pretraining_epochs,
                                   pretrain_lr=pretrain_lr, finetune_lr=finetune_lr)
            print 'hidden_layers_sizes:', hidden_layers_sizes
            print 'corruption_levels:', corruption_levels
            training_predicted = sda_unlabel.predict(x_train_minmax)
            y_train = y_train_minmax
            isTest = False  #new
            analysis_scr.append((self.ddi, subset_no, fisher_mode, 'DL_U', isTest) +
                                tuple(performance_score(y_train, training_predicted, predicted_score).values()))
            test_predicted = sda_unlabel.predict(x_test_minmax)
            y_test = test_y
            isTest = True  #new
            analysis_scr.append((self.ddi, subset_no, fisher_mode, 'DL_U', isTest) +
                                tuple(performance_score(y_test, test_predicted, predicted_score).values()))

        if settings['DL_S']:
            # deep learning using split network
            print 'deep learning using split network'
            # get the new representation for A set. first 784-D
            pretraining_epochs = cal_epochs(settings['pretraining_interations'],
                                            x_train_minmax, batch_size=batch_size)
            x = x_train_minmax[:, :x_train_minmax.shape[1]/2]
            print "original shape for A", x.shape
            a_MAE_A = train_a_MultipleAEs(x, pretraining_epochs=pretraining_epochs,
                                          pretrain_lr=pretrain_lr, batch_size=batch_size,
                                          hidden_layers_sizes=hidden_layers_sizes,
                                          corruption_levels=corruption_levels)
            new_x_train_minmax_A = a_MAE_A.transform(x_train_minmax[:, :x_train_minmax.shape[1]/2])
            x = x_train_minmax[:, x_train_minmax.shape[1]/2:]
            print "original shape for B", x.shape
            a_MAE_B = train_a_MultipleAEs(x, pretraining_epochs=pretraining_epochs,
                                          pretrain_lr=pretrain_lr, batch_size=batch_size,
                                          hidden_layers_sizes=hidden_layers_sizes,
                                          corruption_levels=corruption_levels)
            new_x_train_minmax_B = a_MAE_B.transform(x_train_minmax[:, x_train_minmax.shape[1]/2:])
            new_x_test_minmax_A = a_MAE_A.transform(x_test_minmax[:, :x_test_minmax.shape[1]/2])
            new_x_test_minmax_B = a_MAE_B.transform(x_test_minmax[:, x_test_minmax.shape[1]/2:])
            new_x_validation_minmax_A = a_MAE_A.transform(x_validation_minmax[:, :x_validation_minmax.shape[1]/2])
            new_x_validation_minmax_B = a_MAE_B.transform(x_validation_minmax[:, x_validation_minmax.shape[1]/2:])
            new_x_train_minmax_whole = np.hstack((new_x_train_minmax_A, new_x_train_minmax_B))
            new_x_test_minmax_whole = np.hstack((new_x_test_minmax_A, new_x_test_minmax_B))
            new_x_validationt_minmax_whole = np.hstack((new_x_validation_minmax_A, new_x_validation_minmax_B))

            sda_transformed = trainSda(new_x_train_minmax_whole, y_train_minmax,
                                       new_x_validationt_minmax_whole, y_validation_minmax,
                                       new_x_test_minmax_whole, y_test,
                                       hidden_layers_sizes=hidden_layers_sizes,
                                       corruption_levels=corruption_levels,
                                       batch_size=batch_size,
                                       training_epochs=training_epochs,
                                       pretraining_epochs=pretraining_epochs,
                                       pretrain_lr=pretrain_lr, finetune_lr=finetune_lr)
            print 'hidden_layers_sizes:', hidden_layers_sizes
            print 'corruption_levels:', corruption_levels
            training_predicted = sda_transformed.predict(new_x_train_minmax_whole)
            y_train = y_train_minmax
            isTest = False  #new
            analysis_scr.append((self.ddi, subset_no, fisher_mode, 'DL_S', isTest) +
                                tuple(performance_score(y_train, training_predicted, predicted_score).values()))
            test_predicted = sda_transformed.predict(new_x_test_minmax_whole)
            y_test = test_y
            isTest = True  #new
            analysis_scr.append((self.ddi, subset_no, fisher_mode, 'DL_S', isTest) +
                                tuple(performance_score(y_test, test_predicted, predicted_score).values()))

    report_name = filename + '_' + '_test10fold_'.join(map(str, hidden_layers_sizes)) + \
        '_' + str(pretrain_lr) + '_' + str(finetune_lr) + '_' + str(reduce_ratio) + \
        '_' + str(training_epochs) + '_' + current_date
    saveAsCsv(predicted_score, report_name,
              performance_score(y_test, test_predicted, predicted_score), analysis_scr)
def get_LOO_perfermance(self, settings=None):
    fisher_mode = settings['fisher_mode']
    analysis_scr = []
    predicted_score = settings['predicted_score']
    reduce_ratio = settings['reduce_ratio']
    for seq_no in range(1, self.ddi_obj.total_number_of_sequences + 1):
        print seq_no
        logger.info('sequence number: ' + str(seq_no))
        if settings['SVM']:
            print "SVM"
            (train_X_LOO, train_y_LOO), (train_X_reduced, train_y_reduced), (test_X, test_y) = \
                self.ddi_obj.get_LOO_training_and_reduced_traing(seq_no,
                                                                 fisher_mode=fisher_mode,
                                                                 reduce_ratio=reduce_ratio)
            standard_scaler = preprocessing.StandardScaler().fit(train_X_reduced)
            scaled_train_X = standard_scaler.transform(train_X_reduced)
            scaled_test_X = standard_scaler.transform(test_X)
            Linear_SVC = LinearSVC(C=1, penalty="l2")
            Linear_SVC.fit(scaled_train_X, train_y_reduced)
            predicted_test_y = Linear_SVC.predict(scaled_test_X)
            isTest = True  #new
            analysis_scr.append((self.ddi, seq_no, fisher_mode, 'SVM', isTest) +
                                tuple(performance_score(test_y, predicted_test_y).values()))  #new
            predicted_train_y = Linear_SVC.predict(scaled_train_X)
            isTest = False  #new
            analysis_scr.append((self.ddi, seq_no, fisher_mode, 'SVM', isTest) +
                                tuple(performance_score(train_y_reduced, predicted_train_y).values()))

        # Deep learning part
        min_max_scaler = Preprocessing_Scaler_with_mean_point5()
        X_train_pre_validation_minmax = min_max_scaler.fit(train_X_reduced)
        X_train_pre_validation_minmax = min_max_scaler.transform(train_X_reduced)
        x_test_minmax = min_max_scaler.transform(test_X)
        pretraining_X_minmax = min_max_scaler.transform(train_X_LOO)
        x_train_minmax, x_validation_minmax, y_train_minmax, y_validation_minmax = \
            train_test_split(X_train_pre_validation_minmax, train_y_reduced,
                             test_size=0.4, random_state=42)
        finetune_lr = settings['finetune_lr']
        batch_size = settings['batch_size']
        pretraining_epochs = cal_epochs(settings['pretraining_interations'],
                                        x_train_minmax, batch_size=batch_size)
        #pretrain_lr=0.001
        pretrain_lr = settings['pretrain_lr']
        training_epochs = settings['training_epochs']
        hidden_layers_sizes = settings['hidden_layers_sizes']
        corruption_levels = settings['corruption_levels']

        if settings['DL']:
            print "direct deep learning"
            # direct deep learning
            sda = trainSda(x_train_minmax, y_train_minmax,
                           x_validation_minmax, y_validation_minmax,
                           x_test_minmax, test_y,
                           hidden_layers_sizes=hidden_layers_sizes,
                           corruption_levels=corruption_levels,
                           batch_size=batch_size,
                           training_epochs=training_epochs,
                           pretraining_epochs=pretraining_epochs,
                           pretrain_lr=pretrain_lr, finetune_lr=finetune_lr)
            print 'hidden_layers_sizes:', hidden_layers_sizes
            print 'corruption_levels:', corruption_levels
            training_predicted = sda.predict(x_train_minmax)
            y_train = y_train_minmax
            isTest = False  #new
            analysis_scr.append((self.ddi, seq_no, fisher_mode, 'DL', isTest) +
                                tuple(performance_score(y_train, training_predicted).values()))
            test_predicted = sda.predict(x_test_minmax)
            y_test = test_y
            isTest = True  #new
            analysis_scr.append((self.ddi, seq_no, fisher_mode, 'DL', isTest) +
                                tuple(performance_score(y_test, test_predicted).values()))

        if 0:
            # deep learning using unlabeled data for pretraining
            print 'deep learning with unlabel data'
            pretraining_epochs_for_reduced = cal_epochs(1500, pretraining_X_minmax,
                                                        batch_size=batch_size)
            sda_unlabel = trainSda(x_train_minmax, y_train_minmax,
                                   x_validation_minmax, y_validation_minmax,
                                   x_test_minmax, test_y,
                                   pretraining_X_minmax=pretraining_X_minmax,
                                   hidden_layers_sizes=hidden_layers_sizes,
                                   corruption_levels=corruption_levels,
                                   batch_size=batch_size,
                                   training_epochs=training_epochs,
                                   pretraining_epochs=pretraining_epochs_for_reduced,
                                   pretrain_lr=pretrain_lr, finetune_lr=finetune_lr)
            print 'hidden_layers_sizes:', hidden_layers_sizes
            print 'corruption_levels:', corruption_levels
            training_predicted = sda_unlabel.predict(x_train_minmax)
            y_train = y_train_minmax
            isTest = False  #new
            analysis_scr.append((self.ddi, seq_no, fisher_mode, 'DL_U', isTest) +
                                tuple(performance_score(y_train, training_predicted, predicted_score).values()))
            test_predicted = sda_unlabel.predict(x_test_minmax)
            y_test = test_y
            isTest = True  #new
            analysis_scr.append((self.ddi, seq_no, fisher_mode, 'DL_U', isTest) +
                                tuple(performance_score(y_test, test_predicted, predicted_score).values()))

        if settings['Split_DL']:
            # deep learning using split network
            print 'deep learning using split network'
            # get the new representation for A set. first 784-D
            pretraining_epochs = cal_epochs(settings['pretraining_interations'],
                                            x_train_minmax, batch_size=batch_size)
            hidden_layers_sizes = settings['hidden_layers_sizes']
            corruption_levels = settings['corruption_levels']
            x = x_train_minmax[:, :x_train_minmax.shape[1]/2]
            print "original shape for A", x.shape
            a_MAE_A = train_a_MultipleAEs(x, pretraining_epochs=pretraining_epochs,
                                          pretrain_lr=pretrain_lr, batch_size=batch_size,
                                          hidden_layers_sizes=hidden_layers_sizes,
                                          corruption_levels=corruption_levels)
            new_x_train_minmax_A = a_MAE_A.transform(x_train_minmax[:, :x_train_minmax.shape[1]/2])
            x = x_train_minmax[:, x_train_minmax.shape[1]/2:]
            print "original shape for B", x.shape
            a_MAE_B = train_a_MultipleAEs(x, pretraining_epochs=pretraining_epochs,
                                          pretrain_lr=pretrain_lr, batch_size=batch_size,
                                          hidden_layers_sizes=hidden_layers_sizes,
                                          corruption_levels=corruption_levels)
            new_x_train_minmax_B = a_MAE_B.transform(x_train_minmax[:, x_train_minmax.shape[1]/2:])
            new_x_test_minmax_A = a_MAE_A.transform(x_test_minmax[:, :x_test_minmax.shape[1]/2])
            new_x_test_minmax_B = a_MAE_B.transform(x_test_minmax[:, x_test_minmax.shape[1]/2:])
            new_x_validation_minmax_A = a_MAE_A.transform(x_validation_minmax[:, :x_validation_minmax.shape[1]/2])
            new_x_validation_minmax_B = a_MAE_B.transform(x_validation_minmax[:, x_validation_minmax.shape[1]/2:])
            new_x_train_minmax_whole = np.hstack((new_x_train_minmax_A, new_x_train_minmax_B))
            new_x_test_minmax_whole = np.hstack((new_x_test_minmax_A, new_x_test_minmax_B))
            new_x_validationt_minmax_whole = np.hstack((new_x_validation_minmax_A, new_x_validation_minmax_B))

            finetune_lr = settings['finetune_lr']
            batch_size = settings['batch_size']
            pretraining_epochs = cal_epochs(settings['pretraining_interations'],
                                            x_train_minmax, batch_size=batch_size)
            #pretrain_lr=0.001
            pretrain_lr = settings['pretrain_lr']
            training_epochs = settings['training_epochs']
            hidden_layers_sizes = settings['hidden_layers_sizes']
            corruption_levels = settings['corruption_levels']

            sda_transformed = trainSda(new_x_train_minmax_whole, y_train_minmax,
                                       new_x_validationt_minmax_whole, y_validation_minmax,
                                       new_x_test_minmax_whole, y_test,
                                       hidden_layers_sizes=hidden_layers_sizes,
                                       corruption_levels=corruption_levels,
                                       batch_size=batch_size,
                                       training_epochs=training_epochs,
                                       pretraining_epochs=pretraining_epochs,
                                       pretrain_lr=pretrain_lr, finetune_lr=finetune_lr)
            print 'hidden_layers_sizes:', hidden_layers_sizes
            print 'corruption_levels:', corruption_levels
            training_predicted = sda_transformed.predict(new_x_train_minmax_whole)
            y_train = y_train_minmax
            isTest = False  #new
            analysis_scr.append((self.ddi, seq_no, fisher_mode, 'DL_S', isTest) +
                                tuple(performance_score(y_train, training_predicted, predicted_score).values()))
            test_predicted = sda_transformed.predict(new_x_test_minmax_whole)
            y_test = test_y
            isTest = True  #new
            analysis_scr.append((self.ddi, seq_no, fisher_mode, 'DL_S', isTest) +
                                tuple(performance_score(y_test, test_predicted, predicted_score).values()))

    report_name = filename + '_' + '_'.join(map(str, hidden_layers_sizes)) + \
        '_' + str(pretrain_lr) + '_' + str(finetune_lr) + '_' + str(reduce_ratio) + \
        '_' + str(training_epochs) + '_' + current_date
    saveAsCsv(predicted_score, report_name,
              performance_score(y_test, test_predicted, predicted_score), analysis_scr)
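# cal_epochs() is an external helper not shown in this excerpt. Judging only
# from its call sites, (iterations, X, batch_size) -> epochs, a plausible
# hypothetical sketch converting an iteration budget into whole epochs is:
def cal_epochs(iterations, x, batch_size):
    # one epoch = one pass over all minibatches; guarantee at least one epoch
    n_batches = max(1, x.shape[0] // batch_size)
    return max(1, int(iterations // n_batches))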
def train_models(Xt=None, yt=None, Xv=None, yv=None, params={}):
    """Classifier model training and validation, innermost routine.

    Four classifiers are implemented:
    - LDA: linear discriminant analysis
    - QDA: quadratic discriminant analysis
    - OVO: one-vs-one
    - OVR: one-vs-rest

    arguments
    ------
    Xt/Xv: np.array
        feature vectors for training (t) and validation (v)
    yt/yv: list/np.array
        labels, corresponding to X, for training (t) and validation (v)
    params: dict
        doStandardize: bool
            standardize training features?
        numPCs: int
            PCA dimension reduction of training features (after standardization)
        quietmode: bool
            suppress sklearn noise during training
        labels: list
            names of the labels, for sorting confusion matrices
        num_lda: int
            number of dimensions for LDA classifier
        LSVCparam: dict
            LinearSVC parameters

    returns
    ------
    mdic: dictionary
        dictionary w/ transformations, trained models and confusion matrices
    """
    LSVCparam = dict(random_state=0, verbose=0, max_iter=3000)

    # parameters
    p = dict(_about='parameters for train_models()',
             doStandardize=True,
             numPCs=0,
             quietmode=True,
             labels=None,
             num_lda=2,
             LSVCparam=LSVCparam)
    p.update(params)

    # assign labels if not already set (np.unique(yt), not the string 'yt')
    if p['labels'] is None:
        p['labels'] = np.unique(yt).tolist()

    # labeling sanity checks
    lbls_trn = np.unique(yt).tolist()
    lbls_val = np.unique(yv).tolist()

    # training and validation labels (unique) must match
    if set(lbls_trn) != set(lbls_val):
        print('LABELS MISMATCH: train/validate %s' % (str([lbls_trn, lbls_val])))
        raise Exception()

    # passed labels also must match
    for label in p['labels']:
        if label not in lbls_trn:
            print('LABEL [%s] not found in trn/val labels' % label, lbls_trn)
            raise Exception()

    #-------------------------------------------------------
    # standardization
    if p['doStandardize']:
        sc = StandardScaler().fit(Xt)
        Xt = sc.transform(Xt)
        Xv = sc.transform(Xv)
    else:
        sc = None

    # PCA
    if p['numPCs'] > 0:
        pca = PCA(n_components=p['numPCs']).fit(Xt)
        Xt = pca.transform(Xt)
        Xv = pca.transform(Xv)
    else:
        pca = None

    # MODEL TRAINING (use p['LSVCparam'] so caller overrides take effect)
    lda = LinearDiscriminantAnalysis(n_components=p['num_lda']).fit(Xt, yt)
    qda = QuadraticDiscriminantAnalysis().fit(Xt, yt)
    with warnings.catch_warnings():
        if p['quietmode']:
            warnings.filterwarnings(
                "ignore", category=sklearn.exceptions.ConvergenceWarning)
        ovo = OneVsOneClassifier(LinearSVC(**p['LSVCparam'])).fit(Xt, yt)
        ovr = OneVsRestClassifier(LinearSVC(**p['LSVCparam'])).fit(Xt, yt)

    # confusion matrices / validation
    cnf_lda = confusion_matrix(yv, lda.predict(Xv), labels=p['labels'])
    cnf_qda = confusion_matrix(yv, qda.predict(Xv), labels=p['labels'])
    cnf_ovr = confusion_matrix(yv, ovr.predict(Xv), labels=p['labels'])
    cnf_ovo = confusion_matrix(yv, ovo.predict(Xv), labels=p['labels'])

    cb = mt.ClassifierBundle(
        sc=sc,
        pca=pca,
        training_params=p,
        classifiers=dict(
            LDA=dict(cls=lda, confusion=cnf_lda),
            QDA=dict(cls=qda, confusion=cnf_qda),
            OVO=dict(cls=ovo, confusion=cnf_ovo),
            OVR=dict(cls=ovr, confusion=cnf_ovr),
        ),
    )

    # pack results into a dictionary
    mdic = dict(_about="classifier models made with train_models()",
                cb=cb,
                sc=sc,
                pca=pca,
                num_trn=len(yt),
                num_val=len(yv),
                classifiers=dict(
                    LDA=dict(cls=lda, confusion=cnf_lda),
                    QDA=dict(cls=qda, confusion=cnf_qda),
                    OVO=dict(cls=ovo, confusion=cnf_ovo),
                    OVR=dict(cls=ovr, confusion=cnf_ovr),
                ),
                params=p)
    return mdic
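# A minimal usage sketch for train_models(); the synthetic data and the 80/20
# split below are illustrative assumptions, not from the original module:
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=400, n_features=20, n_informative=8,
                           n_classes=3, random_state=0)
Xt, Xv, yt, yv = train_test_split(X, y, test_size=0.2, random_state=0,
                                  stratify=y)
mdic = train_models(Xt=Xt, yt=yt, Xv=Xv, yv=yv,
                    params=dict(numPCs=10, num_lda=2))
print(mdic['classifiers']['OVR']['confusion'])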
print("LogisticRegression_classifier:", (nltk.classify.accuracy(LogisticRegression_classifier, testing_set))) #SGD SGDClassifier_classifier = SklearnClassifier(SGDClassifier()) SGDClassifier_classifier.train(training_set) print("SGDClassifier_classifier:", (nltk.classify.accuracy(SGDClassifier_classifier, testing_set))) ###SVC ##SVC_classifier = SklearnClassifier(SVC()) ##SVC_classifier.train(training_set) ##print("SVC_classifier:", (nltk.classify.accuracy(SVC_classifier, testing_set))) ## #LinearSVC_classifier LinearSVC_classifier = SklearnClassifier(LinearSVC()) LinearSVC_classifier.train(training_set) print("LinearSVC_classifier:", (nltk.classify.accuracy(LinearSVC_classifier, testing_set))) #NuSVC_classifier NuSVC_classifier = SklearnClassifier(NuSVC()) NuSVC_classifier.train(training_set) print("NuSVC_classifier:", (nltk.classify.accuracy(NuSVC_classifier, testing_set))) voted_classifier = VoteClassifier(classifier, MNB_classifier, BernoulliNB_classifier, LogisticRegression_classifier, SGDClassifier_classifier, LinearSVC_classifier, NuSVC_classifier)
train_df = train_df.drop(labels=label, inplace=False, axis=0)
x_train, y_train = prepareData(train_df)
x_valid, y_valid = prepareData(valid_df)
print('Done reading train and validation data!')

print('Training NB')
classificador = GaussianNB(priors=None, var_smoothing=1e-9)
classificador.fit(x_train, y_train)
previsoes_nb = classificador.predict(x_valid)

print('Training SVM')
classificador = LinearSVC()
classificador.fit(x_train, y_train)
previsoes_svc = classificador.predict(x_valid)

print('Training RNA')
kernel_initializer = 'normal'
activation = 'relu'
loss = 'binary_crossentropy'
batch_size = 1500
neurons = 1536
dropout = 0.1
learning_rate = 0.001
beta_1 = 0.97
beta_2 = 0.97
decay = 0.05
save = bool(args.save)
train, test, train_label, test_label = load_data(args.dataset, args.n_classes)

# t = args.epoch
# print(t)
# train_label[train_label != t] = 10
# train_label[train_label == t] = 1
# train_label[train_label == 10] = 0
# test_label[test_label != t] = 10
# test_label[test_label == t] = 1
# test_label[test_label == 10] = 0

print('training data size: ')
print(train.shape)
print('testing data size: ')
print(test.shape)

dual = bool(args.dual)
scd = LinearSVC(C=args.c, dual=dual)
a = time.time()
scd.fit(train, train_label)
print('Cost: %.3f seconds' % (time.time() - a))
print('Best Train Accuracy: ',
      accuracy_score(y_true=train_label, y_pred=scd.predict(train)))
print('Balanced Train Accuracy: ',
      balanced_accuracy_score(y_true=train_label, y_pred=scd.predict(train)))
print('Best one Accuracy: ',
      accuracy_score(y_true=test_label, y_pred=scd.predict(test)))
print('Balanced Accuracy: ',
      balanced_accuracy_score(y_true=test_label, y_pred=scd.predict(test)))

if save:
    save_path = 'checkpoints'
    save_checkpoint(scd, save_path, args.target, et, vc)
import matplotlib.pyplot as plt
import mglearn
import numpy as np
import pandas as pd
from IPython.display import display
from mglearn.datasets import load_extended_boston
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso, LogisticRegression
from sklearn.svm import LinearSVC

X, y = mglearn.datasets.make_forge()
fig, axes = plt.subplots(1, 2, figsize=(10, 3))
for model, ax in zip([LinearSVC(), LogisticRegression()], axes):
    clf = model.fit(X, y)
    mglearn.plots.plot_2d_separator(clf, X, fill=False, eps=0.5, ax=ax, alpha=.7)
    mglearn.discrete_scatter(X[:, 0], X[:, 1], y, ax=ax)
    ax.set_title("{}".format(clf.__class__.__name__))
    ax.set_xlabel("Feature 0")
    ax.set_ylabel("Feature 1")
axes[0].legend(loc=3)
plt.show()
tweets = ["Hello world, today is a good day", "Bye, bye, world, I am sleeping", "Hello bikey, it is bleh", "Good bye popa, window", "Maybe now I will say hello", "Tomorrow I will do bye", "It is a good night for be hello", "Perhaps bye will be okay"] tokTweets = [nltk.word_tokenize(tweet) for tweet in tweets] stances = ['yes','no','yes','no','yes','no','yes','no'] stringTweets = [str(tweet) for tweet in tokTweets] X_train, X_test, y_train, y_test = train_test_split(stringTweets, stances, test_size=0.33, random_state=2) le = preprocessing.LabelEncoder() y_train = le.fit_transform(y_train) y_test = le.fit_transform(y_test) tf = TfidfVectorizer(max_features=5000) tf.fit(stringTweets) X_train_tf = tf.transform(X_train) X_test_tf = tf.transform(X_test) svm = LinearSVC() svm.fit(X_train_tf, y_train) predictions = svm.predict(X_test_tf) print("SVM Accuracy score: {0}".format(accuracy_score(predictions, y_test)*100))
np.random.seed(42)

# fetch_mldata was removed from scikit-learn; fetch_openml is the modern
# replacement for pulling MNIST (targets come back as strings, hence the cast)
from sklearn.datasets import fetch_openml
mnist = fetch_openml('mnist_784', version=1, as_frame=False)
X = mnist["data"]
y = mnist["target"].astype(np.uint8)

X_train = X[:60000]
y_train = y[:60000]
X_test = X[60000:]
y_test = y[60000:]

rnd_idx = np.random.permutation(60000)
X_train = X_train[rnd_idx]
y_train = y_train[rnd_idx]

# Model
lin_clf = LinearSVC(random_state=42)
# lin_clf.fit(X_train, y_train)  # Base model
# y_pred = lin_clf.predict(X_train)
# accuracy_score(y_train, y_pred)  # .83

# Scale data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train.astype(np.float32))
X_test_scaled = scaler.transform(X_test.astype(np.float32))
# lin_clf.fit(X_train_scaled, y_train)  # Scaled model
# y_pred = lin_clf.predict(X_train_scaled)
# accuracy_score(y_train, y_pred)  # .92

# Model with RBF kernel function
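# The RBF-kernel model the last comment announces; the subsample size and the
# default gamma here are illustrative choices, not from the original:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

svm_clf = SVC(kernel="rbf", gamma="scale")
svm_clf.fit(X_train_scaled[:10000], y_train[:10000])  # subsample: SVC scales poorly with n
y_pred = svm_clf.predict(X_train_scaled[:10000])
print(accuracy_score(y_train[:10000], y_pred))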
print("{:10}{:20}{:10.2f}{}".format(clf_conf.id, features_key, recall, marker)) clfs = [ ClfConf(id="lr", clf=lambda: LogisticRegression(solver='lbfgs', max_iter=4000), normalized=False ), ClfConf(id="lda", clf=lambda: Lda(n_components=None, priors=None, shrinkage=None, solver='svd', store_covariance=False, tol=0.0001), normalized=False ), ClfConf(id="svm_lin", clf=lambda: LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True, intercept_scaling=1, loss='squared_hinge', max_iter=1000, multi_class='ovr', penalty='l2', random_state=None, tol=0.0001, verbose=0), normalized=True ), ClfConf(id="svm", clf=lambda: SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf', max_iter=-1, probability=False, random_state=None, shrinking=True, tol=0.001, verbose=False), normalized=True ), ClfConf(id="knn", clf=lambda: KNeighborsClassifier(n_neighbors=3), normalized=False ), ClfConf(id="nm_g",
def classifier_analysis(X, label, methodType):
    from sklearn.preprocessing import StandardScaler
    from sklearn.model_selection import ShuffleSplit
    from sklearn.model_selection import GridSearchCV
    from sklearn.pipeline import Pipeline

    #rng = None
    rng = np.random.RandomState(1)
    if methodType == 0:
        # random forest
        from sklearn.ensemble import RandomForestClassifier
        classifier = RandomForestClassifier(n_estimators=10, criterion='gini',
                                            max_depth=None, min_samples_split=2,
                                            min_samples_leaf=1,
                                            min_weight_fraction_leaf=0.0,
                                            max_features='auto', max_leaf_nodes=None,
                                            min_impurity_decrease=0.0,
                                            min_impurity_split=None, bootstrap=True,
                                            oob_score=False, n_jobs=n_jobs,
                                            random_state=rng, verbose=0,
                                            warm_start=False, class_weight=None)
        param_grid = {
            'filter__threshold': [0.95, 0.97, 0.99],
            'classifier__n_estimators': [5, 10, 20],
            'classifier__max_depth': [None, 10, 5, 3],
            'classifier__max_features': ['auto', 10, 5]
        }
    elif methodType == 1:
        # adaboost
        from sklearn.ensemble import AdaBoostClassifier
        classifier = AdaBoostClassifier(base_estimator=None, n_estimators=50,
                                        learning_rate=1.0, algorithm='SAMME.R',
                                        random_state=rng)
        param_grid = {
            'filter__threshold': [0.95, 0.97, 0.99],
            'classifier__n_estimators': [5, 10, 20],
            'classifier__learning_rate': [0.8, 0.9, 1.0]
        }
    elif methodType == 2:
        # GBC
        from sklearn.ensemble import GradientBoostingClassifier
        classifier = GradientBoostingClassifier(loss='deviance', learning_rate=0.1,
                                                n_estimators=100, subsample=1.0,
                                                criterion='friedman_mse',
                                                min_samples_split=2,
                                                min_samples_leaf=1,
                                                min_weight_fraction_leaf=0.0,
                                                max_depth=3,
                                                min_impurity_decrease=0.0,
                                                min_impurity_split=None, init=None,
                                                random_state=rng, max_features=None,
                                                verbose=0, max_leaf_nodes=None,
                                                warm_start=False, presort='auto')
        param_grid = {
            'filter__threshold': [0.95, 0.97, 0.99],
            'classifier__n_estimators': [50, 100, 150],
            'classifier__max_depth': [None, 10, 5, 3],
            'classifier__learning_rate': [0.8, 0.9, 1.0]
        }
    elif methodType == 3:
        # logistic regression
        from sklearn.linear_model import LogisticRegression
        classifier = LogisticRegression(penalty='l2', dual=False, tol=0.0001,
                                        C=1.0, fit_intercept=True,
                                        intercept_scaling=1, class_weight=None,
                                        random_state=rng, solver='saga',
                                        max_iter=100, multi_class='multinomial',
                                        verbose=0, warm_start=False, n_jobs=n_jobs)
        param_grid = {
            'filter__threshold': [0.95, 0.97, 0.99],
            'classifier__penalty': ['l1', 'l2'],
            'classifier__C': [0.9, 1.0, 1.1]
        }
    elif methodType == 4:
        # SVM
        from sklearn.svm import SVC
        classifier = SVC(C=1.0, kernel='rbf', degree=3, gamma='auto', coef0=0.0,
                         shrinking=True, probability=False, tol=0.001,
                         cache_size=200, class_weight=None, verbose=False,
                         max_iter=-1, decision_function_shape='ovr',
                         random_state=rng)
        param_grid = {
            'filter__threshold': [0.95, 0.97, 0.99],
            'classifier__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
            'classifier__C': [0.9, 1.0, 1.1]
        }
    elif methodType == 5:
        # MLP
        from sklearn.neural_network import MLPClassifier
        classifier = MLPClassifier(hidden_layer_sizes=(100,), activation='relu',
                                   solver='adam', alpha=0.0001, batch_size='auto',
                                   learning_rate='constant',
                                   learning_rate_init=0.001, power_t=0.5,
                                   max_iter=200, shuffle=True, random_state=None,
                                   tol=0.0001, verbose=False, warm_start=False,
                                   momentum=0.9, nesterovs_momentum=True,
                                   early_stopping=False, validation_fraction=0.1,
                                   beta_1=0.9, beta_2=0.999, epsilon=1e-08)
        param_grid = {
            'filter__threshold': [0.95, 0.97, 0.99],
            'classifier__hidden_layer_sizes': [(100,), (50,), (20,)],
            'classifier__learning_rate_init': [0.0001, 0.001, 0.01]
        }
    elif methodType == 6:
        # linear SVM
        from sklearn.svm import LinearSVC
        classifier = LinearSVC(penalty='l2', loss='squared_hinge', dual=False,
                               tol=0.0001, C=1.0, multi_class='ovr',
                               fit_intercept=True, intercept_scaling=1,
                               class_weight=None, verbose=0, random_state=rng,
                               max_iter=1000)
        param_grid = {
            'filter__threshold': [0.95, 0.97, 0.99],
            'classifier__penalty': ['l1', 'l2'],
            'classifier__C': [0.9, 1.0, 1.1]
        }
    elif methodType == 7:
        # Bernoulli Naive Bayes
        from sklearn.naive_bayes import BernoulliNB
        classifier = BernoulliNB(alpha=1.0, binarize=0.0, fit_prior=True,
                                 class_prior=None)
        param_grid = {
            'filter__threshold': [0.95, 0.97, 0.99],
            'classifier__alpha': [0.90, 0.95, 1.0],
            'classifier__fit_prior': [True, False]
        }
    elif methodType == 8:
        # multinomial Naive Bayes
        from sklearn.naive_bayes import MultinomialNB
        classifier = MultinomialNB(alpha=1.0, fit_prior=True, class_prior=None)
        param_grid = {
            'classifier__alpha': [0.90, 0.95, 1.0],
            'classifier__fit_prior': [True, False]
        }
    else:
        return

    if methodType == 8:
        pipe = Pipeline([('classifier', classifier)])
    else:
        pipe = Pipeline([('scale', StandardScaler()),
                         ('filter', FilterSimu()),
                         ('classifier', classifier)])

    grid = GridSearchCV(pipe,
                        cv=ShuffleSplit(n_splits=4, test_size=0.25, random_state=rng),
                        n_jobs=1, param_grid=param_grid)
    grid.fit(X, label)
    best_estimator = grid.best_estimator_
    #mean_scores = np.array(grid.cv_results_['mean_test_score'])
    #mean_tscores = np.array(grid.cv_results_['mean_train_score'])
    #print mean_scores
    #print mean_tscores
    print grid.best_params_
    score = grid.best_score_
    #print grid.cv_results_['params']
    return best_estimator, grid.predict(X), score
data = np.asarray(data_df)
label = np.asarray(label_df).flatten('F')  # change to 1D vector

scaler = joblib.load('scaler.joblib')
scaler.fit(data)
data = scaler.transform(data)

x_train, x_test, y_train, y_test = train_test_split(data, label, test_size=0.2,
                                                    random_state=4)

mlp = MLPClassifier(random_state=4)
rfc = RandomForestClassifier(random_state=4)
svc = LinearSVC()
ovr = OneVsRestClassifier(svc)

models = []
models.append(ovr)
models.append(mlp)
models.append(rfc)

kf = StratifiedKFold(n_splits=5, random_state=4)

y_pred = []
for model in models:
    # fit on the training split only; fitting on `data` (as the original did)
    # leaks the held-out x_test rows into training
    model.fit(x_train, y_train)
    y = model.predict(x_test)
    y_pred.append(y)
    print(accuracy_score(y_test, y))
# ### Perceptron

# In[ ]:

# Perceptron
perceptron = Perceptron()
acc_perceptron = predict_model(X_data, Y_data, perceptron, X_test_kaggle,
                               'submission_Perception.csv')

# ### Linear SVC

# In[ ]:

# Linear SVC
linear_svc = LinearSVC()
acc_linear_svc = predict_model(X_data, Y_data, linear_svc, X_test_kaggle,
                               'submission_Linear_SVC.csv')

# ### Stochastic Gradient Descent

# In[ ]:

# Stochastic Gradient Descent
sgd = SGDClassifier()
acc_sgd = predict_model(X_data, Y_data, sgd, X_test_kaggle,
                        'submission_stochastic_Gradient_Descent.csv')

# ### Decision Tree

# In[ ]:
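# predict_model() is defined elsewhere in this notebook. A hypothetical sketch
# consistent with how it is called above (fit, cross-validated accuracy, and a
# Kaggle submission file; the 'PassengerId'/'Survived' column names are
# assumptions, not from the original):
from sklearn.model_selection import cross_val_score

def predict_model(X, y, model, X_test, out_csv):
    model.fit(X, y)
    acc = cross_val_score(model, X, y, cv=5).mean()
    pd.DataFrame({'PassengerId': X_test.index,          # assumed id column
                  'Survived': model.predict(X_test)}    # assumed target column
                 ).to_csv(out_csv, index=False)
    return acc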
# created a new column called 'cleaned' in data to store the processed text
X_train, X_test, y_train, y_test = train_test_split(data['cleaned'], data.stars,
                                                    test_size=0.2)
# the stars column holds the star rating of each comment

pipeline = Pipeline([
    # TfidfVectorizer transforms text into feature vectors usable as estimator input
    ('vect', TfidfVectorizer(ngram_range=(1, 2), stop_words="english",
                             sublinear_tf=True)),
    # keep the best 10000 words/features by the chi-squared statistic
    ('chi', SelectKBest(chi2, k=10000)),
    # linear support vector classifier, capped at 3000 iterations
    ('clf', LinearSVC(C=1.0, penalty='l1', max_iter=3000, dual=False))
])

model = pipeline.fit(X_train, y_train)  # train on the processed data

vectorizer = model.named_steps['vect']
chi = model.named_steps['chi']
clf = model.named_steps['clf']

feature_names = vectorizer.get_feature_names()
feature_names = [feature_names[i] for i in chi.get_support(indices=True)]
feature_names = np.asarray(feature_names)

target_names = ['1', '2', '3', '4', '5']
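# A common follow-up, and the likely reason feature_names/target_names are
# built: print the strongest features per star rating from the linear model's
# weights. A sketch, assuming the fitted pipeline above:
for i, label in enumerate(target_names):
    top10 = np.argsort(clf.coef_[i])[-10:]   # indices of the 10 largest weights
    print("{} stars: {}".format(label, " ".join(feature_names[top10])))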
##############################################################################
# There is freedom in the choice of ``events`` composing the feature vectors
# and we encourage the reader to explore different combinations. Note,
# however, that odd photon-numbered events have zero probability because
# ideal GBS only generates and outputs pairs of photons.
#
# Given our points in the feature space and their target labels, we can use
# scikit-learn's Support Vector Machine `LinearSVC
# <https://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVC.html>`__
# as our model to train:

from sklearn.svm import LinearSVC
from sklearn.preprocessing import StandardScaler

R_scaled = StandardScaler().fit_transform(R)  # Transform data to zero mean and unit variance

classifier = LinearSVC()
classifier.fit(R_scaled, classes)

##############################################################################
# Here, the term "linear" refers to the *kernel* function used to calculate
# inner products between vectors in the space. We can use a linear SVM because
# we have already embedded the graphs in a feature space based on GBS. We have
# also rescaled the feature vectors so that they have zero mean and unit
# variance using scikit-learn's ``StandardScaler``, a technique `often used
# <https://scikit-learn.org/stable/modules/preprocessing.html>`__ in machine
# learning.
#
# We can then visualize the trained SVM by plotting the decision boundary with
# respect to the points:

w = classifier.coef_[0]
i = classifier.intercept_[0]
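##############################################################################
# One way the boundary line can be drawn from ``w`` and ``i`` (a sketch; it
# assumes the feature vectors ``R_scaled`` are two-dimensional and that
# matplotlib is imported as ``plt``):

import numpy as np

xs = np.linspace(R_scaled[:, 0].min(), R_scaled[:, 0].max(), 100)
ys = -(w[0] * xs + i) / w[1]  # points where w . x + i == 0
plt.scatter(R_scaled[:, 0], R_scaled[:, 1], c=classes)
plt.plot(xs, ys, 'k--')
plt.show()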
                                                  random_state=42)

# Logistic Regression classifier
log_reg = LogisticRegression(random_state=0)
log_reg.fit(X_train, y_train)
y_pred = log_reg.predict(X_val)
log_accuracy = accuracy_score(y_val, y_pred)

# Naive Bayes classifier
nb = MultinomialNB()
nb.fit(X_train, y_train)
y_pred = nb.predict(X_val)
nb_accuracy = accuracy_score(y_val, y_pred)

# Support Vector classifier
lsvm = LinearSVC(random_state=0)
lsvm.fit(X_train, y_train)
y_pred = lsvm.predict(X_val)
lsvm_accuracy = accuracy_score(y_val, y_pred)

print("Accuracy benchmark:\nLogistic Regression: {}\nNaive Bayes: {}\nSupport Vector: {}"
      .format(log_accuracy, nb_accuracy, lsvm_accuracy))

plt.figure()
plt.bar([1, 2, 3], [log_accuracy, nb_accuracy, lsvm_accuracy])
plt.show()

# --------------
# path_test : Location of test data
#   |
# 0 | 0 0
#   |
# 1 | 0 1
#   |
print(type(y_data))

# 2. model
model = LinearSVC()  # specify the model to use

# 3. training
model.fit(x_data, y_data)

# 4. evaluation, prediction
x_test = [[0, 0], [1, 0], [0, 1], [1, 1]]
y_pred = model.predict(x_test)

# score = model.evaluate(prediction)
acc = accuracy_score([0, 0, 0, 1], y_pred)  # evaluate = score()
softmax_reg = LogisticRegression(multi_class="multinomial", solver="lbfgs", C=10)
softmax_reg.fit(X, y)
print(softmax_reg.predict([[5, 2]]))
print(softmax_reg.predict_proba([[5, 2]]))

# 3.1 linear SVM; feature-scale sensitive, so normalization is important.
# SVMs do not output probabilities.
from sklearn.svm import LinearSVC
from sklearn.preprocessing import StandardScaler

X = iris["data"][:, (2, 3)]
y = (iris["target"] == 2).astype(np.float64)  # Iris virginica
svm_clf = Pipeline([
    ("scaler", StandardScaler()),
    ("linear_svc", LinearSVC(C=1, loss="hinge")),
])
svm_clf.fit(X, y)
print(svm_clf.predict([[5.5, 1.7]]))

# 3.2 polynomial-kernel SVM
# set degree; the kernel trick doesn't actually add any high-degree features
from sklearn.svm import SVC
poly_kernel_svm_clf = Pipeline([("scaler", StandardScaler()),
                                ("svm_clf", SVC(kernel="poly", degree=3,
                                                coef0=1, C=5))])
poly_kernel_svm_clf.fit(X, y)

# 3.3 Gaussian RBF kernel: good for small training sets.
# gamma acts like a regularization knob: high gamma -> irregular boundary -> overfitting
rbf_kernel_svm_clf = Pipeline([("scaler", StandardScaler()),
def run(): """Run example for Doc-2-Vec method and IMDB dataset.""" log.info('START') data = { 'test-neg.txt': 'TEST_NEG', 'test-pos.txt': 'TEST_POS', 'train-neg.txt': 'TRAIN_NEG', 'train-pos.txt': 'TRAIN_POS', 'train-unsup.txt': 'TRAIN_UNS' } data = {join(IMDB_MERGED_PATH, k): v for k, v in data.iteritems()} sentences = Doc2VecGenerator(data) vector_size = 400 models_path = '/datasets/amazon-data/csv/models/doc2vec/' if not exists(models_path): makedirs(models_path) log.info('Directory: {} has been created'.format(models_path)) f_name = 'imdb-{}.d2v'.format(vector_size) f_model = join(models_path, f_name) log.info('Model Load or Save') if isfile(f_model): model = Doc2Vec.load(f_model) log.info('Model has been loaded from: {}'.format(f_model)) else: cores = multiprocessing.cpu_count() model = Doc2Vec(min_count=1, window=10, size=vector_size, sample=1e-4, negative=5, workers=cores) model.build_vocab(sentences.to_array()) log.info('Epochs') for epoch in range(10): log.info('EPOCH: #{}'.format(epoch)) model.train(sentences.sentences_perm()) model.save(f_model) log.info('Sentiment') train_arrays = numpy.zeros((25000, vector_size)) train_labels = numpy.zeros(25000) for i in range(12500): log.debug('TRAIN_{}'.format(i)) prefix_train_pos = 'TRAIN_POS_' + str(i) prefix_train_neg = 'TRAIN_NEG_' + str(i) train_arrays[i] = model.docvecs[prefix_train_pos] train_arrays[12500 + i] = model.docvecs[prefix_train_neg] train_labels[i] = 1 train_labels[12500 + i] = 0 test_arrays = numpy.zeros((25000, vector_size)) test_labels = numpy.zeros(25000) for i in range(12500): log.debug('TEST_{}'.format(i)) prefix_test_pos = 'TEST_POS_' + str(i) prefix_test_neg = 'TEST_NEG_' + str(i) test_arrays[i] = model.docvecs[prefix_test_pos] test_arrays[12500 + i] = model.docvecs[prefix_test_neg] test_labels[i] = 1 test_labels[12500 + i] = 0 log.info('Fitting') classifiers = { 'BernoulliNB': BernoulliNB(), 'GaussianNB': GaussianNB(), 'DecisionTreeClassifier': DecisionTreeClassifier(), 'AdaBoostClassifier': AdaBoostClassifier(), 'RandomForestClassifier': RandomForestClassifier(), 'LogisticRegression': LogisticRegression(), 'SVC': SVC(), 'LinearSVC': LinearSVC() } results = {} for classifier_name, classifier in classifiers.iteritems(): log.info('Clf: {}'.format(classifier_name)) classifier.fit(train_arrays, train_labels) # # LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True, # intercept_scaling=1, penalty='l2', random_state=None, # tol=0.0001) result = classifier.score(test_arrays, test_labels) log.info('Clf acc: {}'.format(result)) results[classifier_name] = result log.info(results) with open(models_path + 'results-{}'.format(f_name)) as res: pickle.dump(results, res)