# Evaluate an SVM with calibrated probabilities on a synthetic imbalanced dataset.
from numpy import mean
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.calibration import CalibratedClassifierCV
from sklearn.svm import SVC

# Synthetic two-feature binary problem with a 99:1 class imbalance.
X, y = make_classification(n_samples=10000, n_features=2, n_redundant=0,
                           n_clusters_per_class=1, weights=[0.99], flip_y=0,
                           random_state=4)

# Base RBF SVM wrapped in 3-fold isotonic calibration so its scores become
# usable probabilities.
base_svm = SVC(gamma='scale')
calibrated = CalibratedClassifierCV(base_svm, method='isotonic', cv=3)

# 10-fold stratified CV repeated 3 times, scored by ROC AUC.
evaluator = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
scores = cross_val_score(calibrated, X, y, scoring='roc_auc', cv=evaluator,
                         n_jobs=-1)
print('Mean ROC AUC: %.3f' % mean(scores))
def svm(train, labels, test, C=10, kernel='rbf', degree=3, gamma=0.5,
        calibration=0.0, calibrationmethod='sigmoid', coef0=0.0,
        probability=True, shrinking=True, tol=1e-3, verbose=0,
        outlier_frac=0.0, outlier_method='EE', rescale_pred=False,
        class_weight=None, sample_weight=None, rescale=True):
    """
    Train an (optionally calibrated) SVC on `train`/`labels` and predict
    class probabilities for `test` (one column per class).

    :param train: The training data, to train the model
    :param labels: The labels of the training data, an array
    :param test: feature matrix to predict probabilities for
    :param C: trades off misclassification of training examples against
        simplicity of the decision surface; low C makes the decision surface
        smooth, while a high C aims at classifying all training examples
        correctly
    :param gamma: defines how far the influence of a single training example
        reaches; low values mean 'far' and high values mean 'close'
    :param calibration: 0.0 = no calibration; > 1 = CV calibration with that
        many folds; in (0, 1] = fraction of rows held out for prefit
        calibration
    :param calibrationmethod: 'sigmoid' (Platt) or 'isotonic'
    :param verbose: See sklearn documentation
    :param rescale: both the training and testing data are taken square root
        of, rescaled to unit variance, and moved to interval [0,1]
    :return: probability matrix of shape (n_test_samples, n_classes)
    """
    if outlier_frac > 0:
        # remove outliers before fitting
        train, labels = filter_data(train, labels,
                                    cut_outlier_frac=outlier_frac,
                                    method=outlier_method)
    if isinstance(sample_weight, str):
        sample_weight = obtain_class_weights(labels, sample_weight)
    if rescale:
        # take square root, rescale variance to unit, rescale to [0,1];
        # with_mean=False preserves sparsity of the matrix
        train = sqrt(train)
        test = sqrt(test)
        scaler = StandardScaler(with_mean=False, with_std=True, copy=True)
        train = scaler.fit_transform(train)
        scaler = StandardScaler(with_mean=False, with_std=True, copy=True)
        test = scaler.fit_transform(test)
        scaler = MinMaxScaler()
        train = scaler.fit_transform(train)
        scaler = MinMaxScaler()
        test = scaler.fit_transform(test)
    model = SVC(C=C, kernel=kernel, degree=degree, gamma=gamma, coef0=coef0,
                probability=probability, shrinking=shrinking, tol=tol,
                verbose=verbose, class_weight=class_weight)
    if calibration == 0.0:
        model.fit(train, labels, sample_weight)
    elif calibration > 1:
        # BUG FIX: cv must be an integer fold count, not a float
        model = CalibratedClassifierCV(model, calibrationmethod,
                                       int(calibration))
        model.fit(train, labels, sample_weight)
    else:
        # hold out the last `calibration` fraction of rows for calibration
        N = len(labels)
        if sample_weight is None:
            sample_weight = ones(N)
        # BUG FIX: floor() returns a float; slice indices must be ints
        train_rows = int(floor((1.0 - calibration) * N))
        model.fit(train[:train_rows, :], labels[:train_rows],
                  sample_weight[:train_rows])
        model = CalibratedClassifierCV(model, calibrationmethod, "prefit")
        model.fit(train[train_rows:, :], labels[train_rows:],
                  sample_weight=sample_weight[train_rows:])
        # BUG FIX: removed stray `model.fit(train, labels, sample_weight)`
        # that refit the calibrated model on ALL rows, leaking the
        # calibration hold-out back into training.
    predictions = model.predict_proba(test)
    if rescale_pred:
        predictions = rescale_prior(predictions, bincount(labels))
    return predictions
from sklearn.calibration import CalibratedClassifierCV
import numpy as np

# Number of data splits to process (NOTE: shadows the builtin `iter`).
iter = 5
# Per-split best hyper-parameters found by a previous random search.
# NOTE(review): assumes `pickle`, `normalize` and `LinearSVC` are imported
# earlier in the file — confirm.
with open('ListOfBestParamsRS.pkl', 'rb') as f:
    best_params = pickle.load(f)
pathd = "C://Users//Arushi//PycharmProjects//ThesisChap2//Dataset//"
path = "C://Users//Arushi//PycharmProjects//ThesisChap2//fixedBuckets(10)//"
# Feature/column names: first line of the name file, comma separated.
genenamesFile = open(pathd + "transformedColumnNames221.txt",
                     'r').readline().rstrip('\n').split(',')
for i in range(iter):
    # Load the i-th training split and its labels.
    X_train = np.load(path + 'final_train_binarydata_' + str(i) + '.npy')
    Y_train = np.load(path + 'final_train_labels_' + str(i) + '.npy')
    bp = best_params[i]
    X_train = X_train.astype('float')
    X_train = normalize(X_train)
    # Cast labels via float first, then to int.
    Y_train = Y_train.astype('float')
    Y_train = Y_train.astype(int)
    # Linear SVM with the tuned C, wrapped in 4-fold sigmoid (Platt)
    # calibration so the saved model exposes predict_proba.
    clf = LinearSVC(C=bp['C'], max_iter=10000, tol=1e-4)
    clf_sigmoid = CalibratedClassifierCV(clf, cv=4, method='sigmoid').fit(
        X_train, Y_train.ravel())
    # Persist the calibrated model for this split.
    with open('Model_ism' + str(i) + '.pkl', 'wb') as f:
        pickle.dump(clf_sigmoid, f)
y = np.hstack( (np.ones(len(car_features)), np.zeros(len(notcar_features)))) # Split up data into randomized training and test sets rand_state = np.random.randint(0, 100) X_train, X_test, y_train, y_test = train_test_split( scaled_X, y, test_size=0.2, random_state=rand_state) print('Using: ', orient, 'orientations', pix_per_cell, 'pixels per cell and ', cell_per_block, 'cells per block ', hist_bins, 'hist_bins => feature vector length: ', len(X_train[0])) # Use a linear SVC svc = LinearSVC() svc_model = CalibratedClassifierCV(svc) # Check the training time for the SVC t = time.time() svc_model.fit(X_train, y_train) t2 = time.time() print(' Seconds to train SVC: ', round(t2 - t, 2)) # Check the score of the SVC print(' Train Accuracy of SVC: ', round(svc_model.score(X_train, y_train), 4)) print(' Test Accuracy of SVC: ', round(svc_model.score(X_test, y_test), 4)) # Check the prediction time for a single sample
def TrainPerceptron(X_train, y_train):
    """Fit a Perceptron wrapped in probability calibration.

    Returns the fitted CalibratedClassifierCV, which exposes predict_proba
    (a bare Perceptron does not).
    """
    base_model = Perceptron(eta0=0.01, random_state=1, max_iter=100)
    calibrated_model = CalibratedClassifierCV(base_model)
    calibrated_model.fit(X_train, y_train)
    return calibrated_model
def train_SVC(data_vec, label):
    """Fit a linear SVM wrapped in probability calibration and return it."""
    linear_svm = LinearSVC()
    calibrated = CalibratedClassifierCV(linear_svm)
    calibrated.fit(data_vec, label)
    return calibrated
# split train and test x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, stratify=y) # x_train.shape,x_test.shape,y_train.shape,y_test.shape count_vect_tfidf = TfidfVectorizer() x_train_text_tfidf = count_vect_tfidf.fit_transform(x_train.text) # Creating a pickle file for the TFIDFVectorizer pickle.dump(count_vect_tfidf, open('cv-transform.pkl', 'wb')) print("victorizer dumped") # Model Building clf = SGDClassifier(class_weight='balanced', alpha=0.0001, penalty='l2', loss='log', random_state=42) # Fitting Logistic Regression to the Training set clf.fit(x_train_text_tfidf, y_train) sig_clf = CalibratedClassifierCV(clf, method="sigmoid") sig_clf.fit(x_train_text_tfidf, y_train) # Creating a pickle file for the Disaster-Tweet-LR-model filename = 'Disaster-Tweet-LR-model.pkl' pickle.dump(sig_clf, open(filename, 'wb')) print("model dumped")
"\n\n\nWhich label is cost sensitive type the class number to over sample it:" )) costval = int(input("\nWhat is the cost?(integer):")) #perfom the imbalancing actions for ampl_m in range(1, 6): print( "\n==================================================\n---------------------------------------New sampling method\n" ) x_train, x_test, y_train, y_test = class_imbal(df, 5, transformer, costclass, costval, ampl_m) ############################################## #it is required a base algorithm callibration before any cost sensitivity action upsampled = CalibratedClassifierCV(base_estimator=arclfs[0][0], method='sigmoid', cv=None) upsampled2 = CalibratedClassifierCV(base_estimator=arclfs[1][0], method='isotonic', cv=None) fclf = [ #[class_multi_label(x_train, y_train, 0, 0, 9), "Applying Multilabel k Nearest Neighbours", "SVM_KK - MLKnn"], [ class_multi_label(x_train, y_train, upsampled, arclfs[0][1], 1), "Applying binary relevance", "RFC - Binary Relevance" ], [ class_multi_label(x_train, y_train, upsampled, arclfs[0][1], 2), "Duplicates multi-label examples into examples with one label each", "RFC - Multi-label examples into examples with one label each" ],
from sklearn.preprocessing import MinMaxScaler
from sklearn.calibration import CalibratedClassifierCV
from sklearn.svm import SVC

# Load the feature CSV named by FEATURES_FILE; drop the header row, keep the
# first column as sample names, and convert the remainder to floats.
X = np.genfromtxt(os.environ['FEATURES_FILE'], delimiter=',', dtype=None,
                  encoding='utf-8')
X = np.delete(X, (0), axis=0)
names = X[:, 0]
X = np.delete(X, (0), axis=1).astype(float)
# Scale every feature into [0, 1].
scaler = MinMaxScaler()
scaler.fit(X)
X = scaler.transform(X)
size = len(X)
# Targets: same layout, drop the header row.
y = np.genfromtxt(os.environ['TARGETS_FILE'], delimiter=',', dtype=None,
                  encoding='utf-8')
y = np.delete(y, (0), axis=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
# X_train, X_cv, y_train, y_cv = train_test_split(X, y, test_size=0.5)
# SVC wrapped in sigmoid (Platt) calibration so it exposes predict_proba.
regr = SVC()
regr_cal = CalibratedClassifierCV(regr, method='sigmoid')
regr_cal.fit(X_train, y_train)
# Persist the calibrated model to the path named by MODEL_FILE.
dump(regr_cal, os.environ['MODEL_FILE'])
print('Train score =', regr_cal.score(X_train, y_train))
print('Test score =', regr_cal.score(X_test, y_test))
from sklearn.calibration import CalibratedClassifierCV
import joblib
from src.load_data import LoadData
from src.prep_data import PrepData

# Set the name of the file to load, and bring in Loader and Data Prep
loader = LoadData()
prep = PrepData()
df = loader.load_traindata_to_df()

# Set target and get features
y = df['Class']
X = prep.drop_target_column(df)

# Create training and testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=680)

# Create a Support Vector Machine model and calibrate it with 10-fold CV.
# BUG FIX: removed the redundant `model.fit(X_train, y_train)` — with an
# integer `cv`, CalibratedClassifierCV clones and refits the estimator
# itself, so the pre-fit was wasted work whose result was never used.
model = SVC(kernel='linear')
calibrator = CalibratedClassifierCV(model, cv=10)
calibrator.fit(X_train, y_train)

# Export the model
joblib.dump(calibrator, 'models/cal_model.pkl')
def main(cfg: DictConfig) -> None:
    """Train a linear classifier per the hydra config, optionally calibrate
    or bag it, and log parameters/metrics to MLflow.

    Branching on the configured loss:
      * HingeLoss     -> fit, then Platt (sigmoid) calibration on a held-out half
      * 'isotonic'    -> fit, then isotonic calibration on a held-out half
      * 'bagging'     -> BalancedBaggingRegressor around the classifier
      * anything else -> plain fit
    """
    x, y = get_data(cfg)
    assert_binary_data(x, y)
    x_tr, x_te, y_tr, y_te = train_test_split(
        x, y, test_size=cfg.evaluation.test_ratio,
        random_state=_seed(cfg.seed))
    mlflow.set_tracking_uri(hydra.utils.get_original_cwd() + '/mlruns')
    experiment_id = get_mlflow_experiment_id(cfg.name)
    with mlflow.start_run(experiment_id=experiment_id):
        positive_ratio = len(y[y == 1]) / len(y)
        loss = get_loss_class(cfg)
        # Stop when the training loss stops improving (relative threshold).
        stopping_criterion = EarlyStopping(
            monitor='train_loss',
            lower_is_better=True,
            patience=min(10, cfg.training.max_epochs),
            threshold=cfg.training.tol,
            threshold_mode='rel',
            sink=logger.info,
        )
        # skorch-style wrapper around a single linear module.
        clf = Classifier(
            module=LinearClassifier,
            module__n_features=x.shape[1],
            max_epochs=cfg.training.max_epochs,
            criterion=loss,
            predict_nonlinearity=get_loss_link(cfg),
            optimizer=torch.optim.Adam,
            iterator_train__batch_size=cfg.training.batch_size,
            iterator_train__shuffle=True,
            train_split=False,
            callbacks=[('stopping_criterion', stopping_criterion)],
            verbose=cfg.verbose,
        )
        if check_if_weight(cfg):
            # Up-weight the positive class inversely to its prevalence.
            pos_weight = torch.FloatTensor([1 / positive_ratio - 1])
            clf.set_params(criterion__pos_weight=pos_weight)
        if check_if_validation(cfg):
            params = get_validation_params(cfg)
            # Hinge loss yields no probabilities, so score by accuracy there;
            # otherwise use the (negative) Brier score.
            clf = GridSearchCV(
                clf,
                params,
                refit=True,
                cv=cfg.evaluation.n_cv,
                scoring='accuracy' if loss is HingeLoss
                else negative_brier_score,
                n_jobs=-1,
            )
        else:
            clf.set_params(
                lr=cfg.training.lr,
                optimizer__weight_decay=cfg.training.regularization,
            )
        if check_if_gev_loss(loss):
            clf.set_params(criterion__xi=cfg.loss.xi)
        mlflow.log_param('dataset', cfg.dataset)
        mlflow.log_param('dataset.positive_ratio', positive_ratio)
        mlflow.log_param('loss', cfg.loss.name)
        mlflow.log_param('lr', cfg.training.lr)
        mlflow.log_param('max_epochs', cfg.training.max_epochs)
        mlflow.log_param('tol', cfg.training.tol)
        mlflow.log_param('regularization', cfg.training.regularization)
        mlflow.log_param('seed', _seed(cfg.seed))
        if check_if_gev_loss(loss):
            mlflow.log_param('xi', cfg.loss.xi)
        if loss is HingeLoss:
            # Step 1: fit linear model with hinge loss
            # NOTE(review): this re-splits the FULL data, discarding the
            # earlier x_tr/x_te split for training — confirm intended.
            x_tr, x_vl, y_tr, y_vl = train_test_split(x, y, test_size=0.5)
            clf.fit(x_tr, y_tr)
            # Step 2: calibrate linear model with Platt's scaling
            clf = CalibratedClassifierCV(clf, method='sigmoid', cv='prefit')
            clf.fit(x_vl, y_vl)
        elif cfg.loss.name == "isotonic":
            # Step 1: fit linear model with logistic regression (AUC maximization)
            x_tr, x_vl, y_tr, y_vl = train_test_split(x, y, test_size=0.5)
            clf.fit(x_tr, y_tr)
            # Step 2: calibrate linear model with isotonic regression
            clf = CalibratedClassifierCV(clf, method='isotonic', cv='prefit')
            clf.fit(x_vl, y_vl)
        elif cfg.loss.name == "bagging":
            clf = BalancedBaggingRegressor(
                clf,
                n_estimators=10,
                bootstrap=True,
                sampling_strategy='majority',
                n_jobs=1,
            )
            clf.fit(x_tr, y_tr)
        else:
            clf.fit(x_tr, y_tr)
        y_pred = clf.predict(x_te)
        y_prob = clf.predict_proba(x_te)[:, 1]
        log_metric('brier_score', brier_score_loss(y_te, y_prob))
Y_test = np.delete(Y_use, index, axis=0) ##Group CV groups = data_xy.iloc[:, -2] gkf = GroupKFold(n_splits=7) for train_index, test_index in gkf.split(X, Y, groups): X_train = X[train_index] X_validate = X[test_index] Y_train = Y[train_index] Y_validate = Y[test_index] print(X_train.shape) print(Y_train.shape) ##Establish Model model_logi = LogisticRegression().fit(X=X_train, y=Y_train, sample_weight=None) model_ccv = CalibratedClassifierCV().fit(X_train, Y_train) model_gbc = GradientBoostingClassifier().fit(X_train, Y_train) model_rfc = RandomForestClassifier(max_depth=2, random_state=0).fit(X_train, Y_train) model_abc = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1), algorithm="SAMME.R", n_estimators=100).fit(X_train, Y_train) ##Calculate logloss on test set Y_validate_pre = model_gbc.predict_proba(X_validate)[:, 1] Y_test_pre = model_gbc.predict_proba(X_test)[:, 1] # Y_validate_pre[np.where(Y_validate_pre < 0.15)]=0.01 # Y_validate_pre[np.where((Y_validate_pre > 0.2)&(Y_validate_pre < 0.4))]=0.4 # Y_validate_pre[np.where(Y_validate_pre > 0.9)]=0.99 loos1 = metrics.log_loss(Y_validate, Y_validate_pre)
def report_log_loss(train_x, train_y, test_x, test_y, clf):
    """Fit `clf`, sigmoid-calibrate it on the same training data, and return
    the log loss of the calibrated probabilities on the test data."""
    clf.fit(train_x, train_y)
    calibrated = CalibratedClassifierCV(clf, method="sigmoid")
    calibrated.fit(train_x, train_y)
    test_probs = calibrated.predict_proba(test_x)
    return log_loss(test_y, test_probs, eps=1e-15)
# is not what explains a performance difference between no calibration, and calibration.
# Uncalibrated baseline: bagging on top of a random forest.
clf = RandomForestClassifier(n_estimators=50, n_jobs=-1)
clfbag = BaggingClassifier(clf, n_estimators=5)
clfbag.fit(Xtrain, ytrain)
ypreds = clfbag.predict_proba(Xtest)[:, 1]
print("loss WITHOUT calibration : ",
      log_loss(ytest, ypreds, eps=1e-15, normalize=True))
print("auc WITHOUT calibration : ", roc_auc_score(ytest, ypreds))

# Now, we train and apply a Random Forest WITH calibration
# In our case, 'isotonic' worked better than default 'sigmoid'
# This is not always the case. Depending of the case, you have to test the two possibilities
clf = RandomForestClassifier(n_estimators=50, n_jobs=-1)
calibrated_clf = CalibratedClassifierCV(clf, method='isotonic', cv=5)
#calibrated_clf = CalibratedClassifierCV(clf, method='sigmoid', cv=5)
calibrated_clf.fit(Xtrain, ytrain)
ypreds = calibrated_clf.predict_proba(Xtest)[:, 1]
# log loss drops while AUC rises (translated from the original comment)
print("loss WITH calibration : ",
      log_loss(ytest, ypreds, eps=1e-15, normalize=True))
# BUG FIX: this line printed the CALIBRATED AUC under the label "WITHOUT
# calibration", making the two printouts indistinguishable.
print("auc WITH calibration : ", roc_auc_score(ytest, ypreds))
print(" ")
print(
    "Conclusion : in our case, calibration improved performance a lot ! (reduced loss)"
)
# We can see that we highly improved performance with calibration (loss is reduced) !
# Using calibration helped our team a lot to climb the leaderboard.
# In the future competitions, that's for sure, I will not forget to test this trick !
# Apply TF-IDF weighting to the training document-term matrix.
tr_dtmat = tfidf.fit_transform(tr_dtmat)
print("Done.\n")

if PRINT_FEATURES:
    # Report the vocabulary entries kept by the feature selector `sel`.
    feature_list = cv.get_feature_names()
    feature_map = sel.get_support()
    features = [i for i, j in zip(feature_list, feature_map) if j == True]
    print("Features: " + ','.join(features))

# Train Classifier
print("Training classifier\n---------")
if REGULARIZATION == 'l2':
    clf = LinearSVC(penalty='l2')
else:
    clf = LinearSVC()
# Calibration wrapper gives the linear SVM a predict_proba method.
clf = CalibratedClassifierCV(clf)
clf.fit(tr_dtmat, tr_y_)
print("Done.\n")

if RUN_VALIDATION_SET:
    print("Running validation set\n---------")
    # Transform the validation set with the SAME vectorizer, selector and
    # TF-IDF transformer fitted on the training data.
    va_dtmat = cv.transform(pva_x)
    va_dtmat = sel.transform(va_dtmat)
    if TF_IDF:
        va_dtmat = tfidf.transform(va_dtmat)
    predicted = clf.predict(va_dtmat)
    # Create probability mask
    probs = clf.predict_proba(va_dtmat)
    confidences = []
    prob_mask = []
def create_classifier():
    """Build an untrained 3-fold calibrated linear SVM (C=0.1)."""
    linear_svm = LinearSVC(C=0.1)
    return CalibratedClassifierCV(linear_svm, cv=3)
# print full_data
# Gaussian-NB baseline pipeline: standardize -> 6-component PCA -> bagged NB.
pipe_gauss = Pipeline([
    ('scale', StandardScaler()),
    ('pca', decomposition.PCA(n_components=6, whiten=False)),
    ('clf', BaggingClassifier(GaussianNB()))
])
# Main pipeline: calibrated gradient boosting on the PCA features.
pipe = Pipeline([
    ('scale', StandardScaler()),
    ('pca', decomposition.PCA(n_components=6, whiten=False)),
    ('clf', CalibratedClassifierCV(
        GradientBoostingClassifier(n_estimators=300, max_features=1.0,
                                   max_depth=6, learning_rate=0.05,
                                   min_samples_leaf=150)))
])
# Alternative: isotonic-calibrated bagged gradient boosting (no PCA).
pipe_ccv = Pipeline([
    ('scale', StandardScaler()),
    ('clf', CalibratedClassifierCV(
        BaggingClassifier(GradientBoostingClassifier(), n_jobs=-1,
                          verbose=True),
        method='isotonic', cv=5))
])
# Grid over the gradient-boosting hyper-parameters inside the calibrator.
gb_grid_params = {
    'clf__base_estimator__learning_rate': [0.1, 0.05, 0.02, 0.01],
    'clf__base_estimator__max_depth': [4, 6, 8],
    'clf__base_estimator__min_samples_leaf': [20, 50, 100, 150],
    'clf__base_estimator__max_features': [1.0, 0.3, 0.1]
}
# BUG FIX: KFold(n_splits=10, random_state=2) without shuffle=True raises
# ValueError in modern scikit-learn (random_state only applies when
# shuffling) and was silently ignored before — the folds were never
# shuffled, so dropping the argument preserves the original behaviour.
cv = KFold(n_splits=10)
def calibrate(self, X, y):
    """Wrap the already-fitted self.model in isotonic calibration fitted on
    (X, y), replace self.model with the wrapper, and return self."""
    calibrated = CalibratedClassifierCV(self.model, method='isotonic',
                                        cv='prefit')
    calibrated.fit(X, y)
    self.model = calibrated
    return self
"logistic_ucb", "logistic_egreedy", ]: kwargs["epsilon"] = 0.01 policy = counterfactual_policy_dict[counterfactual_policy](**kwargs) # compared OPE estimators ope_estimators = [ DirectMethod(), InverseProbabilityWeighting(), SelfNormalizedInverseProbabilityWeighting(), DoublyRobust(), SelfNormalizedDoublyRobust(), SwitchDoublyRobust(), ] # a base ML model for regression model used in Direct Method and Doubly Robust base_model = CalibratedClassifierCV(RandomForest(**hyperparams)) evaluation_of_ope_results = { est.estimator_name: np.zeros(n_runs) for est in ope_estimators } for i in np.arange(n_runs): # sample a new set of logged bandit feedback bandit_feedback = dataset.obtain_batch_bandit_feedback( n_rounds=n_rounds) # run a counterfactual bandit algorithm on logged bandit feedback data selected_actions = run_bandit_simulation( bandit_feedback=bandit_feedback, policy=policy) # estimate the ground-truth policy values of the counterfactual policy # using the full expected reward contained in the bandit feedback dictionary ground_truth_policy_value = bandit_feedback["expected_reward"][
def search(self, queryPath, limit=100): # initialize our dictionary of results No_of_visual_words = int(math.pow(k, l - 1)) results = {} # open the index file for reading f1 = open("centroids.csv") r1 = csv.reader(f1) it1 = 0 it2 = 0 for row in r1: it3 = 0 for j in range(128): centroids[it1][it2][it3] = float(row[j]) it3 = it3 + 1 #j=j+1 it2 = (it2 + 1) % k if (it2 % k == 0): it1 = it1 + 1 f1.close() im_features = [] with open(self.indexPath) as f: # initialize the CSV reader reader = csv.reader(f) #word_count=np.zeros(No_of_visual_words) #words, distance = vq(queryFeatures,dictionary) #for w in words: # word_count[w] += 1 #print(word_count) #x=input() # loop over the rows in the index row_count = 0 label_count = 0 image_name = [] labels = np.zeros(5292) for row in reader: if (row_count == 0): idf = np.zeros(No_of_visual_words) for j in range(No_of_visual_words + 1): if j == 0: continue idf[j - 1] = float(row[j]) #print(No_of_visual_words) #for w in range(No_of_visual_words): # word_count[w]=(word_count[w]/len(words))*idf[w]; #print(word_count) #x=input() row_count = row_count + 1 continue features = np.zeros(No_of_visual_words) for j in range(No_of_visual_words + 1): if j == 0: continue features[j - 1] = float(row[j]) im_features.append(features) #print(row[0]) #print(features) #print(word_count) temp = row[0].partition('/')[-1].rpartition('/')[0] if (row_count != 1 and temp != prev): label_count = label_count + 1 prev = temp labels[row_count - 1] = label_count image_name.append(row[0]) row_count = row_count + 1 #x=input() #sum1=0 #for i in range(No_of_visual_words): # sum1=sum1+(features[i])*(features[i]) #sum2=0 #for i in range(No_of_visual_words): # sum2=sum2+(word_count[i])*(word_count[i]) #d=np.dot(features,word_count)/(math.sqrt(sum1)*math.sqrt(sum2)) #results[row[0]] = d im_features = np.array(im_features) # close the reader f.close() ''' c=0 for p in glob.glob("Dataset"+'/'+"cup_noodles_shrimp_picante" + "/*.jpg"): if(c==0): image = cv2.imread(p) gray = 
cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) sift = cv2.xfeatures2d.SIFT_create() kp, dsc= sift.detectAndCompute(gray, None) word_count=np.zeros(No_of_visual_words) words, distance = vq(dsc,dictionary) for w in words: word_count[w] += 1 print(word_count) print(len(words)) print(idf) c=c+1 ''' svc = LinearSVC() clf = CalibratedClassifierCV(svc, cv=10) clf.fit(im_features, labels) print(len(im_features)) print(len(labels)) directory = os.listdir(queryPath) #print(directory) all_query_features = [] folder_cnt = 0 test_labels = [] #for d in directory: for p in glob.glob(queryPath + "/*.jpg"): image = cv2.imread(p) gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) sift = cv2.xfeatures2d.SIFT_create() kp, dsc = sift.detectAndCompute(gray, None) print(p) word_count = np.zeros(No_of_visual_words) for j in range(len(dsc)): c = go1(np.array(dsc[j]), 0) word_count[c] += 1 #for j in range(No_of_visual_words): # print(word_count[j]) #print(len(dsc)) #x=input() for w in range(No_of_visual_words): word_count[w] = word_count[w] * idf[w] #print(word_count[w]) #print(word_count) #x=input() x = comparison(word_count, im_features) #print(type(x)) name = str(p).split(".")[0] + ".txt" name = name.split("/")[1] #print(name) #x=input() file = open(name, "w") #r=csv.reader(file) for i in range(len(x)): category = image_name[x[i]].split("/")[1] im_No = image_name[x[i]].split("/")[2] #print(str(image_name[x[i]])) file.write(im_No + " ") file.write(category) file.write("\n") file.close() ''' proba = clf.predict([word_count]) print(proba) print(image_name[int(proba*63)]) #all_query_features.append(word_count) #test_labels.append(folder_cnt) #print(p) #folder_cnt+=1 #test_labels=np.array(test_labels) #np.array(all_query_features) ''' ''' svc = LinearSVC() clf = CalibratedClassifierCV(svc, cv=10) clf.fit(im_features, labels) proba = clf.predict(all_query_features) print("check") print(im_features[0]) print(all_query_features[0]) ''' ''' #print(image_name[int(proba*72)]) ''' #print(proba) #print(labels) 
'''
# split the data into training and test data X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33) # Scale the variables to have 0 mean and unit variance scaler = StandardScaler() X_train = scaler.fit_transform(X_train) X_test = scaler.transform(X_test) # Split the data into training and DSEL for DS techniques X_train, X_dsel, y_train, y_dsel = train_test_split(X_train, y_train, test_size=0.5) # Considering a pool composed of 10 base classifiers # Calibrating Perceptrons to estimate probabilities model = CalibratedClassifierCV(Perceptron(max_iter=100)) # Train a pool of 10 classifiers pool_classifiers = BaggingClassifier(model, n_estimators=100) pool_classifiers.fit(X_train, y_train) # Initialize the DS techniques knorau = KNORAU(pool_classifiers) kne = KNORAE(pool_classifiers) desp = DESP(pool_classifiers) ola = OLA(pool_classifiers) mcb = MCB(pool_classifiers) apriori = APriori(pool_classifiers) meta = METADES(pool_classifiers) # Fit the des techniques
from sklearn.ensemble import RandomForestClassifier

# Uncalibrated baseline: fit on the combined train+valid subset.
clf = RandomForestClassifier(n_estimators=25)
clf.fit(X_train_valid, y_train_valid)

# %%
# To train the calibrated classifier, we start with the same
# :class:`~sklearn.ensemble.RandomForestClassifier` but train it using only
# the train data subset (600 samples) then calibrate, with `method='sigmoid'`,
# using the valid data subset (400 samples) in a 2-stage process.
from sklearn.calibration import CalibratedClassifierCV

clf = RandomForestClassifier(n_estimators=25)
clf.fit(X_train, y_train)
# cv="prefit" reuses the already-fitted forest; only the sigmoid mapping is
# learned on the validation subset.
cal_clf = CalibratedClassifierCV(clf, method="sigmoid", cv="prefit")
cal_clf.fit(X_valid, y_valid)

# %%
# Compare probabilities
# ---------------------
# Below we plot a 2-simplex with arrows showing the change in predicted
# probabilities of the test samples.
import matplotlib.pyplot as plt

plt.figure(figsize=(10, 10))
colors = ["r", "g", "b"]

clf_probs = clf.predict_proba(X_test)
cal_clf_probs = cal_clf.predict_proba(X_test)
# Load the pre-split feature/label CSVs (first column is the index).
test_features = pd.read_csv("../CSV Files/test_features.csv", index_col=0)
test_labels = pd.read_csv("../CSV Files/test_labels.csv", index_col=0)
train_features = pd.read_csv("../CSV Files/train_features.csv", index_col=0)
train_labels = pd.read_csv("../CSV Files/train_labels.csv", index_col=0)

# USERID is treated as a categorical identifier — coerce to string.
train_features["USERID"] = [str(i) for i in train_features["USERID"]]
test_features["USERID"] = [str(i) for i in test_features["USERID"]]

# Begin Training: load the cached model if it exists, otherwise train and
# cache it.
if os.path.isfile('saved_model.pkl'):
    classifier = joblib.load('saved_model.pkl')
else:
    # Training: MLP wrapped in 3-fold isotonic calibration.
    classifier = neural_network.MLPClassifier(verbose=True, max_iter=200,
                                              hidden_layer_sizes=(10, 5))
    classifier = CalibratedClassifierCV(classifier, cv=3, method="isotonic")
    classifier.fit(train_features, train_labels)
    joblib.dump(classifier, 'saved_model.pkl')  # Save Model
# BUG FIX: removed the unconditional `classifier.fit(train_features,
# train_labels)` that followed this branch — it retrained the model even
# when it had just been loaded from disk, defeating the cache entirely.

# Testing: Place probabilities in a DF
test_predict_results_proba = classifier.predict_proba(test_features)
test_predict_results_proba = pd.DataFrame.from_records(
    test_predict_results_proba)
test_accuracy = classifier.score(test_features, test_labels)
print("Test Accuracy: ", test_accuracy)
print(test_predict_results_proba)
# LIME-style inspection: zero two vocabulary features and compare the
# logistic-regression probability before/after.
print('Original prediction:',
      lr.predict_proba(cv_test_features[idx])[0, 1])
tmp = cv_test_features[idx].copy()
tmp[0, cv.vocabulary_['excellent']] = 0
tmp[0, cv.vocabulary_['see']] = 0
print('Prediction removing some features:', lr.predict_proba(tmp)[0, 1])
print('Difference:',
      lr.predict_proba(tmp)[0, 1] -
      lr.predict_proba(cv_test_features[idx])[0, 1])
fig = exp.as_pyplot_figure()
exp.show_in_notebook(text=True)

## SVM
from sklearn.calibration import CalibratedClassifierCV

# cv='prefit': `svm` is assumed already fitted; only the sigmoid mapping is
# learned here so the pipeline gains predict_proba.
calibrator = CalibratedClassifierCV(svm, cv='prefit')
svm2 = calibrator.fit(cv_train_features, train_sentiments)
c2 = make_pipeline(cv, svm2)
print(c2.predict_proba([norm_test_reviews[0]]))
idx = 200
exp = explainer.explain_instance(norm_test_reviews[idx], c2.predict_proba,
                                 num_features=6)
print('Document id: %d' % idx)
print('Probability(negative) =',
      c2.predict_proba([norm_test_reviews[idx]])[0, 1])
print('True class: %s' % test_sentiments[idx])
exp.as_list()
text_data = [overall_data[i][0] for i in range(len(overall_data))] label = [overall_data[i][1] for i in range(len(overall_data))] X_train, X_test, y_train, y_test = train_test_split(text_data, label, test_size=0.2, random_state=0) ########### # Bagging # ########### base1 = LogisticRegression(solver='lbfgs', multi_class='auto', random_state=0) base2 = MultinomialNB(random_state=0) base3 = CalibratedClassifierCV(RidgeClassifier(solver='sparse_cg'), cv=5, random_state=0) base4 = SGDClassifier(loss='log', random_state=0) #base5 = RandomForestClassifier(n_estimators=10, max_depth=2,random_state=0) # 10회 수행, 9개의 bags bagging_accuracy = [] bg_clf1 = BaggingClassifier(base_estimator=base1, n_estimators=9) bg_clf2 = BaggingClassifier(base_estimator=base2, n_estimators=9) bg_clf3 = BaggingClassifier(base_estimator=base3, n_estimators=9) bg_clf4 = BaggingClassifier(base_estimator=base4, n_estimators=9) #bg_clf5=BaggingClassifier(base_estimator=base5, n_estimators=9) for clf, label in zip([bg_clf1, bg_clf2, bg_clf3, bg_clf4], [ 'Logistic Regression', 'Multinomial NB', 'RidgeClassifier',
def getFitness(individual, X, y):
    """
    Feature subset fitness function.

    :param individual: 0/1 mask over X's columns; columns marked 0 are dropped
    :param X: pandas DataFrame of candidate features
    :param y: target labels
    :return: 1-tuple with the mean 5-fold cross-validated score of a
        classifier on the selected columns, or (0,) for an all-zero mask
    """
    if individual.count(0) != len(individual):
        # get index with value 0
        cols = [index for index in range(len(individual))
                if individual[index] == 0]
        # get features subset
        X_parsed = X.drop(X.columns[cols], axis=1)
        X_subset = pd.get_dummies(X_parsed)
        # apply classification algorithm
        # BUG FIX: the original reassigned `clf` to ~40 different estimators
        # in a row, so only the last assignment survived — and that last one
        # (VotingClassifier()) has no estimators and fails on fit. Use one
        # concrete classifier; swap it here if another algorithm is wanted.
        clf = LogisticRegression()
        # BUG FIX: removed the redundant clf.fit(X_subset, y) —
        # cross_val_score clones and fits the estimator itself, so the
        # pre-fit was never used.
        return (avg(cross_val_score(clf, X_subset, y, cv=5)),)
    else:
        # All features masked out: no information, minimal fitness.
        return (0,)
print(log_loss(Y[:50400], KNC3[:50400, :])) print(log_loss(Y[50400:], KNC3[50400:, :])) print(KNC3.score(X[:50400, :], Y[:50400])) print(KNC3.score(X[50400:, :], Y[50400:])) A = list(KNC3.kneighbors(X, 6))[0].tolist() dist = pd.DataFrame(A).reset_index() dist['sum2'] = dist[[0, 1]].sum(axis=1) dist['sum4'] = dist[[0, 1, 2, 3]].sum(axis=1) dist['sum6'] = dist[[0, 1, 2, 3, 4, 5]].sum(axis=1) SVM = svm.SVC(kernel='rbf', probability=True, class_weight='balanced') bagg_SVM = Bag(base_estimator=SVM, n_estimators=200, bootstrap='true', max_samples=1000) bagg_SVM_isotonic = CalibratedClassifierCV(bagg_SVM, cv=2, method='isotonic') bagg_SVM_isotonic.fit(X[:50400, :], Y[:50400]) prob_SVM = bagg_SVM_isotonic.predict_proba(X) print(log_loss(Y[:50400], prob_SVM[:50400, :])) print(log_loss(Y[50400:], prob_SVM[50400:, :])) GNB = gnb() bagg_GNB = Bag(base_estimator=GNB, n_estimators=50, bootstrap='true', max_samples=500) bagg_GNB_isotonic = CalibratedClassifierCV(bagg_GNB, cv=2, method='isotonic') bagg_GNB_isotonic.fit(X[:70000, :], Y[:70000]) prob_GNB = bagg_GNB_isotonic.predict_proba(X) GBC = GradientBoostingClassifier(n_estimators=100,
plt.legend(fontsize = 14)
plt.title('Precision-Recall plot', fontsize = 18)
fig = plt.gcf()
fig.set_size_inches(10, 8)
fig.savefig(base_dir+'/AUCPr.png')

# Reliability diagram: calibration curve on top, score histogram below.
sns.set_style("darkgrid", {'xtick.bottom': True, 'xtick.top': False,
                           'ytick.left': True, 'ytick.right': False,})
plt.figure(figsize=(10, 10))
ax1 = plt.subplot2grid((3, 1), (0, 0), rowspan=2)
ax2 = plt.subplot2grid((3, 1), (2, 0))
ax1.plot([0, 1], [0, 1], "k:", label="Perfectly calibrated")
# Isotonic recalibration using the model's positive-class score as the
# single input feature. NOTE(review): fit and evaluated on the same
# (prob_pos, y_test) data, so the plotted curve is in-sample — confirm.
clf = CalibratedClassifierCV(model, cv = 2, method='isotonic')
clf.fit(prob_pos[:, 1].reshape(-1, 1), y_test)
prob_pos_test = clf.predict_proba(prob_pos[:, 1].reshape(-1, 1))[:, 1]
fraction_of_positives, mean_predicted_value = \
    calibration_curve(y_test, prob_pos_test, n_bins=5, normalize=True)
ax1.plot(mean_predicted_value, fraction_of_positives, "--",
         label="%s" % ('LR'))
ax2.hist(prob_pos_test, range=(0, 1), bins=10, label='LR', histtype="step",
         lw=2)
ax1.set_ylabel("Fraction of positives", fontsize=16)
ax1.set_ylim([-0.02, 1.02])
ax1.set_xlim([-0.02, 1.02])
ax1.legend(loc='upper left', facecolor='white', fontsize = 12)
# main(): command-line entry point of a paragraph-classification pipeline.
# Visible flow: parse args -> build candidate classifier/vectorizer lists
# (a full set and a "fast" subset selected via --fast) -> pick one of each
# by class name from args -> parse training files into labeled paragraphs
# through numbered, dumpable phases (1: parse, 2: drop interstitials,
# 3: resolve target classes) -> 70/30 shuffle-split into learn/test frames
# -> either benchmark all classifier/vectorizer pairs, or train (or load
# via joblib) a single vectorizer+classifier -> optionally dump the models
# -> label the evaluation files (phase 4) and print annotated output.
# NOTE(review): the original indentation was lost when this file was
# collapsed onto single physical lines; the code is kept byte-identical
# below and only comments are added. Nesting of the phase/if blocks must
# be recovered from the original file before this can run.
# --- classifier/vectorizer candidate lists (full set, then HashingVectorizer
# --- and the slower classifiers commented out of the "fast" variants) ---
def main(): args = define_args() Paragraph.set_reinterpretations(args.reinterpret) if args.dump_files: print('\ntraining_files:', args.training_files) print('\nevaluate_files:', args.evaluate_files) classifiers = [ BernoulliNB(), RandomForestClassifier(n_estimators=100, n_jobs=-1), AdaBoostClassifier(), BaggingClassifier(), ExtraTreesClassifier(), GradientBoostingClassifier(), DecisionTreeClassifier(), CalibratedClassifierCV(), DummyClassifier(), PassiveAggressiveClassifier(max_iter=5, tol=None), RidgeClassifier(), RidgeClassifierCV(), SGDClassifier(max_iter=5, tol=-np.infty), OneVsRestClassifier(SVC(kernel='linear')), OneVsRestClassifier(LogisticRegression()), KNeighborsClassifier() ] vectorizers = [ CountVectorizer(), TfidfVectorizer(), HashingVectorizer() ] fast_classifiers = [ BernoulliNB(), RandomForestClassifier(n_estimators=100, n_jobs=-1), AdaBoostClassifier(), # BaggingClassifier(), ExtraTreesClassifier(), GradientBoostingClassifier(), DecisionTreeClassifier(), CalibratedClassifierCV(), DummyClassifier(), PassiveAggressiveClassifier(max_iter=5, tol=None), RidgeClassifier(), # RidgeClassifierCV(), SGDClassifier(max_iter=5, tol=-np.infty), # OneVsRestClassifier(SVC(kernel='linear')), OneVsRestClassifier(LogisticRegression()), # KNeighborsClassifier() # Actually not slow, but we run out of memory. 
] fast_vectorizers = [ CountVectorizer(), TfidfVectorizer(), # HashingVectorizer() ] if args.fast: classifiers = fast_classifiers vectorizers = fast_vectorizers try: i = [c.__class__.__name__ for c in classifiers].index(args.classifier) except ValueError: raise ValueError('Unknown classifier %s' % args.classifier) classifier = classifiers[i] try: i = [v.__class__.__name__ for v in vectorizers].index(args.vectorizer) except ValueError: raise ValueError('Unknown vectorizer %s' % args.vectorizer) vectorizer = vectorizers[i] if args.training_files: contents = read_files(args.training_files) if args.annotated_paragraphs: phase1 = parse_annotated(contents) else: phase1 = parse_paragraphs(contents) if 1 in args.dump_phase: print('Phase 1') print('=======') phase1 = list(phase1) print(repr(phase1)) if 1 == max(args.dump_phase): sys.exit(0) if args.keep_interstitials: phase2 = phase1 else: phase2 = remove_interstitials(phase1) phase1 = None # Potentially recover memory. if 2 in args.dump_phase: print('Phase 2') print('=======') phase2 = list(phase2) print(repr(phase2)) if 2 == max(args.dump_phase): sys.exit(0) # All labels need to be resolved for this phase. The easiest way # to assure this is to convert to list. 
# --- phase 3: resolve target classes, optional dumps, then 70/30 split ---
# NOTE(review): np.random.seed(SEED) pins the permutation, so the
# learn/test split is reproducible across runs.
phase3 = target_classes( list(phase2), default=Label('Misc-exposition'), keep=[Label(l) for l in args.labels] ) if args.dump_input: phase3 = list(phase3) if args.output_annotated: if not args.output_labels: print('\n'.join([pp.as_annotated() for pp in phase3])) else: print('\n'.join([pp.as_annotated() for pp in phase3 if pp.top_label() in args.output_labels])) else: print('\n'.join([str(pp) for pp in phase3])) phase2 = None if 3 in args.dump_phase: print('Phase 3') print('=======') phase3 = list(phase3) print(repr(phase3)) if 3 == max(args.dump_phase): sys.exit(0) phase3 = list(phase3) sample_size = len(phase3) if args.group_paragraphs: writer = csv.DictWriter(sys.stdout, fieldnames=Taxon.FIELDNAMES) writer.writeheader() for taxon in group_paragraphs(phase3): for d in taxon.dictionaries(): writer.writerow(d) sys.exit(0) np.random.seed(SEED) cutoff = int(sample_size * 0.70) permutation = np.random.permutation(phase3) phase3 = None learn = paragraph.to_dataframe(permutation[:cutoff], args.suppress_text) test = paragraph.to_dataframe(permutation[cutoff:], args.suppress_text) if args.test_classifiers: perform( classifiers, vectorizers, learn, test ) sys.exit(0) if args.test_classifiers_by_label: perform_confusion_matrix( classifiers, vectorizers, learn, test, emit_csv=args.csv ) sys.exit(0) # train or load models if args.load_vectorizer: vectorizer = joblib.load(args.load_vectorizer) classifier = joblib.load(args.load_classifier) else: vectorize_text = vectorizer.fit_transform(learn.v2) classifier.fit(vectorize_text, learn.v1) # Dump trained models. 
# --- persist models, then phase 4: predict labels for evaluation files;
# --- paragraphs containing nomenclature can be force-labeled 'Nomenclature'
if args.dump_vectorizer: joblib.dump(vectorizer, args.dump_vectorizer) if args.dump_classifier: joblib.dump(classifier, args.dump_classifier) if args.evaluate_files: phase4 = [] # predict if args.keep_interstitials: evaluated = ( parse_paragraphs(read_files(args.evaluate_files))) else: evaluated = remove_interstitials( parse_paragraphs(read_files(args.evaluate_files))) for pp in evaluated: text = str(pp) vectorize_text = vectorizer.transform([text]) predict = classifier.predict(vectorize_text)[0] if args.insert_nomenclature and pp.contains_nomenclature(): predict = 'Nomenclature' phase4.append(pp.replace_labels(labels=[Label(predict)])) if args.output_annotated: if not args.output_labels: print('\n'.join([pp.as_annotated() for pp in phase4])) else: print('\n'.join([pp.as_annotated() for pp in phase4 if pp.top_label() in args.output_labels])) if 4 in args.dump_phase: print('Phase 4') print('=======') print(repr(phase4)) if 4 == max(args.dump_phase): sys.exit(0)
# al(): active-learning (FOIT) experiment over EEG emotion datasets
# ('seed3'/'seed4') in cross-subject / cross-session / cross-all settings.
# Visible flow, per iteration:
#   1) pick calibration (cd_*) and unlabeled/test (ud_*) data, shuffle with
#      random_state=0, normalize; reservoir data is normalized to the
#      calibration data's min/max range via utils.norm_with_range.
#   2) baseline: LinearSVC (max_iter=30000) wrapped in 5-fold
#      CalibratedClassifierCV, fit on cd data; score and fit time recorded.
#   3) for each of `rounds`: score reservoir samples by the dot product of
#      their one-hot label with the predicted probability vector, take the
#      batch_size *lowest*-confidence samples, move them from the reservoir
#      into the labeled pool (L_S_*), reshuffle, and refit the classifier;
#      record updated score and cumulative time.
#   4) average accuracy/std/time across iterations per round and print.
# Parameters: dataset_name ('seed4'|'seed3'), FOIT_type
# ('cross-all'|'cross-subject'|'cross-session'), rounds (update rounds),
# batch_size (samples queried per round). Returns nothing; prints results.
# NOTE(review): the indentation of this function was lost when the file
# was collapsed onto single physical lines; the code is kept byte-identical
# below (statements continue across the physical lines) and only these
# leading comments are added.
# NOTE(review): the outer round index `for i in range(rounds)` is shadowed
# by the inner `for i in ...` loops over samples — the round counter is
# unused afterwards, but the shadowing is fragile; confirm intended.
# NOTE(review): `else: pass` before the np.vstack pair means the selected
# batch is stacked onto L_S_data on every round regardless of that branch.
# NOTE(review): `time_updated_time` is measured from `since` (baseline fit
# start), so recorded times are cumulative, not per-round — confirm.
def al(dataset_name='seed4', FOIT_type='cross-all', rounds=10, batch_size=50): data, label = utils.load_source_data(dataset_name=dataset_name, FOIT_type=FOIT_type) _, number_label, _ = utils.get_number_of_label_n_trial(dataset_name) # data, label = utils.load_session_data_label(dataset_name, 0) # as unlabelled data cd_count = 16 if dataset_name == 'seed4' else 9 if dataset_name == 'seed3' else print( 'Wrong dataset_name') iteration_number = 3 if FOIT_type == 'cross-subject' else 15 accs = [([]) for i in range(iteration_number)] times = [([]) for i in range(iteration_number)] for ite in range(iteration_number): session_id = -1 sub_id = -1 if FOIT_type == 'cross-subject': session_id = ite sub_id = 14 elif FOIT_type == 'cross-session': session_id = 2 sub_id = ite elif FOIT_type == 'cross-all': session_id = 1 sub_id = ite else: print('Wrong FOIT type!') # print("Ite: ", ite) cd_data, cd_label, ud_data, ud_label = utils.pick_one_data( dataset_name, session_id=session_id, cd_count=cd_count, sub_id=sub_id) cd_data, cd_label = shuffle(cd_data, cd_label, random_state=0) ud_data, ud_label = shuffle(ud_data, ud_label, random_state=0) cd_data_min, cd_data_max = np.min(cd_data), np.max(cd_data) cd_data = utils.normalization(cd_data) # labelled data ud_data = utils.normalization(ud_data) # test data if FOIT_type == 'cross-all': data_ite, label_ite = data.copy(), label.copy() for i in range(len(data)): data_ite[i], label_ite[i] = shuffle(data_ite[i], label_ite[i], random_state=0) # data_ite, label_ite = shuffle(data, label, random_state=0) for i in range(len(data)): data_ite[i] = utils.norm_with_range(data_ite[i], cd_data_min, cd_data_max) # data_ite = utils.normalization(data_ite) elif FOIT_type == 'cross-session': data_ite, label_ite = data[ite], label[ite] for i in range(len(data_ite)): data_ite[i], label_ite[i] = shuffle(data_ite[i], label_ite[i], random_state=0) # data_ite[i] = utils.normalization(data_ite[i]) data_ite[i] = utils.norm_with_range(data_ite[i], cd_data_min, 
cd_data_max) # data_ite = utils.normalization(data_ite) else: data_ite, label_ite = data[ite], label[ite] for i in range(len(data_ite)): data_ite[i], label_ite[i] = shuffle(data_ite[i], label_ite[i], random_state=0) # data_ite, label_ite = shuffle(data_ite, label_ite, random_state=0) for i in range(len(data_ite)): # data_ite[i] = utils.normalization(data_ite[i]) data_ite[i] = utils.norm_with_range(data_ite[i], cd_data_min, cd_data_max) # data_ite, label_ite = data.copy(), label.copy() # for i in range(len(data)): # data_ite[i], label_ite[i] = shuffle(data_ite[i], label_ite[i], random_state=0) # for i in range(len(data)): # data_ite[i] = utils.norm_with_range(data_ite[i], cd_data_min, cd_data_max) # baseline clf = svm.LinearSVC(max_iter=30000) clf = CalibratedClassifierCV(clf, cv=5) since = time.time() clf.fit(cd_data, cd_label.squeeze()) time_baseline = time.time() - since scoreA = utils.test(clf, ud_data, ud_label.squeeze()) accs[ite].append(scoreA) times[ite].append(time_baseline) # select the data from the reservoir iteratively s_data_all, s_label_all = utils.stack_list(data_ite, label_ite) L_S_data = None L_S_label = None for i in range(rounds): # print("Rounds: ", i) # print(type(s_data_all)) # print(s_data_all.shape) s_data_all_predict_proba = clf.predict_proba(s_data_all) s_label_all_proba = utils.get_one_hot(s_label_all.squeeze(), number_label) confidence = np.zeros((s_label_all_proba.shape[0], 1)) for i in range(s_label_all_proba.shape[0]): confidence[i] = s_label_all_proba[i].dot( s_data_all_predict_proba[i].T) # confidence[i] = log_loss(s_label_all_proba[i], s_data_all_predict_proba[i]) indices = np.argsort(confidence, axis=0) # take the minimum topK indices topK_indices = indices[:batch_size] S_data = None S_label = None for i in topK_indices: one_data = s_data_all[i] one_label = s_label_all[i] if S_data is not None: S_data = np.vstack((S_data, one_data)) S_label = np.vstack((S_label, one_label)) else: S_data = one_data S_label = one_label for i in 
range(len(s_data_all) - 1, -1, -1): if i in topK_indices: s_data_all = np.delete(s_data_all, i, axis=0) s_label_all = np.delete(s_label_all, i, axis=0) if L_S_data is None: L_S_data = cd_data.copy() L_S_label = cd_label.copy() else: pass L_S_data = np.vstack((L_S_data, S_data)) L_S_label = np.vstack((L_S_label, S_label)) L_S_data, L_S_label = shuffle(L_S_data, L_S_label, random_state=0) clf.fit(L_S_data, L_S_label.squeeze()) time_updated_time = time.time() - since times[ite].append(time_updated_time) scoreTMP = utils.test(clf, ud_data, ud_label.squeeze()) accs[ite].append(scoreTMP) ResultTime = [] ResultAcc = [] ResultStd = [] for i in range(rounds + 1): tmpTime = [] tmpAcc = [] for j in range(iteration_number): tmpTime.append(times[j][i]) tmpAcc.append(accs[j][i]) ResultTime.append(np.mean(tmpTime)) ResultAcc.append(np.mean(tmpAcc)) ResultStd.append(np.std(tmpAcc)) print("Time: ", ResultTime) print("Accs: ", ResultAcc) print("Stds: ", ResultStd)