Beispiel #1
0
# Evaluate an SVM with calibrated probabilities on an imbalanced dataset.
from numpy import mean
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.calibration import CalibratedClassifierCV
from sklearn.svm import SVC

# Synthetic two-feature dataset with a 99%/1% class split and no label noise.
features, target = make_classification(n_samples=10000,
                                       n_features=2,
                                       n_redundant=0,
                                       n_clusters_per_class=1,
                                       weights=[0.99],
                                       flip_y=0,
                                       random_state=4)

# Base SVM wrapped in isotonic calibration (3 internal folds).
calibrated_svm = CalibratedClassifierCV(SVC(gamma='scale'),
                                        method='isotonic',
                                        cv=3)

# 10-fold stratified cross-validation, repeated 3 times.
folds = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
auc_scores = cross_val_score(calibrated_svm, features, target,
                             scoring='roc_auc', cv=folds, n_jobs=-1)

# Report the average area under the ROC curve.
print('Mean ROC AUC: %.3f' % mean(auc_scores))
Beispiel #2
0
def svm(train,
        labels,
        test,
        C=10,
        kernel='rbf',
        degree=3,
        gamma=0.5,
        calibration=0.0,
        calibrationmethod='sigmoid',
        coef0=0.0,
        probability=True,
        shrinking=True,
        tol=1e-3,
        verbose=0,
        outlier_frac=0.0,
        outlier_method='EE',
        rescale_pred=False,
        class_weight=None,
        sample_weight=None,
        rescale=True):
    """
    Trains an SVC on a feature matrix and its ground-truth labels, then
    predicts the given test samples; output is one probability per class.

    :param train: The training data, to train the model
    :param labels: The labels of the training data, an array
    :param test: The test samples to predict
    :param C: trades off misclassification of training examples against simplicity of the decision surface;
              low C makes the decision surface smooth, while a high C aims at classifying all training examples correctly
    :param gamma: defines how far the influence of a single training example reaches;
                  low values mean 'far' and high values mean 'close'
    :param calibration: 0.0 = no calibration; an integer > 1 = number of CV folds
                        for CalibratedClassifierCV; a fraction in (0, 1) = share of
                        the data held out to calibrate a prefit model
    :param verbose: See sklearn documentation
    :param rescale: both the training and testing data are taken square root of, rescaled to unit variance, and moved to interval [0,1]
    :return: predicted class-probability matrix for ``test``
    """
    if outlier_frac > 0:
        # Drop the most extreme training points before fitting.
        train, labels = filter_data(train,
                                    labels,
                                    cut_outlier_frac=outlier_frac,
                                    method=outlier_method)  # remove outliers
    if isinstance(sample_weight, str):
        # A string names a weighting scheme rather than explicit weights.
        sample_weight = obtain_class_weights(labels, sample_weight)

    if rescale:  # take square root, rescale variance to unit, rescale to [0,1]
        # with_mean=False should preserve sparsity of the matrix
        train = sqrt(train)
        test = sqrt(test)
        scaler = StandardScaler(with_mean=False, with_std=True, copy=True)
        train = scaler.fit_transform(train)
        scaler = StandardScaler(with_mean=False, with_std=True, copy=True)
        test = scaler.fit_transform(test)
        scaler = MinMaxScaler()
        train = scaler.fit_transform(train)
        scaler = MinMaxScaler()
        test = scaler.fit_transform(test)

    model = SVC(C=C,
                kernel=kernel,
                degree=degree,
                gamma=gamma,
                coef0=coef0,
                probability=probability,
                shrinking=shrinking,
                tol=tol,
                verbose=verbose,
                class_weight=class_weight)

    if calibration == 0.0:
        # No calibration: fit the raw SVC once.
        model.fit(train, labels, sample_weight)
    elif calibration > 1:
        # Integer > 1: that many cross-validation folds for calibration.
        model = CalibratedClassifierCV(model, calibrationmethod, calibration)
        model.fit(train, labels, sample_weight)
    else:
        # Fraction in (0, 1): hold out that share of the data to calibrate a
        # prefit model.  The slice bound must be an int — floor() returns a
        # float, which is not a valid index.
        N = len(labels)
        if sample_weight is None:
            sample_weight = ones(N)
        train_rows = int(floor((1.0 - calibration) * N))
        model.fit(train[:train_rows, :], labels[:train_rows],
                  sample_weight[:train_rows])
        model = CalibratedClassifierCV(model, calibrationmethod, "prefit")
        model.fit(train[train_rows:, :],
                  labels[train_rows:],
                  sample_weight=sample_weight[train_rows:])

    # NOTE: the unconditional ``model.fit(train, labels, sample_weight)`` that
    # used to follow here was removed — it refit on the full data, and in the
    # hold-out branch it recalibrated the "prefit" wrapper on the training
    # rows as well, defeating the purpose of the split.

    predictions = model.predict_proba(test)

    if rescale_pred:
        # Rebalance predicted probabilities by the training class frequencies.
        predictions = rescale_prior(predictions, bincount(labels))
    return predictions
Beispiel #3
0
from sklearn.calibration import CalibratedClassifierCV
import numpy as np

# Number of train splits / models to build (was `iter`, which shadowed the
# Python builtin of the same name).
n_splits = 5

with open('ListOfBestParamsRS.pkl', 'rb') as f:
    best_params = pickle.load(f)

pathd = "C://Users//Arushi//PycharmProjects//ThesisChap2//Dataset//"

path = "C://Users//Arushi//PycharmProjects//ThesisChap2//fixedBuckets(10)//"

# Read the column names; the previous open(...).readline() chain leaked the
# file handle, so use a context manager instead.
with open(pathd + "transformedColumnNames221.txt", 'r') as name_file:
    genenamesFile = name_file.readline().rstrip('\n').split(',')

for i in range(n_splits):
    # Load the i-th pre-generated train split and its labels.
    X_train = np.load(path + 'final_train_binarydata_' + str(i) + '.npy')
    Y_train = np.load(path + 'final_train_labels_' + str(i) + '.npy')
    bp = best_params[i]

    X_train = X_train.astype('float')
    X_train = normalize(X_train)
    # Labels arrive as floats; cast via float then int as before.
    Y_train = Y_train.astype('float')
    Y_train = Y_train.astype(int)

    # LinearSVC with the split's tuned C, wrapped in sigmoid (Platt)
    # calibration so the saved model exposes predict_proba.
    clf = LinearSVC(C=bp['C'], max_iter=10000, tol=1e-4)
    clf_sigmoid = CalibratedClassifierCV(clf, cv=4, method='sigmoid').fit(
        X_train, Y_train.ravel())

    with open('Model_ism' + str(i) + '.pkl', 'wb') as f:
        pickle.dump(clf_sigmoid, f)
Beispiel #4
0
            y = np.hstack(
                (np.ones(len(car_features)), np.zeros(len(notcar_features))))

            # Split up data into randomized training and test sets
            rand_state = np.random.randint(0, 100)
            X_train, X_test, y_train, y_test = train_test_split(
                scaled_X, y, test_size=0.2, random_state=rand_state)

            print('Using: ', orient, 'orientations', pix_per_cell,
                  'pixels per cell and ', cell_per_block, 'cells per block ',
                  hist_bins, 'hist_bins => feature vector length: ',
                  len(X_train[0]))

            # Use a linear SVC
            svc = LinearSVC()
            svc_model = CalibratedClassifierCV(svc)

            # Check the training time for the SVC
            t = time.time()
            svc_model.fit(X_train, y_train)

            t2 = time.time()
            print('   Seconds to train SVC: ', round(t2 - t, 2))

            # Check the score of the SVC
            print('   Train Accuracy of SVC: ',
                  round(svc_model.score(X_train, y_train), 4))
            print('   Test  Accuracy of SVC:  ',
                  round(svc_model.score(X_test, y_test), 4))

            # Check the prediction time for a single sample
Beispiel #5
0
def TrainPerceptron(X_train,y_train):
    """Fit a Perceptron wrapped in probability calibration and return it."""
    base = Perceptron(eta0=0.01, random_state=1, max_iter=100)
    calibrated = CalibratedClassifierCV(base)
    calibrated.fit(X_train, y_train)
    return calibrated
def train_SVC(data_vec, label):
    """Train a LinearSVC with probability calibration on top and return it."""
    model = CalibratedClassifierCV(LinearSVC())
    model.fit(data_vec, label)
    return model
Beispiel #7
0
# Split train and test.
# NOTE(review): `x` and `y` are defined earlier in the file (not shown here).
x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=0.2,
                                                    stratify=y)
# x_train.shape,x_test.shape,y_train.shape,y_test.shape

# TF-IDF features are fit on the training text only.
count_vect_tfidf = TfidfVectorizer()
x_train_text_tfidf = count_vect_tfidf.fit_transform(x_train.text)

# Creating a pickle file for the TFIDFVectorizer
pickle.dump(count_vect_tfidf, open('cv-transform.pkl', 'wb'))
print("victorizer dumped")

# Model Building: SGD with log loss == logistic regression trained by SGD.
clf = SGDClassifier(class_weight='balanced',
                    alpha=0.0001,
                    penalty='l2',
                    loss='log',
                    random_state=42)

# Fitting Logistic Regression to the Training set.
# NOTE(review): with the default cv, CalibratedClassifierCV refits clones of
# `clf` internally, so the explicit fit below is redundant unless cv='prefit'
# was intended — confirm.
clf.fit(x_train_text_tfidf, y_train)
sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
sig_clf.fit(x_train_text_tfidf, y_train)

# Creating a pickle file for the Disaster-Tweet-LR-model
filename = 'Disaster-Tweet-LR-model.pkl'
pickle.dump(sig_clf, open(filename, 'wb'))
print("model dumped")
Beispiel #8
0
        "\n\n\nWhich label is cost sensitive type the class number to over sample it:"
    ))
costval = int(input("\nWhat is the cost?(integer):"))

#perfom the imbalancing actions
for ampl_m in range(1, 6):
    print(
        "\n==================================================\n---------------------------------------New sampling method\n"
    )
    x_train, x_test, y_train, y_test = class_imbal(df, 5, transformer,
                                                   costclass, costval, ampl_m)

    ##############################################
    #it is required a base algorithm callibration before any cost sensitivity action
    upsampled = CalibratedClassifierCV(base_estimator=arclfs[0][0],
                                       method='sigmoid',
                                       cv=None)
    upsampled2 = CalibratedClassifierCV(base_estimator=arclfs[1][0],
                                        method='isotonic',
                                        cv=None)
    fclf = [
        #[class_multi_label(x_train, y_train, 0, 0, 9), "Applying Multilabel k Nearest Neighbours", "SVM_KK - MLKnn"],
        [
            class_multi_label(x_train, y_train, upsampled, arclfs[0][1], 1),
            "Applying binary relevance", "RFC - Binary Relevance"
        ],
        [
            class_multi_label(x_train, y_train, upsampled, arclfs[0][1], 2),
            "Duplicates multi-label examples into examples with one label each",
            "RFC - Multi-label examples into examples with one label each"
        ],
from sklearn.preprocessing import MinMaxScaler
from sklearn.calibration import CalibratedClassifierCV
from sklearn.svm import SVC


# Load the feature table, drop the header row, peel off the name column.
X = np.genfromtxt(os.environ['FEATURES_FILE'], delimiter=',', dtype=None, encoding='utf-8')

X = np.delete(X, (0), axis=0)
names = X[:, 0]
X = np.delete(X, (0), axis=1).astype(float)

# Scale every feature into [0, 1].
scaler = MinMaxScaler()
X = scaler.fit_transform(X)

size = len(X)

# Targets: same file layout, drop the header row.
y = np.genfromtxt(os.environ['TARGETS_FILE'], delimiter=',', dtype=None, encoding='utf-8')
y = np.delete(y, (0), axis=0)

# 80/20 train/test split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# SVC wrapped in Platt (sigmoid) calibration so it exposes predict_proba.
regr = SVC()
regr_cal = CalibratedClassifierCV(regr,  method='sigmoid')
regr_cal.fit(X_train, y_train)

dump(regr_cal, os.environ['MODEL_FILE'])

print('Train score =', regr_cal.score(X_train, y_train))

print('Test score =', regr_cal.score(X_test, y_test))
Beispiel #10
0
from sklearn.calibration import CalibratedClassifierCV
import joblib

from src.load_data import LoadData
from src.prep_data import PrepData

# Set the name of the file to load, and bring in Loader and Data Prep
loader = LoadData()
prep = PrepData()

df = loader.load_traindata_to_df()

# Set target and get features
y = df['Class']
X = prep.drop_target_column(df)

# Create training and testing data
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=680)

# Calibrate a linear SVM with 10-fold cross-validation.  With an integer cv,
# CalibratedClassifierCV clones and fits the base estimator internally, so the
# separate model.fit() call that used to precede it trained a model that was
# never used; it has been removed.
model = SVC(kernel='linear')

calibrator = CalibratedClassifierCV(model, cv=10)
calibrator.fit(X_train, y_train)

# Export the model
joblib.dump(calibrator, 'models/cal_model.pkl')
def main(cfg: DictConfig) -> None:
    """Train a skorch linear classifier per the hydra config, optionally
    calibrate it, and log parameters and metrics to mlflow.

    The configured loss selects the scheme: HingeLoss -> fit then Platt
    scaling; "isotonic" -> fit then isotonic regression; "bagging" ->
    balanced bagging; anything else -> a plain fit.
    """
    x, y = get_data(cfg)
    assert_binary_data(x, y)

    # Seeded hold-out split for final evaluation.
    x_tr, x_te, y_tr, y_te = train_test_split(
        x,
        y,
        test_size=cfg.evaluation.test_ratio,
        random_state=_seed(cfg.seed))

    mlflow.set_tracking_uri(hydra.utils.get_original_cwd() + '/mlruns')

    experiment_id = get_mlflow_experiment_id(cfg.name)

    with mlflow.start_run(experiment_id=experiment_id):
        positive_ratio = len(y[y == 1]) / len(y)
        loss = get_loss_class(cfg)
        # Stop when the training loss stalls (relative threshold).
        stopping_criterion = EarlyStopping(
            monitor='train_loss',
            lower_is_better=True,
            patience=min(10, cfg.training.max_epochs),
            threshold=cfg.training.tol,
            threshold_mode='rel',
            sink=logger.info,
        )
        clf = Classifier(
            module=LinearClassifier,
            module__n_features=x.shape[1],
            max_epochs=cfg.training.max_epochs,
            criterion=loss,
            predict_nonlinearity=get_loss_link(cfg),
            optimizer=torch.optim.Adam,
            iterator_train__batch_size=cfg.training.batch_size,
            iterator_train__shuffle=True,
            train_split=False,
            callbacks=[('stopping_criterion', stopping_criterion)],
            verbose=cfg.verbose,
        )

        if check_if_weight(cfg):
            # Weight positives inversely to their prevalence.
            pos_weight = torch.FloatTensor([1 / positive_ratio - 1])
            clf.set_params(criterion__pos_weight=pos_weight)

        if check_if_validation(cfg):
            # Grid search: accuracy for hinge loss, Brier score otherwise.
            params = get_validation_params(cfg)
            clf = GridSearchCV(
                clf,
                params,
                refit=True,
                cv=cfg.evaluation.n_cv,
                scoring='accuracy'
                if loss is HingeLoss else negative_brier_score,
                n_jobs=-1,
            )
        else:
            clf.set_params(
                lr=cfg.training.lr,
                optimizer__weight_decay=cfg.training.regularization,
            )
            if check_if_gev_loss(loss):
                clf.set_params(criterion__xi=cfg.loss.xi)

        mlflow.log_param('dataset', cfg.dataset)
        mlflow.log_param('dataset.positive_ratio', positive_ratio)
        mlflow.log_param('loss', cfg.loss.name)
        mlflow.log_param('lr', cfg.training.lr)
        mlflow.log_param('max_epochs', cfg.training.max_epochs)
        mlflow.log_param('tol', cfg.training.tol)
        mlflow.log_param('regularization', cfg.training.regularization)
        mlflow.log_param('seed', _seed(cfg.seed))
        if check_if_gev_loss(loss):
            mlflow.log_param('xi', cfg.loss.xi)

        # NOTE(review): the calibration branches below re-split the FULL
        # (x, y), overwriting the earlier hold-out split, so x_te/y_te overlap
        # the calibration data in those branches — confirm this is intended.
        if loss is HingeLoss:
            # Step 1: fit linear model with hinge loss
            x_tr, x_vl, y_tr, y_vl = train_test_split(x, y, test_size=0.5)
            clf.fit(x_tr, y_tr)

            # Step 2: calibrate linear model with Platt's scaling
            clf = CalibratedClassifierCV(clf, method='sigmoid', cv='prefit')
            clf.fit(x_vl, y_vl)
        elif cfg.loss.name == "isotonic":
            # Step 1: fit linear model with logistic regression (AUC maximization)
            x_tr, x_vl, y_tr, y_vl = train_test_split(x, y, test_size=0.5)
            clf.fit(x_tr, y_tr)

            # Step 2: calibrate linear model with isotonic regression
            clf = CalibratedClassifierCV(clf, method='isotonic', cv='prefit')
            clf.fit(x_vl, y_vl)
        elif cfg.loss.name == "bagging":
            clf = BalancedBaggingRegressor(
                clf,
                n_estimators=10,
                bootstrap=True,
                sampling_strategy='majority',
                n_jobs=1,
            )
            clf.fit(x_tr, y_tr)
        else:
            clf.fit(x_tr, y_tr)

        y_pred = clf.predict(x_te)
        y_prob = clf.predict_proba(x_te)[:, 1]
        log_metric('brier_score', brier_score_loss(y_te, y_prob))
Beispiel #12
0
Y_test = np.delete(Y_use, index, axis=0)

##Group CV
# NOTE(review): this loop rebinds the split on every iteration, so only the
# LAST GroupKFold fold is kept for train/validate — confirm that is intended.
groups = data_xy.iloc[:, -2]
gkf = GroupKFold(n_splits=7)
for train_index, test_index in gkf.split(X, Y, groups):
    X_train = X[train_index]
    X_validate = X[test_index]
    Y_train = Y[train_index]
    Y_validate = Y[test_index]
print(X_train.shape)
print(Y_train.shape)

##Establish Model
# Five candidate classifiers, all fit on the same (last-fold) training data.
# NOTE(review): CalibratedClassifierCV() with no base estimator calibrates its
# default base estimator (LinearSVC in older sklearn) — verify.
model_logi = LogisticRegression().fit(X=X_train, y=Y_train, sample_weight=None)
model_ccv = CalibratedClassifierCV().fit(X_train, Y_train)
model_gbc = GradientBoostingClassifier().fit(X_train, Y_train)
model_rfc = RandomForestClassifier(max_depth=2,
                                   random_state=0).fit(X_train, Y_train)
model_abc = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),
                               algorithm="SAMME.R",
                               n_estimators=100).fit(X_train, Y_train)

##Calculate logloss on test set
# Only the gradient-boosting model is evaluated below.
Y_validate_pre = model_gbc.predict_proba(X_validate)[:, 1]
Y_test_pre = model_gbc.predict_proba(X_test)[:, 1]
# Y_validate_pre[np.where(Y_validate_pre < 0.15)]=0.01
# Y_validate_pre[np.where((Y_validate_pre > 0.2)&(Y_validate_pre < 0.4))]=0.4
# Y_validate_pre[np.where(Y_validate_pre > 0.9)]=0.99

loos1 = metrics.log_loss(Y_validate, Y_validate_pre)
Beispiel #13
0
def report_log_loss(train_x, train_y, test_x, test_y, clf):
    """Fit clf, sigmoid-calibrate it on the same data, and return the
    log loss of the calibrated probabilities on the test set."""
    clf.fit(train_x, train_y)
    calibrated = CalibratedClassifierCV(clf, method="sigmoid")
    calibrated.fit(train_x, train_y)
    return log_loss(test_y, calibrated.predict_proba(test_x), eps=1e-15)
Beispiel #14
0
# is not what explains a performance difference between no calibration, and calibration.

# First, a bagged random forest WITHOUT calibration.
clf = RandomForestClassifier(n_estimators=50, n_jobs=-1)
clfbag = BaggingClassifier(clf, n_estimators=5)  # bagging on top of the random forest
clfbag.fit(Xtrain, ytrain)
ypreds = clfbag.predict_proba(Xtest)[:, 1]
print("loss WITHOUT calibration : ",
      log_loss(ytest, ypreds, eps=1e-15, normalize=True))
print("auc WITHOUT calibration : ", roc_auc_score(ytest, ypreds))

# Now, we train and apply a Random Forest WITH calibration
# In our case, 'isotonic' worked better than default 'sigmoid'
# This is not always the case. Depending of the case, you have to test the two possibilities

clf = RandomForestClassifier(n_estimators=50, n_jobs=-1)
calibrated_clf = CalibratedClassifierCV(clf, method='isotonic', cv=5)
#calibrated_clf = CalibratedClassifierCV(clf, method='sigmoid', cv=5)
calibrated_clf.fit(Xtrain, ytrain)
ypreds = calibrated_clf.predict_proba(Xtest)[:, 1]
print("loss WITH calibration : ",
      log_loss(ytest, ypreds, eps=1e-15, normalize=True))  # calibration lowers log loss and raises AUC
# BUG FIX: this line previously printed "auc WITHOUT calibration" even though
# it reports the calibrated model's score.
print("auc WITH calibration : ", roc_auc_score(ytest, ypreds))

print(" ")
print(
    "Conclusion : in our case, calibration improved performance a lot ! (reduced loss)"
)
# We can see that we highly improved performance with calibration (loss is reduced) !
# Using calibration helped our team a lot to climb the leaderboard.
# In the future competitions, that's for sure, I will not forget to test this trick !
Beispiel #15
0
    tr_dtmat = tfidf.fit_transform(tr_dtmat)
    print("Done.\n")

# Optionally report which vocabulary features survived feature selection.
if PRINT_FEATURES:
    feature_list = cv.get_feature_names()
    feature_map = sel.get_support()
    features = [i for i, j in zip(feature_list, feature_map) if j == True]
    print("Features: " + ','.join(features))

# Train Classifier
print("Training classifier\n---------")
if REGULARIZATION == 'l2':
    clf = LinearSVC(penalty='l2')
else:
    clf = LinearSVC()
# Wrap the SVM so predict_proba is available for the probability mask below.
clf = CalibratedClassifierCV(clf)
clf.fit(tr_dtmat, tr_y_)
print("Done.\n")

if RUN_VALIDATION_SET:
    print("Running validation set\n---------")
    # Apply the same vectorizer / selector / tf-idf transforms to validation.
    va_dtmat = cv.transform(pva_x)
    va_dtmat = sel.transform(va_dtmat)
    if TF_IDF:
        va_dtmat = tfidf.transform(va_dtmat)
    predicted = clf.predict(va_dtmat)

    # Create probability mask
    probs = clf.predict_proba(va_dtmat)
    confidences = []
    prob_mask = []
 def create_classifier():
     """Return a LinearSVC (C=0.1) wrapped in 3-fold probability calibration."""
     return CalibratedClassifierCV(LinearSVC(C=0.1), cv=3)
Beispiel #17
0



# print full_data

 pipe_gauss = Pipeline([
     ('scale', StandardScaler()),
     ('pca', decomposition.PCA(n_components=6, whiten=False)),
     ('clf', BaggingClassifier(GaussianNB()))
])

pipe = Pipeline([
    ('scale', StandardScaler()),
    ('pca', decomposition.PCA(n_components=6, whiten=False)),
    ('clf', CalibratedClassifierCV(GradientBoostingClassifier(n_estimators=300, max_features=1.0, max_depth=6,
                                                              learning_rate=0.05, min_samples_leaf=150)))
])
pipe_ccv = Pipeline([
     ('scale', StandardScaler()),
     ('clf', CalibratedClassifierCV(BaggingClassifier(GradientBoostingClassifier(), n_jobs=-1, verbose=True),
                                    method='isotonic', cv=5))
 ])

gb_grid_params = {'clf__base_estimator__learning_rate': [0.1, 0.05, 0.02, 0.01],
             'clf__base_estimator__max_depth': [4, 6, 8],
             'clf__base_estimator__min_samples_leaf': [20, 50, 100, 150],
             'clf__base_estimator__max_features': [1.0, 0.3, 0.1]
             }

cv = KFold(n_splits=10, random_state=2)
Beispiel #18
0
 def calibrate(self, X, y):
     """Wrap the already-fitted self.model in isotonic calibration.

     Replaces self.model with the calibrated wrapper and returns self so
     calls can be chained.
     """
     wrapper = CalibratedClassifierCV(self.model, method='isotonic', cv='prefit')
     wrapper.fit(X, y)
     self.model = wrapper
     return self
Beispiel #19
0
            "logistic_ucb",
            "logistic_egreedy",
    ]:
        kwargs["epsilon"] = 0.01
    policy = counterfactual_policy_dict[counterfactual_policy](**kwargs)
    # compared OPE estimators
    ope_estimators = [
        DirectMethod(),
        InverseProbabilityWeighting(),
        SelfNormalizedInverseProbabilityWeighting(),
        DoublyRobust(),
        SelfNormalizedDoublyRobust(),
        SwitchDoublyRobust(),
    ]
    # a base ML model for regression model used in Direct Method and Doubly Robust
    base_model = CalibratedClassifierCV(RandomForest(**hyperparams))

    evaluation_of_ope_results = {
        est.estimator_name: np.zeros(n_runs)
        for est in ope_estimators
    }
    for i in np.arange(n_runs):
        # sample a new set of logged bandit feedback
        bandit_feedback = dataset.obtain_batch_bandit_feedback(
            n_rounds=n_rounds)
        # run a counterfactual bandit algorithm on logged bandit feedback data
        selected_actions = run_bandit_simulation(
            bandit_feedback=bandit_feedback, policy=policy)
        # estimate the ground-truth policy values of the counterfactual policy
        # using the full expected reward contained in the bandit feedback dictionary
        ground_truth_policy_value = bandit_feedback["expected_reward"][
    def search(self, queryPath, limit=100):
        # initialize our dictionary of results

        No_of_visual_words = int(math.pow(k, l - 1))

        results = {}
        # open the index file for reading

        f1 = open("centroids.csv")
        r1 = csv.reader(f1)

        it1 = 0
        it2 = 0
        for row in r1:
            it3 = 0

            for j in range(128):

                centroids[it1][it2][it3] = float(row[j])
                it3 = it3 + 1
                #j=j+1
            it2 = (it2 + 1) % k
            if (it2 % k == 0):
                it1 = it1 + 1

        f1.close()

        im_features = []
        with open(self.indexPath) as f:
            # initialize the CSV reader
            reader = csv.reader(f)
            #word_count=np.zeros(No_of_visual_words)
            #words, distance = vq(queryFeatures,dictionary)
            #for w in words:
            #	word_count[w] += 1
            #print(word_count)
            #x=input()

            # loop over the rows in the index
            row_count = 0
            label_count = 0
            image_name = []
            labels = np.zeros(5292)
            for row in reader:

                if (row_count == 0):
                    idf = np.zeros(No_of_visual_words)

                    for j in range(No_of_visual_words + 1):
                        if j == 0:
                            continue
                        idf[j - 1] = float(row[j])
                    #print(No_of_visual_words)
                    #for w in range(No_of_visual_words):
                    #	word_count[w]=(word_count[w]/len(words))*idf[w];

                    #print(word_count)
                    #x=input()
                    row_count = row_count + 1
                    continue

                features = np.zeros(No_of_visual_words)
                for j in range(No_of_visual_words + 1):
                    if j == 0:
                        continue

                    features[j - 1] = float(row[j])
                im_features.append(features)

                #print(row[0])
                #print(features)
                #print(word_count)

                temp = row[0].partition('/')[-1].rpartition('/')[0]
                if (row_count != 1 and temp != prev):
                    label_count = label_count + 1
                prev = temp
                labels[row_count - 1] = label_count
                image_name.append(row[0])
                row_count = row_count + 1

                #x=input()

                #sum1=0
                #for i in range(No_of_visual_words):
                #	sum1=sum1+(features[i])*(features[i])
                #sum2=0
                #for i in range(No_of_visual_words):
                #	sum2=sum2+(word_count[i])*(word_count[i])
                #d=np.dot(features,word_count)/(math.sqrt(sum1)*math.sqrt(sum2))

                #results[row[0]] = d

            im_features = np.array(im_features)
            # close the reader
            f.close()
        '''
		c=0
		for p in glob.glob("Dataset"+'/'+"cup_noodles_shrimp_picante" + "/*.jpg"):
			if(c==0):
				image = cv2.imread(p)
				gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
				sift = cv2.xfeatures2d.SIFT_create()
				kp, dsc= sift.detectAndCompute(gray, None)
				word_count=np.zeros(No_of_visual_words)
				words, distance = vq(dsc,dictionary)
				for w in words:
					word_count[w] += 1
				print(word_count)
				print(len(words))
				print(idf)
				c=c+1	

		'''
        svc = LinearSVC()
        clf = CalibratedClassifierCV(svc, cv=10)
        clf.fit(im_features, labels)
        print(len(im_features))
        print(len(labels))

        directory = os.listdir(queryPath)
        #print(directory)
        all_query_features = []
        folder_cnt = 0
        test_labels = []

        #for d in directory:

        for p in glob.glob(queryPath + "/*.jpg"):
            image = cv2.imread(p)
            gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
            sift = cv2.xfeatures2d.SIFT_create()
            kp, dsc = sift.detectAndCompute(gray, None)
            print(p)

            word_count = np.zeros(No_of_visual_words)
            for j in range(len(dsc)):
                c = go1(np.array(dsc[j]), 0)
                word_count[c] += 1

            #for j in range(No_of_visual_words):
            #	print(word_count[j])
            #print(len(dsc))
            #x=input()

            for w in range(No_of_visual_words):
                word_count[w] = word_count[w] * idf[w]
                #print(word_count[w])

            #print(word_count)
            #x=input()
            x = comparison(word_count, im_features)
            #print(type(x))
            name = str(p).split(".")[0] + ".txt"
            name = name.split("/")[1]
            #print(name)
            #x=input()

            file = open(name, "w")
            #r=csv.reader(file)
            for i in range(len(x)):
                category = image_name[x[i]].split("/")[1]
                im_No = image_name[x[i]].split("/")[2]
                #print(str(image_name[x[i]]))
                file.write(im_No + " ")
                file.write(category)

                file.write("\n")
            file.close()
            '''
			
			proba = clf.predict([word_count])
			print(proba)
			print(image_name[int(proba*63)])
			

			#all_query_features.append(word_count)
			#test_labels.append(folder_cnt)
				
			#print(p)
			#folder_cnt+=1
		#test_labels=np.array(test_labels)
		#np.array(all_query_features)
		'''
        '''
		svc = LinearSVC()
		clf = CalibratedClassifierCV(svc, cv=10)
		clf.fit(im_features, labels)
		proba = clf.predict(all_query_features)
		
		print("check")
		print(im_features[0])
		print(all_query_features[0])
		'''
        '''
		#print(image_name[int(proba*72)])
		'''
        #print(proba)
        #print(labels)
        '''
Beispiel #21
0
    # split the data into training and test data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

    # Scale the variables to have 0 mean and unit variance
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Split the data into training and DSEL for DS techniques
    X_train, X_dsel, y_train, y_dsel = train_test_split(X_train,
                                                        y_train,
                                                        test_size=0.5)
    # Considering a pool composed of 10 base classifiers

    # Calibrating Perceptrons to estimate probabilities
    model = CalibratedClassifierCV(Perceptron(max_iter=100))

    # Train a pool of 10 classifiers
    pool_classifiers = BaggingClassifier(model, n_estimators=100)
    pool_classifiers.fit(X_train, y_train)

    # Initialize the DS techniques
    knorau = KNORAU(pool_classifiers)
    kne = KNORAE(pool_classifiers)
    desp = DESP(pool_classifiers)
    ola = OLA(pool_classifiers)
    mcb = MCB(pool_classifiers)
    apriori = APriori(pool_classifiers)
    meta = METADES(pool_classifiers)

    # Fit the des techniques
Beispiel #22
0
from sklearn.ensemble import RandomForestClassifier

# Uncalibrated baseline: fit on the combined train+valid subset.
clf = RandomForestClassifier(n_estimators=25)
clf.fit(X_train_valid, y_train_valid)

# %%
# To train the calibrated classifier, we start with the same
# :class:`~sklearn.ensemble.RandomForestClassifier` but train it using only
# the train data subset (600 samples) then calibrate, with `method='sigmoid'`,
# using the valid data subset (400 samples) in a 2-stage process.

from sklearn.calibration import CalibratedClassifierCV

# Stage 1: fit the forest on the train subset only.
clf = RandomForestClassifier(n_estimators=25)
clf.fit(X_train, y_train)
# Stage 2: Platt-scale the prefit forest on the held-out valid subset.
cal_clf = CalibratedClassifierCV(clf, method="sigmoid", cv="prefit")
cal_clf.fit(X_valid, y_valid)

# %%
# Compare probabilities
# ---------------------
# Below we plot a 2-simplex with arrows showing the change in predicted
# probabilities of the test samples.

import matplotlib.pyplot as plt

plt.figure(figsize=(10, 10))
colors = ["r", "g", "b"]

# Probabilities before and after calibration, for the same test points.
clf_probs = clf.predict_proba(X_test)
cal_clf_probs = cal_clf.predict_proba(X_test)
Beispiel #23
0
test_features = pd.read_csv("../CSV Files/test_features.csv", index_col=0)
test_labels = pd.read_csv("../CSV Files/test_labels.csv", index_col=0)
train_features = pd.read_csv("../CSV Files/train_features.csv", index_col=0)
train_labels = pd.read_csv("../CSV Files/train_labels.csv", index_col=0)
# USERID is an identifier, treated as a categorical string rather than a number.
train_features["USERID"] = [str(i) for i in train_features["USERID"]]
test_features["USERID"] = [str(i) for i in test_features["USERID"]]

# Begin Training: Try to load the model and test data sets if the model already exists
if os.path.isfile('saved_model.pkl'):
    classifier = joblib.load('saved_model.pkl')
else:
    # Training: a small MLP wrapped in isotonic calibration (3-fold CV).
    classifier = neural_network.MLPClassifier(verbose=True,
                                              max_iter=200,
                                              hidden_layer_sizes=(10, 5))
    classifier = CalibratedClassifierCV(classifier, cv=3, method="isotonic")

    classifier.fit(train_features, train_labels)
    joblib.dump(classifier, 'saved_model.pkl')  # Save Model
    # BUG FIX: a second, identical classifier.fit(...) used to run here after
    # the dump, retraining the model redundantly; it has been removed so the
    # in-memory model matches the one saved to disk.

# Testing: Place probabilities in a DF
test_predict_results_proba = classifier.predict_proba(test_features)
test_predict_results_proba = pd.DataFrame.from_records(
    test_predict_results_proba)

test_accuracy = classifier.score(test_features, test_labels)

print("Test Accuracy: ", test_accuracy)
print(test_predict_results_proba)
print('Original prediction:', lr.predict_proba(cv_test_features[idx])[0,1])
tmp = cv_test_features[idx].copy()
tmp[0,cv.vocabulary_['excellent']] = 0
tmp[0,cv.vocabulary_['see']] = 0
print('Prediction removing some features:', lr.predict_proba(tmp)[0,1])
print('Difference:', lr.predict_proba(tmp)[0,1] - lr.predict_proba(cv_test_features[idx])[0,1])

fig = exp.as_pyplot_figure()

exp.show_in_notebook(text=True)

## SVM

# Wrap the already-fitted SVM so that it exposes calibrated probabilities.
from sklearn.calibration import CalibratedClassifierCV

calibrator = CalibratedClassifierCV(svm, cv='prefit')
svm2 = calibrator.fit(cv_train_features, train_sentiments)

# Chain vectorizer + calibrated SVM so that raw text can be scored directly.
c2 = make_pipeline(cv, svm2)
print(c2.predict_proba([norm_test_reviews[0]]))



# Explain a single test review with LIME.
idx = 200
exp = explainer.explain_instance(norm_test_reviews[idx],
                                 c2.predict_proba,
                                 num_features=6)
print('Document id: %d' % idx)
print('Probability(negative) =', c2.predict_proba([norm_test_reviews[idx]])[0, 1])

print('True class: %s' % test_sentiments[idx])

exp.as_list()
# Split (text, label) pairs into two parallel lists.
text_data = [pair[0] for pair in overall_data]
label = [pair[1] for pair in overall_data]

X_train, X_test, y_train, y_test = train_test_split(text_data,
                                                    label,
                                                    test_size=0.2,
                                                    random_state=0)

###########
# Bagging #
###########

# Base estimators for the bagging ensembles.
base1 = LogisticRegression(solver='lbfgs', multi_class='auto', random_state=0)
# BUGFIX: MultinomialNB is deterministic and has no random_state parameter;
# passing one raises TypeError at construction time.
base2 = MultinomialNB()
# BUGFIX: CalibratedClassifierCV has no random_state parameter either.
base3 = CalibratedClassifierCV(RidgeClassifier(solver='sparse_cg'),
                               cv=5)
base4 = SGDClassifier(loss='log', random_state=0)
#base5 = RandomForestClassifier(n_estimators=10, max_depth=2,random_state=0)

# 10 runs, 9 bags each
bagging_accuracy = []

bg_clf1 = BaggingClassifier(base_estimator=base1, n_estimators=9)
bg_clf2 = BaggingClassifier(base_estimator=base2, n_estimators=9)
bg_clf3 = BaggingClassifier(base_estimator=base3, n_estimators=9)
bg_clf4 = BaggingClassifier(base_estimator=base4, n_estimators=9)
#bg_clf5=BaggingClassifier(base_estimator=base5, n_estimators=9)
for clf, label in zip([bg_clf1, bg_clf2, bg_clf3, bg_clf4], [
        'Logistic Regression', 'Multinomial NB', 'RidgeClassifier',
Beispiel #26
0
def getFitness(individual, X, y):
    """
    Feature-subset fitness function for a genetic algorithm.

    Parameters:
        individual: sequence of 0/1 genes; gene ``i`` keeps (1) or drops (0)
            column ``i`` of ``X``.
        X: pandas DataFrame of candidate features.
        y: target labels aligned with ``X``.

    Returns:
        A 1-tuple holding the mean 5-fold cross-validation score of a
        classifier trained on the selected columns, or ``(0,)`` when the
        individual selects no feature at all.
    """
    # Guard clause: an all-zero individual selects nothing and scores 0.
    if individual.count(0) == len(individual):
        return (0,)

    # Column indices whose gene is 0, i.e. the columns to drop.
    cols = [index for index in range(len(individual)) if individual[index] == 0]

    # Keep the selected columns and one-hot encode any categoricals.
    X_parsed = X.drop(X.columns[cols], axis=1)
    X_subset = pd.get_dummies(X_parsed)

    # BUGFIX: the original chained ~35 dead `clf = ...` reassignments, so
    # only the final one took effect -- and that final VotingClassifier()
    # cannot even be constructed without an `estimators` argument, crashing
    # the function.  Use one concrete classifier; swap it here to evaluate
    # subsets with a different model.
    clf = LogisticRegression()
    clf.fit(X_subset, y)

    # BUGFIX: `avg` was undefined in the original; take the mean of the
    # cross-validation scores explicitly.
    scores = cross_val_score(clf, X_subset, y, cv=5)
    return (float(scores.mean()),)
Beispiel #27
0
# NOTE(review): KNC3 is indexed like a probability array here but also used
# as a fitted KNN model below (.score / .kneighbors) -- confirm upstream.
print(log_loss(Y[:50400], KNC3[:50400, :]))
print(log_loss(Y[50400:], KNC3[50400:, :]))
print(KNC3.score(X[:50400, :], Y[:50400]))
print(KNC3.score(X[50400:, :], Y[50400:]))
# Distances to the 6 nearest neighbours of every sample, plus running sums.
A = list(KNC3.kneighbors(X, 6))[0].tolist()
dist = pd.DataFrame(A).reset_index()
dist['sum2'] = dist[[0, 1]].sum(axis=1)
dist['sum4'] = dist[[0, 1, 2, 3]].sum(axis=1)
dist['sum6'] = dist[[0, 1, 2, 3, 4, 5]].sum(axis=1)

# Bagged RBF-SVM with isotonic probability calibration.
SVM = svm.SVC(kernel='rbf', probability=True, class_weight='balanced')
# BUGFIX: `bootstrap` expects a bool; the string 'true' only worked because
# any non-empty string is truthy and is rejected outright by recent
# scikit-learn parameter validation.
bagg_SVM = Bag(base_estimator=SVM,
               n_estimators=200,
               bootstrap=True,
               max_samples=1000)
bagg_SVM_isotonic = CalibratedClassifierCV(bagg_SVM, cv=2, method='isotonic')
bagg_SVM_isotonic.fit(X[:50400, :], Y[:50400])
prob_SVM = bagg_SVM_isotonic.predict_proba(X)
print(log_loss(Y[:50400], prob_SVM[:50400, :]))
print(log_loss(Y[50400:], prob_SVM[50400:, :]))

# Bagged Gaussian naive Bayes, calibrated the same way.
GNB = gnb()
bagg_GNB = Bag(base_estimator=GNB,
               n_estimators=50,
               bootstrap=True,
               max_samples=500)
bagg_GNB_isotonic = CalibratedClassifierCV(bagg_GNB, cv=2, method='isotonic')
bagg_GNB_isotonic.fit(X[:70000, :], Y[:70000])
prob_GNB = bagg_GNB_isotonic.predict_proba(X)

GBC = GradientBoostingClassifier(n_estimators=100,
# Finalise and save the precision-recall figure built earlier.
plt.legend(fontsize = 14)
plt.title('Precision-Recall plot', fontsize = 18)
fig = plt.gcf()
fig.set_size_inches(10, 8)
fig.savefig(base_dir+'/AUCPr.png')

# Show tick marks only on the bottom/left axes.
sns.set_style("darkgrid",{'xtick.bottom': True,'xtick.top': False,
                'ytick.left': True, 'ytick.right': False,})

# Reliability diagram (top two thirds of the grid) plus a histogram of the
# predicted probabilities (bottom third).
plt.figure(figsize=(10, 10))
ax1 = plt.subplot2grid((3, 1), (0, 0), rowspan=2)
ax2 = plt.subplot2grid((3, 1), (2, 0))

# Diagonal = perfectly calibrated reference line.
ax1.plot([0, 1], [0, 1], "k:", label="Perfectly calibrated")

# Isotonic recalibration of the model's positive-class scores.
# NOTE(review): the calibrator is fit on the same (prob_pos, y_test) pair it
# is then scored on, so the curve below is an in-sample estimate -- confirm
# that this is intentional.
clf = CalibratedClassifierCV(model, cv = 2, method='isotonic')
clf.fit(prob_pos[:,1].reshape(-1, 1), y_test)
prob_pos_test = clf.predict_proba(prob_pos[:,1].reshape(-1, 1))[:,1]
fraction_of_positives, mean_predicted_value = \
        calibration_curve(y_test, prob_pos_test, n_bins=5, normalize=True)

# Calibration curve of the recalibrated scores.
ax1.plot(mean_predicted_value, fraction_of_positives, "--",
             label="%s" % ('LR'))

# Distribution of the calibrated probabilities.
ax2.hist(prob_pos_test, range=(0, 1), bins=10, label='LR',
             histtype="step", lw=2)

ax1.set_ylabel("Fraction of positives", fontsize=16)
ax1.set_ylim([-0.02, 1.02])
ax1.set_xlim([-0.02, 1.02])
ax1.legend(loc='upper left', facecolor='white', fontsize = 12)
Beispiel #29
0
def main():
    """Train and/or evaluate paragraph classifiers driven by CLI arguments.

    Pipeline: parse args -> read training files -> phase 1 (parse
    paragraphs) -> phase 2 (drop interstitials) -> phase 3 (resolve target
    classes) -> train or load a vectorizer + classifier -> optionally label
    the evaluation files (phase 4).  The various --dump-phase / dump flags
    short-circuit the pipeline with sys.exit(0) for debugging.
    """
    args = define_args()

    Paragraph.set_reinterpretations(args.reinterpret)

    if args.dump_files:
        print('\ntraining_files:', args.training_files)
        print('\nevaluate_files:', args.evaluate_files)

    # Full pools of candidate classifiers and vectorizers; selection happens
    # below by matching the class name against args.classifier/vectorizer.
    classifiers = [
        BernoulliNB(),
        RandomForestClassifier(n_estimators=100, n_jobs=-1),
        AdaBoostClassifier(),
        BaggingClassifier(),
        ExtraTreesClassifier(),
        GradientBoostingClassifier(),
        DecisionTreeClassifier(),
        CalibratedClassifierCV(),
        DummyClassifier(),
        PassiveAggressiveClassifier(max_iter=5, tol=None),
        RidgeClassifier(),
        RidgeClassifierCV(),
        SGDClassifier(max_iter=5, tol=-np.infty),
        OneVsRestClassifier(SVC(kernel='linear')),
        OneVsRestClassifier(LogisticRegression()),
        KNeighborsClassifier()
    ]
    vectorizers = [
        CountVectorizer(),
        TfidfVectorizer(),
        HashingVectorizer()
    ]

    # Cheaper subsets used when --fast is given (slow entries commented out).
    fast_classifiers = [
        BernoulliNB(),
        RandomForestClassifier(n_estimators=100, n_jobs=-1),
        AdaBoostClassifier(),
        # BaggingClassifier(),
        ExtraTreesClassifier(),
        GradientBoostingClassifier(),
        DecisionTreeClassifier(),
        CalibratedClassifierCV(),
        DummyClassifier(),
        PassiveAggressiveClassifier(max_iter=5, tol=None),
        RidgeClassifier(),
        # RidgeClassifierCV(),
        SGDClassifier(max_iter=5, tol=-np.infty),
        # OneVsRestClassifier(SVC(kernel='linear')),
        OneVsRestClassifier(LogisticRegression()),
        # KNeighborsClassifier()  # Actually not slow, but we run out of memory.
    ]
    fast_vectorizers = [
        CountVectorizer(),
        TfidfVectorizer(),
        # HashingVectorizer()
    ]

    if args.fast:
        classifiers = fast_classifiers
        vectorizers = fast_vectorizers
    # Resolve the requested classifier/vectorizer by class name.
    try:
        i = [c.__class__.__name__ for c in classifiers].index(args.classifier)
    except ValueError:
        raise ValueError('Unknown classifier %s' % args.classifier)
    classifier = classifiers[i]

    try:
        i = [v.__class__.__name__ for v in vectorizers].index(args.vectorizer)
    except ValueError:
        raise ValueError('Unknown vectorizer %s' % args.vectorizer)
    vectorizer = vectorizers[i]

    if args.training_files:
        contents = read_files(args.training_files)

        # Phase 1: raw text -> Paragraph objects.
        if args.annotated_paragraphs:
            phase1 = parse_annotated(contents)
        else:
            phase1 = parse_paragraphs(contents)

        if 1 in args.dump_phase:
            print('Phase 1')
            print('=======')
            phase1 = list(phase1)
            print(repr(phase1))
            if 1 == max(args.dump_phase):
                sys.exit(0)

        # Phase 2: optionally drop interstitial paragraphs.
        if args.keep_interstitials:
            phase2 = phase1
        else:
            phase2 = remove_interstitials(phase1)
        phase1 = None  # Potentially recover memory.

        if 2 in args.dump_phase:
            print('Phase 2')
            print('=======')
            phase2 = list(phase2)
            print(repr(phase2))
            if 2 == max(args.dump_phase):
                sys.exit(0)

        # All labels need to be resolved for this phase. The easiest way
        # to assure this is to convert to list.
        phase3 = target_classes(
            list(phase2),
            default=Label('Misc-exposition'),
            keep=[Label(l) for l in args.labels]
        )

        if args.dump_input:
            phase3 = list(phase3)
            if args.output_annotated:
                if not args.output_labels:
                    print('\n'.join([pp.as_annotated() for pp in phase3]))
                else:
                    print('\n'.join([pp.as_annotated()
                                     for pp in phase3
                                     if pp.top_label() in args.output_labels]))
            else:
                print('\n'.join([str(pp) for pp in phase3]))

        phase2 = None

        if 3 in args.dump_phase:
            print('Phase 3')
            print('=======')
            phase3 = list(phase3)
            print(repr(phase3))
            if 3 == max(args.dump_phase):
                sys.exit(0)

        phase3 = list(phase3)
        sample_size = len(phase3)

        # --group-paragraphs: emit taxa as CSV and stop.
        if args.group_paragraphs:
            writer = csv.DictWriter(sys.stdout, fieldnames=Taxon.FIELDNAMES)
            writer.writeheader()
            for taxon in group_paragraphs(phase3):
                for d in taxon.dictionaries():
                    writer.writerow(d)
            sys.exit(0)

        # Deterministic 70/30 train/test split via a seeded permutation.
        np.random.seed(SEED)
        cutoff = int(sample_size * 0.70)
        permutation = np.random.permutation(phase3)
        phase3 = None
        learn = paragraph.to_dataframe(permutation[:cutoff], args.suppress_text)
        test = paragraph.to_dataframe(permutation[cutoff:], args.suppress_text)

        if args.test_classifiers:
            perform(
                classifiers,
                vectorizers,
                learn,
                test
            )
            sys.exit(0)

        if args.test_classifiers_by_label:
            perform_confusion_matrix(
                classifiers,
                vectorizers,
                learn,
                test,
                emit_csv=args.csv
            )
            sys.exit(0)

    # train or load models
    # NOTE(review): if no --load-vectorizer is given AND no training files
    # were supplied, `learn` is undefined here and fit_transform raises
    # NameError -- confirm the CLI forbids that combination.
    if args.load_vectorizer:
        vectorizer = joblib.load(args.load_vectorizer)
        classifier = joblib.load(args.load_classifier)
    else:
        vectorize_text = vectorizer.fit_transform(learn.v2)
        classifier.fit(vectorize_text, learn.v1)

    # Dump trained models.
    if args.dump_vectorizer:
        joblib.dump(vectorizer, args.dump_vectorizer)
    if args.dump_classifier:
        joblib.dump(classifier, args.dump_classifier)

    if args.evaluate_files:
        phase4 = []
        # predict
        if args.keep_interstitials:
            evaluated = (
                parse_paragraphs(read_files(args.evaluate_files)))
        else:
            evaluated = remove_interstitials(
                parse_paragraphs(read_files(args.evaluate_files)))
        # Label each evaluated paragraph with the classifier's prediction,
        # optionally overriding with 'Nomenclature' when requested.
        for pp in evaluated:
            text = str(pp)
            vectorize_text = vectorizer.transform([text])
            predict = classifier.predict(vectorize_text)[0]
            if args.insert_nomenclature and pp.contains_nomenclature():
                predict = 'Nomenclature'
            phase4.append(pp.replace_labels(labels=[Label(predict)]))


        if args.output_annotated:
            if not args.output_labels:
                print('\n'.join([pp.as_annotated() for pp in phase4]))
            else:
                print('\n'.join([pp.as_annotated()
                                 for pp in phase4
                                 if pp.top_label() in args.output_labels]))

    # NOTE(review): phase4 is only assigned inside the evaluate_files branch;
    # dumping phase 4 without --evaluate-files raises NameError -- confirm.
    if 4 in args.dump_phase:
        print('Phase 4')
        print('=======')
        print(repr(phase4))
        if 4 == max(args.dump_phase):
            sys.exit(0)
Beispiel #30
0
def al(dataset_name='seed4', FOIT_type='cross-all', rounds=10, batch_size=50):
    """Active-learning experiment: a calibrated LinearSVC baseline is grown
    round by round with the lowest-confidence samples from a source pool.

    :param dataset_name: 'seed4' or 'seed3' (controls label counts).
    :param FOIT_type: 'cross-subject', 'cross-session' or 'cross-all';
        decides how source data is loaded and how many iterations run.
    :param rounds: number of active-learning selection rounds per iteration.
    :param batch_size: number of samples pulled from the pool each round.
    """
    data, label = utils.load_source_data(dataset_name=dataset_name,
                                         FOIT_type=FOIT_type)
    _, number_label, _ = utils.get_number_of_label_n_trial(dataset_name)
    # data, label = utils.load_session_data_label(dataset_name, 0) # as unlabelled data

    # Number of labelled calibration trials per dataset.
    cd_count = 16 if dataset_name == 'seed4' else 9 if dataset_name == 'seed3' else print(
        'Wrong dataset_name')
    iteration_number = 3 if FOIT_type == 'cross-subject' else 15
    # accs[ite][r] / times[ite][r]: score and elapsed time after round r.
    accs = [([]) for i in range(iteration_number)]
    times = [([]) for i in range(iteration_number)]

    for ite in range(iteration_number):
        # Map the iteration index onto a (session, subject) pair depending
        # on the transfer setting.
        session_id = -1
        sub_id = -1
        if FOIT_type == 'cross-subject':
            session_id = ite
            sub_id = 14
        elif FOIT_type == 'cross-session':
            session_id = 2
            sub_id = ite
        elif FOIT_type == 'cross-all':
            session_id = 1
            sub_id = ite
        else:
            print('Wrong FOIT type!')
        # print("Ite: ", ite)
        # cd_* = labelled calibration data, ud_* = unlabelled/test data.
        cd_data, cd_label, ud_data, ud_label = utils.pick_one_data(
            dataset_name,
            session_id=session_id,
            cd_count=cd_count,
            sub_id=sub_id)
        cd_data, cd_label = shuffle(cd_data, cd_label, random_state=0)
        ud_data, ud_label = shuffle(ud_data, ud_label, random_state=0)
        cd_data_min, cd_data_max = np.min(cd_data), np.max(cd_data)
        cd_data = utils.normalization(cd_data)  # labelled data
        ud_data = utils.normalization(ud_data)  # test data
        # Build the source pool for this iteration, normalised to the
        # labelled data's value range.
        if FOIT_type == 'cross-all':
            data_ite, label_ite = data.copy(), label.copy()
            for i in range(len(data)):
                data_ite[i], label_ite[i] = shuffle(data_ite[i],
                                                    label_ite[i],
                                                    random_state=0)
            # data_ite, label_ite = shuffle(data, label, random_state=0)
            for i in range(len(data)):
                data_ite[i] = utils.norm_with_range(data_ite[i], cd_data_min,
                                                    cd_data_max)
            # data_ite = utils.normalization(data_ite)
        elif FOIT_type == 'cross-session':
            data_ite, label_ite = data[ite], label[ite]
            for i in range(len(data_ite)):
                data_ite[i], label_ite[i] = shuffle(data_ite[i],
                                                    label_ite[i],
                                                    random_state=0)
                # data_ite[i] = utils.normalization(data_ite[i])
                data_ite[i] = utils.norm_with_range(data_ite[i], cd_data_min,
                                                    cd_data_max)
            # data_ite = utils.normalization(data_ite)
        else:
            data_ite, label_ite = data[ite], label[ite]
            for i in range(len(data_ite)):
                data_ite[i], label_ite[i] = shuffle(data_ite[i],
                                                    label_ite[i],
                                                    random_state=0)
                # data_ite, label_ite = shuffle(data_ite, label_ite, random_state=0)
            for i in range(len(data_ite)):
                # data_ite[i] = utils.normalization(data_ite[i])
                data_ite[i] = utils.norm_with_range(data_ite[i], cd_data_min,
                                                    cd_data_max)
        # data_ite, label_ite = data.copy(), label.copy()
        # for i in range(len(data)):
        #     data_ite[i], label_ite[i] = shuffle(data_ite[i], label_ite[i], random_state=0)
        # for i in range(len(data)):
        #     data_ite[i] = utils.norm_with_range(data_ite[i], cd_data_min, cd_data_max)

        # baseline: calibrated LinearSVC trained on the labelled data only.
        clf = svm.LinearSVC(max_iter=30000)
        clf = CalibratedClassifierCV(clf, cv=5)
        since = time.time()
        clf.fit(cd_data, cd_label.squeeze())
        time_baseline = time.time() - since
        scoreA = utils.test(clf, ud_data, ud_label.squeeze())
        accs[ite].append(scoreA)
        times[ite].append(time_baseline)

        # select the data from the reservoir iteratively
        # NOTE(review): the inner loops below reuse `i` inside the `rounds`
        # loop; harmless for a `for` over range() in Python, but confusing.
        # Per-round times are cumulative since the baseline fit started.
        s_data_all, s_label_all = utils.stack_list(data_ite, label_ite)
        L_S_data = None
        L_S_label = None
        for i in range(rounds):
            # print("Rounds: ", i)
            # print(type(s_data_all))
            # print(s_data_all.shape)
            # Confidence = dot product of the one-hot true label with the
            # predicted probabilities; low values = least confident samples.
            s_data_all_predict_proba = clf.predict_proba(s_data_all)
            s_label_all_proba = utils.get_one_hot(s_label_all.squeeze(),
                                                  number_label)
            confidence = np.zeros((s_label_all_proba.shape[0], 1))
            for i in range(s_label_all_proba.shape[0]):
                confidence[i] = s_label_all_proba[i].dot(
                    s_data_all_predict_proba[i].T)
                # confidence[i] = log_loss(s_label_all_proba[i], s_data_all_predict_proba[i])
            indices = np.argsort(confidence,
                                 axis=0)  # take the minimum topK indices
            topK_indices = indices[:batch_size]
            # Collect the selected batch.
            S_data = None
            S_label = None
            for i in topK_indices:
                one_data = s_data_all[i]
                one_label = s_label_all[i]
                if S_data is not None:
                    S_data = np.vstack((S_data, one_data))
                    S_label = np.vstack((S_label, one_label))
                else:
                    S_data = one_data
                    S_label = one_label
            # Remove the selected samples from the pool (reverse order so
            # deletions do not shift pending indices).
            for i in range(len(s_data_all) - 1, -1, -1):
                if i in topK_indices:
                    s_data_all = np.delete(s_data_all, i, axis=0)
                    s_label_all = np.delete(s_label_all, i, axis=0)
            # Seed the growing labelled set with the calibration data once.
            if L_S_data is None:
                L_S_data = cd_data.copy()
                L_S_label = cd_label.copy()
            else:
                pass
            L_S_data = np.vstack((L_S_data, S_data))
            L_S_label = np.vstack((L_S_label, S_label))
            L_S_data, L_S_label = shuffle(L_S_data, L_S_label, random_state=0)
            # Retrain on the enlarged set and record time/accuracy.
            clf.fit(L_S_data, L_S_label.squeeze())
            time_updated_time = time.time() - since
            times[ite].append(time_updated_time)
            scoreTMP = utils.test(clf, ud_data, ud_label.squeeze())
            accs[ite].append(scoreTMP)
    # Aggregate mean time, mean accuracy and accuracy std per round across
    # all iterations.
    ResultTime = []
    ResultAcc = []
    ResultStd = []
    for i in range(rounds + 1):
        tmpTime = []
        tmpAcc = []
        for j in range(iteration_number):
            tmpTime.append(times[j][i])
            tmpAcc.append(accs[j][i])
        ResultTime.append(np.mean(tmpTime))
        ResultAcc.append(np.mean(tmpAcc))
        ResultStd.append(np.std(tmpAcc))
    print("Time: ", ResultTime)
    print("Accs: ", ResultAcc)
    print("Stds: ", ResultStd)