Example #1
def predict_groupkfold_ML(data, label, features, group_label, cv_type, clf, seed, cvfolds):

	X = data.loc[:,features]
	Y = data.loc[:,[label]].astype(bool)
	G = data.loc[:, group_label]

	if (cv_type == 'stratifiedgroupkfold'):
		gkf = StratifiedGroupKFold(cvfolds, random_state=seed, shuffle=True)
	elif (cv_type == 'groupkfold'):
		X, Y, G = sk_u.shuffle(X,Y,G, random_state=seed)
		gkf = GroupKFold(cvfolds)
	else:
		raise ValueError('incompatible cross-validation type')

	predicted_probability = []
	true_label = []

	for train_index, test_index in gkf.split(X,Y,G):
		X_train, X_test = X.iloc[train_index], X.iloc[test_index]
		Y_train, Y_test = Y.iloc[train_index], Y.iloc[test_index]
		G_train, G_test = G.iloc[train_index], G.iloc[test_index]

		try:
			# pass groups through for group-aware meta-estimators (e.g. GridSearchCV with a group splitter)
			clf.fit(X_train, Y_train.values.ravel().astype(int), groups=G_train)
		except TypeError:
			# plain estimators do not accept a groups keyword
			clf.fit(X_train, Y_train.values.ravel().astype(int))

		if hasattr(clf, 'best_estimator_'):
			calibrated_clf = sk_cal.CalibratedClassifierCV(clf.best_estimator_, method='isotonic', cv=10)
		else:
			calibrated_clf = sk_cal.CalibratedClassifierCV(clf, method='isotonic', cv=10)

		try:
			calibrated_clf.fit(X_train, Y_train.values.ravel().astype(int), groups=G_train)
		except TypeError:
			calibrated_clf.fit(X_train, Y_train.values.ravel().astype(int))

		try:
			Y_prob = calibrated_clf.predict_proba(X_test)
			predicted_probability.append(Y_prob[:,1])
		except AttributeError:
			# fall back to decision scores when the model exposes no predict_proba
			Y_prob = calibrated_clf.decision_function(X_test)
			predicted_probability.append(Y_prob)
		true_label.append(list(Y_test.values.flat))

	tl_pp_dict={"true_label":true_label, "pred_prob":predicted_probability}

	return tl_pp_dict
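A minimal sketch of invoking predict_groupkfold_ML, assuming the function above and its module imports (sk_cal, sk_u, GroupKFold, StratifiedGroupKFold) are in scope; the toy DataFrame and column names are hypothetical, not from the original project:

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier

rng = np.random.default_rng(0)
df = pd.DataFrame({
    "f0": rng.normal(size=200),
    "f1": rng.normal(size=200),
    "outcome": rng.integers(0, 2, size=200),
    "subject": rng.integers(0, 20, size=200),  # grouping key
})

result = predict_groupkfold_ML(
    data=df, label="outcome", features=["f0", "f1"], group_label="subject",
    cv_type="groupkfold", clf=RandomForestClassifier(n_estimators=50),
    seed=0, cvfolds=5)
print(len(result["pred_prob"]))  # one probability array per fold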
Example #2
 def __init__(self,
              name='default',
              pos_weight=2.0,
              c=1e-5,
              threshold=1.1,
              limit_retrain=100):
     self.svm_object = sklearn_svm.LinearSVC(C=c,
                                             class_weight={
                                                 1: pos_weight,
                                                 -1: 1.0
                                             },
                                             verbose=0,
                                             penalty='l2',
                                             loss='hinge',
                                             dual=True)
     self.pos_feats = None
     self.neg_feats = None
     self.thr = threshold
     self.limit_retrain = limit_retrain
     self.neg_cache_feats = []
     self.name = name
     self.initialized = False
     self.clb_object = sklearn_clb.CalibratedClassifierCV(self.svm_object,
                                                          method='sigmoid',
                                                          cv=3)
Example #3
    def fit(self, X, y):

        if not MulticlassClassifierOptimizer.fitted_model(self.model):

            print('      -> Fitting base model (wasn\'t fitted).')

            self.model.fit(X=X, y=y)

        print('      -> Model calibration.')

        self.model = skc.CalibratedClassifierCV(base_estimator=self.model,
                                                method='sigmoid',
                                                cv='prefit')

        self.model.fit(X=X,
                       y=y,
                       sample_weight=skcw.compute_sample_weight(
                           class_weight='balanced', y=y))

        print('      -> Optimizing multiclass thresholds.')

        self.thresholds = MulticlassClassifierOptimizer.get_optimized_thresholds(
            scoring_function=self.scoring_function,
            y_true=MulticlassClassifierOptimizer.one_hot_encode(y=y),
            y_score=self.model.predict_proba(X=X))

        self.optimized = True

        return self
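The fit above relies on the cv='prefit' pattern. Below is a standalone sketch of that pattern with synthetic data (names are illustrative); note that calibrating on the very data the base model was trained on, as the method above does, can bias the calibration, and that recent scikit-learn releases deprecate cv='prefit' in favor of wrapping the fitted model in FrozenEstimator:

import sklearn.calibration as skc
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=500, random_state=0)
X_fit, X_cal, y_fit, y_cal = train_test_split(X, y, test_size=0.3, random_state=0)

base = LogisticRegression(max_iter=1000).fit(X_fit, y_fit)  # fit the base model first
calibrated = skc.CalibratedClassifierCV(base, method='sigmoid', cv='prefit')
calibrated.fit(X_cal, y_cal)  # fits only the sigmoid, on held-out data
print(calibrated.predict_proba(X_cal[:3]))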
Example #4
 def get_default_classifier(self):
     self.main_classifier = sklinear.SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, n_iter=5, random_state=42) 
     # classifier = nb.MultinomialNB()
     self.clsf_name = self.main_classifier.__class__.__name__
     classifier = skcalibrated.CalibratedClassifierCV(self.main_classifier, 
                                                      cv=CLSF_CONSTANTS._calibration_nfolds, 
                                                      method=CLSF_CONSTANTS._calibration_method)
     
     return classifier
Example #5
 def fit(self, X, y):
     self.num_classes = len(np.unique(y)) 
     self.model_fit = self.model.fit(X, y)
     if self.calibrate:
         self.calibrated = calibration.CalibratedClassifierCV(self.model_fit,
                                                              method='sigmoid',
                                                              cv=10)
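         # note: the calibrator is only constructed here; it still needs fit(X, y) before use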
     else:
         self.calibrated = None
     return copy.deepcopy(self)
Example #6
def calibrate_model(
        model: "Estimator",
        X: pd.DataFrame,
        y: pd.DataFrame,
        path: str = None,
        X_test: Union[np.ndarray, None] = None) -> Optional[np.ndarray]:
    """
    Calibrates an estimator and generates the calibration plots
    before and after calibration process.
    """
    # Evaluate baseline model.
    print("Evaluating the uncalibrated model...")
    X_train, X_dev, y_train, y_dev = model_selection.train_test_split(
        X, y, test_size=0.3, random_state=42)
    model.fit(X_train, y_train)
    pred_dev = model.predict_proba(X_dev)
    prob_true, prob_pred = calibration.calibration_curve(y_dev,
                                                         pred_dev[:, 1],
                                                         n_bins=50)
    plt.close()
    plt.plot([0, 1], [0, 1], linestyle="--")
    plt.plot(prob_pred, prob_true, marker=".")
    plt.xlabel("Predicted Probability")
    plt.ylabel("True Probability")
    if path is None:
        plt.show()
    else:
        plt.savefig(path + "calibration_curve_raw_model.png", dpi=300)
    # Evaluate the calibrated model.
    print("Evaluating the calibrated model...")
    calibrator = calibration.CalibratedClassifierCV(model,
                                                    method="isotonic",
                                                    cv=10)
    calibrator.fit(X_train, y_train)
    pred_dev = calibrator.predict_proba(X_dev)

    prob_true, prob_pred = calibration.calibration_curve(y_dev,
                                                         pred_dev[:, 1],
                                                         n_bins=50)
    plt.close()
    plt.plot([0, 1], [0, 1], linestyle="--")
    plt.plot(prob_pred, prob_true, marker=".")
    plt.xlabel("Predicted Probability")
    plt.ylabel("True Probability")
    if path is None:
        plt.show()
    else:
        plt.savefig(path + "calibration_curve_calibrated_model.png", dpi=300)
    # Predicting on the test set
    if X_test is not None:
        print("Predicting test set results...")
        # Train using the whole training set.
        calibrator.fit(X, y)
        pred_test = calibrator.predict_proba(X_test)
        return pred_test
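A hypothetical end-to-end call of calibrate_model, with synthetic data standing in for the real features (the estimator choice is arbitrary, and y is passed one-dimensional to keep the shapes simple):

from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
import pandas as pd

X_arr, y_arr = make_classification(n_samples=2000, random_state=42)
X_df = pd.DataFrame(X_arr)
y_sr = pd.Series(y_arr)

test_probs = calibrate_model(RandomForestClassifier(random_state=42),
                             X_df, y_sr, path=None, X_test=X_arr[:10])
print(test_probs[:3])  # calibrated class probabilities for the first rows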
Example #7
def digits_recognition():

    FILE_NAME = "digits_recognition.pickle"

    # load the MNIST dataset offline
    (x_train, y_train), (x_test, y_test) = mnist.load_data()

    # show 4 samples from the dataset
    _, axes = plt.subplots(2, 4)
    images_and_labels = list(zip(x_train, y_train))
    plot_number(axes, images_and_labels, line=0, prelabel='Training')

    # flatten 28*28 images to a 784 vector for each image
    num_pixels = x_train.shape[1] * x_train.shape[2]
    image_size = x_train.shape[1]
    x_train = x_train.reshape((x_train.shape[0], num_pixels))
    x_test = x_test.reshape((x_test.shape[0], num_pixels))
    x_train = x_train / 255.0
    x_test = x_test / 255.0

    model_file = Path(FILE_NAME)
    if not model_file.is_file():

        # Create a classifier: LinearSVC is used because plain SVC struggles on very large datasets
        svmcl = svm.LinearSVC()
        classifier = calibration.CalibratedClassifierCV(svmcl)
        print("Training model...\n")
        # We learn the digits

        classifier.fit(x_train, y_train)

        # save the model to disk
        filename = FILE_NAME
        pickle.dump(classifier, open(filename, 'wb'))

    else:
        print("Loading saved model...\n")
        classifier = pickle.load(open(FILE_NAME, 'rb'))

    # Now predict the value of the digit on the second half:
    predicted = classifier.predict(x_test)

    accuracy = 100 * accuracy_score(y_test, predicted)
    print('SVC accuracy: [%.2f]' % (accuracy))

    #results = classifier.predict_proba(X_test)[0]

    # restore the 28x28 shape of the images used for testing
    images_restored = x_test.reshape((-1, image_size, image_size))
    images_and_predictions = list(zip(images_restored, predicted))

    plot_number(axes, images_and_predictions, line=1, prelabel='Prediction')

    return classifier
Example #8
def predict_filter_kfold_ML(data, label, features, filter_function, clf, seed, cvfolds):

	kf = sk_ms.KFold(cvfolds, random_state=seed, shuffle=True)

	predicted_probability = []
	true_label = []

	for train_index, test_index in kf.split(data):
		data_train, data_test = data.iloc[train_index], data.iloc[test_index]

		X_train = filter_function(data_train).loc[:,features]
		Y_train = filter_function(data_train).loc[:,[label]]

		X_train = X_train.loc[~Y_train[label].isnull()]
		Y_train = Y_train.loc[~Y_train[label].isnull()]

		X_test = data_test.loc[:,features]
		Y_test = data_test.loc[:,[label]]

		clf.fit(X_train, Y_train.values.ravel().astype(int))

		if hasattr(clf, 'best_estimator_'):
			calibrated_clf = sk_cal.CalibratedClassifierCV(clf.best_estimator_, method='isotonic', cv=10)
		else:
			calibrated_clf = sk_cal.CalibratedClassifierCV(clf, method='isotonic', cv=10)

		calibrated_clf.fit(X_train, Y_train.values.ravel().astype(int))

		try:
			Y_prob = calibrated_clf.predict_proba(X_test)
			predicted_probability.append(Y_prob[:,1])
		except AttributeError:
			Y_prob = calibrated_clf.decision_function(X_test)
			predicted_probability.append(Y_prob)
		true_label.append(list(Y_test.values.flat))

	tl_pp_dict={"true_label":true_label, "pred_prob":predicted_probability}

	return tl_pp_dict
Example #9
    def svm_classify(self, train_set, train_tag, test_set, test_tag):
        svc = svm.LinearSVC()
        clf = calibration.CalibratedClassifierCV(svc)
        clf_res = clf.fit(train_set, train_tag)
        train_pred = clf_res.predict(train_set)
        test_pred = clf_res.predict(test_set)

        train_err_num, train_err_ratio = self.checkPred(train_tag, train_pred)
        test_err_num, test_err_ratio = self.checkPred(test_tag, test_pred)

        print('=== Classification training finished; results below ===')
        print('Training set error: {e}'.format(e=train_err_ratio))
        print('Validation set error: {e}'.format(e=test_err_ratio))

        return clf_res
Example #10
    def __init__(self, feature_pipelines=None, classifier=None):

        if feature_pipelines:
            self.feature_pipelines = feature_pipelines
        else:
            self.feature_pipelines = self.get_default_features()

        if classifier:
            #self.classifier = classifier
            self.clsf_name = classifier.__class__.__name__
            self.classifier = skcalibrated.CalibratedClassifierCV(
                classifier,
                cv=clsf_constants._calibration_nfolds,
                method=clsf_constants._calibration_method)
        else:
            self.classifier = self.get_default_classifier()
Example #11
 def build(self, input_model, model_calibrator_id, model_calibrator_params):
     """Build a model calibrator using the specified id"""
     if model_calibrator_id == 'sklearn_CalibratedClassifierCV':
         params = model_calibrator_params
         params['base_estimator'] = input_model
         return calibration.CalibratedClassifierCV(**params)
     elif model_calibrator_id == 'sklearn_GridSearchCV':
         params = model_calibrator_params
         params['estimator'] = input_model
         return model_selection.GridSearchCV(**params)
     elif model_calibrator_id == 'sklearn_OneVsRestClassifier':
         params = model_calibrator_params
         params['estimator'] = input_model
         return multiclass.OneVsRestClassifier(**params)
     elif model_calibrator_id == 'sklearn_OneVsOneClassifier':
         params = model_calibrator_params
         params['estimator'] = input_model
         return multiclass.OneVsOneClassifier(**params)
     return None
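For the 'sklearn_CalibratedClassifierCV' branch above, a params dict might look like the sketch below; note that the base_estimator keyword was deprecated in scikit-learn 1.2 and removed in 1.4 (renamed to estimator), so the dispatch as written assumes an older release:

from sklearn import calibration
from sklearn.linear_model import LogisticRegression

params = {'method': 'sigmoid', 'cv': 3}
params['base_estimator'] = LogisticRegression(max_iter=1000)  # 'estimator' on sklearn >= 1.2
calibrator = calibration.CalibratedClassifierCV(**params)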
Example #12
 def __init__(self, task_name="", feature_config=None, classifier=None):
     # classifier can be inside feature_config
     
     self.task_name = task_name
     
     if feature_config:
         self.feature_config = feature_config
     else:
         self.feature_config = self.get_default_feature_config()
     
     
     # not very safe!! this should make sure feature_config is assigned
     self.feature_union = self._generate_feature_extraction_pipeline()
     
     if classifier:
         self.clsf_name = classifier.__class__.__name__
         self.classifier = skcalibrated.CalibratedClassifierCV(classifier, 
                                                               cv=CLSF_CONSTANTS._calibration_nfolds, 
                                                               method=CLSF_CONSTANTS._calibration_method)
     else:
         self.classifier = self.get_default_classifier()
Example #13
def svm_classify(x, y):
    '''
    FUNC: train an SVM classifier on input data x and labels y
    ARG:
        - x: input data, HOG features
        - y: label of x, face or non-face
    RET:
        - clf: an SVM classifier built with sklearn.svm (you can use your
               favorite SVM library, but the later prediction code will then
               need some modification)
    '''
    #########################################
    ##          your code here             ##
    #########################################
    clf = svm.LinearSVC(C=0.05)
    clf = calibration.CalibratedClassifierCV(clf, method='sigmoid', cv=5)
    clf.fit(x, y)
    #########################################
    ##          your code here             ##
    #########################################

    return clf
Example #14
def train_classifier(
    words: List[TreeNode],
    bool_result: bool,
    c: float = 100.0,
    prob: bool = False,
):

    # Extract feature data
    X_data = csr_matrix([w.getFeatures() for w in words])
    y_data = create_result_data(words, bool_result)
    print(f"{y_data[:20]=}")
    # assert(len(X_data) == len(y_data))
    assert (X_data.shape[0] == len(y_data))

    # Check the distribution of y_data
    occurrence_class, max_class, avg = dist_max_avg(y_data)
    ratio = occurrence_class[max_class] / avg
    ratio_string = f"Ratio between the most frequent class and the average: {ratio}"
    if bool_result:
        report = f"Report of the y_data distribution\nThe class occurrences: {occurrence_class}\n{ratio_string}"
    else:
        report = f"Report of the y_data distribution\nThe most frequent class: {max_class} {occurrence_class[max_class]}\n{ratio_string}"
    print(report)
    # Create a classifier: a support vector classifier
    clf = svm.LinearSVC(
        C=c,
        verbose=False,
        random_state=1,
        max_iter=100000,
    )
    if prob:
        clf = calibration.CalibratedClassifierCV(clf)

    # Learn the data on the train subset
    with parallel_backend("threading", n_jobs=-1):
        clf.fit(X_data, y_data)

    return clf, report
Example #15
def train_svm(
    x_data: List,
    y_data: List,
    gamma: float = 0.001,
    c: float = 100.0,
    kernel: str = "rbf",
    cache_size: int = 1000,
    prob: bool = False,
):
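    # note: gamma, kernel and cache_size are accepted but unused, since LinearSVC takes none of them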

    # Create a classifier: a support vector classifier
    clf = svm.LinearSVC(
        C=c,
        random_state=1,
        max_iter=100000,
    )
    if prob:
        clf = calibration.CalibratedClassifierCV(clf)

    # Learn the data on the train subset
    with parallel_backend("threading", n_jobs=-1):
        clf.fit(x_data, y_data)

    return clf
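The prob flag matters because LinearSVC exposes no predict_proba; only the CalibratedClassifierCV wrapper adds one. A quick check, assuming train_svm and its imports are in scope and using synthetic data:

from sklearn.datasets import make_classification

X_demo, y_demo = make_classification(n_samples=300, random_state=1)
clf = train_svm(list(X_demo), list(y_demo), prob=True)
print(clf.predict_proba(X_demo[:2]))  # available only because prob=True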
Example #16
def svm_analysis(X_train, y_train, X_test, y_test, grid=False):
    # Perform analysis using Support Vector Machine
    print("Performing Support Vector Machine analysis...")

    # SVM makes predictions!
    if not grid:
        clf = svm.SVC(C=0.1, kernel="rbf", degree=2)
        clf_c = calibration.CalibratedClassifierCV(clf)
        clf_c.fit(X_train, y_train)
        score = clf_c.score(X_test.astype("float64"), y_test.astype("float64"))

        proba = clf_c.predict_proba(X_test)
        pred = clf_c.predict(X_test)
        for x in range(len(X_test)):
            print("Predicted: {}\tProbabilities: {}\tActual: {}".\
             format(pred[x], proba[x], y_test[x]))

        print(score)
        print(sum(y_test) / len(y_test))

        return score

    else:
        tuned_params = [{
            "C": [5, 10, 100],
            "kernel": ["rbf"],
            "gamma": [0.0001]
        }]
        clf = model_selection.GridSearchCV(svm.SVC(),
                                           tuned_params,
                                           scoring="accuracy")
        clf.fit(X_train, y_train)

        print(clf.best_params_)
        y_pred = clf.predict(X_test)
        print(metrics.classification_report(y_test, y_pred))
Example #17
def main():
    X_train = train_df.drop(columns=[
        'home_team', 'away_team', 'year', 'home_team_won', 'date',
        'starting_home', 'starting_away'
    ])
    Y_train = train_df['home_team_won']

    X_playoff_train = playoff_train_df.drop(columns=[
        'home_team', 'away_team', 'year', 'home_team_won', 'date'
    ])
    Y_playoff_train = playoff_train_df['home_team_won']

    X_series_train = series_train_df.drop(columns=[
        'series_id', 'winning_team', 'losing_team', 'year', 'home_team_won'
    ])
    Y_series_train = series_train_df['home_team_won']

    X_test = test_df.drop(columns=[
        'home_team', 'away_team', 'year', 'home_team_won', 'date',
        'starting_home', 'starting_away'
    ])
    X_series_test = series_test_df.drop(columns=[
        'series_id', 'winning_team', 'losing_team', 'year', 'home_team_won'
    ])

    scaler = preprocessing.StandardScaler().fit(X_train)
    columns = X_train.columns
    X_train = pd.DataFrame(scaler.transform(X_train), columns=columns)
    X_test = pd.DataFrame(scaler.transform(X_test), columns=columns)

    playoff_scaler = preprocessing.StandardScaler().fit(X_playoff_train)
    playoff_columns = X_playoff_train.columns
    X_playoff_train = pd.DataFrame(playoff_scaler.transform(X_playoff_train),
                                   columns=playoff_columns)

    series_scaler = preprocessing.StandardScaler().fit(X_series_train)
    series_columns = X_series_train.columns
    X_series_train = pd.DataFrame(series_scaler.transform(X_series_train),
                                  columns=series_columns)
    X_series_test = pd.DataFrame(series_scaler.transform(X_series_test),
                                 columns=series_columns)

    Y_test = test_df['home_team_won']
    Y_series_test = series_test_df['home_team_won']

    sgd = linear_model.SGDClassifier(max_iter=1000, tol=None)
    clf = sgd.fit(X_train, Y_train)
    calibrator = calibration.CalibratedClassifierCV(clf, cv='prefit')
    calibrator = build_model(calibrator, X_train, Y_train, X_test, Y_test)

    playoff_sgd = linear_model.SGDClassifier(max_iter=1000, tol=None)
    playoff_clf = playoff_sgd.fit(X_train, Y_train)
    playoff_calibrator = calibration.CalibratedClassifierCV(playoff_clf,
                                                            cv='prefit')
    playoff_calibrator = build_model(playoff_calibrator, X_playoff_train,
                                     Y_playoff_train, X_test, Y_test)

    # playoff_sgd = build_model(linear_model.SGDClassifier(max_iter=1000, tol=None, penalty='l2', loss='squared_hinge', learning_rate='adaptive', eta0 = 10, class_weight = {1:0.6, 0:0.4}, alpha=0.01),
    #                             X_playoff_train, Y_playoff_train, X_test, Y_test, True)
    series_sgd = linear_model.SGDClassifier(max_iter=1000,
                                            tol=None,
                                            penalty='l2')
    series_clf = series_sgd.fit(X_series_train, Y_series_train)
    series_calibrator = calibration.CalibratedClassifierCV(series_clf,
                                                           cv='prefit')
    series_calibrator = build_model(series_calibrator, X_series_train,
                                    Y_series_train, X_series_test,
                                    Y_series_test)

    #series_sgd = build_model(linear_model.SGDClassifier(max_iter=1000, tol=None, penalty = 'l2', loss='perceptron', learning_rate='constant', eta0=1, class_weight={1: 0.5, 0:0.5}, alpha = 10), X_series_train, Y_series_train, X_series_test, Y_series_test, True)
    # Random Forest
    random_forest = build_model(
        RandomForestClassifier(n_estimators=50,
                               min_samples_split=4,
                               max_features='log2',
                               criterion='entropy',
                               class_weight='balanced',
                               ccp_alpha=.001), X_train, Y_train, X_test,
        Y_test)

    Y_prob_pred = random_forest.predict_proba(X_test)

    probability_evaluation(Y_test, Y_prob_pred, test_df)

    playoff_random_forest = build_model(
        RandomForestClassifier(n_estimators=50,
                               min_samples_split=4,
                               max_features='log2',
                               criterion='entropy',
                               class_weight='balanced',
                               ccp_alpha=.001), X_playoff_train,
        Y_playoff_train, X_test, Y_test, True)

    series_random_forest = build_model(
        RandomForestClassifier(n_estimators=50,
                               min_samples_split=4,
                               max_features='log2',
                               criterion='entropy',
                               class_weight='balanced',
                               ccp_alpha=.001), X_series_train, Y_series_train,
        X_series_test, Y_series_test, True)

    # Logistic Regression
    log = build_model(
        LogisticRegression(max_iter=10000,
                           tol=0.001,
                           solver='liblinear',
                           penalty='l1',
                           multi_class='ovr',
                           class_weight={
                               1: 0.5,
                               0: 0.5
                           },
                           C=0.1), X_train, Y_train, X_test, Y_test)

    playoff_log = build_model(
        LogisticRegression(max_iter=10000,
                           tol=0.001,
                           solver='liblinear',
                           penalty='l1',
                           multi_class='ovr',
                           class_weight={
                               1: 0.5,
                               0: 0.5
                           },
                           C=0.1), X_playoff_train, Y_playoff_train, X_test,
        Y_test)

    series_log = build_model(
        LogisticRegression(max_iter=10000,
                           tol=0.001,
                           solver='liblinear',
                           penalty='l1',
                           multi_class='ovr',
                           class_weight={
                               1: 0.5,
                               0: 0.5
                           },
                           C=0.1), X_series_train, Y_series_train,
        X_series_test, Y_series_test)

    # KNN
    # build_model(KNeighborsClassifier(n_neighbors = 3), X_train, Y_train, X_test)

    # Gaussian
    gaussian = GaussianNB()
    build_model(gaussian, X_train, Y_train, X_test, Y_test)
    cross_val_score(gaussian, X_train, Y_train, cv=5, scoring='accuracy')

    playoff_gaussian = GaussianNB()
    build_model(playoff_gaussian, X_playoff_train, Y_playoff_train, X_test,
                Y_test)
    cross_val_score(playoff_gaussian,
                    X_train,
                    Y_train,
                    cv=5,
                    scoring='accuracy')

    series_gaussian = GaussianNB()
    build_model(series_gaussian, X_series_train, Y_series_train, X_series_test,
                Y_series_test)
    cross_val_score(series_gaussian,
                    X_series_train,
                    Y_series_train,
                    cv=5,
                    scoring='accuracy')

    # Perceptron
    perceptron = build_model(
        Perceptron(max_iter=10000,
                   penalty='l2',
                   eta0=10,
                   class_weight={
                       1: 0.6,
                       0: 0.4
                   },
                   alpha=0.0001), X_train, Y_train, X_test, Y_test)

    playoff_perceptron = build_model(
        Perceptron(max_iter=10000,
                   penalty='l2',
                   eta0=10,
                   class_weight={
                       1: 0.6,
                       0: 0.4
                   },
                   alpha=0.0001), X_playoff_train, Y_playoff_train, X_test,
        Y_test, True)

    series_perceptron = build_model(
        Perceptron(max_iter=10000,
                   penalty='l2',
                   eta0=1,
                   class_weight={
                       1: 0.4,
                       0: 0.6
                   },
                   alpha=10), X_series_train, Y_series_train, X_series_test,
        Y_series_test, True)

    # Decision Tree
    d_tree = build_model(
        DecisionTreeClassifier(splitter='best',
                               min_samples_split=4,
                               max_features='log2',
                               criterion='entropy',
                               class_weight={
                                   1: 0.5,
                                   0: 0.5
                               },
                               ccp_alpha=0.0001), X_train, Y_train, X_test,
        Y_test)
    playoff_d_tree = build_model(
        DecisionTreeClassifier(splitter='best',
                               min_samples_split=4,
                               max_features='log2',
                               criterion='entropy',
                               class_weight={
                                   1: 0.5,
                                   0: 0.5
                               },
                               ccp_alpha=0.0001), X_playoff_train,
        Y_playoff_train, X_test, Y_test, True)
    series_d_tree = build_model(
        DecisionTreeClassifier(splitter='best',
                               min_samples_split=3,
                               max_features='log2',
                               criterion='entropy',
                               class_weight={
                                   1: 0.5,
                                   0: 0.5
                               },
                               ccp_alpha=0.0001), X_series_train,
        Y_series_train, X_series_test, Y_series_test, True)

    eclf1 = VotingClassifier(estimators=[('sgd', sgd), ('rf', random_forest),
                                         ('gnb', gaussian), ('dtree', d_tree)],
                             voting='hard')

    eclf2 = VotingClassifier(estimators=[('rf', random_forest),
                                         ('gnb', gaussian), ('dtree', d_tree)],
                             voting='soft')

    eclf = EnsembleClassifier(clfs=[random_forest, gaussian, d_tree])
    build_model(eclf1, X_train, Y_train, X_test, Y_test)
    build_model(eclf2, X_train, Y_train, X_test, Y_test)
    eclf = build_model(eclf, X_train, Y_train, X_test, Y_test)

    Y_prob_pred = eclf.predict_proba(X_train)

    probability_evaluation(Y_train, Y_prob_pred, train_df)
Example #18
    def __init__(self, models=None, params=None, calibrator=None, run_calibration=None,
                 average_proba=True, labels=None, good_bands=None, reducer=None):
        """Creates an object to build the CCB-ID models. Should approximate the functionality
        of the sklearn classifier modules, though not perfectly.
        
        Args:
            models          - a list containing the sklearn models for classification
                              (defaults to using gradient boosting and random forest classifiers)
            params          - a list of parameter values used for each model. This should be a list of length 
                              n_models, with each item containing a dictionary with model-specific parameters
            calibrator      - an sklearn CalibratedClassifier object (or other calibration object)
            run_calibration - a boolean array with True values for models you want to calibrate, 
                              and False values for models that do not require calibration
            average_proba   - flag to report the output probabilities as the average across models
            labels          - the species labels for each class
            good_bands      - a boolean array of good band values to store (but not used by this object)
            reducer         - the data reducer/transformer to apply to input data
            
        Returns:
            a CCB-ID model object with totally cool functions and attributes.
        """
        # set the base attributes for the model object
        if models is None:
            gbc = _ensemble.GradientBoostingClassifier()
            rfc = _ensemble.RandomForestClassifier()
            self.models_ = [gbc, rfc]
        else:
            # if a single model is passed, wrap it in a list so it is iterable
            if type(models) is not list:
                models = [models]
            self.models_ = models

        # set an attribute with the number of models
        self.n_models_ = len(self.models_)

        # set the model parameters if specified
        if params is not None:
            for i in range(self.n_models_):
                self.models_[i].set_params(**params[i])

        # set the model calibration function
        if calibrator is None:
            self.calibrator = _calibration.CalibratedClassifierCV(method='sigmoid', cv=3)
        else:
            self.calibrator = calibrator

        # set the attribute determining whether to perform calibration on a per-model basis
        if run_calibration is None:
            self.run_calibration_ = _np.repeat(True, self.n_models_)
        else:
            self.run_calibration_ = run_calibration

        # set an attribute to hold the final calibrated models
        self.calibrated_models_ = _np.repeat(None, self.n_models_)

        # set the flag to average the probability outputs    
        self.average_proba_ = average_proba

        # and set some properties that will be referenced later
        #  like species labels and a list of good bands
        if labels is None:
            self.labels_ = None
        else:
            self.labels_ = labels

        if good_bands is None:
            self.good_bands_ = None
        else:
            self.good_bands_ = good_bands

        if reducer is None:
            self.reducer = None
        else:
            self.reducer = reducer

        self.n_features_ = None
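The single calibrator stored above presumably serves as a template that is cloned per model at fit time; here is a sketch of that pattern, assuming sklearn.base.clone semantics (the real CCB-ID fit method may differ, and set_params takes 'base_estimator' instead of 'estimator' on scikit-learn < 1.2):

from sklearn import base as _base

def _calibrate_models(models, calibrator_template, X, y):
    calibrated = []
    for model in models:
        cal = _base.clone(calibrator_template)  # fresh, unfitted copy of the template
        cal.set_params(estimator=model)
        calibrated.append(cal.fit(X, y))
    return calibrated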
Example #19
    def fit(self, X, y):
        # keep 5% for calibration later (legacy sklearn.cross_validation API;
        # newer releases use model_selection.StratifiedShuffleSplit(...).split(X, y))
        sss = cross_validation.StratifiedShuffleSplit(y, test_size=0.05)
        tr, cal = next(iter(sss))

        # define the two classifiers
        self.clf1 = xgb.XGBClassifier(objective="multi:softprob",
                                      n_estimators=400,
                                      max_depth=8)
        self.clf2 = calibration.CalibratedClassifierCV(
            ensemble.RandomForestClassifier(n_estimators=1000,
                                            n_jobs=8,
                                            class_weight='auto'),
            method='isotonic')
        self.clf3 = NNEnsemble()

        # fit the classifiers
        self.clf1.fit(X.iloc[tr], y[tr])
        self.clf2.fit(X.iloc[tr], y[tr])
        self.clf3.fit(X.iloc[tr], y[tr])

        # predict everything before ensembling
        self.pr1 = self.clf1.predict_proba(X.iloc[cal])
        self.pr2 = self.clf2.predict_proba(X.iloc[cal])
        self.pr3 = self.clf3.predict_proba(X.iloc[cal])

        self.pr1 = preprocessing.normalize(self.pr1, axis=1, norm='l1')
        self.pr2 = preprocessing.normalize(self.pr2, axis=1, norm='l1')
        self.pr3 = preprocessing.normalize(self.pr3, axis=1, norm='l1')

        print(("XGB log loss:", metrics.log_loss(y[cal], self.pr1)))
        print(("RF log loss:", metrics.log_loss(y[cal], self.pr2)))
        print(("NN log loss:", metrics.log_loss(y[cal], self.pr3)))
        print(("XGB+RF+NN log loss:",
               metrics.log_loss(y[cal], (self.pr1 + self.pr2 + self.pr3) / 3)))

        self.clfs = [self.clf1, self.clf2, self.clf3]

        predictions = []
        for clf in self.clfs:
            predictions.append(clf.predict_proba(X.iloc[cal]))

        self.cal_y = y[cal]

        def log_loss_func(weights):
            ''' scipy minimize will pass the weights as a numpy array '''
            final_prediction = 0
            for weight, prediction in zip(weights, predictions):
                final_prediction += weight * prediction

            return metrics.log_loss(self.cal_y, final_prediction)

        scores = []
        wghts = []
        for i in range(20):
            if not i:
                starting_values = [1 / 3] * len(self.clfs)
            else:
                starting_values = np.random.uniform(size=len(self.clfs))

            cons = ({'type': 'eq', 'fun': lambda w: 1 - sum(w)})
            bounds = [(0, 1)] * len(predictions)

            res = scopt.minimize(log_loss_func,
                                 starting_values,
                                 method='SLSQP',
                                 bounds=bounds,
                                 constraints=cons)

            scores.append(res['fun'])
            wghts.append(res['x'])

        bestSC = np.min(scores)
        bestWght = wghts[np.argmin(scores)]
        self.weights = bestWght

        print(('Ensemble Score: {best_score}'.format(best_score=bestSC)))
        print(('Best Weights: {weights}'.format(weights=bestWght)))
Example #20
    def __init__(
        self, n_estimators=300, min_iterations=10, gll_early_stop_threshold=None, max_iterations=20, rf_params=None, 
        calibrator=None, run_calibration=None,  average_proba=True, labels=None, good_bands=None, reducer=None
    ):
        self.min_iterations = min_iterations
        self.gll_early_stop_threshold = gll_early_stop_threshold
        self.max_iterations = max_iterations

        self.rf_params = {"n_estimators": n_estimators} if rf_params is None else rf_params
        self.rf_params.update({"oob_score": True, "n_jobs": -1})
        if "n_estimators" not in self.rf_params:
            self.rf_params.update({"n_estimators": n_estimators})
            
        # initialize parameters from CCB-ID
        # set an attribute with the number of models
        self.n_models_ = 1
        
        # set the model calibration function
        if calibrator is None:
            self.calibrator = _calibration.CalibratedClassifierCV(method='sigmoid', cv=3)
        else:
            self.calibrator = calibrator
            
        # set the attribute determining whether to perform calibration on a per-model basis
        if run_calibration is None:
            self.run_calibration_ = np.repeat(True, self.n_models_)
        else:
            self.run_calibration_ = run_calibration

        # set an attribute to hold the final calibrated models
        self.calibrated_models_ = np.repeat(None, self.n_models_)

        # set the flag to average the probability outputs    
        self.average_proba_ = average_proba
        
        # and set some properties that will be referenced later
        #  like species labels and a list of good bands
        if labels is None:
            self.labels_ = None
        else:
            self.labels_ = labels

        if good_bands is None:
            self.good_bands_ = None
        else:
            self.good_bands_ = good_bands

        if reducer is None:
            self.reducer = None
        else:
            self.reducer = reducer

        self.n_features_ = None
        
        #MERF specific arguments
        
        self.cluster_counts = None
        self.trained_rf = None
        self.trained_b = None

        self.b_hat_history = []
        self.sigma2_hat_history = []
        self.D_hat_history = []
        self.gll_history = []
Example #21
    # Test baseline models
    for (model, name, params, pred_fn) in best_models:
        print("Running CV for: {}".format(name))
        start_time = datetime.now()
        constructed_model = model(**params)
        cv_loss = model_cv_test(constructed_model,
                                X_train.values,
                                y_train.values,
                                pred_fn=pred_fn,
                                n_fold=5)
        record.append(record_cv_loss(name, cv_loss))
        end_time = datetime.now()
        print("Time taken: {}".format(str(end_time - start_time)))
    # Test Calibrated models
    for (base_model, name, params, pred_fn) in best_models:
        name = name + "_cali"
        print("Running CV for: {}".format(name))
        start_time = datetime.now()
        constructed_model = calibration.CalibratedClassifierCV(
            base_model(**params))
        cv_loss = model_cv_test(constructed_model,
                                X_train.values,
                                y_train.values,
                                pred_fn=pred_fn,
                                n_fold=5)
        record.append(record_cv_loss(name, cv_loss))
        end_time = datetime.now()
        print("Time taken: {}".format(str(end_time - start_time)))
    record = pd.concat(record)
    record.to_csv(args.logdir, index=False)
Example #22
#!/usr/bin/python
#
# Runs various statistics against a data bundle
#
#######################################################

import numpy as np
from sklearn import calibration

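# with no estimator given, CalibratedClassifierCV falls back to a LinearSVC base estimator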
ccv_bad = calibration.CalibratedClassifierCV()
ccv_good = calibration.CalibratedClassifierCV()
Example #23
def svmTrain(i):
    #   seg_ratio is the ratio between labeled samples and unlabeled samples
    #   number_iteration
    seg_ratio = 1
    MaxNumPerClassPerIteration = 10
    correct = 0
    total = 0
    wrong = 0
    correctPerClass = [0 for k in range(21)]
    wrongPerClass = [0 for k in range(21)]
    numPerClass = [0 for k in range(21)]

    addedsamplenum = 0
    # caffe construct the net
    #
    caffe.set_mode_gpu()
    net = caffe.Classifier(
        SVM_deployPath,
        caffeModelPath,
        mean=np.load(
            os.path.join(
                caffe_path,
                'python/caffe/imagenet/ilsvrc_2012_mean.npy')).mean(1).mean(1),
        channel_swap=(2, 1, 0),
        raw_scale=255,
        image_dims=(256, 256))

    net2 = caffe.Classifier(
        SVM_deployPath2,
        caffeModelPath2,
        mean=np.load(
            os.path.join(
                caffe_path,
                'python/caffe/imagenet/ilsvrc_2012_mean.npy')).mean(1).mean(1),
        channel_swap=(2, 1, 0),
        raw_scale=255,
        image_dims=(256, 256))

    X = []
    y = []
    samples = []
    with open("csvfold/Train_" + str(i) + ".csv", "rb") as csvFile:
        csvReader = csv.reader(csvFile, delimiter=' ')
        for row in csvReader:
            samples.append(row[0])
            y.append(row[1])

    # divide the dataset into labeled and unlabeled sections according to seg_ratio
    labeled_sample, labeled_y = samples[:210 * seg_ratio], y[:210 * seg_ratio]
    unlabeled_sample, unlabeled_y = samples[210 * seg_ratio:], y[210 * seg_ratio:]

    # use EL_samples / EL_y to save the learned results
    EL_samples, EL_y = [], []
    j = 1
    train_sample, train_y = labeled_sample, labeled_y

    # train_X holds the CNN1 feature vectors and train_X2 the CNN2 feature vectors, so clf2 is the classifier for CNN2

    train_X, train_X2 = [], []
    lowConfidence_sample, lowConfidence_y = [], []

    for k in train_sample:
        prediction = classify(net, [k])
        train_X.append(prediction[0])
        prediction = classify(net2, [k])
        train_X2.append(prediction[0])
    #train svm
    lenofvector = len(train_X[0])
    lenofvector2 = len(train_X2[0])
    clf = calibration.CalibratedClassifierCV(svm.LinearSVC(C=100000))
    clf.fit(train_X, train_y)

    clf2 = calibration.CalibratedClassifierCV(svm.LinearSVC(C=100000))
    clf2.fit(train_X2, train_y)
    label_order = clf.classes_.tolist()
    claName = []

    for k in label_order:
        for o in labels:
            if labels[o] == int(k):
                claName.append(o)

    print("Labeled order is ", label_order)
    print("class  name order is ", claName)
    #test the performance without learning unlabeled data
    test_pred, test_label = [], []
    with open("csvfold/Test_" + str(i) + ".csv", "rb") as csvFile:
        csvReader = csv.reader(csvFile, delimiter=' ')
        for row in csvReader:
            features = classify(net, [row[0]])
            features2 = classify(net2, [row[0]])

            # the test classifier is not determined
            prediction = clf.predict(
                np.array(features[0]).reshape(1, lenofvector))
            prediction2 = clf2.predict(
                np.array(features2[0].reshape(1, lenofvector2)))
            proba_list = clf.predict_proba(
                np.array(features[0]).reshape(1, lenofvector))
            proba_list2 = clf2.predict_proba(
                np.array(features2[0]).reshape(1, lenofvector2))
            proba = proba_list[0][int(label_order.index(prediction))]
            proba2 = proba_list2[0][int(label_order.index(prediction2))]

            # decide the final result: keep clf's prediction unless clf2 is more confident
            if prediction != prediction2 and proba2 > proba:
                prediction = prediction2

            if prediction == row[1]:
                correct += 1
            else:
                wrong += 1
            total += 1
            # parameters used for constructing the confusion matrix
            test_pred.append(prediction)
            test_label.append(row[1])
    print("TOTAL: " + str(total))
    print("CORRECT: " + str(correct))
    print("WRONG: " + str(wrong))
    output = "the iteration round order is" + str(j) + "\n"
    output += "the accuracy ratio is " + str(
        float(correct) / float(total) * 100) + "\n\n"
    open("results.txt", "a").write(output + "\n")
    # plot the confusion matrix graph
    plot_save_graph(test_label, test_pred, 0, claName)

    while True:
        #		print( "the value of j is : " , str(j))
        addedsamplePerClass = [0 for k in range(21)]
        batch_sample = unlabeled_sample[210 * (j - 1):210 * j]
        batch_y = unlabeled_y[210 * (j - 1):210 * j]
        # the order is the label order learnt in svm

        # feature is the output vector of CNN
        # use svm to predict the unlabeled samples and save the result into EL_samples and EL_y
        #
        for k in range(210):

            features = classify(net, [batch_sample[k]])
            features2 = classify(net2, [batch_sample[k]])
            prediction = clf.predict(
                np.array(features[0]).reshape(1, lenofvector))
            prediction2 = clf2.predict(
                np.array(features2[0]).reshape(1, lenofvector2))
            proba_list = clf.predict_proba(
                np.array(features[0]).reshape(1, lenofvector))
            proba_list2 = clf2.predict_proba(
                np.array(features2[0]).reshape(1, lenofvector2))
            proba = proba_list[0][int(label_order.index(prediction))]
            proba2 = proba_list2[0][int(label_order.index(prediction2))]
            # classPointer saves the label value of the prediction

            #			print("the accuracy of ", batch_sample[k], " is ", str(proba*100),"%")
            #
            if prediction[0] == prediction2[0] and (proba >= 0.3
                                                    or proba2 >= 0.3):
                classPointer = int(prediction[0])
                if addedsamplePerClass[classPointer] < MaxNumPerClassPerIteration:
                    addedsamplePerClass[classPointer] += 1
                    EL_samples.append(batch_sample[k])
                    EL_y.append(str(prediction[0]))
                    addedsamplenum += 1
                else:
                    lowConfidence_sample.append(batch_sample[k])
                    lowConfidence_y.append(str(prediction[0]))

        # retrain the classifiers on labeled plus newly self-labeled samples

        train_sample, train_X, train_X2, train_y = [], [], [], []
        train_sample, train_y = labeled_sample + EL_samples, labeled_y + EL_y
        print("the len of train_sample is :", str(len(train_sample)),
              "the number of added samples is :", str(addedsamplenum))

        for k in range(len(train_sample)):
            features = classify(net, [train_sample[k]])
            train_X.append(features[0])
            features2 = classify(net2, [train_sample[k]])
            train_X2.append(features2[0])


#		pdb.set_trace()
        clf = calibration.CalibratedClassifierCV(svm.LinearSVC(C=100000))
        clf.fit(train_X, train_y)
        clf2 = calibration.CalibratedClassifierCV(svm.LinearSVC(C=100000))
        clf2.fit(train_X2, train_y)

        correct = 0
        total = 0
        wrong = 0

        test_pred, test_label = [], []
        with open("csvfold/Test_" + str(i) + ".csv", "rb") as csvFile:
            csvReader = csv.reader(csvFile, delimiter=' ')
            for row in csvReader:
                features = classify(net, [row[0]])
                features2 = classify(net2, [row[0]])

                # the test classifier is not determined
                prediction = clf.predict(
                    np.array(features[0]).reshape(1, lenofvector))
                prediction2 = clf2.predict(
                    np.array(features2[0].reshape(1, lenofvector2)))
                proba_list = clf.predict_proba(
                    np.array(features[0]).reshape(1, lenofvector))
                proba_list2 = clf2.predict_proba(
                    np.array(features2[0]).reshape(1, lenofvector2))
                proba = proba_list[0][int(label_order.index(prediction))]
                proba2 = proba_list2[0][int(label_order.index(prediction2))]

                # decide the final result: keep clf's prediction unless clf2 is more confident
                if prediction != prediction2 and proba2 > proba:
                    prediction = prediction2

                if prediction == row[1]:
                    correct += 1
                else:
                    wrong += 1
                total += 1
                test_pred.append(prediction)
                test_label.append(row[1])
        print("TOTAL: " + str(total))
        print("CORRECT: " + str(correct))
        print("WRONG: " + str(wrong))
        output = "the iteration round order is" + str(j) + "\n"
        output += "the accuracy ratio is " + str(
            float(correct) / float(total) * 100) + "\n\n"
        open("results.txt", "a").write(output + "\n")

        plot_save_graph(test_label, test_pred, j, claName)

        j += 1
        if j > 8:
            break

    print("train the low confidence samples")

    # count saves the previous value of addedsamplenum
    count = 0
    # iteratively learn the low-confidence samples
    while count != addedsamplenum:
        j = j + 1
        lowConfidenceMid_sample = []
        count = addedsamplenum

        for k in range(len(lowConfidence_sample)):
            sample = lowConfidence_sample[k]
            features = classify(net, [sample])
            features2 = classify(net2, [sample])
            prediction = clf.predict(
                np.array(features[0]).reshape(1, lenofvector))
            prediction2 = clf2.predict(
                np.array(features2[0]).reshape(1, lenofvector2))
            proba_list = clf.predict_proba(
                np.array(features[0]).reshape(1, lenofvector))
            proba_list2 = clf2.predict_proba(
                np.array(features2[0]).reshape(1, lenofvector2))
            proba = proba_list[0][int(label_order.index(prediction))]
            proba2 = proba_list2[0][int(label_order.index(prediction2))]

            # classPointer saves the label value of the prediction
            #			print("the accuracy of ", batch_sample[k], " is ", str(proba*100),"%")
            if prediction[0] == prediction2[0] and (proba >= 0.3
                                                    or proba2 >= 0.3):
                EL_samples.append(sample)
                EL_y.append(str(prediction[0]))
                addedsamplenum += 1
            else:
                lowConfidenceMid_sample.append(sample)

        train_sample, train_X, train_y, train_X2 = [], [], [], []
        train_sample, train_y = labeled_sample + EL_samples, labeled_y + EL_y
        print("the len of train_sample is :", str(len(train_sample)),
              "the number of added samples is :", str(addedsamplenum))

        for k in range(len(train_sample)):
            features = classify(net, [train_sample[k]])
            train_X.append(features[0])
            features2 = classify(net2, [train_sample[k]])
            train_X2.append(features2[0])
        #		pdb.set_trace()
        clf = calibration.CalibratedClassifierCV(svm.LinearSVC(C=100000))
        clf.fit(train_X, train_y)
        clf2 = calibration.CalibratedClassifierCV(svm.LinearSVC(C=100000))
        clf2.fit(train_X2, train_y)

        correct = 0
        total = 0
        wrong = 0

        test_label, test_pred = [], []
        with open("csvfold/Test_" + str(i) + ".csv", "rb") as csvFile:
            csvReader = csv.reader(csvFile, delimiter=' ')
            for row in csvReader:
                features = classify(net, [row[0]])
                features2 = classify(net2, [row[0]])

                # the test classifier is not determined
                prediction = clf.predict(
                    np.array(features[0]).reshape(1, lenofvector))
                prediction2 = clf2.predict(
                    np.array(features2[0].reshape(1, lenofvector2)))
                proba_list = clf.predict_proba(
                    np.array(features[0]).reshape(1, lenofvector))
                proba_list2 = clf2.predict_proba(
                    np.array(features2[0]).reshape(1, lenofvector2))
                proba = proba_list[0][int(label_order.index(prediction))]
                proba2 = proba_list2[0][int(label_order.index(prediction2))]

                # decide the final result: keep clf's prediction unless clf2 is more confident
                if prediction != prediction2 and proba2 > proba:
                    prediction = prediction2

                if prediction == row[1]:
                    correct += 1
                else:
                    wrong += 1
                total += 1
                test_pred.append(prediction)
                test_label.append(row[1])
        print("TOTAL: " + str(total))
        print("CORRECT: " + str(correct))
        print("WRONG: " + str(wrong))
        output = "the iteration round order is" + str(j) + "\n"
        output += "the accuracy ratio is " + str(
            float(correct) / float(total) * 100) + "\n\n"
        open("results.txt", "a").write(output + "\n")
        plot_save_graph(test_label, test_pred, j, claName)
        lowConfidence_sample = lowConfidenceMid_sample
Example #24
# -*- coding: utf-8 -*-
"""
Created on Mon May 21 11:57:51 2018

@author: Sven

Simple classifier example for learning Python.
"""
from sklearn import calibration
import sex_data as sd

data = sd.create_data()

clf = calibration.CalibratedClassifierCV()
clf = clf.fit(data.loc[:, ['height', 'weight', 'shoe_size']],
              data.loc[:, 'sex'])

prediction = clf.predict([[190, 70, 43]])
print(prediction)
Example #25
def _train_svm(feats, labels, prim_id, ex_size, num_ex):
    logger.info("Training Primitive {}.".format(prim_id))

    # split exemplars
    pos_img_ids = np.where(labels)[0]
    pos_img_splits = [pos_img_ids] if num_ex == 1 else [pos_img_ids] + [
        np.random.choice(
            pos_img_ids, size=min(ex_size, pos_img_ids.size), replace=False)
        for _ in range(num_ex)
    ]
    logger.info("Primitive {} has {} exemplars.".format(
        prim_id, len(pos_img_splits)))
    svms, clbs = [], []
    for ex_id, pos_ex_ids in enumerate(pos_img_splits):
        if len(pos_ex_ids) > 0:
            logger.info("Primitive {} training exemplar {} ...".format(
                prim_id, ex_id))
            svm_object = sklearn_svm.LinearSVC(C=1e-3,
                                               class_weight={
                                                   1: 2,
                                                   -1: 1.0
                                               },
                                               verbose=0,
                                               penalty='l2',
                                               loss='hinge',
                                               dual=True)
            neg_ex_ids = np.array(
                [idx for idx in range(labels.size) if idx not in pos_ex_ids])
            X = np.vstack([feats[pos_ex_ids], feats[neg_ex_ids]])
            Y = np.hstack(
                [np.ones(pos_ex_ids.size), -1.0 * np.ones(neg_ex_ids.size)])
            svm_object.fit(X, Y)
            train_acc = svm_object.score(X, Y)
            svms.append(svm_object)
            logger.info(
                "SVM (Primitive {} exemplar {}) has {} positives, {} negatives and accuracy {}."
                .format(prim_id, ex_id, pos_ex_ids.size, neg_ex_ids.size,
                        train_acc))
            if ex_id == 0:
                svm_object_clb = sklearn_svm.LinearSVC(C=1e-3,
                                                       class_weight={
                                                           1: 2,
                                                           -1: 1.0
                                                       },
                                                       verbose=0,
                                                       penalty='l2',
                                                       loss='hinge',
                                                       dual=True)
                np.random.shuffle(pos_ex_ids)
                np.random.shuffle(neg_ex_ids)
                pos_split_point = int(np.ceil(0.9 * len(pos_ex_ids)))
                cls_pos_idx = pos_ex_ids[:pos_split_point]
                calib_pos_idx = pos_ex_ids[pos_split_point:]
                neg_split_point = int(np.ceil(0.9 * len(neg_ex_ids)))
                cls_neg_idx = neg_ex_ids[:neg_split_point]
                calib_neg_idx = neg_ex_ids[neg_split_point:]
                X = np.vstack([feats[cls_pos_idx], feats[cls_neg_idx]])
                Y = np.hstack([
                    np.ones(cls_pos_idx.size), -1.0 * np.ones(cls_neg_idx.size)
                ])
                svm_object_clb.fit(X, Y)
                clb_object = sklearn_clb.CalibratedClassifierCV(svm_object_clb,
                                                                cv='prefit')
                X = np.vstack([feats[calib_pos_idx], feats[calib_neg_idx]])
                Y = np.hstack([
                    np.ones(calib_pos_idx.size),
                    -1.0 * np.ones(calib_neg_idx.size)
                ])
                clb_object.fit(X, Y)
                clbs.append(clb_object)
                calib_acc = clb_object.score(X, Y)
                logger.info(
                    "Calibrated SVM (Primitive {} exemplar {}) has {} positives, {} negatives and calibration accuracy {}."
                    .format(prim_id, ex_id, pos_ex_ids.size, neg_ex_ids.size,
                            calib_acc))
    return svms, clbs
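The cv='prefit' pattern above only calibrates honestly when the calibration rows are held out from SVM training, which is why the snippet splits the positive and negative ids 90/10 first. A minimal self-contained sketch of that split-then-calibrate step on synthetic data; all names here are illustrative:

# Sketch of the prefit-calibration pattern used in _train_svm,
# demonstrated on synthetic data.
import numpy as np
from sklearn import calibration as sklearn_clb
from sklearn import svm as sklearn_svm
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=400, n_features=20, random_state=0)
y = np.where(y == 1, 1, -1)  # match the {1, -1} labels used above

# 90/10 split: the SVM never sees the calibration rows.
X_cls, X_cal, y_cls, y_cal = train_test_split(
    X, y, test_size=0.1, random_state=0, stratify=y)

svm_object = sklearn_svm.LinearSVC(C=1e-3, class_weight={1: 2, -1: 1.0},
                                   penalty='l2', loss='hinge', dual=True)
svm_object.fit(X_cls, y_cls)

# cv='prefit' reuses the fitted SVM and only fits the sigmoid calibrator.
clb_object = sklearn_clb.CalibratedClassifierCV(svm_object, cv='prefit')
clb_object.fit(X_cal, y_cal)
print(clb_object.predict_proba(X_cal[:3]))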
Beispiel #26
0
def predict_kfold_ML(data, label, features, cv_type, clf, calibration, seed,
                     cvfolds):

    X = data.loc[:, features]
    Y = data.loc[:, [label]].astype(bool)

    if (cv_type == 'stratifiedkfold'):
        skf = sk_ms.StratifiedKFold(cvfolds, random_state=seed, shuffle=True)
    elif (cv_type == 'kfold'):
        skf = sk_ms.KFold(cvfolds, random_state=seed, shuffle=True)
    else:
        raise ValueError('incompatible crossvalidation type')

    predicted_probability = []
    true_label = []

    for train_index, test_index in skf.split(X, Y):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        Y_train, Y_test = Y.iloc[train_index], Y.iloc[test_index]

        if (calibration is None):
            clf.fit(X_train, Y_train.values.ravel().astype(int))
            calibrated_clf = clf
        elif calibration in ('isotonic', 'sigmoid'):
            # a fitted hyper-parameter search exposes best_estimator_;
            # refit it on this fold and calibrate the winning estimator
            if hasattr(clf, 'best_estimator_'):
                clf.fit(X_train, Y_train.values.ravel().astype(int))
                base_clf = clf.best_estimator_
            else:
                base_clf = clf
            calibrated_clf = sk_cal.CalibratedClassifierCV(base_clf,
                                                           method=calibration,
                                                           cv=10)
            calibrated_clf.fit(X_train, Y_train.values.ravel().astype(int))
        else:
            raise ValueError('Unknown calibration type')
        try:
            Y_prob = calibrated_clf.predict_proba(X_test)
            predicted_probability.append(Y_prob[:, 1])
        except AttributeError:
            # no predict_proba (e.g. an uncalibrated margin classifier)
            Y_prob = calibrated_clf.decision_function(X_test)
            predicted_probability.append(Y_prob)
        true_label.append(list(Y_test.values.flat))

    tl_pp_dict = {"true_label": true_label, "pred_prob": predicted_probability}

    return tl_pp_dict
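A usage sketch for predict_kfold_ML, assuming the module-level aliases sk_ms (sklearn.model_selection) and sk_cal (sklearn.calibration) that the function relies on; the dataset, classifier, and scoring below are illustrative:

# Illustrative call of predict_kfold_ML on a synthetic binary problem.
import numpy as np
import pandas as pd
from sklearn import calibration as sk_cal      # alias assumed by the function
from sklearn import model_selection as sk_ms   # alias assumed by the function
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

X, y = make_classification(n_samples=500, n_features=8, random_state=0)
df = pd.DataFrame(X, columns=['f%d' % i for i in range(8)])
df['outcome'] = y

result = predict_kfold_ML(df, label='outcome',
                          features=['f%d' % i for i in range(8)],
                          cv_type='stratifiedkfold',
                          clf=LogisticRegression(max_iter=1000),
                          calibration='sigmoid', seed=42, cvfolds=5)

fold_aucs = [roc_auc_score(t, p) for t, p in
             zip(result['true_label'], result['pred_prob'])]
print('mean AUC: %.3f' % np.mean(fold_aucs))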
Beispiel #27
0
        X_test, y_test, test_size=0.82, random_state=1, stratify=y_test)

    print("X_train\t\t%sx%s" % (X_train.shape))
    print("X_test\t\t%sx%s" % (X_test.shape))
    print("X_explain\t%sx%s" % (X_explain.shape))

    # Create an ensemble blackbox classifier and predict test and explain set
    clf_svm = svm.SVC(probability=True, kernel="linear", random_state=1)
    clf_svm.fit(X_train, y_train)
    svm_preds = clf_svm.predict_proba(X_test)
    svm_explanations = clf_svm.predict_proba(X_explain)

    clf_base = ensemble.ExtraTreesClassifier(n_jobs=-1,
                                             n_estimators=1000,
                                             random_state=1)
    clf_cet = calibration.CalibratedClassifierCV(base_estimator=clf_base)
    clf_cet.fit(X_train, y_train)
    et_preds = clf_cet.predict_proba(X_test)
    et_explanations = clf_cet.predict_proba(X_explain)

    blackbox_preds = (et_preds + svm_preds) / 2.
    blackbox_explanations = (et_explanations + svm_explanations) / 2.

    print("\n\nSupport Vector Machine with linear kernel")
    print("Accuracy Score:\t%f" %
          accuracy_score(y_test, np.argmax(svm_preds, axis=1)))
    print("Multi-Log loss:\t%f" % log_loss(y_test, svm_preds))

    print("\n\nCalibrated Extremely Randomized Trees")
    print("Accuracy Score:\t%f" %
          accuracy_score(y_test, np.argmax(et_preds, axis=1)))
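Averaging the two probability matrices works here because both classifiers see the same classes and scikit-learn orders predict_proba columns by classes_ in both cases. A minimal self-contained sketch of the same blend on the iris data; split sizes and hyper-parameters are illustrative:

# Minimal sketch of the probability-averaging ensemble above, on iris.
import numpy as np
from sklearn import calibration, ensemble, svm
from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score, log_loss
from sklearn.model_selection import train_test_split

X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=1, stratify=y)

clf_svm = svm.SVC(probability=True, kernel='linear', random_state=1)
clf_svm.fit(X_train, y_train)

clf_cet = calibration.CalibratedClassifierCV(
    ensemble.ExtraTreesClassifier(n_estimators=200, random_state=1))
clf_cet.fit(X_train, y_train)

# Both predict_proba matrices share the classes_ column order.
blend = (clf_svm.predict_proba(X_test) + clf_cet.predict_proba(X_test)) / 2.
print('Accuracy:\t%f' % accuracy_score(y_test, np.argmax(blend, axis=1)))
print('Log loss:\t%f' % log_loss(y_test, blend))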
Beispiel #28
0
        elif args.dataset == CKPLUS.name:
            get_data_ckplus(clahe, detector, predictor, selected_labels,
                            SAVE_IMAGES)

    if train:
        #for C in [1.5*1e-3, 3e-3, 4.5*1e-3, 6*1e-3, 7.5*1e-3, 9e-3]:
        print("building model...")
        #clf = svm.LinearSVC(C=0.01, random_state=0, tol=1e-4, dual=False)
        # CK+: C=0.1 or 0.01
        # fer2013: C=1e-3
        #clf = calibration.CalibratedClassifierCV(clf, method='sigmoid', cv=5)
        if args.dataset == FER2013.name:
            OUTPUT_FOLDER_NAME = FER2013.name
            clf = svm.LinearSVC(C=0.001, random_state=0, tol=1e-4, dual=False)
            clf = calibration.CalibratedClassifierCV(clf,
                                                     method='sigmoid',
                                                     cv=5)
            with open(OUTPUT_FOLDER_NAME + '/Training/landmarks_feats.pkl',
                      'rb') as f:
                feats_data = pickle.load(f)
            with open(OUTPUT_FOLDER_NAME + '/Training/hog_feats.pkl',
                      'rb') as f:
                hog_feats = pickle.load(f)
            with open(OUTPUT_FOLDER_NAME + '/Training/labels.pkl', 'rb') as f:
                labels = pickle.load(f)
            feats_data = np.concatenate([feats_data, hog_feats], axis=1)

            with open(OUTPUT_FOLDER_NAME + '/PrivateTest/landmarks_feats.pkl',
                      'rb') as f:
                feats_data2 = pickle.load(f)
            with open(OUTPUT_FOLDER_NAME + '/PrivateTest/hog_feats.pkl',
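Beispiel #28 is cut off mid-way, but the visible training pattern is clear: concatenate landmark and HOG features, then fit a sigmoid-calibrated LinearSVC. A hedged sketch of that step with random stand-in arrays; every shape and label count below is an assumption:

# Sketch of the visible pattern: landmarks + HOG -> calibrated LinearSVC.
# Random arrays stand in for the pickled FER2013 features.
import numpy as np
from sklearn import calibration, svm

rng = np.random.default_rng(0)
n_samples = 300
landmarks = rng.normal(size=(n_samples, 136))   # assumed 68 (x, y) landmarks
hog_feats = rng.normal(size=(n_samples, 512))   # assumed HOG descriptor size
labels = rng.integers(0, 7, size=n_samples)     # assumed 7 emotion classes

feats_data = np.concatenate([landmarks, hog_feats], axis=1)

clf = svm.LinearSVC(C=0.001, random_state=0, tol=1e-4, dual=False)
clf = calibration.CalibratedClassifierCV(clf, method='sigmoid', cv=5)
clf.fit(feats_data, labels)
print(clf.predict_proba(feats_data[:2]).round(3))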
Beispiel #29
0
                      proj_mask=args.proj_mask,
                      online_learn=args.online_learn,
                      svm_model=args.svm_model,
                      epochs=args.epochs)
    else:
        logger.info('Using SVM algo: SVC.')
        clf = svc_fit(train=(X_train, y_train),
                      proj_mask=args.proj_mask,
                      epochs=args.epochs)

    # Generate feature vectors.
    X_val_fv = common.process_samples(X_val, proj_mask=proj_mask)
    X_test_fv = common.process_samples(X_test, proj_mask=proj_mask)

    logger.info('Calibrating classifier.')
    cal_clf = calibration.CalibratedClassifierCV(base_estimator=clf,
                                                 cv='prefit')
    cal_clf.fit(X_val_fv, y_val)

    logger.info('Evaluating final classifier on test set.')
    evaluate_model(cal_clf, X_test_fv, y_test, class_names, args.svm_cm)

    logger.info(f'Saving svm model to: {args.svm_model}.')
    with open(args.svm_model, 'wb') as outfile:
        outfile.write(pickle.dumps(cal_clf))

    # Do not overwrite label encoder if online learning was performed.
    if not args.online_learn or args.use_svc:
        logger.info(f'Saving label encoder to: {args.label_encoder}.')
        with open(args.label_encoder, 'wb') as outfile:
            outfile.write(pickle.dumps(le))
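Loading the pickled artifacts back for inference is symmetric to the save step above. A hedged sketch of that path; the file names and the input array are assumptions, and common is the project-local module already used in the snippet:

# Hedged inference-side sketch: reload the calibrated SVM and label
# encoder saved above. File names and new_samples.npy are assumptions.
import pickle

import numpy as np

import common  # project-local module providing process_samples

with open('svm_model.pkl', 'rb') as infile:       # args.svm_model
    cal_clf = pickle.loads(infile.read())
with open('label_encoder.pkl', 'rb') as infile:   # args.label_encoder
    le = pickle.loads(infile.read())

X_new = np.load('new_samples.npy')                # assumed raw samples
X_new_fv = common.process_samples(X_new, proj_mask=None)
pred_names = le.inverse_transform(cal_clf.predict(X_new_fv))
print(pred_names[:5])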
Beispiel #30
0
    def train_main(self):
        data = pd.DataFrame()
        model_dict = dict()
        train_data_path = self.train_data_path

        for i in train_data_path:
            data_tmp = pd.read_excel(i, header=0)
            data_tmp.columns = ["pid", "label", "context"]

            data = pd.concat([data, data_tmp])

        data = shuffle(data)

        data["context_ngram"] = data[["context"]].applymap(ngram_process)
        context = data["context_ngram"].values

        label = data[["label"]].applymap(fun_map).values.ravel()

        data_test = pd.read_excel(self.test_data_path, header=0)
        data_test.columns = ["pid", "label", "context"]

        data_test["context_ngram"] = data_test[["context"]].applymap(ngram_process)

        test_context = data_test["context_ngram"].values
        test_label = data_test[["label"]].applymap(fun_map).values.ravel()

        # tf idf
        tf_idf = TfidfVectorizer(analyzer=fun_1, min_df=50)
        tf_idf.fit(context)

        model_dict["model_1"] = pickle.dumps(tf_idf)

        feature_names = tf_idf.get_feature_names()
        model_dict["feature_names"] = pickle.dumps(feature_names)
        print("feature num", len(feature_names))

        x_train = tf_idf.transform(context)
        x_test = tf_idf.transform(test_context)

        # chi
        model = SelectKBest(chi2, k="all")
        model.fit(x_train, label)

        model_dict["model_2"] = pickle.dumps(model)

        x_train = model.transform(x_train)
        x_test = model.transform(x_test)

        classify = svm.LinearSVC(C=0.9)

        # param_grid = {'C':[1,10,100,1000],'gamma':[1,0.1,0.001,0.0001], 'kernel':['linear','rbf']}
        # grid = GridSearchCV(SVC(),param_grid,refit = True, verbose=2)
        # grid = xgb.XGBClassifier()
        # print(grid.best_params_)

        classify = calibration.CalibratedClassifierCV(classify, cv=10)

        classify.fit(x_train, label)
        y_predict = classify.predict(x_test)

        print(metrics.classification_report(test_label, y_predict))
        print("accuracy:", metrics.accuracy_score(test_label, y_predict))

        model_dict["model_3"] = pickle.dumps(classify)

        with open(self.model_path, mode='wb') as fm:
            joblib.dump(model_dict, fm)
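At prediction time the three pickled sub-models in model_dict must be re-applied in training order: vectorize, select features, classify. A hedged inference sketch; the model file name and sample text are assumptions, while ngram_process is the helper already used in train_main:

# Hedged inference sketch matching train_main's model_dict layout:
# model_1 = TfidfVectorizer, model_2 = SelectKBest, model_3 = calibrated SVM.
import pickle

import joblib

with open('model.pkl', 'rb') as fm:           # assumed self.model_path
    model_dict = joblib.load(fm)

tf_idf = pickle.loads(model_dict['model_1'])
selector = pickle.loads(model_dict['model_2'])
classify = pickle.loads(model_dict['model_3'])

texts = ['some example context']              # assumed input
grams = [ngram_process(t) for t in texts]     # helper from the snippet above
x = selector.transform(tf_idf.transform(grams))
print(classify.predict(x))
print(classify.predict_proba(x))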