def predict_groupkfold_ML(data, label, features, group_label, cv_type, clf, seed, cvfolds):
    X = data.loc[:, features]
    Y = data.loc[:, [label]].astype(bool)
    G = data.loc[:, group_label]
    if cv_type == 'stratifiedgroupkfold':
        gkf = StratifiedGroupKFold(cvfolds, random_state=seed, shuffle=True)
    elif cv_type == 'groupkfold':
        # GroupKFold has no shuffle option, so shuffle the data first.
        X, Y, G = sk_u.shuffle(X, Y, G, random_state=seed)
        gkf = GroupKFold(cvfolds)
    else:
        raise ValueError('incompatible cross-validation type')
    predicted_probability = []
    true_label = []
    for train_index, test_index in gkf.split(X, Y, G):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        Y_train, Y_test = Y.iloc[train_index], Y.iloc[test_index]
        G_train, G_test = G.iloc[train_index], G.iloc[test_index]
        # Some estimators (e.g. searches over group-aware CV) accept groups; fall back otherwise.
        try:
            clf.fit(X_train, Y_train.values.ravel().astype(int), groups=G_train)
        except TypeError:
            clf.fit(X_train, Y_train.values.ravel().astype(int))
        if hasattr(clf, 'best_estimator_'):
            calibrated_clf = sk_cal.CalibratedClassifierCV(clf.best_estimator_, method='isotonic', cv=10)
        else:
            calibrated_clf = sk_cal.CalibratedClassifierCV(clf, method='isotonic', cv=10)
        try:
            calibrated_clf.fit(X_train, Y_train.values.ravel().astype(int), groups=G_train)
        except TypeError:
            calibrated_clf.fit(X_train, Y_train.values.ravel().astype(int))
        # Prefer calibrated probabilities; fall back to decision scores.
        try:
            Y_prob = calibrated_clf.predict_proba(X_test)
            predicted_probability.append(Y_prob[:, 1])
        except AttributeError:
            Y_prob = calibrated_clf.decision_function(X_test)
            predicted_probability.append(Y_prob)
        true_label.append(list(Y_test.values.flat))
    tl_pp_dict = {"true_label": true_label, "pred_prob": predicted_probability}
    return tl_pp_dict
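# A minimal usage sketch for predict_groupkfold_ML above, assuming the aliases
# the function relies on (sk_cal for sklearn.calibration, sk_u for sklearn.utils)
# and scikit-learn >= 1.0 for StratifiedGroupKFold. The DataFrame, column names,
# and classifier choice are hypothetical, not from the original code.
import numpy as np
import pandas as pd
from sklearn import calibration as sk_cal, utils as sk_u
from sklearn.model_selection import GroupKFold, StratifiedGroupKFold
from sklearn.linear_model import LogisticRegression

rng = np.random.default_rng(0)
df = pd.DataFrame({
    "f1": rng.normal(size=200),
    "f2": rng.normal(size=200),
    "outcome": rng.integers(0, 2, size=200).astype(bool),
    "subject": rng.integers(0, 20, size=200),  # grouping column: rows sharing a subject stay in one fold
})
result = predict_groupkfold_ML(df, "outcome", ["f1", "f2"], "subject",
                               "groupkfold", LogisticRegression(), seed=42, cvfolds=5)
print(len(result["pred_prob"]))  # one probability array per fold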
def __init__(self, name='default', pos_weight=2.0, c=1e-5, threshold=1.1, limit_retrain=100):
    self.svm_object = sklearn_svm.LinearSVC(C=c,
                                            class_weight={1: pos_weight, -1: 1.0},
                                            verbose=0, penalty='l2', loss='hinge',
                                            dual=True)
    self.pos_feats = None
    self.neg_feats = None
    self.thr = threshold
    self.limit_retrain = limit_retrain
    self.neg_cache_feats = []
    self.name = name
    self.initialized = False
    self.clb_object = sklearn_clb.CalibratedClassifierCV(self.svm_object, method='sigmoid', cv=3)
def fit(self, X, y):
    if not MulticlassClassifierOptimizer.fitted_model(self.model):
        print(" -> Fitting base model (wasn't fitted).")
        self.model.fit(X=X, y=y)
    print(' -> Model calibration.')
    self.model = skc.CalibratedClassifierCV(base_estimator=self.model, method='sigmoid', cv='prefit')
    self.model.fit(X=X, y=y,
                   sample_weight=skcw.compute_sample_weight(class_weight='balanced', y=y))
    print(' -> Optimizing multiclass thresholds.')
    self.thresholds = MulticlassClassifierOptimizer.get_optimized_thresholds(
        scoring_function=self.scoring_function,
        y_true=MulticlassClassifierOptimizer.one_hot_encode(y=y),
        y_score=self.model.predict_proba(X=X))
    self.optimized = True
    return self
def get_default_classifier(self):
    # Note: n_iter was replaced by max_iter in later scikit-learn releases.
    self.main_classifier = sklinear.SGDClassifier(loss='hinge', penalty='l2',
                                                  alpha=1e-3, n_iter=5, random_state=42)
    # classifier = nb.MultinomialNB()
    self.clsf_name = self.main_classifier.__class__.__name__
    classifier = skcalibrated.CalibratedClassifierCV(self.main_classifier,
                                                     cv=CLSF_CONSTANTS._calibration_nfolds,
                                                     method=CLSF_CONSTANTS._calibration_method)
    return classifier
def fit(self, X, y):
    self.num_classes = len(np.unique(y))
    self.model_fit = self.model.fit(X, y)
    if self.calibrate:
        # Build (but do not yet fit) a 10-fold sigmoid calibration wrapper.
        self.calibrated = calibration.CalibratedClassifierCV(self.model_fit, method='sigmoid', cv=10)
    else:
        self.calibrated = None
    return copy.deepcopy(self)
def calibrate_model(model: "Estimator",
                    X: pd.DataFrame,
                    y: pd.DataFrame,
                    path: str = None,
                    X_test: Union[np.ndarray, None] = None) -> Optional[np.array]:
    """
    Calibrates an estimator and generates calibration plots before and after
    the calibration process.
    """
    # Evaluate the baseline (uncalibrated) model.
    print("Evaluating the uncalibrated model...")
    X_train, X_dev, y_train, y_dev = model_selection.train_test_split(
        X, y, test_size=0.3, random_state=42)
    model.fit(X_train, y_train)
    pred_dev = model.predict_proba(X_dev)
    prob_true, prob_pred = calibration.calibration_curve(y_dev, pred_dev[:, 1], n_bins=50)
    plt.close()
    plt.plot([0, 1], [0, 1], linestyle="--")
    plt.plot(prob_pred, prob_true, marker=".")
    plt.xlabel("Predicted Probability")
    plt.ylabel("True Probability")
    if path is None:
        plt.show()
    else:
        plt.savefig(path + "calibration_curve_raw_model.png", dpi=300)

    # Evaluate the calibrated model.
    print("Evaluating the calibrated model...")
    calibrator = calibration.CalibratedClassifierCV(model, method="isotonic", cv=10)
    calibrator.fit(X_train, y_train)
    pred_dev = calibrator.predict_proba(X_dev)
    prob_true, prob_pred = calibration.calibration_curve(y_dev, pred_dev[:, 1], n_bins=50)
    plt.close()
    plt.plot([0, 1], [0, 1], linestyle="--")
    plt.plot(prob_pred, prob_true, marker=".")
    plt.xlabel("Predicted Probability")
    plt.ylabel("True Probability")
    if path is None:
        plt.show()
    else:
        plt.savefig(path + "calibration_curve_calibrated_model.png", dpi=300)

    # Predict on the test set.
    if X_test is not None:
        print("Predicting test set results...")
        # Refit the calibrator using the whole training set.
        calibrator.fit(X, y)
        pred_test = calibrator.predict_proba(X_test)
        return pred_test
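# A minimal sketch of calling calibrate_model above on synthetic data, assuming
# the module-level imports it relies on (pandas as pd, numpy as np,
# matplotlib.pyplot as plt, sklearn's model_selection and calibration, and the
# typing names). The data and the RandomForest choice are hypothetical.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from typing import Optional, Union
from sklearn import calibration, model_selection
from sklearn.ensemble import RandomForestClassifier

rng = np.random.default_rng(0)
X_demo = pd.DataFrame(rng.normal(size=(500, 4)), columns=list("abcd"))
y_demo = ((X_demo["a"] + rng.normal(scale=0.5, size=500)) > 0).astype(int)
probs = calibrate_model(RandomForestClassifier(random_state=0),
                        X_demo, y_demo,
                        path=None,                   # path=None shows the plots interactively
                        X_test=X_demo.values[:10])   # calibrated test-set probabilities
print(probs.shape)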
def digits_recognition():
    FILE_NAME = "digits_recognition.pickle"
    # Load the MNIST dataset offline.
    (x_train, y_train), (x_test, y_test) = mnist.load_data()
    # Display 4 samples from the dataset.
    _, axes = plt.subplots(2, 4)
    images_and_labels = list(zip(x_train, y_train))
    plot_number(axes, images_and_labels, line=0, prelabel='Training')
    # Flatten the 28x28 images into a 784-element vector per image.
    num_pixels = x_train.shape[1] * x_train.shape[2]
    image_size = x_train.shape[1]
    x_train = x_train.reshape((x_train.shape[0], num_pixels))
    x_test = x_test.reshape((x_test.shape[0], num_pixels))
    x_train = x_train / 255.0
    x_test = x_test / 255.0
    model_file = Path(FILE_NAME)
    if not model_file.is_file():
        # Create a classifier: LinearSVC is used because SVC struggles with very large datasets.
        svmcl = svm.LinearSVC()
        classifier = calibration.CalibratedClassifierCV(svmcl)
        print("Training model...\n")
        # Learn the digits.
        classifier.fit(x_train, y_train)
        # Save the model to disk.
        filename = FILE_NAME
        pickle.dump(classifier, open(filename, 'wb'))
    else:
        print("Loading saved model...\n")
        classifier = pickle.load(open(FILE_NAME, 'rb'))
    # Now predict the value of the digit on the test set.
    predicted = classifier.predict(x_test)
    accuracy = 100 * accuracy_score(y_test, predicted)
    print('SVC accuracy: [%.2f]' % accuracy)
    # results = classifier.predict_proba(X_test)[0]
    # Restore the 28x28 format of the images used for testing.
    images_restored = x_test.reshape((-1, image_size, image_size))
    images_and_predictions = list(zip(images_restored, predicted))
    plot_number(axes, images_and_predictions, line=1, prelabel='Prediction')
    return classifier
def predict_filter_kfold_ML(data, label, features, filter_function, clf, seed, cvfolds):
    kf = sk_ms.KFold(cvfolds, random_state=seed, shuffle=True)
    predicted_probability = []
    true_label = []
    for train_index, test_index in kf.split(data):
        data_train, data_test = data.iloc[train_index], data.iloc[test_index]
        X_train = filter_function(data_train).loc[:, features]
        Y_train = filter_function(data_train).loc[:, [label]]
        # Drop rows whose label is missing after filtering.
        X_train = X_train.loc[~Y_train[label].isnull()]
        Y_train = Y_train.loc[~Y_train[label].isnull()]
        X_test = data_test.loc[:, features]
        Y_test = data_test.loc[:, [label]]
        clf.fit(X_train, Y_train.values.ravel().astype(int))
        if hasattr(clf, 'best_estimator_'):
            calibrated_clf = sk_cal.CalibratedClassifierCV(clf.best_estimator_, method='isotonic', cv=10)
        else:
            calibrated_clf = sk_cal.CalibratedClassifierCV(clf, method='isotonic', cv=10)
        calibrated_clf.fit(X_train, Y_train.values.ravel().astype(int))
        # Prefer calibrated probabilities; fall back to decision scores.
        try:
            Y_prob = calibrated_clf.predict_proba(X_test)
            predicted_probability.append(Y_prob[:, 1])
        except AttributeError:
            Y_prob = calibrated_clf.decision_function(X_test)
            predicted_probability.append(Y_prob)
        true_label.append(list(Y_test.values.flat))
    tl_pp_dict = {"true_label": true_label, "pred_prob": predicted_probability}
    return tl_pp_dict
def svm_classify(self, train_set, train_tag, test_set, test_tag):
    svc = svm.LinearSVC()
    clf = calibration.CalibratedClassifierCV(svc)
    clf_res = clf.fit(train_set, train_tag)
    train_pred = clf_res.predict(train_set)
    test_pred = clf_res.predict(test_set)
    train_err_num, train_err_ratio = self.checkPred(train_tag, train_pred)
    test_err_num, test_err_ratio = self.checkPred(test_tag, test_pred)
    print('=== Classification training finished; results below ===')
    print('Training set error: {e}'.format(e=train_err_ratio))
    print('Validation set error: {e}'.format(e=test_err_ratio))
    return clf_res
def __init__(self, feature_pipelines=None, classifier=None):
    if feature_pipelines:
        self.feature_pipelines = feature_pipelines
    else:
        self.feature_pipelines = self.get_default_features()
    if classifier:
        # self.classifier = classifier
        self.clsf_name = classifier.__class__.__name__
        self.classifier = skcalibrated.CalibratedClassifierCV(
            classifier,
            cv=clsf_constants._calibration_nfolds,
            method=clsf_constants._calibration_method)
    else:
        self.classifier = self.get_default_classifier()
def build(self, input_model, model_calibrator_id, model_calibrator_params):
    """Build a model calibrator using the specified id."""
    if model_calibrator_id == 'sklearn_CalibratedClassifierCV':
        params = model_calibrator_params
        params['base_estimator'] = input_model
        return calibration.CalibratedClassifierCV(**params)
    elif model_calibrator_id == 'sklearn_GridSearchCV':
        params = model_calibrator_params
        params['estimator'] = input_model
        return model_selection.GridSearchCV(**params)
    elif model_calibrator_id == 'sklearn_OneVsRestClassifier':
        params = model_calibrator_params
        params['estimator'] = input_model
        return multiclass.OneVsRestClassifier(**params)
    elif model_calibrator_id == 'sklearn_OneVsOneClassifier':
        params = model_calibrator_params
        params['estimator'] = input_model
        return multiclass.OneVsOneClassifier(**params)
    return None
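# A hedged illustration of what the 'sklearn_CalibratedClassifierCV' branch of
# build() above produces: the params dict is forwarded as keyword arguments once
# the input model has been injected under 'base_estimator'. The parameter values
# are illustrative; note that base_estimator was renamed to estimator in
# scikit-learn 1.2, so this matches the older API these snippets target.
from sklearn import calibration
from sklearn.linear_model import LogisticRegression

params = {'method': 'sigmoid', 'cv': 3}
params['base_estimator'] = LogisticRegression()
calibrated = calibration.CalibratedClassifierCV(**params)  # unfitted wrapper, as build() returns it
print(calibrated)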
def __init__(self, task_name="", feature_config=None, classifier=None):
    # The classifier can also live inside feature_config.
    self.task_name = task_name
    if feature_config:
        self.feature_config = feature_config
    else:
        self.feature_config = self.get_default_feature_config()
    # Not very safe!! This should make sure feature_config is assigned.
    self.feature_union = self._generate_feature_extraction_pipeline()
    if classifier:
        self.clsf_name = classifier.__class__.__name__
        self.classifier = skcalibrated.CalibratedClassifierCV(
            classifier,
            cv=CLSF_CONSTANTS._calibration_nfolds,
            method=CLSF_CONSTANTS._calibration_method)
    else:
        self.classifier = self.get_default_classifier()
def svm_classify(x, y):
    '''
    FUNC: train an SVM classifier with input data x and label y
    ARG:
        - x: input data, HOG features
        - y: label of x, face or non-face
    RET:
        - clf: an SVM classifier built with sklearn.svm. (You can use your
          favorite SVM library, but some of the later prediction code will
          need to be modified accordingly.)
    '''
    #########################################
    ##           your code here            ##
    #########################################
    clf = svm.LinearSVC(C=0.05)
    # Wrap the margin classifier so it exposes predict_proba.
    clf = calibration.CalibratedClassifierCV(clf, method='sigmoid', cv=5)
    clf.fit(x, y)
    #########################################
    ##           your code here            ##
    #########################################
    return clf
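# A minimal sketch of training and querying svm_classify above on synthetic
# "HOG-like" vectors; the feature dimension and the data are hypothetical.
import numpy as np
from sklearn import svm, calibration

rng = np.random.default_rng(0)
x = rng.normal(size=(200, 36))               # stand-in for HOG descriptors
y = (x[:, 0] > 0).astype(int)                # 1 = face, 0 = non-face
clf = svm_classify(x, y)
face_prob = clf.predict_proba(x[:5])[:, 1]   # calibrated face probabilities via the sigmoid wrapper
print(face_prob)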
def train_classifier(
    words: List[TreeNode],
    bool_result: bool,
    c: float = 100.0,
    prob: bool = False,
):
    # Extract feature data.
    X_data = csr_matrix([w.getFeatures() for w in words])
    y_data = create_result_data(words, bool_result)
    print(f"{y_data[:20]=}")
    # assert(len(X_data) == len(y_data))
    assert X_data.shape[0] == len(y_data)
    # Check the distribution of y_data.
    occurance_class, max_class, avg = dist_max_avg(y_data)
    ratio = occurance_class[max_class] / avg
    ratio_string = f"Ratio between the most occurring class and the average: {ratio}"
    if bool_result:
        report = (f"Report of the y_data distribution\n"
                  f"The different class occurrences: {occurance_class}\n{ratio_string}")
    else:
        report = (f"Report of the y_data distribution\n"
                  f"The most occurring class: {max_class} {occurance_class[max_class]}\n{ratio_string}")
    print(report)
    # Create a classifier: a support vector classifier.
    clf = svm.LinearSVC(
        C=c,
        verbose=False,
        random_state=1,
        max_iter=100000,
    )
    if prob:
        clf = calibration.CalibratedClassifierCV(clf)
    # Learn the data on the train subset.
    with parallel_backend("threading", n_jobs=-1):
        clf.fit(X_data, y_data)
    return clf, report
def train_svm(
    x_data: List,
    y_data: List,
    gamma: float = 0.001,
    c: float = 100.0,
    kernel: str = "rbf",
    cache_size: int = 1000,
    prob: bool = False,
):
    # Create a classifier: a support vector classifier.
    # Note: gamma, kernel, and cache_size are unused here because LinearSVC is linear.
    clf = svm.LinearSVC(
        C=c,
        random_state=1,
        max_iter=100000,
    )
    if prob:
        # Wrap with cross-validated calibration so the model exposes predict_proba.
        clf = calibration.CalibratedClassifierCV(clf)
    # Learn the data on the train subset.
    with parallel_backend("threading", n_jobs=-1):
        clf.fit(x_data, y_data)
    return clf
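# A minimal sketch of calling train_svm above with prob=True, so the returned
# model exposes predict_proba through the calibration wrapper. It assumes the
# module-level imports the function relies on (svm, calibration, and joblib's
# parallel_backend); the digits dataset is an illustrative stand-in.
from sklearn import datasets

digits = datasets.load_digits()
model = train_svm(digits.data.tolist(), digits.target.tolist(), prob=True)
print(model.predict_proba(digits.data[:2]).round(3))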
def svm_analysis(X_train, y_train, X_test, y_test, grid=False):
    # Perform analysis using a Support Vector Machine.
    print("Performing Support Vector Machine analysis...")
    # SVM makes predictions!
    if not grid:
        clf = svm.SVC(C=0.1, kernel="rbf", degree=2)
        clf_c = calibration.CalibratedClassifierCV(clf)
        clf_c.fit(X_train, y_train)
        score = clf_c.score(X_test.astype("float64"), y_test.astype("float64"))
        proba = clf_c.predict_proba(X_test)
        pred = clf_c.predict(X_test)
        for x in range(len(X_test)):
            print("Predicted: {}\tProbabilities: {}\tActual: {}".format(
                pred[x], proba[x], y_test[x]))
        print(score)
        print(sum(y_test) / len(y_test))
        return score
    else:
        tuned_params = [{"C": [5, 10, 100], "kernel": ["rbf"], "gamma": [0.0001]}]
        clf = model_selection.GridSearchCV(svm.SVC(), tuned_params, scoring="accuracy")
        clf.fit(X_train, y_train)
        print(clf.best_params_)
        y_pred = clf.predict(X_test)
        print(metrics.classification_report(y_test, y_pred))
def main():
    X_train = train_df.drop(columns=['home_team', 'away_team', 'year', 'home_team_won',
                                     'date', 'starting_home', 'starting_away'])
    Y_train = train_df['home_team_won']
    X_playoff_train = playoff_train_df.drop(columns=['home_team', 'away_team', 'year',
                                                     'home_team_won', 'date'])
    Y_playoff_train = playoff_train_df['home_team_won']
    X_series_train = series_train_df.drop(columns=['series_id', 'winning_team',
                                                   'losing_team', 'year', 'home_team_won'])
    Y_series_train = series_train_df['home_team_won']
    X_test = test_df.drop(columns=['home_team', 'away_team', 'year', 'home_team_won',
                                   'date', 'starting_home', 'starting_away'])
    X_series_test = series_test_df.drop(columns=['series_id', 'winning_team',
                                                 'losing_team', 'year', 'home_team_won'])

    # Standardize each feature set with a scaler fit on its own training data.
    scaler = preprocessing.StandardScaler().fit(X_train)
    columns = X_train.columns
    X_train = pd.DataFrame(scaler.transform(X_train), columns=columns)
    X_test = pd.DataFrame(scaler.transform(X_test), columns=columns)
    playoff_scaler = preprocessing.StandardScaler().fit(X_playoff_train)
    playoff_columns = X_playoff_train.columns
    X_playoff_train = pd.DataFrame(playoff_scaler.transform(X_playoff_train),
                                   columns=playoff_columns)
    series_scaler = preprocessing.StandardScaler().fit(X_series_train)
    series_columns = X_series_train.columns
    X_series_train = pd.DataFrame(series_scaler.transform(X_series_train),
                                  columns=series_columns)
    X_series_test = pd.DataFrame(series_scaler.transform(X_series_test),
                                 columns=series_columns)
    Y_test = test_df['home_team_won']
    Y_series_test = series_test_df['home_team_won']

    # SGD classifiers with prefit calibration
    sgd = linear_model.SGDClassifier(max_iter=1000, tol=None)
    clf = sgd.fit(X_train, Y_train)
    calibrator = calibration.CalibratedClassifierCV(clf, cv='prefit')
    calibrator = build_model(calibrator, X_train, Y_train, X_test, Y_test)

    playoff_sgd = linear_model.SGDClassifier(max_iter=1000, tol=None)
    playoff_clf = playoff_sgd.fit(X_train, Y_train)
    playoff_calibrator = calibration.CalibratedClassifierCV(playoff_clf, cv='prefit')
    playoff_calibrator = build_model(playoff_calibrator, X_playoff_train,
                                     Y_playoff_train, X_test, Y_test)
    # playoff_sgd = build_model(linear_model.SGDClassifier(max_iter=1000, tol=None,
    #     penalty='l2', loss='squared_hinge', learning_rate='adaptive', eta0=10,
    #     class_weight={1: 0.6, 0: 0.4}, alpha=0.01),
    #     X_playoff_train, Y_playoff_train, X_test, Y_test, True)

    series_sgd = linear_model.SGDClassifier(max_iter=1000, tol=None, penalty='l2')
    series_clf = series_sgd.fit(X_series_train, Y_series_train)
    series_calibrator = calibration.CalibratedClassifierCV(series_clf, cv='prefit')
    series_calibrator = build_model(series_calibrator, X_series_train, Y_series_train,
                                    X_series_test, Y_series_test)
    # series_sgd = build_model(linear_model.SGDClassifier(max_iter=1000, tol=None,
    #     penalty='l2', loss='perceptron', learning_rate='constant', eta0=1,
    #     class_weight={1: 0.5, 0: 0.5}, alpha=10),
    #     X_series_train, Y_series_train, X_series_test, Y_series_test, True)

    # Random Forest
    random_forest = build_model(
        RandomForestClassifier(n_estimators=50, min_samples_split=4, max_features='log2',
                               criterion='entropy', class_weight='balanced',
                               ccp_alpha=.001),
        X_train, Y_train, X_test, Y_test)
    Y_prob_pred = random_forest.predict_proba(X_test)
    probability_evaluation(Y_test, Y_prob_pred, test_df)
    playoff_random_forest = build_model(
        RandomForestClassifier(n_estimators=50, min_samples_split=4, max_features='log2',
                               criterion='entropy', class_weight='balanced',
                               ccp_alpha=.001),
        X_playoff_train, Y_playoff_train, X_test, Y_test, True)
    series_random_forest = build_model(
        RandomForestClassifier(n_estimators=50, min_samples_split=4, max_features='log2',
                               criterion='entropy', class_weight='balanced',
                               ccp_alpha=.001),
        X_series_train, Y_series_train, X_series_test, Y_series_test, True)

    # Logistic Regression
    log = build_model(
        LogisticRegression(max_iter=10000, tol=0.001, solver='liblinear', penalty='l1',
                           multi_class='ovr', class_weight={1: 0.5, 0: 0.5}, C=0.1),
        X_train, Y_train, X_test, Y_test)
    playoff_log = build_model(
        LogisticRegression(max_iter=10000, tol=0.001, solver='liblinear', penalty='l1',
                           multi_class='ovr', class_weight={1: 0.5, 0: 0.5}, C=0.1),
        X_playoff_train, Y_playoff_train, X_test, Y_test)
    series_log = build_model(
        LogisticRegression(max_iter=10000, tol=0.001, solver='liblinear', penalty='l1',
                           multi_class='ovr', class_weight={1: 0.5, 0: 0.5}, C=0.1),
        X_series_train, Y_series_train, X_series_test, Y_series_test)

    # KNN
    # build_model(KNeighborsClassifier(n_neighbors=3), X_train, Y_train, X_test)

    # Gaussian Naive Bayes
    gaussian = GaussianNB()
    build_model(gaussian, X_train, Y_train, X_test, Y_test)
    cross_val_score(gaussian, X_train, Y_train, cv=5, scoring='accuracy')
    playoff_gaussian = GaussianNB()
    build_model(playoff_gaussian, X_playoff_train, Y_playoff_train, X_test, Y_test)
    cross_val_score(playoff_gaussian, X_train, Y_train, cv=5, scoring='accuracy')
    series_gaussian = GaussianNB()
    build_model(series_gaussian, X_series_train, Y_series_train, X_series_test,
                Y_series_test)
    cross_val_score(series_gaussian, X_series_train, Y_series_train, cv=5,
                    scoring='accuracy')

    # Perceptron
    perceptron = build_model(
        Perceptron(max_iter=10000, penalty='l2', eta0=10,
                   class_weight={1: 0.6, 0: 0.4}, alpha=0.0001),
        X_train, Y_train, X_test, Y_test)
    playoff_perceptron = build_model(
        Perceptron(max_iter=10000, penalty='l2', eta0=10,
                   class_weight={1: 0.6, 0: 0.4}, alpha=0.0001),
        X_playoff_train, Y_playoff_train, X_test, Y_test, True)
    series_perceptron = build_model(
        Perceptron(max_iter=10000, penalty='l2', eta0=1,
                   class_weight={1: 0.4, 0: 0.6}, alpha=10),
        X_series_train, Y_series_train, X_series_test, Y_series_test, True)

    # Decision Tree
    d_tree = build_model(
        DecisionTreeClassifier(splitter='best', min_samples_split=4, max_features='log2',
                               criterion='entropy', class_weight={1: 0.5, 0: 0.5},
                               ccp_alpha=0.0001),
        X_train, Y_train, X_test, Y_test)
    playoff_d_tree = build_model(
        DecisionTreeClassifier(splitter='best', min_samples_split=4, max_features='log2',
                               criterion='entropy', class_weight={1: 0.5, 0: 0.5},
                               ccp_alpha=0.0001),
        X_playoff_train, Y_playoff_train, X_test, Y_test, True)
    series_d_tree = build_model(
        DecisionTreeClassifier(splitter='best', min_samples_split=3, max_features='log2',
                               criterion='entropy', class_weight={1: 0.5, 0: 0.5},
                               ccp_alpha=0.0001),
        X_series_train, Y_series_train, X_series_test, Y_series_test, True)

    # Ensembles
    eclf1 = VotingClassifier(estimators=[('sgd', sgd), ('rf', random_forest),
                                         ('gnb', gaussian), ('dtree', d_tree)],
                             voting='hard')
    eclf2 = VotingClassifier(estimators=[('rf', random_forest), ('gnb', gaussian),
                                         ('dtree', d_tree)],
                             voting='soft')
    eclf = EnsembleClassifier(clfs=[random_forest, gaussian, d_tree])
    build_model(eclf1, X_train, Y_train, X_test, Y_test)
    build_model(eclf2, X_train, Y_train, X_test, Y_test)
    eclf = build_model(eclf, X_train, Y_train, X_test, Y_test)
    Y_prob_pred = eclf.predict_proba(X_train)
    probability_evaluation(Y_train, Y_prob_pred, train_df)
def __init__(self, models=None, params=None, calibrator=None, run_calibration=None,
             average_proba=True, labels=None, good_bands=None, reducer=None):
    """Creates an object to build the CCB-ID models. Should approximate the
    functionality of the sklearn classifier modules, though not perfectly.

    Args:
        models          - a list containing the sklearn models for classification
                          (defaults to gradient boosting and random forest classifiers)
        params          - a list of parameter values used for each model; a list of
                          length n_models, each item a dictionary of model-specific
                          parameters
        calibrator      - an sklearn CalibratedClassifierCV object (or other
                          calibration object)
        run_calibration - a boolean array with True for models you want to calibrate
                          and False for models that do not require calibration
        average_proba   - flag to report the output probabilities as the average
                          across models
        labels          - the species labels for each class
        good_bands      - a boolean array of good band values to store (not used by
                          this object)
        reducer         - the data reducer/transformer to apply to input data

    Returns:
        a CCB-ID model object with totally cool functions and attributes.
    """
    # set the base attributes for the model object
    if models is None:
        gbc = _ensemble.GradientBoostingClassifier()
        rfc = _ensemble.RandomForestClassifier()
        self.models_ = [gbc, rfc]
    else:
        # if a single model is passed, wrap it in a list so it is iterable
        if type(models) is not list:
            models = [models]
        self.models_ = models

    # set an attribute with the number of models
    self.n_models_ = len(self.models_)

    # set the model parameters if specified
    if params is not None:
        for i in range(self.n_models_):
            self.models_[i].set_params(**params[i])

    # set the model calibration function
    if calibrator is None:
        self.calibrator = _calibration.CalibratedClassifierCV(method='sigmoid', cv=3)
    else:
        self.calibrator = calibrator

    # set the attribute determining whether to perform calibration on a per-model basis
    if run_calibration is None:
        self.run_calibration_ = _np.repeat(True, self.n_models_)
    else:
        self.run_calibration_ = run_calibration

    # set an attribute to hold the final calibrated models
    self.calibrated_models_ = _np.repeat(None, self.n_models_)

    # set the flag to average the probability outputs
    self.average_proba_ = average_proba

    # and set some properties that will be referenced later,
    # like species labels and a list of good bands
    self.labels_ = labels
    self.good_bands_ = good_bands
    self.reducer = reducer
    self.n_features_ = None
def fit(self, X, y):
    # keep 5% for calibration later
    sss = cross_validation.StratifiedShuffleSplit(y, test_size=0.05)
    for tr, cal in sss:
        break

    # define the three classifiers
    self.clf1 = xgb.XGBClassifier(objective="multi:softprob", n_estimators=400, max_depth=8)
    self.clf2 = calibration.CalibratedClassifierCV(
        ensemble.RandomForestClassifier(n_estimators=1000, n_jobs=8, class_weight='auto'),
        method='isotonic')
    self.clf3 = NNEnsemble()

    # fit the classifiers
    self.clf1.fit(X.iloc[tr], y[tr])
    self.clf2.fit(X.iloc[tr], y[tr])
    self.clf3.fit(X.iloc[tr], y[tr])

    # predict everything before ensembling
    self.pr1 = self.clf1.predict_proba(X.iloc[cal])
    self.pr2 = self.clf2.predict_proba(X.iloc[cal])
    self.pr3 = self.clf3.predict_proba(X.iloc[cal])
    self.pr1 = preprocessing.normalize(self.pr1, axis=1, norm='l1')
    self.pr2 = preprocessing.normalize(self.pr2, axis=1, norm='l1')
    self.pr3 = preprocessing.normalize(self.pr3, axis=1, norm='l1')
    print("XGB log loss:", metrics.log_loss(y[cal], self.pr1))
    print("RF log loss:", metrics.log_loss(y[cal], self.pr2))
    print("NN log loss:", metrics.log_loss(y[cal], self.pr3))
    print("XGB+RF+NN log loss:",
          metrics.log_loss(y[cal], (self.pr1 + self.pr2 + self.pr3) / 3))

    self.clfs = [self.clf1, self.clf2, self.clf3]
    predictions = []
    for clf in self.clfs:
        predictions.append(clf.predict_proba(X.iloc[cal]))
    self.cal_y = y[cal]

    def log_loss_func(weights):
        '''scipy minimize will pass the weights as a numpy array'''
        final_prediction = 0
        for weight, prediction in zip(weights, predictions):
            final_prediction += weight * prediction
        return metrics.log_loss(self.cal_y, final_prediction)

    # search for the ensemble weights that minimize log loss,
    # restarting from several random initializations
    scores = []
    wghts = []
    for i in range(20):
        if not i:
            starting_values = [1 / 3] * len(self.clfs)
        else:
            starting_values = np.random.uniform(size=len(self.clfs))
        cons = ({'type': 'eq', 'fun': lambda w: 1 - sum(w)})
        bounds = [(0, 1)] * len(predictions)
        res = scopt.minimize(log_loss_func, starting_values, method='SLSQP',
                             bounds=bounds, constraints=cons)
        scores.append(res['fun'])
        wghts.append(res['x'])
    bestSC = np.min(scores)
    bestWght = wghts[np.argmin(scores)]
    self.weights = bestWght
    print('Ensemble Score: {best_score}'.format(best_score=bestSC))
    print('Best Weights: {weights}'.format(weights=bestWght))
def __init__(self, n_estimators=300, min_iterations=10, gll_early_stop_threshold=None,
             max_iterations=20, rf_params=None, calibrator=None, run_calibration=None,
             average_proba=True, labels=None, good_bands=None, reducer=None):
    self.min_iterations = min_iterations
    self.gll_early_stop_threshold = gll_early_stop_threshold
    self.max_iterations = max_iterations
    self.rf_params = {"n_estimators": n_estimators} if rf_params is None else rf_params
    self.rf_params.update({"oob_score": True, "n_jobs": -1})
    if "n_estimators" not in self.rf_params:
        self.rf_params.update({"n_estimators": n_estimators})

    # initialize parameters from CCB-ID
    # set an attribute with the number of models
    self.n_models_ = 1
    # set the model calibration function
    if calibrator is None:
        self.calibrator = _calibration.CalibratedClassifierCV(method='sigmoid', cv=3)
    else:
        self.calibrator = calibrator
    # set the attribute determining whether to perform calibration on a per-model basis
    if run_calibration is None:
        self.run_calibration_ = np.repeat(True, self.n_models_)
    else:
        self.run_calibration_ = run_calibration
    # set an attribute to hold the final calibrated models
    self.calibrated_models_ = np.repeat(None, self.n_models_)
    # set the flag to average the probability outputs
    self.average_proba_ = average_proba
    # and set some properties that will be referenced later,
    # like species labels and a list of good bands
    self.labels_ = labels
    self.good_bands_ = good_bands
    self.reducer = reducer
    self.n_features_ = None

    # MERF-specific arguments
    self.cluster_counts = None
    self.trained_rf = None
    self.trained_b = None
    self.b_hat_history = []
    self.sigma2_hat_history = []
    self.D_hat_history = []
    self.gll_history = []
# Test baseline models
for (model, name, params, pred_fn) in best_models:
    print("Running CV for: {}".format(name))
    start_time = datetime.now()
    constructed_model = model(**params)
    cv_loss = model_cv_test(constructed_model, X_train.values, y_train.values,
                            pred_fn=pred_fn, n_fold=5)
    record.append(record_cv_loss(name, cv_loss))
    end_time = datetime.now()
    print("Time taken: {}".format(str(end_time - start_time)))

# Test calibrated models
for (base_model, name, params, pred_fn) in best_models:
    name = name + "_cali"
    print("Running CV for: {}".format(name))
    start_time = datetime.now()
    constructed_model = calibration.CalibratedClassifierCV(base_model(**params))
    cv_loss = model_cv_test(constructed_model, X_train.values, y_train.values,
                            pred_fn=pred_fn, n_fold=5)
    record.append(record_cv_loss(name, cv_loss))
    end_time = datetime.now()
    print("Time taken: {}".format(str(end_time - start_time)))

record = pd.concat(record)
record.to_csv(args.logdir, index=False)
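# A hedged illustration of the best_models structure the two loops above expect:
# (constructor, name, params dict, prediction-function tag). The entries and the
# pred_fn values here are hypothetical; model_cv_test and record_cv_loss are
# assumed to be defined elsewhere in this script.
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

best_models = [
    (LogisticRegression, "logreg", {"C": 1.0, "max_iter": 1000}, "predict_proba"),
    (RandomForestClassifier, "rf", {"n_estimators": 200}, "predict_proba"),
]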
#!/usr/bin/python
#
# Runs various statistics against a data bundle
#
#######################################################
import numpy as np

from sklearn import calibration

# With no estimator supplied, CalibratedClassifierCV falls back to calibrating
# a LinearSVC by default.
ccv_bad = calibration.CalibratedClassifierCV()
ccv_good = calibration.CalibratedClassifierCV()
def svmTrain(i):
    # seg_ratio is the ratio between labeled samples and unlabeled samples
    seg_ratio = 1
    MaxNumPerClassPerIteration = 10
    # counters for test evaluation
    correct = 0
    total = 0
    wrong = 0
    correctPerClass = [0 for k in range(21)]
    wrongPerClass = [0 for k in range(21)]
    numPerClass = [0 for k in range(21)]
    addedsamplenum = 0

    # construct the caffe nets
    # caffe.set_mode_gpu()
    mean = np.load(os.path.join(
        caffe_path, 'python/caffe/imagenet/ilsvrc_2012_mean.npy')).mean(1).mean(1)
    net = caffe.Classifier(SVM_deployPath, caffeModelPath, mean=mean,
                           channel_swap=(2, 1, 0), raw_scale=255, image_dims=(256, 256))
    net2 = caffe.Classifier(SVM_deployPath2, caffeModelPath2, mean=mean,
                            channel_swap=(2, 1, 0), raw_scale=255, image_dims=(256, 256))

    X = []
    y = []
    samples = []
    with open("csvfold/Train_" + str(i) + ".csv") as csvFile:
        csvReader = csv.reader(csvFile, delimiter=' ')
        for row in csvReader:
            samples.append(row[0])
            y.append(row[1])

    # divide the dataset into labeled and unlabeled sections according to seg_ratio
    labeled_sample, labeled_y = samples[:210 * seg_ratio], y[:210 * seg_ratio]
    unlabeled_sample, unlabeled_y = samples[210 * seg_ratio:], y[210 * seg_ratio:]

    # EL_samples / EL_y accumulate the self-training results
    EL_samples, EL_y = [], []
    j = 1
    train_sample, train_y = labeled_sample, labeled_y
    # train_X holds the feature vectors from CNN1 and train_X2 those from CNN2,
    # so clf2 is the classifier for CNN2
    train_X, train_X2 = [], []
    lowConfidence_sample, lowConfidence_y = [], []
    for k in train_sample:
        prediction = classify(net, [k])
        train_X.append(prediction[0])
        prediction = classify(net2, [k])
        train_X2.append(prediction[0])

    # train the SVMs
    lenofvector = len(train_X[0])
    lenofvector2 = len(train_X2[0])
    clf = calibration.CalibratedClassifierCV(svm.LinearSVC(C=100000))
    clf.fit(train_X, train_y)
    clf2 = calibration.CalibratedClassifierCV(svm.LinearSVC(C=100000))
    clf2.fit(train_X2, train_y)
    label_order = clf.classes_.tolist()
    claName = []
    for k in label_order:
        for o in labels:
            if labels[o] == int(k):
                claName.append(o)
    print("Label order is", label_order)
    print("Class name order is", claName)

    # test the performance without learning from unlabeled data
    test_pred, test_label = [], []
    with open("csvfold/Test_" + str(i) + ".csv") as csvFile:
        csvReader = csv.reader(csvFile, delimiter=' ')
        for row in csvReader:
            features = classify(net, [row[0]])
            features2 = classify(net2, [row[0]])
            prediction = clf.predict(np.array(features[0]).reshape(1, lenofvector))
            prediction2 = clf2.predict(np.array(features2[0]).reshape(1, lenofvector2))
            proba_list = clf.predict_proba(np.array(features[0]).reshape(1, lenofvector))
            proba_list2 = clf2.predict_proba(np.array(features2[0]).reshape(1, lenofvector2))
            proba = proba_list[0][int(label_order.index(prediction))]
            proba2 = proba_list2[0][int(label_order.index(prediction2))]
            # decide the final result: prefer agreement, otherwise the more confident net
            if prediction != prediction2 and proba < proba2:
                prediction = prediction2
            if prediction == row[1]:
                correct += 1
            else:
                wrong += 1
            total += 1
            # bookkeeping for the confusion matrix
            test_pred.append(prediction)
            test_label.append(row[1])
    print("TOTAL: " + str(total))
    print("CORRECT: " + str(correct))
    print("WRONG: " + str(wrong))
    output = "the iteration round order is " + str(j) + "\n"
    output += "the accuracy ratio is " + str(float(correct) / float(total) * 100) + "\n\n"
open("results.txt", "a").write(output + "\n") # paint the confusion matrix graph plot_save_graph(test_label, test_pred, 0, claName) while True: # print( "the value of j is : " , str(j)) addedsamplePerClass = [0 for k in range(21)] batch_sample, batch_y = unlabeled_sample[210 * (j - 1):210 * j], unlabeled_y[210 * (j - 1):210 * j] # the order is the label order learnt in svm # feature is the output vector of CNN # use svm to predict the unlabeled and save the resutl into EL_sample and EL_y # for k in range(210): features = classify(net, [batch_sample[k]]) features2 = classify(net2, [batch_sample[k]]) prediction = clf.predict( np.array(features[0]).reshape(1, lenofvector)) prediction2 = clf2.predict( np.array(features2[0]).reshape(1, lenofvector2)) proba_list = clf.predict_proba( np.array(features[0]).reshape(1, lenofvector)) proba_list2 = clf2.predict_proba( np.array(features2[0]).reshape(1, lenofvector2)) proba = proba_list[0][int(label_order.index(prediction))] proba2 = proba_list2[0][int(label_order.index(prediction2))] # classPointer save the value of the lable of prediciton # print("the accuracy of ", batch_sample[k], " is ", str(proba*100),"%") # if prediction[0] == prediction2[0] and (proba >= 0.3 or proba2 >= 0.3): classPointer = int(prediction[0]) if addedsamplePerClass[ classPointer] < MaxNumPerClassPerIteration: addedsamplePerClass[classPointer] += 1 EL_samples.append(batch_sample[k]) EL_y.append(str(prediction[0])) addedsamplenum += 1 else: lowConfidence_sample.append(batch_sample[k]) lowConfidence_y.append(str(prediction[0])) # #train the clf train_sample, train_X, train_X2, train_y = [], [], [], [] train_sample, train_y = labeled_sample + EL_samples, labeled_y + EL_y print("the len of train_sample is :", str(len(train_sample)), "the number of added samples is :", str(addedsamplenum)) for k in range(len(train_sample)): features = classify(net, [train_sample[k]]) train_X.append(features[0]) features2 = classify(net2, [train_sample[k]]) train_X2.append(features2[0]) # pdb.set_trace() clf = calibration.CalibratedClassifierCV(svm.LinearSVC(C=100000)) clf.fit(train_X, train_y) clf2 = calibration.CalibratedClassifierCV(svm.LinearSVC(C=100000)) clf2.fit(train_X2, train_y) correct = 0 total = 0 wrong = 0 test_pred, test_label = [], [] with open("csvfold/Test_" + str(i) + ".csv", "rb") as csvFile: csvReader = csv.reader(csvFile, delimiter=' ') for row in csvReader: features = classify(net, [row[0]]) features2 = classify(net2, [row[0]]) # the test classifier is not determined prediction = clf.predict( np.array(features[0]).reshape(1, lenofvector)) prediction2 = clf2.predict( np.array(features2[0].reshape(1, lenofvector2))) proba_list = clf.predict_proba( np.array(features[0]).reshape(1, lenofvector)) proba_list2 = clf2.predict_proba( np.array(features2[0]).reshape(1, lenofvector2)) proba = proba_list[0][int(label_order.index(prediction))] proba2 = proba_list2[0][int(label_order.index(prediction2))] #decide the terminal result if prediction == prediction2: prediction = prediction else: if proba >= proba2: prediction = prediction else: prediction = prediction2 if prediction == row[1]: correct += 1 else: wrong += 1 total += 1 test_pred.append(prediction) test_label.append(row[1]) print("TOTAL: " + str(total)) print("CORRECT: " + str(correct)) print("WRONG: " + str(wrong)) output = "the iteration round order is" + str(j) + "\n" output += "the accuracy ratio is " + str( float(correct) / float(total) * 100) + "\n\n" open("results.txt", "a").write(output + "\n") plot_save_graph(test_label, 
test_pred, j, claName) j += 1 if j > 8: break print("train the low confidence samples") #count save the last time value of addedsamplenum count = 0 # iteration learning the lowconfidence samples while count != addedsamplenum: j = j + 1 lowConfidenceMid_sample = [] count = addedsamplenum for k in range(len(lowConfidence_sample)): sample = lowConfidence_sample[k] features = classify(net, [sample]) features2 = classify(net2, [sample]) prediction = clf.predict( np.array(features[0]).reshape(1, lenofvector)) prediction2 = clf2.predict( np.array(features2[0]).reshape(1, lenofvector2)) proba_list = clf.predict_proba( np.array(features[0]).reshape(1, lenofvector)) proba_list2 = clf2.predict_proba( np.array(features2[0]).reshape(1, lenofvector2)) proba = proba_list[0][int(label_order.index(prediction))] proba2 = proba_list2[0][int(label_order.index(prediction2))] # classPointer save the value of the lable of prediciton # print("the accuracy of ", batch_sample[k], " is ", str(proba*100),"%") if prediction[0] == prediction2[0] and (proba >= 0.3 or proba2 >= 0.3): EL_samples.append(sample) EL_y.append(str(prediction[0])) addedsamplenum += 1 else: lowConfidenceMid_sample.append(sample) train_sample, train_X, train_y, train_X2 = [], [], [], [] train_sample, train_y = labeled_sample + EL_samples, labeled_y + EL_y print("the len of train_sample is :", str(len(train_sample)), "the number of added samples is :", str(addedsamplenum)) for k in range(len(train_sample)): features = classify(net, [train_sample[k]]) train_X.append(features[0]) features2 = classify(net2, [train_sample[k]]) train_X2.append(features2[0]) # pdb.set_trace() clf = calibration.CalibratedClassifierCV(svm.LinearSVC(C=100000)) clf.fit(train_X, train_y) clf2 = calibration.CalibratedClassifierCV(svm.LinearSVC(C=100000)) clf2.fit(train_X2, train_y) correct = 0 total = 0 wrong = 0 test_label, test_pred = [], [] with open("csvfold/Test_" + str(i) + ".csv", "rb") as csvFile: csvReader = csv.reader(csvFile, delimiter=' ') for row in csvReader: features = classify(net, [row[0]]) features2 = classify(net2, [row[0]]) # the test classifier is not determined prediction = clf.predict( np.array(features[0]).reshape(1, lenofvector)) prediction2 = clf2.predict( np.array(features2[0].reshape(1, lenofvector2))) proba_list = clf.predict_proba( np.array(features[0]).reshape(1, lenofvector)) proba_list2 = clf2.predict_proba( np.array(features2[0]).reshape(1, lenofvector2)) proba = proba_list[0][int(label_order.index(prediction))] proba2 = proba_list2[0][int(label_order.index(prediction2))] #decide the terminal result if prediction == prediction2: prediction = prediction else: if proba >= proba2: prediction = prediction else: prediction = prediction2 if prediction == row[1]: correct += 1 else: wrong += 1 total += 1 test_pred.append(prediction) test_label.append(row[1]) print("TOTAL: " + str(total)) print("CORRECT: " + str(correct)) print("WRONG: " + str(wrong)) output = "the iteration round order is" + str(j) + "\n" output += "the accuracy ratio is " + str( float(correct) / float(total) * 100) + "\n\n" open("results.txt", "a").write(output + "\n") plot_save_graph(test_label, test_pred, j, claName) lowConfidence_sample = [] lowConfidence_sample = lowConfidenceMid_sample
# -*- coding: utf-8 -*-
"""
Created on Mon May 21 11:57:51 2018

@author: Sven

Simple decision tree classifier example for learning python.
"""
from sklearn import calibration

import sex_data as sd

data = sd.create_data()
# Note: despite the docstring, this calibrates the default base estimator
# (a LinearSVC), not a decision tree.
clf = calibration.CalibratedClassifierCV()
clf = clf.fit(data.loc[:, ['height', 'weight', 'shoe_size']], data.loc[:, 'sex'])
prediction = clf.predict([[190, 70, 43]])
print(prediction)
def _train_svm(feats, labels, prim_id, ex_size, num_ex):
    logger.info("Training Primitive {}.".format(prim_id))
    # split exemplars
    pos_img_ids = np.where(labels)[0]
    pos_img_splits = [pos_img_ids] if num_ex == 1 else [pos_img_ids] + [
        np.random.choice(pos_img_ids, size=min(ex_size, pos_img_ids.size), replace=False)
        for _ in range(num_ex)
    ]
    logger.info("Primitive {} has {} exemplars.".format(prim_id, len(pos_img_splits)))
    svms, clbs = [], []
    for ex_id, pos_ex_ids in enumerate(pos_img_splits):
        if len(pos_ex_ids) > 0:
            logger.info("Primitive {} training exemplar {} ...".format(prim_id, ex_id))
            svm_object = sklearn_svm.LinearSVC(C=1e-3, class_weight={1: 2, -1: 1.0},
                                               verbose=0, penalty='l2', loss='hinge',
                                               dual=True)
            neg_ex_ids = np.array([idx for idx in range(labels.size)
                                   if idx not in pos_ex_ids])
            X = np.vstack([feats[pos_ex_ids], feats[neg_ex_ids]])
            Y = np.hstack([np.ones(pos_ex_ids.size), -1.0 * np.ones(neg_ex_ids.size)])
            svm_object.fit(X, Y)
            train_acc = svm_object.score(X, Y)
            svms.append(svm_object)
            logger.info("SVM (Primitive {} exemplar {}) has {} positives, {} negatives "
                        "and accuracy {}.".format(prim_id, ex_id, pos_ex_ids.size,
                                                  neg_ex_ids.size, train_acc))
            if ex_id == 0:
                # Hold out 10% of positives and negatives to calibrate a prefit SVM.
                svm_object_clb = sklearn_svm.LinearSVC(C=1e-3,
                                                       class_weight={1: 2, -1: 1.0},
                                                       verbose=0, penalty='l2',
                                                       loss='hinge', dual=True)
                np.random.shuffle(pos_ex_ids)
                np.random.shuffle(neg_ex_ids)
                pos_split_point = int(np.ceil(0.9 * len(pos_ex_ids)))
                cls_pos_idx, calib_pos_idx = (pos_ex_ids[:pos_split_point],
                                              pos_ex_ids[pos_split_point:])
                neg_split_point = int(np.ceil(0.9 * len(neg_ex_ids)))
                cls_neg_idx, calib_neg_idx = (neg_ex_ids[:neg_split_point],
                                              neg_ex_ids[neg_split_point:])
                X = np.vstack([feats[cls_pos_idx], feats[cls_neg_idx]])
                Y = np.hstack([np.ones(cls_pos_idx.size),
                               -1.0 * np.ones(cls_neg_idx.size)])
                svm_object_clb.fit(X, Y)
                clb_object = sklearn_clb.CalibratedClassifierCV(svm_object_clb, cv='prefit')
                X = np.vstack([feats[calib_pos_idx], feats[calib_neg_idx]])
                Y = np.hstack([np.ones(calib_pos_idx.size),
                               -1.0 * np.ones(calib_neg_idx.size)])
                clb_object.fit(X, Y)
                clbs.append(clb_object)
                clb_object.score(X, Y)
                logger.info("Calibrated SVM (Primitive {} exemplar {}) has {} positives, "
                            "{} negatives and accuracy {}.".format(
                                prim_id, ex_id, pos_ex_ids.size, neg_ex_ids.size,
                                train_acc))
    return svms, clbs
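# A standalone sketch of the prefit-calibration pattern used above: fit the
# margin classifier on one split, then fit CalibratedClassifierCV with
# cv='prefit' on a held-out split (cv='prefit' is available on the scikit-learn
# versions these snippets target). The data, sizes, and 90/10 split are
# hypothetical.
import numpy as np
from sklearn import svm as sklearn_svm, calibration as sklearn_clb

rng = np.random.default_rng(0)
X = rng.normal(size=(300, 16))
Y = np.where(X[:, 0] > 0, 1, -1)
fit_idx, cal_idx = np.arange(270), np.arange(270, 300)
base = sklearn_svm.LinearSVC(C=1e-3, dual=True).fit(X[fit_idx], Y[fit_idx])
calibrated = sklearn_clb.CalibratedClassifierCV(base, cv='prefit')
calibrated.fit(X[cal_idx], Y[cal_idx])   # sigmoid calibration on held-out data only
print(calibrated.predict_proba(X[:3]))   # columns follow calibrated.classes_ == [-1, 1]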
def predict_kfold_ML(data, label, features, cv_type, clf, calibration, seed, cvfolds):
    X = data.loc[:, features]
    Y = data.loc[:, [label]].astype(bool)
    if cv_type == 'stratifiedkfold':
        skf = sk_ms.StratifiedKFold(cvfolds, random_state=seed, shuffle=True)
    elif cv_type == 'kfold':
        skf = sk_ms.KFold(cvfolds, random_state=seed, shuffle=True)
    else:
        raise ValueError('incompatible cross-validation type')
    predicted_probability = []
    true_label = []
    for train_index, test_index in skf.split(X, Y):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        Y_train, Y_test = Y.iloc[train_index], Y.iloc[test_index]
        if calibration is None:
            clf.fit(X_train, Y_train.values.ravel().astype(int))
            calibrated_clf = clf
        else:
            if calibration not in ('isotonic', 'sigmoid'):
                raise ValueError('Unknown calibration type')
            if hasattr(clf, 'best_estimator_'):
                # Calibrate the best estimator found by the search.
                clf.fit(X_train, Y_train.values.ravel().astype(int))
                calibrated_clf = sk_cal.CalibratedClassifierCV(
                    clf.best_estimator_, method=calibration, cv=10)
            else:
                calibrated_clf = sk_cal.CalibratedClassifierCV(
                    clf, method=calibration, cv=10)
            calibrated_clf.fit(X_train, Y_train.values.ravel().astype(int))
        # Prefer calibrated probabilities; fall back to decision scores.
        try:
            Y_prob = calibrated_clf.predict_proba(X_test)
            predicted_probability.append(Y_prob[:, 1])
        except AttributeError:
            Y_prob = calibrated_clf.decision_function(X_test)
            predicted_probability.append(Y_prob)
        true_label.append(list(Y_test.values.flat))
    tl_pp_dict = {"true_label": true_label, "pred_prob": predicted_probability}
    return tl_pp_dict
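# A minimal sketch of calling predict_kfold_ML above, assuming the aliases it
# uses (sk_ms for sklearn.model_selection, sk_cal for sklearn.calibration). The
# DataFrame, column names, and classifier are hypothetical.
import numpy as np
import pandas as pd
from sklearn import model_selection as sk_ms, calibration as sk_cal
from sklearn.ensemble import RandomForestClassifier

rng = np.random.default_rng(1)
df = pd.DataFrame({"f1": rng.normal(size=300), "f2": rng.normal(size=300)})
df["outcome"] = (df["f1"] + rng.normal(scale=0.5, size=300)) > 0
out = predict_kfold_ML(df, "outcome", ["f1", "f2"], "stratifiedkfold",
                       RandomForestClassifier(random_state=0),
                       calibration="sigmoid", seed=7, cvfolds=5)
print(len(out["pred_prob"]))  # calibrated positive-class probabilities, one array per fold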
    X_test, y_test, test_size=0.82, random_state=1, stratify=y_test)

print("X_train\t\t%sx%s" % (X_train.shape))
print("X_test\t\t%sx%s" % (X_test.shape))
print("X_explain\t%sx%s" % (X_explain.shape))

# Create an ensemble blackbox classifier and predict the test and explain sets
clf_svm = svm.SVC(probability=True, kernel="linear", random_state=1)
clf_svm.fit(X_train, y_train)
svm_preds = clf_svm.predict_proba(X_test)
svm_explanations = clf_svm.predict_proba(X_explain)

clf_base = ensemble.ExtraTreesClassifier(n_jobs=-1, n_estimators=1000, random_state=1)
clf_cet = calibration.CalibratedClassifierCV(base_estimator=clf_base)
clf_cet.fit(X_train, y_train)
et_preds = clf_cet.predict_proba(X_test)
et_explanations = clf_cet.predict_proba(X_explain)

# Average the two probabilistic models into the blackbox ensemble
blackbox_preds = (et_preds + svm_preds) / 2.
blackbox_explanations = (et_explanations + svm_explanations) / 2.

print("\n\nSupport Vector Machine with linear kernel")
print("Accuracy Score:\t%f" % accuracy_score(y_test, np.argmax(svm_preds, axis=1)))
print("Multi-Log loss:\t%f" % log_loss(y_test, svm_preds))
print("\n\nCalibrated Extremely Randomized Trees")
print("Accuracy Score:\t%f" % accuracy_score(y_test, np.argmax(et_preds, axis=1)))
elif args.dataset == CKPLUS.name:
    get_data_ckplus(clahe, detector, predictor, selected_labels, SAVE_IMAGES)

if train:
    # for C in [1.5*1e-3, 3e-3, 4.5*1e-3, 6*1e-3, 7.5*1e-3, 9e-3]:
    print("building model...")
    # clf = svm.LinearSVC(C=0.01, random_state=0, tol=1e-4, dual=False)
    # CK+: C=0.1 or 0.01; fer2013: C=1e-3
    # clf = calibration.CalibratedClassifierCV(clf, method='sigmoid', cv=5)
    if args.dataset == FER2013.name:
        OUTPUT_FOLDER_NAME = FER2013.name
        clf = svm.LinearSVC(C=0.001, random_state=0, tol=1e-4, dual=False)
        clf = calibration.CalibratedClassifierCV(clf, method='sigmoid', cv=5)
        with open(OUTPUT_FOLDER_NAME + '/Training/landmarks_feats.pkl', 'rb') as f:
            feats_data = pickle.load(f)
        with open(OUTPUT_FOLDER_NAME + '/Training/hog_feats.pkl', 'rb') as f:
            hog_feats = pickle.load(f)
        with open(OUTPUT_FOLDER_NAME + '/Training/labels.pkl', 'rb') as f:
            labels = pickle.load(f)
        feats_data = np.concatenate([feats_data, hog_feats], axis=1)
        with open(OUTPUT_FOLDER_NAME + '/PrivateTest/landmarks_feats.pkl', 'rb') as f:
            feats_data2 = pickle.load(f)
        with open(OUTPUT_FOLDER_NAME + '/PrivateTest/hog_feats.pkl',
                  proj_mask=args.proj_mask,
                  online_learn=args.online_learn,
                  svm_model=args.svm_model,
                  epochs=args.epochs)
else:
    logger.info('Using SVM algo: SVC.')
    clf = svc_fit(train=(X_train, y_train),
                  proj_mask=args.proj_mask,
                  epochs=args.epochs)

# Generate feature vectors.
X_val_fv = common.process_samples(X_val, proj_mask=proj_mask)
X_test_fv = common.process_samples(X_test, proj_mask=proj_mask)

logger.info('Calibrating classifier.')
cal_clf = calibration.CalibratedClassifierCV(base_estimator=clf, cv='prefit')
cal_clf.fit(X_val_fv, y_val)

logger.info('Evaluating final classifier on test set.')
evaluate_model(cal_clf, X_test_fv, y_test, class_names, args.svm_cm)

logger.info(f'Saving svm model to: {args.svm_model}.')
with open(args.svm_model, 'wb') as outfile:
    outfile.write(pickle.dumps(cal_clf))

# Do not overwrite the label encoder if online learning was performed.
if not args.online_learn or args.use_svc:
    logger.info(f'Saving label encoder to: {args.label_encoder}.')
    with open(args.label_encoder, 'wb') as outfile:
        outfile.write(pickle.dumps(le))
def train_main(self):
    data = pd.DataFrame()
    model_dict = dict()
    train_data_path = self.train_data_path
    for i in train_data_path:
        data_tmp = pd.read_excel(i, header=0)
        data_tmp.columns = ["pid", "label", "context"]
        data = pd.concat([data, data_tmp])
    data = shuffle(data)
    data["context_ngram"] = data[["context"]].applymap(ngram_process)
    context = data["context_ngram"].values
    label = data[["label"]].applymap(fun_map).values

    data_test = pd.read_excel(self.test_data_path, header=0)
    data_test.columns = ["pid", "label", "context"]
    data_test["context_ngram"] = data_test[["context"]].applymap(ngram_process)
    test_context = data_test["context_ngram"].values
    test_label = data_test[["label"]].applymap(fun_map).values

    # TF-IDF features
    tf_idf = TfidfVectorizer(analyzer=fun_1, min_df=50)
    tf_idf.fit(context)
    model_dict["model_1"] = pickle.dumps(tf_idf)
    feature_names = tf_idf.get_feature_names()
    model_dict["feature_names"] = pickle.dumps(feature_names)
    print("feature num", len(feature_names))
    x_train = tf_idf.transform(context)
    x_test = tf_idf.transform(test_context)

    # chi-squared feature selection
    model = SelectKBest(chi2, k="all")
    model.fit(x_train, label)
    model_dict["model_2"] = pickle.dumps(model)
    x_train = model.transform(x_train)
    x_test = model.transform(x_test)

    classify = svm.LinearSVC(C=0.9)
    # param_grid = {'C': [1, 10, 100, 1000], 'gamma': [1, 0.1, 0.001, 0.0001],
    #               'kernel': ['linear', 'rbf']}
    # grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=2)
    # grid = xgb.XGBClassifier()
    # print(grid.best_params_)
    classify = calibration.CalibratedClassifierCV(classify, cv=10)
    classify.fit(x_train, label)
    y_predict = classify.predict(x_test)
    print(metrics.classification_report(test_label, y_predict))
    print("accuracy:", metrics.accuracy_score(test_label, y_predict))
    model_dict["model_3"] = pickle.dumps(classify)
    with open(self.model_path, mode='wb') as fm:
        joblib.dump(model_dict, fm)
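# A hedged sketch of the matching inference path: load the dictionary saved by
# train_main above and apply the three pickled stages in order. The file path
# and the input string are hypothetical stand-ins, and this assumes the
# artifact written at self.model_path exists on disk.
import pickle
import joblib

with open("model_dict.bin", mode='rb') as fm:  # hypothetical path standing in for self.model_path
    model_dict = joblib.load(fm)
tf_idf = pickle.loads(model_dict["model_1"])    # TfidfVectorizer
selector = pickle.loads(model_dict["model_2"])  # SelectKBest(chi2)
classify = pickle.loads(model_dict["model_3"])  # calibrated LinearSVC
x = selector.transform(tf_idf.transform(["some preprocessed n-gram text"]))
print(classify.predict_proba(x))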