def _oversample(X, y, method='SMOTE', strat='not majority'):
    """Oversample ``(X, y)`` with the requested imbalanced-learn sampler.

    When the smallest class has five samples or fewer, the requested method
    is replaced by plain random oversampling (``'RNDM'``).

    Args:
        X: Feature matrix handed to the sampler's ``fit_resample``.
        y: Target labels; must expose ``tolist()`` (e.g. a NumPy array or a
            pandas Series).
        method: ``'ADASYN'``, ``'SMOTE'``, ``'SMOTENC'``, ``'BORDERSMOTE'``,
            ``'SVMSMOTE'``, ``'KMEANSSMOTE'`` or ``'RNDM'``.
        strat: Forwarded as the sampler's ``sampling_strategy``.

    Returns:
        The ``(X_resampled, y_resampled)`` pair returned by the sampler.

    Raises:
        ValueError: If ``method`` is not one of the supported names (the
            original code failed later with an unbound local instead).
    """
    from collections import Counter

    sampler_names = {
        'ADASYN': 'ADASYN',
        'SMOTE': 'SMOTE',
        # NOTE(review): SMOTENC usually also needs ``categorical_features``;
        # confirm this entry works with the installed imbalanced-learn.
        'SMOTENC': 'SMOTENC',
        'BORDERSMOTE': 'BorderlineSMOTE',
        'SVMSMOTE': 'SVMSMOTE',
        'KMEANSSMOTE': 'KMeansSMOTE',
        'RNDM': 'RandomOverSampler',
    }

    # Count every label in a single pass instead of one list scan per class.
    class_counts = Counter(y.tolist())
    min_samples = min(class_counts.values(), default=len(y))

    # Too few samples in the smallest class: fall back to random oversampling.
    if min_samples <= 5:
        method = 'RNDM'

    if method not in sampler_names:
        raise ValueError(
            'Unknown oversampling method {!r}; expected one of {}'.format(
                method, sorted(sampler_names)))

    sampler_cls = getattr(imbover, sampler_names[method])
    ios = sampler_cls(sampling_strategy=strat, random_state=42)
    X_resampled, y_resampled = ios.fit_resample(X, y)
    return X_resampled, y_resampled
def oversample(classifier):
    """Cross-validate ``classifier`` behind scaling and random oversampling.

    Args:
        classifier: Final pipeline step, placed after the ``'scaler'`` and
            ``'resample'`` steps (presumably a ``(name, estimator)`` tuple,
            like the other steps).

    Returns:
        ``(y, y_pred, c)``: the labels from ``prepare_data()``, the
        cross-validated predictions, and a ``Counter`` of the labels after
        randomly oversampling the full data set.
    """
    print('*** OVERSAMPLE ***')

    pipe = pipeline.Pipeline([
        ('scaler', preprocessing.StandardScaler()),
        ('resample', over_sampling.RandomOverSampler()),
        classifier,
    ])

    X, y = prepare_data()
    y_pred = model_selection.cross_val_predict(pipe, X, y, cv=cv, n_jobs=-1)

    # ``fit_resample`` replaces the ``fit_sample`` alias that newer
    # imbalanced-learn releases no longer provide.
    _, y_balanced = over_sampling.RandomOverSampler().fit_resample(X, y)
    c = Counter(y_balanced)

    return y, y_pred, c
def create_predictions(model, x_df, y_df):
    """Train ``model`` on oversampled training data and report test results.

    The data is split into a training and a testing set, the training set is
    randomly oversampled, the model is fitted on it and used to predict the
    testing set. A confusion matrix (tab-separated CSV) and a classification
    report (text) are written under ``DIRECTORY + "results/"``.

    Args:
        model(sklearn.ensemble.RandomForestClassifier): The machine learning
            model to train and predict with
        x_df(pd.DataFrame): Input "x" vector
        y_df(pd.DataFrame): Output "y" vector
    """
    # Split the data into training and testing data
    x_train_df, x_test_df, y_train_df, y_test_df = train_test_split(x_df, y_df)

    # Oversample the training data only, so the test set stays untouched.
    # ``fit_resample`` replaces the removed ``fit_sample`` alias.
    oversampling = over_sampling.RandomOverSampler()
    x_resampled, y_resampled = oversampling.fit_resample(
        x_train_df, y_train_df)

    # Train the model on the oversampled training data
    model.fit(x_resampled, y_resampled)

    # Use model to predict the testing set
    y_pred_df = model.predict(x_test_df)

    # Create a confusion matrix and write to file. The row/column names
    # assume a binary problem.
    cm_df = pd.DataFrame(metrics.confusion_matrix(y_test_df, y_pred_df),
                         index=["actual_negative", "actual_positive"],
                         columns=["predicted_negative", "predicted_positive"])
    cm_df.to_csv(
        (DIRECTORY + "results/" + DATE + LOC + TYPE + "confusion_matrix.csv"),
        sep='\t')

    # Write the metrics report; the context manager guarantees the handle is
    # flushed and closed (the original left the file open).
    metrics_path = DIRECTORY + "results/" + DATE + LOC + TYPE + "metrics.txt"
    with open(metrics_path, "w+") as metrics_file:
        metrics_file.write(metrics.classification_report(y_test_df, y_pred_df))
def balance_classes(self, X, y, max):
    """Randomly oversample the minority class of ``(X, y)``.

    Args:
        X: Feature matrix.
        y: Class labels aligned with ``X``.
        max: Unused. It was only referenced by a manual truncation routine
            that had been commented out; kept so existing callers still work.

    Returns:
        ``(balanced_X, balanced_y)`` as returned by
        ``RandomOverSampler(sampling_strategy='minority').fit_resample``.
    """
    oversample = over_sampling.RandomOverSampler(
        sampling_strategy='minority')
    balanced_X, balanced_y = oversample.fit_resample(X, y)
    return balanced_X, balanced_y
def get_confusion_matrix(model_data,
                         feature_cols,
                         dependent_variable,
                         seed_val=0,
                         test_size=0.25,
                         solver='liblinear',
                         oversample_training_data=False,
                         sampling_strategy=0.5):
    """Fit a logistic regression and return its test-set confusion matrix.

    Args:
        model_data: Table indexed by column name (e.g. a DataFrame).
        feature_cols: Columns used as independent variables.
        dependent_variable: Column used as the target.
        seed_val: Random state for the split and the oversampler.
        test_size: Fraction (or count) of samples held out for testing.
        solver: Solver passed to ``LogisticRegression``.
        oversample_training_data: When true, randomly oversample the training
            split before fitting. The flag was previously ignored and the
            training data was always oversampled.
        sampling_strategy: ``sampling_strategy`` for the oversampler.

    Returns:
        The confusion matrix of the held-out test split.
    """
    # Split into independent and dependent variables
    X = model_data[feature_cols]
    y = model_data[dependent_variable]

    # Split into train and test set
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=seed_val)

    # Oversample training data only if requested
    if oversample_training_data:
        oversample_obj = imbsample.RandomOverSampler(
            sampling_strategy=sampling_strategy, random_state=seed_val)
        X_train, y_train = oversample_obj.fit_resample(X_train, y_train)

    # Instantiate and fit the model, then score the untouched test split
    lr = LogisticRegression(solver=solver)
    lr.fit(X_train, y_train)
    y_pred = lr.predict(X_test)

    cnf_matrix = metrics.confusion_matrix(y_test, y_pred)
    return cnf_matrix
def oversample(self, x, y):
    """Randomly oversample ``(x, y)`` to compensate for sparse actions.

    The sampler is seeded with ``random_state=0``; ``self.check_balance()``
    is called once the resampling is done.

    Returns:
        The resampled ``(x, y)`` pair.
    """
    balanced_x, balanced_y = imb.RandomOverSampler(
        random_state=0).fit_resample(x, y)
    self.check_balance()
    return balanced_x, balanced_y
def ReSampling(self, data, labels, over_s=True):
    """Rebalance a 0/1-labelled data set when it is heavily skewed.

    Steps:
      1. If class 0 or class 1 is absent, one all-zero sample carrying the
         missing label is prepended.
      2. If fewer than two distinct labels were present, or the class ratio
         lies strictly between 0.2 and 5, the data is returned as is.
      3. Otherwise ADASYN is tried (minority raised to 40% of the majority
         count); if it fails, random over- or under-sampling is used.

    Args:
        data: 2-D array-like of samples; ``len(data[0])`` gives the feature
            count.
        labels: Labels aligned with ``data`` (expected to be 0/1).
        over_s: Fallback choice when ADASYN fails: random oversampling if
            true, random undersampling otherwise.

    Returns:
        ``(data, labels)`` after the optional padding / resampling.
    """
    label_status = Counter(labels)
    print(self.tasktype, "data " + self.tasktype, label_status)
    featurelen = len(data[0])

    # Builtin ``int`` replaces ``np.int``, an alias NumPy has removed.
    if 1 not in label_status:
        x, y = np.zeros(shape=featurelen, dtype=int), 1
    elif 0 not in label_status:
        x, y = np.zeros(shape=featurelen, dtype=int), 0
    else:
        x, y = None, None

    if x is not None:
        data = np.insert(data, 0, x, 0)
        labels = np.insert(labels, 0, y, 0)

    # ``label_status`` still reflects the labels before padding.
    if len(label_status) < 2:
        print(self.tasktype, "no need to resample")
        return data, labels

    class_ratio = label_status[1] / label_status[0]
    if 0.2 < class_ratio < 5.:
        print("data are not biased too much")
        return data, labels

    # ``sampling_strategy`` is the current name of the old ``ratio`` argument.
    if label_status[1] > label_status[0]:
        maxSamples = label_status[1]
        targets = {1: maxSamples, 0: int(0.4 * maxSamples)}
    else:
        maxSamples = label_status[0]
        targets = {0: maxSamples, 1: int(0.4 * maxSamples)}
    resampling = over_sampling.ADASYN(sampling_strategy=targets)

    try:
        data, labels = resampling.fit_resample(data, labels)
    except Exception:
        # Best-effort fallback, but no longer swallowing KeyboardInterrupt /
        # SystemExit the way the bare ``except:`` did.
        print(self.tasktype, "resampling using random method")
        if over_s:
            resampling = over_sampling.RandomOverSampler()
        else:
            resampling = under_sampling.RandomUnderSampler()
        data, labels = resampling.fit_resample(data, labels)

    label_status = Counter(labels)
    print(self.tasktype, "sampling status=", label_status)
    return data, labels
def us_os_bac(base_clf, X_train, y_train, X_test, y_test):
    """Compare random under- and oversampling by balanced accuracy.

    Two clones of ``base_clf`` are trained, one on a randomly undersampled
    and one on a randomly oversampled copy of the training data, and both
    are scored on the untouched test data.

    Returns:
        ``(undersampled_score, oversampled_score)`` balanced accuracies.
    """
    # Named samplers avoid shadowing the stdlib ``os`` module name, and
    # ``fit_resample`` replaces the removed ``fit_sample`` alias.
    under_sampler = under_sampling.RandomUnderSampler()
    over_sampler = over_sampling.RandomOverSampler()

    X_us, y_us = under_sampler.fit_resample(X_train, y_train)
    X_os, y_os = over_sampler.fit_resample(X_train, y_train)

    us_clf = base.clone(base_clf)
    os_clf = base.clone(base_clf)

    us_clf.fit(X_us, y_us)
    os_clf.fit(X_os, y_os)

    us_pred = us_clf.predict(X_test)
    os_pred = os_clf.predict(X_test)

    return (
        metrics.balanced_accuracy_score(y_test, us_pred),
        metrics.balanced_accuracy_score(y_test, os_pred),
    )
def fit(self, X_train, y_train):
    """Fit the ensemble on a binary, imbalanced training set.

    The majority class is split into ``k`` folds, where ``k`` is the rounded
    imbalance ratio (at least 2). One clone of ``self.base_clf`` is trained
    per fold on that fold plus every minority sample, and a final clone is
    trained on a randomly oversampled copy of the whole set. Each member's
    weight is its balanced accuracy on the full training set, min-max scaled
    and shifted by 0.01.

    Args:
        X_train: Training features; must support integer-array indexing.
        y_train: Training labels with exactly two classes (only the first
            two classes found are considered).
    """
    # Save X and y
    self.X_train = X_train
    self.y_train = y_train

    # Find the majority class and establish the imbalance ratio.
    self.classes, c_counts = np.unique(y_train, return_counts=True)
    majority_pos = 0 if c_counts[0] > c_counts[1] else 1
    minority_pos = 1 - majority_pos

    # Compare against the actual label values rather than their positions,
    # so labels other than {0, 1} (e.g. {-1, 1}) are handled as well.
    majority_c = self.classes[majority_pos]
    minority_c = self.classes[minority_pos]
    min_idx = np.where(y_train == minority_c)[0]
    maj_idx = np.where(y_train == majority_c)[0]

    # K is the imbalance ratio rounded to int; it is also the number of
    # fold-based ensemble members.
    imbalance_ratio = c_counts[majority_pos] / c_counts[minority_pos]
    self.k = max(int(np.around(imbalance_ratio)), 2)

    # Use k for a KFold division of the majority class
    self.clfs = []
    kf = model_selection.KFold(n_splits=self.k, shuffle=True)
    for _, index in kf.split(maj_idx):
        fold_idx = np.concatenate([min_idx, maj_idx[index]])
        X_train_f, y_train_f = X_train[fold_idx], y_train[fold_idx]

        clf = base.clone(self.base_clf)
        clf.fit(X_train_f, y_train_f)
        self.clfs.append(clf)

    # Add the oversampling-based member. ``fit_resample`` replaces the
    # removed ``fit_sample`` alias.
    clf = base.clone(self.base_clf)
    sampler = over_sampling.RandomOverSampler()
    X_os, y_os = sampler.fit_resample(self.X_train, self.y_train)
    clf.fit(X_os, y_os)
    self.clfs.append(clf)

    # Calculate weights as balanced accuracy on the whole set
    self.weights = np.array([
        metrics.balanced_accuracy_score(self.y_train,
                                        clf.predict(self.X_train))
        for clf in self.clfs
    ])

    # Scale to [0, 1], then shift so the weakest member keeps a small weight.
    scaler = preprocessing.MinMaxScaler()
    self.nweights = scaler.fit_transform(self.weights.reshape(-1, 1)).T[0]
    self.nweights += 0.01
def get_oversampled_df(model_data,
                       feature_columns,
                       dependent_variable,
                       sampling_strategy=0.5,
                       seed_val=0):
    """Return the randomly oversampled features with the target attached.

    Args:
        model_data: Table indexed by column name (e.g. a DataFrame).
        feature_columns: Columns to keep as features.
        dependent_variable: Column holding the target.
        sampling_strategy: ``sampling_strategy`` for ``RandomOverSampler``.
        seed_val: Random state of the sampler.

    Returns:
        The resampled feature table with the resampled target stored in a
        ``'Competed'`` column.
    """
    sampler = imbsample.RandomOverSampler(
        sampling_strategy=sampling_strategy, random_state=seed_val)
    oversampled_df, target_over = sampler.fit_resample(
        model_data[feature_columns], model_data[dependent_variable])

    # NOTE(review): the target column name is fixed to 'Competed' no matter
    # what ``dependent_variable`` is - confirm this is intended.
    oversampled_df['Competed'] = target_over
    return oversampled_df
def curves():
    """Plot per-class, micro- and macro-averaged ROC curves for each model.

    A constant baseline, a logistic regression and a depth-3 decision tree
    (each behind a seeded random oversampler) are wrapped in a one-vs-rest
    classifier, trained on an 80/20 split and evaluated. One PDF per model
    is written to ``./output/roc-<model-name>.pdf``.
    """
    from sklearn import multiclass
    from itertools import cycle

    mpl.style.use(['seaborn-white', 'seaborn-paper', 'grayscale'])
    latexify()

    baseline = ipipeline.make_pipeline(
        over_sampling.RandomOverSampler(random_state=SEED),
        dummy.DummyClassifier(strategy='constant',
                              constant=1,
                              random_state=SEED),
    )
    logreg = ipipeline.make_pipeline(
        over_sampling.RandomOverSampler(random_state=SEED),
        linear_model.LogisticRegression(solver='lbfgs', random_state=SEED),
    )
    dtree = ipipeline.make_pipeline(
        over_sampling.RandomOverSampler(random_state=SEED),
        tree.DecisionTreeClassifier(max_depth=3, random_state=SEED),
    )

    models = (
        ('Constant', baseline),
        ('Logistic Regression', logreg),
        ('Decision Tree', dtree),
    )

    # The data and the seeded split do not depend on the model, so they are
    # prepared once (this was previously repeated inside the loop; assumes
    # prepare_data() returns the same frame on every call).
    df = prepare_data()
    df = df[[*features, 'class']].dropna()
    X, y = df[features].values, df['class'].ravel()
    y = preprocessing.label_binarize(y, classes=labels)
    n_labels = 3
    X_train, X_test, y_train, y_test = model_selection.train_test_split(
        X, y, test_size=.2, random_state=SEED)

    for name, pipe in models:
        classifier = multiclass.OneVsRestClassifier(pipe)
        classifier.fit(X_train, y_train)

        # Probabilities take precedence over decision values, as before.
        if hasattr(classifier, 'predict_proba'):
            y_score = classifier.predict_proba(X_test)
        elif hasattr(classifier, 'decision_function'):
            y_score = classifier.decision_function(X_test)
        else:
            raise AttributeError(
                f'{name} exposes neither predict_proba nor decision_function')

        # Compute ROC curve and ROC area for each class
        fpr = dict()
        tpr = dict()
        roc_auc = dict()
        for i in range(n_labels):
            fpr[i], tpr[i], _ = metrics.roc_curve(y_test[:, i], y_score[:, i])
            roc_auc[i] = metrics.auc(fpr[i], tpr[i])

        # Compute micro-average ROC curve and ROC area
        fpr["micro"], tpr["micro"], _ = metrics.roc_curve(
            y_test.ravel(), y_score.ravel())
        roc_auc["micro"] = metrics.auc(fpr["micro"], tpr["micro"])

        # First aggregate all false positive rates
        all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_labels)]))

        # Then interpolate all ROC curves at these points
        mean_tpr = np.zeros_like(all_fpr)
        for i in range(n_labels):
            mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])

        # Finally average it and compute AUC
        mean_tpr /= n_labels

        fpr["macro"] = all_fpr
        tpr["macro"] = mean_tpr
        roc_auc["macro"] = metrics.auc(fpr["macro"], tpr["macro"])

        # Plot all ROC curves
        latexify(columns=1)
        f, ax = plt.subplots()
        ax.plot(
            fpr["micro"],
            tpr["micro"],
            label=f'micro-average ROC curve (area = {roc_auc["micro"]:0.2f})',
            color='deeppink',
            linestyle=':'
        )
        ax.plot(
            fpr["macro"],
            tpr["macro"],
            label=f'macro-average ROC curve (area = {roc_auc["macro"]:0.2f})',
            color='navy',
            linestyle=':'
        )

        colors = cycle(['aqua', 'darkorange', 'cornflowerblue'])
        for i, color in zip(range(n_labels), colors):
            ax.plot(
                fpr[i],
                tpr[i],
                color=color,
                label=f'ROC curve of class {labels[i]} (area = {roc_auc[i]:0.2f})'
            )

        # Chance diagonal
        ax.plot([0, 1], [0, 1], 'k--')
        ax.set_xlim([0.0, 1.0])
        ax.set_ylim([0.0, 1.05])
        ax.set_xlabel('FP Rate')
        ax.set_ylabel('TP Rate')
        ax.legend(loc="lower right")

        format_axes(ax)
        f.savefig(f'./output/roc-{name.replace(" ", "-").lower()}.pdf',
                  bbox_inches='tight')
        # Release the figure; otherwise one open figure accumulates per model.
        plt.close(f)
def _save_confusion_heatmap(y, y_pred, title, filename):
    """Save the normalised confusion matrix of ``y`` vs ``y_pred`` as a PDF."""
    cm = metrics.confusion_matrix(y, y_pred, labels=labels)
    cm = norm_cm(cm)
    cm = pd.DataFrame(cm, index=labels, columns=labels)

    fig, ax = plt.subplots(dpi=92)
    sns.heatmap(cm,
                vmin=0,
                vmax=1,
                annot=True,
                fmt='.2f',
                cmap='Greys',
                ax=ax,
                cbar=False,
                square=True)
    ax.set_title(title)
    format_axes_for_cm(ax)
    fig.tight_layout()

    ensure_dir('./output/models/')
    fig.savefig(f'./output/models/{filename}.pdf',
                dpi=92,
                bbox_inches='tight')
    plt.close(fig)


def multiple_figures():
    """Write one confusion-matrix PDF for the baseline and for each model.

    The special baseline compares the ``class`` and ``class_overall`` columns
    returned by ``prepare_data()``. Every model is a pipeline of standard
    scaling, random oversampling and a classifier, evaluated through
    ``different_models``; its classification report is printed as well.
    Figures are stored under ``./output/models/``.
    """
    mpl.style.use(['seaborn-white', 'seaborn-paper', 'grayscale'])
    latexify()

    scaler = preprocessing.StandardScaler()
    resample = over_sampling.RandomOverSampler()

    def build(estimator):
        # Every model shares the same preprocessing steps.
        return pipeline.make_pipeline(scaler, resample, estimator)

    baseline = build(
        dummy.DummyClassifier(strategy='constant', constant='good'))
    logreg = build(
        linear_model.LogisticRegression(solver='lbfgs', multi_class='ovr'))
    dtree = build(tree.DecisionTreeClassifier())
    mlp = build(
        neural_network.MLPClassifier(hidden_layer_sizes=(100, 100, 100),
                                     activation='relu',
                                     solver='adam'))
    svc = build(svm.LinearSVC())
    RForest = build(ensemble.RandomForestClassifier(n_estimators=100))

    models = (
        ('Constant', baseline),
        ('Logistic Regression', logreg),
        ('Decision Tree', dtree),
        ('Multi-Layer Perceptron', mlp),
        ('linearSVM', svc),
        ('Random Forest', RForest),
    )

    # Special case of baseline
    filename = 'baseline-link-overall'
    df = prepare_data()
    y, y_pred = df['class'].ravel(), df['class_overall'].ravel()

    acc = metrics.accuracy_score(y, y_pred)
    prec = metrics.precision_score(y,
                                   y_pred,
                                   average='weighted',
                                   labels=labels)
    recall = metrics.recall_score(y,
                                  y_pred,
                                  average='weighted',
                                  labels=labels)
    _save_confusion_heatmap(
        y, y_pred,
        f'accuracy = {acc:.3f}\n(prec = {prec:.3f}, rec = {recall:.3f})',
        filename)
    print(f'Done {filename}')

    for name, pipe in models:
        filename = name.lower().replace(' ', '_')
        y, y_pred = different_models(pipe)

        acc = metrics.accuracy_score(y, y_pred)
        print(name)
        print(metrics.classification_report(y, y_pred, labels=labels))

        _save_confusion_heatmap(y, y_pred, f'accuracy={acc:.3f}', filename)
def main():
    """Draw a 2x3 grid of normalised confusion matrices, one per model.

    Each model is a pipeline of standard scaling, random oversampling and a
    classifier, evaluated through ``different_models``. The grid is saved to
    ``./different_models.pdf`` and then shown.
    """
    scaler = preprocessing.StandardScaler()
    resample = over_sampling.RandomOverSampler()

    def build(estimator):
        # Every model shares the same preprocessing steps.
        return pipeline.make_pipeline(scaler, resample, estimator)

    baseline = build(
        dummy.DummyClassifier(strategy='constant', constant='good'))
    logreg = build(linear_model.LogisticRegression())
    dtree = build(tree.DecisionTreeClassifier())
    mlp = build(neural_network.MLPClassifier())
    svc = build(svm.LinearSVC())
    RForest = build(ensemble.RandomForestClassifier())

    models = (
        ('Constant', baseline),
        ('Logistic Regression', logreg),
        ('Decision Tree', dtree),
        ('Multi-Layer Perceptron', mlp),
        ('SVM (linear kernel)', svc),
        ('Random Forest', RForest),
    )

    fig, axes = plt.subplots(nrows=2,
                             ncols=3,
                             dpi=96,
                             sharey=True,
                             sharex=True)

    # One subplot per model, in row-major order.
    for (name, pipe), ax in zip(models, axes.reshape(-1)):
        y, y_pred = different_models(pipe)

        acc = metrics.accuracy_score(y, y_pred)
        prec = metrics.precision_score(y,
                                       y_pred,
                                       average='weighted',
                                       labels=labels)
        recall = metrics.recall_score(y,
                                      y_pred,
                                      average='weighted',
                                      labels=labels)

        cm = metrics.confusion_matrix(y, y_pred, labels=labels)
        cm = norm_cm(cm)
        cm = pd.DataFrame(cm, index=labels, columns=labels)

        sns.heatmap(cm,
                    vmin=0,
                    vmax=1,
                    annot=True,
                    fmt='.2f',
                    cmap='Greys',
                    ax=ax,
                    cbar=False,
                    square=True)
        ax.set_title(
            f'{name}\naccuracy={acc:.3f}\n(prec = {prec:.3f}, rec = {recall:.3f})'
        )

    fig.tight_layout()
    fig.savefig('./different_models.pdf', bbox_inches='tight')
    plt.show()
def run(self, mode: str, pred: list = None, prob: list = None, **kwargs):
    """Build the AffectNet data loader(s) for the requested ``mode``.

    Args:
        mode: ``'warmup'``, ``'train'``, ``'test'`` or ``'eval_train'``.
        pred: Forwarded to ``AffectNet`` as ``pred`` in ``'train'`` mode.
            ``None`` (the default) is treated as an empty list.
        prob: Forwarded to ``AffectNet`` as ``probability`` in ``'train'``
            mode. ``None`` (the default) is treated as an empty list.
        **kwargs: If the key ``'oversample'`` is present, the warm-up set is
            randomly oversampled over 20 arousal bins spanning [-1, 1].

    Returns:
        * ``'warmup'``: one shuffled loader over the (optionally oversampled)
          training set, with a doubled batch size.
        * ``'train'``: ``(labeled_loader, unlabeled_loader)``.
        * ``'test'``: one loader over the validation annotations.
        * ``'eval_train'``: one unshuffled loader over the training set.
        * any other mode: ``None``.
    """
    # ``None`` replaces the former mutable ``[]`` defaults, which were shared
    # between calls; callers still end up with an empty list.
    pred = [] if pred is None else pred
    prob = [] if prob is None else prob

    train_annotations = 'affectnet_annotations_train_all_ext_det_noisy.json'

    if mode == 'warmup':
        all_dataset = AffectNet(
            self.root_dir,
            img_transform=self.transform_train,
            annotation_filename=train_annotations,
            target_transform=self.target_transform,
            mode=None,
            filter_expressions=self.filter_expression,
            **self.kwargs)

        if 'oversample' in kwargs:
            # Oversample dataset indices (not images): bin the arousal value
            # and let the sampler repeat indices of under-represented bins.
            sampler = over_sampling.RandomOverSampler()
            bins = np.linspace(-1, 1, num=20)
            idx = np.asarray(list(range(len(all_dataset)))).reshape(-1, 1)
            labels = [
                y['arousal'] for y in all_dataset.data['annotations']
            ]
            labels = np.digitize(labels, bins)
            labels -= 1
            new_idx, _ = sampler.fit_resample(idx, labels)
            all_dataset = Subset(all_dataset, new_idx.reshape(-1).tolist())

        # debug line
        print('# Train Images ' + str(len(all_dataset)))
        train_loader = DataLoader(dataset=all_dataset,
                                  batch_size=self.batch_size * 2,
                                  shuffle=True,
                                  num_workers=self.num_workers,
                                  pin_memory=True)
        return train_loader

    elif mode == 'train':
        labeled_dataset = AffectNet(
            self.root_dir,
            img_transform=self.transform_train,
            annotation_filename=train_annotations,
            target_transform=self.target_transform,
            mode='labeled',
            pred=pred,
            probability=prob,
            filter_expressions=self.filter_expression,
            **self.kwargs)
        unlabeled_dataset = AffectNet(
            self.root_dir,
            img_transform=self.transform_train,
            annotation_filename=train_annotations,
            target_transform=self.target_transform,
            mode='unlabeled',
            pred=pred,
            probability=prob,
            filter_expressions=self.filter_expression,
            **self.kwargs)

        labeled_loader = DataLoader(dataset=labeled_dataset,
                                    batch_size=self.batch_size,
                                    shuffle=True,
                                    num_workers=self.num_workers,
                                    pin_memory=True)
        unlabeled_loader = DataLoader(dataset=unlabeled_dataset,
                                      batch_size=self.batch_size,
                                      shuffle=True,
                                      num_workers=self.num_workers,
                                      pin_memory=True)
        return labeled_loader, unlabeled_loader

    elif mode == 'test':
        # NOTE(review): the test set uses ``self.transform_train`` and a
        # shuffled loader - confirm both are intended for evaluation.
        test_dataset = AffectNet(
            self.root_dir,
            img_transform=self.transform_train,
            annotation_filename='affectnet_annotations_val_all_ext_det.json',
            mode=None,
            target_transform=self.target_transform,
            filter_expressions=self.filter_expression_test,
            **self.kwargs)

        # debug line
        print('# Test Images: ' + str(len(test_dataset)))
        test_loader = DataLoader(dataset=test_dataset,
                                 batch_size=self.batch_size * 2,
                                 shuffle=True,
                                 num_workers=self.num_workers,
                                 pin_memory=True)
        return test_loader

    elif mode == 'eval_train':
        eval_dataset = AffectNet(
            self.root_dir,
            img_transform=self.transform_train,
            annotation_filename=train_annotations,
            mode=None,
            target_transform=self.target_transform,
            filter_expressions=self.filter_expression,
            **self.kwargs)
        eval_loader = DataLoader(dataset=eval_dataset,
                                 batch_size=self.batch_size * 2,
                                 shuffle=False,
                                 num_workers=self.num_workers,
                                 pin_memory=True)
        return eval_loader

    # Unknown modes fall through and return None, as before.
    return None
# Score the fitted classifier ``cls`` on the held-out split.
# NOTE(review): roc_auc_score receives hard predictions here rather than
# scores/probabilities - confirm this is intended.
y_pred = cls.predict(X_test)
print('ROC_AUC ', metrics.roc_auc_score(y_test, y_pred))
print('Recall ', metrics.recall_score(y_test, y_pred))
print('Precision ', metrics.precision_score(y_test, y_pred))
print('F1 ', metrics.f1_score(y_test, y_pred))
# NOTE(review): plot_confusion_matrix is not available in recent scikit-learn
# releases (ConfusionMatrixDisplay.from_estimator is its successor) - confirm
# against the installed version.
metrics.plot_confusion_matrix(cls, X_test, y_test, normalize='true')

# ## Random Over Sample approach

# ### Adjusting the class balance

# In[6]:

from imblearn import over_sampling as over

# Randomly duplicate minority-class rows of the full data set.
oversample = over.RandomOverSampler(sampling_strategy='minority')
X_over, y_over = oversample.fit_resample(X, y)
y_over.value_counts()

# In[7]:

# Sanity check: features and labels must have the same number of rows.
X_over.shape[0] == y_over.shape[0]

# ### 2D distribution
#
# *There is no perceptible change because the data points overlap*

# In[8]:

from sklearn.decomposition import TruncatedSVD
[(['c1', 'c2', 'c3', 'c4', 'c5'], uf.Straight(), {
    'alias': 'has_straight'
})  #,
 # (['s1', 's2', 's3', 's4', 's5'], None)
 ],
input_df=True,
df_out=True,
default=False)

# Concatenate the outputs of the four engineered-feature pipelines.
features_pipeline = ppl.make_union(engineered_feature_pipeline1,
                                   engineered_feature_pipeline2,
                                   engineered_feature_pipeline3,
                                   engineered_feature_pipeline4)

# Seeded random oversampling step.
sampling_pipeline = imbppl.make_pipeline(
    over_sampling.RandomOverSampler(random_state=9565))

# Multinomial, L2-penalised logistic regression.
model_pipeline = imbppl.make_pipeline(
    LogisticRegression(multi_class='multinomial',
                       penalty='l2',
                       random_state=9546,
                       solver="lbfgs"))

# Full pipeline: feature preparation -> oversampling -> classifier.
pipe = imbppl.Pipeline([('prep', features_pipeline),
                        ('sample', sampling_pipeline),
                        ('clf', model_pipeline)])

y = d_in.hand
X = d_in.loc[:, 's1':'c5']  # produces a copy

# split - results in < 5 observations for the smallest class (need for sampling)
def hyperopt(param_space, X_train, y_train, X_test, y_test, args):
    """Search ``param_space`` with hyperopt's TPE and persist the trials.

    Each candidate is a random-forest (``'rf'``) or SVM (``'svm'``)
    classifier placed behind a seeded random oversampler and scored by the
    mean of a 3-fold cross-validation on the training data.

    Args:
        param_space: hyperopt search space; every sample must carry a
            ``'type'`` entry naming the classifier.
        X_train, y_train: Data used for cross-validation.
        X_test, y_test: Not used by this function.
        args: Namespace providing ``cpus``, ``num_eval``, ``modeldir`` and
            ``ssize``.

    Returns:
        The ``Trials`` object, which is also dumped with joblib into
        ``args.modeldir``.
    """
    sampler = over_sampling.RandomOverSampler(sampling_strategy='auto',
                                              random_state=42)
    started_at = time.time()

    def objective_function(params):
        # 'type' selects the estimator; the remaining entries are its kwargs.
        kind = params.pop('type')
        if kind == 'rf':
            model = RandomForestClassifier(**params)
        elif kind == 'svm':
            model = SVC(**params)
        else:
            return 0
        candidate = make_pipeline_imb(sampler, model)
        mean_score = cross_val_score(candidate,
                                     X_train,
                                     y_train,
                                     n_jobs=args.cpus,
                                     cv=3).mean()
        # hyperopt minimises, hence the negated score.
        return {'loss': -mean_score, 'status': STATUS_OK}

    # Fixed seed so the search is repeatable.
    rng = np.random.RandomState(1)
    trials = Trials()
    best_param = fmin(objective_function,
                      param_space,
                      algo=tpe.suggest,
                      max_evals=args.num_eval,
                      trials=trials,
                      rstate=rng)

    losses = []
    for trial in trials.trials:
        losses.append(trial['result']['loss'])

    trials_name = 'hyperopt_trials_niters{}_ssize{}.pkl'.format(
        args.num_eval, args.ssize)
    joblib.dump(trials, os.path.join(args.modeldir, trials_name))

    print("##### Results")
    print("Score best parameters: ", min(losses) * -1)
    print("Best parameters: ", best_param)
    print("Time elapsed: ", round(time.time() - started_at, 2))
    print("Parameter combinations evaluated: ", args.num_eval)

    return trials
# Standardise the feature table before modelling.
nd1 = preprocessing.scale(df1.values)
logger.info(f"Data loaded")
jn = pushbulletNotifier.JobNotification(devices="phone")
# Number of parallel jobs used by the grid search below.
processes = 25
try:
    # Stratified 80/20 split on the gender labels.
    X_train, X_test, y_train, y_test = model_selection.train_test_split(nd1, gender.values, test_size=0.2, stratify=gender.values)
    logger.info(f"Split data in to training set and validation set.")
    classifier = ['logisticregression', linear_model.LogisticRegression(max_iter=250)]
    sampler_lst = [['smote', over_sampling.SMOTE()],
                   ['adasyn', over_sampling.ADASYN()],
                   ['random¬oversampler', over_sampling.RandomOverSampler()]]
    # One "<sampler>-<classifier>" pipeline per resampling method.
    # NOTE(review): every pipeline receives the same LogisticRegression
    # instance - confirm sharing it is acceptable.
    pipeline_lst = [
        [f'{sampler[0]}-{classifier[0]}', make_pipeline(sampler[1], classifier[1])] for sampler in sampler_lst
    ]  # noqa
    # 15 values of C, log2-spaced from 2**-8 to 2**5.
    param_grid = {
        'logisticregression__C': 2.0**np.linspace(-8, 5, 15)
    }  # noqa
    for name, pipe in pipeline_lst:
        jn.send(message=f"Starding cross validation with resampling method {name}")
        logger.info(f"Starting cross validation")
        est = model_selection.GridSearchCV(pipe,
                                           param_grid,
                                           scoring='roc_auc',
                                           cv=5,
                                           verbose=49,
                                           refit=True,
                                           n_jobs=processes,
                                           pre_dispatch=processes,
                                           return_train_score=True)
        est.fit(X_train, y_train)
        # Keep the second probability column of predict_proba.
        _, yhat = est.predict_proba(X_test).T
        try:
            logger.info(f"Cross validation done, best score was {est.best_score_}")
            logger.info(f"Best params were {est.best_params_}")
def random_oversampling(features, labels):
    """Randomly oversample ``features``/``labels`` with a fixed seed of 0.

    Returns:
        Whatever ``RandomOverSampler.fit_resample`` returns for the inputs
        (the resampled features and labels).
    """
    sampler = over_sampling.RandomOverSampler(random_state=0)
    resampled = sampler.fit_resample(X=features, y=labels)
    return resampled
def _format_class_counts(title, counts):
    """Return ``title`` followed by one tab-separated entry per class count."""
    return title + ''.join('\t{}'.format(count) for count in counts) + ' '


def resample_classes(X, Y, how='und1', random_state=None, test_size=0.3,
                     n_jobs=2, split=True, verbose=True):
    """Resample ``(X, Y)`` with the chosen strategy and optionally split it.

    Args:
        X: Feature matrix.
        Y: Class labels aligned with ``X``.
        how: Resampling strategy:
            ``'und1'`` random under-sampling without replacement,
            ``'und2'`` cluster centroids, ``'und3'`` NearMiss (version 1),
            ``'over1'`` random over-sampling, ``'over2'`` SMOTE (3 passes),
            ``'over3'`` ADASYN (3 passes), ``'comb1'`` SMOTE + Tomek links
            (3 passes).
        random_state: Seed forwarded to the samplers and to the split.
        test_size: Test fraction used when ``split`` is true.
        n_jobs: Parallelism forwarded to samplers that accept it.
        split: If true, split the resampled data into train and test sets.
        verbose: If true, print the strategy and per-class counts.

    Returns:
        ``(X_train, X_test, y_train, y_test)`` when ``split`` is true,
        ``(X_res, y_res)`` otherwise, or ``None`` when ``how`` is unknown.

    Notes:
        The sampler keyword arguments (``ratio``, ``size_ngh``, ``k``, ``m``,
        ``kind``) and ``fit_sample`` target the old imbalanced-learn API this
        module was written against; they are left unchanged on purpose.
        Single-argument ``print(...)`` calls behave the same on Python 2 and
        Python 3.
    """
    if how == 'und1':
        if verbose:
            msg = 'Under-sampling the majority class(es) by randomly picking '
            msg += 'samples without replacement'
            print(msg)
        samp = imbus.RandomUnderSampler(random_state=random_state,
                                        replacement=False)
        X_res, y_res = samp.fit_sample(X, Y)
    elif how == 'und2':
        if verbose:
            msg = 'Under-sampling by generating centroids based on clustering '
            msg += 'methods'
            print(msg)
        samp = imbus.ClusterCentroids(ratio='auto', random_state=random_state,
                                      estimator=None, n_jobs=n_jobs)
        X_res, y_res = samp.fit_sample(X, Y)
    elif how == 'und3':
        if verbose:
            print('Under-sampling based on NearMiss methods')
        samp = imbus.NearMiss(ratio='auto', return_indices=False,
                              random_state=random_state, version=1,
                              size_ngh=None, n_neighbors=3,
                              ver3_samp_ngh=None, n_neighbors_ver3=3,
                              n_jobs=n_jobs)
        X_res, y_res = samp.fit_sample(X, Y)
    elif how == 'over1':
        if verbose:
            msg = 'Over-sampling the minority class(es) by picking samples at '
            msg += 'random with replacement'
            # The original printed an empty line here instead of the message.
            print(msg)
        samp = imbov.RandomOverSampler(random_state=random_state)
        X_res, y_res = samp.fit_sample(X, Y)
    elif how == 'over2':
        if verbose:
            msg = 'Over-sapmling using SMOTE - Synthetic Minority Over-sampling '
            msg += 'Technique'
            print(msg)
        X_res, y_res = X, Y
        # Three passes, each re-sampling the previous pass's output.
        for i in range(3):
            samp = imbov.SMOTE(random_state=random_state, ratio=.99, k=None,
                               k_neighbors=5, m=None, m_neighbors=10,
                               out_step=0.5, kind='regular',
                               svm_estimator=None, n_jobs=n_jobs)
            X_res, y_res = samp.fit_sample(X_res, y_res)
    elif how == 'over3':
        if verbose:
            msg = 'Over-sampling using ADASYN - Adaptive Synthetic Sampling '
            msg += 'Approach for Imbalanced Learning'
            print(msg)
        X_res, y_res = X, Y
        for i in range(3):
            samp = imbov.ADASYN(ratio=.93, random_state=random_state, k=None,
                                n_neighbors=5, n_jobs=n_jobs)
            X_res, y_res = samp.fit_sample(X_res, y_res)
    elif how == 'comb1':
        if verbose:
            print('Combine over- and under-sampling using SMOTE and Tomek links.')
        X_res, y_res = X, Y
        for i in range(3):
            samp = imbcom.SMOTETomek(ratio=.99, random_state=random_state,
                                     smote=None, tomek=None, k=None, m=None,
                                     out_step=None, kind_smote=None,
                                     n_jobs=n_jobs)
            X_res, y_res = samp.fit_sample(X_res, y_res)
    else:
        print('Sampling approach not recognized')
        return

    if verbose:
        # The report adapts to the number of classes present; the original
        # indexed exactly four counts and failed with fewer classes.
        val_y = pd.Series(Y).value_counts(sort=False).values
        print('\t\t\t' + '\t'.join(str(i) for i in range(1, len(val_y) + 1)))
        print(_format_class_counts('Counts in y_init:', val_y))
        val_yres = pd.Series(y_res).value_counts(sort=False).values
        print(_format_class_counts('Counts in y_resamp:', val_yres))

    if split:
        X_train, X_test, y_train, y_test = train_test_split(
            X_res, y_res, test_size=test_size, random_state=random_state)
        if verbose:
            val_ytr = pd.Series(y_train).value_counts(sort=False).values
            print(_format_class_counts('Counts in y_train:', val_ytr))
            val_yte = pd.Series(y_test).value_counts(sort=False).values
            print(_format_class_counts('Counts in y_test:', val_yte))
            print('X_train: {} , X_test: {}'.format(X_train.shape,
                                                    X_test.shape))
        return X_train, X_test, y_train, y_test
    else:
        return X_res, y_res
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from imblearn import over_sampling

# The data we created is brought in as if it were a library import.
from student import egitimGirdi, egitimCikti, valGirdi, valCikti

print(egitimGirdi.shape)

#### SYNTHETIC DATA GENERATION
# ``fit_resample`` / ``sampling_strategy`` replace the ``fit_sample`` /
# ``ratio`` names that newer imbalanced-learn releases no longer accept.

# Random oversampling
ros = over_sampling.RandomOverSampler()
rosEgitimGirdi, rosEgitimCikti = ros.fit_resample(egitimGirdi, egitimCikti)
print(rosEgitimGirdi.shape)

# SMOTE
smote = over_sampling.SMOTE()
smoteEgitimGirdi, smoteEgitimCikti = smote.fit_resample(egitimGirdi,
                                                        egitimCikti)
print(smoteEgitimGirdi.shape)

# ADASYN, resampling the minority class only
ada = over_sampling.ADASYN(sampling_strategy='minority')
adasynEgitimGirdi, adasynEgitimCikti = ada.fit_resample(egitimGirdi,
                                                        egitimCikti)
print(adasynEgitimGirdi.shape)
def main():
    """Plot precision-recall curves of several models for one class.

    Every model is a pipeline of standard scaling, random oversampling and a
    classifier wrapped in a one-vs-rest classifier. The models are trained on
    an 80/20 split of the RSSI features, and the curve of the class at index
    1 of the binarised labels is drawn for each. The figure is written to
    ``./output/precision-recall-curve.pdf``.
    """
    mpl.style.use(['seaborn-white', 'seaborn-paper', 'grayscale'])
    latexify(columns=2)

    scaler = preprocessing.StandardScaler()
    resample = over_sampling.RandomOverSampler()

    def build(estimator):
        # Shared preprocessing in front of every estimator.
        return pipeline.make_pipeline(scaler, resample, estimator)

    baseline = build(dummy.DummyClassifier(strategy='constant', constant=0))
    logreg = build(linear_model.LogisticRegression())
    sgd = build(linear_model.SGDClassifier())
    dtree = build(tree.DecisionTreeClassifier())
    mlp = build(neural_network.MLPClassifier())
    svc = build(svm.LinearSVC())
    RForest = build(ensemble.RandomForestClassifier())

    models = (
        ('Constant', baseline),
        ('Logistic Reg.', logreg),
        ('Decision Tree', dtree),
        ('Multi-Layer Perceptron', mlp),
        ('SVM (linear kernel)', svc),
        ('Random Forest', RForest),
    )

    palette = sns.color_palette("cubehelix", len(models))

    # Setup a figure
    fig, ax = plt.subplots(dpi=92)
    ax.set_xlabel('Recall = $\\frac{{TP}}{{TP+FN}}$')
    ax.set_ylabel('Precision = $\\frac{{TP}}{{TP+FP}}$')

    # Prepare data for processing
    data = prepare_data()
    X = data[['rssi', 'rssi_avg', 'rssi_std']].values
    y = data['class'].ravel()
    Y = preprocessing.label_binarize(y, classes=classes)
    X_train, X_test, y_train, y_test = model_selection.train_test_split(
        X, Y, test_size=0.2, random_state=random_state)

    # We observe only the intermediate class
    observed = 1

    for (name, model), color in zip(models, palette):
        # One-vs-rest wrapper so each model yields per-class scores.
        classifier = multiclass.OneVsRestClassifier(model)
        classifier.fit(X_train, y_train)

        # Generate scores: decision values when available, else probabilities
        score_fn = (classifier.decision_function
                    if hasattr(classifier, 'decision_function')
                    else classifier.predict_proba)
        y_score = score_fn(X_test)

        # Generate predictions
        y_pred = classifier.predict(X_test)
        acc = metrics.accuracy_score(y_test, y_pred)

        precision, recall, _ = metrics.precision_recall_curve(
            y_test[:, observed], y_score[:, observed])
        average_precision = metrics.average_precision_score(
            y_test[:, observed], y_score[:, observed])

        ax.step(recall,
                precision,
                where='post',
                color=color,
                alpha=0.65,
                label=f'{name}')

        print(f'Plotted {name}')

    ax.legend(loc="best")
    format_axes_for_chart(ax)
    fig.tight_layout()

    ensure_dir('./output/')
    fig.savefig('./output/precision-recall-curve.pdf',
                dpi=92,
                bbox_inches='tight')
    plt.close(fig)
def load_rutgers_with_quantiles():
    """Parse every Rutgers "with-quantiles" CSV file and return the results.

    Returns:
        A list with one ``parse_rutgers_with_quantiles`` result per CSV file
        matched under the dataset's ``with-quantiles`` directory.
    """
    from glob import glob
    # NOTE(review): the pattern contains no '**', so recursive=True should
    # have no effect here - confirm.
    files = glob('../../featureGenerator/datasets/dataset-2-rutgers-wifi' + '/with-quantiles/*.csv', recursive=True)
    # ``df`` is a file path at this point, not a DataFrame.
    traces = [parse_rutgers_with_quantiles(df) for df in files]
    return traces


# Shuffled, stratified 10-fold cross-validation shared at module level.
cv = model_selection.StratifiedKFold(n_splits=10, shuffle=True)

# Scaling -> random oversampling -> logistic regression.
pipe_logreg = pipeline.Pipeline([
    ('scaler', preprocessing.StandardScaler()),
    ('resample', over_sampling.RandomOverSampler()),
    ('clf', linear_model.LogisticRegression()),
])

# Scaling -> random oversampling -> decision tree.
pipe_dtree = pipeline.Pipeline([
    ('scaler', preprocessing.StandardScaler()),
    ('resample', over_sampling.RandomOverSampler()),
    ('clf', tree.DecisionTreeClassifier()),
])


@memory.cache
def prepare_data():
    dataset = load_rutgers_with_quantiles()
    print('Rutgers loaded ...')
def run_model(args):
    """Train the best hyperopt model, evaluate it and save model + history.

    The best trial is read from the pickled hyperopt trials, the matching
    random-forest or SVM classifier is built behind a seeded random
    oversampler, fitted on the training split and scored on the validation
    split. The fitted pipeline and a one-row CSV with overall accuracy and
    timings are written under ``<args.modeldir>/models``.

    Args:
        args: Namespace providing ``classifier`` (``'RF'`` or ``'SVM'``),
            ``modeldir``, ``trials``, ``ssize`` and ``bestmodel``; it is also
            forwarded to ``prep_data``.

    Raises:
        SystemExit: With code 1 when ``args.classifier`` is not supported.
    """
    classif_type = ['RF', 'SVM']
    if args.classifier not in classif_type:
        print('ERR: select an available classifier (RF, SVM)')
        sys.exit(1)

    X_train, y_train, ids_train = prep_data('train', args)
    X_val, y_val, ids_val = prep_data('val', args)

    STATUS_OK = 'ok'
    trials_path = os.path.join(
        args.modeldir, 'hpt',
        'hyperopt_trials_niters{}_ssize{}.pkl'.format(args.trials,
                                                      args.ssize))
    # NOTE: unpickling executes arbitrary code; the trials file must come
    # from a trusted source. The context manager closes the handle, which
    # the original left open.
    with open(trials_path, 'rb') as trials_file:
        trials = pkl.load(trials_file)

    bestmodel = getBestModelfromTrials(trials.trials, args.bestmodel,
                                       STATUS_OK)
    resampling = over_sampling.RandomOverSampler(sampling_strategy='auto',
                                                 random_state=42)

    if args.classifier == 'RF':
        # NOTE(review): 'auto' is rejected by recent scikit-learn releases -
        # confirm against the installed version.
        if bestmodel['max_features'][0] == 0:
            max_features = 'auto'
        else:
            max_features = 'sqrt'

        # Real booleans: the strings 'True' / 'False' used before are both
        # truthy, so bootstrapping could never be switched off.
        bootstrap = bool(bestmodel['bootstrap'][0] == 0)

        estimator = RandomForestClassifier(
            n_estimators=int(bestmodel['n_estimators'][0]),
            max_features=max_features,
            max_depth=int(bestmodel['max_depth'][0]),
            min_samples_leaf=int(bestmodel['min_samples_leaf'][0]),
            min_samples_split=int(bestmodel['min_samples_split'][0]),
            bootstrap=bootstrap,
            n_jobs=-1,
            verbose=1)
    else:
        # The stored values are indices into these choice lists.
        c_lim = (-2, 7)
        g_lim = (-2, 4)
        C_space = [10**exp for exp in range(*c_lim)]
        gamma_space = [10**exp for exp in range(*g_lim)]
        kernel_space = ['rbf']

        C = C_space[int(bestmodel['C'][0])]
        gamma = gamma_space[int(bestmodel['gamma'][0])]
        kernel = kernel_space[int(bestmodel['kernel'][0])]
        print('Best model using C = {} gamma = {} and kernel {}'.format(
            C, gamma, kernel))
        estimator = SVC(C=C, gamma=gamma, kernel=kernel, verbose=1)

    pl = make_pipeline_imb(resampling, estimator)

    # -- train the classifier
    print('Training with the best model with parameters: ', bestmodel)
    start_train_time = time.time()
    estimator_fit = pl.fit(X_train, y_train)
    train_time = round(time.time() - start_train_time, 2)
    print('Training time (s): ', train_time)

    # -- test the classifier
    start_test_time = time.time()
    test_score = estimator_fit.score(X_val, y_val)
    test_time = round(time.time() - start_test_time, 2)
    print("Test Score: ", test_score)
    print("Time elapsed: ", test_time)

    outdir = os.path.join(args.modeldir, 'models')
    if not os.path.exists(outdir):
        os.makedirs(outdir)

    # -- save the model
    model_file = os.path.join(
        outdir, 'model-{}_bm{}.h5'.format(args.classifier, args.bestmodel))
    joblib.dump(estimator_fit, model_file)
    print("Writing the model over {}".format(model_file))

    # -- save overall accuracy and timings as a one-row table
    eval_label = ['OA', 'train_time', 'test_time']
    history = np.array([[test_score, train_time, test_time]], dtype=float)
    history_file = os.path.join(
        outdir,
        'trainingHistory-{}_bm{}.csv'.format(args.classifier,
                                             args.bestmodel))
    df = pd.DataFrame(history, columns=eval_label)
    df.to_csv(history_file)