def test_sample_regular_with_nn_svm(): """Test sample function with regular SMOTE with a NN object.""" # Create the object kind = 'svm' nn_k = NearestNeighbors(n_neighbors=6) svm = SVC(random_state=RND_SEED) smote = SMOTE( random_state=RND_SEED, kind=kind, k_neighbors=nn_k, svm_estimator=svm) X_resampled, y_resampled = smote.fit_sample(X, Y) X_gt = np.array([[0.11622591, -0.0317206], [0.77481731, 0.60935141], [1.25192108, -0.22367336], [0.53366841, -0.30312976], [1.52091956, -0.49283504], [-0.28162401, -2.10400981], [0.83680821, 1.72827342], [0.3084254, 0.33299982], [0.70472253, -0.73309052], [0.28893132, -0.38761769], [1.15514042, 0.0129463], [0.88407872, 0.35454207], [1.31301027, -0.92648734], [-1.11515198, -0.93689695], [-0.18410027, -0.45194484], [0.9281014, 0.53085498], [-0.14374509, 0.27370049], [-0.41635887, -0.38299653], [0.08711622, 0.93259929], [1.70580611, -0.11219234], [0.47436888, -0.2645749], [1.07844561, -0.19435291], [1.44015515, -1.30621303]]) y_gt = np.array( [0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0]) assert_array_almost_equal(X_resampled, X_gt) assert_array_equal(y_resampled, y_gt)
def test_sample_borderline2(): """Test sample function with borderline 2 SMOTE.""" # Create the object kind = 'borderline2' smote = SMOTE(random_state=RND_SEED, kind=kind) # Fit the data smote.fit(X, Y) X_resampled, y_resampled = smote.fit_sample(X, Y) X_gt = np.array([[0.11622591, -0.0317206], [0.77481731, 0.60935141], [1.25192108, -0.22367336], [0.53366841, -0.30312976], [1.52091956, -0.49283504], [-0.28162401, -2.10400981], [0.83680821, 1.72827342], [0.3084254, 0.33299982], [0.70472253, -0.73309052], [0.28893132, -0.38761769], [1.15514042, 0.0129463], [0.88407872, 0.35454207], [1.31301027, -0.92648734], [-1.11515198, -0.93689695], [-0.18410027, -0.45194484], [0.9281014, 0.53085498], [-0.14374509, 0.27370049], [-0.41635887, -0.38299653], [0.08711622, 0.93259929], [1.70580611, -0.11219234], [0.47436888, -0.2645749], [1.07844561, -0.19435291], [0.33339622, 0.49870937]]) y_gt = np.array( [0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0]) assert_array_almost_equal(X_resampled, X_gt) assert_array_equal(y_resampled, y_gt)
def test_fit_resample_nn_obj(): kind = 'borderline1' nn_m = NearestNeighbors(n_neighbors=11) nn_k = NearestNeighbors(n_neighbors=6) smote = SMOTE( random_state=RND_SEED, kind=kind, k_neighbors=nn_k, m_neighbors=nn_m) X_resampled, y_resampled = smote.fit_resample(X, Y) X_gt = np.array([[0.11622591, -0.0317206], [0.77481731, 0.60935141], [ 1.25192108, -0.22367336 ], [0.53366841, -0.30312976], [1.52091956, -0.49283504], [ -0.28162401, -2.10400981 ], [0.83680821, 1.72827342], [0.3084254, 0.33299982], [ 0.70472253, -0.73309052 ], [0.28893132, -0.38761769], [1.15514042, 0.0129463], [ 0.88407872, 0.35454207 ], [1.31301027, -0.92648734], [-1.11515198, -0.93689695], [ -0.18410027, -0.45194484 ], [0.9281014, 0.53085498], [-0.14374509, 0.27370049], [ -0.41635887, -0.38299653 ], [0.08711622, 0.93259929], [1.70580611, -0.11219234], [0.3765279, -0.2009615], [0.55276636, -0.10550373], [0.45413452, -0.08883319], [1.21118683, -0.22817957]]) y_gt = np.array([ 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0 ]) assert_allclose(X_resampled, X_gt, rtol=R_TOL) assert_array_equal(y_resampled, y_gt)
def test_sample_with_nn_svm(): kind = 'svm' nn_k = NearestNeighbors(n_neighbors=6) svm = SVC(gamma='scale', random_state=RND_SEED) smote = SMOTE( random_state=RND_SEED, kind=kind, k_neighbors=nn_k, svm_estimator=svm) X_resampled, y_resampled = smote.fit_resample(X, Y) X_gt = np.array([[0.11622591, -0.0317206], [0.77481731, 0.60935141], [1.25192108, -0.22367336], [0.53366841, -0.30312976], [1.52091956, -0.49283504], [-0.28162401, -2.10400981], [0.83680821, 1.72827342], [0.3084254, 0.33299982], [0.70472253, -0.73309052], [0.28893132, -0.38761769], [1.15514042, 0.0129463], [0.88407872, 0.35454207], [1.31301027, -0.92648734], [-1.11515198, -0.93689695], [-0.18410027, -0.45194484], [0.9281014, 0.53085498], [-0.14374509, 0.27370049], [-0.41635887, -0.38299653], [0.08711622, 0.93259929], [1.70580611, -0.11219234], [0.47436887, -0.2645749], [1.07844562, -0.19435291], [1.44228238, -1.31256615], [1.25636713, -1.04463226]]) y_gt = np.array([0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0]) assert_allclose(X_resampled, X_gt, rtol=R_TOL) assert_array_equal(y_resampled, y_gt)
def fit(self, X, y=None):
    # 'Random under-sampling'
    # CondensedNearestNeighbour(size_ngh=51, n_seeds_S=51)
    # Accuracy: 0.939693267481
    # Precision: 0.238095238095
    # Recall: 0.897435897436

    # Accuracy: 0.962568234988
    # Precision: 0.324468085106
    # Recall: 0.782051282051

    # SMOTE(ratio=ratio, kind='borderline1')
    # Accuracy: 0.971146347803
    # Precision: 0.372093023256
    # Recall: 0.615384615385

    # SMOTE(ratio=ratio, kind='borderline2')
    # Accuracy: 0.965427605927
    # Precision: 0.333333333333
    # Recall: 0.705128205128

    # svm_args = {'class_weight': 'auto'}
    # svmsmote = SMOTE(ratio=ratio, kind='svm', **svm_args)
    # Accuracy: 0.972186119054
    # Precision: 0.395683453237
    # Recall: 0.705128205128

    smote = SMOTE(ratio='auto', kind='regular')
    X, y = smote.fit_sample(X, y)
    # weights = np.array([1/y.mean() if i == 1 else 1 for i in y])
    return super(RandomForestClassifier, self).fit(X, y)  # , sample_weight=weights)
def train(addr_train, clf, sampling, add_estimators): with open(os.path.join(addr_train, "day_samp_bin.npy"), "r") as file_in: X = smio.load_sparse_csr(file_in) width = np.size(X, 1) X_train = X[:, :width-1] y_train = X[:, width-1] if sampling == "Over": sm = SMOTE(ratio=0.95) X_train, y_train = sm.fit_sample(X_train, y_train) elif sampling == "Under": X_train, y_train = US.undersample(X, 0.01) print "Fitting Model......" clf.n_estimators += add_estimators clf.fit(X_train, y_train) print "Done" if __SAVE_MODEL: model_name = "RF_" + onoff_line + "_" + sampling + "_Model.p" dir_out = os.path.join(addr_train, "Random_Forest_Models") if not os.path.isdir(dir_out): os.mkdir(dir_out) path_out = os.path.join(dir_out, model_name) with open(path_out, "w") as file_out: pickle.dump(clf, file_out) return clf
def get_data(month, day, hour=-1, mode="normal"): if hour != -1: if hour == 24: hour = 0 day += 1 addr_in = os.path.join("/mnt/rips2/2016", str(month).rjust(2, "0"), str(day).rjust(2, "0"), str(hour).rjust(2, "0"), "output_bin.npy") else: addr_in = os.path.join("/mnt/rips2/2016", str(month).rjust(2, "0"), str(day).rjust(2, "0"), "day_samp_newer_bin.npy") with open(addr_in, "r") as file_in: loader = np.load(file_in) data = csr_matrix((loader['data'], loader['indices'], loader['indptr']), shape=loader['shape']).toarray() X = data[:, :-1] y = data[:, -1] if mode == "over": sm = SMOTE(ratio=0.99, verbose=0) X, y = sm.fit_sample(X, y) return X, y
def resample_data(X, y, categorical_lst):
    '''
    up-samples minority class
    '''
    sm = SMOTE(kind='regular')
    X_train_re, y_train_re = sm.fit_sample(X, y)
    # rounding categorical variables
    X_train_re[:, categorical_lst] = np.round(X_train_re[:, categorical_lst])
    return X_train_re, y_train_re
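# Note (illustrative sketch, not from the original code): newer imbalanced-learn
# (>= 0.4) ships SMOTENC, which interpolates only the numerical columns and takes
# the categorical values from neighbours, so the manual rounding above is not
# needed. The toy arrays and the [2] column index below are assumptions.
import numpy as np
from imblearn.over_sampling import SMOTENC

rng = np.random.RandomState(0)
X_toy = np.hstack([rng.randn(100, 2),                  # two numerical features
                   rng.randint(0, 3, size=(100, 1))])  # one categorical feature (column 2)
y_toy = np.array([0] * 90 + [1] * 10)                  # imbalanced binary target

smote_nc = SMOTENC(categorical_features=[2], random_state=0)
X_res_toy, y_res_toy = smote_nc.fit_resample(X_toy, y_toy)
print(X_res_toy.shape, np.bincount(y_res_toy))         # classes balanced, column 2 stays integer-valued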
def test_sample_wrong_X():
    """Test that an error is raised when the X passed at sampling time differs
    from the X used at fitting time."""
    # Create the object
    sm = SMOTE(random_state=RND_SEED)
    sm.fit(X, Y)
    assert_raises(RuntimeError, sm.sample,
                  np.random.random((100, 40)),
                  np.array([0] * 50 + [1] * 50))
def test_sample_regular_wrong_svm(): kind = 'svm' nn_k = NearestNeighbors(n_neighbors=6) svm = 'rnd' smote = SMOTE( random_state=RND_SEED, kind=kind, k_neighbors=nn_k, svm_estimator=svm) with raises(ValueError, match="has to be one of"): smote.fit_sample(X, Y)
def Input_Preparing(Scaled_Input_Data, Surgery_Outcome, N_Feat):
    # Feature Selection
    MIFS = mifs.MutualInformationFeatureSelector(method='JMI', verbose=2, n_features=N_Feat)
    MIFS.fit(Scaled_Input_Data, Surgery_Outcome)
    Selected_Input_Data = Scaled_Input_Data.loc[:, MIFS.support_]

    # Balancing using SMOTE (resample the selected features, not undefined X, y)
    sm = SMOTE(kind='regular')
    Prep_Train_Data, Prep_Surgery_Outcome = sm.fit_sample(Selected_Input_Data, Surgery_Outcome)

    return (Prep_Train_Data, Prep_Surgery_Outcome, MIFS.support_)
def transform(self, fp): fm, train_x, train_y = FeaturePool.to_train_arrays(fp) os = SMOTE(random_state = self.random_state) os_train_x, os_train_y = os.fit_sample(train_x, train_y[:, 0]) os_train_y = os_train_y.reshape((os_train_y.shape[0], 1)) for f in FeaturePool.from_train_arrays(fm, os_train_x, os_train_y): yield Feature.apply_config(f, is_over_sampled=True) for f in fp: if f.split_type == SplitType.TEST: yield f
def SMT(df, target): df1 = df.copy() y = df1.pop('anti_churn') X = df1 Xcols = df1.columns sm = SMOTE(kind='regular', ratio = target) X_resampled, y_resampled = sm.fit_sample(X, y) X_resampled = pd.DataFrame(X_resampled) y_resampled = pd.DataFrame(y_resampled) X_resampled.columns = Xcols y_resampled.columns = ['anti_churn'] return X_resampled, y_resampled
def oversample(self): """Balance class data based on outcome""" print('Current outcome sampling {}'.format(Counter(self.y))) # to use a random sampling seed at random: #ros = RandomOverSampler() ros = SMOTE() #ros = ADASYN() self.X, self.y = ros.fit_sample(self.X, self.y) self.Xview = self.X.view()[:, :self.n_features] print('Resampled dataset shape {}'.format(Counter(self.y)))
def test_smote_fit(): """Test the fitting method""" # Create the object smote = SMOTE(random_state=RND_SEED) # Fit the data smote.fit(X, Y) # Check if the data information have been computed assert_equal(smote.min_c_, 0) assert_equal(smote.maj_c_, 1) assert_equal(smote.stats_c_[0], 8) assert_equal(smote.stats_c_[1], 12)
def test_sample_regular(): """Test sample function with regular SMOTE.""" # Create the object kind = 'regular' smote = SMOTE(random_state=RND_SEED, kind=kind) # Fit the data smote.fit(X, Y) X_resampled, y_resampled = smote.fit_sample(X, Y) currdir = os.path.dirname(os.path.abspath(__file__)) X_gt = np.load(os.path.join(currdir, 'data', 'smote_reg_x.npy')) y_gt = np.load(os.path.join(currdir, 'data', 'smote_reg_y.npy')) assert_array_equal(X_resampled, X_gt) assert_array_equal(y_resampled, y_gt)
def train(cutoffs): print "\n========== Start Training ==========" if __DATA_FROM == 2: list_io_addr = get_io_addr(__TRAIN_DATA[0], __TRAIN_DATA[1]) else: list_io_addr = get_io_addr_random_sample(__TRAIN_DATA[0], __TRAIN_DATA[1]) clf = BernoulliNB(class_prior=[0.05, 0.95]) if __IF_TRAIN_WITHOUT_SAVE: print "Performing correlation explanation......" with open("/home/wlu/Desktop/day_samp_bin_1-2.npy", "r") as file_in: X = Sparse_Matrix_IO.load_sparse_csr(file_in) if len(cutoffs) > 0: X = discard_vars(X, cutoffs) layer = correlation_ex(X) for i in range(0, len(list_io_addr)): path_in = list_io_addr[i] print "\nGenerating training set from {}".format(path_in) with open(path_in, "r") as file_in: X = Sparse_Matrix_IO.load_sparse_csr(file_in) if len(cutoffs) > 0: X = discard_vars(X, cutoffs) vector_len = len(X[0]) X_train = X[:, 0:vector_len-1] y_train = X[:, vector_len-1] if __IF_TRAIN_WITHOUT_SAVE: print "Transforming training set according to CorEx......" X_train = corex_transform(layer, X_train) sm = SMOTE(ratio=0.95) X_train, y_train = sm.fit_sample(X_train, y_train) print "Fitting Model......" clf.partial_fit(X_train, y_train, classes=[0, 1]) print "Done" if __IF_TRAIN_WITHOUT_SAVE: return [clf, layer] else: with open(__ROOT_MODEL, "w") as file_out: pickle.dump(clf, file_out) return []
def get_data(ratio, sampling): list_io_addr = get_io_addr() data = [] for addr_in in list_io_addr: with open(addr_in, "r") as file_in: X = smio.load_sparse_csr(file_in) data.extend(X) data = np.array(data) n = 30000 if sampling == "Over": m = int(np.size(data, 1)) k = int(0.8*n) X = data[:n, :m-1] y = data[:n, m-1:] X_train = X[:k, :] y_train = y[:k] sm = SMOTE(ratio=ratio) X_train, y_train = sm.fit_sample(X_train, column_or_1d(y_train, warn=False)) X_test = X[k:, :] y_test = y[k:] elif sampling == "None": m = int(np.size(data, 1)) k = int(0.8*n) X = data[:n, :m-1] y = data[:n, m-1:].ravel() X_train = X[:k, :] y_train = y[:k] X_test = X[k:, :] y_test = y[k:] else: m = int(np.size(data, 1)) k = int(0.2*np.size(data, 0)) data_test = data[k:, :] data = data[:k, :] data = US.undersample(data, ratio) k = int(0.8*np.size(data, 0)) if np.size(data_test, 0) > k: data_test = data[:k, :] X_train = data[:, :m-1] y_train = data[:, m-1:].ravel() X_test = data_test[:, :m-1] y_test = data_test[:, m-1:].ravel() return X_train, y_train, X_test, y_test
def clf_extratree_predictor(item): (clf_args,idx,X,y,use_SMOTE) = item train_index, test_index = idx clf = sklearn.ensemble.ExtraTreesClassifier(**clf_args) X_train, X_test = X[train_index], X[test_index] y_train, y_test = y[train_index], y[test_index] if use_SMOTE: sampler = SMOTE(ratio='auto', kind='regular') X_train, y_train = sampler.fit_sample(X_train,y_train) clf.fit(X_train,y_train) pred = clf.predict(X_test) pred_proba = clf.predict_proba(X_test) return idx,pred,pred_proba
def train_and_test_dnn(args): for a in args: print(a) primitive = args[1] res = pickle.load(open(sys.argv[2], "rb" )) notes_with_truth_labels_for_query_primitives = pd.read_csv(args[3]) dl_results = pd.DataFrame(columns = ['primitive', 'avg_fit_time', 'avg_score_time', 'avg_score']) X = get_doc_term_matrix(res) y = notes_with_truth_labels_for_query_primitives.loc[:, primitive] clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(128, 5, 2), random_state=1) try: sm = SMOTE(random_state=357) X_sm, y_sm = sm.fit_sample(X, y) except ValueError: print("value error, smote") X_sm = X y_sm = y cv_results = cross_validate(clf, X_sm, y_sm, cv=3, return_train_score=False) print(cv_results) dump(clf, './models/{}_trained_dnn.joblib'.format(primitive)) dl_results.loc[0, 'primitive'] = primitive dl_results.loc[0, 'avg_fit_time'] = np.mean(cv_results['fit_time']) dl_results.loc[0, 'avg_score_time'] = np.mean(cv_results['score_time']) dl_results.loc[0, 'avg_test_score'] = np.mean(cv_results['test_score']) with open(args[4], 'a') as f: f.write("{}, {}, {}, {}\n".format(dl_results.loc[0,'primitive'], dl_results.loc[0,'avg_fit_time'], dl_results.loc[0,'avg_score_time'], dl_results.loc[0,'avg_test_score'])) #f.write(dl_results.loc[0,:]) #f.write("\n") f.close() print("DONE w/ {}".format(primitive))
def run_save_model(save_folder, spec, model_no, X_train, y_train, model_fn): kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=2) cvscores = [] f1scores = [] for train, val in kfold.split(X_train, y_train): # create model using the model_fn parameter model = model_fn(spec, X_train) if model == None: return # returns if there was a mistake in specifications # fit model to k-split of training data num_examples, dx, dy = X_train[train].shape X_resampled, y_resampled = SMOTE(kind='borderline1', random_state=1).fit_sample( X_train[train].reshape((num_examples, dx * dy)), y_train[train]) num_total_examples, _ = X_resampled.shape X_resampled_reshaped = X_resampled.reshape(num_total_examples, dx, dy) model.fit(x=X_resampled_reshaped, y=y_resampled, epochs=10, batch_size=16, verbose=0) # evaluate model scores = model.evaluate(X_train[val], y_train[val], verbose=0) print('Accuracy: {}%'.format(scores[1] * 100)) cvscores.append(scores[1]) # get f1 f1 = f1_score(y_train[val], model.predict(X_train[val]) > 0.5) print('F1 score: {}'.format(f1)) f1scores.append(f1) mean_acc = 'Mean Accuracy: {}% +/- {}%'.format(np.mean(cvscores) * 100, np.std(cvscores) * 100) mean_f1 = 'Mean F1 score: {} +/- {}'.format(np.mean(f1scores), np.std(f1scores)) print(mean_acc) print(mean_f1) # modelfile = save_folder + 'model' + str(model_no) + '.h5' # save_model(model, modelfile) # print('model saved') txtfile = save_folder + 'model' + str(model_no) + '.txt' with open(txtfile, 'w') as f: f.write(mean_acc) f.write(mean_f1) f.write('\n') f.write('\n') f.writelines(spec) print('specs saved')
def test_sample_regular_half(): ratio = {0: 9, 1: 12} kind = 'regular' smote = SMOTE(ratio=ratio, random_state=RND_SEED, kind=kind) X_resampled, y_resampled = smote.fit_sample(X, Y) X_gt = np.array([[0.11622591, -0.0317206], [0.77481731, 0.60935141], [1.25192108, -0.22367336], [0.53366841, -0.30312976], [1.52091956, -0.49283504], [-0.28162401, -2.10400981], [0.83680821, 1.72827342], [0.3084254, 0.33299982], [0.70472253, -0.73309052], [0.28893132, -0.38761769], [1.15514042, 0.0129463], [0.88407872, 0.35454207], [1.31301027, -0.92648734], [-1.11515198, -0.93689695], [-0.18410027, -0.45194484], [0.9281014, 0.53085498], [-0.14374509, 0.27370049], [-0.41635887, -0.38299653], [0.08711622, 0.93259929], [1.70580611, -0.11219234], [0.36784496, -0.1953161]]) y_gt = np.array( [0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0]) assert_allclose(X_resampled, X_gt, rtol=R_TOL) assert_array_equal(y_resampled, y_gt)
def train_decisiontree_with(configurationname, train_data, k, score_function, undersam=False, oversam=False, export=False): assert k > 0 print("Training with configuration " + configurationname) X_train, y_train, id_to_a_train = train_data dtc = DecisionTreeClassifier(random_state=0) print("Feature Selection") # selector = SelectFpr(score_function) selector = SelectKBest(score_function, k=k) result = selector.fit(X_train, y_train) X_train = selector.transform(X_train) fitted_ids = [i for i in result.get_support(indices=True)] print("Apply Resampling") print(Counter(y_train)) if undersam and not oversam: renn = RepeatedEditedNearestNeighbours() X_train, y_train = renn.fit_resample(X_train, y_train) if oversam and not undersam: # feature_indices_array = list(range(len(f_to_id))) # smote_nc = SMOTENC(categorical_features=feature_indices_array, random_state=0) # X_train, y_train = smote_nc.fit_resample(X_train, y_train) sm = SMOTE(random_state=42) X_train, y_train = sm.fit_resample(X_train, y_train) if oversam and undersam: smote_enn = SMOTEENN(random_state=0) X_train, y_train = smote_enn.fit_resample(X_train, y_train) print(Counter(y_train)) print("Train Classifier") dtc = dtc.fit(X_train, y_train, check_input=True) if export: export_graphviz(dtc, out_file=DATAP + "/temp/trees/sltree_" + configurationname + ".dot", filled=True) transform(fitted_ids, configurationname) print("Self Accuracy: " + str(dtc.score(X_train, y_train))) return selector, dtc
def DataFormat(data): Data = smio.load_sparse_csr(data) m = int(np.size(Data, 1)) n = int(np.size(Data, 0)) X_train = Data[:50000, :m-1] y_train = Data[:50000, m-1] sm = SMOTE(ratio=0.95) X_train, y_train = sm.fit_sample(X_train, y_train) data_new = [] for i in range(np.size(X_train, 0)): row = list(X_train[i].tolist()) row.append(y_train[i]) data_new.append(row) shuffle(data_new) data_new = np.array(data_new) m = int(np.size(data_new, 1)) X_train = data_new[:, :m-1] y_train = data_new[:, m-1] K = np.count_nonzero(y_train) # Number of good data points return X_train, y_train, n, K # Training set plus some numbers useful for weighting
def test_sample_regular_with_nn(): kind = 'regular' nn_k = NearestNeighbors(n_neighbors=6) smote = SMOTE(random_state=RND_SEED, kind=kind, k_neighbors=nn_k) X_resampled, y_resampled = smote.fit_sample(X, Y) X_gt = np.array([[0.11622591, -0.0317206], [0.77481731, 0.60935141], [1.25192108, -0.22367336], [0.53366841, -0.30312976], [1.52091956, -0.49283504], [-0.28162401, -2.10400981], [0.83680821, 1.72827342], [0.3084254, 0.33299982], [0.70472253, -0.73309052], [0.28893132, -0.38761769], [1.15514042, 0.0129463], [0.88407872, 0.35454207], [1.31301027, -0.92648734], [-1.11515198, -0.93689695], [-0.18410027, -0.45194484], [0.9281014, 0.53085498], [-0.14374509, 0.27370049], [-0.41635887, -0.38299653], [0.08711622, 0.93259929], [1.70580611, -0.11219234], [0.29307743, -0.14670439], [0.84976473, -0.15570176], [0.61319159, -0.11571668], [0.66052536, -0.28246517]]) y_gt = np.array([ 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0 ]) assert_allclose(X_resampled, X_gt, rtol=R_TOL) assert_array_equal(y_resampled, y_gt)
def oversample(X, y, bal_strategy):

    if(bal_strategy == "SMOTESVN" or bal_strategy == "ALL"):
        # Apply SMOTE SVM
        sm = SMOTE(kind='svm')
        X_sampled, y_sampled = sm.fit_sample(X, y)

        print 'Shape of X_sampled: ', X_sampled.shape
        print 'Shape of y_sampled: ', y_sampled.shape

    elif(bal_strategy == "SMOTE" or bal_strategy == "ALL"):
        # Apply regular SMOTE
        sm = SMOTE(kind='regular')
        X_sampled, y_sampled = sm.fit_sample(X, y)

        print 'Shape of X_sampled: ', X_sampled.shape
        print 'Shape of y_sampled: ', y_sampled.shape

    elif(bal_strategy == "ADASYN" or bal_strategy == "ALL"):
        # Apply the random over-sampling
        ada = ADASYN()
        X_sampled, y_sampled = ada.fit_sample(X, y)

        print 'Shape of X_sampled: ', X_sampled.shape
        print 'Shape of y_sampled: ', y_sampled.shape

    elif(bal_strategy == 'NONE'):
        X_sampled = X
        y_sampled = y

        print 'Shape of X_sampled: ', X_sampled.shape
        print 'Shape of y_sampled: ', y_sampled.shape

    else:
        print 'bal_strategy not in SMOTESVN, SMOTE, ADASYN, ALL, NONE'
        sys.exit(1)

    return (X_sampled, y_sampled)
def get(addr_day, mode="normal", ratio=-1, sampling_method="None", bin=False): if "res" in mode: res_ratio = mode.split("-")[1] prefix = "day_samp_res" suffix = "_{}.npy".format(res_ratio) res = "Reservoir_Data" else: prefix = "day_samp_new" suffix = ".npy" res = "" if not ratio == -1: n = 100000 neg = int(n / (1+ratio)) pos = n - neg with open(os.path.join(addr_day, "PosNeg", res, prefix + "_neg" + suffix), "r") as file_neg: matrix_neg = smio.load_sparse_csr(file_neg) matrix_neg = matrix_neg[:neg, :] with open(os.path.join(addr_day, "PosNeg", res, prefix + "_pos" + suffix), "r") as file_pos: matrix_pos = smio.load_sparse_csr(file_pos) matrix_pos = matrix_pos[:pos, :] matrix = vstack((matrix_neg, matrix_pos)) np.random.shuffle(matrix) else: with open(os.path.join(addr_day, res, prefix + suffix), "r") as file_in: matrix = smio.load_sparse_csr(file_in) width = np.size(matrix, 1) X = matrix[:, :width-1] y = matrix[:, width-1] if "Over" in sampling_method: sm = SMOTE(ratio=0.95) X, y = sm.fit_sample(X, y) return X, y
def test_wrong_nn(): kind = 'borderline1' nn_m = 'rnd' nn_k = NearestNeighbors(n_neighbors=6) smote = SMOTE( random_state=RND_SEED, kind=kind, k_neighbors=nn_k, m_neighbors=nn_m) with raises(ValueError, match="has to be one of"): smote.fit_sample(X, Y) nn_k = 'rnd' nn_m = NearestNeighbors(n_neighbors=10) smote = SMOTE( random_state=RND_SEED, kind=kind, k_neighbors=nn_k, m_neighbors=nn_m) with raises(ValueError, match="has to be one of"): smote.fit_sample(X, Y) kind = 'regular' nn_k = 'rnd' smote = SMOTE(random_state=RND_SEED, kind=kind, k_neighbors=nn_k) with raises(ValueError, match="has to be one of"): smote.fit_sample(X, Y)
y_pred = rf.predict(X_test)

# calculate the accuracy score
score = accuracy_score(y_test, y_pred)

# calculate the precision
precision = precision_score(y_test, y_pred)

# display 'score' and 'precision'


# --------------
# import packages
from imblearn.over_sampling import SMOTE

# Instantiate smote
smote = SMOTE()

# fit_sample on training data
X_train, y_train = smote.fit_sample(X_train, y_train)

# fit model on training data
rf.fit(X_train, y_train)

# predict on test data
y_pred = rf.predict(X_test)

# calculate the accuracy score
score = accuracy_score(y_test, y_pred)

# calculate the precision
precision = precision_score(y_test, y_pred)
class Predict_football(): def __init__(self, raw_data, model): plt.rc("font", family="Malgun Gothic") plt.rcParams['axes.unicode_minus'] = False plt.figure(figsize=(10, 10)) self.data = raw_data self.scaler = StandardScaler() self.sampler = SMOTE(random_state=42) self.pca = PCA(random_state=42) self.add = 0.1 self.skip = [] self.columns = None self.clf = model self.data["2HTHG"] = self.data["FTHG"] - self.data["HTHG"] self.data["2HTAG"] = self.data["FTAG"] - self.data["HTAG"] self.data["FGD"] = self.data["FTHG"] - self.data["FTAG"] self.data["2HGD"] = self.data["2HTHG"] - self.data["2HTAG"] self.data["HGD"] = self.data["HTHG"] - self.data["HTAG"] self.data["SD"] = self.data["HS"] - self.data["AS"] self.data["STD"] = self.data["HST"] - self.data["AST"] self.data["Pezzali"] = (self.data["FTHG"] + self.add) / ( self.data["HS"] + self.add) * (self.data["AS"] + self.add) / ( self.data["FTAG"] + self.add) self.COL = [ "FTHG", "2HTHG", "HTHG", "HS", "HST", "FTAG", "2HTAG", "HTAG", "AS", "AST" ] self.r_COL = ["FGD", "2HGD", "HGD", "SD", "STD", "Pezzali"] self.df = self.data.copy() # ------------------ ANOVA ------------------ def ANOVA(self): df = self.data.drop(columns=["Div", "HomeTeam", "AwayTeam", "RESULT"]) df = df.iloc[:train_index, :] scaled_train = self.scaler.fit_transform(df) scaled_train = pd.DataFrame(scaled_train, columns=df.columns) df = self.sampler.fit_resample(scaled_train, self.data.loc[:train_index, "RESULT"]) df = pd.concat([df[0], df[1]], axis=1) fstat, p_val = f_oneway(df.loc[df["RESULT"] == 0, df.columns[:-1]], df.loc[df["RESULT"] == 1, df.columns[:-1]], df.loc[df["RESULT"] == 2, df.columns[:-1]]) print(p_val) print(df.columns[:-1][p_val > 0.05]) # ------------------ Post-hoc ------------------ def PH(self): df = self.data.drop(columns=["Div", "HomeTeam", "AwayTeam", "RESULT"]) for i in df.columns: posthoc = pairwise_tukeyhsd(self.data.iloc[:, [i]], self.data["RESULT"], alpha=0.05) plt.figure(figsize=(10, 10)) posthoc.plot_simultaneous() plt.title("{}".format(self.data.columns[i])) plt.show() # ------------------ plot data pdf(probability density function) ------------------ def plot(self): res = ["패", "무", "승"] color = ["r", "g", "b"] y_Max = 0 x_Max = 0 x_Min = 100 for col in self.data.columns[3:-1]: for i in range(3): values = self.data[self.data["RESULT"] == i][col].value_counts().values / \ self.data[self.data["RESULT"] == i][col].shape[0] y_Max = max(y_Max, values.max()) x_Max = max(x_Max, self.data[self.data["RESULT"] == i][col].max()) x_Min = min(x_Min, self.data[self.data["RESULT"] == i][col].min()) fig, axes = plt.subplots(1, 3, figsize=(15, 8)) for i, (r, ax, c) in enumerate(zip(res, axes, color)): print(self.data[self.data["RESULT"] == i][col].value_counts()) index = self.data[self.data["RESULT"] == i][col].value_counts().index.tolist() values = self.data[self.data["RESULT"] == i][col].value_counts().values / \ self.data[self.data["RESULT"] == i][col].shape[0] ax.bar(index, values, color=c, label="{}".format(r)) ax.set_xlabel("{}".format(col)) ax.set_ylabel("bins") ax.set_xlim(x_Min, x_Max) ax.set_ylim(0, y_Max) ax.legend() plt.show() # ------------------ 해당 경기 홈팀과 원정팀의 이전 5경기 맞대결 데이터들의 평균 값 ------------------ def H2H(self, home, away, index, ratio=1): selected_df = self.data[self.data.index < index] record = selected_df[((selected_df['HomeTeam'] == home) & (selected_df['AwayTeam'] == away)) | ((selected_df['HomeTeam'] == away) & (selected_df['AwayTeam'] == home))].copy() # 승리와 패배의 경우 현재 경기의 홈팀이 과거에 원정에서 치른 경기의 HTR과 RESULT를 홈과 바꿈 (계산 편리) 
record.loc[(record['AwayTeam'] == home) & (record['HTR'] != 1), ['HTR']] = \ 3 - record.loc[(record['AwayTeam'] == home) & (record['HTR'] != 1), ['HTR']] record.loc[(record['AwayTeam'] == home) & (record['RESULT'] != 1), ['RESULT']] = \ 2 - record.loc[(record['AwayTeam'] == home) & (record['RESULT'] != 1), ['RESULT']] # 현재 경기의 홈팀이 과거에 원정에서 치른 경기의 feature들을 홈팀기준으로 변경, Pezzali는 -가 아닌 역수를 취함 record.loc[record["AwayTeam"] == home, self.r_COL[:-1]] = -record.loc[record["AwayTeam"] == home, self.r_COL[:-1]] record.loc[record["AwayTeam"] == home, ["Pezzali"]] = 1 / record.loc[record["AwayTeam"] == home, ["Pezzali"]] temp = record.loc[record["AwayTeam"] == home, self.COL[:5]].values record.loc[record["AwayTeam"] == home, self.COL[:5]] = record.loc[record["AwayTeam"] == home, self.COL[5:]].values record.loc[record["AwayTeam"] == home, self.COL[5:]] = temp if record.shape[0] == 0: self.skip.append(index) return div = 0 if record.shape[0] >= 5: record = record[-5:] INDEX = record["RESULT"].value_counts().index VALUES = record["RESULT"].value_counts().values for idx, val in zip(INDEX, VALUES): record.loc[record["RESULT"] == idx, self.r_COL] = record.loc[record["RESULT"] == idx, self.r_COL] * val record.loc[record["RESULT"] == idx, self.COL] = record.loc[record["RESULT"] == idx, self.COL] * val div += val**2 # 다른 방식과 혼합해서 사용할 경우 비율을 조정 self.df.loc[[index], self.r_COL] = record[self.r_COL].sum(axis=0).values * ratio self.df.loc[[index], self.COL] = record[self.COL].sum(axis=0).values * ratio # self.df.loc[[index], self.r_self.COL] = record[self.r_self.COL].ewm(span=record.shape[0], adjust=True).mean().sum().values * ratio # self.df.loc[[index], self.COL] = record[self.COL].ewm(span=record.shape[0], adjust=True).mean().mean().values * ratio self.df.loc[[index], ["HTR"]] = np.ravel(record["HTR"].mean(axis=0)) * ratio # ------------------ 해당 경기 홈팀의 이전 5경기 데이터 평균 값 - 해당 경기 원정팀의 이전 5경기 데이터 평균 값 ------------------ def Last_5(self, home, away, index, ratio=0.2): selected_df = self.data[self.data.index < index] home_record = selected_df[((selected_df['HomeTeam'] == home) | (selected_df['AwayTeam'] == home))].copy() away_record = selected_df[((selected_df['HomeTeam'] == away) | (selected_df['AwayTeam'] == away))].copy() home_record["RESULT"].replace(2, 3, inplace=True) away_record["RESULT"].replace(2, 3, inplace=True) # 현재 경기의 홈팀이 과거에 원정에서 치른 경기의 HTR과 RESULT를 홈과 바꿈 home_record.loc[(home_record['AwayTeam'] == home) & (home_record['HTR'] != 1), ['HTR']] = \ 3 - home_record.loc[(home_record['AwayTeam'] == home) & (home_record['HTR'] != 1), ['HTR']] home_record.loc[(home_record['AwayTeam'] == home) & (home_record['RESULT'] != 1), ['RESULT']] = \ 3 - home_record.loc[(home_record['AwayTeam'] == home) & (home_record['RESULT'] != 1), ['RESULT']] home_record.loc[home_record["AwayTeam"] == home, ["Pezzali"]] = 1 / home_record.loc[ home_record["AwayTeam"] == home, ["Pezzali"]] # 현재 경기의 원정팀이 과거에 원정에서 치른 경기의 HTR과 RESULT를 홈과 바꿈 away_record.loc[(away_record['AwayTeam'] == away) & (away_record['HTR'] != 1), ['HTR']] = \ 3 - away_record.loc[(away_record['AwayTeam'] == away) & (away_record['HTR'] != 1), ['HTR']] away_record.loc[(away_record['AwayTeam'] == away) & (away_record['RESULT'] != 1), ['RESULT']] = \ 3 - away_record.loc[(away_record['AwayTeam'] == away) & (away_record['RESULT'] != 1), ['RESULT']] if index in self.skip: self.df.loc[[index], ["HTR"] + self.r_COL] = 0 ratio = 0.5 # 이전 10 경기 획득 승점 if home_record.shape[0] >= 10: home_record = home_record[-10:] # 2부리그 경기에 대해 0.8의 가중치 home_record.loc[home_record["Div"] == "E1", 
"RESULT"] *= 0.8 self.df.loc[[index], "HP"] = home_record["RESULT"].sum(axis=0) if away_record.shape[0] >= 10: away_record = away_record[-10:] away_record.loc[away_record["Div"] == "E1", "RESULT"] *= 0.8 self.df.loc[[index], "AP"] = away_record["RESULT"].sum(axis=0) if home_record.shape[0] >= 5: home_record = home_record[-5:] if away_record.shape[0] >= 5: away_record = away_record[-5:] # 현재 경기의 홈팀이 과거에 원정에서 치른 경기의 피처들을 홈을 기준으로 변환 home_record.loc[home_record["AwayTeam"] == home, self.r_COL[:-1]] = -home_record.loc[ home_record["AwayTeam"] == home, self.r_COL[:-1]] home_record.loc[home_record["AwayTeam"] == home, ["Pezzali"]] = 1 / home_record.loc[ home_record["AwayTeam"] == home, ["Pezzali"]] INDEX = home_record["RESULT"].value_counts().index VALUES = home_record["RESULT"].value_counts().values for idx, val in zip(INDEX, VALUES): home_record.loc[home_record["RESULT"] == idx, self.r_COL] *= val # df.loc[[index], self.r_COL + ["HTR"]] += home_record[self.r_COL + ["HTR"]].ewm( # span=home_record.shape[0]).mean().mean().values self.df.loc[[index], self.r_COL] += home_record[self.r_COL].sum( axis=0).values * ratio H_HTR = np.ravel(home_record["HTR"].mean(axis=0)) # 현재 경기의 원정팀이 과거에 원정에서 치른 경기의 피처들을 홈을 기준으로 변환 away_record.loc[away_record["AwayTeam"] == away, self.r_COL[:-1]] = -away_record.loc[ away_record["AwayTeam"] == away, self.r_COL[:-1]] away_record.loc[away_record["AwayTeam"] == away, ["Pezzali"]] = 1 / away_record.loc[ away_record["AwayTeam"] == away, ["Pezzali"]] INDEX2 = away_record["RESULT"].value_counts().index VALUES2 = away_record["RESULT"].value_counts().values for idx, val in zip(INDEX2, VALUES2): away_record.loc[away_record["RESULT"] == idx, self.r_COL] *= val # self.df.loc[[index], self.r_COL + ["HTR"]] += away_record[self.r_COL + ["HTR"]].ewm( # span=away_record.shape[0]).mean().mean().values self.df.loc[[index], self.r_COL] -= away_record[self.r_COL].sum( axis=0).values * ratio A_HTR = np.ravel(away_record["HTR"].mean(axis=0)) self.df.loc[[index], ["HTR"]] += (H_HTR - A_HTR + 3) / 2 * ratio # ------------------ 해당 경기 홈팀의 이전 홈 5경기 데이터 평균 값 - 해당 경기 원정팀이 이전 원정 5경기 데이터 평균 값 ------------------ def Last_5_GF_GA(self, home, away, index, ratio=0.2): selected_df = self.data[self.data.index < index].copy() home_record = selected_df[selected_df['HomeTeam'] == home].copy() away_record = selected_df[selected_df['AwayTeam'] == away].copy() # 승점 등의 기준으 홈팀으로 설정되어 있으므로 원정팀에 해당하는 형식으로 바꿈 away_record.loc[:, ["Pezzali"]] = 1 / away_record["Pezzali"] away_record.loc[away_record['HTR'] != 1, ['HTR']] = \ 3 - away_record.loc[away_record['HTR'] != 1, ['HTR']] away_record.loc[away_record['RESULT'] != 1, ['RESULT']] = \ 2 - away_record.loc[away_record['RESULT'] != 1, ['RESULT']] if home_record.shape[0] == 0 & away_record.shape[0] == 0: return if home_record.shape[0] >= 5: home_record = home_record[-5:] if away_record.shape[0] >= 5: away_record = away_record[-5:] # 해당 방식을 단독으로 사용할 경우 # df.loc[[index], self.r_self.COL + ['HTR']] = 0 # 현재 경기가 상대전적 데이터가 없는 팀간의 경기일 경우 if index in self.skip: self.df.loc[[index], ["HTR"] + self.r_COL] = 0 ratio = 1 INDEX = home_record["RESULT"].value_counts().index VALUES = home_record["RESULT"].value_counts().values # 빈도에 따른 가중치 부여 for idx, val in zip(INDEX, VALUES): home_record.loc[home_record["RESULT"] == idx, self.COL[:5] + ["Pezzali"]] *= val # df.loc[[index], self.r_self.COL] += home_record[self.COL[:5] + ["Pezzali"]].ewm( # span=home_record.shape[0]).mean().sum().values * ratio self.df.loc[[index], self.r_COL] += home_record[self.COL[:5] + ["Pezzali"]].sum( 
axis=0).values * ratio self.df.loc[[index], self.COL[:5]] += home_record[self.COL[:5]].sum( axis=0).values INDEX2 = away_record["RESULT"].value_counts().index VALUES2 = away_record["RESULT"].value_counts().values for idx, val in zip(INDEX2, VALUES2): away_record.loc[away_record["RESULT"] == idx, self.COL[5:] + ["Pezzali"]] *= val # self.df.loc[[index], self.r_COL] -= away_record[self.COL[:5] + ["Pezzali"]].ewm( # span=away_record.shape[0]).mean().sum().values * ratio self.df.loc[[index], self.r_COL] -= away_record[self.COL[5:] + ["Pezzali"]].sum( axis=0).values * ratio self.df.loc[[index], self.COL[:5]] += away_record[self.COL[5:]].sum( axis=0).values H_HTR = home_record[["HTR"]].mean(axis=0) A_HTR = away_record["HTR"].mean(axis=0) # 0 ~ 3으로 정규화 val = (H_HTR - A_HTR + 3) / 2 * ratio self.df.loc[[index], "HTR"] += np.ravel(val) def remove_draw(self): # 무승부 데이터 제거 self.df = self.df[self.df.RESULT != 1] def train_test_split(self): train = self.df.loc[self.df.index < train_index, ["HTR"] + self.r_COL] train_label = self.df.loc[self.df.index < train_index, ["RESULT"]] test = self.df.loc[self.df.index >= train_index, ["HTR"] + self.r_COL] test_label = self.df.loc[self.df.index >= train_index, ["RESULT"]] self.columns = train.columns print("------------------ trainset example ------------------\n", train.head(20)) print("------------------ testset example ------------------\n", test.head(20)) print("------------------ train_label counts ------------------\n", train_label.value_counts()) print("------------------ test_label counts ------------------\n", test_label.value_counts()) train = self.scaler.fit_transform(train) test = self.scaler.transform(test) return train, test, train_label, test_label def corr(self): sns.heatmap(data=self.df.corr(), annot=True, fmt=".2f") plt.savefig("corr.jpg") sns.pairplot(self.df, height=3, hue="RESULT") plt.savefig("pairplot.jpg") # ------------------ oversampling ------------------ def oversampling(self, data, label, n=5): self.sampler.k_neighbors = n resampled_data, resampled_label = self.sampler.fit_resample( data, label) return resampled_data, resampled_label def Train(self, data, label): self.clf.fit(data, np.ravel(label)) def prediction(self, data, label): print(classification_report(np.ravel(label), self.clf.predict(data))) print(confusion_matrix(np.ravel(label), self.clf.predict(data))) print("{}%".format( np.round(self.clf.score(data, np.ravel(label)) * 100, 3))) # ------------------ PCA ------------------ def D_red(self, data1, data2, n=2): self.pca.n_components = n pca_train = self.pca.fit_transform(data1) pca_test = self.pca.transform(data2) print(np.round(self.pca.explained_variance_, 3)) print(np.round(self.pca.explained_variance_ratio_, 3)) return pca_train, pca_test def plot_dist(self, data, label, db=False): plt.xlim(data[:, 0].min(), data[:, 0].max() + 1) plt.ylim(data[:, 1].min(), data[:, 1].max() + 1) mglearn.discrete_scatter(data[:, 0], data[:, 1], np.ravel(label.values.reshape(1, -1)).astype( np.int32), alpha=0.7) plt.legend(["패", "무", "승"]) if db: mglearn.plots.plot_2d_classification(self.clf, data, fill=True, alpha=.7) plt.show()
# fit vec_final = TfidfVectorizer(max_df=.5, min_df=5, lowercase=False, ngram_range=(1, 1)) dtm = vec_final.fit_transform([t for t in df['raw']]) logreg = LogisticRegression(max_iter=1000) start_time = time.time() # time execution for comparison with wordfish sm = SMOTE(random_state=42, sampling_strategy={ 'AfD': len(df[df['party'] == 'CDU/CSU']) * 5, 'SPD': len(df[df['party'] == 'CDU/CSU']), 'CDU/CSU': len(df[df['party'] == 'CDU/CSU']), 'FDP': len(df[df['party'] == 'CDU/CSU']), 'GRUENE': len(df[df['party'] == 'CDU/CSU']), 'PDS/LINKE': len(df[df['party'] == 'CDU/CSU']) }) X_final, y_res_pt = sm.fit_resample(dtm, df['party']) y_res = [t == 'AfD' for t in y_res_pt] logreg.fit(X_final, y_res) # predict pred = logreg.predict_proba(dtm) l_pred = [] for p in pred: l_pred.append(p[1]) end_time = time.time()
def smote_oversample(self, X, y): sm = SMOTE(n_jobs=2) heart_signal_res, labels_res = sm.fit_sample(X, y) heart_signal_res = np.reshape(heart_signal_res, (heart_signal_res.shape[0],)) return heart_signal_res, labels_res
# +
from sklearn.model_selection import cross_val_predict

y_train_pred = cross_val_predict(sgd_clf, X_train, y_train, cv=10)
# -

confusion_matrices(y_train, y_train_pred)

# # this is terrible! - will SMOTE help?

# +
from sklearn.model_selection import KFold
from imblearn.over_sampling import SMOTE

smt = SMOTE()

def KFold_SMOTE_model_scores(X_df, y, model):
    scores = []
    cv = KFold(n_splits=5, random_state=42, shuffle=True)
    # need to reset the indices as the split indices from cv.split are positional
    X_df = X_df.reset_index(drop=True)
    y = y.reset_index(drop=True)
    # this will shuffle through 5 different training and validation data splits
    for train_index, val_index in cv.split(X_df):
        X_train = X_df.loc[train_index]
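# Note (illustrative sketch, not from the original notebook): the same "resample
# only the training folds" logic can be expressed with imblearn's Pipeline, which
# calls fit_resample on the training portion of each fold and leaves the
# validation fold untouched. The toy data and SGDClassifier are assumptions.
from sklearn.datasets import make_classification
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline

X_cv, y_cv = make_classification(n_samples=500, weights=[0.9, 0.1], random_state=42)
pipe = Pipeline([("smote", SMOTE(random_state=42)),
                 ("model", SGDClassifier(random_state=42))])
cv_demo = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
print(cross_val_score(pipe, X_cv, y_cv, cv=cv_demo, scoring="f1").mean())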
# #### Cost Sensitive Learning
# ### Synthetic data generation looks more suitable: it is less prone to overfitting and there is no loss of data

# In[73]:

imbalance_train = churn = (sum(y_train['churn']) / len(y_train['churn'].index)) * 100
print("Telecom train dataset Imbalance before smote: {}".format(imbalance_train))

# In[74]:

# sampling_strategy: 'auto', which is equivalent to 'not majority', i.e. oversampling all classes except the majority
# kind: regular
smote = SMOTE(kind="regular")
X_train_balanced, y_train_balanced = smote.fit_sample(X_train, y_train)
churn_percentage = (sum(y_train_balanced) / len(y_train_balanced)) * 100
print("X train dataset {}".format(X_train_balanced.shape))
print("y train dataset {}".format(y_train_balanced.shape))
print("Telecom train dataset Imbalance after smote: {}".format(churn_percentage))

# In[75]:

print(type(X_train_balanced))
average_samples = int(mean(no_samples)) weights = [] for i in range(len(no_samples)): if no_samples[i] < average_samples: weights.append(average_samples) else: weights.append(no_samples[i]) ratio_over = { 0: weights[0], 1: weights[1], 2: weights[2], 3: weights[3], 4: weights[4] } over = SMOTE(sampling_strategy=ratio_over, random_state=314) X_train, y_train = over.fit_resample(X_train, y_train) # undersample samples > average ratio_under = { 0: average_samples, 1: average_samples, 2: average_samples, 3: average_samples, 4: average_samples } under = RandomUnderSampler(sampling_strategy=ratio_under, random_state=314) X_train, y_train = under.fit_resample(X_train, y_train) cv_inner = KFold(n_splits=5, shuffle=True) model = KerasClassifier(build_fn=create_model, verbose=1)
test_data = pchurn.chngtocat(test_data, pchurn.collist) test_data = pchurn.removeColumns(pchurn.unimportantColumns, test_data) test_data = pchurn.removeColumns(pchurn.unimportantColumnsfornum, test_data) numerical_cols = pchurn.numerical_columns tdf = pchurn.copytdf for var in numerical_cols: minimum = min(tdf[var]) maximum = max(tdf[var]) test_data[var] = (test_data[var] - minimum) / (maximum - minimum) test_data = test_data[pchurn.sccollist] scaled_data = pchurn.scaled_data X_original = scaled_data.drop(['Churn'], axis=1) scaled_data['Churn'] = scaled_data['Churn'].replace([1, 0], ['Yes', 'No']) Y_original = scaled_data['Churn'] sm = SMOTE(kind='regular') X_oversampled, y_oversampled = sm.fit_sample(X_original, Y_original) testing_features = np.array(test_data.drop(['Churn'], axis=1)) test_data['Churn'] = test_data['Churn'].replace([1, 0], ['Yes', 'No']) testing_target = np.array(test_data['Churn']) def false_nagative_rate(y_actual, y_hat): TP = 0 FN = 0 for i in range(len(y_hat)): if y_actual[i] == y_hat[i] == 'Yes': TP += 1 if y_hat[i] == 'No' and y_actual[i] != y_hat[i]: FN += 1
def perform_oversampling(oversamp_method, tr_features, tr_labels, model_class):
    start = time.time()
    if True:
        print(model_class + " oversampling method:\t" + oversamp_method + " ...")

    # 1 SMOTE
    if oversamp_method == 'SMOTE':
        # kind={'borderline1', 'borderline2', 'svm'}
        svm_model = svm.SVC(C=0.001, kernel='rbf', degree=3, gamma='auto',
                            decision_function_shape='ovo')
        oversamp = SMOTE(ratio='auto', random_state=None, k_neighbors=5, m_neighbors=10,
                         out_step=0.5, kind='svm', svm_estimator=svm_model, n_jobs=1)
        # TRY SMOTE WITH ANOTHER KIND
    elif oversamp_method == 'SMOTE_regular_min':
        oversamp = SMOTE(ratio='minority', random_state=None, k_neighbors=5, m_neighbors=10,
                         out_step=0.5, kind='regular', svm_estimator=None, n_jobs=1)
    elif oversamp_method == 'SMOTE_regular':
        oversamp = SMOTE(ratio='auto', random_state=None, k_neighbors=5, m_neighbors=10,
                         out_step=0.5, kind='regular', svm_estimator=None, n_jobs=1)
    elif oversamp_method == 'SMOTE_border':
        oversamp = SMOTE(ratio='auto', random_state=None, k_neighbors=5, m_neighbors=10,
                         out_step=0.5, kind='borderline1', svm_estimator=None, n_jobs=1)
    # 2 SMOTEENN
    elif oversamp_method == 'SMOTEENN':
        oversamp = SMOTEENN()
    # 3 SMOTE TOMEK
    # NOTE: http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.65.3904&rep=rep1&type=pdf
    elif oversamp_method == 'SMOTETomek':
        oversamp = SMOTETomek()
    # 4 ADASYN
    elif oversamp_method == 'ADASYN':
        oversamp = ADASYN(ratio='auto', random_state=None, k=None, n_neighbors=5,
                          n_jobs=cpu_threads)

    tr_features_balanced, tr_labels_balanced = oversamp.fit_sample(tr_features, tr_labels)

    end = time.time()
    count = collections.Counter(tr_labels_balanced)
    print("Oversampling balance")
    print(count)
    print("Time required: " + str(format(end - start, '.2f')) + " sec")

    return tr_features_balanced, tr_labels_balanced
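# Note (illustrative sketch, not from the original script): the calls above use the
# pre-0.4 imbalanced-learn API (ratio=, kind=, m_neighbors=, out_step=, fit_sample).
# From version 0.4 onward the kind variants are separate classes and the arguments
# were renamed; a rough mapping of the branches above could look like this.
from sklearn.svm import SVC
from imblearn.over_sampling import SMOTE, BorderlineSMOTE, SVMSMOTE, ADASYN
from imblearn.combine import SMOTEENN, SMOTETomek

modern_oversamplers = {
    'SMOTE': SVMSMOTE(svm_estimator=SVC(C=0.001, kernel='rbf', gamma='auto'),
                      k_neighbors=5, m_neighbors=10, out_step=0.5),          # was kind='svm'
    'SMOTE_regular_min': SMOTE(sampling_strategy='minority', k_neighbors=5),
    'SMOTE_regular': SMOTE(sampling_strategy='auto', k_neighbors=5),
    'SMOTE_border': BorderlineSMOTE(kind='borderline-1', k_neighbors=5, m_neighbors=10),
    'SMOTEENN': SMOTEENN(),
    'SMOTETomek': SMOTETomek(),
    'ADASYN': ADASYN(sampling_strategy='auto', n_neighbors=5),
}
# fit_sample was renamed as well; every sampler now exposes fit_resample:
# X_bal, y_bal = modern_oversamplers[oversamp_method].fit_resample(tr_features, tr_labels)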
("num", num_pipeline, num_col), ("one", OneHotEncoder(), cat_col_one), #("ord", OneHotEncoder(), cat_col_ord), ]) features = full_pipeline.fit_transform(x) #Ok, so now we have a training set encoded... #It is very imblanced.. so we need to correct this, else have issues finding a good classification from imblearn.over_sampling import SMOTE from imblearn.over_sampling import RandomOverSampler from imblearn.under_sampling import RandomUnderSampler rus = RandomUnderSampler(random_state=0) sm = SMOTE(ratio='auto', kind='regular') ros = RandomOverSampler(random_state=0) #X_train, y_train = sm.fit_sample(features,y) X_train, y_train = ros.fit_sample(features,y) #X_train, y_train = rus.fit_sample(features,y) #X_train = features #y_train = y #Finally, we have 'feat' and a target 'Y' we can begin modeling #Prep test data xt = test.drop(target, axis=1) y_test = test[target].copy()
labels.values.ravel(), train_size=train_size, shuffle=True, stratify=labels.values.ravel()) # ### Impute Data if data_impute: imp = IterativeImputer(max_iter=25, random_state=1337) X_train = imp.fit_transform(X_train) X_test = imp.transform(X_test) # ### Augment Data if smote_ratio > 0: smote = SMOTE(sampling_strategy='all', random_state=1337, k_neighbors=5, n_jobs=1) X_train, y_train = smote.fit_resample(X_train, y_train) scaler = StandardScaler() X_train = scaler.fit_transform(X_train) X_test = scaler.transform(X_test) # ## Define Model knn = KNeighborsClassifier( n_neighbors=5, weights='uniform', # or distance p=2, n_jobs=8)
df_org = df.copy() df = df.dropna() df['Customer_Location'] = df['Customer_Location'].astype("category").cat.codes X = df.drop(['Email_Status'], axis=1) Y = df['Email_Status'] ## Oversampling Minority classes from imblearn.over_sampling import SMOTE df_m = df.copy() df_m = df_m[(df_m.Email_Status == 1) | (df_m.Email_Status == 2)] X_m = df_m.drop(['Email_Status'], axis=1) Y_m = df_m['Email_Status'] X = X.loc[Y[Y==0].index] Y = Y[Y==0] sm = SMOTE(random_state=np.random.randint(0, 100)) X_os_m , Y_os_m = sm .fit_resample(X_m, Y_m) X_os = pd.concat([X, pd.DataFrame(X_os_m, columns= X.columns)], axis=0) Y_os = pd.concat([Y, pd.Series(Y_os_m)], axis=0) X_train, X_test, y_train, y_test = train_test_split(X_os, Y_os, test_size=0.3, random_state=71, stratify=Y_os) def hyperopt_train_test(params): t = params['type'] del params['type'] if t == 'naive_bayes': clf = BernoulliNB(**params) elif t == 'svm': clf = SVC(**params)
model_evaluation(y_test, pred)

# Let's run oversampling with SMOTE!
# Check the shapes of the existing X_train, y_train, X_test, y_test
print("Number transactions X_train dataset: ", X_train.shape)
print("Number transactions y_train dataset: ", y_train.shape)
print("Number transactions X_test dataset: ", X_test.shape)
print("Number transactions y_test dataset: ", y_test.shape)

from imblearn.over_sampling import SMOTE

print("Before OverSampling, counts of label '1': {}".format(sum(y_train == 1)))  # number of samples in y_train with label 1
print("Before OverSampling, counts of label '0': {} \n".format(sum(y_train == 0)))  # number of samples in y_train with label 0

sm = SMOTE(random_state=42, ratio=0.3)  # SMOTE algorithm, increase the minority ratio
X_train_res, y_train_res = sm.fit_sample(X_train, y_train.ravel())  # run the oversampling

print("After OverSampling, counts of label '1': {}".format(sum(y_train_res == 1)))
print("After OverSampling, counts of label '0': {}".format(sum(y_train_res == 0)))
print("Before OverSampling, the shape of X_train: {}".format(X_train.shape))  # data shape before applying SMOTE
print("Before OverSampling, the shape of y_train: {}".format(y_train.shape))  # data shape before applying SMOTE
print('After OverSampling, the shape of X_train: {}'.format(X_train_res.shape))  # check the SMOTE result
print('After OverSampling, the shape of y_train: {}'.format(y_train_res.shape))  # check the SMOTE result

lgb_dtrain2 = lgb.Dataset(data=pd.DataFrame(X_train_res), label=pd.DataFrame(y_train_res))  # convert the training data into LightGBM's Dataset format
lgb_param2 = {'max_depth': 10,  # tree depth
              'learning_rate': 0.01,  # Step Size
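# Note (illustrative sketch, not from the original notebook): with the pre-0.4 API a
# float ratio such as 0.3 means "grow the minority class to 30% of the majority
# count" and is only defined for binary targets. In imbalanced-learn >= 0.4 the same
# request uses sampling_strategy, and fit_sample became fit_resample.
from collections import Counter
from sklearn.datasets import make_classification
from imblearn.over_sampling import SMOTE

X_demo, y_demo = make_classification(n_samples=1000, weights=[0.95, 0.05], random_state=42)
print("before:", Counter(y_demo))

sm_demo = SMOTE(sampling_strategy=0.3, random_state=42)
X_res_demo, y_res_demo = sm_demo.fit_resample(X_demo, y_demo)
print("after: ", Counter(y_res_demo))  # minority count is now ~0.3 * majority count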
def ensembleSmote(xydev): xdevf,ydev = xydev sm = SMOTE(kind='svm',random_state=sh.getConst('smoteSeed')) xdevfr,ydevr = sm.fit_sample(xdevf,ydev) return (xdevfr,ydevr)
# Preprocess
compound_x = preprocess_variables(compound_x)

# Find intersecting features
avail_columns = compound_x.columns.intersection(full_columns)

# Select features on subset
x_data = compound_x.loc[:, avail_columns]
y_data = compound_y.copy()

# Create binary variable
y_class = np.squeeze([int(y_val <= 10) for y_val in y_data])

# Smote
from custom_pipe_helper import SMOTER
import auto

smote = SMOTE()
check = smote.fit(x_data, y_class)
check = smote.fit_sample(x_data, y_class)
check = smote.sample(x_data, y_class)
check[0].shape
check[1]

# Create folds
# For each fold
#   SMOTE the train data
#   Train model
#   Evaluate model
from sklearn.ensemble import AdaBoostClassifier
'num', numeric_transformer, numeric_features), ( 'cat', categorical_transformer, categorical_features), ('scaler', scaling_transformer, numeric_features)]) # Boosting classifier xgb_clf = xgb.XGBClassifier(objective="binary:logistic", learning_rate=0.01, n_estimators=500, max_depth=1, subsample=0.4, random_state=42) # Combine preprocessing with classifier latePaymentsModel = make_pipeline(preprocess, SMOTE(random_state=42), xgb_clf) # Fit the pipeline to the training data (fit is for both the preprocessing and the classifier) print("\nTraining model ...") latePaymentsModel.fit(X_train, y_train) # Save the trained model as a pickle file print("\nSaving model ...") file = open('public/latePaymentsModel.pkl', 'wb') pickle.dump(latePaymentsModel, file) file.close() # load the pickled model print("\nLoading saved model to make example predictions...") pickledModel = pickle.load(open('public/latePaymentsModel.pkl', 'rb'))
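# Note (sketch, not part of the original script): putting SMOTE inside make_pipeline
# only works when make_pipeline comes from imbalanced-learn, not scikit-learn --
# sklearn pipelines reject steps that only implement fit_resample, while imblearn's
# pipeline resamples during fit and leaves predict untouched.
from imblearn.pipeline import make_pipeline   # not sklearn.pipeline
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import make_classification

X_demo, y_demo = make_classification(weights=[0.9, 0.1], random_state=42)
clf_demo = make_pipeline(SMOTE(random_state=42), LogisticRegression(max_iter=1000))
clf_demo.fit(X_demo, y_demo)             # SMOTE is applied here, to the training data only
print(clf_demo.predict(X_demo[:5]))      # no resampling happens at predict time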
y=None, kind='pie', ax=axs[1], autopct='%1.2f%%')  # pie chart
axs[1].set_title("Percentage of each TARGET")
plt.show()

# In[]:
# 1.7 Oversampling:
import imblearn
from imblearn.over_sampling import SMOTE

X_temp = data.iloc[:, 1:]
y_temp = data["SeriousDlqin2yrs"]  # y = data.iloc[:,0]

sm = SMOTE(random_state=42)  # instantiate
X, y = sm.fit_sample(X_temp, y_temp)
n_sample_ = X.shape[0]  # 278584

pd.Series(y).value_counts()
n_1_sample = pd.Series(y).value_counts()[1]
n_0_sample = pd.Series(y).value_counts()[0]
print('Number of samples: {}; label 1: {:.2%}; label 0: {:.2%}'.format(
    n_sample_, n_1_sample / n_sample_, n_0_sample / n_sample_))
# Number of samples: 278584; label 1: 50.00%; label 0: 50.00%

# In[]:
# After oversampling, split into training and test sets; save the preprocessing results
from sklearn.model_selection import train_test_split

X = pd.DataFrame(X)
y = pd.DataFrame(y)
plt.xlim([0.0,1.0]) plt.ylim([0.0,1.05]) plt.xlabel('False Positive Rate') plt.ylabel('True Positive rate') plt.title('ROC Curve GradientBoosting Un-Balanced Data') plt.legend(loc="lower right") plt.show() #As the data for target is unbalanced creating balanced datasets using SMOTE kind = ['svm']; sm = [SMOTE(kind=k) for k in kind] for method in sm: X_res, y_res = method.fit_sample(emp_mod, y) X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=.2, random_state=0) #Using Logistic Regression with Balanced Dataset after SMOTE Sampling regr=skl_lm.LogisticRegression() regr.fit(X_train,y_train) pred=regr.predict(X_test) #validation of Logistic fpr,tpr,_=roc_curve(y_test,pred) from sklearn.metrics import auc
            self.precisionMaj, self.precisionMin, self.recallMaj, self.recallMin,
            self.f1Maj, self.f1Min, self.gMaj, self.gMin, self.mcc, self.aucRes,
            int(round(self.tp)), int(round(self.tn)), int(round(self.fp)),
            int(round(self.fn)), str(self.clf), dictDesbalanceamento['imbLevel']))
        conn.close()


if __name__ == '__main__':
    # example
    # define the list of datasets
    dataSets = ["pima.csv"]
    # the list of sampling techniques
    tecnicasAmostragem = [
        None,
        SMOTE(kind="regular", ratio=1.0),
        SMOTETomek(ratio=1.0),
        SMOTE(kind="borderline1", ratio=1.0),
        SMOTE(kind="borderline2", ratio=1.0)
    ]
    clfs = [
        svm.LinearSVC(),
        BernoulliNB(),
        tree.DecisionTreeClassifier(criterion="entropy", max_depth=7)
    ]

    for dataSet in dataSets:
        cv = CrossValidationStratified(dataset=dataSet, verbose=True)
        cv.splitClasses()
        # get the imbalance data for the sql
        dictDesbalanceamento = cv.getImbalanceLevel()
random_state=8) # Defining pipelines along with relative parameters to be used in GridSearchCV # #### Pipe 1 ---> StandardScaler + PCA # In[11]: pipe_1 = make_pipeline(PearsonSelector(), #OutliersIQR(),\ StandardScalerCust(),\ BinaryEncoder(selected_columns = ['international_plan','voice_mail_plan']),\ DropColumns(['state','area_code']),\ #GetDummies(), SMOTE(),\ PCA(),\ xgb.XGBClassifier(n_jobs=-1)) params_1 = [{ 'pearsonselector__limit': [0.2, 0.4], 'smote__k_neighbors': [3, 5], 'pca__n_components': [2, 3], 'xgbclassifier__n_estimators': [1000] }] # #### Pipe 2 ---> MinMaxScaler + RFE # In[12]:
def fraud_detection(data): data = data.drop('cardverificationcodesupplied', 1) data = data.drop('cvcresponsecode', 1) y = data['simple_journal'] X = data.drop('simple_journal', 1) x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.35) sm = SMOTE(ratio = 1) x_train_sm, y_train_sm = sm.fit_sample(x_train, y_train) print("\nBefore SMOTE, counts of label '1': {}".format(sum(y_train==1))) print("Before SMOTE, counts of label '0': {} \n".format(sum(y_train==0))) print("\nAfter SMOTE, counts of label '1': {}".format(sum(y_train_sm==1))) print("After SMOTE, counts of label '0': {} \n".format(sum(y_train_sm==0))) print("\nTest data, counts of label '1': {}".format(sum(y_test==1))) print("Test data, counts of label '0': {} \n".format(sum(y_test==0))) # (1) Build Random Forest classifier (Black Box) # with SMOTE clf_rf_sm = RandomForestClassifier(n_estimators=25, random_state=12) clf_rf_sm.fit(x_train_sm, y_train_sm) probs_rf_sm = clf_rf_sm.predict_proba(x_test) probs_rf_sm = probs_rf_sm[:,1] auc_rf_sm = roc_auc_score(y_test, probs_rf_sm) fpr_rf_sm, tpr_rf_sm, thresholds_rf_sm = roc_curve(y_test, probs_rf_sm) # without SMOTE clf_rf = RandomForestClassifier(n_estimators=25, random_state=12) clf_rf.fit(x_train, y_train) probs_rf = clf_rf.predict_proba(x_test) probs_rf = probs_rf[:,1] auc_rf = roc_auc_score(y_test, probs_rf) fpr_rf, tpr_rf, thresholds_rf = roc_curve(y_test, probs_rf) # (2) Build KNN Classifier (Black Box) # with SMOTE neigh_sm = KNeighborsClassifier(n_neighbors=3) neigh_sm.fit(x_train_sm, y_train_sm) probs_neigh_sm = neigh_sm.predict_proba(x_test) probs_neigh_sm = probs_neigh_sm[:,1] auc_neigh_sm = roc_auc_score(y_test, probs_neigh_sm) fpr_neigh_sm, tpr_neigh_sm, thresholds_neigh_sm = roc_curve(y_test, probs_neigh_sm) # without SMOTE neigh = KNeighborsClassifier(n_neighbors=3) neigh.fit(x_train, y_train) probs_neigh = neigh.predict_proba(x_test) probs_neigh = probs_neigh[:,1] auc_neigh = roc_auc_score(y_test, probs_neigh) fpr_neigh, tpr_neigh, thresholds_neigh = roc_curve(y_test, probs_neigh) # (3) Build Decision Tree Classifier (White Box) # with SMOTE dt_sm = tree.DecisionTreeClassifier() dt_sm.fit(x_train_sm, y_train_sm) probs_dt_sm = dt_sm.predict_proba(x_test) probs_dt_sm = probs_dt_sm[:,1] auc_dt_sm = roc_auc_score(y_test, probs_dt_sm) fpr_dt_sm, tpr_dt_sm, thresholds_dt_sm = roc_curve(y_test, probs_dt_sm) # without SMOTE dt = tree.DecisionTreeClassifier() dt.fit(x_train, y_train) probs_dt = dt.predict_proba(x_test) probs_dt = probs_dt[:,1] auc_dt = roc_auc_score(y_test, probs_dt) fpr_dt, tpr_dt, thresholds_dt = roc_curve(y_test, probs_dt) # Plot ROC curves plt.plot([0, 1], [0, 1], linestyle='--') plt.plot(fpr_rf_sm, tpr_rf_sm, marker='.', label='Random Forest SMOTE (area= %0.2f)' % auc_rf_sm) plt.plot(fpr_rf, tpr_rf, marker='.', label='Random Forest (area= %0.2f)' % auc_rf) plt.plot(fpr_neigh_sm, tpr_neigh_sm, marker='.', label='kN SMOTE (area= %0.2f)' % auc_neigh_sm) plt.plot(fpr_neigh, tpr_neigh, marker='.', label='kN (area= %0.2f)' % auc_neigh) plt.plot(fpr_dt_sm, tpr_dt_sm, marker='.', label='Decision Tree SMOTE (area= %0.2f)' % auc_dt_sm) plt.plot(fpr_dt, tpr_dt, marker='.', label='Decision Tree (area= %0.2f)' % auc_dt) plt.xlabel('False Positive Rate') plt.ylabel('True Positive Rate') plt.title('Receiver Operating Characteristic (ROC)') plt.legend(loc='lower right') plt.show()
'job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'day_of_week', 'poutcome' ] data_vars = df.columns.values.tolist() to_keep = [i for i in data_vars if i not in cat_vars] print(to_keep) data_final = df[to_keep] print(data_final.columns.values) x = data_final.loc[:, data_final.columns != 'y'] y = data_final.loc[:, data_final.columns == 'y'] from imblearn.over_sampling import SMOTE os = SMOTE(random_state=0) x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0) columns = x_train.columns os_data_x, os_data_y = os.fit_sample(x_train, y_train) os_data_x = pd.DataFrame(data=os_data_x, columns=columns) os_data_y = pd.DataFrame(data=os_data_y, columns=['y']) # we can Check the numbers of our data print("length of oversampled data is ", len(os_data_x)) print("Number of no subscription in oversampled data", len(os_data_y[os_data_y['y'] == 0])) print("Number of subscription", len(os_data_y[os_data_y['y'] == 1])) print("Proportion of no subscription data in oversampled data is ",
# calculate the accuracy score
score = accuracy_score(y_test, y_pred)

# calculate the precision
precision = precision_score(y_test, y_pred)

# display 'score' and 'precision'
print('Accuracy:', score)
print('Precision:', precision)


# --------------
# import packages
from imblearn.over_sampling import SMOTE

# Instantiate smote
smote = SMOTE(random_state=9)

# fit_sample on training data
X_train, y_train = smote.fit_sample(X_train, y_train)

# fit model on training data
rf.fit(X_train, y_train)

# predict on test data
y_pred = rf.predict(X_test)

# calculate the accuracy score
score = accuracy_score(y_test, y_pred)

# calculate the precision
precision = precision_score(y_test, y_pred)
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE
from rd import ReadData
import numpy as np
from goGoogle import goGoogle
import time

for i in range(30):
    train_data, test_data, train_label, test_label = ReadData(i)
    Train_data = np.asarray(train_data).astype(np.float64)
    Test_data = np.asarray(test_data).astype(np.float64)
    Train_label = np.asarray(train_label).astype(np.float64)
    Test_label = np.asarray(test_label).astype(np.float64)

    sm = SMOTE(random_state=42)
    New_Data, New_Label = sm.fit_resample(Train_data, Train_label)

    rfc = RandomForestClassifier(n_estimators=100)
    rfc.fit(Train_data, Train_label)
    print('RFC_Acc:', rfc.score(Test_data, Test_label))
    ac = rfc.score(Test_data, Test_label)

    rfc.fit(New_Data, New_Label)
    print('New_Rfc_Acc:', rfc.score(Test_data, Test_label))
    nac = rfc.score(Test_data, Test_label)

    goGoogle(i, 1, ac, nac, 'rfc')
    time.sleep(2)
def train(datasetFilename, classFilename):
    # second, prepare text samples and their labels
    print('Processing text dataset')
    texts = []   # list of text samples
    labels = []  # list of label ids

    dataset = open(datasetFilename).read()    # read file
    texts = dataset.split('#SEPARATOR#')      # split status dataset by each user
    labelClass = open(classFilename).read()
    labels = labelClass.split('\n')
    print('Found %s texts.' % len(texts))

    tempTexts = []
    tempLabels = []
    indices = np.arange(400)
    np.random.shuffle(indices)
    for i in range(0, 400):
        tempTexts.append(texts[indices[i]])
        tempLabels.append(labels[indices[i]])
    texts = tempTexts
    labels = tempLabels

    # finally, vectorize the text samples into a 2D integer tensor
    tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
    tokenizer.fit_on_texts(texts)
    sequences = tokenizer.texts_to_sequences(texts)
    word_index = tokenizer.word_index
    print('Found %s unique tokens.' % len(word_index))

    data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
    num_validation_samples = int(VALIDATION_SPLIT * data.shape[0])
    x_train = data[:-num_validation_samples]
    y_train = []
    x_test = data[-num_validation_samples:]
    y_test = []

    startIdx = 0
    finishIdx = 0.8 * data.shape[0]
    for i in range(int(startIdx), int(finishIdx)):
        y_train.append(labels[i])
    startIdx = data.shape[0] - 0.2 * data.shape[0]
    finishIdx = data.shape[0]
    for i in range(int(startIdx), int(finishIdx)):
        y_test.append(labels[i])

    # Start of oversampling ############################################################
    y_train = np.reshape(y_train, (len(y_train)))
    y_test = np.reshape(y_test, (len(y_test)))

    # kind = ['regular', 'borderline1', 'borderline2', 'svm']
    kind = ['borderline1']
    sm = [SMOTE(kind=k) for k in kind]
    for method in sm:
        x_train_resampled, y_train_resampled = method.fit_sample(x_train, y_train)
        x_test_resampled, y_test_resampled = method.fit_sample(x_test, y_test)
        print("x_train= " + str(x_train.shape) + " x_test= " + str(x_test.shape))
        print("y_train= " + str(y_train.shape) + " y_test= " + str(y_test.shape))
        print("x_train_resampled= " + str(x_train_resampled.shape) +
              " x_test_resampled= " + str(x_test_resampled.shape))
        print("y_train_resampled= " + str(y_train_resampled.shape) +
              " y_test_resampled= " + str(y_test_resampled.shape))
        x_train = x_train_resampled
        x_test = x_test_resampled
        y_train = to_categorical(np.asarray(y_train_resampled.tolist()))
        y_test = to_categorical(np.asarray(y_test_resampled.tolist()))
    # End of oversampling ##############################################################

    print('Shape of data tensor:', x_train.shape + x_test.shape)
    print('Shape of label tensor:', y_train.shape + y_test.shape)
    print('Preparing embedding matrix.')

    # prepare embedding matrix
    num_words = min(MAX_NB_WORDS, len(word_index))
    embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
    for word, i in word_index.items():
        if i >= MAX_NB_WORDS:
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            # words not found in the embedding index will be all zeros
            embedding_matrix[i] = embedding_vector

    # load pre-trained word embeddings into an Embedding layer
    # note that we set trainable = False so as to keep the embeddings fixed
    embedding_layer = Embedding(num_words,
                                EMBEDDING_DIM,
                                weights=[embedding_matrix],
                                input_length=MAX_SEQUENCE_LENGTH,
                                trainable=False)

    print('Training model.')

    # LSTM
    model = Sequential()
    model.add(embedding_layer)
    model.add(LSTM(32, dropout=0.2, recurrent_dropout=0.2))
    model.add(Dense(2, activation='softmax'))
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy', 'mse', 'mae'])
    model.fit(x_train, y_train,
              batch_size=32,
              epochs=10,
              validation_data=(x_test, y_test),
              verbose=2)

    # serialize model to JSON
    model_json = model.to_json()
    with open("model.json", "w") as json_file:
        json_file.write(model_json)
    # serialize weights to HDF5
    model.save_weights("model.h5")
    print("Saved model to disk")

    # Evaluate model
    confusion = np.array([[0, 0], [0, 0]])
    predictions = model.predict(x_test)
    predictedRound = [round(x[1]) for x in predictions]
    predicted = [x[1] for x in predictions]
    tested = [round(x[1]) for x in y_test]
    confusion += confusion_matrix(tested, predictedRound)

    precisionScore = precision_score(tested, predictedRound, pos_label=1.)  # pos_label: 1. = yes, 0. = no
    recallScore = recall_score(tested, predictedRound, pos_label=1.)
    accuracyScore = accuracy_score(tested, predictedRound)
    f1Score = f1_score(tested, predictedRound, pos_label=1.)
    rocAucScore = roc_auc_score(tested, predictedRound)
    maeScore = mean_absolute_error(tested, predictedRound)
    mseScore = mean_squared_error(tested, predictedRound)
    r2Score = r2_score(tested, predictedRound)

    trainResult = ''
    # trainResult += 'Filename: ' + dsetFilename + '\n'
    # trainResult += 'Classifier: ' + str(classifier) + '\n'
    trainResult += str(confusion[0]) + str(confusion[1]) + ','
    trainResult += str(precisionScore) + ','
    trainResult += str(recallScore) + ','
    trainResult += str(f1Score) + ','
    trainResult += str(accuracyScore) + ','
    trainResult += str(rocAucScore) + ','
    trainResult += str(maeScore) + ','
    trainResult += str(mseScore) + ','
    trainResult += str(r2Score) + '\n'

    print("Trained successfully\n")
    return trainResult
# (this excerpt begins inside an if/elif chain that picks a vectorizer by experiment name)
    vectorizer = CountVectorizer(binary=True, ngram_range=(1, 2))
elif 'unibitri_gram' in experiment:
    vectorizer = CountVectorizer(binary=True, ngram_range=(1, 3))
else:
    vectorizer = CountVectorizer(binary=True)

vectorizer.fit(texts_train)
X_train = vectorizer.transform(texts_train)
X_test = vectorizer.transform(texts_test)
X = vectorizer.transform(pro_texts)

# In[35]:

if 'smote' in experiment:
    # oversampling with SMOTE
    sm = SMOTE(random_state=42)
    X_train, y_train = sm.fit_resample(X_train, y_train)
elif 'undersampling' in experiment:
    rus = RandomUnderSampler(random_state=42)
    X_train, y_train = rus.fit_resample(X_train, y_train)
elif 'random_oversampling' in experiment:
    ros = RandomOverSampler(random_state=42)
    X_train, y_train = ros.fit_resample(X_train, y_train)

X_train.shape

# In[36]:

vocab_size = X_train.shape[1]
vocab_size
# target variable
Y = X['isFraud']

# removing the target column from the feature set
X = X.drop(['isFraud'], axis=1)

# getting the shapes of x and y
print("Shape of x: ", X.shape)
print("Shape of y: ", Y.shape)
print(X.head())

from imblearn.over_sampling import SMOTE

x_resample, y_resample = SMOTE().fit_sample(X, Y.values.ravel())

# getting the shapes of x and y after resampling
print("Shape of x: ", x_resample.shape)
print("Shape of y:", y_resample.shape)

# splitting the dataset into train and test sets
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x_resample, y_resample,
                                                    test_size=0.2, random_state=0)

# checking the new shapes
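# Caveat, not part of the original snippet: here SMOTE is applied to the full dataset
# *before* the train/test split, so synthetic points and near-duplicates of training
# samples can end up in the test set and make the evaluation optimistic. A common
# alternative, sketched under the same names, is to split first and resample only
# the training fold:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    X, Y.values.ravel(), test_size=0.2, random_state=0)
x_train_res, y_train_res = SMOTE().fit_resample(x_train, y_train)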
X, Y = import_training_data()
# X, Y = data_cleaning(X, Y, 5)
X = check_collinearity(X)
X, y = reorganize_data(X, Y)
y = y.iloc[:, 0]
names = list(X)
print(X.shape, y.shape)

# split the training data into training and test set
X_training, X_test, y_training, y_val = train_test_split(X, y, train_size=0.75, random_state=0)

# correct the class skewness
oversampler = SMOTE(random_state=0)
X_training, y_training = oversampler.fit_sample(X_training, y_training)

# random forest parameters
params_rf = {
    'n_jobs': 1,
    'n_estimators': 1600,
    'warm_start': True,
    'max_features': 0.3,
    'max_depth': 9,
    'min_samples_leaf': 2,
    'random_state': 0,
    'verbose': 0
}

# random forest
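# The excerpt ends before the forest is built; presumably params_rf is unpacked into
# the classifier, roughly as below (the fit/score calls are assumptions, not in the original):
from sklearn.ensemble import RandomForestClassifier

clf_rf = RandomForestClassifier(**params_rf)
clf_rf.fit(X_training, y_training)

# hold-out performance on the untouched validation split
print(clf_rf.score(X_test, y_val))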
from imblearn.over_sampling import SMOTE

# Generate the dataset
X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9],
                           n_informative=3, n_redundant=1, flip_y=0,
                           n_features=20, n_clusters_per_class=1,
                           n_samples=5000, random_state=10)

# Instantiate a PCA object for the sake of easy visualisation
pca = PCA(n_components=2)
# Fit and transform X to visualise inside a 2D feature space
X_vis = pca.fit_transform(X)

# Apply Borderline SMOTE 1
sm = SMOTE(kind='borderline1')
X_resampled, y_resampled = sm.fit_sample(X, y)
X_res_vis = pca.transform(X_resampled)

# Two subplots, unpack the axes array immediately
f, (ax1, ax2) = plt.subplots(1, 2)
ax1.scatter(X_vis[y == 0, 0], X_vis[y == 0, 1], label="Class #0", alpha=0.5,
            edgecolor=almost_black, facecolor=palette[0], linewidth=0.15)
ax1.scatter(X_vis[y == 1, 0], X_vis[y == 1, 1], label="Class #1", alpha=0.5,
            edgecolor=almost_black, facecolor=palette[2], linewidth=0.15)
ax1.set_title('Original set')

ax2.scatter(X_res_vis[y_resampled == 0, 0], X_res_vis[y_resampled == 0, 1],
            label="Class #0", alpha=.5, edgecolor=almost_black,
            facecolor=palette[0], linewidth=0.15)
# (tail of the getKBest helper; its signature lies outside this excerpt)
    k_best = SelectKBest(score_func=score_func, k=10).fit(X, y)
    idxs = k_best.get_support(indices=True)
    X = X.iloc[:, idxs]
    return X


""" test KBest """
X, y = split_dataset(super_table, CLASS)
X = getKBest(X, y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

sm = SMOTE(random_state=2)
X_train_res, y_train_res = sm.fit_sample(X_train, y_train.ravel())

results = {}
for clf in base_clfs:
    clf_name = type(clf).__name__
    stats = classifier_statistics(clf, X_train_res, X_test, y_train_res, y_test)
    results[clf_name] = stats

measures = {}
i = 0
for clf in results:
    clf_res = results[clf]
    measures[i] = {'Classifier': clf, 'Measure': 'Accuracy', 'Value': clf_res['accuracy']}
feature_scaler = StandardScaler()
X_scaled = feature_scaler.fit_transform(X)

# Dividing dataset into training and test sets
X_train, X_test, Y_train, Y_test = train_test_split(X_scaled, Y, test_size=0.3, random_state=100)
print(X_train.shape)
print(X_test.shape)

# Implementing oversampling to balance the dataset;
# SMOTE stands for Synthetic Minority Oversampling Technique
print("Number of observations in each class before oversampling (training data): \n",
      pd.Series(Y_train).value_counts())

smote = SMOTE(random_state=101)
X_train, Y_train = smote.fit_sample(X_train, Y_train)

print("Number of observations in each class after oversampling (training data): \n",
      pd.Series(Y_train).value_counts())

rfc = RandomForestClassifier(criterion='entropy', max_features='auto', random_state=1)
grid_param = {'n_estimators': [50, 100, 150, 200, 250, 300]}

gd_sr = GridSearchCV(estimator=rfc,
                     param_grid=grid_param,
                     scoring='precision',
                     cv=5)
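# The excerpt stops before the grid search is run; a plausible continuation on the
# SMOTE-balanced training data (the calls below are assumptions, not in the original):
from sklearn.metrics import precision_score

gd_sr.fit(X_train, Y_train)
print("Best n_estimators:", gd_sr.best_params_)
print("Best CV precision:", gd_sr.best_score_)

# evaluate the tuned forest on the untouched test split
best_rfc = gd_sr.best_estimator_
print("Test precision:", precision_score(Y_test, best_rfc.predict(X_test)))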
# Use the imbalanced-learn library for under-sampling and over-sampling.
# Note that quite a few models can instead adjust per-class sample weights via the class_weight parameter.
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

# Random under-sampling: if one class clearly has more samples than another,
# under-sampling can reduce the number of majority-class samples.
rus = RandomUnderSampler({0: 4000, 1: 3000})  # classes "0" and "1" are the majority classes; 4000/3000 are the target counts and can be tuned; classes not listed are left unchanged
x, y = rus.fit_resample(x, y)  # x, y are replaced by the under-sampled data
# If the under-sampling ratio is too small, underfitting may result.

# Over-sampling
# 1. Random over-sampling: when a class is clearly under-represented, it can be over-sampled.
#    Essentially this reuses minority-class samples at random.
ros = RandomOverSampler({2: 3000, 3: 4000})  # same convention as above
x, y = ros.fit_resample(x, y)

# 2. Synthetic Minority Over-sampling Technique (SMOTE).
#    I've read a paper that used it; it was decent.
smote = SMOTE(kind='regular', k_neighbors=5, ratio={2: 3000, 3: 4000})
'''
parameters:
kind: ('regular', 'borderline1', 'borderline2' or 'svm', default='regular')
    how synthetic samples are generated; look up the SMOTE paper if unfamiliar
k_neighbors: (int, default=5)
    number of nearest neighbours used when synthesising a sample
ratio:
    the number (or proportion) of samples to synthesise per class
svm_estimator:
    only needed when kind='svm'; pass in a scikit-learn model
'''
# If the over-sampling ratio is too large, "overfitting" may result,
# i.e. incidental patterns in the minority class get treated as general rules.
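# Not in the original snippet: the SMOTE object above is built but never applied.
# A minimal continuation, assuming the same x and y used earlier in this snippet:
x, y = smote.fit_resample(x, y)
# (On imbalanced-learn >= 0.6 the kind/ratio arguments are gone; use the dedicated
#  BorderlineSMOTE / SVMSMOTE classes and the sampling_strategy parameter instead.)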