def resample(X, y, sample_fraction=0.1, test_size=0.3):
    """Split into train/test parts, balance each, then optionally subsample.

    The data is split with a fixed seed, each part is randomly over-sampled
    on its own, and -- when ``sample_fraction`` is below 1.0 -- a random
    ``sample_fraction`` share of each part is kept. Class balance is
    reported through ``show_balance`` after every stage.

    :param X: feature DataFrame.
    :param y: label DataFrame; its ``converted`` column is reported.
    :param sample_fraction: share of each over-sampled part to keep.
    :param test_size: share of the rows assigned to the test part.
    :return: ``((X_train, y_train), (X_test, y_test))`` as DataFrames.
    """
    X_columns = X.columns
    y_columns = y.columns
    n_features = len(X_columns)

    print('~' * 80)
    print('@@-\n', y.converted.value_counts())
    print('@@0 - Original')
    show_balance(y.values)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=42)
    print('@@2 - y_train')
    show_balance(y_train)
    print('@@2 - y_test')
    show_balance(y_test)
    assert X_train.shape[1] == n_features and X_test.shape[1] == n_features

    # Train and test parts are balanced independently of each other.
    sampler = RandomOverSampler(random_state=42)
    X_train, y_train = sampler.fit_sample(X_train, y_train)
    X_test, y_test = sampler.fit_sample(X_test, y_test)
    print('@@3 - Oversampled y_train')
    show_balance(y_train)
    print('@@3 - Oversampled y_test')
    show_balance(y_test)
    assert X_train.shape[1] == n_features and X_test.shape[1] == n_features

    if sample_fraction < 1.0:
        # The "test" half of each split is the part that is kept.
        _, X_train, _, y_train = train_test_split(
            X_train, y_train, test_size=sample_fraction, random_state=43)
        _, X_test, _, y_test = train_test_split(
            X_test, y_test, test_size=sample_fraction, random_state=44)
        print('@@2 - Downsampled y_train')
        show_balance(y_train)
        print('@@2 - Downsampled y_test')
        show_balance(y_test)

    shapes = (X_train.shape, X_test.shape)
    assert len(X_train.shape) == 2 and len(X_test.shape) == 2, shapes
    assert X_train.shape[1] == n_features and X_test.shape[1] == n_features, shapes

    print('X_columns=%d %s' % (len(X_columns), X_columns))
    print('y_columns=%d %s' % (len(y_columns), y_columns))
    print('X_train=%-10s y_train=%s' % (list(X_train.shape), list(y_train.shape)))
    print('X_test =%-10s y_test =%s' % (list(X_test.shape), list(y_test.shape)))
    assert X_train.shape[1] == n_features and X_test.shape[1] == n_features

    # Re-attach the original column labels to the resampled data.
    X_train = pd.DataFrame(X_train, columns=X_columns)
    y_train = pd.DataFrame(y_train, columns=y_columns, index=X_train.index)
    X_test = pd.DataFrame(X_test, columns=X_columns)
    y_test = pd.DataFrame(y_test, columns=y_columns, index=X_test.index)

    print('@@+ y_train\n', y_train.converted.value_counts(), flush=True)
    print('@@+ y_test\n', y_test.converted.value_counts(), flush=True)
    return (X_train, y_train), (X_test, y_test)
def transform(self, X, y=None):
    """Return ``X`` with its classes balanced by random over-sampling.

    The target is read from ``X[self.predicted_column]``; the ``y`` argument
    is ignored. The result is a new DataFrame with the same feature columns
    plus the target column appended last.
    """
    # TODO how do we validate this happens before train/test split? Or do we need to? Can we implement it in the
    # TODO simple trainer in the correct order and leave this to advanced users?

    target = self.predicted_column

    # Separate the target values from the feature columns.
    y = np.squeeze(X[[target]])
    features = X.drop([target], axis=1)

    # Fit the over sampler and draw the balanced sample.
    sampler = RandomOverSampler(random_state=self.random_seed)
    x_over_sampled, y_over_sampled = sampler.fit_sample(features, y)

    # Rebuild a DataFrame carrying the original feature names.
    result = pd.DataFrame(x_over_sampled)
    result.columns = features.columns

    # Re-attach the over-sampled target values.
    result[target] = pd.Series(y_over_sampled)
    return result
def oversample(self):
    """Replace ``_X``/``_y`` with a randomly over-sampled version.

    The data held before the call stays reachable as ``_X_original`` and
    ``_y_original``. The sampler is seeded with 0.
    """
    self._X_original, self._y_original = self._X, self._y
    sampler = RandomOverSampler(random_state=0)
    self._X, self._y = sampler.fit_sample(self._X, self._y)
def test_ros_fit_sample():
    """Over-sampled output must equal the stored reference arrays."""
    sampler = RandomOverSampler(random_state=RND_SEED)
    X_resampled, y_resampled = sampler.fit_sample(X, Y)

    # Reference arrays live in the ``data`` directory next to this file.
    data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
    X_gt = np.load(os.path.join(data_dir, 'ros_x.npy'))
    y_gt = np.load(os.path.join(data_dir, 'ros_y.npy'))

    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
def Random_OverSample(self, imData):
    """Over-sample the dataset file ``imData`` and pickle the result.

    The file is read from ``self.sPath``; the pickled ``[X_res, y_res]``
    list is written under ``self.oPath``.

    :param imData: file name of the dataset; its last four characters are
        dropped when the output name is built.
    :return: the name of the pickle file that was written.
    """
    source_path = self.sPath + imData
    tradition_list, _ = LoadTraditionCSV(source_path)
    bug_values = LoadCSV(source_path)['bug'].tolist()
    labels = MakeLabels(bug_values)

    from imblearn.over_sampling import RandomOverSampler
    sampler = RandomOverSampler(random_state=0)
    X_res, y_res = sampler.fit_sample(tradition_list, labels)

    pickle_name = "AfterOverSample" + imData[:-4] + ".pickle"
    savepickle([X_res, y_res], self.oPath + pickle_name)
    return pickle_name
def oversample(alg, X_train, y_train):
    """Over-sample the training data with the algorithm named by ``alg``.

    :param alg: ``'smote'`` or ``'random'``.
    :param X_train: training features.
    :param y_train: training labels.
    :return: ``(X_train, y_train)`` after resampling, or ``None`` when
        ``alg`` is neither of the two supported names.
    """
    if alg == 'smote':
        sampler = SMOTE()
    elif alg == 'random':
        sampler = RandomOverSampler(random_state=42)
    else:
        # Unknown names fall through without resampling.
        return None
    X_train, y_train = sampler.fit_sample(X_train, y_train)
    return X_train, y_train
def test_ros_fit_sample_half():
    """Over-sampling with ratio 0.5 must reproduce the expected sample."""
    sampler = RandomOverSampler(ratio=0.5, random_state=RND_SEED)
    X_resampled, y_resampled = sampler.fit_sample(X, Y)

    expected_X = np.array([
        [0.04352327, -0.20515826],
        [0.92923648, 0.76103773],
        [0.20792588, 1.49407907],
        [0.47104475, 0.44386323],
        [0.22950086, 0.33367433],
        [0.15490546, 0.3130677],
        [0.09125309, -0.85409574],
        [0.12372842, 0.6536186],
        [0.13347175, 0.12167502],
        [0.094035, -2.55298982],
    ])
    expected_y = np.array([1, 0, 1, 0, 1, 1, 1, 1, 0, 1])

    assert_array_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
def KMeans_unbalanced(X_datavec, Y_datavec, X_columns, Y_names, num_used=20000): XY_datavec = pd.merge(pd.DataFrame(X_datavec, columns=X_columns), pd.DataFrame(Y_datavec, columns=[Y_names]), how="left", right_index=True, left_index=True) XY_datavec_normal = XY_datavec[XY_datavec[Y_names] == 0] X_datavec_normal = XY_datavec_normal.drop(Y_names, axis=1).values.tolist() XY_datavec_outlier = XY_datavec[XY_datavec[Y_names] == 1] #处理数据不平衡问题 best_num_cluster = GS_KMeans_parameter(X_datavec_normal) y_clst_labels = Model_KMeans(X_datavec_normal, best_num_cluster) #避免和已经标记label重合 y_clst_labels = [i + 100 for i in y_clst_labels] print 'y_clst_labels Information:', set(y_clst_labels) print '----------------------------------------------' XY_clst_normal = pd.merge(pd.DataFrame(X_datavec_normal, columns=X_columns), pd.DataFrame(y_clst_labels, columns=[Y_names]), how="left", right_index=True, left_index=True) XY_datavec = pd.concat([XY_clst_normal, XY_datavec_outlier]) X_data = XY_datavec.drop(Y_names, axis=1).values.tolist() Y_data = XY_datavec[Y_names].values.tolist() #输出每个标签的数量 print 'Counter:y', Counter(Y_data) print '----------------------------------------------' #随机采样的少数类,解决类别不平衡问题 ros = RandomOverSampler(random_state=0) #ros = SMOTE(random_state=0) X_resampled, Y_resampled = ros.fit_sample(X_data, Y_data) print 'Counter:y after using RandomOverSampler', Counter(Y_resampled) print '----------------------------------------------' if len(X_resampled) > num_used * len(Counter(Y_data)): #每个类别分层采样num_used个事例 x_NoUse_train, X_resampled, y_NoUse_train, Y_resampled = train_test_split( X_resampled, Y_resampled, train_size=None, test_size=num_used * len(Counter(Y_data)), stratify=Y_resampled, random_state=0) print 'Counter:used y', Counter(Y_resampled) print '----------------------------------------------' return X_resampled, Y_resampled
def AdversarialTrainVal(train, test):
    """Build training and validation sets weighted by ``prob_test``.

    ``train`` is over-sampled and sorted by ``prob_test``. The training set
    is assembled from the rows with the highest ``prob_test`` (several of
    them repeated many times); the validation set from the rows with the
    lowest ``prob_test`` plus a held-out part of a second over-sample.

    :param train: DataFrame with ``target`` and ``prob_test`` columns.
    :param test: DataFrame with ``target`` and ``prob_test`` columns.
    :return: ``(x_train, y_train, x_val, y_val, x_test)``.
    """
    from imblearn.over_sampling import RandomOverSampler

    ros = RandomOverSampler(ratio='minority', random_state=44)
    X_res, _ = ros.fit_sample(train, train.target)
    train = pd.DataFrame(X_res, columns=train.columns)
    train = train.sort_values(by=['prob_test'], ascending=False)

    Xtrain = pd.DataFrame(
        train.nlargest(int(train.shape[0] * 0.82), 'prob_test'),
        columns=train.columns)

    ros = RandomOverSampler(ratio='all', random_state=44)
    X_xres, _ = ros.fit_sample(Xtrain, Xtrain.target)
    X_xres = pd.DataFrame(X_xres, columns=train.columns)
    X_data, X_test, _, _ = train_test_split(
        X_xres, X_xres.target, stratify=X_xres.target,
        test_size=0.6, random_state=44)

    Xtrain = pd.concat([Xtrain, X_data])
    print("Xtrain = {}".format(Xtrain.shape))

    # One concat instead of hundreds of DataFrame.append calls: append
    # copies the whole frame each time and no longer exists in pandas 2.
    # Row order matches the original sequence of appends.
    top_rows = train.nlargest(int(train.shape[0] * 0.31), 'prob_test')
    above_80 = train[train["prob_test"] > 0.80]
    above_70 = train[train["prob_test"] > 0.70]
    above_60 = train[train["prob_test"] > 0.60]
    Xtrain = pd.concat(
        [Xtrain, top_rows] + [above_80, above_70] * 100 + [above_60] * 110)

    val = pd.concat(
        [train.nsmallest(int(train.shape[0] * 0.7), 'prob_test'), X_test])

    helper_columns = ['prob_test', "target"]
    x_train, y_train = Xtrain.drop(helper_columns, axis=1), Xtrain.target
    x_val, y_val = val.drop(helper_columns, axis=1), val.target
    x_test = test.drop(helper_columns, axis=1)
    return x_train, y_train, x_val, y_val, x_test
def fit_RandomForestClassifier(X_train, X_test, y_train, y_test, target_label):
    """Grid-search a random-forest pipeline and report it on the test set.

    The training data is randomly over-sampled, a pipeline of imputation,
    scaling, ``SelectKBest`` and a random forest is tuned with 10-fold grid
    search, and ROC / precision-recall curves are plotted for
    ``target_label``.

    :return: ``(metric, feature_names, y_score)``.
    """
    # Balance the training data before the search.
    X_res, y_res = RandomOverSampler().fit_sample(X_train, y_train)

    clf = RandomForestClassifier()
    model = Pipeline([
        ('impute', Imputer(missing_values='NaN', strategy='median', axis=0)),
        ('scaling', StandardScaler()),
        ('anova', SelectKBest()),
        ('rf', clf),
    ])

    # Grid Search CV
    parameters = {
        'anova__k': [5, 10, 20, 40],
        'rf__n_estimators': [10, 50],
        'rf__criterion': ['gini', 'entropy'],
        'rf__max_features': ['auto', 'sqrt'],
    }
    grid = GridSearchCV(model, parameters, cv=10, scoring='f1_weighted')
    grid.fit(X_res, y_res)

    # Pass the column positions through the fitted selector to find out
    # which features it kept.
    selector = grid.best_estimator_.named_steps['anova']
    column_positions = np.arange(X_train.shape[1]).reshape(1, -1)
    feature_names = X_train.columns[selector.transform(column_positions)]

    # Predictions and the score column belonging to target_label.
    class_index = list(grid.classes_).index(target_label)
    y_pred = grid.predict(X_test)
    y_score = grid.predict_proba(X_test)[:, class_index]
    model_name = get_modelName(clf)

    # ROC curve on the left, precision-recall curve on the right.
    plt.figure(figsize=(15, 5))
    plt.subplot(121)
    metrics.plot_roc(y_test, y_score, target_label, model_name)
    plt.subplot(122)
    metrics.plot_prc(y_test, y_score, target_label, model_name)
    plt.show()

    metric = metrics.get_ClassificationMetrics(
        model_name, y_test, y_pred, y_score, target_label)
    print('Training data best accuracy: %.5f' % grid.best_score_)
    print('Testing data accuracy: %.5f' % grid.score(X_test, y_test))
    print()
    print('Classification Report:')
    print(classification_report(y_test, y_pred))
    return metric, feature_names, y_score
def resampling_data(self, X, y):
    """Randomly over-sample a multi-label dataset.

    The multi-label target is converted to a multi-class one with
    ``LabelPowerset``, over-sampled, and converted back.

    :param X: features.
    :param y: multi-label target.
    :return: ``(X_resampled, y_resampled)`` with ``y`` in multi-label form.
    """
    powerset = LabelPowerset()
    sampler = RandomOverSampler(random_state=42)

    # Multi-label -> multi-class, so the sampler sees one class per row.
    multiclass_y = powerset.transform(y)
    X_resampled, multiclass_resampled = sampler.fit_sample(X, multiclass_y)

    # Multi-class -> multi-label again.
    return X_resampled, powerset.inverse_transform(multiclass_resampled)
def oversample(X_train, y_train):
    """Randomly over-sample 3-D samples of shape (samples, levels, variables).

    Each sample is flattened to one row for the sampler and restored to
    ``(levels, variables)`` afterwards.

    The earlier version flattened in Fortran order but restored in C order,
    which scrambled the level/variable axes of every returned sample; both
    reshapes now use the same order.

    :param X_train: array of shape ``(n_samples, n_levels, n_variables)``.
    :param y_train: labels, one per sample.
    :return: ``(X_train, y_train)`` after over-sampling, with ``X_train``
        of shape ``(n_resampled, n_levels, n_variables)``.
    """
    ros = RandomOverSampler(random_state=42)
    n_samples, n_levels, n_variables = X_train.shape[:3]

    flat = X_train.reshape((n_samples, -1), order='F')
    flat, y_train = ros.fit_sample(flat, y_train)

    # Must mirror the order used for flattening above.
    X_train = np.reshape(flat, (-1, n_levels, n_variables), order='F')
    return X_train, y_train
def oversample(self):
    """Balance ``self.X``/``self.y`` by random over-sampling.

    Class counts are printed before and after. ``self.Xview`` is refreshed
    to the first ``self.n_features`` columns of the new ``self.X``. The
    sampler is unseeded, so results differ between runs.
    """
    print('Current outcome sampling {}'.format(Counter(self.y)))
    sampler = RandomOverSampler()
    resampled = sampler.fit_sample(self.X, self.y)
    self.X, self.y = resampled
    self.Xview = self.X.view()[:, :self.n_features]
    print('Resampled dataset shape {}'.format(Counter(self.y)))
def sample_data(X, y, sampleMethod):
    """Resample ``X``/``y`` with the method named by ``sampleMethod``.

    :param sampleMethod: ``"Undersample"``, ``"Oversample"`` or ``"SMOTE"``.
    :return: ``(X, y, indices)`` for ``"Undersample"``, ``(X, y)`` for the
        other two methods, and ``None`` for any other name.
    """
    if sampleMethod == "Undersample":
        X_rus, y_rus, id_rus = RandomUnderSampler(return_indices=True).fit_sample(X, y)
        return X_rus, y_rus, id_rus
    if sampleMethod == "Oversample":
        X_ros, y_ros = RandomOverSampler().fit_sample(X, y)
        return X_ros, y_ros
    if sampleMethod == "SMOTE":
        X_sm, y_sm = SMOTE(ratio='minority').fit_sample(X, y)
        return X_sm, y_sm
    # Unrecognised method names yield no result.
    return None
def preprocess_data(data, upsample=False):
    """Standardise the features of ``data`` and append the target column.

    All columns but the last are features; the last is the target. Each
    feature column is standardised with its own mean and standard deviation.

    The up-sampled features are wrapped back into a DataFrame. Without
    this, samplers that return plain arrays made ``mean()``/``std()`` run
    over the whole array instead of per column.

    :param data: DataFrame whose last column is the target.
    :param upsample: randomly over-sample the minority class first.
    :return: 2-D array of standardised features followed by the target.
    """
    import pandas as pd  # local: only needed to rebuild the feature frame

    data_x = data.iloc[:, 0:-1]
    data_y = data.iloc[:, -1:]

    if upsample:
        sampler = RandomOverSampler()
        data_x_sampled, data_y_sampled = sampler.fit_sample(data_x, data_y)
        print(data_x_sampled.shape[0] - data_x.shape[0], 'new random picked points')
        data_x = pd.DataFrame(data_x_sampled, columns=data_x.columns)
        # asarray: the sampler's output may not be a plain ndarray.
        data_y = np.asarray(data_y_sampled).reshape(-1, 1)

    standard_x = (data_x - data_x.mean()) / data_x.std()
    return np.hstack((standard_x, np.array(data_y)))
def OverSampling_RandomOver(X, y):
    """Randomly over-sample ``X``/``y`` and print class counts around it.

    :return: ``(newX, newY)`` from a ``RandomOverSampler`` seeded with 0.
    """
    from collections import Counter
    print(sorted(Counter(y).items()))

    from imblearn.over_sampling import RandomOverSampler
    newX, newY = RandomOverSampler(random_state=0).fit_sample(X, y)

    print(newX.shape, newY.shape, type(newX))
    print(sorted(Counter(newY).items()))
    print('-' * 20)
    return newX, newY
def test_ros_fit_sample_half():
    """Ratio-0.5 over-sampling must equal the stored reference arrays."""
    sampler = RandomOverSampler(ratio=0.5, random_state=RND_SEED)
    X_resampled, y_resampled = sampler.fit_sample(X, Y)

    # Reference arrays live in the ``data`` directory next to this file.
    data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
    X_gt = np.load(os.path.join(data_dir, 'ros_x_05.npy'))
    y_gt = np.load(os.path.join(data_dir, 'ros_y_05.npy'))

    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
def data_prepare(self):
    """Prepare the training data, optionally balancing it.

    The dataset is imbalanced (clicked : not clicked is about 1 : 8), so
    when ``self.oversampling`` is ``'s'`` the training set is randomly
    over-sampled. Otherwise it is returned untouched.

    :return: ``(new_train_vec, new_train_label)``.
    """
    print('正在进行数据集准备...')
    if self.oversampling != 's':
        prepared = (self.train_vec, self.train_label)
    else:
        sampler = RandomOverSampler(random_state=0)
        prepared = sampler.fit_sample(self.train_vec, self.train_label)
    new_train_vec, new_train_label = prepared
    print('数据集准备完成...')
    return new_train_vec, new_train_label
def sample(self, X, Y):
    """Over-sample ``X``/``Y`` when ``self.with_sample`` is set.

    Samples are flattened to one row each for the sampler and restored to
    their original trailing shape afterwards. The trailing shape is taken
    from the input, so any number of dimensions is supported.

    :param X: array whose first axis indexes samples.
    :param Y: labels, one per sample.
    :return: ``(X, Y)``, resampled if ``self.with_sample`` is true and
        unchanged otherwise.
    :raises ValueError: if ``self.sample_method`` is not ``"ROS"`` or
        ``"SMOTE"`` (this used to surface as a ``NameError``).
    """
    if self.with_sample:
        # random_state fixes the seed so the draw is reproducible.
        if self.sample_method == "ROS":
            sampler = RandomOverSampler(random_state=0)
        elif self.sample_method == "SMOTE":
            sampler = SMOTE(random_state=0)
        else:
            raise ValueError(
                "unknown sample_method: %r" % (self.sample_method,))

        shape = X.shape
        X = X.reshape((shape[0], -1))
        X, Y = sampler.fit_sample(X, Y)
        X = X.reshape((-1,) + tuple(shape[1:]))
    return X, Y
def oversample(self):
    """Balance ``self.X``/``self.y`` by random over-sampling.

    Class counts are printed before and after. ``self.Xview`` is refreshed
    to the first ``self.n_features`` columns of the new ``self.X``. No seed
    is set, so results differ between runs; pass ``random_state`` to
    ``RandomOverSampler`` to make them repeatable.
    """
    print('Current outcome sampling {}'.format(Counter(self.y)))
    sampler = RandomOverSampler()
    resampled = sampler.fit_sample(self.X, self.y)
    self.X, self.y = resampled
    self.Xview = self.X.view()[:, :self.n_features]
    print('Resampled dataset shape {}'.format(Counter(self.y)))
def Rand_Over_Samp(self, X_t, Y_t):
    """Randomly over-sample the data and return it as a single DataFrame.

    The ``X_t``/``Y_t`` arguments are now used; they were previously
    ignored in favour of ``self.X_t``/``self.Y_t``. Passing ``None`` for
    either still falls back to the attribute of the same name.

    :param X_t: training features.
    :param Y_t: training target (single column).
    :return: DataFrame holding the over-sampled features followed by the
        target, labelled like the columns of the input data.
    """
    if X_t is None:
        X_t = self.X_t
    if Y_t is None:
        Y_t = self.Y_t

    X_train = pd.DataFrame(X_t)
    Y_train = pd.DataFrame(Y_t)
    # Column labels of features + target, re-applied to the result below.
    column_labels = list(pd.concat([X_train, Y_train], axis=1))

    sampler = ROS(random_state=42)
    sampled_X, sampled_Y = sampler.fit_sample(X_train, Y_train.values.ravel())

    combined = np.concatenate(
        [pd.DataFrame(sampled_X), pd.DataFrame(sampled_Y)], axis=1)
    data_for_modelling = pd.DataFrame(combined)
    data_for_modelling.columns = column_labels
    return data_for_modelling
def load_data(binary=False):
    """Load the Adult dataset, normalise it and balance the income classes.

    :param binary: read ``adult-b.csv`` and treat every feature as
        categorical; otherwise read ``adult.csv`` and treat ``age``,
        ``education-num`` and ``hours-per-week`` as quantitative.
    :return: ``(s_var, y_var, x, s, y, offset)`` -- the sex and income
        variable descriptors, the over-sampled features, sex column and
        income column, and a dict of offsets.
    """
    src_path = os.path.dirname(os.path.realpath(__file__))
    s_var = BinaryVariable(name=u'sex', pos=u'Male', neg=u'Female')
    y_var = BinaryVariable(name=u'income', pos=u'>50K', neg=u'<=50K')

    # Only the file and the type of the three numeric columns depend on
    # the mode.
    if binary:
        csv_name = '../data/adult/adult-b.csv'
        numeric_variable = CategoricalVariable
    else:
        csv_name = '../data/adult/adult.csv'
        numeric_variable = QuantitativeVariable
    df = pd.read_csv(os.path.join(src_path, csv_name))
    x_vars = [
        numeric_variable('age'),
        CategoricalVariable('workclass'),
        numeric_variable('education-num'),
        CategoricalVariable('marital-status'),
        CategoricalVariable('occupation'),
        CategoricalVariable('relationship'),
        CategoricalVariable('race'),
        numeric_variable('hours-per-week'),
        CategoricalVariable('native-country'),
    ]

    s = s_var.normalize(df[s_var.name])
    y = y_var.normalize(df[y_var.name])
    x = pd.DataFrame(data=None)
    for variable in x_vars:
        x = pd.concat([x, variable.normalize(x=df[variable.name])], axis=1)

    # Over-sample features and the sex column together; sex is the last
    # column and is split off again afterwards.
    xs = pd.concat([x, s], axis=1)
    sampler = RandomOverSampler(random_state=0)
    X_resampled, y_resampled = sampler.fit_sample(xs, y)
    x = pd.DataFrame(X_resampled[:, :-1])
    s = pd.Series(X_resampled[:, -1], name=s_var.name)
    y = pd.Series(y_resampled, name=y_var.name)

    offset = {
        'hinge-hinge-tau': [0.13, 3.27],
    }
    return s_var, y_var, x, s, y, offset
def minority_oversample(x, labels, ratio='auto', post_resamp_n=None):
    """Over-sample rows of ``x`` so that binned label values are balanced.

    ``labels`` is continuous, so it is first cut into bins whose width is
    ``2 * IQR / m ** (1/3)`` (``m`` = number of labels). The bin index of
    each sample is the class used for random over-sampling with
    replacement.

    :param x: data to resample; indexed along its first axis.
    :param labels: values of the dependent variable, shape ``(m,)``.
    :param ratio: passed to ``RandomOverSampler``.
    :param post_resamp_n: after over-sampling, uniformly subsample -- without
        replacement -- this many samples.
    :return: resampled ``x`` (``x`` itself when all labels fall in one
        point, i.e. there is nothing to bin).
    """
    binwidth = 2 * (iqr(labels)) / (labels.shape[0]**(1. / 3))
    miny, maxy = np.min(labels), np.max(labels)
    yrange = maxy - miny
    # np.linspace needs an integer count; np.ceil returns a float.
    nbins = int(np.ceil(yrange / (binwidth + 1.0e-10)))
    if nbins == 0:
        return x

    # Bin edges, padded slightly so min/max fall inside the outer bins.
    binsctrs = np.linspace((miny - binwidth / 2000.),
                           (maxy + binwidth / 2000.), num=nbins)
    bins = np.digitize(x=labels, bins=binsctrs)

    # Over-sample sample *indices* rather than the data itself: the single
    # "feature" handed to the sampler is each sample's row number.
    samp_idc_feat = np.array(list(range(labels.shape[0])))[..., None]

    # The sampler draws until every bin is as large as the biggest one.
    ros = RandomOverSampler(ratio)
    resamp_idc, res_bins = ros.fit_sample(X=samp_idc_feat, y=bins)

    if post_resamp_n is not None:
        assert post_resamp_n <= resamp_idc.shape[0]
        resamp_idc, _ = resample(resamp_idc, res_bins,
                                 n_samples=post_resamp_n, replace=False)

    # NOTE(review): resamp_idc appears to keep the (k, 1) shape of
    # samp_idc_feat, which would give the result an extra length-1 axis --
    # confirm callers expect that.
    return x[resamp_idc, ...]
def test_random_over_sampling_return_indices():
    """With ``return_indices`` every original sample index must be present."""
    sampler = RandomOverSampler(return_indices=True, random_state=RND_SEED)
    X_resampled, y_resampled, sample_indices = sampler.fit_sample(X, Y)

    expected_X = np.array([
        [0.04352327, -0.20515826],
        [0.92923648, 0.76103773],
        [0.20792588, 1.49407907],
        [0.47104475, 0.44386323],
        [0.22950086, 0.33367433],
        [0.15490546, 0.3130677],
        [0.09125309, -0.85409574],
        [0.12372842, 0.6536186],
        [0.13347175, 0.12167502],
        [0.094035, -2.55298982],
        [0.92923648, 0.76103773],
        [0.47104475, 0.44386323],
        [0.92923648, 0.76103773],
        [0.47104475, 0.44386323],
    ])
    expected_y = np.array([1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0])

    assert_allclose(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
    # The distinct indices drawn must be exactly 0 .. len(X) - 1.
    assert_array_equal(np.sort(np.unique(sample_indices)), np.arange(len(X)))
def get_data(self, data_files, setting, names):
    """
    Build the Data object from one data file or a train/test pair

    With two files, the one whose name contains 'train' (respectively
    'test') is looked for in the first position; otherwise the second file
    is used. Both are concatenated before encoding. Any other number of
    files prints an error and exits with status 1.

    :param data_files: the pathname of the data files
    :param setting: the Setting object
    :param names: the Names object
    :return: the Data object
    """
    file_count = len(data_files)
    if file_count == 1:
        X, y = self.get_X_y(data_files[0], names)
    elif file_count == 2:
        first, second = data_files[0], data_files[1]
        training_data_file = first if 'train' in first else second
        testing_data_file = first if 'test' in first else second

        X_train, y_train = self.get_X_y(training_data_file, names)
        X_test, y_test = self.get_X_y(testing_data_file, names)

        # Training and testing rows are handled as one dataset.
        X = pd.concat([X_train, X_test])
        y = pd.concat([y_train, y_test])
    else:
        print("Wrong number of data files!")
        exit(1)

    X, y = self.encode_X_y(X, y, setting, names)

    # Encoding may change the columns, so refresh the feature names.
    names.features = np.array(X.columns)
    X = X.values

    # Over-sample only when the classes differ in size, i.e. the per-class
    # counts are not all one and the same number.
    class_counts = np.unique(y, return_counts=True)[1]
    if len(np.unique(class_counts)) != 1:
        sampler = RandomOverSampler(random_state=setting.random_state)
        X, y = sampler.fit_sample(X, y)

    return Data.Data(X, y)
def handleImbalanceDataset(self, X, Y):
    """
    Method Name: handleImbalanceDataset
    Description: Balances the dataset by random over-sampling (unseeded).
    Output: The over-sampled features and labels as a tuple.
    On Failure: Exceptions from the sampler propagate to the caller.
    """
    sampler = RandomOverSampler()
    return sampler.fit_sample(X, Y)
def random_oversample(X, y, ratio='auto', random_state=None):
    """
    Over-sample the minority class by drawing samples at random.

    :param X: Feature data
    :param y: Class labels
    :param ratio: (string/float) Passed to ``RandomOverSampler``: the number
        of samples in the minority class over the number of samples in the
        majority class.
    :param random_state: (int) Seed used by the random number generator.
    :return: Re-sampled features and the matching class labels
    """
    sampler = RandomOverSampler(ratio=ratio, random_state=random_state)
    resampled = sampler.fit_sample(X, y)
    X_res, y_res = resampled
    return X_res, y_res
def test_train_split(self, X, y, share_train=0.8, stratify=None, balance=None, X_label=None):
    '''
    Create testing and training splits from the provided data and store
    them on the instance (``X_train``, ``y_train``, ``X_test``, ``y_test``).

    If balance is not None, the training part is balanced by random
    up- or down-sampling. Requires the imbalanced-learn library.

    :param X: feature DataFrame
    :param y: target Series; its name is stored as ``self.dependent``
    :param share_train: share of rows kept for training in each split
    :param stratify: passed to both ``train_test_split`` calls
    :param balance: ``'upsample'``, ``'downsample'`` or None
    :param X_label: stored as ``self.independent``; defaults to X's columns
    :return: None
    '''
    if X.shape[0] != y.shape[0]:
        # The message has no placeholders, so no extra arguments may be
        # passed: logging treats them as %-format arguments.
        logging.warning('X and Y are not the same length.')

    # set aside X_test and y_test so that test data is not upsample or downsample data
    X_train, self.X_test, y_train, self.y_test = train_test_split(
        X, y, test_size=(1. - share_train), stratify=stratify)

    self.dependent = y.name
    if X_label:
        self.independent = X_label
    else:
        self.independent = list(X.columns.values)
    self.balance = balance

    if balance == 'upsample':
        ros = RandomOverSampler()
        X_resample, y_resample = ros.fit_sample(X_train, y_train)
    elif balance == 'downsample':
        rus = RandomUnderSampler()
        X_resample, y_resample = rus.fit_sample(X_train, y_train)
    else:
        # NOTE(review): this branch continues from the full X/y rather than
        # the training part, so rows held out in self.X_test can reappear
        # in self.X_train -- confirm whether that is intended.
        X_resample = X
        y_resample = y

    # Second split: only the training share of the (re)sampled data is kept.
    self.X_train, X_test, self.y_train, y_test = train_test_split(
        X_resample, y_resample, test_size=(1. - share_train), stratify=stratify)
def Decision():
    """Train a decision tree on ``data.csv`` and export it to ``dt.dot``.

    Rows whose year column equals 7 are dropped, the classes are balanced
    by random over-sampling, and a tree is fitted on a random train/test
    split. Predictions and a classification report are printed.
    """
    print('------------决策树------------')
    # Read the input file.
    data1 = pd.read_csv('data.csv', encoding='GBK')
    data = data1.loc[data1["年份(年末)"] != 7]

    # Handle the class imbalance.
    ros = RandomOverSampler(random_state=0, sampling_strategy=1)
    X_resampled, y_resampled = ros.fit_sample(data.iloc[:, 3:15], data['是否高转送'])

    # Split the sample set into training and test parts.
    data_tr, data_te, label_tr, label_te = train_test_split(
        X_resampled, y_resampled)

    # Build the model.
    Model = DecisionTreeClassifier(
        max_depth=25,
        random_state=8,
        splitter='random',
        min_samples_split=3,
        min_samples_leaf=1,
    )
    # Train, then predict on the held-out part.
    Model.fit(data_tr, label_tr)
    dt_pre = Model.predict(data_te)
    print('预测结果为:', dt_pre)
    print('---------模型预测值与真实值比较------------')
    # Element-wise comparison of predictions with the true labels.
    print(dt_pre == label_te)

    # Classification report.
    dt_reports = classification_report(label_te, dt_pre)
    print('---------分类报告------------')
    print(dt_reports)

    # Decision tree visualisation.
    # NOTE(review): 13 feature names are listed while iloc[:, 3:15] selects
    # 12 columns -- confirm the list matches the training columns.
    dot_data = export_graphviz(Model,
                               feature_names=[
                                   '年份(年末)', '交易日平均价', '预增或预减', '超涨或超跌',
                                   '次新股', '每股资本公积(元/股)+每股未分配利润(元/股)',
                                   '每股现金流量净额(元/股)', '实收资本(或股本)',
                                   '每股收益(期末摊薄,元/股)', '每股净资产(元/股)',
                                   '营业总收入同必增长(%)', '近两年送转比例', '上市时间'
                               ],
                               class_names='是否高转送')
    # Save the graph to "dt.dot". When opening the dot file, add
    # "fontname = FangSong" to the node attributes, otherwise the Chinese
    # labels are rendered as garbage.
    # The with-block closes the file even if the write fails.
    with open('dt.dot', 'w') as f:
        f.write(dot_data)
    graph = graphviz.Source(dot_data)
def test_multiclass_fit_sample():
    """Every class of a three-class target must end up with 5 samples."""
    # Turn the target into a three-class one.
    y = Y.copy()
    for position in (5, 6):
        y[position] = 2

    sampler = RandomOverSampler(random_state=RND_SEED)
    X_resampled, y_resampled = sampler.fit_sample(X, y)

    class_sizes = Counter(y_resampled)
    assert_equal(class_sizes[0], 5)
    assert_equal(class_sizes[1], 5)
    assert_equal(class_sizes[2], 5)
def ros_rs(df):
    """Preprocess ``df`` and randomly over-sample its training part.

    :return: ``(X_train_ros, y_train_ros, X_test, y_test)``; the first two
        are DataFrames, the target one having a single ``depressed``
        column. The test part is returned as ``preprocess`` produced it.
    """
    train_df, test_df, X_train, y_train, X_test, y_test = preprocess(df)

    sampler = RandomOverSampler(random_state=SEED)
    X_sampled, y_sampled = sampler.fit_sample(X_train, y_train)

    # Label the resampled features with the training frame's keys.
    X_train_ros = pd.DataFrame(X_sampled)
    X_train_ros.columns = train_df.keys().tolist()
    y_train_ros = pd.DataFrame(y_sampled, columns=['depressed'])

    print(y_train_ros.depressed.value_counts())
    return X_train_ros, y_train_ros, X_test, y_test
def update_initial_train(iter_sampling, under_sampling, smote,
                         unmodified_train_X, unmodified_train_y, num_subsets):
    """Build the initial training set with the selected resampling method.

    The first flag that equals ``True`` wins, checked in this order:
    ``iter_sampling`` (random over-sampling), ``under_sampling``
    (``EasyEnsemble``), ``smote``. With no flag set, deep copies of the
    inputs are returned.

    :param num_subsets: number of subsets for ``EasyEnsemble``.
    :return: ``(initial_X_train, initial_y_train)``.
    """
    # "== True" is kept on purpose: truthy values other than True/1 must
    # not select a branch, exactly as before.
    if iter_sampling == True:  # noqa: E712
        # Parenthesised single-argument print works on Python 2 and 3.
        print("Oversampling in the active iteration list")
        sampler = RandomOverSampler()
        initial_X_train, initial_y_train = sampler.fit_sample(
            unmodified_train_X, unmodified_train_y)
    elif under_sampling == True:  # noqa: E712
        ee = EasyEnsemble(return_indices=True, replacement=True,
                          n_subsets=num_subsets)
        initial_X_train, initial_y_train, _ = ee.fit_sample(
            unmodified_train_X, unmodified_train_y)
    elif smote == True:  # noqa: E712
        sampler = SMOTE(k_neighbors=3)
        initial_X_train, initial_y_train = sampler.fit_sample(
            unmodified_train_X, unmodified_train_y)
    else:
        initial_X_train = copy.deepcopy(unmodified_train_X)
        initial_y_train = copy.deepcopy(unmodified_train_y)
    return initial_X_train, initial_y_train
def predict(ver, predict_ver, alike_metrics):
    """Train on ``ver``, evaluate on ``predict_ver`` and export AUC reports.

    For each of ``ITER`` rounds an 'ENS' predictor is trained twice: once
    on the plain product metrics and once on the randomly over-sampled
    merged metrics. Each round's report is fed to the analyzer; averaged
    reports are exported after the loop. Returns early (``None``) when no
    predictor of ``PRED_TYPE`` is available.

    :param ver: version used for training.
    :param predict_ver: version to predict.
    :param alike_metrics: unused here.
    """
    predictor_rep = PredictorRepository(predict_ver, ver)
    training_m = Metrics_Origin(ver, METRICS_DIR)
    evaluate_m = Metrics_Origin(predict_ver, METRICS_DIR)
    ens_analyzer = AUCAnalyzer(predict_ver, 'ENS', TARGET)

    for i in tqdm(range(ITER)):
        predictor = predictor_rep.get_predictor('ENS', PRED_TYPE)
        if predictor is None:
            print(' predictor has not found, type: ' + PRED_TYPE)
            return

        # NML MODEL: trained on the data as is (no resampling).
        # .values replaces as_matrix(), which newer pandas no longer has.
        X_resampled = training_m.product_df.values
        y_resampled = training_m.fault.values
        nml_model = predictor.train_model(X_resampled, y_resampled)
        ev_data, dv_data = evaluate_m.get_not_modified_df()
        nml_value, _ = predictor.predict_ensemble_test_data(
            nml_model, ev_data, dv_data, None)

        # RFN MODEL: trained on over-sampled merged metrics, with a fresh
        # random seed every round.
        sm = RandomOverSampler(ratio='auto',
                               random_state=random.randint(1, 100))
        X_resampled, y_resampled = sm.fit_sample(training_m.mrg_df,
                                                 training_m.fault)
        rfn_model = predictor.train_model(X_resampled, y_resampled)
        ev_data, dv_data = evaluate_m.get_modified_df()
        mrg_value, _ = predictor.predict_ensemble_test_data(
            rfn_model, ev_data, dv_data, None)

        predictor.set_is_new_df(evaluate_m.isNew)
        predictor.set_is_modified_df(evaluate_m.isModified)
        report_df = predictor.export_report(predict_ver)
        # The report may be None; it must not be subscripted before this
        # check (the CSV dump used to run ahead of it).
        if report_df is not None:
            report_df[REPORT_COLUMNS].to_csv('df.csv')
            ens_analyzer.set_report_df(report_df[REPORT_COLUMNS])
            ens_analyzer.calculate()
            ens_analyzer.analyze_predict_result()

    # export report
    ens_df = ens_analyzer.calculate_average(ITER)
    ens_analyzer.export(target_sw=TARGET, df=ens_df, predictor_type=PRED_TYPE)
    ens_df = ens_analyzer.calculate_num_report_averge(ITER)
    ens_analyzer.export_count_report(target_sw=TARGET, df=ens_df,
                                     predictor_type=PRED_TYPE)
def balance_dataset(self, X, Y):
    """Balance ``X``/``Y`` by under-sampling followed by over-sampling.

    The under-sampling ratio is the rounded-down average class count
    divided by the largest class count (both taken over ``self.classes``).
    The result is then randomly over-sampled with default settings.
    Intermediate statistics are printed.

    :return: ``(X_new, Y_new)``.
    """
    overSampler = RandomOverSampler()

    classCounts = Counter(Y)
    print('Original training dataset shape {}'.format(classCounts))

    # Smallest, largest and total count over the configured classes.
    minCount = maxCount = classCounts[self.classes[0]]
    total = 0
    for label in self.classes:
        count = classCounts[label]
        total += count
        minCount = min(minCount, count)
        maxCount = max(maxCount, count)
    avg = total // len(classCounts)

    print("Rounded-down average class count in training dataset: " + str(avg))
    print("minCount: " + str(minCount))
    print("maxCount: " + str(maxCount))
    rate = avg / float(maxCount)
    print("rate: " + str(rate))

    underSampler = RandomUnderSampler(ratio=rate)
    X_new, Y_new = underSampler.fit_sample(X, Y)
    print('Class counts after undersampling {}'.format(Counter(Y_new)))

    X_new, Y_new = overSampler.fit_sample(X_new, Y_new)
    return X_new, Y_new
def test_ros_fit_sample_half():
    """Ratio-0.5 over-sampling must reproduce the expected sample."""
    sampler = RandomOverSampler(ratio=0.5, random_state=RND_SEED)
    X_resampled, y_resampled = sampler.fit_sample(X, Y)

    expected_X = np.array([
        [0.04352327, -0.20515826],
        [0.20792588, 1.49407907],
        [0.22950086, 0.33367433],
        [0.15490546, 0.3130677],
        [0.09125309, -0.85409574],
        [0.12372842, 0.6536186],
        [0.094035, -2.55298982],
        [0.92923648, 0.76103773],
        [0.47104475, 0.44386323],
        [0.13347175, 0.12167502],
    ])
    expected_y = np.array([1, 1, 1, 1, 1, 1, 1, 0, 0, 0])

    assert_array_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
def test_multiclass_fit_sample():
    """Every class of a three-class target must end up with 3600 samples."""
    # Relabel the first 1000 samples to create a third class.
    y = Y.copy()
    y[0:1000] = 2

    sampler = RandomOverSampler(random_state=RND_SEED)
    X_resampled, y_resampled = sampler.fit_sample(X, y)

    class_sizes = Counter(y_resampled)
    assert_equal(class_sizes[0], 3600)
    assert_equal(class_sizes[1], 3600)
    assert_equal(class_sizes[2], 3600)
def balance_data(X, y):
    """Return ``X``/``y`` after random over-sampling with seed 0."""
    sampler = RandomOverSampler(random_state=0)
    resampled = sampler.fit_sample(X, y)
    X_resampled, y_resampled = resampled
    return X_resampled, y_resampled
# Report the dataset dimensions after the listwise drop.
(sample, vnum) = dataset.shape
print(sample, vnum)

# The last column is the dependent variable, so one column fewer is an
# independent variable.
vnum -= 1

# Separate independent variables (X) from the dependent variable (y).
values = dataset.values
X, y = values[:, 0:vnum], values[:, vnum]

# Balance the classes by random over-sampling.
ros = RandomOverSampler(random_state=0)
X_R, y_R = ros.fit_sample(X, y)

# Network: 12 -> 8 -> 1 units, sigmoid output.
model = Sequential()
model.add(Dense(12, input_dim=vnum, kernel_initializer='uniform',
                activation='relu'))
model.add(Dense(8, kernel_initializer='uniform', activation='relu'))
model.add(Dense(1, kernel_initializer='uniform', activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam',
              metrics=['accuracy'])

# Train on the over-sampled data ...
model.fit(X_R, y_R, epochs=150, batch_size=10, verbose=2)

# ... but predict on the original, unbalanced rows.
predictions = model.predict(X)
rounded = [round(x[0]) for x in predictions]
# Hold out a quarter of the train_val data as the validation set.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=0.25, random_state=42)

# ==========================================================================================
# Over-sampled data
# Build several over-sampled versions of the training set, one per method.
verbose = False
ratio = 'auto'

# Random over-sampling
OS = RandomOverSampler(ratio=ratio, verbose=verbose)
X_train_os, y_train_os = OS.fit_sample(X_train, y_train)

# Regular SMOTE
smote = SMOTE(ratio=ratio, verbose=verbose, kind='regular')
X_train_smo, y_train_smo = smote.fit_sample(X_train, y_train)

# SMOTE borderline 1
bsmote1 = SMOTE(ratio=ratio, verbose=verbose, kind='borderline1')
X_train_bs1, y_train_bs1 = bsmote1.fit_sample(X_train, y_train)

# SMOTE borderline 2
bsmote2 = SMOTE(ratio=ratio, verbose=verbose, kind='borderline2')
X_train_bs2, y_train_bs2 = bsmote2.fit_sample(X_train, y_train)

# SMOTE SVM
svm_args = {'class_weight': 'auto'}
# Imported once, ahead of the loop, instead of on every iteration.
from imblearn.over_sampling import RandomOverSampler

# For every feature on its own: one-hot encode it, up-sample the minority
# class and split the result into training and test parts.
# The prints take a single parenthesised argument each, so they produce the
# same output on Python 2 and are valid on Python 3.
for feature in list(data.columns):
    # one-hot encode the feature
    feature_data = data[[feature]]
    encoded_feature_data = pd.get_dummies(feature_data)
    print('\n')
    print(feature)
    print(feature_data.shape)
    print(encoded_feature_data.shape)
    print(y.shape)

    # upsample the minority class
    ros = RandomOverSampler(ratio=0.5)
    X_resampled, y_resampled = ros.fit_sample(encoded_feature_data, y)
    print('\n')
    print(X_resampled.shape)
    print(y_resampled.shape)

    # create train and test split
    X_train, X_test, y_train, y_test = train_test_split(
        X_resampled, y_resampled, random_state=0, test_size=0.2)
    print('\n')
    print('Training data')
    print(X_train.shape)
    print(y_train.shape)
    print('Testing data')
    print(X_test.shape)
import csv
import os
import sys


def output_path_for(input_path):
    """Return the output path: the input path with a trailing ``.csv`` replaced by ``-ro-.csv``.

    Only a real ``.csv`` suffix is stripped, so a ``.csv`` occurring earlier in
    the path (for example in a directory name) no longer truncates the result.
    A path without the suffix simply gets ``-ro-.csv`` appended.
    """
    suffix = ".csv"
    stem = input_path[:-len(suffix)] if input_path.endswith(suffix) else input_path
    return stem + "-ro-.csv"


def read_dataset(lines):
    """Parse CSV text lines into ``(features, labels)``.

    The first row is treated as a header and skipped, and blank rows are
    ignored. In every other row all cells but the last are converted to
    ``int``; the last cell is kept verbatim as the label.

    Raises:
        ValueError: if a feature cell is not an integer literal.
    """
    reader = csv.reader(lines, delimiter=',')
    next(reader, None)  # drop the header row (no-op on empty input)
    features, labels = [], []
    for row in reader:
        if not row:
            continue
        features.append([int(cell) for cell in row[:-1]])
        labels.append(row[-1])
    return features, labels


def build_row(features, label):
    """Return one output row: the feature values followed by the label as ONE cell.

    Wrapping the label in a list (instead of calling ``list(label)``) keeps a
    multi-character label such as ``"10"`` or ``"-1"`` from being split into
    one cell per character.
    """
    return list(features) + [label]


def main():
    """Over-sample the CSV named on the command line and write ``<name>-ro-.csv``.

    The header row of the input is not copied to the output.
    """
    if len(sys.argv) < 2:
        sys.exit("usage: %s <input.csv>" % sys.argv[0])

    # Imported here so the helpers above stay importable without imbalanced-learn.
    from imblearn.over_sampling import RandomOverSampler

    input_csv_file = sys.argv[1]
    with open(input_csv_file, newline="") as input_file:
        X, y = read_dataset(input_file)

    ros = RandomOverSampler()
    # Newer imbalanced-learn releases expose fit_resample; older ones only fit_sample.
    resample = getattr(ros, "fit_resample", None) or ros.fit_sample
    X_res, y_res = resample(X, y)
    print(len(X_res))
    print(len(y_res))

    # The output file is opened only after resampling succeeded, so a failed
    # run does not leave an empty file behind.
    with open(output_path_for(input_csv_file), 'w', newline='') as output_file:
        writer = csv.writer(output_file, delimiter=',')
        writer.writerows(build_row(s, label) for s, label in zip(X_res, y_res))


if __name__ == "__main__":
    main()
# Generate the dataset X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, n_features=20, n_clusters_per_class=1, n_samples=5000, random_state=10) # Instanciate a PCA object for the sake of easy visualisation pca = PCA(n_components=2) # Fit and transform x to visualise inside a 2D feature space X_vis = pca.fit_transform(X) # Apply the random over-sampling ros = RandomOverSampler() X_resampled, y_resampled = ros.fit_sample(X, y) X_res_vis = pca.transform(X_resampled) # Two subplots, unpack the axes array immediately f, (ax1, ax2) = plt.subplots(1, 2) ax1.scatter(X_vis[y == 0, 0], X_vis[y == 0, 1], label="Class #0", alpha=0.5, edgecolor=almost_black, facecolor=palette[0], linewidth=0.15) ax1.scatter(X_vis[y == 1, 0], X_vis[y == 1, 1], label="Class #1", alpha=0.5, edgecolor=almost_black, facecolor=palette[2], linewidth=0.15) ax1.set_title('Original set') ax2.scatter(X_res_vis[y_resampled == 0, 0], X_res_vis[y_resampled == 0, 1], label="Class #0", alpha=.5, edgecolor=almost_black, facecolor=palette[0], linewidth=0.15) ax2.scatter(X_res_vis[y_resampled == 1, 0], X_res_vis[y_resampled == 1, 1],
def runns(resp_var, size_of_test_data, dataset, positive_class, n_estimators,
          important_features, dealing_with_nulls):
    """Train an RBF-kernel SVC on the top-ranked predictors and return its metrics.

    Steps: drop rows whose response is null, pass the rest through
    ``deal_with_nulls``, one-hot encode the predictors, balance the classes by
    random over-sampling, rank the predictors with a random forest, keep the
    ``important_features`` best ones, then fit and score an SVC on a
    train/test split. The metrics assume a binary response.

    Args:
        resp_var: Name of the response column.
        size_of_test_data: Passed to ``train_test_split`` as ``test_size``.
        dataset: DataFrame with predictors and response. Any other value
            falls back to loading ``raw_data.csv``, which is what every call
            used to do regardless of this argument.
        positive_class: Label the metrics treat as the positive class.
        n_estimators: Number of trees in the ranking random forest.
        important_features: Number of top-ranked predictors the SVC is
            trained on; ``None`` or ``0`` keeps all of them.
        dealing_with_nulls: Forwarded to ``deal_with_nulls``.

    Returns:
        The ``jsonify`` response ``{"Predictions": <JSON string of metrics>}``.

    Raises:
        ValueError: if ``positive_class`` is not among the fitted classes.
    """
    # Use the caller's data; previously a debug read_csv always replaced it.
    if not isinstance(dataset, pd.DataFrame):
        dataset = pd.read_csv('raw_data.csv', low_memory=False)

    # ---- DATA PREPROCESSING
    # remove the rows in which the response is null, then handle other nulls
    dataset = dataset.dropna(subset=[resp_var])
    dataset = deal_with_nulls(dealing_with_nulls, dataset)

    # ---- FEATURE SELECTION
    # transform categorical predictors to dummy variables
    predictors = pd.get_dummies(dataset.drop(resp_var, axis=1))
    predictor_names = list(predictors.columns)

    # balance the classes in the response variable
    # NOTE(review): over-sampling precedes the train/test split below, so
    # duplicated rows can land on both sides of it -- confirm this is intended.
    ros = RandomOverSampler(random_state=0)
    # Newer imbalanced-learn releases expose fit_resample; older ones only fit_sample.
    resample = getattr(ros, "fit_resample", None) or ros.fit_sample
    prds, resp = resample(predictors, dataset[resp_var])

    # rank the predictors with a random forest, most important first
    rf_clf = RandomForestClassifier(n_estimators=n_estimators)
    rf_clf.fit(prds, resp)
    feature_imp = pd.Series(rf_clf.feature_importances_,
                            index=predictor_names).sort_values(ascending=False)

    # Keep only the top-ranked predictors. The names were computed before but
    # never applied, so the SVC was trained on every predictor.
    top_n = important_features or None
    important_predictor_names = list(feature_imp.index[:top_n])
    features = pd.DataFrame(prds, columns=predictor_names)[important_predictor_names]
    labels = np.asarray(resp).ravel()

    # ---- MODEL TRAINING
    # The predictors are already one-hot encoded, so no second get_dummies pass.
    train_features, test_features, train_labels, test_labels = train_test_split(
        np.array(features), labels, test_size=size_of_test_data, random_state=402)

    clf = SVC(kernel='rbf', probability=True)
    clf.fit(train_features, train_labels)

    # ---- EVALUATION
    predicted = clf.predict(test_features)
    pred_prob = clf.predict_proba(test_features)

    # predict_proba columns follow clf.classes_; pick the positive class column
    # instead of assuming it is column 1.
    classes = list(clf.classes_)
    if positive_class not in classes:
        raise ValueError("positive_class %r is not one of the fitted classes %r"
                         % (positive_class, classes))
    positive_prob = pred_prob[:, classes.index(positive_class)]
    # 0/1 ground truth for the score-based metrics, independent of label type
    is_positive = (np.asarray(test_labels) == positive_class).astype(int)

    accuracy = accuracy_score(test_labels, predicted)
    precision = precision_score(test_labels, predicted, pos_label=positive_class)
    avg_precision = average_precision_score(is_positive, positive_prob)
    rec = recall_score(test_labels, predicted, pos_label=positive_class)
    fscore = f1_score(test_labels, predicted, pos_label=positive_class)
    # pos_label added so fbeta agrees with precision / recall / f1
    fbeta = fbeta_score(test_labels, predicted, beta=0.5, pos_label=positive_class)
    hamming = hamming_loss(test_labels, predicted)
    # NOTE(review): jaccard_similarity_score is absent from newer scikit-learn
    # releases -- confirm the pinned version.
    jaccard = jaccard_similarity_score(test_labels, predicted)
    # log loss is defined on probabilities, not on hard class predictions
    logloss = log_loss(test_labels, pred_prob, labels=clf.classes_)
    zero_one = zero_one_loss(test_labels, predicted)
    # For a binary response this equals the AUC taken on the other class's column.
    area_under_roc = roc_auc_score(is_positive, positive_prob)
    cohen = cohen_kappa_score(test_labels, predicted)
    mathews = matthews_corrcoef(test_labels, predicted)

    output = {"accuracy": accuracy, "precision": precision,
              "average precision": avg_precision, "recall": rec,
              "fscore": fscore, "fbeta": fbeta, "hamming": hamming,
              "jaccard": jaccard, "logloss": logloss, "zero_one": zero_one,
              "area_under_roc": area_under_roc, "cohen": cohen,
              "mathews": mathews}
    # The metrics are JSON-encoded first, so the response carries them as a
    # JSON string under "Predictions" (existing wire format, left unchanged).
    output = json.dumps(output)
    return jsonify({"Predictions": output})