def train_test(self, data_type=None, test_size=0.20, random_state=None):
    data_xy = self._get_data(data_type)
    x_train = data_xy.x
    y_train = data_xy.y
    x_test = pd.DataFrame(data=None, columns=x_train.columns)
    y_test = pd.Series(data=None, name=data_xy.y.name)
    if test_size > 0:
        x_train, x_test, y_train, y_test = train_test_split(
            data_xy.x, data_xy.y, stratify=data_xy.y,
            random_state=random_state, test_size=test_size)
    ros = RandomUnderSampler(random_state=random_state)
    # ros = RandomOverSampler(random_state=random_state)
    # ros = SMOTE()
    ros.fit(x_train, y_train)
    x_train2, y_train2 = ros.fit_resample(x_train, y_train)
    x_train = pd.DataFrame(x_train2, columns=x_train.columns)
    y_train = pd.Series(y_train2, name=y_train.name)
    train = DataXyz(data_xy.code, x_train, y_train)
    train = self._get_data3(train)
    test = DataXyz(data_xy.code, x_test, y_test)
    test = self._get_data3(test)
    return (train, test)
class ImbalancedClassResampler():
    SMOTE = "SMOTE"
    RANDOM_UNDERSAMPLE = "RANDOM_UNDERSAMPLE"

    def __init__(self, method=None, n_process=1):
        self.method = method
        self.n_process = n_process
        self.resampler = None

    def fit(self, x, y):
        if self.method is None:
            return self
        if self.method == ImbalancedClassResampler.SMOTE:
            self.resampler = SMOTE(n_jobs=self.n_process)
        elif self.method == ImbalancedClassResampler.RANDOM_UNDERSAMPLE:
            self.resampler = RandomUnderSampler()
        return self

    def get_params(self, deep=True):
        return {"method": self.method}

    def set_params(self, method):
        self.method = method
        return self

    def resample(self, x, y):
        # Pass-through when no resampling method is configured
        if self.method is None:
            return x, y
        return self.resampler.fit_resample(x, y)

    def fit_resample(self, x, y):
        self.fit(x, y)
        return self.resample(x, y)
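# Illustrative usage sketch for the ImbalancedClassResampler wrapper above.
# Assumes `x` (feature matrix) and `y` (label vector) are already loaded; the
# variable names are hypothetical and only meant to show the intended call pattern.
resampler = ImbalancedClassResampler(method=ImbalancedClassResampler.RANDOM_UNDERSAMPLE)
x_balanced, y_balanced = resampler.fit_resample(x, y)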
def balance_classes(X_train, y_train):
    print('UnderSample Data - Balance Classes')
    rus = RandomUnderSampler(random_state=42)
    rus.fit(X_train, y_train)
    X_train, y_train = rus.sample(X_train, y_train)
    print('After Balancing the new size is {0}'.format(len(X_train)))
    return X_train, y_train
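# Illustrative call of balance_classes(); assumes X_train and y_train are an existing
# imbalanced training split (e.g. produced by sklearn's train_test_split). It relies on
# the fit()/sample() interface of older imbalanced-learn releases, as used above.
X_train_bal, y_train_bal = balance_classes(X_train, y_train)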
def test_rus_sample_wrong_X():
    """Test that an error is raised when X at sampling differs from X at fitting"""
    # Create the object
    rus = RandomUnderSampler(random_state=RND_SEED)
    rus.fit(X, Y)
    assert_raises(RuntimeError, rus.sample,
                  np.random.random((100, 40)),
                  np.array([0] * 50 + [1] * 50))
def test_rus_fit():
    """Test the fitting method"""
    # Create the object
    rus = RandomUnderSampler(random_state=RND_SEED)
    # Fit the data
    rus.fit(X, Y)
    # Check that the class statistics have been computed
    assert_equal(rus.min_c_, 0)
    assert_equal(rus.maj_c_, 1)
    assert_equal(rus.stats_c_[0], 500)
    assert_equal(rus.stats_c_[1], 4500)
def test_rus_fit():
    """Test the fitting method"""
    # Create the object
    rus = RandomUnderSampler(random_state=RND_SEED)
    # Fit the data
    rus.fit(X, Y)
    # Check that the class statistics have been computed
    assert_equal(rus.min_c_, 0)
    assert_equal(rus.maj_c_, 1)
    assert_equal(rus.stats_c_[0], 3)
    assert_equal(rus.stats_c_[1], 7)
class RatioRandomUnderSampler(RandomUnderSampler):
    def __init__(self, pos_ratio, random_state=0):
        self.pos_ratio = pos_ratio
        self.ratio_sampler = None
        super(RatioRandomUnderSampler, self).__init__(random_state=random_state)

    def fit(self, X, y):
        # Derive the number of majority (class 0) samples to keep from the desired positive ratio
        pos = len(y[y == 1])
        neg = int(pos * ((1 - self.pos_ratio) / self.pos_ratio))
        self.ratio_sampler = RandomUnderSampler(random_state=self.random_state,
                                                ratio={0: neg, 1: pos})
        self.ratio_sampler.fit(X, y)
        return self

    def sample(self, X, y):
        return self.ratio_sampler.sample(X, y)
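# Illustrative usage of RatioRandomUnderSampler: with pos_ratio=0.25 the fitted sampler
# keeps every positive sample and roughly three negatives per positive. X_train and
# y_train are assumed to be an existing binary-labelled (0/1) training set, and the
# fit()/sample() calls follow the older imbalanced-learn interface used above.
ratio_rus = RatioRandomUnderSampler(pos_ratio=0.25, random_state=0)
ratio_rus.fit(X_train, y_train)
X_bal, y_bal = ratio_rus.sample(X_train, y_train)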
def main():
    # Directory containing the original dataset
    new_dir = os.path.join(directory, 'original')
    # Iterate through each of the 7 project files
    for k in range(1, 8):
        filename = os.path.join(new_dir, str(k) + '_original.csv')
        print('\n\t\t\t' + filename)
        # Read files and get the data
        data, target, minor = read_file(filename)
        # Get the new undersampled size for the majority class
        ratio = round((len(data) - minor) * 0.1)
        print('total: ' + str(len(data)) + ' minor: ' + str(minor))
        print('ratio: ' + str(ratio))
        ### Important to finish significance test for this part! ###
        # Repeat this 10 times, get 10 random datasets
        for c in ascii_lowercase:
            if c == 'k':
                break
            print(c)
            # Fit the training data for the undersampled size and get new resampled training set
            rus = RandomUnderSampler(random_state=None, ratio={0: ratio})
            rus.fit(data, target)
            X_resampled, y_resampled = rus.sample(data, target)
            print('resampled total: ' + str(len(X_resampled)))
            # Print number of buggy and number of non buggy
            bug = len([y for y in y_resampled if y == 1])
            not_bug = len(y_resampled) - bug
            print('not buggy: ' + str(not_bug) + ' buggy: ' + str(bug))
            # Store the new dataset in a new file
            new_filename = os.path.join(
                directory + '/random_sampled/files',
                str(k) + '_random_sampled_' + c + '.csv')
            with open(new_filename, 'w') as f:
                writer = csv.writer(f)
                for d, t in zip(X_resampled, y_resampled):
                    instance = np.append(d, t)
                    writer.writerow(instance)
def main():
    # Directory containing the original dataset
    new_dir = os.path.join(directory, 'original')
    # Iterate through each of the 7 project files
    for k in range(1, 8):
        filename = os.path.join(new_dir, str(k) + '_original.csv')
        print('\n\t\t\t' + filename)
        # Read files and get the data
        data, target, minor = read_file(filename)
        # Get the new undersampled size for the majority class
        ratio = round((len(data) - minor) * 0.1)
        print('total: ' + str(len(data)) + ' minor: ' + str(minor))
        # print ('ratio: '+str(ratio))
        # Fit the training data for the undersampled size and get new resampled training set
        rus = RandomUnderSampler(random_state=RANDOM_STATE, ratio={0: minor})
        rus.fit(data, target)
        X_resampled, y_resampled = rus.sample(data, target)
        print('resampled total: ' + str(len(X_resampled)))
        # Print number of buggy and number of non buggy
        bug = len([y for y in y_resampled if y == 1])
        not_bug = len(y_resampled) - bug
        print('not buggy: ' + str(not_bug) + ' buggy: ' + str(bug))
        # Store the new dataset in a new file
        new_filename = os.path.join(directory + '/down_sampled',
                                    str(k) + '_down_sampled.csv')
        with open(new_filename, 'w') as f:
            writer = csv.writer(f)
            for d, t in zip(X_resampled, y_resampled):
                instance = np.append(d, t)
                writer.writerow(instance)
            fmt='d', xticklabels=category_id_df.category.values,
            yticklabels=category_id_df.category.values)
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.show()

print(metrics.classification_report(y_test, y_pred,
                                    target_names=data['category'].unique()))

# Resampling/Undersampling
rus = RandomUnderSampler(ratio=.8, random_state=0)
rus.fit(features, labels)
X_resampled, y_resampled = rus.sample(features, labels)
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled,
                                                    test_size=0.1,
                                                    random_state=0)
clf = model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(accuracy)
conf_mat = confusion_matrix(y_test, y_pred)
Percentage_of_no_promotion = Count_no_promotion / (Count_no_promotion + Count_promotion)
print("percentage of no promotion is", Percentage_of_no_promotion * 100)
Percentage_of_promotion = Count_promotion / (Count_no_promotion + Count_promotion)
print("percentage of promotion is", Percentage_of_promotion * 100)

# In[15]:

# Data Slice
X = data.iloc[:, 1:13]
Y = data.iloc[:, 13:]

# In[12]:

# ReSampling - Under Sampling
from imblearn.under_sampling import RandomUnderSampler

# In[16]:

rus = RandomUnderSampler(random_state=0)
rus.fit(X, Y)
X_resampled, y_resampled = rus.sample(X, Y)
Y = pd.DataFrame(y_resampled, columns=['is_promoted'])
X = pd.DataFrame(X_resampled, columns=[
    'department', 'region', 'education', 'gender', 'recruitment_channel',
    'no_of_trainings', 'age', 'previous_year_rating', 'length_of_service',
    'KPIs_met >80%', 'awards_won?', 'avg_training_score'
])

# In[18]:

from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
X.department = label_encoder.fit_transform(X.department)
X.region = label_encoder.fit_transform(X.region)
X.education = label_encoder.fit_transform(X.education)
all_auc_with_clustered_trees = []
all_auc_with_one_tree = []
X_train_major = np.zeros((0, 1294))
y_train_major = []
avg_roc = 0
avg_aupr = 0

for train_index, test_index in skf.split(X, y):
    X_train = X[train_index]
    X_test = X[test_index]
    y_train = y[train_index]
    y_test = y[test_index]

    # Identify the majority class from the sampler's per-class statistics
    class_stats = sampler.fit(X_train, y_train).stats_c_
    major_class = max(class_stats, key=class_stats.get)

    # Split the training fold into majority-class and minority-class subsets
    major_class_X_train = []
    major_class_y_train = []
    minor_class_X_train = []
    minor_class_y_train = []
    for index in range(len(X_train)):
        if y_train[index] == major_class:
            major_class_X_train.append(X_train[index])
            major_class_y_train.append(y_train[index])
        else:
            minor_class_X_train.append(X_train[index])
            minor_class_y_train.append(y_train[index])
def create_model(dataset):
    print("dataset : ", dataset)

    df = pd.read_csv('/home/farshid/Desktop/' + dataset, header=None)
    print('reading', dataset)

    df['label'] = df[df.shape[1] - 1]
    # df.drop([df.shape[1] - 2], axis=1, inplace=True)

    labelencoder = LabelEncoder()
    df['label'] = labelencoder.fit_transform(df['label'])

    X = np.array(df.drop(['label'], axis=1))
    y = np.array(df['label'])

    number_of_clusters = 23
    sampler = RandomUnderSampler()

    normalization_object = Normalizer()
    X = normalization_object.fit_transform(X)

    skf = StratifiedKFold(n_splits=5, shuffle=True)
    n_classes = 2

    # Keep only the first fold as the train/test split
    for train_index, test_index in skf.split(X, y):
        X_train = X[train_index]
        X_test = X[test_index]
        y_train = y[train_index]
        y_test = y[test_index]
        break

    print('training', dataset)

    top_roc = 0
    depth_for_rus = 0
    split_for_rus = 0
    for depth in range(3, 20, 20):
        for split in range(3, 9, 20):
            classifier = AdaBoostClassifier(
                DecisionTreeClassifier(max_depth=depth, min_samples_split=split),
                n_estimators=100,
                learning_rate=1,
                algorithm='SAMME')
            # Random undersampling of the training fold
            X_train, y_train = sampler.fit_sample(X_train, y_train)
            classifier.fit(X_train, y_train)
            predictions = classifier.predict_proba(X_test)

            score = roc_auc_score(y_test, predictions[:, 1])
            if top_roc < score:
                top_roc = score
                tpr = dict()
                fpr = dict()
                roc = dict()
                for i in range(n_classes):
                    fpr[i], tpr[i], _ = roc_curve(y_test, predictions[:, i])
                    roc[i] = roc_auc_score(y_test, predictions[:, i])

    # Identify the majority class from the sampler's per-class statistics
    class_stats = sampler.fit(X_train, y_train).stats_c_
    major_class = max(class_stats, key=class_stats.get)

    major_class_X_train = []
    major_class_y_train = []
    minor_class_X_train = []
    minor_class_y_train = []
    for index in range(len(X_train)):
        if y_train[index] == major_class:
            major_class_X_train.append(X_train[index])
            major_class_y_train.append(y_train[index])
        else:
            minor_class_X_train.append(X_train[index])
            minor_class_y_train.append(y_train[index])

    # optimize for number of clusters here
    kmeans = KMeans(max_iter=200, n_jobs=4, n_clusters=number_of_clusters)
    kmeans.fit(major_class_X_train)

    # get the centroids of each of the clusters
    cluster_centroids = kmeans.cluster_centers_

    # get the points under each cluster
    points_under_each_cluster = {
        i: np.where(kmeans.labels_ == i)[0]
        for i in range(kmeans.n_clusters)
    }

    # Keep a random half of each majority-class cluster
    X_train_major = np.zeros((0, X_train.shape[1]))
    y_train_major = np.zeros(0)
    for i in range(number_of_clusters):
        size = len(points_under_each_cluster[i])
        random_indexes = np.random.randint(low=0, high=size, size=int(size / 2))
        temp = points_under_each_cluster[i]
        feature_indexes = temp[random_indexes]
        X_train_major = np.concatenate(
            (X_train_major, X_train[feature_indexes]), axis=0)
        y_train_major = np.concatenate(
            (y_train_major, y_train[feature_indexes]), axis=0)

    final_train_x = np.concatenate((X_train_major, minor_class_X_train), axis=0)
    final_train_y = np.concatenate((y_train_major, minor_class_y_train), axis=0)

    classifier = AdaBoostClassifier(DecisionTreeClassifier(max_depth=150))
    # classifier = sklearn.svm.SVC(C=50 , gamma= .0008 , kernel='rbf', probability=True)
    # classifier = sklearn.svm.SVC(C=100, gamma=.006, kernel='rbf', probability=True)
    classifier.fit(final_train_x, final_train_y)
    predicted = classifier.predict_proba(X_test)

    tpr_c = dict()
    fpr_c = dict()
    roc_c = dict()
    for i in range(n_classes):
        fpr_c[i], tpr_c[i], _ = roc_curve(y_test, predicted[:, i])
        roc_c[i] = auc(fpr_c[i], tpr_c[i])

    print('plotting', dataset)
    # plt.clf()
    plt.plot(fpr[1], tpr[1], lw=2, color='red',
             label='Roc curve: random under sampling')
    plt.plot(fpr_c[1], tpr_c[1], lw=2, color='navy',
             label='Roc curve: Clustered sampling')
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.0])
plt.title('Area under ROC curve')
plt.legend(loc="lower right")
plt.show()
plot(hist_iphone_smote)

galaxy_smote, gsent_smote = smote.fit_sample(galaxy_cor_3v.iloc[:, 0:45],
                                             galaxy_cor_3v['galaxysentiment'])
galaxy_smote_complete = pd.DataFrame(galaxy_smote,
                                     columns=list(galaxy_cor_3v.iloc[:, 0:45].columns))
galaxy_smote_complete['galaxysentiment'] = gsent_smote
galaxy_smote_complete['galaxysentiment'].unique()
hist_galaxy_smote = px.histogram(galaxy_smote_complete, x='galaxysentiment')
plot(hist_galaxy_smote)

### Under sampling

# Random under sampler
rus = RandomUnderSampler(random_state=0)  # , ratio={0: 30, 1: 20, 2: 60}
rus.fit(iphone_corr.iloc[:, 0:46], iphone_corr['iphonesentiment'])
iphone_resampled_under, isent_resampled_under = rus.sample(
    iphone_corr.iloc[:, 0:46], iphone_corr['iphonesentiment'])
iphone_resampled_complete_under = pd.DataFrame(iphone_resampled_under)
iphone_resampled_complete_under['iphonesentiment'] = isent_resampled_under
hist_iphone_resampled_under = px.histogram(iphone_resampled_complete_under,
                                           x='iphonesentiment')
plot(hist_iphone_resampled_under)

rus.fit(galaxy_corr.iloc[:, 0:45], galaxy_corr['galaxysentiment'])
galaxy_resampled_under, gsent_resampled_under = rus.sample(
    galaxy_corr.iloc[:, 0:45], galaxy_corr['galaxysentiment'])
galaxy_resampled_complete_under = pd.DataFrame(galaxy_resampled_under)
galaxy_resampled_complete_under['galaxysentiment'] = gsent_resampled_under
hist_galaxy_resampled_under = px.histogram(galaxy_resampled_complete_under,
                                           x='galaxysentiment')