def balance_classes(X_train, y_train):
    """Under-sample the training data with a default ``RandomUnderSampler``.

    A banner is printed, a ``RandomUnderSampler`` seeded with
    ``random_state=42`` is fitted on the given data, the data is resampled
    through it, and the number of rows kept is printed.

    Args:
        X_train: Training features.
        y_train: Training labels aligned with ``X_train``.

    Returns:
        A ``(features, labels)`` tuple with the resampled data.
    """
    print('UnderSample Data - Balance Classes')

    sampler = RandomUnderSampler(random_state=42)
    sampler.fit(X_train, y_train)
    balanced_X, balanced_y = sampler.sample(X_train, y_train)

    print('After Balancing the new size is {0}'.format(len(balanced_X)))
    return balanced_X, balanced_y
class RatioRandomUnderSampler(RandomUnderSampler):
    """Under-sampler that targets a given share of positive (label ``1``) rows.

    ``fit`` counts the rows labelled ``1`` and derives how many rows labelled
    ``0`` are needed so that positives make up roughly ``pos_ratio`` of the
    result. The actual resampling is delegated to an inner
    ``RandomUnderSampler`` configured with those per-class counts.

    Args:
        pos_ratio: Desired fraction of positive rows, in the range ``(0, 1]``.
        random_state: Seed forwarded to both this sampler and the inner one.
    """

    def __init__(self, pos_ratio, random_state=0):
        self.pos_ratio = pos_ratio
        # Built lazily in fit(); stays None until then.
        self.ratio_sampler = None
        super(RatioRandomUnderSampler, self).__init__(random_state=random_state)

    def fit(self, X, y):
        """Compute the per-class targets from ``y`` and fit the inner sampler.

        Args:
            X: Feature matrix.
            y: Label array; must support boolean-mask indexing (``y[y == 1]``).

        Returns:
            ``self``, to allow call chaining.

        Raises:
            ValueError: If ``pos_ratio`` is not in ``(0, 1]``.
        """
        # pos_ratio == 0 would divide by zero below, and pos_ratio > 1 would
        # yield a negative class count; reject both up front with a clear error.
        if not 0 < self.pos_ratio <= 1:
            raise ValueError(
                'pos_ratio must be in the range (0, 1], got {0!r}'.format(
                    self.pos_ratio))

        pos = len(y[y == 1])
        # neg / (neg + pos) == 1 - pos_ratio  =>  neg = pos * (1 - r) / r.
        # int() truncates toward zero, so the positive share is never below r.
        neg = int(pos * ((1 - self.pos_ratio) / self.pos_ratio))

        self.ratio_sampler = RandomUnderSampler(
            random_state=self.random_state, ratio={0: neg, 1: pos})
        self.ratio_sampler.fit(X, y)
        return self

    def sample(self, X, y):
        """Resample ``X``/``y`` through the inner sampler built by ``fit``.

        Raises:
            AttributeError: If called before ``fit``.
        """
        if self.ratio_sampler is None:
            # Same exception type the unguarded None access produced, but
            # with a message that explains what went wrong.
            raise AttributeError(
                'RatioRandomUnderSampler.sample() called before fit()')
        return self.ratio_sampler.sample(X, y)
def main():
    """Write ten randomly under-sampled variants of each original dataset.

    For every project file ``1_original.csv`` .. ``7_original.csv`` under
    ``<directory>/original`` the data is read with ``read_file``, the target
    size for class ``0`` is set to 10% of the non-minority row count, and ten
    independent random under-samplings (suffixes ``a`` .. ``j``) are written
    to ``<directory>/random_sampled/files``.
    """
    # Where the original datasets live and where the samples are written.
    source_dir = os.path.join(directory, 'original')
    output_dir = directory + '/random_sampled/files'

    # One pass per project file (1..7).
    for k in range(1, 8):
        filename = os.path.join(source_dir, str(k) + '_original.csv')
        print('\n\t\t\t' + filename)

        data, target, minor = read_file(filename)

        # Target size for the majority class after under-sampling.
        ratio = round((len(data) - minor) * 0.1)
        print('total: ' + str(len(data)) + ' minor: ' + str(minor))
        print('ratio: ' + str(ratio))

        ### Important to finish significance test for this part! ###
        # Ten repetitions, labelled 'a' through 'j' (everything before 'k').
        for c in ascii_lowercase[:10]:
            print(c)

            # Unseeded on purpose: each repetition draws a different subset.
            sampler = RandomUnderSampler(random_state=None, ratio={0: ratio})
            sampler.fit(data, target)
            X_resampled, y_resampled = sampler.sample(data, target)
            print('resampled total: ' + str(len(X_resampled)))

            # Class breakdown of the resampled set.
            bug = sum(1 for label in y_resampled if label == 1)
            not_bug = len(y_resampled) - bug
            print('not buggy: ' + str(not_bug) + ' buggy: ' + str(bug))

            # Persist one row per instance: the features followed by the label.
            new_filename = os.path.join(
                output_dir, str(k) + '_random_sampled_' + c + '.csv')
            # NOTE(review): the csv docs recommend newline='' on Python 3;
            # left as-is so the written files stay identical.
            with open(new_filename, 'w') as f:
                csv.writer(f).writerows(
                    np.append(d, t) for d, t in zip(X_resampled, y_resampled))
def main():
    """Write one down-sampled copy of each original dataset.

    For every project file ``1_original.csv`` .. ``7_original.csv`` under
    ``<directory>/original`` the data is read with ``read_file``, class ``0``
    is randomly under-sampled to ``minor`` rows (the minority count returned
    by ``read_file``), and the result is written to
    ``<directory>/down_sampled/<k>_down_sampled.csv``.
    """
    # Directory containing the original dataset
    new_dir = os.path.join(directory, 'original')

    # Iterate through each of the 7 project files
    for k in range(1, 8):
        filename = os.path.join(new_dir, str(k) + '_original.csv')
        print('\n\t\t\t' + filename)

        # Read files and get the data
        data, target, minor = read_file(filename)
        print('total: ' + str(len(data)) + ' minor: ' + str(minor))

        # Shrink class 0 to the minority size; seeded so runs are repeatable.
        rus = RandomUnderSampler(random_state=RANDOM_STATE, ratio={0: minor})
        rus.fit(data, target)
        X_resampled, y_resampled = rus.sample(data, target)
        print('resampled total: ' + str(len(X_resampled)))

        # Print number of buggy and number of non buggy
        bug = sum(1 for label in y_resampled if label == 1)
        not_bug = len(y_resampled) - bug
        print('not buggy: ' + str(not_bug) + ' buggy: ' + str(bug))

        # Store the new dataset in a new file: the features followed by the label.
        new_filename = os.path.join(directory + '/down_sampled',
                                    str(k) + '_down_sampled.csv')
        # NOTE(review): the csv docs recommend newline='' on Python 3;
        # left as-is so the written files stay identical.
        with open(new_filename, 'w') as f:
            writer = csv.writer(f)
            writer.writerows(
                np.append(d, t) for d, t in zip(X_resampled, y_resampled))
xticklabels=category_id_df.category.values, yticklabels=category_id_df.category.values)
# (The line above closes a call that begins before this chunk.)
# Label the axes of the current figure and display it.
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.show()
# Per-class report for the predictions made so far, labelled with the
# distinct values of data['category'].
print(
    metrics.classification_report(y_test, y_pred, target_names=data['category'].unique()))
# Resampling/Undersampling
# NOTE(review): how a float `ratio` is interpreted depends on the installed
# imbalanced-learn version — confirm it does what is intended here.
rus = RandomUnderSampler(ratio=.8, random_state=0)
rus.fit(features, labels)
X_resampled, y_resampled = rus.sample(features, labels)
# Hold out 10% of the resampled data for testing (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.1, random_state=0)
# `clf` receives the return value of fit(); prediction below goes through
# `model` directly.
clf = model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(accuracy)
# Confusion matrix for the resampled run, plus a fresh 10x10 figure.
conf_mat = confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots(figsize=(10, 10))
Count_promotion)
# (The line above closes an expression that begins before this chunk.)
# NOTE(review): the printed text mentions "transacation"/"fraud" while the
# variables count promotions — looks copied from another notebook; confirm.
print("percentage of normal transacation is", Percentage_of_no_promotion * 100)
# Share of promoted rows among all counted rows.
Percentage_of_promotion = Count_promotion / (Count_no_promotion + Count_promotion)
print("percentage of fraud transacation", Percentage_of_promotion * 100)

# In[15]:

# Data slice: columns at positions 1..12 become the features, and everything
# from position 13 onward becomes the target.
X = data.iloc[:, 1:13]
Y = data.iloc[:, 13:]

# In[12]:

# Re-sampling: random under-sampling.
from imblearn.under_sampling import RandomUnderSampler

# In[16]:

# Fit the sampler (fixed seed) and resample features and target together.
rus = RandomUnderSampler(random_state=0)
rus.fit(X, Y)
X_resampled, y_resampled = rus.sample(X, Y)
# Wrap the resampled outputs back into DataFrames with explicit column names.
Y = pd.DataFrame(y_resampled, columns=['is_promoted'])
X = pd.DataFrame(X_resampled, columns=[
    'department', 'region', 'education', 'gender', 'recruitment_channel',
    'no_of_trainings', 'age', 'previous_year_rating', 'length_of_service',
    'KPIs_met >80%', 'awards_won?', 'avg_training_score'
])

# In[18]:

from sklearn.preprocessing import LabelEncoder
# One encoder object is reused for every column; each fit_transform call
# refits it, so afterwards it only holds the mapping of the last column.
label_encoder = LabelEncoder()
X.department = label_encoder.fit_transform(X.department)
X.region = label_encoder.fit_transform(X.region)
X.education = label_encoder.fit_transform(X.education)
X.gender = label_encoder.fit_transform(X.gender)
# Over-sample the galaxy data with the previously created `smote` object:
# the first 45 columns are the features, 'galaxysentiment' is the target.
galaxy_smote, gsent_smote = smote.fit_sample(galaxy_cor_3v.iloc[:, 0:45],
                                             galaxy_cor_3v['galaxysentiment'])
# Rebuild a DataFrame that carries the original feature column names.
galaxy_smote_complete = pd.DataFrame(galaxy_smote,
                                     columns=list(
                                         galaxy_cor_3v.iloc[:, 0:45].columns))
galaxy_smote_complete['galaxysentiment'] = gsent_smote
# NOTE(review): the result of unique() is discarded — presumably a leftover
# from interactive inspection.
galaxy_smote_complete['galaxysentiment'].unique()
# Histogram of the target after over-sampling.
hist_galaxy_smote = px.histogram(galaxy_smote_complete, x='galaxysentiment')
plot(hist_galaxy_smote)

### Under sampling
# Random under sampler
rus = RandomUnderSampler(random_state=0)  #, ratio={0: 30, 1: 20, 2: 60}
# iPhone data: the first 46 columns are the features, 'iphonesentiment' is
# the target.
rus.fit(iphone_corr.iloc[:, 0:46], iphone_corr['iphonesentiment'])
iphone_resampled_under, isent_resampled_under = rus.sample(
    iphone_corr.iloc[:, 0:46], iphone_corr['iphonesentiment'])
# Unlike the SMOTE frame above, no column names are passed here.
iphone_resampled_complete_under = pd.DataFrame(iphone_resampled_under)
iphone_resampled_complete_under['iphonesentiment'] = isent_resampled_under
hist_iphone_resampled_under = px.histogram(iphone_resampled_complete_under,
                                           x='iphonesentiment')
plot(hist_iphone_resampled_under)

# Galaxy data: the same sampler object is refitted on the first 45 columns.
# NOTE(review): this uses `galaxy_corr`, whereas the SMOTE step above uses
# `galaxy_cor_3v` — confirm the differing source frames are intentional.
rus.fit(galaxy_corr.iloc[:, 0:45], galaxy_corr['galaxysentiment'])
galaxy_resampled_under, gsent_resampled_under = rus.sample(
    galaxy_corr.iloc[:, 0:45], galaxy_corr['galaxysentiment'])
galaxy_resampled_complete_under = pd.DataFrame(galaxy_resampled_under)
galaxy_resampled_complete_under['galaxysentiment'] = gsent_resampled_under
hist_galaxy_resampled_under = px.histogram(galaxy_resampled_complete_under,
                                           x='galaxysentiment')
plot(hist_galaxy_resampled_under)