Esempio n. 1
0
def balance_classes(X_train,y_train):
    """Undersample the training data with a ``RandomUnderSampler``.

    The sampler is created with its default settings apart from a fixed seed
    (``random_state=42``), fitted on the given data and then used to draw the
    resampled set. Progress messages are printed before and after.

    Returns the resampled ``(X_train, y_train)`` pair.
    """
    print('UnderSample Data - Balance Classes')

    sampler = RandomUnderSampler(random_state=42)
    sampler.fit(X_train, y_train)
    balanced_X, balanced_y = sampler.sample(X_train, y_train)

    size_message = 'After Balancing the new size is {0}'.format(len(balanced_X))
    print(size_message)
    return balanced_X, balanced_y
Esempio n. 2
0
class RatioRandomUnderSampler(RandomUnderSampler):
    """Random under-sampler that targets a given share of positive samples.

    Every sample labelled ``1`` is kept and class ``0`` is reduced to
    ``int(pos * (1 - pos_ratio) / pos_ratio)`` samples, so that positives make
    up approximately ``pos_ratio`` of the resampled data.

    Parameters
    ----------
    pos_ratio : float
        Desired fraction of positive samples, in the interval ``(0, 1]``.
    random_state : int, optional
        Seed forwarded to the parent class and to the internal sampler.
    """

    def __init__(self, pos_ratio, random_state=0):
        self.pos_ratio = pos_ratio
        # Built in ``fit`` because the per-class targets depend on the labels.
        self.ratio_sampler = None
        super(RatioRandomUnderSampler, self).__init__(random_state=random_state)

    def fit(self, X, y):
        """Compute the per-class targets from ``y`` and fit the inner sampler.

        Parameters
        ----------
        X : array-like
            Feature matrix, passed through to the inner sampler.
        y : array-like
            Labels; entries equal to ``1`` are counted as positives.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``pos_ratio`` is not in ``(0, 1]`` (a ratio of 0 would
            otherwise surface as a bare ``ZeroDivisionError``).
        """
        import numpy as np

        if not 0 < self.pos_ratio <= 1:
            raise ValueError(
                'pos_ratio must be in the interval (0, 1], got {0!r}'.format(
                    self.pos_ratio))

        # Count on an array view so lists and single-column frames are
        # handled the same way as NumPy arrays.
        pos = int(np.count_nonzero(np.asarray(y) == 1))
        # Truncation (not rounding) is kept from the original implementation.
        neg = int(pos * ((1 - self.pos_ratio) / self.pos_ratio))

        self.ratio_sampler = RandomUnderSampler(
            random_state=self.random_state,
            ratio={0: neg, 1: pos})
        self.ratio_sampler.fit(X, y)
        return self

    def sample(self, X, y):
        """Resample ``X`` and ``y`` with the sampler prepared by ``fit``.

        ``fit`` must have been called first; until then ``ratio_sampler`` is
        ``None`` and this call fails.
        """
        return self.ratio_sampler.sample(X, y)
Esempio n. 3
0
def main():
    """Write ten randomly undersampled copies of each original dataset.

    For each of the 7 files ``<k>_original.csv`` under
    ``<directory>/original`` the data is loaded with ``read_file`` and class
    ``0`` is undersampled to 10% of ``len(data) - minor`` samples. This is
    repeated ten times with an unseeded sampler, and each result is written to
    ``<directory>/random_sampled/files/<k>_random_sampled_<letter>.csv`` with
    the label appended as the last column.

    Relies on the module-level names ``directory`` and ``read_file``.
    """
    # Directory containing the original dataset
    new_dir = os.path.join(directory, 'original')
    # Destination of the resampled files (same for every project).
    out_dir = directory + '/random_sampled/files'

    # Iterate through each of the 7 project files
    for k in range(1, 8):
        filename = os.path.join(new_dir, str(k) + '_original.csv')
        print('\n\t\t\t' + filename)

        # Read files and get the data
        # NOTE(review): `minor` is presumably the minority-class count - confirm
        # against read_file.
        data, target, minor = read_file(filename)

        # Get the new undersampled size for the majority class
        ratio = round((len(data) - minor) * 0.1)
        print('total: ' + str(len(data)) + ' minor: ' + str(minor))
        print('ratio: ' + str(ratio))

        ### Important to finish significance test for this part! ###
        # Repeat this 10 times (suffixes 'a'..'j'), get 10 random datasets
        for c in ascii_lowercase[:10]:
            print(c)
            # random_state=None so that every repetition draws a different
            # sample of class 0.
            rus = RandomUnderSampler(random_state=None, ratio={0: ratio})
            rus.fit(data, target)
            X_resampled, y_resampled = rus.sample(data, target)
            print('resampled total: ' + str(len(X_resampled)))

            # Print number of buggy (label 1) and number of non buggy
            bug = sum(1 for y in y_resampled if y == 1)
            not_bug = len(y_resampled) - bug
            print('not buggy: ' + str(not_bug) + ' buggy: ' + str(bug))

            # Store the new dataset in a new file
            new_filename = os.path.join(
                out_dir, str(k) + '_random_sampled_' + c + '.csv')
            # newline='' lets the csv module control line endings; without it
            # extra blank rows appear on platforms that translate '\n'.
            with open(new_filename, 'w', newline='') as f:
                writer = csv.writer(f)
                writer.writerows(
                    np.append(d, t)
                    for d, t in zip(X_resampled, y_resampled))
Esempio n. 4
0
def main():
    """Write a down-sampled copy of each original dataset.

    For each of the 7 files ``<k>_original.csv`` under
    ``<directory>/original`` the data is loaded with ``read_file`` and class
    ``0`` is undersampled to ``minor`` samples using a sampler seeded with
    ``RANDOM_STATE``. The result is written to
    ``<directory>/down_sampled/<k>_down_sampled.csv`` with the label appended
    as the last column.

    Relies on the module-level names ``directory``, ``read_file`` and
    ``RANDOM_STATE``.
    """
    # Directory containing the original dataset
    new_dir = os.path.join(directory, 'original')

    # Iterate through each of the 7 project files
    for k in range(1, 8):
        filename = os.path.join(new_dir, str(k) + '_original.csv')
        print('\n\t\t\t' + filename)

        # Read files and get the data
        # NOTE(review): `minor` is presumably the minority-class count - confirm
        # against read_file.
        data, target, minor = read_file(filename)
        print('total: ' + str(len(data)) + ' minor: ' + str(minor))

        # Fit the training data for the undersampled size and get new resampled training set
        rus = RandomUnderSampler(random_state=RANDOM_STATE, ratio={0: minor})
        rus.fit(data, target)
        X_resampled, y_resampled = rus.sample(data, target)
        print('resampled total: ' + str(len(X_resampled)))

        # Print number of buggy (label 1) and number of non buggy
        bug = sum(1 for y in y_resampled if y == 1)
        not_bug = len(y_resampled) - bug

        print('not buggy: ' + str(not_bug) + ' buggy: ' + str(bug))

        # Store the new dataset in a new file
        new_filename = os.path.join(directory + '/down_sampled',
                                    str(k) + '_down_sampled.csv')
        # newline='' lets the csv module control line endings; without it
        # extra blank rows appear on platforms that translate '\n'.
        with open(new_filename, 'w', newline='') as f:
            writer = csv.writer(f)
            writer.writerows(
                np.append(d, t) for d, t in zip(X_resampled, y_resampled))
Esempio n. 5
0
            xticklabels=category_id_df.category.values,
            yticklabels=category_id_df.category.values)
# Label the axes of the current figure and display it.
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.show()

# Per-class report for the current predictions, using the distinct values of
# data['category'] as class names.
print(
    metrics.classification_report(y_test,
                                  y_pred,
                                  target_names=data['category'].unique()))

# Resampling/Undersampling: undersample the full feature/label set, then
# re-split, refit and re-evaluate the model on the resampled data.

rus = RandomUnderSampler(ratio=.8, random_state=0)
rus.fit(features, labels)
X_resampled, y_resampled = rus.sample(features, labels)

# NOTE(review): the split happens after resampling, so the 10% test set is
# drawn from the undersampled data rather than the original distribution -
# confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(X_resampled,
                                                    y_resampled,
                                                    test_size=0.1,
                                                    random_state=0)

# `clf` holds whatever model.fit returns; predictions below go through `model`.
clf = model.fit(X_train, y_train)
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)

print(accuracy)

# Confusion matrix of the refitted model and a 10x10 figure to draw on.
conf_mat = confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots(figsize=(10, 10))
                                                   Count_promotion)
# Report both class shares as percentages.
# NOTE(review): the messages say "normal"/"fraud transaction" while the
# variables refer to promotion - the wording looks copied from another
# notebook; confirm before relying on the output text.
print("percentage of normal transacation is", Percentage_of_no_promotion * 100)
Percentage_of_promotion = Count_promotion / (Count_no_promotion +
                                             Count_promotion)
print("percentage of fraud transacation", Percentage_of_promotion * 100)
# In[15]:
# Data slice: columns 1..12 become the features, column 13 onward the target.
X = data.iloc[:, 1:13]
Y = data.iloc[:, 13:]
# In[12]:
# Resampling - under sampling
from imblearn.under_sampling import RandomUnderSampler
# In[16]:
# Undersample with a fixed seed, then wrap the results in DataFrames again
# with explicit column names.
rus = RandomUnderSampler(random_state=0)
rus.fit(X, Y)
X_resampled, y_resampled = rus.sample(X, Y)
Y = pd.DataFrame(y_resampled, columns=['is_promoted'])
X = pd.DataFrame(X_resampled,
                 columns=[
                     'department', 'region', 'education', 'gender',
                     'recruitment_channel', 'no_of_trainings', 'age',
                     'previous_year_rating', 'length_of_service',
                     'KPIs_met >80%', 'awards_won?', 'avg_training_score'
                 ])
# In[18]:
# Replace four of the columns with integer codes.
# NOTE(review): one encoder instance is refitted for every column, so
# afterwards it only holds the mapping of the last column (gender) - use one
# encoder per column if the codes must be decoded later.
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
X.department = label_encoder.fit_transform(X.department)
X.region = label_encoder.fit_transform(X.region)
X.education = label_encoder.fit_transform(X.education)
X.gender = label_encoder.fit_transform(X.gender)
Esempio n. 7
0
# SMOTE over-sampling of the galaxy data: the first 45 columns are the
# features and 'galaxysentiment' is the target.
galaxy_smote, gsent_smote = smote.fit_sample(galaxy_cor_3v.iloc[:, 0:45],
                                             galaxy_cor_3v['galaxysentiment'])
# Rebuild a DataFrame with the original feature column names and re-attach
# the resampled target.
galaxy_smote_complete = pd.DataFrame(galaxy_smote,
                                     columns=list(
                                         galaxy_cor_3v.iloc[:, 0:45].columns))
galaxy_smote_complete['galaxysentiment'] = gsent_smote
# The result of unique() is not stored; it only shows up in an interactive
# session.
galaxy_smote_complete['galaxysentiment'].unique()
hist_galaxy_smote = px.histogram(galaxy_smote_complete, x='galaxysentiment')
plot(hist_galaxy_smote)

### Under sampling
# Random under sampler (fixed seed), applied first to the iphone data
# (first 46 columns as features) ...
rus = RandomUnderSampler(random_state=0)  #, ratio={0: 30, 1: 20, 2: 60}
rus.fit(iphone_corr.iloc[:, 0:46], iphone_corr['iphonesentiment'])
iphone_resampled_under, isent_resampled_under = rus.sample(
    iphone_corr.iloc[:, 0:46], iphone_corr['iphonesentiment'])
# NOTE(review): unlike the SMOTE frame above, no column names are passed
# here, so the feature columns get default labels - confirm that is
# acceptable.
iphone_resampled_complete_under = pd.DataFrame(iphone_resampled_under)
iphone_resampled_complete_under['iphonesentiment'] = isent_resampled_under
hist_iphone_resampled_under = px.histogram(iphone_resampled_complete_under,
                                           x='iphonesentiment')
plot(hist_iphone_resampled_under)

# ... then the same sampler object is refitted on the galaxy data
# (first 45 columns as features).
# NOTE(review): this section reads `galaxy_corr`, while the SMOTE section
# above reads `galaxy_cor_3v` - confirm both frames are intended.
rus.fit(galaxy_corr.iloc[:, 0:45], galaxy_corr['galaxysentiment'])
galaxy_resampled_under, gsent_resampled_under = rus.sample(
    galaxy_corr.iloc[:, 0:45], galaxy_corr['galaxysentiment'])
galaxy_resampled_complete_under = pd.DataFrame(galaxy_resampled_under)
galaxy_resampled_complete_under['galaxysentiment'] = gsent_resampled_under
hist_galaxy_resampled_under = px.histogram(galaxy_resampled_complete_under,
                                           x='galaxysentiment')
plot(hist_galaxy_resampled_under)