Example #1
def test_sample_wrong_X():
    """Test either if an error is raised when X is different at fitting
    and sampling"""

    # Create the object
    ros = RandomOverSampler(random_state=RND_SEED)
    ros.fit(X, Y)
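    # Calling sample() with an X whose shape differs from the fitted data
    # must raise a RuntimeError.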
    assert_raises(RuntimeError, ros.sample, np.random.random((100, 40)),
                  np.array([0] * 50 + [1] * 50))
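Note: these examples use the older imbalanced-learn API, in which fit(),
sample(), and fit_sample() were separate or combined calls. Recent releases
consolidate them into a single fit_resample(). A minimal sketch of the
modern usage, assuming a small synthetic dataset:

import numpy as np
from imblearn.over_sampling import RandomOverSampler

# Toy data: 90 majority samples (class 0) and 10 minority samples (class 1).
X = np.random.random((100, 4))
y = np.array([0] * 90 + [1] * 10)

ros = RandomOverSampler(random_state=0)
# fit_resample fits and resamples in one step; no separate fit() is needed.
X_resampled, y_resampled = ros.fit_resample(X, y)

# Minority rows are duplicated until both classes have 90 samples.
print(np.bincount(y_resampled))  # -> [90 90]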
Example #3
def oversample_data():

    X, y = encoded_data.drop('Churn', axis=1), encoded_data[['Churn']]

    oversampler = RandomOverSampler(random_state=1)
    oversampler.fit(X, y)
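    # fit_sample (renamed fit_resample in later imbalanced-learn releases)
    # refits internally, so this separate fit() call is redundant.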

    X_oversampled, y_oversampled = oversampler.fit_sample(X, y)

    return X_oversampled, y_oversampled
Example #4
def test_ros_fit():
    """Test the fitting method"""

    # Create the object
    ros = RandomOverSampler(random_state=RND_SEED)
    # Fit the data
    ros.fit(X, Y)

    # Check that the class statistics have been computed
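    # min_c_ / maj_c_ store the minority and majority class labels;
    # stats_c_ maps each class label to its number of samples.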
    assert_equal(ros.min_c_, 0)
    assert_equal(ros.maj_c_, 1)
    assert_equal(ros.stats_c_[0], 500)
    assert_equal(ros.stats_c_[1], 4500)
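Example #5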
def test_ros_fit():
    """Test the fitting method"""

    # Create the object
    ros = RandomOverSampler(random_state=RND_SEED)
    # Fit the data
    ros.fit(X, Y)

    # Check that the class statistics have been computed
    assert_equal(ros.min_c_, 0)
    assert_equal(ros.maj_c_, 1)
    assert_equal(ros.stats_c_[0], 3)
    assert_equal(ros.stats_c_[1], 7)
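Example #6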
def selection(**kwargs):
    df = kwargs['ti'].xcom_pull(task_ids='data_preprocessing', key='df')

    for col in df.columns:  #Label Encoding
        if df[col].dtypes == 'object':
            encoder = LabelEncoder()
            df[col] = encoder.fit_transform(df[col])

    X = df.drop('income', axis=1)
    Y = df['income']

    selector = ExtraTreesClassifier(random_state=42)
    selector.fit(X, Y)
    feature_imp = selector.feature_importances_
    for index, val in enumerate(feature_imp):
        print(index, round((val * 100), 2))

    X = X.drop([
        'workclass', 'education', 'race', 'gender', 'capital-loss',
        'native-country'
    ],
               axis=1)

    for col in X.columns:
        scaler = StandardScaler()
        X[col] = scaler.fit_transform(X[col].values.reshape(-1, 1))
    # Class distribution (in %) before resampling
    print(round(Y.value_counts(normalize=True) * 100, 2).astype('str') + ' %')

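    # Balance the classes by randomly duplicating minority-class rows;
    # fit_resample refits internally, so the separate fit() is redundant.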
    ros = RandomOverSampler(random_state=42)
    ros.fit(X, Y)
    X_resampled, Y_resampled = ros.fit_resample(X, Y)
    print(
        round(Y_resampled.value_counts(normalize=True) *
              100, 2).astype('str') + ' %')

    X_train, X_test, Y_train, Y_test = train_test_split(X_resampled,
                                                        Y_resampled,
                                                        test_size=0.2,
                                                        random_state=42)

    print("X_train shape:", X_train.shape)
    print("X_test shape:", X_test.shape)
    print("Y_train shape:", Y_train.shape)
    print("Y_test shape:", Y_test.shape)

    kwargs['ti'].xcom_push(key='X_train', value=X_train)
    kwargs['ti'].xcom_push(key='X_test', value=X_test)
    kwargs['ti'].xcom_push(key='Y_train', value=Y_train)
    kwargs['ti'].xcom_push(key='Y_test', value=Y_test)
Example #7
    def __init__(self, data_pool, parameters, training):
        self.data_pool = data_pool
        self.parameters = parameters
        self.batch_size = parameters['batch_size']
        self.training = training
        # `training` is a boolean flag indicating whether the data is for
        # training or testing.
        # During training, the data is sampled from a pool; during testing,
        # it is sampled sequentially and exhaustively.
        # A vector must mark which samples are padding at the end of the
        # dataset, and a return state must signal when all test data has
        # been served.
        self.categorical = True
        self.d_thresh_range = None

        self.val_minibatch_idx = 0
        self.d_thresh = None
        self.reduced_pool = None
        self.distance_pool_cache = {}
        self.input_mask = pd.Series([
            np.tile(self.parameters['input_mask'],
                    (self.parameters['observation_steps'], 1))
            for x in range(self.batch_size)
        ],
                                    dtype=object,
                                    index=([0] * self.batch_size))

        # Generate balanced index list
        ros = RandomOverSampler()
        if 'relative' in self.parameters['ibeo_data_columns'][0]:
            selection_data = list(data_pool.relative_destination.values)
        else:
            selection_data = list(data_pool.track_class.values)
        le = preprocessing.LabelEncoder()
        le.fit(selection_data)
        indexed_classes = np.array(le.transform(selection_data))
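        # Oversample the row indices rather than the data itself; the
        # balanced index list is used later to draw rows from data_pool.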
        ros.fit(np.expand_dims(range(len(indexed_classes)), 1),
                indexed_classes)
        balanced_idxs, balanced_classes = ros.sample(
            np.expand_dims(range(len(indexed_classes)), 1), indexed_classes)
        self.balanced_idxs = np.squeeze(balanced_idxs)
        # bf = data_pool.iloc[balanced_idxs]
        # class_dict = {}
        # for class_t in data_pool.track_class.unique():
        #     class_dict[class_t] = len(bf[bf.track_class==class_t])/float(len(bf))
        return
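Example #8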
def execute(trainfile, sampler):

    print("--- Executing")
    print("Using trainfile:  ", trainfile)

    print("--- Loading (transformed) data")
    data = Data.Data()
    train_df = data.load(trainfile)
    y = train_df["is_attributed"]
    X = train_df.drop(["is_attributed"], axis=1)
    columns = X.columns.values

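    # 'balanced' class weights are inversely proportional to the class
    # frequencies, so they show how skewed the labels are before and
    # after resampling.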
    before_class_weight = dict(
        zip([0, 1], compute_class_weight('balanced', [0, 1], y)))
    print("Original weights: ", before_class_weight)

    X_resampled = None
    y_resampled = None
    if sampler == "RANDOM":
        oversampler = RandomOverSampler(random_state=0)
        oversampler.fit(X, y)
        X_resampled, y_resampled = oversampler.sample(X, y)

    elif sampler == "ADASYN":
        oversampler = ADASYN(random_state=0)
        oversampler.fit(X, y)
        X_resampled, y_resampled = oversampler.sample(X, y)

    elif sampler == "SMOTE":
        oversampler = SMOTE(random_state=0)
        oversampler.fit(X, y)
        X_resampled, y_resampled = oversampler.sample(X, y)

    else:
        raise ValueError("Invalid sampler: " + sampler)

    after_class_weight = dict(
        zip([0, 1], compute_class_weight('balanced', [0, 1], y_resampled)))
    print("Sampler: ", sampler, ", weights: ", after_class_weight)

    X_resampled = X_resampled.astype(int)
    y_resampled = y_resampled.astype(int)

    # print("X_resampled: ", X_resampled)
    # print("y_resampled: ", y_resampled)

    df = pd.DataFrame(data=X_resampled, columns=columns)
    df["is_attributed"] = y_resampled
    # df["is_attributed"] = df["is_attributed"].astype(int)

    compressor = "blosc"
    outfilename = trainfile + "." + sampler
    print("Output file (over-sampled): ", outfilename)
    df.to_hdf(outfilename,
              "table",
              mode="w",
              append=True,
              complevel=9,
              complib=compressor)
Example #9
import pickle
import os
import pandas as pd
from imblearn.over_sampling import RandomOverSampler


with open(os.path.abspath('data')+'/google_play_review.pickle','rb') as f:
    df = pickle.load(f)

ros = RandomOverSampler(random_state=666)
ros.fit(df[['reviews','replies']],df[['ratings']])
X, y = ros.fit_sample(df[['reviews','replies']],df[['ratings']])
df = pd.DataFrame(X,columns= ['reviews','replies'])

train_size = int(len(df)*0.9)

with open(os.path.abspath('data')+'/train.txt','w') as f:
    for index, row in df[:train_size].iterrows():
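        # A missing review comes back as NaN (a float), so skip such rows.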
        if isinstance(row['reviews'],float):
            continue
        f.write(row['reviews'].replace('\r\n',' '))
        f.write('\n')
        f.write(row['replies'].replace('\r\n',' '))
        f.write('\n')


with open(os.path.abspath('data')+'/train.reviews.txt','w') as f:
    for index, row in df[:train_size].iterrows():
        if isinstance(row['reviews'],float):
            continue
        f.write(row['reviews'].replace('\r\n',' '))
Example #10
y = train.target
train.drop(["target", "id"], axis=1, inplace=True)
test.drop("id", axis=1, inplace=True)

#train test and validation split
x_train, x_test, y_train, y_test = train_test_split(train, y, test_size=0.1)
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.3)
print("TRAIN : ", x_train.shape , " and ", y_train.shape)
print("TEST : ", x_test.shape, " and ", y_test.shape)
print("VALIDATION : ", x_val.shape, " and ", y_val.shape)
print("MAIN TO PREDICT ", test.shape)

#Random Oversampling
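# Oversampling is applied to the training fold only, so duplicated
# minority rows cannot leak into the test or validation sets.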
ros = RandomOverSampler(random_state=0)
ros.fit(x_train, y_train)
X_resampledo, y_resampledo = ros.fit_sample(x_train, y_train)
print(X_resampledo.shape, y_resampledo.shape)

#model_selection
catboost_pool = Pool(X_resampledo, y_resampledo)
cat_model = CatBoostClassifier(task_type='CPU', iterations=20000, learning_rate=0.03, early_stopping_rounds=5)
cat_model.fit(X_resampledo, y_resampledo, verbose=True, plot=False, eval_set=(x_val, y_val),)

#accuracy on test categories
print(cat_model.score(x_test,y_test))

#metrics and score
y_pred = cat_model.predict(x_test)
print("ACCURACY SCORE : ", accuracy_score(y_test, y_pred))
print("MAE : ",mean_absolute_error(y_test, y_pred))
Example #11
X = X.drop([
    'workclass', 'education', 'race', 'sex', 'capital.loss', 'native.country',
    'fnlwgt', 'relationship', 'capital.gain'
],
           axis=1)

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

X = scaler.fit_transform(X)

from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=42)

ros.fit(X, Y)

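# fit_resample refits internally and returns the balanced data, so the
# fit() call above is redundant.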
X_resampled, Y_resampled = ros.fit_resample(X, Y)

X = X_resampled
Y = Y_resampled

from sklearn.ensemble import RandomForestClassifier
ran_for = RandomForestClassifier(max_depth=102,
                                 n_estimators=40,
                                 random_state=42)

ran_for.fit(X, Y)

import pickle
pickle.dump(ran_for, open('model.pkl', 'wb'))
Example #12
def main(data='ann_dataset.csv', headless=False):

    dataset = pd.read_csv(data)
    oversample = True
    try:
        dataset.to_csv(("metrics/dataset.csv"), index=False)
    except Exception:
        print("error: unable to write to file")
    inputs = (len(dataset.columns) - 1)
    #print(dataset.info())
    X = dataset.iloc[:, 0:-1].values  # all columns except the target label
    y = dataset.iloc[:, -1].values

    #random resampling to reduce effect of minority class size

    # Splitting the dataset into the Training set and Test set
    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=0)

    from imblearn.over_sampling import RandomOverSampler

    if (oversample):
        #oversampling from training set
        ros = RandomOverSampler(random_state=0)
        ros.fit(X_train, y_train)
        X_train, y_train = ros.fit_resample(X_train, y_train)

        #oversampling for test set
        ros.fit(X_test, y_test)
        X_test, y_test = ros.fit_resample(X_test, y_test)

    # Feature Scaling
    from sklearn.preprocessing import StandardScaler
    sc_X = StandardScaler()
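    # Fit the scaler on the training data only, then reuse its statistics
    # to transform the test set.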
    X_train = sc_X.fit_transform(X_train)
    X_test = sc_X.transform(X_test)

    # ------- Part-2: Build the ANN --------

    # import keras library and packages
    from keras.models import Sequential
    from keras.layers import Dense
    import livelossplot

    #creating the classifier and setting the layers...
    classifier = Sequential()
    classifier.add(Dense(inputs, activation='relu'))
    classifier.add(Dense(inputs, activation='relu'))
    classifier.add(Dense(inputs, activation='sigmoid'))
    classifier.add(Dense(1, activation='sigmoid'))
    classifier.compile(optimizer='adam',
                       loss='binary_crossentropy',
                       metrics=['accuracy'])
    # Fitting the ANN to the training set
    num_epochs = 10
    batch = 100
    if (headless == False):
        classifier.fit(X_train,
                       y_train,
                       batch_size=batch,
                       epochs=num_epochs,
                       callbacks=[livelossplot.PlotLossesKeras()],
                       verbose=1,
                       validation_data=(X_test, y_test))
    else:
        classifier.fit(X_train,
                       y_train,
                       batch_size=batch,
                       epochs=num_epochs,
                       verbose=1,
                       validation_data=(X_test, y_test))
    y_pred = classifier.predict(X_test)
    # Predicting the Test set results
    score = classifier.evaluate(X_test, y_test, verbose=1)
    print('Test loss:', score[0])
    print('Test accuracy:', score[1])

    print("Classifier Summary")
    classifier.summary()
    #print("Y_test, Y_pred")

    # Making the confusion Matrix
    from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
    cm = confusion_matrix(y_test, y_pred.round())
    disp = ConfusionMatrixDisplay(confusion_matrix=cm)
    #write confusion matrix to file
    #cv2.imwrite("metrics/cm.img", cv2.imwrite(disp.plot()))
    if (headless == False):
        disp.plot()

    print("Confusion Matrix:")
    print(cm)

    tn = float(cm[0, 0])
    fp = float(cm[0, 1])
    tp = float(cm[1, 1])
    fn = float(cm[1, 0])

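    # Standard binary-classification metrics derived from the confusion
    # matrix counts.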
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    sensitivity = tp / (tp + fn)
    specificity = tn / (tn + fp)
    f1 = 2 * (precision * recall) / (precision + recall)

    print("Precision: " + str(precision * 100) + "%")
    print("Recall: " + str(recall * 100) + "%")
    print("Sensitivity: " + str(sensitivity * 100) + "%")
    print("Specificity: " + str(specificity * 100) + "%")
    print("F1 Score: " + str(f1))
Example #13
hist_iphone_3v = px.histogram(iphone_cor_3v, x="iphonesentiment")
plot(hist_iphone_3v)

galaxy_cor_3v = galaxy_corr
galaxy_cor_3v['galaxysentiment'] = galaxy_cor_3v['galaxysentiment'].map(mapper)
galaxy_cor_3v['galaxysentiment'] = pd.Series(galaxy_cor_3v['galaxysentiment'],
                                             dtype="category")
galaxy_cor_3v.dtypes
galaxy_cor_3v['galaxysentiment'].unique()
hist_galaxy_3v = px.histogram(galaxy_cor_3v, x="galaxysentiment")
plot(hist_galaxy_3v)

### Over sampling
# Random over sampler
ros = RandomOverSampler(random_state=0)
ros.fit(iphone_corr.iloc[:, 0:46], iphone_corr['iphonesentiment'])
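# sample() returns the oversampled features and target; later
# imbalanced-learn releases merge fit() and sample() into fit_resample().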
iphone_resampled, isent_resampled = ros.sample(iphone_corr.iloc[:, 0:46],
                                               iphone_corr['iphonesentiment'])
iphone_resampled_complete = pd.DataFrame(iphone_resampled)
iphone_resampled_complete['iphonesentiment'] = isent_resampled
hist_iphone_resampled = px.histogram(iphone_resampled_complete,
                                     x='iphonesentiment')
plot(hist_iphone_resampled)

ros.fit(galaxy_corr.iloc[:, 0:45], galaxy_corr['galaxysentiment'])
galaxy_resampled, gsent_resampled = ros.sample(galaxy_corr.iloc[:, 0:45],
                                               galaxy_corr['galaxysentiment'])
galaxy_resampled_complete = pd.DataFrame(galaxy_resampled)
galaxy_resampled_complete['galaxysentiment'] = gsent_resampled
hist_galaxy_resampled = px.histogram(galaxy_resampled_complete,
                                     x='galaxysentiment')