Code example #1
    def fit_model(self,
                  predictors,
                  target,
                  learning_rate=0.001,
                  loss_function='categorical_crossentropy',
                  epochs=500,
                  batch_size=500,
                  verbose=True,
                  callback_list=None,
                  validation_data=None):

        # Avoid the shared-mutable-default pitfall for the callback list.
        if callback_list is None:
            callback_list = []

        if self.sampling is None:
            pass
        elif self.sampling == 'ALLKNN':
            predictors, target = under_sampling(predictors, target)
        else:
            predictors, target = over_sampling(predictors,
                                               target,
                                               model=self.sampling)

        # One-hot encode the labels. Rebuild validation_data as a new tuple
        # rather than mutating the caller's sequence in place.
        target = to_categorical(target.target)
        if validation_data is not None:
            validation_data = (validation_data[0],
                               to_categorical(validation_data[1].target))

        optimizer = Adam(learning_rate=learning_rate)  # 'lr' is a deprecated alias
        self.model.compile(optimizer=optimizer,
                           loss=loss_function,
                           metrics=['accuracy'])
        self.model.fit(x=predictors,
                       y=target,
                       epochs=epochs,
                       batch_size=batch_size,
                       validation_data=validation_data,
                       callbacks=callback_list,
                       verbose=verbose)
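
The under_sampling and over_sampling helpers called throughout these examples (sometimes module-qualified, as sampling.over_sampling in code example #4) are never shown. A minimal sketch of what they might look like with imbalanced-learn; the function bodies, the ADASYN/SMOTE choice, and the default neighbors value are assumptions inferred from the call sites, not the original implementation:

from imblearn.over_sampling import ADASYN, SMOTE
from imblearn.under_sampling import AllKNN


def under_sampling(x, y):
    # Mirrors the 'ALLKNN' branch above: neighbourhood-cleaning undersampling.
    return AllKNN().fit_resample(x, y)


def over_sampling(x, y, model='SMOTE', neighbors=5):
    # 'model' picks the oversampler by name; 'neighbors' sets its
    # neighbourhood size (both inferred from the call sites above).
    if model == 'ADASYN':
        sampler = ADASYN(n_neighbors=neighbors)
    else:
        sampler = SMOTE(k_neighbors=neighbors)
    return sampler.fit_resample(x, y)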
Code example #2
    def applied(self, x_train, y_train, x_predicted):
        """
        The applied definitive model, getting just probabilities.
        """

        if self.sampling is None:
            class_weight = self.class_weight

        elif self.sampling == 'ALLKNN':
            x_train, y_train = under_sampling(x_train, y_train)
            class_weight = None

        else:
            x_train, y_train = over_sampling(x_train,
                                             y_train,
                                             model=self.sampling)
            class_weight = None

        if isinstance(x_train, pd.DataFrame):
            x_train = x_train.values
        if isinstance(y_train, (pd.DataFrame, pd.Series)):
            y_train = y_train.values

        min_sample_leaf = round(x_train.shape[0] * 0.01)
        min_sample_split = min_sample_leaf * 10
        max_features = 'sqrt'

        file_model = ensemble.ExtraTreesClassifier(
            criterion='entropy',
            bootstrap=self.bootstrap,
            min_samples_leaf=min_sample_leaf,
            min_samples_split=min_sample_split,
            n_estimators=self.n_estimators,
            max_depth=self.max_depth,
            max_features=max_features,
            oob_score=self.oob_score,
            random_state=531,
            verbose=1,
            class_weight=class_weight,
            n_jobs=1)

        file_model.fit(x_train, y_train)
        # Keep only the positive-class probability column, then binarize at
        # the tuned decision threshold.
        y_hat_test = file_model.predict_proba(x_predicted)
        y_hat_test = np.delete(y_hat_test, 0, axis=1)
        y_hat_test = (y_hat_test > self.final_threshold).astype(int)

        return y_hat_test
Code example #3
i = 0  # sample counter used below (initialization inferred; not in the excerpt)
for sample in [[
        x_train_sample_1, y_train_sample_1, x_test_sample_1, y_test_sample_1
], [x_train_sample_2, y_train_sample_2, x_test_sample_2, y_test_sample_2]]:
    i += 1
    x = sample[0]
    y = sample[1]
    print(y.sum() / y.count())  # positive-class rate before resampling
    x_final_test = sample[2]
    y_final_test = sample[3]

    if sampling is None:
        pass
    elif sampling == 'ALLKNN':
        x, y = under_sampling(x, y)
        class_weight = None
    else:
        x, y = over_sampling(x, y, model=sampling)
        class_weight = None

    try:
        min_sample_leaf = round(y.shape[0] * 0.06)
        min_sample_split = min_sample_leaf * 10
        fileModel.min_samples_leaf = min_sample_leaf
        fileModel.min_samples_split = min_sample_split
        fileModel.fit(x, y)
        y_pred_score = fileModel.predict_proba(x_final_test)
        feature_importance = fileModel.feature_importances_
        feature_importance = feature_importance / feature_importance.max()
        sorted_idx = np.argsort(feature_importance)
        bar_position = np.arange(sorted_idx.shape[0]) + 0.5
        plot.barh(bar_position, feature_importance[sorted_idx], align='center')
        plot.yticks(bar_position, columns[sorted_idx])
Code example #4
# Exploratory check, left commented out: per-feature count plots for the
# two classes.
#     f, ax = plot.subplots(2)
#     sns.countplot(df_0[i], ax=ax[0])
#     sns.countplot(df_1[i], ax=ax[1])
#     plot.show()
predictors = df.drop(['id', 'target'], axis=1)
n_cols = predictors.shape[1]

x_train, x_valid, y_train, y_valid = train_test_split(predictors,
                                                      df['target'],
                                                      train_size=0.75,
                                                      shuffle=True,
                                                      random_state=42)
x_train, y_train = sampling.over_sampling(x_train,
                                          y_train,
                                          model='ADASYN',
                                          neighbors=500)

y_train = to_categorical(y_train)
y_valid = to_categorical(y_valid)

input_tensor = Input(shape=(n_cols, ))
x = layers.Dense(40,
                 activation='relu',
                 kernel_regularizer=L1L2(l1=0.01, l2=0.))(input_tensor)
x = layers.Dense(20, activation='relu')(x)
# x = layers.Dropout(0.15)(x)
output_tensor = layers.Dense(2, activation='sigmoid')(x)

model = Model(input_tensor, output_tensor)
model.summary()  # summary() prints directly and returns None
Code example #5
feature_importance_list = []
feature_name_k = pd.DataFrame(columns=['names', 'index'])
for train_index, test_index in skf.split(x.values, y[[label]].values):
    k += 1
    x_train, x_test = x.loc[train_index].values, x.loc[test_index].values
    y_no_id = y.drop('oferta_id', axis=1)
    y_train, y_test = (y_no_id.loc[train_index].values,
                       y_no_id.loc[test_index].values)
    print(train_index, test_index)
    if sampling is None:
        pass
    elif sampling == 'ALLKNN':
        x_train, y_train = under_sampling(x_train, y_train)
        class_weight = None
    else:
        x_train, y_train = over_sampling(x_train, y_train, model=sampling)
        class_weight = None

    try:
        min_sample_leaf = round(y_train.shape[0] * 0.06)
        min_sample_split = min_sample_leaf * 10
        fileModel.min_samples_leaf = min_sample_leaf
        fileModel.min_samples_split = min_sample_split
        fileModel.fit(x_train, y_train)
        y_pred_score_i = fileModel.predict_proba(x_test)

        # Feature Importance
        featureImportance = fileModel.feature_importances_
        feature_importance_list.append(featureImportance)
        sorted_idx_k = np.argsort(featureImportance)
        fi_k = featureImportance[sorted_idx_k]
Code example #6
    def threshold(self, x_train, y_train, x_valid, y_valid, plot_graph=True):
        """
        Obtain optimal threshold using FBeta as parameter using a range (0.1, 1.0, 200) for 
        evaluation
        """

        if self.sampling is None:
            class_weight = self.class_weight

        elif self.sampling == 'ALLKNN':
            x_train, y_train = under_sampling(x_train, y_train)
            class_weight = None

        else:
            x_train, y_train = over_sampling(x_train,
                                             y_train,
                                             model=self.sampling)
            class_weight = None

        if isinstance(x_train, pd.DataFrame):
            x_train = x_train.values
        if isinstance(y_train, (pd.DataFrame, pd.Series)):
            y_train = y_train.values
        if isinstance(x_valid, pd.DataFrame):
            x_valid = x_valid.values
        if isinstance(y_valid, (pd.DataFrame, pd.Series)):
            y_valid = y_valid.values

        min_sample_leaf = round(x_train.shape[0] * 0.01)
        min_sample_split = min_sample_leaf * 10
        max_features = None

        file_model = ensemble.ExtraTreesClassifier(
            criterion='gini',
            bootstrap=self.bootstrap,
            min_samples_leaf=min_sample_leaf,
            min_samples_split=min_sample_split,
            n_estimators=self.n_estimators,
            max_depth=self.max_depth,
            max_features=max_features,
            oob_score=self.oob_score,
            random_state=531,
            verbose=1,
            class_weight=class_weight,
            n_jobs=1)
        cv = StratifiedKFold(n_splits=10)
        # Note: cross_val_predict below clones file_model before fitting each
        # fold, so this initial fit is not reused by the cross-validation.
        file_model.fit(x_train, y_train)

        thresholds = np.linspace(0.1, 1.0, 200)

        scores = []

        y_pred_score = cross_val_predict(file_model,
                                         x_valid,
                                         y_valid,
                                         cv=cv,
                                         method='predict_proba')

        y_pred_score = np.delete(y_pred_score, 0, axis=1)

        for threshold in thresholds:
            y_hat = (y_pred_score > threshold).astype(int)
            y_hat = y_hat.tolist()
            y_hat = [item for sublist in y_hat for item in sublist]

            scores.append([
                recall_score(y_pred=y_hat, y_true=y_valid),
                precision_score(y_pred=y_hat, y_true=y_valid),
                fbeta_score(y_pred=y_hat,
                            y_true=y_valid,
                            beta=self.beta,
                            average=self.metric_weight)
            ])

        scores = np.array(scores)

        if plot_graph:
            plot.plot(thresholds, scores[:, 0], label='$Recall$')
            plot.plot(thresholds, scores[:, 1], label='$Precision$')
            plot.plot(thresholds, scores[:, 2], label='$F_2$')
            plot.ylabel('Score')
            plot.xlabel('Threshold')
            plot.legend(loc='best')
            plot.show()

        self.final_threshold = thresholds[scores[:, 2].argmax()]
        print(self.final_threshold)
        return self.final_threshold
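
The sweep itself is easy to exercise in isolation. A self-contained sketch on synthetic data, using scikit-learn only; the dataset, the fixed beta=2, and all parameter values here are illustrative assumptions, not the original configuration:

import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import fbeta_score
from sklearn.model_selection import train_test_split

# Imbalanced toy problem standing in for the real data.
x, y = make_classification(n_samples=2000, weights=[0.9], random_state=0)
x_tr, x_va, y_tr, y_va = train_test_split(x, y, stratify=y, random_state=0)

clf = ExtraTreesClassifier(n_estimators=100, random_state=0).fit(x_tr, y_tr)
proba = clf.predict_proba(x_va)[:, 1]  # positive-class probability

# Same idea as the method above: score every candidate threshold, keep the best.
thresholds = np.linspace(0.1, 1.0, 200)
f2_scores = [
    fbeta_score(y_va, (proba > t).astype(int), beta=2, zero_division=0)
    for t in thresholds
]
best_threshold = thresholds[int(np.argmax(f2_scores))]
print('best threshold: %.3f' % best_threshold)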
Code example #7
    def over_fitting(self,
                     x_train,
                     y_train,
                     x_test,
                     y_test,
                     n_estimator_range=range(1, 301, 10)):
        """
        Calculate overfitting using accuracy for ERT Class model. We get the training Acc and
        the test Acc, and also plot it.
        """

        accuracy_test_list = []
        accuracy_train_list = []

        if self.sampling is None:
            class_weight = self.class_weight

        elif self.sampling == 'ALLKNN':
            x_train, y_train = under_sampling(x_train, y_train)
            class_weight = None

        else:
            x_train, y_train = over_sampling(x_train,
                                             y_train,
                                             model=self.sampling)
            class_weight = None

        if isinstance(x_train, pd.DataFrame):
            x_train = x_train.values
        if isinstance(y_train, (pd.DataFrame, pd.Series)):
            y_train = y_train.values
        if isinstance(x_test, pd.DataFrame):
            x_test = x_test.values
        if isinstance(y_test, (pd.DataFrame, pd.Series)):
            y_test = y_test.values

        min_sample_leaf = round(x_train.shape[0] * 0.01)
        min_sample_split = min_sample_leaf * 10
        max_features = 'sqrt'

        for n_trees in n_estimator_range:
            print('n_estimators:', n_trees)

            file_model = ensemble.ExtraTreesClassifier(
                criterion='entropy',
                bootstrap=self.bootstrap,
                min_samples_leaf=min_sample_leaf,
                min_samples_split=min_sample_split,
                n_estimators=n_trees,
                max_depth=self.max_depth,
                max_features=max_features,
                oob_score=self.oob_score,
                random_state=531,
                verbose=1,
                class_weight=class_weight,
                n_jobs=1)
            file_model.fit(x_train, y_train)

            # Positive-class probability, thresholded and flattened to 1-D so
            # accuracy_score sees plain binary labels.
            predictions = file_model.predict_proba(x_test)[:, 1]
            predictions = (predictions > self.final_threshold).astype(int)
            accuracy_test_list.append(accuracy_score(y_test, predictions))

            predictions = file_model.predict_proba(x_train)[:, 1]
            predictions = (predictions > self.final_threshold).astype(int)
            accuracy_train_list.append(accuracy_score(y_train, predictions))

        plot.figure()
        plot.plot(n_estimator_range,
                  accuracy_train_list,
                  label='Training Set Accuracy')
        plot.plot(n_estimator_range,
                  accuracy_test_list,
                  label='Test Set Accuracy')
        plot.legend(loc='upper right')
        plot.xlabel('Number of Trees in Ensemble')
        plot.ylabel('Accuracy')
        plot.show()
Code example #8
    def evaluation(self, x_train, y_train, x_test, y_test, plot_graph=True):
        """
        Evaluate the performance of the ERT model using Recall, Precision and FBeta using the
        optimal threshold. Also we can get a Confusion Matrix and Importance Feature ploting.
        """

        columns = None
        label = None

        if self.sampling is None:
            class_weight = self.class_weight

        elif self.sampling == 'ALLKNN':
            x_train, y_train = under_sampling(x_train, y_train)
            class_weight = None

        else:
            x_train, y_train = over_sampling(x_train,
                                             y_train,
                                             model=self.sampling)
            class_weight = None

        if isinstance(x_train, pd.DataFrame):
            columns = x_train.columns.values
            x_train = x_train.values
        if isinstance(y_train, pd.DataFrame):
            label = y_train.columns.values
            y_train = y_train.values
        elif isinstance(y_train, pd.Series):
            label = y_train.name  # a Series has no .columns attribute
            y_train = y_train.values
        if isinstance(x_test, pd.DataFrame):
            x_test = x_test.values
        if isinstance(y_test, (pd.DataFrame, pd.Series)):
            y_test = y_test.values

        min_sample_leaf = round(x_train.shape[0] * 0.01)
        min_sample_split = min_sample_leaf * 10
        max_features = 'sqrt'

        file_model = ensemble.ExtraTreesClassifier(
            criterion='entropy',
            bootstrap=self.bootstrap,
            min_samples_leaf=min_sample_leaf,
            min_samples_split=min_sample_split,
            n_estimators=self.n_estimators,
            max_depth=self.max_depth,
            max_features=max_features,
            oob_score=self.oob_score,
            random_state=531,
            verbose=1,
            class_weight=class_weight,
            n_jobs=1)

        file_model.fit(x_train, y_train)

        # Positive-class probabilities, binarized at the tuned threshold and
        # flattened to a plain list of 0/1 labels.
        y_hat_test = file_model.predict_proba(x_test)
        y_hat_test = np.delete(y_hat_test, 0, axis=1)
        y_hat_test = (y_hat_test > self.final_threshold).astype(int)
        y_hat_test = [item for sublist in y_hat_test.tolist() for item in sublist]

        print('Final threshold: %.3f' % self.final_threshold)
        print('Test Recall Score: %.3f' %
              recall_score(y_pred=y_hat_test, y_true=y_test))
        print('Test Precision Score: %.3f' %
              precision_score(y_pred=y_hat_test, y_true=y_test))
        print('Test F2 Score: %.3f' % fbeta_score(y_pred=y_hat_test,
                                                  y_true=y_test,
                                                  beta=self.beta,
                                                  average=self.metric_weight))

        # PLOTS
        if plot_graph:
            # CONFUSION MATRIX
            cnf_matrix = confusion_matrix(y_test, y_hat_test)
            plot_csm(cnf_matrix,
                     classes=['No ' + str(label), str(label)],
                     title='Confusion matrix')

            # FEATURE IMPORTANCE
            if columns is not None:
                feature_importance = file_model.feature_importances_
                feature_importance = feature_importance / feature_importance.max()
                sorted_idx = np.argsort(feature_importance)
                bar_position = np.arange(sorted_idx.shape[0]) + 0.5
                plot.barh(bar_position,
                          feature_importance[sorted_idx],
                          align='center')
                plot.yticks(bar_position, columns[sorted_idx])
                plot.xlabel('Variable Importance')
                plot.show()
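
plot_csm is called above but never defined in these listings. A minimal stand-in, assuming it only needs to render the matrix with matplotlib; the signature is inferred from the call site, and the original helper may do more (e.g. normalization):

import itertools

import matplotlib.pyplot as plot
import numpy as np


def plot_csm(cnf_matrix, classes, title='Confusion matrix'):
    # Draw the confusion matrix as an image with per-cell counts.
    plot.imshow(cnf_matrix, interpolation='nearest', cmap=plot.cm.Blues)
    plot.title(title)
    plot.colorbar()
    ticks = np.arange(len(classes))
    plot.xticks(ticks, classes, rotation=45)
    plot.yticks(ticks, classes)
    for i, j in itertools.product(range(cnf_matrix.shape[0]),
                                  range(cnf_matrix.shape[1])):
        plot.text(j, i, format(cnf_matrix[i, j], 'd'),
                  horizontalalignment='center')
    plot.ylabel('True label')
    plot.xlabel('Predicted label')
    plot.show()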