コード例 #1
0
def _oversample(X, y, method='SMOTE', strat='not majority'):
    """Oversample ``(X, y)`` with one of the imbalanced-learn over-samplers.

    If the rarest class has five or fewer samples, ``method`` is overridden
    with ``'RNDM'`` (plain random oversampling).

    Args:
        X: Feature matrix, passed unchanged to the sampler's
            ``fit_resample``.
        y: Class labels; must provide ``tolist()``.
        method: Sampler key. One of ``'ADASYN'``, ``'SMOTE'``, ``'SMOTENC'``,
            ``'BORDERSMOTE'``, ``'SVMSMOTE'``, ``'KMEANSSMOTE'`` or
            ``'RNDM'``.
        strat: Value forwarded as the sampler's ``sampling_strategy``.

    Returns:
        The ``(X_resampled, y_resampled)`` pair returned by ``fit_resample``.

    Raises:
        ValueError: If ``method`` (after the small-class override) is not a
            supported key. Previously this surfaced as an
            ``UnboundLocalError``.
    """
    from collections import Counter

    # Supported ``method`` keys mapped to the class name looked up on
    # ``imbover``; resolved lazily so only the selected class is accessed.
    sampler_names = {
        'ADASYN': 'ADASYN',
        'SMOTE': 'SMOTE',
        # NOTE(review): SMOTENC presumably also needs ``categorical_features``
        # -- confirm against the installed imbalanced-learn version.
        'SMOTENC': 'SMOTENC',
        'BORDERSMOTE': 'BorderlineSMOTE',
        'SVMSMOTE': 'SVMSMOTE',
        'KMEANSSMOTE': 'KMeansSMOTE',
        'RNDM': 'RandomOverSampler',
    }

    # Size of the rarest class, counted in a single pass. An empty ``y``
    # gives 0, the same value the previous ``len(y)`` seed produced.
    class_counts = Counter(y.tolist())
    min_samples = min(class_counts.values(), default=0)

    # Very small classes fall back to random oversampling (presumably
    # because the neighbour-based samplers need more samples -- confirm).
    if min_samples <= 5:
        method = 'RNDM'

    if method not in sampler_names:
        raise ValueError(
            'Unknown oversampling method {!r}; expected one of {}'.format(
                method, sorted(sampler_names)))

    sampler_cls = getattr(imbover, sampler_names[method])
    ios = sampler_cls(sampling_strategy=strat, random_state=42)

    X_resampled, y_resampled = ios.fit_resample(X, y)
    return X_resampled, y_resampled
コード例 #2
0
def oversample(classifier):
    """Cross-validate ``classifier`` behind scaling and random oversampling.

    Args:
        classifier: Final pipeline step, appended as-is after the scaler and
            resampler steps.

    Returns:
        Tuple ``(y, y_pred, c)``: the targets from ``prepare_data()``, the
        cross-validated predictions, and a ``Counter`` of the labels obtained
        by randomly oversampling the full data set.
    """
    print('*** OVERSAMPLE ***')

    steps = [
        ('scaler', preprocessing.StandardScaler()),
        ('resample', over_sampling.RandomOverSampler()),
    ]
    steps.append(classifier)
    model = pipeline.Pipeline(steps)

    features, targets = prepare_data()

    # ``cv`` comes from the enclosing module scope.
    predictions = model_selection.cross_val_predict(
        model, features, targets, cv=cv, n_jobs=-1)

    # Only the resampled labels (second element) are needed for the counts.
    resampled = over_sampling.RandomOverSampler().fit_sample(features, targets)
    label_counts = Counter(resampled[1])

    return targets, predictions, label_counts
コード例 #3
0
def create_predictions(model, x_df, y_df):
    """Train ``model`` on an oversampled split and write evaluation files.

    The data is split into training and testing sets, the training set is
    randomly oversampled, the model is fitted on it and used to predict the
    testing set. A confusion matrix (tab-separated CSV) and a classification
    report (text) are written under ``DIRECTORY + "results/"``.

    Args:
        model(sklearn.ensemble.RandomForestClassifier): The machine learning
            model to train and predict with
        x_df(pd.DataFrame): Input "x" vector
        y_df(pd.DataFrame): Output "y" vector

    Returns:
        None. Results are only written to disk.
    """
    # Split the data into training and testing data
    x_train_df, x_test_df, y_train_df, y_test_df = train_test_split(x_df, y_df)

    # Oversample the training data only, so the test set keeps its
    # original class distribution.
    oversampling = over_sampling.RandomOverSampler()
    x_resampled, y_resampled = oversampling.fit_sample(x_train_df, y_train_df)

    # Train the model on the oversampled training data
    model.fit(x_resampled, y_resampled)

    # Use model to predict the testing set
    y_pred_df = model.predict(x_test_df)

    # Create a confusion matrix and write to file. The two-entry index and
    # column labels mean a 2x2 (binary) matrix is expected here.
    cm_df = pd.DataFrame(metrics.confusion_matrix(y_test_df, y_pred_df),
                         index=["actual_negative", "actual_positive"],
                         columns=["predicted_negative", "predicted_positive"])
    cm_df.to_csv(
        (DIRECTORY + "results/" + DATE + LOC + TYPE + "confusion_matrix.csv"),
        sep='\t')

    # Write the classification report; the context manager guarantees the
    # file is flushed and closed (the handle was previously left open).
    metrics_path = DIRECTORY + "results/" + DATE + LOC + TYPE + "metrics.txt"
    with open(metrics_path, "w+") as metrics_file:
        metrics_file.write(metrics.classification_report(y_test_df, y_pred_df))
コード例 #4
0
 def balance_classes(self, X, y, max):
     """Randomly oversample the minority class of ``(X, y)``.

     Args:
         X: Feature matrix passed to ``RandomOverSampler.fit_resample``.
         y: Class labels passed to ``RandomOverSampler.fit_resample``.
         max: Unused; kept so existing callers keep working.

     Returns:
         Tuple ``(balanced_X, balanced_y)`` produced by ``fit_resample`` with
         ``sampling_strategy='minority'``.
     """
     oversample = over_sampling.RandomOverSampler(
         sampling_strategy='minority')
     balanced_X, balanced_y = oversample.fit_resample(X, y)
     return balanced_X, balanced_y
コード例 #5
0
def get_confusion_matrix(model_data,
                         feature_cols,
                         dependent_variable,
                         seed_val=0,
                         test_size=0.25,
                         solver='liblinear',
                         oversample_training_data=False,
                         sampling_strategy=0.5):
    """Fit a logistic regression and return its test-set confusion matrix.

    Args:
        model_data: Table indexed by column name (``model_data[cols]``).
        feature_cols: Column selection used as the independent variables.
        dependent_variable: Column used as the target.
        seed_val: Random state for the train/test split and the oversampler.
        test_size: Forwarded to ``train_test_split``.
        solver: Forwarded to ``LogisticRegression``.
        oversample_training_data: When true, the training split is randomly
            oversampled before fitting. The test split is never resampled.
        sampling_strategy: Forwarded to ``RandomOverSampler`` when
            oversampling is enabled.

    Returns:
        The result of ``metrics.confusion_matrix(y_test, y_pred)``.
    """
    # Split into independent and dependent variables
    X = model_data[feature_cols]
    y = model_data[dependent_variable]

    # Split into train and test set
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=test_size,
                                                        random_state=seed_val)

    # Oversample training data only if requested; the flag used to be
    # ignored and the training data was always oversampled.
    if oversample_training_data:
        oversample_obj = imbsample.RandomOverSampler(
            sampling_strategy=sampling_strategy, random_state=seed_val)
        X_train, y_train = oversample_obj.fit_resample(X_train, y_train)

    # Fit the model and evaluate it on the untouched test split
    lr = LogisticRegression(solver=solver)
    lr.fit(X_train, y_train)
    y_pred = lr.predict(X_test)
    cnf_matrix = metrics.confusion_matrix(y_test, y_pred)

    return cnf_matrix
コード例 #6
0
ファイル: datasets.py プロジェクト: asvilesov/modil
 def oversample(self, x, y):
     """Randomly oversample ``x``/``y`` to counter sparse (minority) actions.

     Uses ``RandomOverSampler`` with ``random_state=0``, then calls
     ``self.check_balance()`` before returning the resampled pair.
     """
     sampler = imb.RandomOverSampler(random_state=0)
     balanced_x, balanced_y = sampler.fit_resample(x, y)
     self.check_balance()
     return balanced_x, balanced_y
コード例 #7
0
    def ReSampling(self, data, labels, over_s=True):
        """Rebalance a 0/1-labelled data set when it is strongly skewed.

        Args:
            data: Samples; ``data[0]`` must have a length (feature count).
            labels: Labels matching ``data``; classes 0 and 1 are inspected.
            over_s: Selects the fallback used when ADASYN fails: random
                oversampling if true, random undersampling otherwise.

        Returns:
            Tuple ``(data, labels)``. The inputs are returned without
            resampling when fewer than two distinct labels were present
            (after one all-zero dummy sample of the missing class has been
            prepended) or when the 1-to-0 count ratio lies strictly between
            0.2 and 5. Otherwise the resampled data and labels are returned.
        """
        label_status = Counter(labels)
        print(self.tasktype, "data " + self.tasktype, label_status)

        featurelen = len(data[0])
        # If one of the two classes is missing, prepare a single all-zero
        # sample for it. The builtin ``int`` replaces the ``np.int`` alias,
        # which recent NumPy releases no longer provide.
        if 1 not in label_status:
            x, y = np.zeros(shape=featurelen, dtype=int), 1
        elif 0 not in label_status:
            x, y = np.zeros(shape=featurelen, dtype=int), 0
        else:
            x, y = None, None
        if x is not None:
            data = np.insert(data, 0, x, 0)
            labels = np.insert(labels, 0, y, 0)

        # ``label_status`` still reflects the labels before the dummy insert.
        if len(label_status) < 2:
            print(self.tasktype, "no need to resample")
            return data, labels
        if label_status[1] / label_status[0] < 5. and label_status[
                1] / label_status[0] > 0.2:
            print("data are not biased too much")
            return data, labels

        # Keep the larger class at its current size and target 40% of that
        # size for the other class.
        maxSamples = label_status[0]
        if label_status[1] > label_status[0]:
            maxSamples = label_status[1]
            resampling = over_sampling.ADASYN(ratio={
                1: maxSamples,
                0: int(0.4 * maxSamples)
            })
        else:
            resampling = over_sampling.ADASYN(ratio={
                0: maxSamples,
                1: int(0.4 * maxSamples)
            })

        try:
            data, labels = resampling.fit_sample(data, labels)
        except Exception:
            # Best-effort fallback when ADASYN cannot be applied. Catching
            # ``Exception`` rather than everything lets KeyboardInterrupt
            # and SystemExit propagate.
            print(self.tasktype, "resampling using random method")
            if over_s:
                resampling = over_sampling.RandomOverSampler()
            else:
                resampling = under_sampling.RandomUnderSampler()

            data, labels = resampling.fit_sample(data, labels)

        label_status = Counter(labels)
        print(self.tasktype, "sampling status=", label_status)

        return data, labels
コード例 #8
0
ファイル: UMCE.py プロジェクト: w4k2/weles
def us_os_bac(base_clf, X_train, y_train, X_test, y_test):
    """Score ``base_clf`` after random under- and over-sampling.

    Two clones of ``base_clf`` are fitted, one on a randomly undersampled and
    one on a randomly oversampled copy of the training data.

    Returns:
        Tuple ``(undersampled_score, oversampled_score)`` of balanced
        accuracy on ``(X_test, y_test)``.
    """
    samplers = (
        under_sampling.RandomUnderSampler(),
        over_sampling.RandomOverSampler(),
    )
    # Resample first, then clone, fit and predict, each in
    # under-then-over order.
    resampled_sets = [
        sampler.fit_sample(X_train, y_train) for sampler in samplers
    ]
    models = [base.clone(base_clf) for _ in resampled_sets]
    for model, (X_res, y_res) in zip(models, resampled_sets):
        model.fit(X_res, y_res)
    predictions = [model.predict(X_test) for model in models]
    return tuple(
        metrics.balanced_accuracy_score(y_test, predicted)
        for predicted in predictions
    )
コード例 #9
0
ファイル: UMCE.py プロジェクト: w4k2/weles
    def fit(self, X_train, y_train):
        """Train the ensemble: k undersampled members plus one oversampled.

        The majority class is split into ``k`` shuffled folds, where ``k`` is
        the rounded imbalance ratio (at least 2). One clone of
        ``self.base_clf`` is fitted on each fold joined with all minority
        samples. A further clone is fitted on a randomly oversampled copy of
        the whole training set. Each member is weighted by its balanced
        accuracy on the full training set; ``self.nweights`` holds those
        weights min-max scaled and shifted by 0.01.

        The class comparison uses the values 0 and 1 directly, so the labels
        are expected to be exactly 0 and 1.
        """
        # Keep references to the training data
        self.X_train = X_train
        self.y_train = y_train

        # Identify the majority / minority class from the class counts
        self.classes, counts = np.unique(y_train, return_counts=True)
        majority_c = 0 if counts[0] > counts[1] else 1
        minority_c = 1 - majority_c

        minority_rows = np.where(y_train == minority_c)[0]
        majority_rows = np.where(y_train == majority_c)[0]

        # Number of undersampled members: rounded imbalance ratio, min. 2
        ratio = counts[majority_c] / counts[minority_c]
        self.k = max(int(np.around(ratio)), 2)

        # One member per majority-class fold, each joined with all
        # minority samples
        self.clfs = []
        splitter = model_selection.KFold(n_splits=self.k, shuffle=True)
        for _, fold in splitter.split(majority_rows):
            rows = np.concatenate([minority_rows, majority_rows[fold]])
            X_fold, y_fold = X_train[rows], y_train[rows]

            member = base.clone(self.base_clf)
            member.fit(X_fold, y_fold)
            self.clfs.append(member)

        # Extra member trained on the randomly oversampled training set
        os_member = base.clone(self.base_clf)
        sampler = over_sampling.RandomOverSampler()
        X_os, y_os = sampler.fit_sample(self.X_train, self.y_train)
        os_member.fit(X_os, y_os)
        self.clfs.append(os_member)

        # Member weights: balanced accuracy on the whole training set
        scores = []
        for member in self.clfs:
            predicted = member.predict(self.X_train)
            scores.append(
                metrics.balanced_accuracy_score(self.y_train, predicted))
        self.weights = np.array(scores)

        # Normalise the weights to [0, 1] and shift them by 0.01
        weight_column = self.weights.reshape(-1, 1)
        scaler = preprocessing.MinMaxScaler()
        self.nweights = scaler.fit_transform(weight_column).T[0]
        self.nweights += 0.01
コード例 #10
0
def get_oversampled_df(model_data,
                       feature_columns,
                       dependent_variable,
                       sampling_strategy=0.5,
                       seed_val=0):
    """Return randomly oversampled features with the target attached.

    Args:
        model_data: Table indexed by column name.
        feature_columns: Column selection used as features.
        dependent_variable: Column used as the resampling target.
        sampling_strategy: Forwarded to ``RandomOverSampler``.
        seed_val: Random state of the sampler.

    Returns:
        The resampled feature table, with the resampled target stored in a
        column named ``'Competed'``.
    """
    features = model_data[feature_columns]
    target = model_data[dependent_variable]

    sampler = imbsample.RandomOverSampler(
        sampling_strategy=sampling_strategy, random_state=seed_val)
    oversampled_df, target_over = sampler.fit_resample(features, target)

    # NOTE(review): the column name is fixed to 'Competed' instead of using
    # ``dependent_variable`` -- confirm this is intended.
    oversampled_df['Competed'] = target_over

    return oversampled_df
コード例 #11
0
def curves():
    """Fit three oversampled one-vs-rest models and save their ROC plots.

    For a constant baseline, a logistic regression and a depth-3 decision
    tree (each preceded by a seeded ``RandomOverSampler`` in a pipeline), the
    data from ``prepare_data()`` is split 80/20 and the model is fitted on
    the training part. Per-class, micro-average and macro-average ROC curves
    are computed on the test part and saved to ``./output/roc-<name>.pdf``.

    Relies on names defined outside this function (``SEED``, ``features``,
    ``labels``, ``prepare_data``, ``latexify``, ``format_axes``). Returns
    nothing.
    """
    from sklearn import multiclass
    from itertools import cycle

    mpl.style.use(['seaborn-white', 'seaborn-paper', 'grayscale'])
    latexify()

    # Baseline that always predicts the constant 1.
    baseline = ipipeline.make_pipeline(
        over_sampling.RandomOverSampler(random_state=SEED),
        dummy.DummyClassifier(strategy='constant', constant=1, random_state=SEED),
    )

    logreg = ipipeline.make_pipeline(
        over_sampling.RandomOverSampler(random_state=SEED),
        linear_model.LogisticRegression(solver='lbfgs', random_state=SEED),
    )

    dtree = ipipeline.make_pipeline(
        over_sampling.RandomOverSampler(random_state=SEED),
        tree.DecisionTreeClassifier(max_depth=3, random_state=SEED),
    )


    # (display name, pipeline) pairs; the name is also used for the file name.
    models = (
        ('Constant', baseline),
        ('Logistic Regression', logreg),
        ('Decision Tree', dtree),
    )

    for name, pipe in models:
        classifier = multiclass.OneVsRestClassifier(pipe)

        # The data is reloaded and cleaned for every model.
        df = prepare_data()
        df = df[[*features, 'class']].dropna()

        X, y = df[features].values, df['class'].ravel()
        # One indicator column per entry of ``labels``.
        y = preprocessing.label_binarize(y, classes=labels)

        # Number of classes whose curves are computed below (hard-coded).
        n_labels = 3
        #n_samples, n_features = X.shape


        X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=.2, random_state=SEED)

        classifier.fit(X_train, y_train)


        # When the classifier exposes both methods, the predict_proba scores
        # assigned second replace the decision_function scores.
        if hasattr(classifier, 'decision_function'):
            y_score = classifier.decision_function(X_test)
        
        if hasattr(classifier, 'predict_proba'):
            y_score = classifier.predict_proba(X_test)

        # Compute ROC curve and ROC area for each class
        fpr = dict()
        tpr = dict()
        roc_auc = dict()
        for i in range(n_labels):
            fpr[i], tpr[i], _ = metrics.roc_curve(y_test[:, i], y_score[:, i])
            roc_auc[i] = metrics.auc(fpr[i], tpr[i])

        # Compute micro-average ROC curve and ROC area
        fpr["micro"], tpr["micro"], _ = metrics.roc_curve(y_test.ravel(), y_score.ravel())
        roc_auc["micro"] = metrics.auc(fpr["micro"], tpr["micro"])


        # First aggregate all false positive rates
        all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_labels)]))

        # Then interpolate all ROC curves at these points
        mean_tpr = np.zeros_like(all_fpr)
        for i in range(n_labels):
            mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])


        # Finally average it and compute AUC
        mean_tpr /= n_labels

        fpr["macro"] = all_fpr
        tpr["macro"] = mean_tpr
        roc_auc["macro"] = metrics.auc(fpr["macro"], tpr["macro"])

        # Plot all ROC curves
        latexify(columns=1)
        f, ax = plt.subplots()


        ax.plot(
            fpr["micro"],
            tpr["micro"],
            label=f'micro-average ROC curve (area = {roc_auc["micro"]:0.2f})',
            color='deeppink',
            linestyle=':'
        )

        ax.plot(
            fpr["macro"],
            tpr["macro"],
            label=f'macro-average ROC curve (area = {roc_auc["macro"]:0.2f})',
            color='navy',
            linestyle=':'
        )

        # One solid curve per class, labelled with the class name and AUC.
        colors = cycle(['aqua', 'darkorange', 'cornflowerblue'])
        for i, color in zip(range(n_labels), colors):
            ax.plot(
                fpr[i], 
                tpr[i], 
                color=color,
                label=f'ROC curve of class {labels[i]} (area = {roc_auc[i]:0.2f})'
            )



        # Diagonal reference line from (0, 0) to (1, 1).
        ax.plot([0, 1], [0, 1], 'k--')
        ax.set_xlim([0.0, 1.0])
        ax.set_ylim([0.0, 1.05])
        ax.set_xlabel('FP Rate')
        ax.set_ylabel('TP Rate')
        #ax.set_title(f'Multi-class ROC curves for {name}')
        ax.legend(loc="lower right")
        format_axes(ax)
        #f.tight_layout()
        # NOTE(review): the figure is saved but not closed here -- confirm
        # whether it should be released after saving.
        f.savefig(f'./output/roc-{name.replace(" ", "-").lower()}.pdf', bbox_inches='tight')
コード例 #12
0
def multiple_figures():
    """Save one confusion-matrix heatmap per model under ./output/models/.

    A first figure compares the ``class`` column of ``prepare_data()`` with
    its ``class_overall`` column (the "baseline-link-overall" case). For each
    model pipeline, ``different_models(pipe)`` supplies ``(y, y_pred)``, a
    classification report is printed, and a heatmap titled with the accuracy
    is written to ``./output/models/<name>.pdf``.

    Relies on names defined outside this function (``labels``,
    ``prepare_data``, ``different_models``, ``norm_cm``, ``latexify``,
    ``format_axes_for_cm``, ``ensure_dir``). Returns nothing.
    """
    mpl.style.use(['seaborn-white', 'seaborn-paper', 'grayscale'])
    latexify()

    # NOTE(review): ``cv`` is assigned but not used in this function.
    cv = model_selection.StratifiedKFold(n_splits=10, shuffle=True)
    # The same scaler and resampler objects are passed to every pipeline.
    scaler = preprocessing.StandardScaler()
    resample = over_sampling.RandomOverSampler()

    baseline = pipeline.make_pipeline(
        scaler, resample,
        dummy.DummyClassifier(strategy='constant', constant='good'))

    logreg = pipeline.make_pipeline(
        scaler,
        resample,
        linear_model.LogisticRegression(solver='lbfgs', multi_class='ovr'),
    )

    dtree = pipeline.make_pipeline(
        scaler,
        resample,
        tree.DecisionTreeClassifier(),
    )

    # Built but not evaluated: its entry in ``models`` is commented out.
    knn = pipeline.make_pipeline(
        scaler,
        resample,
        neighbors.KNeighborsClassifier(),
    )

    mlp = pipeline.make_pipeline(
        scaler,
        resample,
        neural_network.MLPClassifier(hidden_layer_sizes=(
            100,
            100,
            100,
        ),
                                     activation='relu',
                                     solver='adam'),
    )

    svc = pipeline.make_pipeline(
        scaler,
        resample,
        svm.LinearSVC(),
    )

    RForest = pipeline.make_pipeline(
        scaler,
        resample,
        ensemble.RandomForestClassifier(n_estimators=100),
    )

    # (display name, pipeline) pairs; the name also yields the file name.
    models = (
        ('Constant', baseline),
        ('Logistic Regression', logreg),
        ('Decision Tree', dtree),
        #('kNN', knn),
        ('Multi-Layer Perceptron', mlp),
        ('linearSVM', svc),
        ('Random Forest', RForest),
    )

    # Special case of baseline: the 'class_overall' column is treated as
    # the prediction for the 'class' column.
    filename = 'baseline-link-overall'
    df = prepare_data()
    y, y_pred = df['class'].ravel(), df['class_overall'].ravel()

    acc = metrics.accuracy_score(y, y_pred)
    prec = metrics.precision_score(y,
                                   y_pred,
                                   average='weighted',
                                   labels=labels)
    recall = metrics.recall_score(y, y_pred, average='weighted', labels=labels)

    cm = metrics.confusion_matrix(y, y_pred, labels=labels)
    # ``norm_cm`` is defined elsewhere; presumably it normalises the matrix
    # so values fit the 0..1 colour range used below -- confirm.
    cm = norm_cm(cm)

    cm = pd.DataFrame(cm, index=labels, columns=labels)

    fig, ax = plt.subplots(dpi=92)
    sns.heatmap(cm,
                vmin=0,
                vmax=1,
                annot=True,
                fmt='.2f',
                cmap='Greys',
                ax=ax,
                cbar=False,
                square=True)
    ax.set_title(
        f'accuracy = {acc:.3f}\n(prec = {prec:.3f}, rec = {recall:.3f})')
    format_axes_for_cm(ax)

    fig.tight_layout()

    ensure_dir('./output/models/')
    fig.savefig(f'./output/models/{filename}.pdf', dpi=92, bbox_inches='tight')
    # Release the figure once it has been written.
    plt.close(fig)
    print(f'Done {filename}')

    for name, pipe in models:
        filename = name.lower().replace(' ', '_')

        # ``different_models`` is defined elsewhere and returns the true and
        # predicted labels for this pipeline.
        y, y_pred = different_models(pipe)

        acc = metrics.accuracy_score(y, y_pred)
        #prec = metrics.precision_score(y, y_pred, average='weighted', labels=labels)
        #recall = metrics.recall_score(y, y_pred, average='weighted', labels=labels)
        print(name)
        print(metrics.classification_report(y, y_pred, labels=labels))

        cm = metrics.confusion_matrix(y, y_pred, labels=labels)
        cm = norm_cm(cm)

        cm = pd.DataFrame(cm, index=labels, columns=labels)

        fig, ax = plt.subplots(dpi=92)
        sns.heatmap(cm,
                    vmin=0,
                    vmax=1,
                    annot=True,
                    fmt='.2f',
                    cmap='Greys',
                    ax=ax,
                    cbar=False,
                    square=True)
        ax.set_title(f'accuracy={acc:.3f}')
        format_axes_for_cm(ax)

        fig.tight_layout()

        ensure_dir('./output/models/')
        fig.savefig(f'./output/models/{filename}.pdf',
                    dpi=92,
                    bbox_inches='tight')
        plt.close(fig)
コード例 #13
0
def main():
    """Draw a 2x3 grid of confusion-matrix heatmaps, one per model.

    Six pipelines (scaler, random oversampler, classifier) are evaluated via
    ``different_models(pipe)``. Each panel is titled with the model name,
    accuracy, weighted precision and weighted recall. The figure is saved to
    ``./different_models.pdf`` and then shown.

    Relies on names defined outside this function (``labels``,
    ``different_models``, ``norm_cm``). Returns nothing.
    """
    # NOTE(review): ``cv`` and ``poly`` are assigned but not used in this
    # function.
    cv = model_selection.StratifiedKFold(n_splits=10, shuffle=True)
    poly = preprocessing.PolynomialFeatures(degree=2)
    # The same scaler and resampler objects are passed to every pipeline.
    scaler = preprocessing.StandardScaler()
    resample = over_sampling.RandomOverSampler()

    baseline = pipeline.make_pipeline(
        scaler, resample,
        dummy.DummyClassifier(strategy='constant', constant='good'))

    logreg = pipeline.make_pipeline(
        scaler,
        resample,
        linear_model.LogisticRegression(),
    )

    dtree = pipeline.make_pipeline(
        scaler,
        resample,
        tree.DecisionTreeClassifier(),
    )

    #knn = pipeline.make_pipeline(
    #    scaler,
    #    resample,
    #    neighbors.KNeighborsClassifier()
    #)

    mlp = pipeline.make_pipeline(scaler, resample,
                                 neural_network.MLPClassifier())

    svc = pipeline.make_pipeline(scaler, resample, svm.LinearSVC())

    RForest = pipeline.make_pipeline(scaler, resample,
                                     ensemble.RandomForestClassifier())

    # Six (display name, pipeline) pairs -- one per panel of the grid.
    models = (
        ('Constant', baseline),
        ('Logistic Regression', logreg),
        ('Decision Tree', dtree),
        #('kNN', knn),
        ('Multi-Layer Perceptron', mlp),
        ('SVM (linear kernel)', svc),
        ('Random Forest', RForest),
    )

    fig, axes = plt.subplots(nrows=2,
                             ncols=3,
                             dpi=96,
                             sharey=True,
                             sharex=True)

    # Flatten the 2x3 axes array so each model is paired with one panel.
    for (name, pipe), ax in zip(models, axes.reshape(-1)):
        # ``different_models`` is defined elsewhere and returns the true and
        # predicted labels for this pipeline.
        y, y_pred = different_models(pipe)

        acc = metrics.accuracy_score(y, y_pred)
        prec = metrics.precision_score(y,
                                       y_pred,
                                       average='weighted',
                                       labels=labels)
        recall = metrics.recall_score(y,
                                      y_pred,
                                      average='weighted',
                                      labels=labels)

        cm = metrics.confusion_matrix(y, y_pred, labels=labels)
        # ``norm_cm`` is defined elsewhere; presumably it normalises the
        # matrix so values fit the 0..1 colour range used below -- confirm.
        cm = norm_cm(cm)

        cm = pd.DataFrame(cm, index=labels, columns=labels)
        sns.heatmap(cm,
                    vmin=0,
                    vmax=1,
                    annot=True,
                    fmt='.2f',
                    cmap='Greys',
                    ax=ax,
                    cbar=False,
                    square=True)

        ax.set_title(
            f'{name}\naccuracy={acc:.3f}\n(prec = {prec:.3f}, rec = {recall:.3f})'
        )

    fig.tight_layout()
    fig.savefig('./different_models.pdf', bbox_inches='tight')

    plt.show()
コード例 #14
0
    def run(self, mode: str, pred: list = None, prob: list = None, **kwargs):
        """Build the AffectNet ``DataLoader``(s) for the requested ``mode``.

        Args:
            mode: One of ``'warmup'``, ``'train'``, ``'test'`` or
                ``'eval_train'``.
            pred: Forwarded as ``pred`` to the ``'train'`` datasets. ``None``
                (the default) is replaced by a fresh empty list.
            prob: Forwarded as ``probability`` to the ``'train'`` datasets.
                ``None`` (the default) is replaced by a fresh empty list.
            **kwargs: Only the presence of the key ``'oversample'`` is
                checked. In ``'warmup'`` mode it enables random oversampling
                of sample indices over binned arousal values.

        Returns:
            A single ``DataLoader`` for ``'warmup'``, ``'test'`` and
            ``'eval_train'``; a ``(labeled_loader, unlabeled_loader)`` tuple
            for ``'train'``; ``None`` for any other ``mode``.
        """
        # Fresh lists per call: a mutable default would be shared between
        # calls and could leak state from one call to the next.
        pred = [] if pred is None else pred
        prob = [] if prob is None else prob

        if 'oversample' in kwargs:
            sampler = over_sampling.RandomOverSampler()
            # 20 bin edges spanning [-1, 1], used to discretise arousal.
            bins = np.linspace(-1, 1, num=20)

        if mode == 'warmup':
            all_dataset = AffectNet(
                self.root_dir,
                img_transform=self.transform_train,
                annotation_filename=
                'affectnet_annotations_train_all_ext_det_noisy.json',
                target_transform=self.target_transform,
                mode=None,
                filter_expressions=self.filter_expression,
                **self.kwargs)
            # debug line
            if 'oversample' in kwargs:
                # Oversample dataset *indices* using the binned arousal
                # value as the class label, then wrap the dataset in a
                # Subset that repeats the selected indices.
                idx = list(range(len(all_dataset)))
                idx = np.asarray(idx).reshape(-1, 1)
                labels = [
                    y['arousal'] for y in all_dataset.data['annotations']
                ]
                labels = np.digitize(labels, bins)
                labels -= 1
                new_idx, _ = sampler.fit_resample(idx, labels)
                all_dataset = Subset(all_dataset, new_idx.reshape(-1).tolist())

            print('# Train Images ' + str(len(all_dataset)))
            train_loader = DataLoader(dataset=all_dataset,
                                      batch_size=self.batch_size * 2,
                                      shuffle=True,
                                      num_workers=self.num_workers,
                                      pin_memory=True)

            return train_loader

        elif mode == 'train':
            # Two views of the same annotation file, selected by the
            # dataset's ``mode`` argument ('labeled' / 'unlabeled').
            labeled_dataset = AffectNet(
                self.root_dir,
                img_transform=self.transform_train,
                annotation_filename=
                'affectnet_annotations_train_all_ext_det_noisy.json',
                target_transform=self.target_transform,
                mode='labeled',
                pred=pred,
                probability=prob,
                filter_expressions=self.filter_expression,
                **self.kwargs)
            unlabeled_dataset = AffectNet(
                self.root_dir,
                img_transform=self.transform_train,
                annotation_filename=
                'affectnet_annotations_train_all_ext_det_noisy.json',
                target_transform=self.target_transform,
                mode='unlabeled',
                pred=pred,
                probability=prob,
                filter_expressions=self.filter_expression,
                **self.kwargs)
            labeled_loader = DataLoader(dataset=labeled_dataset,
                                        batch_size=self.batch_size,
                                        shuffle=True,
                                        num_workers=self.num_workers,
                                        pin_memory=True)
            unlabeled_loader = DataLoader(dataset=unlabeled_dataset,
                                          batch_size=self.batch_size,
                                          shuffle=True,
                                          num_workers=self.num_workers,
                                          pin_memory=True)
            return labeled_loader, unlabeled_loader
        elif mode == 'test':
            # NOTE(review): the test set uses ``transform_train`` and
            # ``shuffle=True`` -- confirm this is intended.
            test_dataset = AffectNet(
                self.root_dir,
                img_transform=self.transform_train,
                annotation_filename=
                'affectnet_annotations_val_all_ext_det.json',
                mode=None,
                target_transform=self.target_transform,
                filter_expressions=self.filter_expression_test,
                **self.kwargs)
            # debug line
            print('# Test Images: ' + str(len(test_dataset)))
            test_loader = DataLoader(dataset=test_dataset,
                                     batch_size=self.batch_size * 2,
                                     shuffle=True,
                                     num_workers=self.num_workers,
                                     pin_memory=True)
            return test_loader
        elif mode == 'eval_train':
            # Training annotations served in a fixed order (no shuffling).
            eval_dataset = AffectNet(
                self.root_dir,
                img_transform=self.transform_train,
                annotation_filename=
                'affectnet_annotations_train_all_ext_det_noisy.json',
                mode=None,
                target_transform=self.target_transform,
                filter_expressions=self.filter_expression,
                **self.kwargs)
            eval_loader = DataLoader(dataset=eval_dataset,
                                     batch_size=self.batch_size * 2,
                                     shuffle=False,
                                     num_workers=self.num_workers,
                                     pin_memory=True)
            return eval_loader
コード例 #15
0
# Evaluate the already-fitted classifier ``cls`` on the held-out test split.
y_pred = cls.predict(X_test)
print('ROC_AUC ', metrics.roc_auc_score(y_test, y_pred))
print('Recall ', metrics.recall_score(y_test, y_pred))
print('Precision ', metrics.precision_score(y_test, y_pred))
print('F1 ', metrics.f1_score(y_test, y_pred))
# NOTE(review): ``plot_confusion_matrix`` is presumably unavailable in newer
# scikit-learn releases -- confirm against the installed version.
metrics.plot_confusion_matrix(cls, X_test, y_test, normalize='true')

# ## Random Over Sample approach

# ### Adjusting the class balance

# In[6]:

from imblearn import over_sampling as over

# Randomly oversample only the minority class of the full data set.
oversample = over.RandomOverSampler(sampling_strategy='minority')
X_over, y_over = oversample.fit_resample(X, y)

# Class counts after resampling (notebook cell output).
y_over.value_counts()

# In[7]:

# Sanity check: features and labels have the same number of rows.
X_over.shape[0] == y_over.shape[0]

# ### 2D distribution
#
# *There is no perceptible change because the data points overlap*

# In[8]:

from sklearn.decomposition import TruncatedSVD
コード例 #16
0
    [(['c1', 'c2', 'c3', 'c4', 'c5'], uf.Straight(), {
        'alias': 'has_straight'
    })  #,
     #        (['s1', 's2', 's3', 's4', 's5'], None)
     ],
    input_df=True,
    df_out=True,
    default=False)

# Combine the four engineered-feature pipelines into one feature union.
features_pipeline = ppl.make_union(engineered_feature_pipeline1,
                                   engineered_feature_pipeline2,
                                   engineered_feature_pipeline3,
                                   engineered_feature_pipeline4)

# Resampling stage: seeded random oversampling.
sampling_pipeline = imbppl.make_pipeline(
    over_sampling.RandomOverSampler(random_state=9565))

# Model stage: L2-penalised multinomial logistic regression.
model_pipeline = imbppl.make_pipeline(
    LogisticRegression(multi_class='multinomial',
                       penalty='l2',
                       random_state=9546,
                       solver="lbfgs"))

# Full pipeline: feature preparation -> oversampling -> classifier.
pipe = imbppl.Pipeline([('prep', features_pipeline),
                        ('sample', sampling_pipeline),
                        ('clf', model_pipeline)])

# Target is the ``hand`` column; features are columns 's1' through 'c5'.
y = d_in.hand
X = d_in.loc[:, 's1':'c5']  # produces a copy

# split - results in < 5 observations for the smallest class (hence the need for sampling)
コード例 #17
0
ファイル: train-hpt.py プロジェクト: acocac/MTLCC-MODIS-GCP
def hyperopt(param_space, X_train, y_train, X_test, y_test, args):
    """Search ``param_space`` with hyperopt's TPE and return the trials.

    Every candidate is scored by 3-fold cross-validation of a pipeline made
    of a seeded ``RandomOverSampler`` followed by a random forest
    (``type == 'rf'``) or an SVC (``type == 'svm'``). The loss is the
    negated mean score. The ``Trials`` object is pickled into
    ``args.modeldir`` and a short summary is printed.

    Args:
        param_space: hyperopt search space; each sample must contain a
            ``'type'`` entry plus the estimator's keyword arguments.
        X_train, y_train: Data used for cross-validation.
        X_test, y_test: Accepted but not used by this function.
        args: Object providing ``cpus``, ``num_eval``, ``modeldir`` and
            ``ssize``.

    Returns:
        The populated ``Trials`` object.
    """
    resampling = over_sampling.RandomOverSampler(sampling_strategy='auto',
                                                 random_state=42)

    start = time.time()

    def objective_function(params):
        # 'type' selects the estimator and is removed so that the remaining
        # entries can be passed on as keyword arguments.
        kind = params.pop('type')
        if kind == 'rf':
            estimator_cls = RandomForestClassifier
        elif kind == 'svm':
            estimator_cls = SVC
        else:
            return 0

        candidate = make_pipeline_imb(resampling, estimator_cls(**params))
        fold_scores = cross_val_score(candidate,
                                      X_train,
                                      y_train,
                                      n_jobs=args.cpus,
                                      cv=3)
        return {'loss': -fold_scores.mean(), 'status': STATUS_OK}

    # Fixed seed so that the search is reproducible.
    rstate = np.random.RandomState(1)

    trials = Trials()
    best_param = fmin(objective_function,
                      param_space,
                      algo=tpe.suggest,
                      max_evals=args.num_eval,
                      trials=trials,
                      rstate=rstate)

    loss = [trial['result']['loss'] for trial in trials.trials]

    trials_name = 'hyperopt_trials_niters{}_ssize{}.pkl'.format(
        args.num_eval, args.ssize)
    joblib.dump(trials, os.path.join(args.modeldir, trials_name))

    elapsed = round(time.time() - start, 2)
    print("")
    print("##### Results")
    # Losses are negated scores, so the smallest loss is the best score.
    print("Score best parameters: ", min(loss) * -1)
    print("Best parameters: ", best_param)
    print("Time elapsed: ", elapsed)
    print("Parameter combinations evaluated: ", args.num_eval)

    return trials
コード例 #18
0
nd1 = preprocessing.scale(df1.values)

logger.info(f"Data loaded")

jn = pushbulletNotifier.JobNotification(devices="phone")

processes = 25
try:
    X_train, X_test, y_train, y_test = model_selection.train_test_split(nd1, gender.values,
                                                                        test_size=0.2, stratify=gender.values)

    logger.info(f"Split data in to training set and validation set.")
    classifier = ['logisticregression', linear_model.LogisticRegression(max_iter=250)]
    sampler_lst = [['smote', over_sampling.SMOTE()],
                   ['adasyn', over_sampling.ADASYN()],
                   ['random¬oversampler', over_sampling.RandomOverSampler()]]
    pipeline_lst = [ [f'{sampler[0]}-{classifier[0]}', make_pipeline(sampler[1], classifier[1])]
                      for sampler in sampler_lst ]  # noqa
    param_grid = {
        'logisticregression__C': 2.0**np.linspace(-8, 5, 15)
        }  # noqa
    for name, pipe in pipeline_lst:
        jn.send(message=f"Starding cross validation with resampling method {name}")
        logger.info(f"Starting cross validation")
        est = model_selection.GridSearchCV(pipe, param_grid, scoring='roc_auc', cv=5, verbose=49, refit=True,
                                           n_jobs=processes, pre_dispatch=processes, return_train_score=True)
        est.fit(X_train, y_train)
        _, yhat = est.predict_proba(X_test).T
        try:
            logger.info(f"Cross validation done, best score was {est.best_score_}")
            logger.info(f"Best params were {est.best_params_}")
コード例 #19
0
def random_oversampling(features, labels):
    """Resample ``features``/``labels`` with a seeded ``RandomOverSampler``.

    Args:
        features: Feature matrix, passed to the sampler as ``X``.
        labels: Target labels, passed to the sampler as ``y``.

    Returns:
        Whatever ``RandomOverSampler.fit_resample`` returns for the inputs.
    """
    # A fixed seed makes repeated calls produce the same resampling.
    sampler = over_sampling.RandomOverSampler(random_state=0)
    resampled = sampler.fit_resample(X=features, y=labels)
    return resampled
コード例 #20
0
def _format_class_counts(label, y):
    """Return ``'<label>:\\t<n1>\\t<n2>... '`` with one count per class in ``y``."""
    counts = pd.Series(y).value_counts(sort=False).values
    return '{}:\t{} '.format(label, '\t'.join(str(c) for c in counts))


def resample_classes(X,
                     Y,
                     how='und1',
                     random_state=None,
                     test_size=0.3,
                     n_jobs=2,
                     split=True,
                     verbose=True):
    """Rebalance the classes of ``(X, Y)`` and optionally split the result.

    Args:
        X: Feature matrix handed to the chosen sampler.
        Y: Class labels handed to the chosen sampler.
        how: Resampling strategy:
            ``'und1'``  random under-sampling without replacement,
            ``'und2'``  cluster-centroid under-sampling,
            ``'und3'``  NearMiss (version 1) under-sampling,
            ``'over1'`` random over-sampling,
            ``'over2'`` SMOTE, applied three times in a row,
            ``'over3'`` ADASYN, applied three times in a row,
            ``'comb1'`` SMOTE + Tomek links, applied three times in a row.
        random_state: Seed forwarded to the sampler and to the split.
        test_size: Test fraction used when ``split`` is true.
        n_jobs: Parallelism forwarded to the samplers that accept it.
        split: When true, split the resampled data into train and test sets.
        verbose: When true, print the strategy and per-class sample counts.

    Returns:
        ``(X_train, X_test, y_train, y_test)`` when ``split`` is true,
        ``(X_res, y_res)`` otherwise, or ``None`` when ``how`` is not
        recognized.
    """
    # Every print below takes a single pre-formatted string, so the output is
    # identical whether `print` is the Python 2 statement or the function.
    # NOTE(review): `ratio`, `fit_sample` and several keyword arguments below
    # belong to an old imbalanced-learn API - confirm the pinned version
    # before upgrading the dependency.
    if how == 'und1':
        if verbose:
            print('Under-sampling the majority class(es) by randomly picking '
                  'samples without replacement')
        samp = imbus.RandomUnderSampler(random_state=random_state,
                                        replacement=False)
        X_res, y_res = samp.fit_sample(X, Y)
    elif how == 'und2':
        if verbose:
            print('Under-sampling by generating centroids based on clustering '
                  'methods')
        samp = imbus.ClusterCentroids(ratio='auto',
                                      random_state=random_state,
                                      estimator=None,
                                      n_jobs=n_jobs)
        X_res, y_res = samp.fit_sample(X, Y)
    elif how == 'und3':
        if verbose:
            print('Under-sampling based on NearMiss methods')
        samp = imbus.NearMiss(ratio='auto',
                              return_indices=False,
                              random_state=random_state,
                              version=1,
                              size_ngh=None,
                              n_neighbors=3,
                              ver3_samp_ngh=None,
                              n_neighbors_ver3=3,
                              n_jobs=n_jobs)
        X_res, y_res = samp.fit_sample(X, Y)
    elif how == 'over1':
        if verbose:
            # Previously a bare `print` emitted an empty line here instead of
            # the message.
            print('Over-sampling the minority class(es) by picking samples at '
                  'random with replacement')
        samp = imbov.RandomOverSampler(random_state=random_state)
        X_res, y_res = samp.fit_sample(X, Y)
    elif how == 'over2':
        if verbose:
            print('Over-sapmling using SMOTE - Synthetic Minority Over-sampling '
                  'Technique')
        X_res, y_res = X, Y
        # Each pass feeds the previous pass's output back into the sampler.
        for _ in range(3):
            samp = imbov.SMOTE(random_state=random_state,
                               ratio=.99,
                               k=None,
                               k_neighbors=5,
                               m=None,
                               m_neighbors=10,
                               out_step=0.5,
                               kind='regular',
                               svm_estimator=None,
                               n_jobs=n_jobs)
            X_res, y_res = samp.fit_sample(X_res, y_res)
    elif how == 'over3':
        if verbose:
            print('Over-sampling using ADASYN - Adaptive Synthetic Sampling '
                  'Approach for Imbalanced Learning')
        X_res, y_res = X, Y
        for _ in range(3):
            samp = imbov.ADASYN(ratio=.93,
                                random_state=random_state,
                                k=None,
                                n_neighbors=5,
                                n_jobs=n_jobs)
            X_res, y_res = samp.fit_sample(X_res, y_res)
    elif how == 'comb1':
        if verbose:
            print('Combine over- and under-sampling using SMOTE and Tomek links.')
        X_res, y_res = X, Y
        for _ in range(3):
            samp = imbcom.SMOTETomek(ratio=.99,
                                     random_state=random_state,
                                     smote=None,
                                     tomek=None,
                                     k=None,
                                     m=None,
                                     out_step=None,
                                     kind_smote=None,
                                     n_jobs=n_jobs)
            X_res, y_res = samp.fit_sample(X_res, y_res)
    else:
        print('Sampling approach not recognized')
        return

    if verbose:
        # The header numbers the count columns 1..k, where k is the number of
        # distinct labels in Y (the report used to assume exactly four).
        n_classes = len(pd.Series(Y).value_counts(sort=False))
        print('\t\t\t' + '\t'.join(str(i) for i in range(1, n_classes + 1)))
        print(_format_class_counts('Counts in y_init', Y))
        print(_format_class_counts('Counts in y_resamp', y_res))

    if not split:
        return X_res, y_res

    X_train, X_test, y_train, y_test = train_test_split(
        X_res, y_res, test_size=test_size, random_state=random_state)
    if verbose:
        print(_format_class_counts('Counts in y_train', y_train))
        print(_format_class_counts('Counts in y_test', y_test))
        print('X_train: {} , X_test: {}'.format(X_train.shape, X_test.shape))

    return X_train, X_test, y_train, y_test
コード例 #21
0
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

from imblearn import over_sampling

# Import the data we prepared ourselves, the same way a library would be imported.
# (Turkish names: egitim = training, val = validation, girdi = input, cikti = output.)
from student import egitimGirdi, egitimCikti, valGirdi, valCikti

# Shape of the original training inputs, for comparison with the resampled sets below.
print(egitimGirdi.shape)

#### SYNTHETIC DATA GENERATION
# Variant 1: random over-sampling with default settings.
ros = over_sampling.RandomOverSampler()
rosEgitimGirdi, rosEgitimCikti = ros.fit_sample(egitimGirdi, egitimCikti)

print(rosEgitimGirdi.shape)

# Variant 2: SMOTE with default settings.
smote = over_sampling.SMOTE()
smoteEgitimGirdi, smoteEgitimCikti = smote.fit_sample(egitimGirdi, egitimCikti)

print(smoteEgitimGirdi.shape)

# Variant 3: ADASYN configured with ratio='minority'.
# NOTE(review): `ratio` and `fit_sample` are old imbalanced-learn names - confirm the pinned version.
ada = over_sampling.ADASYN(ratio='minority')
adasynEgitimGirdi, adasynEgitimCikti = ada.fit_sample(egitimGirdi, egitimCikti)

print(adasynEgitimGirdi.shape)
#print(adasynEgitimGirdi.shape)
コード例 #22
0
def main():
    """Plot precision-recall curves for a set of resampled classifiers.

    Every candidate estimator is chained behind the same ``StandardScaler``
    and ``RandomOverSampler`` objects, wrapped in a one-vs-rest classifier,
    fitted on 80 % of the data and scored on the remaining 20 %.  Only the
    curve for label column 1 is drawn.  The figure is saved to
    ``./output/precision-recall-curve.pdf``.
    """
    mpl.style.use(['seaborn-white', 'seaborn-paper', 'grayscale'])
    latexify(columns=2)

    scaler = preprocessing.StandardScaler()
    resample = over_sampling.RandomOverSampler()

    def chain(estimator):
        # scale -> over-sample -> estimator, reusing the two objects above
        return pipeline.make_pipeline(scaler, resample, estimator)

    baseline = chain(dummy.DummyClassifier(strategy='constant', constant=0))
    logreg = chain(linear_model.LogisticRegression())
    # Built here but not listed in `models`, so it is never fitted or plotted.
    sgd = chain(linear_model.SGDClassifier())
    dtree = chain(tree.DecisionTreeClassifier())
    mlp = chain(neural_network.MLPClassifier())
    svc = chain(svm.LinearSVC())
    RForest = chain(ensemble.RandomForestClassifier())

    models = (
        ('Constant', baseline),
        ('Logistic Reg.', logreg),
        ('Decision Tree', dtree),
        ('Multi-Layer Perceptron', mlp),
        ('SVM (linear kernel)', svc),
        ('Random Forest', RForest),
    )

    # One palette colour per plotted model.
    colors = sns.color_palette("cubehelix", len(models))

    fig, ax = plt.subplots(dpi=92)
    ax.set_xlabel('Recall = $\\frac{{TP}}{{TP+FN}}$')
    ax.set_ylabel('Precision = $\\frac{{TP}}{{TP+FP}}$')

    # Load the data, binarize the labels and hold out 20 % for scoring.
    data = prepare_data()
    X = data[['rssi', 'rssi_avg', 'rssi_std']].values
    y = data['class'].ravel()
    Y = preprocessing.label_binarize(y, classes=classes)
    X_train, X_test, y_train, y_test = model_selection.train_test_split(
        X, Y, test_size=0.2, random_state=random_state)

    observed = 1  # only the intermediate class is plotted

    for color, (name, model) in zip(colors, models):
        # The one-vs-rest wrapper gives every model a per-class score output.
        classifier = multiclass.OneVsRestClassifier(model)
        classifier.fit(X_train, y_train)

        # Prefer decision scores; fall back to class probabilities.
        score_fn = (classifier.decision_function
                    if hasattr(classifier, 'decision_function')
                    else classifier.predict_proba)
        y_score = score_fn(X_test)

        # Hard predictions and accuracy are computed but not reported.
        y_pred = classifier.predict(X_test)
        acc = metrics.accuracy_score(y_test, y_pred)

        truth = y_test[:, observed]
        scores = y_score[:, observed]
        precision, recall, _ = metrics.precision_recall_curve(truth, scores)
        average_precision = metrics.average_precision_score(truth, scores)

        ax.step(recall,
                precision,
                where='post',
                color=color,
                alpha=0.65,
                label=f'{name}')

        print(f'Plotted {name}')

    ax.legend(loc="best")
    format_axes_for_chart(ax)
    fig.tight_layout()

    ensure_dir('./output/')
    fig.savefig('./output/precision-recall-curve.pdf',
                dpi=92,
                bbox_inches='tight')
    plt.close(fig)
コード例 #23
0

def load_rutgers_with_quantiles():
    """Parse every CSV file of the Rutgers "with-quantiles" dataset.

    Returns:
        list: One ``parse_rutgers_with_quantiles`` result per matching CSV
        path, in the order ``glob`` reports them (empty if nothing matches).
    """
    from glob import glob

    pattern = ('../../featureGenerator/datasets/dataset-2-rutgers-wifi' +
               '/with-quantiles/*.csv')
    csv_paths = glob(pattern, recursive=True)

    traces = []
    for csv_path in csv_paths:
        traces.append(parse_rutgers_with_quantiles(csv_path))
    return traces


# Shared cross-validation splitter: 10 stratified, shuffled folds.
# No random_state is passed, so the fold assignment differs between runs.
cv = model_selection.StratifiedKFold(n_splits=10, shuffle=True)

# Logistic-regression pipeline: standardize -> random over-sampling -> classifier.
# NOTE(review): a sampler step presumably requires `pipeline` to be
# imblearn.pipeline rather than sklearn.pipeline - confirm the import.
pipe_logreg = pipeline.Pipeline([
    ('scaler', preprocessing.StandardScaler()),
    ('resample', over_sampling.RandomOverSampler()),
    ('clf', linear_model.LogisticRegression()),
])

# Decision-tree pipeline with the same preprocessing steps.
pipe_dtree = pipeline.Pipeline([
    ('scaler', preprocessing.StandardScaler()),
    ('resample', over_sampling.RandomOverSampler()),
    ('clf', tree.DecisionTreeClassifier()),
])


@memory.cache
def prepare_data():
    """Load the Rutgers traces; the call is wrapped by ``memory.cache``.

    NOTE(review): only the loading step and a progress message are visible
    in this excerpt - the rest of the body appears to be truncated, so the
    return value is not documented here.
    """
    # `memory` is presumably a joblib-style cache object - confirm.
    dataset = load_rutgers_with_quantiles()
    print('Rutgers loaded ...')
コード例 #24
0
ファイル: train.py プロジェクト: acocac/MTLCC-MODIS-GCP
def run_model(args):
    """Train the best hyperopt configuration and persist the model and stats.

    Loads the pickled hyperopt trials, rebuilds the selected configuration
    as a random forest or an SVM behind a ``RandomOverSampler``, fits it on
    the training split, scores it on the validation split, then writes the
    fitted pipeline and a one-row CSV with score and timings.

    Args:
        args: Namespace providing ``classifier`` ('RF' or 'SVM'),
            ``modeldir``, ``trials``, ``ssize`` and ``bestmodel``; it is
            also forwarded to ``prep_data``.

    Raises:
        SystemExit: With exit code 1 when ``args.classifier`` is neither
            'RF' nor 'SVM'.
    """
    classif_type = ['RF', 'SVM']
    if args.classifier not in classif_type:
        print('ERR: select an available classifier (RF, SVM)')
        sys.exit(1)

    X_train, y_train, _ = prep_data('train', args)
    X_val, y_val, _ = prep_data('val', args)

    STATUS_OK = 'ok'

    trials_file = os.path.join(
        args.modeldir, 'hpt',
        'hyperopt_trials_niters{}_ssize{}.pkl'.format(args.trials,
                                                      args.ssize))
    # Context manager so the file handle is closed after loading.
    with open(trials_file, 'rb') as fh:
        trials = pkl.load(fh)

    bestmodel = getBestModelfromTrials(trials.trials, args.bestmodel,
                                       STATUS_OK)

    resampling = over_sampling.RandomOverSampler(sampling_strategy='auto',
                                                 random_state=42)
    if args.classifier == 'RF':
        # The trial stores choice indices, not the values themselves.
        if bestmodel['max_features'][0] == 0:
            max_features = 'auto'
        else:
            max_features = 'sqrt'

        # Index 0 means bootstrapping on. This must be a real boolean: the
        # former strings 'True'/'False' are both truthy, so bootstrapping
        # could never be switched off.
        bootstrap = bool(bestmodel['bootstrap'][0] == 0)

        estimator = RandomForestClassifier(
            n_estimators=int(bestmodel['n_estimators'][0]),
            max_features=max_features,
            max_depth=int(bestmodel['max_depth'][0]),
            min_samples_leaf=int(bestmodel['min_samples_leaf'][0]),
            min_samples_split=int(bestmodel['min_samples_split'][0]),
            bootstrap=bootstrap,
            n_jobs=-1,
            verbose=1)
    else:
        # Rebuild the search grids so the stored indices can be mapped back
        # to values.
        c_lim = (-2, 7)
        g_lim = (-2, 4)

        C_space = [10**exp for exp in range(*c_lim)]
        gamma_space = [10**exp for exp in range(*g_lim)]
        kernel_space = ['rbf']

        C = C_space[int(bestmodel['C'][0])]
        gamma = gamma_space[int(bestmodel['gamma'][0])]
        kernel = kernel_space[int(bestmodel['kernel'][0])]

        print('Best model using C = {} gamma = {} and kernel {}'.format(
            C, gamma, kernel))
        estimator = SVC(C=C, gamma=gamma, kernel=kernel, verbose=1)

    pl = make_pipeline_imb(resampling, estimator)

    # -- train the selected classifier
    print('Training with the best model with parameters: ', bestmodel)
    start_train_time = time.time()
    estimator_fit = pl.fit(X_train, y_train)
    train_time = round(time.time() - start_train_time, 2)
    print('Training time (s): ', train_time)

    # -- score the fitted pipeline on the validation split
    start_test_time = time.time()
    test_score = estimator_fit.score(X_val, y_val)
    test_time = round(time.time() - start_test_time, 2)
    print("Test Score: ", test_score)
    print("Time elapsed: ", test_time)

    outdir = os.path.join(args.modeldir, 'models')
    if not os.path.exists(outdir):
        os.makedirs(outdir)

    # -- save the model (written by joblib despite the .h5 extension)
    model_file = os.path.join(
        outdir, 'model-{}_bm{}.h5'.format(args.classifier, args.bestmodel))
    joblib.dump(estimator_fit, model_file)
    print("Writing the model over {}".format(model_file))

    # -- one-row CSV: validation score, training time, scoring time
    eval_label = ['OA', 'train_time', 'test_time']

    history = np.zeros((len(eval_label), 1))

    history_file = os.path.join(
        outdir, 'trainingHistory-{}_bm{}.csv'.format(args.classifier,
                                                     args.bestmodel))

    history[0] = test_score
    history[1] = train_time
    history[2] = test_time

    df = pd.DataFrame(np.transpose(history), columns=eval_label)
    df.to_csv(history_file)