def test_pipeline_ducktyping():
    # each bare attribute lookup below raises AttributeError if the pipeline
    # does not duck-type that method
    pipeline = make_pipeline(Mult(5))
    pipeline.predict
    pipeline.transform
    pipeline.inverse_transform

    pipeline = make_pipeline(Transf())
    assert not hasattr(pipeline, 'predict')
    pipeline.transform
    pipeline.inverse_transform

    pipeline = make_pipeline(None)
    assert not hasattr(pipeline, 'predict')
    pipeline.transform
    pipeline.inverse_transform

    pipeline = make_pipeline(Transf(), NoInvTransf())
    assert not hasattr(pipeline, 'predict')
    pipeline.transform
    assert not hasattr(pipeline, 'inverse_transform')

    pipeline = make_pipeline(NoInvTransf(), Transf())
    assert not hasattr(pipeline, 'predict')
    pipeline.transform
    assert not hasattr(pipeline, 'inverse_transform')
def test_make_pipeline_memory():
    cachedir = mkdtemp()
    try:
        memory = Memory(cachedir=cachedir, verbose=10)
        pipeline = make_pipeline(DummyTransf(), SVC(), memory=memory)
        assert pipeline.memory is memory
        pipeline = make_pipeline(DummyTransf(), SVC())
        assert pipeline.memory is None
    finally:
        shutil.rmtree(cachedir)
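A short sketch of what the memory parameter buys: a joblib.Memory caches fitted transformers across repeated fits, e.g. inside a grid search. DummyTransf and SVC follow the test above; note that newer joblib versions use location= where this test uses the older cachedir= argument:

from tempfile import mkdtemp
from shutil import rmtree
from joblib import Memory

cachedir = mkdtemp()
memory = Memory(location=cachedir, verbose=0)
pipe = make_pipeline(DummyTransf(), SVC(), memory=memory)
# repeated pipe.fit(...) calls now reuse the cached DummyTransf fit
rmtree(cachedir)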
Example #4
def svm_benchmark(isoform_list):
    start = time.time()
    columns_ = [
        'ACC', 'BA', 'ROC-AUC', 'PR-AUC', 'MCC', 'SN', 'SP', 'PR', 'F1', 'CK'
    ]
    df = pd.DataFrame(columns_)
    isoform_list_ = isoform_list
    for isoform_ in isoform_list_:
        #--------------------
        X_train = np.load("./data/{}/train_data.npy".format(isoform_))
        X_val = np.load("./data/{}/val_data.npy".format(isoform_))
        X_test = np.load("./data/{}/test_data.npy".format(isoform_))

        y_train = np.load("./data/{}/train_label.npy".format(isoform_))
        y_val = np.load("./data/{}/val_label.npy".format(isoform_))
        y_test = np.load("./data/{}/test_label.npy".format(isoform_))
        #--------------------
        # Set Up Parameter
        my_C = [0.001, 0.01, 0.1, 1, 10, 100]
        my_gamma = [0.001, 0.01, 0.1, 1, 10, 100]
        pred_val_list = []
        para_list = []
        for p1 in my_C:
            for p2 in my_gamma:
                para_list.append((p1, p2))
                # NB: `threshold` is assumed to be a module-level constant in
                # the original script.
                my_classifier = make_pipeline(
                    VarianceThreshold(threshold),
                    SVC(C=p1, gamma=p2, probability=True))
                pred_val = my_classifier.fit(
                    X_train, y_train).predict_proba(X_val)[:, 1]
                pred_val_list.append(list(pred_val))
        #--------------------
        auc_val_list = []
        for pred in pred_val_list:
            auc = roc_auc_score(y_val, pred)
            auc_val_list.append(auc)
        i = np.argmax(auc_val_list)
        #--------------------
        best_C = para_list[i][0]
        best_gamma = para_list[i][1]
        tuned_classifier = make_pipeline(
            VarianceThreshold(threshold),
            SVC(C=best_C, gamma=best_gamma, probability=True))
        pred_test = tuned_classifier.fit(
            X_train, y_train).predict_proba(X_test)[:, 1]
        #--------------------
        metric = printPerformance(y_test, pred_test)
        df1 = pd.DataFrame(metric)
        df = pd.concat([df, df1], axis=1)
    df.columns = ["Metrics"] + isoform_list_
    df.to_csv("svm_benchmark.csv", index=None)
    end = time.time()
    processing_time = (end - start)
    print("Processing time: {}".format(processing_time))
Example #5
    def __build_preprocessor(self, useSelector):
        """Build the preprocessing pipeline.

        :param useSelector: whether to append the feature selector.
        :return: a pipeline wrapping the extractor (and optionally the selector).
        """
        extractor = self.__build_extractor()
        if useSelector:
            selector = self.__build_selector()
            return make_pipeline(extractor, selector)
        return make_pipeline(extractor)
Example #6
def print_metrics(model, X, y, scoring='f1', oversample=False):
    if oversample:
        pipeline = make_pipeline(StandardScaler(),
                                 RandomOverSampler(random_state=11), model)
    else:
        pipeline = make_pipeline(StandardScaler(), model)
    score = cross_val_score(pipeline, X, y, scoring=scoring)
    # NB: the confusion matrix below comes from the bare model fitted on the
    # raw (unscaled, unresampled) data, not from the pipeline above.
    fitted_model = model.fit(X, y)
    cm = confusion_matrix(y, fitted_model.predict(X))
    print(score)
    print(cm)
Example #7
def rf_benchmark(isoform_list):
    start = time.time()
    columns_ = [
        'ACC', 'BA', 'ROC-AUC', 'PR-AUC', 'MCC', 'SN', 'SP', 'PR', 'F1', 'CK'
    ]
    df = pd.DataFrame(columns_)
    isoform_list_ = isoform_list
    for isoform_ in isoform_list_:
        #--------------------
        X_train = np.load("./data/{}/train_data.npy".format(isoform_))
        X_val = np.load("./data/{}/val_data.npy".format(isoform_))
        X_test = np.load("./data/{}/test_data.npy".format(isoform_))

        y_train = np.load("./data/{}/train_label.npy".format(isoform_))
        y_val = np.load("./data/{}/val_label.npy".format(isoform_))
        y_test = np.load("./data/{}/test_label.npy".format(isoform_))
        #--------------------
        # Set Up Parameter
        my_n_estimators = np.arange(25, 201, 25)
        pred_val_list = []
        para_list = []
        for p in my_n_estimators:
            para_list.append(p)
            my_classifier = make_pipeline(
                VarianceThreshold(threshold),
                RandomForestClassifier(random_state=42, n_estimators=p))
            pred_val = my_classifier.fit(
                X_train, y_train).predict_proba(X_val)[:, 1]
            pred_val_list.append(list(pred_val))
        #--------------------
        auc_val_list = []
        for pred in pred_val_list:
            auc = roc_auc_score(y_val, pred)
            auc_val_list.append(auc)
        i = np.argmax(auc_val_list)
        #--------------------
        best_n_estimators = para_list[i]
        tuned_classifier = make_pipeline(
            VarianceThreshold(threshold),
            RandomForestClassifier(random_state=42,
                                   n_estimators=best_n_estimators))
        pred_test = tuned_classifier.fit(
            X_train, y_train).predict_proba(X_test)[:, 1]
        #--------------------
        metric = printPerformance(y_test, pred_test)
        df1 = pd.DataFrame(metric)
        df = pd.concat([df, df1], axis=1)
    df.columns = ["Metrics"] + isoform_list_
    df.to_csv("rf_benchmark.csv", index=None)
    end = time.time()
    processing_time = (end - start)
    print("Processing time: {}".format(processing_time))
def test_make_pipeline():
    t1 = Transf()
    t2 = Transf()
    pipe = make_pipeline(t1, t2)
    assert isinstance(pipe, Pipeline)
    assert pipe.steps[0][0] == "transf-1"
    assert pipe.steps[1][0] == "transf-2"

    pipe = make_pipeline(t1, t2, FitParamT())
    assert isinstance(pipe, Pipeline)
    assert pipe.steps[0][0] == "transf-1"
    assert pipe.steps[1][0] == "transf-2"
    assert pipe.steps[2][0] == "fitparamt"
Example #9
def test_classes_property():
    iris = load_iris()
    X = iris.data
    y = iris.target

    reg = make_pipeline(SelectKBest(k=1), LinearRegression())
    reg.fit(X, y)
    assert_raises(AttributeError, getattr, reg, "classes_")

    clf = make_pipeline(SelectKBest(k=1), LogisticRegression(random_state=0))
    assert_raises(AttributeError, getattr, clf, "classes_")
    clf.fit(X, y)
    assert_array_equal(clf.classes_, np.unique(y))
Example #10
def test_make_pipeline():
    t1 = Transf()
    t2 = Transf()
    pipe = make_pipeline(t1, t2)
    assert_true(isinstance(pipe, Pipeline))
    assert_equal(pipe.steps[0][0], "transf-1")
    assert_equal(pipe.steps[1][0], "transf-2")

    pipe = make_pipeline(t1, t2, FitParamT())
    assert_true(isinstance(pipe, Pipeline))
    assert_equal(pipe.steps[0][0], "transf-1")
    assert_equal(pipe.steps[1][0], "transf-2")
    assert_equal(pipe.steps[2][0], "fitparamt")
Example #13
def test_make_pipeline():
    t1 = TransfT()
    t2 = TransfT()

    pipe = make_pipeline(t1, t2)
    assert_true(isinstance(pipe, Pipeline))
    assert_equal(pipe.steps[0][0], "transft-1")
    assert_equal(pipe.steps[1][0], "transft-2")

    pipe = make_pipeline(t1, t2, FitParamT())
    assert_true(isinstance(pipe, Pipeline))
    assert_equal(pipe.steps[0][0], "transft-1")
    assert_equal(pipe.steps[1][0], "transft-2")
    assert_equal(pipe.steps[2][0], "fitparamt")
Example #14
def test_pipeline_fit_then_sample_of_three_samplers_with_sampler_last_estimator():
    X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9],
                               n_informative=3, n_redundant=1, flip_y=0,
                               n_features=20, n_clusters_per_class=1,
                               n_samples=50000, random_state=0)

    rus = RandomUnderSampler(random_state=42)
    enn = ENN()
    pipeline = make_pipeline(rus, enn, rus)
    X_fit_sample_resampled, y_fit_sample_resampled = pipeline.fit_sample(X, y)
    pipeline = make_pipeline(rus, enn, rus)
    pipeline.fit(X, y)
    X_fit_then_sample_resampled, y_fit_then_sample_resampled = \
        pipeline.sample(X, y)
    assert_array_equal(X_fit_sample_resampled, X_fit_then_sample_resampled)
    assert_array_equal(y_fit_sample_resampled, y_fit_then_sample_resampled)
Example #15
def data_sampling(X, Y, k, oversampling=True, undersampling=True, class_weight='balanced'):
    # SMOTE the minority class up to about 55% of the majority class
    over = SMOTE(sampling_strategy=0.55, k_neighbors=k)
    # then randomly undersample the majority class until the classes are balanced 1:1
    under = RandomUnderSampler(sampling_strategy=1.)
    if oversampling and undersampling:
        pipe = make_pipeline(over, under)
        X1, Y1 = pipe.fit_resample(X, Y)
    elif oversampling:
        pipe = make_pipeline(over)
        X1, Y1 = pipe.fit_resample(X, Y)
    elif undersampling:
        pipe = make_pipeline(under)
        X1, Y1 = pipe.fit_resample(X, Y)
    else:
        # no resampling; rely on class_weight instead
        X1, Y1 = X, Y
    return X1, Y1
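A minimal usage sketch for data_sampling; the dataset here is hypothetical, built with scikit-learn's make_classification:

from sklearn.datasets import make_classification

X, Y = make_classification(n_samples=1000, weights=[0.9, 0.1], random_state=0)
# SMOTE the minority class up to ~55% of the majority, then undersample to 1:1
X_bal, Y_bal = data_sampling(X, Y, k=5)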
def sample(ngram_range=(1, 2), n_features=85000, methods=()):
    from sklearn.naive_bayes import MultinomialNB
    from sklearn.model_selection import StratifiedShuffleSplit
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.metrics import f1_score
    from imblearn.pipeline import make_pipeline

    methodsResults = pandas.DataFrame()
    # NB: `x` and `y` are module-level globals in the original script.
    for osm in methods:
        sss = StratifiedShuffleSplit(n_splits=10,
                                     test_size=0.2,
                                     random_state=3000)
        # confusion_sum = [[0, 0, 0], [0, 0, 0], [0, 0, 0]]
        result = []
        for train_index, test_index in sss.split(x, y):
            cvec = CountVectorizer()
            cvec.set_params(max_features=n_features, ngram_range=ngram_range)
            clf = MultinomialNB()

            if osm == 0:
                pipeline = make_pipeline(cvec, clf)
            else:
                pipeline = make_pipeline(cvec, osm, clf)

            # X = cvec.fit_transform(x)
            x_train, x_test = x[train_index], x[test_index]
            # X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]

            sentiment_fit = pipeline.fit(x_train, y_train)
            y_pred = sentiment_fit.predict(x_test)

            # conmat = np.array(confusion_matrix(y_test, y_pred, labels=[2.0, 3.0, 4.0]))
            # print(conmat)
            f1 = f1_score(y_test,
                          y_pred,
                          labels=[2.0, 3.0, 4.0],
                          average="micro")
            result.append(f1)
        # print(result)
        if osm == 0:
            methodsResults["Base Case"] = result
        else:
            methodsResults[type(osm).__name__] = result
        # print(confusion_sum)
    return methodsResults
Example #17
def naive_bayse_cross(train_x, train_y, validation, test, test_data):
    print("training data...")

    clf_pipe = make_pipeline(CountVectorizer(ngram_range=(1, 2)),
                             RandomUnderSampler(), MultinomialNB(alpha=0.01))

    scores = cross_val_score(clf_pipe, train_x, train_y, cv=10)

    print("Model is fitted!")
    if validation:
        print("scores: ", scores)
        print("std of score: ", np.std(scores))
        print("Accuracy: %0.2f (+/- %0.2f)" %
              (scores.mean(), scores.std() * 2))
        y_pred = cross_val_predict(clf_pipe, train_x, train_y, cv=5)

        # Evaluation
        # classification report
        print("classification reports:",
              classification_report(train_y, y_pred))
        # confusion matrix
        conf_mat = confusion_matrix(train_y, y_pred)
        print(conf_mat)
        plot_conf(conf_mat)
    if test:
        naive_bayes(test_data)
def create_pipeline(model, sampling_strategy, y):
    """Wraps a model in a pipeline to resample training data.

    Args:
        model (sklearn Model): The model to wrap.
        sampling_strategy (SamplingStrategy): The sampling strategy for the pipeline.
        y (pandas DataFrame): A dataframe containing targets.

    Returns:
        sklearn pipeline: A pipeline wrapping the model.

    """

    balancer = 'passthrough'

    if sampling_strategy == SamplingStrategy.UNDERSAMPLING:
        # Use a random undersampler when undersampling is the
        # chosen resampling strategy
        databalancing_stats(y, sampling_strategy)
        balancer = RandomUnderSampler(random_state=SEED)

    elif sampling_strategy == SamplingStrategy.OVERSAMPLING:
        # Use SMOTE, the most common oversampler, when oversampling
        # is the chosen resampling strategy
        databalancing_stats(y, sampling_strategy)
        balancer = SMOTE(random_state=SEED, n_jobs=-1)

    return make_pipeline(balancer, model)
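A hedged usage sketch for create_pipeline; SamplingStrategy, SEED and databalancing_stats are assumed to be defined elsewhere in this module, and the classifier is illustrative:

from sklearn.linear_model import LogisticRegression

model = LogisticRegression(max_iter=1000)
pipe = create_pipeline(model, SamplingStrategy.OVERSAMPLING, y_train)
pipe.fit(X_train, y_train)  # SMOTE only resamples the data passed to fit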
def svm(X_train, X_test, y_train, y_test):

	svm_kernel = 'poly'
	param_grid = {'svr__C': [10],
              'svr__gamma': [0.01], }
	regr = make_pipeline(StandardScaler(), SVR(kernel=svm_kernel))
	grid = GridSearchCV(regr, param_grid,
					    n_jobs=-1,
					    return_train_score=True)
	grid = grid.fit(X_train, y_train)
	y_pred_test = grid.predict(X_test)
	y_pred_train = grid.predict(X_train)
	MSE_test = mean_squared_error(y_true=y_test, y_pred=y_pred_test)
	MSE_train = mean_squared_error(y_true=y_train, y_pred=y_pred_train)

	print(grid)
	# print('cv_results_: ', grid.cv_results_)
	# print('Best score: ', grid.best_score_)
	# print('Best parameter: ', grid.best_estimator_)
	# print('Best parameters: ', grid.best_params_)
	print('MSE train: %.2f , MSE test: %.2f' % (MSE_train, MSE_test))

	test_scores = grid.cv_results_['mean_test_score']
	train_scores = grid.cv_results_['mean_train_score'] 

	print('test_scores:', test_scores)
	print('train_scores:', train_scores)

	return test_scores, train_scores, MSE_train, MSE_test
def svm(X_train, X_test, y_train, y_test, param_grid):

    regr = make_pipeline(StandardScaler(), SVR())
    grid = GridSearchCV(regr, param_grid, n_jobs=-1, return_train_score=True)
    grid = grid.fit(X_train, y_train)

    # re-predict using best parameter:

    params = {}
    params['svr__kernel'] = [grid.best_params_['svr__kernel']]

    grid = GridSearchCV(regr, params, n_jobs=-1, return_train_score=True)
    grid = grid.fit(X_train, y_train)

    y_pred_test = grid.predict(X_test)
    y_pred_train = grid.predict(X_train)
    MSE_test = mean_squared_error(y_true=y_test, y_pred=y_pred_test)
    MSE_train = mean_squared_error(y_true=y_train, y_pred=y_pred_train)

    # print(grid)
    # print('cv_results_: ', grid.cv_results_)
    # print('Best score: ', grid.best_score_)
    # print('Best parameter: ', grid.best_estimator_)
    # print('Best parameters: ', grid.best_params_)
    # print('MSE train: %.2f , MSE test: %.2f'
    #      % (MSE_train, MSE_test))

    test_scores = grid.cv_results_['mean_test_score']
    train_scores = grid.cv_results_['mean_train_score']

    slope, intercept, r_value, p_value, std_err = stats.linregress(
        y_test, y_pred_test)
    return test_scores, train_scores, MSE_train, MSE_test, r_value, p_value, std_err
Example #21
def test_single_estimator():
    # Check singleton ensembles.
    X, y = make_imbalance(
        iris.data,
        iris.target,
        sampling_strategy={
            0: 20,
            1: 25,
            2: 50
        },
        random_state=0,
    )
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    clf1 = BalancedBaggingClassifier(
        base_estimator=KNeighborsClassifier(),
        n_estimators=1,
        bootstrap=False,
        bootstrap_features=False,
        random_state=0,
    ).fit(X_train, y_train)

    clf2 = make_pipeline(
        RandomUnderSampler(
            random_state=clf1.estimators_[0].steps[0][1].random_state),
        KNeighborsClassifier(),
    ).fit(X_train, y_train)

    assert_array_equal(clf1.predict(X_test), clf2.predict(X_test))
Example #22
def sm_col_clf_piper(X_train, y_train, X_test, X_label, parameters, clf, scoring='f1'):

    # parameters: dict of parameters to tune
    # clf: classifier
    # X_label: test-set labels used for the final evaluation
    # scoring: score used for tuning, default is f1

    # SMOTE the training set to deal with class imbalance
    n_cols = X_train.shape[1]
    sm = SMOTE()
    X_train, y_train = sm.fit_sample(X_train, y_train)

    pipe = make_pipeline(
        SFS(clf, "best", forward=False, scoring=scoring, cv=5),
        clf,
    )
    # tune the model over the given parameter grid
    grid = GridSearchCV(estimator=pipe, param_grid=parameters, cv=5,
                        n_jobs=-1, verbose=50, scoring=scoring)
    grid.fit(X_train, y_train)
    # get the indices of the selected features
    best_pipe = grid.best_estimator_
    feature_idx = (best_pipe.named_steps['sequentialfeatureselector']
                   .transform(np.arange(n_cols).reshape(1, -1)))[0]
    # use the best parameters to predict the test labels
    pred = grid.predict(X_test)

    # compute evaluation scores on the test set
    conf = confusion_matrix(X_label, pred)
    test_score = {
        "accuracy": accuracy_score(X_label, pred),
        "precision": precision_score(X_label, pred, average="binary"),
        "recall": recall_score(X_label, pred, average="binary"),
        "f1_score": f1_score(X_label, pred, average="binary"),
        "roc_auc": roc_auc_score(X_label, pred),
    }
    return (grid.cv_results_['mean_test_score'], grid.best_params_, conf,
            test_score, feature_idx)
Example #23
def train_mnb(X, y, **kwargs):
    """
    Transform the text corpus with a TfidfVectorizer and train a
    Naive Bayes model on SMOTE-resampled data.

    Parameters
    ----------
    X : array_like
        List of song lyrics as strings.

    y : array_like
        List of labels/artists as strings.

    **kwargs : Arbitrary keyword arguments passed as hyperparameters to MultinomialNB.

    Returns
    -------
    A pipeline with the text preprocessor and the trained
    sklearn.naive_bayes.MultinomialNB classification model.
    """
    tf = TfidfVectorizer()
    sm = SMOTE(random_state=20)
    m = MultinomialNB(**kwargs)
    pipeline = make_pipeline(tf, sm, m)
    pipeline.fit(X, y)
    print(f"\ntraining accuracy: {round(pipeline.score(X, y),3)}")
    #print('\nConfusion matrix:')
    #print(f'Classes: {pipeline.classes_}')
    #print(confusion_matrix(y, pipeline.predict(X), labels=pipeline.classes_))
    cross_val = cross_val_score(pipeline, X, y, cv=5)
    print(f'\ncross-validation accuracy: {cross_val.round(3)}')

    return pipeline
Example #24
def pipeline(estimator):
    '''
    Wrap the estimator in a scaling + minority over-sampling pipeline.
    '''
    return make_pipeline(StandardScaler(),
                         RandomOverSampler(random_state=42, ratio='minority'),
                         estimator)
def adjust(dimension=yV, dimName="Valence"):
    FOLDS = 10
    sss = StratifiedShuffleSplit(n_splits=FOLDS,
                                 test_size=0.2,
                                 random_state=3000)
    result = []
    x = emoBank.drop(columns=["id", "sentence", dimName], inplace=False).values
    for train_index, test_index in sss.split(x, dimension):

        clf = MultinomialNB()
        pipeline = make_pipeline(clf)

        x_train, x_test = x[train_index], x[test_index]
        y_train, y_test = dimension[train_index], dimension[test_index]

        sentiment_fit = pipeline.fit(x_train, y_train)
        y_pred = sentiment_fit.predict(x_test)
        f1 = f1_score(y_test, y_pred, labels=[2.0, 3.0, 4.0], average="micro")
        result.append(f1)

    avgScore = sum(result)

    elapsedTime = time.time() - start_time
    print("elapsed time: " + str(elapsedTime))

    print("F1 score for " + str(dimName) + ": " + str(avgScore / FOLDS))

    return sentiment_fit
Example #26
def run(X, y, learning_curve=False, validation_curve=False):
    RANDOM_STATE = 0

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=RANDOM_STATE)

    pipeline = make_pipeline(
        SelectKBest(score_func=f_classif, k=10),
        QuantileTransformer(),
        RandomUnderSampler(random_state=RANDOM_STATE),
        GradientBoostingClassifier(random_state=RANDOM_STATE),
    )

    if learning_curve:
        ax = plot_learning_curve(pipeline,
                                 X_train,
                                 y_train,
                                 cv=5,
                                 scoring=make_scorer(fbeta_score, beta=2))
        plt.show()

    if validation_curve:
        ax = plot_validation_curve(
            pipeline,
            X_train,
            y_train,
            cv=5,
            scoring=make_scorer(fbeta_score, beta=2),
            param_name="selectkbest__k",
            param_range=[10, 20, 30, 40, 50],
        )
        plt.show()
def decisssionTreeSimpleVal(X, Y):
    X_train, X_val, Y_train, Y_val = train_test_split(X, Y, train_size=0.8)
    #ica = FastICA(n_components=K, whiten=True).fit(X_train, Y)
    #X_red_train = ica.transform(X_train)
    #X_red_val = ica.transform(X_val)

    # Normalization (fit the scaler on train only, then reuse it on validation)
    scalar = StandardScaler()
    X_train_n = scalar.fit_transform(X_train)
    X_val_n = scalar.transform(X_val)

    oversample = SMOTE()  # SMOTE over-samples the minority class
    tree = DecisionTreeClassifier(criterion="gini",
                                  splitter="best",
                                  max_depth=5,
                                  random_state=RANDOM_STATE,
                                  presort=True)

    classifier = make_pipeline(oversample, tree)

    # Prediction and evaluation
    classifier.fit(X_train_n, Y_train)
    prediction = classifier.predict(X_val_n)
    print("\n", classification_report(prediction, Y_val))
    print("N ones:", len(np.where(prediction == 1)[0]) / len(prediction))
    return classifier, scalar
    def create_pipelines(self):
        self.model_pipelines = []
        # build one pipeline per (scaler, sampler, estimator) combination
        for estimator in self.estimators:
            for sampler in self.samplers:
                for scaler in self.scalers:
                    pipeline = make_pipeline(scaler, sampler, estimator)
                    self.model_pipelines.append(pipeline)
def OverSampling_SMOTE(df):
    df.replace([np.inf, -np.inf], np.nan, inplace=True)

    train_df = df[df['TARGET'].notnull()]
    test_df = df[df['TARGET'].isnull()]

    train_df_X = train_df.drop('TARGET', axis=1)
    train_df_y = train_df.TARGET

    # SMOTE
    print('Creating Smote Data...')
    smote = SMOTE(k_neighbors=5, n_jobs=-1)
    smote_enn = make_pipeline(SimpleImputer(), SMOTEENN(smote=smote))
    X_res, y_res = smote_enn.fit_resample(train_df_X, train_df_y)

    X_res_df = pd.DataFrame(X_res, columns=train_df_X.columns)

    train_df_new = X_res_df.join(y_res.to_frame())

    df = pd.concat([train_df_new, test_df])

    # Save data to csv file
    df.to_csv('data/df_prepared_to_model.csv')

    # Save data to pickle file
    df.to_pickle("data/df_prepared_to_model.pkl")

    return df
def test_pipeline_param_error():
    clf = make_pipeline(LogisticRegression())
    with pytest.raises(
        ValueError,
        match="Pipeline.fit does not accept the sample_weight parameter",
    ):
        clf.fit([[0], [0]], [0, 1], sample_weight=[1, 1])
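For contrast, a minimal sketch of how fit parameters are routed to a pipeline step: they must carry the step-name prefix. This is standard scikit-learn Pipeline behaviour, shown here with toy data:

clf = make_pipeline(LogisticRegression())
# sample_weight is routed to the LogisticRegression step via its prefix
clf.fit([[0], [1]], [0, 1], logisticregression__sample_weight=[1, 1])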
Example #31
def svm_text_classification(vec_params,
                            svm_params,
                            train_feat,
                            train_label,
                            test_feat,
                            test_label,
                            random_state=42):
    '''
    A function to classify text data using count vectorization, random
    under-sampling, and a support vector machine.

    train_feat/train_label = training features and labels
    test_feat/test_label = testing features and labels
    vec_params = parameters for the CountVectorizer
    svm_params = parameters for the SVC
    '''

    pipe = make_pipeline(CountVectorizer(**vec_params),
                         RandomUnderSampler(random_state=random_state),
                         SVC(**svm_params))

    pipe_fit = pipe.fit(train_feat, train_label)
    y_pred = pipe_fit.predict(test_feat)

    cnf_matrix = confusion_matrix(test_label, y_pred)

    return pipe, pipe_fit, y_pred, cnf_matrix
Example #32
def mlpSimpleDiv(X, Y):
    X_train, X_val, Y_train, Y_val = train_test_split(X, Y, train_size=0.8)
    #X_red_train = FastICA(n_components=K, whiten=True).fit_transform(X_train, Y)
    # X_red_val = FastICA(n_components=K, whiten=True).fit_transform(X_val, Y)

    # Normalization (fit the scaler on train only, then reuse it on validation)
    scalar = StandardScaler()
    X_train_n = scalar.fit_transform(X_train)
    X_val_n = scalar.transform(X_val)

    # MLP creation + training
    mlp = MLPClassifier(activation="relu",
                        verbose=False,
                        solver="adam",
                        max_iter=150,
                        hidden_layer_sizes=(3, 200),
                        early_stopping=True,
                        tol=1e-12,
                        validation_fraction=0.2,
                        alpha=1e-4,
                        learning_rate_init=0.1,
                        beta_1=0.3,
                        warm_start=True,
                        random_state=RANDOM_STATE)

    oversample = SMOTE()  # SMOTE over-samples the minority class
    classifier = make_pipeline(oversample, mlp)
    # Prediction and evaluation
    classifier.fit(X_train_n, Y_train)
    prediction = classifier.predict(X_val_n)
    print("\n", classification_report(prediction, Y_val))
    print("N ones:", len(np.where(prediction == 1)[0]) / len(prediction))
    return classifier, scalar
Example #33
def set_pipe(clf, features, filename = 'Untitled'):
    piped_clf = make_pipeline(
        ColumnSelector(cols=features),
        SMOTE(),
        clf,
    )
    piped_clf.fit(X_train,y_train)
    y_pred = piped_clf.predict(X_test)
    con_mat = confusion_matrix(y_test, y_pred)
    avg_f1 = (model_selection.cross_val_score(piped_clf, X_train, y_train, cv = 5, scoring = 'f1')).mean()
    
    print("Cross Val acc score:         ", (model_selection.cross_val_score(piped_clf, X_train, y_train, cv = 5,)).mean())
    print("Cross Val f1  score:         ", avg_f1)
    print()
    print("Overall Acc score:           ", accuracy_score(y_true=y_test, y_pred=y_pred))
    print("Recall score (Tru Pos Rate): ", recall_score(y_true=y_test, y_pred=y_pred))
    print("Precision score:             ", precision_score(y_true=y_test, y_pred=y_pred))
    print("Neg Predictive Val:          ", con_mat[0][0] / (con_mat[0][1] + con_mat[0][0]))
    print("Tru Neg Rate(Specifi):       ", con_mat[0][0] / (con_mat[1][0] + con_mat[0][0]))
    print("F1 score:                    ", f1_score(y_true=y_test, y_pred=y_pred))
    print("Auc score:                   ", roc_auc_score(y_true=y_test, y_score=y_pred))
    print(con_mat)
    print()
    (pd.DataFrame(y_pred)).to_csv(filename + 'y_pred_filt_avg.csv')
    return piped_clf, avg_f1
Example #34
def svmSimpleVal(X, Y):
    X_train, X_val, Y_train, Y_val = train_test_split(X, Y, train_size=0.8)
    #ica = FastICA(n_components=K, whiten=True).fit(X_train, Y)
    #X_red_train = ica.transform(X_train)
    #X_red_val = ica.transform(X_val)

    # Normalization (fit the scaler on train only, then reuse it on validation)
    scalar = StandardScaler()
    X_train_n = scalar.fit_transform(X_train)
    X_val_n = scalar.transform(X_val)

    oversample = SMOTE()  # SMOTE over-samples the minority class
    svm = SVC(
        verbose=True,
        kernel="poly",
        decision_function_shape="ovr",
        random_state=RANDOM_STATE,
        C=0.03,
        degree=3,
    )  #class_weight="balanced"

    classifier = make_pipeline(oversample, svm)

    # Prediction and evaluation
    classifier.fit(X_train_n, Y_train)
    prediction = classifier.predict(X_val_n)
    print("\n", classification_report(prediction, Y_val))
    print("N ones:", len(np.where(prediction == 1)[0]) / len(prediction))
    return classifier, scalar
Example #35
def one_cv_for_one_algo(algorithm, X_train, y_train):
    clf = make_pipeline(SMOTE(random_state=0), algorithm)
    clf.fit(X_train, y_train)
    del X_train, y_train

    # For testing; `cv_idx`, `make_path` and `read_memmap` are assumed to be
    # defined at module level in the original script.
    # X_test
    fn = 'mem_file_X_test_' + str(cv_idx) + '.dat'
    mem_file_name = make_path(fn, directory='')
    X_test = read_memmap(mem_file_name)
    print('X_test loaded')

    # y_test
    fn = 'mem_file_y_test_' + str(cv_idx) + '.dat'
    mem_file_name = make_path(fn, directory='')
    y_test = read_memmap(mem_file_name)
    print('y_test loaded')

    y_pred = clf.predict(X_test)
    del X_test

    s1 = accuracy_score(y_true=y_test, y_pred=y_pred)
    s2 = precision_score(y_true=y_test, y_pred=y_pred)
    s3 = recall_score(y_true=y_test, y_pred=y_pred)
    s4 = f1_score(y_true=y_test, y_pred=y_pred)
    del y_test

    print('accuracy:', s1)
    print('precision:', s2)
    print('recall:', s3)
    print('f1:', s4)
    return [s1, s2, s3, s4]
def test_classes_property():
    iris = load_iris()
    X = iris.data
    y = iris.target

    reg = make_pipeline(SelectKBest(k=1), LinearRegression())
    reg.fit(X, y)
    with raises(AttributeError):
        getattr(reg, "classes_")

    clf = make_pipeline(SelectKBest(k=1),
                        LogisticRegression(solver='lbfgs', multi_class='auto',
                                           random_state=0))
    with raises(AttributeError):
        getattr(clf, "classes_")
    clf.fit(X, y)
    assert_array_equal(clf.classes_, np.unique(y))
def test_bagging_with_pipeline():
    X, y = make_imbalance(iris.data, iris.target, ratio={0: 20, 1: 25, 2: 50},
                          random_state=0)
    estimator = BalancedBaggingClassifier(
        make_pipeline(SelectKBest(k=1),
                      DecisionTreeClassifier()),
        max_features=2)
    estimator.fit(X, y).predict(X)
def test_bagging_with_pipeline():
    X, y = make_imbalance(iris.data, iris.target,
                          sampling_strategy={0: 20, 1: 25, 2: 50},
                          random_state=0)
    estimator = EasyEnsembleClassifier(
        n_estimators=2,
        base_estimator=make_pipeline(SelectKBest(k=1), AdaBoostClassifier()))
    estimator.fit(X, y).predict(X)
def test_easy_ensemble_classifier_single_estimator():
    X, y = make_imbalance(iris.data, iris.target,
                          sampling_strategy={0: 20, 1: 25, 2: 50},
                          random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    clf1 = EasyEnsembleClassifier(n_estimators=1, random_state=0).fit(
        X_train, y_train)
    clf2 = make_pipeline(RandomUnderSampler(random_state=0),
                         AdaBoostClassifier(random_state=0)).fit(
                             X_train, y_train)

    assert_array_equal(clf1.predict(X_test), clf2.predict(X_test))
def test_resampler_last_stage_passthrough():

    X, y = make_classification(
        n_classes=2,
        class_sep=2,
        weights=[0.1, 0.9],
        n_informative=3,
        n_redundant=1,
        flip_y=0,
        n_features=20,
        n_clusters_per_class=1,
        n_samples=50000,
        random_state=0)

    rus = RandomUnderSampler(random_state=42)
    pipe = make_pipeline(rus, None)
    pipe.fit_resample(X, y)
def test_pipeline_none_sampler_sample():
    # Test pipeline using None step and a sampler
    X, y = make_classification(
        n_classes=2,
        class_sep=2,
        weights=[0.1, 0.9],
        n_informative=3,
        n_redundant=1,
        flip_y=0,
        n_features=20,
        n_clusters_per_class=1,
        n_samples=5000,
        random_state=0)

    rus = RandomUnderSampler(random_state=0)
    pipe = make_pipeline(None, rus)
    pipe.fit_resample(X, y)
def test_single_estimator():
    # Check singleton ensembles.
    X, y = make_imbalance(iris.data, iris.target, ratio={0: 20, 1: 25, 2: 50},
                          random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        random_state=0)

    clf1 = BalancedBaggingClassifier(
        base_estimator=KNeighborsClassifier(),
        n_estimators=1,
        bootstrap=False,
        bootstrap_features=False,
        random_state=0).fit(X_train, y_train)

    clf2 = make_pipeline(RandomUnderSampler(
        random_state=clf1.estimators_[0].steps[0][1].random_state),
                         KNeighborsClassifier()).fit(X_train, y_train)

    assert_array_equal(clf1.predict(X_test), clf2.predict(X_test))
def test_pipeline_none_classifier():
    # Test pipeline using None as preprocessing step and a classifier
    X, y = make_classification(
        n_classes=2,
        class_sep=2,
        weights=[0.1, 0.9],
        n_informative=3,
        n_redundant=1,
        flip_y=0,
        n_features=20,
        n_clusters_per_class=1,
        n_samples=5000,
        random_state=0)
    clf = LogisticRegression(random_state=0)
    pipe = make_pipeline(None, clf)
    pipe.fit(X, y)
    pipe.predict(X)
    pipe.predict_proba(X)
    pipe.decision_function(X)
    pipe.score(X, y)
def test_pipeline_none_transformer():
    # Test pipeline using None and a transformer that implements transform and
    # inverse_transform
    X, y = make_classification(
        n_classes=2,
        class_sep=2,
        weights=[0.1, 0.9],
        n_informative=3,
        n_redundant=1,
        flip_y=0,
        n_features=20,
        n_clusters_per_class=1,
        n_samples=5000,
        random_state=0)

    pca = PCA(whiten=True)
    pipe = make_pipeline(None, pca)
    pipe.fit(X, y)
    X_trans = pipe.transform(X)
    X_inversed = pipe.inverse_transform(X_trans)
    assert_array_almost_equal(X, X_inversed)
from imblearn import over_sampling as os
from imblearn import pipeline as pl
from imblearn.metrics import classification_report_imbalanced

print(__doc__)

RANDOM_STATE = 42

# Generate a dataset
X, y = datasets.make_classification(n_classes=2, class_sep=2,
                                    weights=[0.1, 0.9], n_informative=10,
                                    n_redundant=1, flip_y=0, n_features=20,
                                    n_clusters_per_class=4, n_samples=5000,
                                    random_state=RANDOM_STATE)

pipeline = pl.make_pipeline(os.SMOTE(random_state=RANDOM_STATE),
                            LinearSVC(random_state=RANDOM_STATE))

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    random_state=RANDOM_STATE)

# Train the classifier with balancing
pipeline.fit(X_train, y_train)

# Test the classifier and get the prediction
y_pred_bal = pipeline.predict(X_test)

# Show the classification report
print(classification_report_imbalanced(y_test, y_pred_bal))
def outlier_rejection(X, y):
    model = IsolationForest(max_samples=100,
                            contamination=0.4,
                            random_state=rng,
                            behaviour='new')
    model.fit(X)
    y_pred = model.predict(X)
    return X[y_pred == 1], y[y_pred == 1]


reject_sampler = FunctionSampler(func=outlier_rejection)
X_inliers, y_inliers = reject_sampler.fit_resample(X_train, y_train)
plot_scatter(X_inliers, y_inliers, 'Training data without outliers')

##############################################################################
# Integrate it within a pipeline
##############################################################################

##############################################################################
# By eliminating outliers before training, the classifier will be less
# affected during prediction.

pipe = make_pipeline(FunctionSampler(func=outlier_rejection),
                     LogisticRegression(solver='lbfgs', multi_class='auto',
                                        random_state=rng))
y_pred = pipe.fit(X_train, y_train).predict(X_test)
print(classification_report(y_test, y_pred))

clf = LogisticRegression(solver='lbfgs', multi_class='auto', random_state=rng)
y_pred = clf.fit(X_train, y_train).predict(X_test)
print(classification_report(y_test, y_pred))

plt.show()
print(__doc__)

RANDOM_STATE = 42

scorer = metrics.make_scorer(metrics.cohen_kappa_score)

# Generate the dataset
X, y = datasets.make_classification(n_classes=2, class_sep=2,
                                    weights=[0.1, 0.9], n_informative=10,
                                    n_redundant=1, flip_y=0, n_features=20,
                                    n_clusters_per_class=4, n_samples=5000,
                                    random_state=RANDOM_STATE)
smote = os.SMOTE(random_state=RANDOM_STATE)
cart = tree.DecisionTreeClassifier(random_state=RANDOM_STATE)
pipeline = pl.make_pipeline(smote, cart)

param_range = range(1, 11)
train_scores, test_scores = ms.validation_curve(
    pipeline, X, y, param_name="smote__k_neighbors", param_range=param_range,
    cv=3, scoring=scorer, n_jobs=1)
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)

fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)

plt.plot(param_range, test_scores_mean, label='SMOTE')
ax.fill_between(param_range, test_scores_mean + test_scores_std,
                test_scores_mean - test_scores_std, alpha=0.2)
Example #48
def test_X1d_inverse_transform():
    transformer = TransfT()
    pipeline = make_pipeline(transformer)
    X = np.ones(10)
    msg = "1d X will not be reshaped in pipeline.inverse_transform"
    assert_warns_message(FutureWarning, msg, pipeline.inverse_transform, X)
from imblearn.under_sampling import (EditedNearestNeighbours,
                                     RepeatedEditedNearestNeighbours)

print(__doc__)

# Generate the dataset
X, y = make_classification(n_classes=2, class_sep=1.25, weights=[0.3, 0.7],
                           n_informative=3, n_redundant=1, flip_y=0,
                           n_features=5, n_clusters_per_class=1,
                           n_samples=5000, random_state=10)

# Instantiate a PCA object for the sake of easy visualisation
pca = PCA(n_components=2)

# Create the samplers
enn = EditedNearestNeighbours()
renn = RepeatedEditedNearestNeighbours()

# Create the classifier
knn = KNN(1)

# Make the splits
X_train, X_test, y_train, y_test = tts(X, y, random_state=42)

# Add one transformer and two samplers to the pipeline object
pipeline = make_pipeline(pca, enn, renn, knn)

pipeline.fit(X_train, y_train)
y_hat = pipeline.predict(X_test)

print(classification_report(y_test, y_hat))
y = data.target[idxs]
y[y == majority_person] = 0
y[y == minority_person] = 1

classifier = ['3NN', neighbors.KNeighborsClassifier(3)]

samplers = [
    ['Standard', DummySampler()],
    ['ADASYN', ADASYN(random_state=RANDOM_STATE)],
    ['ROS', RandomOverSampler(random_state=RANDOM_STATE)],
    ['SMOTE', SMOTE(random_state=RANDOM_STATE)],
]

pipelines = [
    ['{}-{}'.format(sampler[0], classifier[0]),
     make_pipeline(sampler[1], classifier[1])]
    for sampler in samplers
]

fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)

for name, pipeline in pipelines:
    mean_tpr = 0.0
    mean_fpr = np.linspace(0, 1, 100)
    for train, test in cv.split(X, y):
        probas_ = pipeline.fit(X[train], y[train]).predict_proba(X[test])
        fpr, tpr, thresholds = roc_curve(y[test], probas_[:, 1])
        mean_tpr += interp(mean_fpr, fpr, tpr)
        mean_tpr[0] = 0.0
        roc_auc = auc(fpr, tpr)
X_test = np.vstack([moons, blobs])
y_test = np.hstack([np.ones(moons.shape[0], dtype=np.int8),
                    np.zeros(blobs.shape[0], dtype=np.int8)])

plot_scatter(X_test, y_test, 'Testing dataset')


def outlier_rejection(X, y):
    model = IsolationForest(max_samples=100,
                            contamination=0.4,
                            random_state=rng)
    model.fit(X)
    y_pred = model.predict(X)
    return X[y_pred == 1], y[y_pred == 1]


reject_sampler = FunctionSampler(func=outlier_rejection)
X_inliers, y_inliers = reject_sampler.fit_sample(X_train, y_train)
plot_scatter(X_inliers, y_inliers, 'Training data without outliers')

pipe = make_pipeline(FunctionSampler(func=outlier_rejection),
                     LogisticRegression(random_state=rng))
y_pred = pipe.fit(X_train, y_train).predict(X_test)
print(classification_report(y_test, y_pred))

clf = LogisticRegression(random_state=rng)
y_pred = clf.fit(X_train, y_train).predict(X_test)
print(classification_report(y_test, y_pred))

plt.show()
# does not have any knowledge regarding the underlying distribution. Therefore,
# some noisy samples can be generated, e.g. when the different classes cannot
# be well separated. Hence, it can be beneficial to apply an under-sampling
# algorithm to clean the noisy samples. Two methods are usually used in the
# literature: (i) Tomek's link and (ii) edited nearest neighbours cleaning
# methods. Imbalanced-learn provides two ready-to-use samplers ``SMOTETomek``
# and ``SMOTEENN``. In general, ``SMOTEENN`` cleans more noisy data than
# ``SMOTETomek``.


fig, ((ax1, ax2), (ax3, ax4), (ax5, ax6)) = plt.subplots(3, 2,
                                                         figsize=(15, 25))
X, y = create_dataset(n_samples=1000, weights=(0.1, 0.2, 0.7))

ax_arr = ((ax1, ax2), (ax3, ax4), (ax5, ax6))
for ax, sampler in zip(ax_arr, (
        SMOTE(random_state=0),
        SMOTEENN(random_state=0),
        SMOTETomek(random_state=0))):
    clf = make_pipeline(sampler, LinearSVC())
    clf.fit(X, y)
    plot_decision_function(X, y, clf, ax[0])
    ax[0].set_title('Decision function for {}'.format(
        sampler.__class__.__name__))
    plot_resampling(X, y, sampler, ax[1])
    ax[1].set_title('Resampling using {}'.format(
        sampler.__class__.__name__))
fig.tight_layout()

plt.show()
###############################################################################
# Random over-sampling to balance the data set
###############################################################################

###############################################################################
# Random over-sampling can be used to repeat some samples and balance the
# number of samples across classes. It can be seen that even this trivial
# approach makes the decision boundary less biased toward the majority
# class.

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 7))
X, y = create_dataset(n_samples=10000, weights=(0.01, 0.05, 0.94))
clf = LinearSVC().fit(X, y)
plot_decision_function(X, y, clf, ax1)
ax1.set_title('Linear SVC with y={}'.format(Counter(y)))
pipe = make_pipeline(RandomOverSampler(random_state=0), LinearSVC())
pipe.fit(X, y)
plot_decision_function(X, y, pipe, ax2)
ax2.set_title('Decision function for RandomOverSampler')
fig.tight_layout()

###############################################################################
# More advanced over-sampling using ADASYN and SMOTE
###############################################################################

###############################################################################
# Instead of repeating the same samples when over-sampling, we can use some
# specific heuristic instead. ADASYN and SMOTE can be used in this case.


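# A minimal sketch of the ADASYN/SMOTE comparison described above, reusing
# the create_dataset and plot_decision_function helpers from this example
# (assumed to be in scope):
from imblearn.over_sampling import ADASYN, SMOTE

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 7))
X, y = create_dataset(n_samples=10000, weights=(0.01, 0.05, 0.94))
for ax, sampler in zip((ax1, ax2), (SMOTE(random_state=0),
                                    ADASYN(random_state=0))):
    clf = make_pipeline(sampler, LinearSVC())
    clf.fit(X, y)
    plot_decision_function(X, y, clf, ax)
    ax.set_title('Decision function for {}'.format(
        sampler.__class__.__name__))
fig.tight_layout()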
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split

from imblearn.datasets import make_imbalance
from imblearn.under_sampling import NearMiss
from imblearn.pipeline import make_pipeline
from imblearn.metrics import classification_report_imbalanced

print(__doc__)

RANDOM_STATE = 42

# Load the iris dataset and make an imbalanced version of it
iris = load_iris()
X, y = make_imbalance(iris.data, iris.target, ratio={0: 25, 1: 50, 2: 50},
                      random_state=0)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=RANDOM_STATE)

print('Training target statistics: {}'.format(Counter(y_train)))
print('Testing target statistics: {}'.format(Counter(y_test)))

# Create a pipeline
pipeline = make_pipeline(NearMiss(version=2, random_state=RANDOM_STATE),
                         LinearSVC(random_state=RANDOM_STATE))
pipeline.fit(X_train, y_train)

# Classify and report the results
print(classification_report_imbalanced(y_test, pipeline.predict(X_test)))