def test_renn_fit_sample_mode():
    """Check RENN resampling when ``kind_sel='mode'`` is used.

    A 4-neighbour ``NearestNeighbors`` estimator is handed to the sampler and
    the resampled output is compared element-wise with hard-coded references.
    """
    knn = NearestNeighbors(n_neighbors=4)
    under_sampler = RepeatedEditedNearestNeighbours(
        n_neighbors=knn, random_state=RND_SEED, kind_sel='mode')
    X_out, y_out = under_sampler.fit_sample(X, Y)

    # Reference samples expected to be returned, in order.
    expected_X = np.array([
        [-0.53171468, -0.53735182], [-0.88864036, -0.33782387],
        [-0.46226554, -0.50481004], [-0.34474418, 0.21969797],
        [-0.12840393, 0.66446571], [1.02956816, 0.36061601],
        [1.12202806, 0.33811558], [-0.35946678, 0.72510189],
        [2.94290565, -0.13986434], [-1.10146139, 0.91782682],
        [0.73489726, 0.43915195], [-0.28479268, 0.70459548],
        [1.84864913, 0.14729596], [0.50307437, 0.498805],
        [0.84929742, 0.41042894], [0.62649535, 0.46600596],
        [1.67314371, 0.19231498], [0.98382284, 0.37184502],
        [0.69804044, 0.44810796], [1.32319756, -0.13181616],
        [0.04296502, -0.37981873], [0.28294738, -1.00125525],
        [0.34218094, -0.58781961], [0.2096964, -0.61814058],
        [1.59068979, -0.96622933], [0.73418199, -0.02222847],
        [0.79270821, -0.41386668], [1.16606871, -0.25641059],
        [1.0304995, -0.16955962], [0.48921682, -1.38504507],
        [-0.03918551, -0.68540745], [0.24991051, -1.00864997],
        [0.80541964, -0.34465185], [0.1732627, -1.61323172],
    ])
    # 4 samples of class 0, 15 of class 1 and 15 of class 2.
    expected_y = np.array([0] * 4 + [1] * 15 + [2] * 15)

    assert_array_equal(X_out, expected_X)
    assert_array_equal(y_out, expected_y)
コード例 #2
0
ファイル: sampler.py プロジェクト: jooglyp/ml101
    def rnn_undersampling(
            self, x: pandas.DataFrame, y: numpy.ndarray,
            neighbors: int) -> typing.Tuple[numpy.ndarray, numpy.ndarray]:
        """
        Undersample the data with Repeated Edited Nearest Neighbours.

        Args:
            x: X training covariates; passed through ``self.check_id`` first.
            y: y training outcomes aligned with ``x``.
            neighbors: forwarded to the sampler as ``n_neighbors``.

        Returns: the resampled covariates and the resampled outcomes. The
            indices reported by the sampler are not returned.

        """
        covariates = self.check_id(x)
        sampler_options = dict(
            random_state=82,
            n_neighbors=neighbors,
            return_indices=True,
            kind_sel="mode",
            max_iter=400,
            ratio="majority",
        )
        rnn_undersampler = RepeatedEditedNearestNeighbours(**sampler_options)

        # The sampler is fed deep copies of the inputs.
        outcome = rnn_undersampler.fit_sample(
            copy.deepcopy(covariates), copy.deepcopy(y))
        X_resampled, y_resampled, _kept_idx = outcome

        x_summary = (
            "RNN undersampling yielded {} number of X_resampled observations".
            format(len(X_resampled)))
        LOGGER.info(X_resampled)
        LOGGER.info(x_summary)

        y_summary = (
            "RNN undersampling yielded {} number of y_resampled observations".
            format(len(y_resampled)))
        LOGGER.info(y_resampled)
        LOGGER.info(y_summary)

        return X_resampled, y_resampled
def test_renn_fit_sample_with_indices():
    """Check RENN resampling when the kept indices are requested as well."""
    under_sampler = RepeatedEditedNearestNeighbours(
        return_indices=True, random_state=RND_SEED)
    X_out, y_out, kept_idx = under_sampler.fit_sample(X, Y)

    # Reference samples expected to be returned, in order.
    expected_X = np.array([
        [-0.53171468, -0.53735182], [-0.88864036, -0.33782387],
        [-0.46226554, -0.50481004], [-0.34474418, 0.21969797],
        [1.02956816, 0.36061601], [1.12202806, 0.33811558],
        [0.73489726, 0.43915195], [0.50307437, 0.498805],
        [0.84929742, 0.41042894], [0.62649535, 0.46600596],
        [0.98382284, 0.37184502], [0.69804044, 0.44810796],
        [0.04296502, -0.37981873], [0.28294738, -1.00125525],
        [0.34218094, -0.58781961], [0.2096964, -0.61814058],
        [1.59068979, -0.96622933], [0.73418199, -0.02222847],
        [0.79270821, -0.41386668], [1.16606871, -0.25641059],
        [1.0304995, -0.16955962], [0.48921682, -1.38504507],
        [-0.03918551, -0.68540745], [0.24991051, -1.00864997],
        [0.80541964, -0.34465185], [0.1732627, -1.61323172],
    ])
    # 4 samples of class 0, 8 of class 1 and 14 of class 2.
    expected_y = np.array([0] * 4 + [1] * 8 + [2] * 14)
    expected_idx = np.array([
        6, 13, 32, 39, 4, 5, 16, 22, 23, 24, 30, 37, 2, 11, 12, 17, 20, 21,
        25, 26, 28, 31, 33, 34, 35, 36,
    ])

    assert_array_equal(X_out, expected_X)
    assert_array_equal(y_out, expected_y)
    assert_array_equal(kept_idx, expected_idx)
コード例 #4
0
def test_renn_fit_sample():
    """Check the output of ``fit_sample`` with default RENN settings."""
    under_sampler = RepeatedEditedNearestNeighbours(random_state=RND_SEED)
    X_out, y_out = under_sampler.fit_sample(X, Y)

    # Reference samples expected to be returned, in order.
    expected_X = np.array([
        [-0.53171468, -0.53735182], [-0.88864036, -0.33782387],
        [-0.46226554, -0.50481004], [-0.34474418, 0.21969797],
        [1.02956816, 0.36061601], [1.12202806, 0.33811558],
        [0.73489726, 0.43915195], [0.50307437, 0.498805],
        [0.84929742, 0.41042894], [0.62649535, 0.46600596],
        [0.98382284, 0.37184502], [0.69804044, 0.44810796],
        [0.04296502, -0.37981873], [0.28294738, -1.00125525],
        [0.34218094, -0.58781961], [0.2096964, -0.61814058],
        [1.59068979, -0.96622933], [0.73418199, -0.02222847],
        [0.79270821, -0.41386668], [1.16606871, -0.25641059],
        [1.0304995, -0.16955962], [0.48921682, -1.38504507],
        [-0.03918551, -0.68540745], [0.24991051, -1.00864997],
        [0.80541964, -0.34465185], [0.1732627, -1.61323172],
    ])
    # 4 samples of class 0, 8 of class 1 and 14 of class 2.
    expected_y = np.array([0] * 4 + [1] * 8 + [2] * 14)

    assert_array_equal(X_out, expected_X)
    assert_array_equal(y_out, expected_y)
def test_renn_fit_sample_mode():
    """Check ``fit_sample`` when the mode of the neighbours drives selection."""
    # Build the sampler around an explicit 4-neighbour estimator.
    knn = NearestNeighbors(n_neighbors=4)
    under_sampler = RepeatedEditedNearestNeighbours(
        n_neighbors=knn, random_state=RND_SEED, kind_sel='mode')
    X_out, y_out = under_sampler.fit_sample(X, Y)

    # Reference samples expected to be returned, in order.
    expected_X = np.array([
        [-0.53171468, -0.53735182], [-0.88864036, -0.33782387],
        [-0.46226554, -0.50481004], [-0.34474418, 0.21969797],
        [-0.12840393, 0.66446571], [1.02956816, 0.36061601],
        [1.12202806, 0.33811558], [-0.35946678, 0.72510189],
        [2.94290565, -0.13986434], [-1.10146139, 0.91782682],
        [0.73489726, 0.43915195], [-0.28479268, 0.70459548],
        [1.84864913, 0.14729596], [0.50307437, 0.498805],
        [0.84929742, 0.41042894], [0.62649535, 0.46600596],
        [1.67314371, 0.19231498], [0.98382284, 0.37184502],
        [0.69804044, 0.44810796], [1.32319756, -0.13181616],
        [0.04296502, -0.37981873], [0.28294738, -1.00125525],
        [0.34218094, -0.58781961], [0.2096964, -0.61814058],
        [1.59068979, -0.96622933], [0.73418199, -0.02222847],
        [0.79270821, -0.41386668], [1.16606871, -0.25641059],
        [1.0304995, -0.16955962], [0.48921682, -1.38504507],
        [-0.03918551, -0.68540745], [0.24991051, -1.00864997],
        [0.80541964, -0.34465185], [0.1732627, -1.61323172],
    ])
    # 4 samples of class 0, 15 of class 1 and 15 of class 2.
    expected_y = np.array([0] * 4 + [1] * 15 + [2] * 15)

    assert_array_equal(X_out, expected_X)
    assert_array_equal(y_out, expected_y)
def test_renn_fit_sample_with_indices():
    """Check ``fit_sample`` when the indices of kept samples are returned."""
    under_sampler = RepeatedEditedNearestNeighbours(
        return_indices=True, random_state=RND_SEED)
    X_out, y_out, kept_idx = under_sampler.fit_sample(X, Y)

    # Reference samples expected to be returned, in order.
    expected_X = np.array([
        [-0.53171468, -0.53735182], [-0.88864036, -0.33782387],
        [-0.46226554, -0.50481004], [-0.34474418, 0.21969797],
        [1.02956816, 0.36061601], [1.12202806, 0.33811558],
        [0.73489726, 0.43915195], [0.50307437, 0.498805],
        [0.84929742, 0.41042894], [0.62649535, 0.46600596],
        [0.98382284, 0.37184502], [0.69804044, 0.44810796],
        [0.04296502, -0.37981873], [0.28294738, -1.00125525],
        [0.34218094, -0.58781961], [0.2096964, -0.61814058],
        [1.59068979, -0.96622933], [0.73418199, -0.02222847],
        [0.79270821, -0.41386668], [1.16606871, -0.25641059],
        [1.0304995, -0.16955962], [0.48921682, -1.38504507],
        [-0.03918551, -0.68540745], [0.24991051, -1.00864997],
        [0.80541964, -0.34465185], [0.1732627, -1.61323172],
    ])
    # 4 samples of class 0, 8 of class 1 and 14 of class 2.
    expected_y = np.array([0] * 4 + [1] * 8 + [2] * 14)
    expected_idx = np.array([
        6, 13, 32, 39, 4, 5, 16, 22, 23, 24, 30, 37, 2, 11, 12, 17, 20, 21,
        25, 26, 28, 31, 33, 34, 35, 36,
    ])

    assert_array_equal(X_out, expected_X)
    assert_array_equal(y_out, expected_y)
    assert_array_equal(kept_idx, expected_idx)
def test_renn_fit_resample():
    """Check the output of ``fit_resample`` with a default-configured RENN."""
    X_out, y_out = RepeatedEditedNearestNeighbours().fit_resample(X, Y)

    # Reference samples expected to be returned, in order.
    expected_X = np.array([
        [-0.53171468, -0.53735182], [-0.88864036, -0.33782387],
        [-0.46226554, -0.50481004], [-0.34474418, 0.21969797],
        [1.02956816, 0.36061601], [1.12202806, 0.33811558],
        [0.73489726, 0.43915195], [0.50307437, 0.498805],
        [0.84929742, 0.41042894], [0.62649535, 0.46600596],
        [0.98382284, 0.37184502], [0.69804044, 0.44810796],
        [0.04296502, -0.37981873], [0.28294738, -1.00125525],
        [0.34218094, -0.58781961], [0.2096964, -0.61814058],
        [1.59068979, -0.96622933], [0.73418199, -0.02222847],
        [0.79270821, -0.41386668], [1.16606871, -0.25641059],
        [1.0304995, -0.16955962], [0.48921682, -1.38504507],
        [-0.03918551, -0.68540745], [0.24991051, -1.00864997],
        [0.80541964, -0.34465185], [0.1732627, -1.61323172],
    ])
    # 4 samples of class 0, 8 of class 1 and 14 of class 2.
    expected_y = np.array([0] * 4 + [1] * 8 + [2] * 14)

    assert_array_equal(X_out, expected_X)
    assert_array_equal(y_out, expected_y)
コード例 #8
0
ファイル: decision_tree.py プロジェクト: nikonovd/wikionto
def train_decisiontree_with(configurationname,
                            train_data,
                            k,
                            score_function,
                            undersam=False,
                            oversam=False,
                            export=False,
                            **kwargs):
    """Select the k best features, optionally resample, and fit a decision tree.

    Args:
        configurationname: label used in progress output and in the name of
            the exported ``.dot`` file.
        train_data: 3-tuple ``(X_train, y_train, id_to_a_train)``; the third
            element is not used here.
        k: number of features kept by ``SelectKBest``; must be positive.
        score_function: scoring function handed to ``SelectKBest``.
        undersam: if set (and ``oversam`` is not), apply
            ``RepeatedEditedNearestNeighbours``.
        oversam: if set (and ``undersam`` is not), apply ``SMOTE``. When both
            flags are set, ``SMOTEENN`` is applied instead.
        export: if set, write the fitted tree with ``export_graphviz`` and
            call ``transform`` on the selected feature indices.
        **kwargs: only ``max_depth`` is read (default ``None``).

    Returns:
        The fitted selector and the fitted ``DecisionTreeClassifier``.
    """
    assert k > 0
    print("Training with configuration " + configurationname)
    X_train, y_train, _id_to_a_train = train_data

    dtc = DecisionTreeClassifier(criterion="entropy",
                                 random_state=0,
                                 max_depth=kwargs.get("max_depth"))

    print("Feature Selection")
    # selector = SelectFpr(score_function)
    selector = SelectKBest(score_function, k=k)
    selector = selector.fit(X_train, y_train)

    X_train = selector.transform(X_train)

    fitted_ids = list(selector.get_support(indices=True))

    print("Apply Resampling")
    print(Counter(y_train))
    # The three resampling modes are mutually exclusive.
    if undersam and oversam:
        smote_enn = SMOTEENN(random_state=0)
        X_train, y_train = smote_enn.fit_resample(X_train, y_train)
    elif undersam:
        renn = RepeatedEditedNearestNeighbours()
        X_train, y_train = renn.fit_resample(X_train, y_train)
    elif oversam:
        # feature_indices_array = list(range(len(f_to_id)))
        # smote_nc = SMOTENC(categorical_features=feature_indices_array, random_state=0)
        # X_train, y_train = smote_nc.fit_resample(X_train, y_train)
        sm = SMOTE(random_state=42)
        X_train, y_train = sm.fit_resample(X_train, y_train)
    print(Counter(y_train))

    print("Train Classifier")
    dtc = dtc.fit(X_train, y_train, check_input=True)

    if export:
        print("Exporting tree to graph...")
        export_graphviz(dtc,
                        out_file=DATAP + "/temp/trees/sltree_" +
                        configurationname + ".dot",
                        filled=True)
        transform(fitted_ids, configurationname)

    # Accuracy is measured on the (possibly resampled) training data itself.
    print("Self Accuracy: " + str(dtc.score(X_train, y_train)))

    return selector, dtc
def test_renn_sample_wrong_X():
    """Sampling data that differs from the fitted data must raise
    ``RuntimeError``."""
    fitted = RepeatedEditedNearestNeighbours(random_state=RND_SEED)
    fitted.fit(X, Y)

    # Data unrelated to what the sampler was fitted on.
    other_X = np.random.random((100, 40))
    other_y = np.array([0] * 50 + [1] * 50)
    assert_raises(RuntimeError, fitted.sample, other_X, other_y)
コード例 #10
0
def test_renn_sample_wrong_X():
    """Verify that ``sample`` raises ``RuntimeError`` when it is given an X
    different from the one used in ``fit``."""
    fitted = RepeatedEditedNearestNeighbours(random_state=RND_SEED)
    fitted.fit(X, Y)

    # Random features with a two-class target, unrelated to the fitted data.
    mismatched_X = np.random.random((100, 40))
    mismatched_y = np.array([0] * 50 + [1] * 50)
    assert_raises(RuntimeError, fitted.sample, mismatched_X, mismatched_y)
コード例 #11
0
def classification_results(train,test):
    #Derivation of NBDriver using training data
    """
    Train an SVC + KDE soft-voting ensemble on undersampled training data and
    report test-set metrics for several top-N feature subsets.

    Arguments:
        train = feature matrix derived from Brown et al.; must contain a
            'Label' column.
        test= feature matrix derived from Martelotto et al.; must contain a
            'Label' column.
    Returns:
        best_model = ensemble fitted in the LAST iteration of the feature-count
            loop (top 70 features)
        X_red= resampled training dataframe restricted to the columns used in
            that last iteration
        scores= list of ROC-AUC values, one per candidate threshold, computed
            in that last iteration
    """
    # NOTE(review): these lists are never appended to; `sen` and `spe` are
    # rebound to scalars inside the loop below and the rest stay unused.
    sen=[];spe=[];acc=[];auc=[];c=[];m=[];s=[]
    train_x=train.drop('Label',axis=1);train_y=train['Label'];    
    test_x=test.drop('Label',axis=1);test_y=test['Label'];
    #Undersampling with RepeatedEditedNearestNeighbours (not random undersampling)
    samp=RepeatedEditedNearestNeighbours(random_state=42)
    X_samp,y_samp=samp.fit_resample(train_x,train_y)
    X_samp = pd.DataFrame(X_samp, columns = train_x.columns)
    #Experimenting with different numbers of top features derived from the tree-based feature extraction method
    top_n_feats=[30,40,50,60,70]
    # presumably returns a dataframe whose columns are ordered by importance - verify in feature_reduction_using_trees
    X_r=feature_reduction_using_trees(X_samp,y_samp) 
    cols=X_r.columns
    for n in top_n_feats:
        print("For top: ",n," features")
        X_red=X_r[cols[0:n]]
        sv=SVC(kernel="linear",probability=True,C=0.01,random_state=42) #chosen from 5foldCV based grid search
        kde=KDEClassifier(bandwidth=1.27) #chosen from 5foldCV based grid search
        best_model = VotingClassifier(estimators=[('sv', sv), ('kde', kde)],
                        voting='soft',weights=[4, 7]) #best combination of weights selected by a brute force search (possible weights 1-10) using a cross-validation approach on the training data  
        
        best_model.fit(X_red,y_samp)
        y_probs = best_model.predict_proba(test_x[X_red.columns])[:,1]
        # Sweep 1000 thresholds and keep the one with the highest ROC-AUC on
        # the TEST labels (to_labels presumably binarises y_probs at t - confirm).
        thresholds = arange(0, 1, 0.001)
        scores = [roc_auc_score(test_y, to_labels(y_probs, t)) for t in thresholds]
        ix= argmax(scores)
        # Predictions are encoded as 2 (above threshold) or 1 (otherwise).
        y_test_predictions = np.where(best_model.predict_proba(test_x[X_red.columns])[:,1] > thresholds[ix], 2, 1)
        print("Thresh: ",thresholds[ix])
        # NOTE(review): sensi/speci are computed but never read; sen/spe below
        # are recomputed from the confusion matrix.
        sensi= sensitivity_score(test_y, y_test_predictions, pos_label=2)
        speci=specificity_score(test_y,y_test_predictions,pos_label=2)
        accu=accuracy_score(test_y,y_test_predictions)
        auro=roc_auc_score(test_y,y_test_predictions)
        mcc=metrics.matthews_corrcoef(test_y,y_test_predictions)
        tn, fp, fn, tp = confusion_matrix(test_y, y_test_predictions).ravel()
        ppv=tp/(tp+fp)
        npv=tn/(tn+fn)
        sen=tp/(tp+fn)
        spe=tn/(tn+fp)
        # Composite score: plain sum of the four rates above.
        score=ppv+npv+sen+spe
        print("For kmer size: ",len(train.columns[0]))
        print("for top ",n," features")
        print(list(X_red.columns.values),"\n")
        score_dict={"Sen":sen,"Spe":spe,"PPV":ppv,"NPV":npv,"AUC":auro,"MCC":mcc,"ACC":accu}
        print(score)
        print(score_dict)
        # NOTE(review): `df` is never used afterwards.
        df=pd.DataFrame(y_test_predictions)
        # NOTE(review): y_samp is re-wrapped as a one-column dataframe at the
        # end of every iteration, so later iterations fit on that dataframe
        # rather than on the original resampler output - confirm this is intended.
        y_samp = pd.DataFrame(y_samp, columns = ['x'])
    return best_model,X_red,scores
コード例 #12
0
def load_from_csv(input_dir: str,
                  counts_file: str = "normalized_counts.csv.gz",
                  n_jobs=1,
                  low_expression=0.1) -> (AnnData, AnnData, AnnData):
    """
    Load a counts matrix plus metadata from csv files and build three AnnData
    objects: the full data, an ENN-undersampled subset and a RENN-undersampled
    subset (both undersampled on the ``"Stage"`` metadata column).

    :param input_dir: directory holding ``counts_file`` and ``meta.csv.gz``
    :param counts_file: file name of the counts matrix inside ``input_dir``
    :param n_jobs: forwarded to both samplers
    :param low_expression: rows of the counts matrix whose sum divided by the
        number of columns is not above this value are dropped
    :return: ``(data, data_enn, data_renn)``
    """
    logger.info("Reading {0}".format(input_dir))

    input_file = os.path.join(input_dir, counts_file)

    mtx = pd.read_csv(input_file, index_col=0)
    meta = pd.read_csv(os.path.join(input_dir, "meta.csv.gz"), index_col=0)
    # NOTE(review): selects meta by its own index, which does not reorder it
    # against mtx; possibly meant to align with the matrix columns - confirm.
    meta = meta.loc[meta.index, :]

    logger.info(mtx.shape)
    # filter low expressed genes: one vectorised comparison, used as a
    # positional boolean mask over the rows
    expressed = (mtx.sum(axis=1) / mtx.shape[1] > low_expression).values

    mtx = mtx.loc[expressed, :]

    logger.info(mtx.shape)
    mtx = mtx.transpose()

    data = AnnData(mtx, obs=meta)
    data.obs = meta

    logger.info("Perform ENN")
    enn = EditedNearestNeighbours(n_jobs=n_jobs, return_indices=True)

    # Only the kept row indices are needed; the resampled arrays are dropped.
    _, _, idx_enn = enn.fit_resample(mtx, meta["Stage"])

    data_enn = AnnData(mtx.iloc[list(idx_enn), :], meta.iloc[idx_enn, :])

    data_enn.obs = meta.iloc[idx_enn, :]

    logger.info("Perform RENN")
    renn = RepeatedEditedNearestNeighbours(n_jobs=n_jobs, return_indices=True)

    _, _, idx_renn = renn.fit_resample(mtx, meta["Stage"])

    data_renn = AnnData(mtx.iloc[list(idx_renn), :], meta.iloc[idx_renn, :])

    data_renn.obs = meta.iloc[idx_renn, :]

    return data, data_enn, data_renn
コード例 #13
0
def rep_edited_KNN(X, Y):
    """Flag the samples kept by Repeated Edited Nearest Neighbours.

    Args:
        X: feature matrix passed to ``fit_resample``.
        Y: target values passed to ``fit_resample``; its length sets the
            length of the mask.

    Returns:
        ``(True, mask)`` where ``mask`` is an integer array with 1 at every
        position listed in the sampler's ``sample_indices_`` and 0 elsewhere.
    """
    from imblearn.under_sampling import RepeatedEditedNearestNeighbours
    renn = RepeatedEditedNearestNeighbours()
    renn.fit_resample(X, Y)
    mask = np.zeros(len(Y), dtype=int)
    # Set all kept positions at once instead of testing membership per index.
    mask[renn.sample_indices_] = 1
    return True, mask
コード例 #14
0
ファイル: imbalance.py プロジェクト: jcheminform/flame
def rep_edited_KNN(X, Y):
    """Flag the samples kept by Repeated Edited Nearest Neighbours.

    Args:
        X: feature matrix passed to ``fit_resample``; its length sets the
            length of the mask.
        Y: target values passed to ``fit_resample``.

    Returns:
        ``(True, mask)`` where ``mask`` is an integer array with 1 at every
        position listed in the sampler's ``sample_indices_`` and 0 elsewhere.
    """
    from imblearn.under_sampling import RepeatedEditedNearestNeighbours
    renn = RepeatedEditedNearestNeighbours()
    renn.fit_resample(X, Y)
    mask = np.zeros(len(X), dtype=int)
    # Set all kept positions at once instead of testing membership per index.
    mask[renn.sample_indices_] = 1
    return True, mask
def test_renn_fit_sample():
    """Compare ``fit_sample`` output with the reference arrays stored on disk."""
    under_sampler = RepeatedEditedNearestNeighbours(random_state=RND_SEED)
    X_out, y_out = under_sampler.fit_sample(X, Y)

    # Reference arrays live in the ``data`` folder next to this file.
    data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
    expected_X = np.load(os.path.join(data_dir, 'renn_x.npy'))
    expected_y = np.load(os.path.join(data_dir, 'renn_y.npy'))

    assert_array_equal(X_out, expected_X)
    assert_array_equal(y_out, expected_y)
コード例 #16
0
def test_renn_fit_sample():
    """Check that ``fit_sample`` reproduces the stored ground-truth arrays."""
    X_out, y_out = RepeatedEditedNearestNeighbours(
        random_state=RND_SEED).fit_sample(X, Y)

    # Ground truth is loaded from ``data/`` beside this test module.
    here = os.path.dirname(os.path.abspath(__file__))
    reference_dir = os.path.join(here, 'data')
    expected_X = np.load(os.path.join(reference_dir, 'renn_x.npy'))
    expected_y = np.load(os.path.join(reference_dir, 'renn_y.npy'))

    assert_array_equal(X_out, expected_X)
    assert_array_equal(y_out, expected_y)
コード例 #17
0
def repeated_edited_nearest_neighbours(X,
                                       y,
                                       visualize=False,
                                       pca2d=True,
                                       pca3d=True,
                                       tsne=True,
                                       pie_evr=True):
    """Undersample ``(X, y)`` with RENN and optionally plot the result.

    Args:
        X: feature matrix passed to ``fit_resample``.
        y: target values passed to ``fit_resample``.
        visualize: when truthy, call ``hist_over_and_undersampling`` and
            ``pca_general`` on the resampled data.
        pca2d: forwarded to ``pca_general`` as ``d2``.
        pca3d: forwarded to ``pca_general`` as ``d3``.
        tsne: accepted for interface compatibility; not used by this function.
        pie_evr: forwarded to ``pca_general`` as ``pie_evr``.

    Returns:
        The resampled ``(X_res, y_res)`` pair.
    """
    renn = RepeatedEditedNearestNeighbours()
    X_res, y_res = renn.fit_resample(X, y)
    if visualize:
        hist_over_and_undersampling(y_res)
        pca_general(X_res, y_res, d2=pca2d, d3=pca3d, pie_evr=pie_evr)
    return X_res, y_res
コード例 #18
0
def test_renn_fit():
    """Check the class statistics recorded by ``fit``."""
    fitted = RepeatedEditedNearestNeighbours(random_state=RND_SEED)
    fitted.fit(X, Y)

    # Minority and majority class labels.
    assert_equal(fitted.min_c_, 0)
    assert_equal(fitted.maj_c_, 1)
    # Per-class sample counts.
    class_counts = fitted.stats_c_
    assert_equal(class_counts[0], 500)
    assert_equal(class_counts[1], 4500)
def test_renn_fit():
    """Verify that fitting populates the class information attributes."""
    under_sampler = RepeatedEditedNearestNeighbours(random_state=RND_SEED)
    under_sampler.fit(X, Y)

    # Minority class is 0, majority class is 1.
    assert_equal(under_sampler.min_c_, 0)
    assert_equal(under_sampler.maj_c_, 1)
    # 500 samples recorded for class 0 and 4500 for class 1.
    assert_equal(under_sampler.stats_c_[0], 500)
    assert_equal(under_sampler.stats_c_[1], 4500)
def test_renn_fit_sample_with_indices():
    """Compare ``fit_sample`` output, including indices, with stored arrays."""
    under_sampler = RepeatedEditedNearestNeighbours(return_indices=True,
                                                    random_state=RND_SEED)
    X_out, y_out, kept_idx = under_sampler.fit_sample(X, Y)

    # Reference arrays live in the ``data`` folder next to this file.
    data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
    expected_X = np.load(os.path.join(data_dir, 'renn_x.npy'))
    expected_y = np.load(os.path.join(data_dir, 'renn_y.npy'))
    expected_idx = np.load(os.path.join(data_dir, 'renn_idx.npy'))

    assert_array_equal(X_out, expected_X)
    assert_array_equal(y_out, expected_y)
    assert_array_equal(kept_idx, expected_idx)
コード例 #21
0
def test_renn_fit_sample_with_indices():
    """Check that ``fit_sample`` with ``return_indices=True`` reproduces the
    stored samples, targets and indices."""
    X_out, y_out, kept_idx = RepeatedEditedNearestNeighbours(
        return_indices=True, random_state=RND_SEED).fit_sample(X, Y)

    # Ground truth is loaded from ``data/`` beside this test module.
    here = os.path.dirname(os.path.abspath(__file__))
    reference_dir = os.path.join(here, 'data')
    expected_X = np.load(os.path.join(reference_dir, 'renn_x.npy'))
    expected_y = np.load(os.path.join(reference_dir, 'renn_y.npy'))
    expected_idx = np.load(os.path.join(reference_dir, 'renn_idx.npy'))

    assert_array_equal(X_out, expected_X)
    assert_array_equal(y_out, expected_y)
    assert_array_equal(kept_idx, expected_idx)
コード例 #22
0
ファイル: decision_tree.py プロジェクト: nikonovd/wikionto
def train_decisiontree_FPR(configurationname,
                           train_data,
                           score_function,
                           undersam=False,
                           oversam=False,
                           export=False):
    """Select features with ``SelectFpr``, optionally resample, fit a tree.

    ``train_data`` is a 3-tuple ``(X_train, y_train, id_to_a_train)``; the
    third element is not used. ``undersam`` alone applies RENN, ``oversam``
    alone applies SMOTE, and both together apply SMOTEENN. The ``export``
    flag is currently not consulted: the tree is always exported.

    Returns the selector and the fitted ``DecisionTreeClassifier``.
    """
    print("Training with configuration " + configurationname)
    X_train, y_train, _id_to_a_train = train_data
    dtc = DecisionTreeClassifier(random_state=0)

    print("Feature Selection")
    # selector = SelectFpr(score_function)
    selector = SelectFpr(score_function)
    fitted_selector = selector.fit(X_train, y_train)
    X_train = selector.transform(X_train)

    fitted_ids = list(fitted_selector.get_support(indices=True))

    print("Apply Resampling")
    print(Counter(y_train))
    # The three resampling modes are mutually exclusive.
    if undersam and oversam:
        combined = SMOTEENN(random_state=0)
        X_train, y_train = combined.fit_resample(X_train, y_train)
    elif undersam:
        cleaner = RepeatedEditedNearestNeighbours()
        X_train, y_train = cleaner.fit_resample(X_train, y_train)
    elif oversam:
        # feature_indices_array = list(range(len(f_to_id)))
        # smote_nc = SMOTENC(categorical_features=feature_indices_array, random_state=0)
        # X_train, y_train = smote_nc.fit_resample(X_train, y_train)
        oversampler = SMOTE(random_state=42)
        X_train, y_train = oversampler.fit_resample(X_train, y_train)
    print(Counter(y_train))

    print("Train Classifier")
    dtc = dtc.fit(X_train, y_train, check_input=True)

    # if export:
    print("Exporting decision tree image...")
    dot_path = DATAP + "/temp/trees/sltree_" + configurationname + ".dot"
    export_graphviz(dtc, out_file=dot_path, filled=True)
    transform(fitted_ids)

    # Accuracy is measured on the (possibly resampled) training data itself.
    self_accuracy = dtc.score(X_train, y_train)
    print("Self Accuracy: " + str(self_accuracy))

    return selector, dtc
コード例 #23
0
 def getsampler(self, type):
     """Build the sampler registered under the name ``type``.

     Unknown names print a message and terminate via ``exit(1)``. Every
     sampler except 'none' and 'quality' gets ``self.random_state`` applied
     when it exposes a ``random_state`` parameter.
     """
     # Factories are lazy so that only the requested sampler is created.
     factories = {
         'none': lambda: NoSampler(),
         'randomunder': lambda: RandomUnderSampler(),
         'nearmiss': lambda: NearMiss(),
         'allknn': lambda: AllKNN(),
         'condensednn': lambda: CondensedNearestNeighbour(),
         'editednn': lambda: EditedNearestNeighbours(),
         'repeatededitednn': lambda: RepeatedEditedNearestNeighbours(),
         'tomeklinks': lambda: TomekLinks(),
         'randomover': lambda: RandomOverSampler(),
         'smote': lambda: SMOTE(),
         'adasyn': lambda: ADASYN(),
         'smotenc': lambda: SMOTENC(),
         # and self.quality_model_selection_type == 'extended':
         'quality': lambda: QualitySampler(self.n_init),
     }
     build = factories.get(type)
     if build is None:
         print("Unsupported sampler %s" % type)
         exit(1)
     sampler = build()

     seedable = type not in ('none', 'quality')
     if seedable and 'random_state' in sampler.get_params().keys():
         sampler.set_params(random_state=self.random_state)
     return sampler
コード例 #24
0
def under_sample(X, y, sampler="RandomUnderSampler"):
    """Undersample ``(X, y)`` with the sampler selected by name.

    Args:
        X: feature matrix passed to ``fit_resample``.
        y: target values passed to ``fit_resample``.
        sampler: name of the sampler class to use (default
            ``"RandomUnderSampler"``). An unknown name raises ``KeyError``.

    Returns:
        The resampled ``(X_resampled, y_resampled)`` pair.
    """
    # currently there is no parameters sampler
    # this dict is used to choose a resampler by user. default is random
    # Classes (not instances) are stored so that only the requested sampler
    # is constructed.
    samplers = {
        "RandomUnderSampler": RandomUnderSampler,
        "ClusterCentroids": ClusterCentroids,
        "NearMiss": NearMiss,
        "InstanceHardnessThreshold": InstanceHardnessThreshold,
        "CondensedNearestNeighbour": CondensedNearestNeighbour,
        "EditedNearestNeighbours": EditedNearestNeighbours,
        "RepeatedEditedNearestNeighbours": RepeatedEditedNearestNeighbours,
        "AllKNN": AllKNN,
        "NeighbourhoodCleaningRule": NeighbourhoodCleaningRule,
        "OneSidedSelection": OneSidedSelection,
    }

    # list of all samplers, in case you want to iterate all of them
    samplers_list = list(samplers)
    print(samplers_list)

    chosen = samplers[sampler]()

    # plot y class count before and after resample
    print("before", sorted(Counter(y).items()))

    # to resample simply call fit_resample method of sampler
    X_resampled, y_resampled = chosen.fit_resample(X, y)

    print("after", sorted(Counter(y_resampled).items()))

    print('===' * 4, 'under_sample finished')

    return X_resampled, y_resampled
コード例 #25
0
ファイル: Utils.py プロジェクト: Lipairui/Deal_with_Imbalance
def under_sampling(X, y, method):
    """Undersample ``(X, y)`` with the default-configured sampler ``method``.

    Args:
        X: feature matrix passed to ``fit_resample``.
        y: target values passed to ``fit_resample``.
        method: name of the imbalanced-learn undersampler class to use.

    Returns:
        The resampled ``(X_resampled, y_resampled)`` pair.

    Raises:
        ValueError: if ``method`` is not one of the supported names.
    """
    if method == 'ClusterCentroids':
        model = ClusterCentroids()
    elif method == 'RandomUnderSampler':
        model = RandomUnderSampler()
    elif method == 'NearMiss':
        model = NearMiss()
    elif method == 'EditedNearestNeighbours':
        model = EditedNearestNeighbours()
    elif method == 'RepeatedEditedNearestNeighbours':
        model = RepeatedEditedNearestNeighbours()
    elif method == 'AllKNN':
        model = AllKNN()
    elif method == 'NeighbourhoodCleaningRule':
        model = NeighbourhoodCleaningRule()
    elif method == 'OneSidedSelection':
        model = OneSidedSelection()
    else:
        # Fail with a clear message instead of an UnboundLocalError at return.
        raise ValueError(
            "Unsupported under-sampling method: {!r}".format(method))

    X_resampled, y_resampled = model.fit_resample(X, y)
    return X_resampled, y_resampled
コード例 #26
0
class ResamplingAlgorithms(Enum):
    """Available resampling algorithms.

    Each member's value is a ``(display name, sampler instance)`` pair.
    """

    # Over-sampling and combined methods.
    RO = ("Random Over-sampling", RandomOverSampler(random_state=1))
    SMOTE = ("Smote", SMOTE(random_state=1))
    ADASYN = ("ADASYN", ADASYN(random_state=1))
    SMOTE_TL = ("SMOTE+TL", SMOTETomek(random_state=1))
    SMOTE_ENN = ("SMOTE+ENN", SMOTEENN(random_state=1))
    SMOTE_BOOST = ("SMOTEBoost", smote_boost.SMOTEBoost())
    # Under-sampling methods.
    RU = ("Random Under-sampling", RandomUnderSampler(random_state=1))
    CLUSTERCENTROIDS = ("ClusterCentroids", ClusterCentroids(random_state=1))
    TOMEK_LINKS = ("TomekLinks", TomekLinks())
    NM1 = ("NM1", NearMiss(version=1))
    NM2 = ("NM2", NearMiss(version=2))
    NM3 = ("NM3", NearMiss(version=3))
    CNN = ("CNN", CondensedNearestNeighbour(random_state=1))
    OSS = ("OneSidedSelection", OneSidedSelection(random_state=1))
    ENN = ("ENN", EditedNearestNeighbours())
    NCL = ("NCL", NeighbourhoodCleaningRule())
    IHT = ("IHT", InstanceHardnessThreshold(random_state=1))
    RENN = ("RENN", RepeatedEditedNearestNeighbours())
    AllKNN = ("AllKNN", AllKNN())

    @classmethod
    def get_algorithm_by_name(cls, name):
        """Return the first member whose display name equals ``name``.

        Falls back to ``ResamplingAlgorithms.RO`` when nothing matches.
        """
        for algorithm in ResamplingAlgorithms:
            if algorithm.value[0] == name:
                return algorithm
        return ResamplingAlgorithms.RO
def test_multiclass_fit_sample():
    """Check the per-class counts after resampling a three-class target."""
    # Turn the first 1000 targets into a third class.
    multiclass_y = Y.copy()
    multiclass_y[0:1000] = 2

    under_sampler = RepeatedEditedNearestNeighbours(random_state=RND_SEED)
    _X_out, y_out = under_sampler.fit_sample(X, multiclass_y)

    # Expected number of remaining samples for classes 0, 1 and 2.
    remaining = Counter(y_out)
    assert_equal(remaining[0], 378)
    assert_equal(remaining[1], 1828)
    assert_equal(remaining[2], 5)
コード例 #28
0
def undersampled_data_split(df, test_size=0.3):
    """Split ``df`` into train/test parts and undersample the training part.

    The ``'class'`` column is the target; every other column is a feature.
    Only the training portion is passed through
    ``RepeatedEditedNearestNeighbours``.

    Returns ``(X_train, X_test, y_train, y_test)``.
    """
    features = df.loc[:, ~df.columns.isin(['class'])]
    target = df.loc[:, df.columns.isin(['class'])]

    split = train_test_split(features.values,
                             target.values.flatten(),
                             test_size=test_size,
                             random_state=42)
    X_train, X_test, y_train, y_test = split

    under_sampler = RepeatedEditedNearestNeighbours()
    X_train, y_train = under_sampler.fit_resample(X_train, y_train)
    return X_train, X_test, y_train, y_test
コード例 #29
0
def test_renn_init():
    """Check the attribute values right after construction."""
    fresh = RepeatedEditedNearestNeighbours(random_state=RND_SEED)

    # Defaults for everything except the seed passed explicitly above.
    assert_equal(fresh.n_neighbors, 3)
    assert_equal(fresh.kind_sel, 'all')
    assert_equal(fresh.n_jobs, -1)
    assert_equal(fresh.random_state, RND_SEED)
コード例 #30
0
def sampler(name, ratio, random_state=0, return_indices=True, **kwargs):
    """Create an undersampler from its short name.

    Args:
        name: one of ``"rus"``, ``"nm"``, ``"enn"``, ``"renn"``, ``"allknn"``
            or ``"tl"``.
        ratio: forwarded as ``ratio`` to ``"rus"`` and ``"nm"`` only; the
            other samplers do not receive it.
        random_state: forwarded to every sampler.
        return_indices: forwarded to every sampler.
        **kwargs: extra keyword arguments forwarded to the sampler.

    Returns:
        The constructed sampler instance.

    Raises:
        ValueError: if ``name`` is not a supported short name.
    """
    # Arguments shared by every sampler constructor.
    common = dict(return_indices=return_indices,
                  random_state=random_state,
                  **kwargs)

    if name == "rus":
        return RandomUnderSampler(ratio=ratio, **common)
    if name == "nm":
        return NearMiss(ratio=ratio, **common)
    if name == "enn":
        return EditedNearestNeighbours(**common)
    if name == "renn":
        return RepeatedEditedNearestNeighbours(**common)
    if name == "allknn":
        return AllKNN(**common)
    if name == "tl":
        return TomekLinks(**common)
    raise ValueError("Unknown sampler name: {!r}".format(name))
def test_renn_init():
    """Verify the attribute values stored by the constructor."""
    sampler = RepeatedEditedNearestNeighbours(random_state=RND_SEED)

    observed = (
        sampler.n_neighbors,
        sampler.kind_sel,
        sampler.n_jobs,
        sampler.random_state,
    )
    assert observed == (3, 'all', 1, RND_SEED)
コード例 #32
0
def test_renn_sample_wt_fit():
    """Calling ``sample`` without a prior ``fit`` must raise RuntimeError."""
    unfitted = RepeatedEditedNearestNeighbours(random_state=RND_SEED)
    # fit() is deliberately not called: sample() is invoked straight away.
    assert_raises(RuntimeError, unfitted.sample, X, Y)
コード例 #33
0
def test_continuous_error():
    """Fitting on a continuous target must emit a ``UserWarning``."""
    # 40 evenly spaced floats in [0, 1] stand in for a continuous target.
    continuous_target = np.linspace(0, 1, 40)
    renn = RepeatedEditedNearestNeighbours(random_state=RND_SEED)
    assert_warns(UserWarning, renn.fit, X, continuous_target)
コード例 #34
0
def Resampling(train_x, train_y, resampling_method):
    """Resample a training set in place with the sampler named by ``resampling_method``.

    ``train_y.data`` is first label-encoded with ``LabelEncoder``.  The
    selected sampler's ``fit_resample`` output then replaces ``train_x.data``
    and ``train_y.data``.  Nothing is returned.

    Parameters
    ----------
    train_x, train_y
        Objects exposing a writable ``data`` attribute holding the features
        and the targets respectively.
    resampling_method : str
        Under-sampling: ``"ClusterCentroids"``,
        ``"CondensedNearestNeighbour"``, ``"EditedNearestNeighbours"``,
        ``"RepeatedEditedNearestNeighbours"``, ``"AllKNN"``, ``"NearMiss"``,
        ``"NeighbourhoodCleaningRule"``, ``"RandomUnderSampler"``,
        ``"TomekLinks"``.
        Over-sampling: ``"BorderlineSMOTE"``, ``"KMeansSMOTE"``,
        ``"RandomOverSampler"``, ``"SMOTE"``.

    Raises
    ------
    ValueError
        If ``resampling_method`` is not one of the names listed above.
    """
    train_y.data = LabelEncoder().fit_transform(train_y.data)
    # summarize distribution

    # uncomment the following line to display the pie chart of the class
    # distribution before resampling
    #plotGraphics.piePlot(train_y, "Before Resampling")

    # ---- UNDER-SAMPLING ------ #
    if resampling_method == "ClusterCentroids":
        resample = ClusterCentroids(voting='hard', random_state=42)

    elif resampling_method == "CondensedNearestNeighbour":
        resample = CondensedNearestNeighbour(n_neighbors=7, random_state=42)

    elif resampling_method == "EditedNearestNeighbours":
        resample = EditedNearestNeighbours(n_neighbors=7,
                                           kind_sel='mode',
                                           n_jobs=-1)

    elif resampling_method == "RepeatedEditedNearestNeighbours":
        resample = RepeatedEditedNearestNeighbours(n_neighbors=7,
                                                   kind_sel='mode',
                                                   n_jobs=-1)

    elif resampling_method == "AllKNN":
        resample = AllKNN(n_neighbors=7,
                          kind_sel='mode',
                          allow_minority=True,
                          n_jobs=-1)

    elif resampling_method == "NearMiss":
        resample = NearMiss(n_neighbors=7, n_jobs=-1)

    elif resampling_method == "NeighbourhoodCleaningRule":
        resample = NeighbourhoodCleaningRule(n_neighbors=7, kind_sel='all')

    elif resampling_method == "RandomUnderSampler":
        resample = RandomUnderSampler(random_state=42)

    elif resampling_method == "TomekLinks":
        resample = TomekLinks(n_jobs=-1)

    # ---- OVER-SAMPLING ------ #
    elif resampling_method == "BorderlineSMOTE":
        resample = BorderlineSMOTE(random_state=42, n_jobs=-1)

    elif resampling_method == "KMeansSMOTE":
        resample = KMeansSMOTE(random_state=42)

    # The over-sampler is keyed by its own name so that it can be selected
    # and does not override the "RandomUnderSampler" choice.
    elif resampling_method == "RandomOverSampler":
        resample = RandomOverSampler(random_state=42)

    elif resampling_method == "SMOTE":
        resample = SMOTE(random_state=42, n_jobs=-1)

    else:
        # Fail with a clear message instead of an unbound-variable error.
        raise ValueError(
            "Unknown resampling method: {!r}".format(resampling_method))

    # transform the dataset
    train_x.data, train_y.data = resample.fit_resample(train_x.data,
                                                       train_y.data)
コード例 #35
0
def test_renn_fit_single_class():
    """Fitting on a target holding one class must emit a ``RuntimeWarning``."""
    sampler = RepeatedEditedNearestNeighbours(random_state=RND_SEED)
    # All-zero target with one entry per row of X, i.e. a single class.
    single_class_target = np.zeros(X.shape[0])
    assert_warns(RuntimeWarning, sampler.fit, X, single_class_target)
コード例 #36
0
def test_renn_not_good_object():
    """``fit_sample`` must raise ValueError when ``n_neighbors`` is a string."""
    invalid_neighbors = 'rnd'
    options = dict(n_neighbors=invalid_neighbors,
                   random_state=RND_SEED,
                   kind_sel='mode')
    sampler = RepeatedEditedNearestNeighbours(**options)
    assert_raises(ValueError, sampler.fit_sample, X, Y)
コード例 #37
0
def test_renn_iter_wrong():
    """``fit_sample`` must raise ValueError for a negative ``max_iter``."""
    options = dict(max_iter=-1, random_state=RND_SEED)
    sampler = RepeatedEditedNearestNeighbours(**options)
    assert_raises(ValueError, sampler.fit_sample, X, Y)
コード例 #38
0
ファイル: decision_tree.py プロジェクト: softlang/wikionto
def train_decisiontree_with(configurationname, train_data, k, score_function, undersam=False, oversam=False,
                            export=False):
    """Select the ``k`` best features, optionally resample, and fit a decision tree.

    Parameters
    ----------
    configurationname : str
        Label used in the progress output and in the exported ``.dot`` file
        name.
    train_data
        Three-element sequence ``(X_train, y_train, id_to_a_train)``; the
        third element is not used here.
    k : int
        Number of features kept by ``SelectKBest``; must be positive.
    score_function
        Scoring callable handed to ``SelectKBest``.
    undersam, oversam : bool
        Resampling switches: only ``undersam`` applies
        ``RepeatedEditedNearestNeighbours``, only ``oversam`` applies
        ``SMOTE``, both apply ``SMOTEENN``, neither leaves the data as is.
    export : bool
        When true, write the fitted tree with ``export_graphviz`` and call
        ``transform`` with the selected feature indices.

    Returns
    -------
    tuple
        ``(selector, dtc)`` - the fitted feature selector and classifier.
    """
    assert k > 0
    print("Training with configuration " + configurationname)
    # Only the first two entries of train_data are needed for training.
    X_train, y_train, _ = train_data
    tree = DecisionTreeClassifier(random_state=0)

    print("Feature Selection")
    # selector = SelectFpr(score_function)
    selector = SelectKBest(score_function, k=k)
    fitted_selector = selector.fit(X_train, y_train)
    X_train = selector.transform(X_train)

    # Indices of the features kept by the selector.
    fitted_ids = list(fitted_selector.get_support(indices=True))

    print("Apply Resampling")
    print(Counter(y_train))
    if undersam and oversam:
        resampler = SMOTEENN(random_state=0)
    elif undersam:
        resampler = RepeatedEditedNearestNeighbours()
    elif oversam:
        # feature_indices_array = list(range(len(f_to_id)))
        # smote_nc = SMOTENC(categorical_features=feature_indices_array, random_state=0)
        # X_train, y_train = smote_nc.fit_resample(X_train, y_train)
        resampler = SMOTE(random_state=42)
    else:
        resampler = None
    if resampler is not None:
        X_train, y_train = resampler.fit_resample(X_train, y_train)
    print(Counter(y_train))

    print("Train Classifier")
    tree = tree.fit(X_train, y_train, check_input=True)

    if export:
        dot_path = DATAP + "/temp/trees/sltree_" + configurationname + ".dot"
        export_graphviz(tree, out_file=dot_path, filled=True)
        transform(fitted_ids, configurationname)

    # Accuracy measured on the (possibly resampled) training data itself.
    self_accuracy = tree.score(X_train, y_train)
    print("Self Accuracy: " + str(self_accuracy))

    return selector, tree
def test_renn_not_good_object():
    """``fit_sample`` must raise ValueError when ``n_neighbors`` is a string."""
    invalid_neighbors = 'rnd'
    options = dict(n_neighbors=invalid_neighbors, kind_sel='mode')
    sampler = RepeatedEditedNearestNeighbours(**options)
    with raises(ValueError):
        sampler.fit_sample(X, Y)
def test_deprecation_random_state():
    """Passing ``random_state`` must trigger a DeprecationWarning on resampling."""
    expected_message = "'random_state' is deprecated from 0.4"
    sampler = RepeatedEditedNearestNeighbours(random_state=0)
    with warns(DeprecationWarning, match=expected_message):
        sampler.fit_sample(X, Y)
def test_renn_iter_wrong():
    """``fit_sample`` must raise ValueError for a negative ``max_iter``."""
    sampler = RepeatedEditedNearestNeighbours(max_iter=-1)
    with raises(ValueError):
        sampler.fit_sample(X, Y)
コード例 #42
0
ファイル: plot_allknn.py プロジェクト: dvro/imbalanced-learn
# Resample with ENN and project the retained samples using ``pca``
# (defined earlier in the script).
enn = EditedNearestNeighbours()
X_resampled, y_resampled = enn.fit_sample(X, y)
X_res_vis = pca.transform(X_resampled)
# Percentage of samples removed by the resampling.  A plain '%' is used:
# '\%' is an invalid escape sequence and printed a stray backslash.
print('Reduced {:.2f}%'.format(100 * (1 - float(len(X_resampled)) / len(X))))

# One scatter call per class so that each gets its own colour and legend entry.
ax2.scatter(X_res_vis[y_resampled == 0, 0], X_res_vis[y_resampled == 0, 1],
            label="Class #0", alpha=.5, edgecolor=almost_black,
            facecolor=palette[0], linewidth=0.15)
ax2.scatter(X_res_vis[y_resampled == 1, 0], X_res_vis[y_resampled == 1, 1],
            label="Class #1", alpha=.5, edgecolor=almost_black,
            facecolor=palette[2], linewidth=0.15)
ax2.set_title('Edited nearest neighbours')

# Apply the RENN and project the retained samples using ``pca``
# (defined earlier in the script).
print('RENN')
renn = RepeatedEditedNearestNeighbours()
X_resampled, y_resampled = renn.fit_sample(X, y)
X_res_vis = pca.transform(X_resampled)
# Percentage of samples removed by the resampling.  A plain '%' is used:
# '\%' is an invalid escape sequence and printed a stray backslash.
print('Reduced {:.2f}%'.format(100 * (1 - float(len(X_resampled)) / len(X))))

# One scatter call per class so that each gets its own colour and legend entry.
ax3.scatter(X_res_vis[y_resampled == 0, 0], X_res_vis[y_resampled == 0, 1],
            label="Class #0", alpha=.5, edgecolor=almost_black,
            facecolor=palette[0], linewidth=0.15)
ax3.scatter(X_res_vis[y_resampled == 1, 0], X_res_vis[y_resampled == 1, 1],
            label="Class #1", alpha=.5, edgecolor=almost_black,
            facecolor=palette[2], linewidth=0.15)
ax3.set_title('Repeated Edited nearest neighbours')

# Apply the AllKNN
print('AllKNN')
allknn = AllKNN()