Example #1
def test_cnn_fit_sample_with_indices():
    """Test the fit sample routine with indices support"""

    # Resample the data
    cnn = CondensedNearestNeighbour(return_indices=True, random_state=RND_SEED)
    X_resampled, y_resampled, idx_under = cnn.fit_sample(X, Y)

    X_gt = np.array([[-0.10903849, -0.12085181], [0.01936241, 0.17799828],
                     [0.05230552, 0.09043907], [-1.25020462, -0.40402054],
                     [0.70524765, 0.39816382], [0.35831463, 1.33483198],
                     [-0.284881, -0.62730973], [0.03394306, 0.03986753],
                     [-0.01252787, 0.34102657], [0.15198585, 0.12512646]])
    y_gt = np.array([0, 0, 1, 1, 1, 2, 2, 2, 2, 2])
    idx_gt = np.array([4, 11, 17, 12, 19, 9, 5, 7, 14, 18])
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
    assert_array_equal(idx_under, idx_gt)
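
# Note: this test targets the pre-0.4 imbalanced-learn API. fit_sample was
# renamed fit_resample, and return_indices was deprecated in 0.4 and removed
# in later releases; recent versions expose the kept indices through the
# fitted sample_indices_ attribute instead. A minimal sketch of the
# equivalent modern call:
cnn = CondensedNearestNeighbour(random_state=RND_SEED)
X_resampled, y_resampled = cnn.fit_resample(X, Y)
idx_under = cnn.sample_indices_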
Example #2
def under_sampling_algs():
    algs = list()
    algs.append(("No Rs Undersampling case", "No Re-sampling"))
    algs.append((RandomUnderSampler(random_state=1), 'RU'))
    algs.append((ClusterCentroids(random_state=1), 'CC'))
    algs.append((TomekLinks(), 'TL'))
    algs.append((NearMiss(version=1), 'NM1'))
    algs.append((NearMiss(version=2), 'NM2'))
    algs.append((NearMiss(version=3), 'NM3'))
    algs.append((CondensedNearestNeighbour(random_state=1), 'CNN'))
    algs.append((OneSidedSelection(random_state=1), 'OSS'))
    algs.append((EditedNearestNeighbours(), 'ENN'))
    algs.append((NeighbourhoodCleaningRule(), 'NCL'))
    algs.append((InstanceHardnessThreshold(random_state=1), 'IHT'))
    algs.append((RepeatedEditedNearestNeighbours(), 'RENN'))
    algs.append((AllKNN(), 'AllKNN'))
    return algs
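
# A minimal driver sketch for the list above; X and y are assumed to be an
# imbalanced feature matrix and label vector (they are not part of the
# original snippet). The first entry is a string placeholder for the
# "no resampling" baseline, so it is special-cased.
from collections import Counter

for sampler, name in under_sampling_algs():
    if isinstance(sampler, str):
        X_res, y_res = X, y
    else:
        X_res, y_res = sampler.fit_resample(X, y)
    print(name, sorted(Counter(y_res).items()))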
Example #3
def equalize_training_dataset_with_CondensedNN(x_train, y_train):
    from imblearn.under_sampling import CondensedNearestNeighbour

    old_shape = list(x_train.shape)
    # reshape before applying the over/under-sampling method
    x_tmp = np.reshape(x_train, (x_train.shape[0], -1))
    x_resampled, y_resampled = CondensedNearestNeighbour(
        sampling_strategy={i: 180
                           for i in range(0, 43)},
        n_neighbors=5,
        n_jobs=8).fit_resample(x_tmp, y_train)
    print(sorted(Counter(y_resampled).items()))
    # reshape back after applying the over/under-sampling method
    old_shape[0] = x_resampled.shape[0]
    x_resampled = np.reshape(x_resampled, tuple(old_shape))

    return x_resampled, y_resampled
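
# Usage sketch for the helper above; x_train and y_train are assumed to be an
# image-like array and a label vector covering 43 classes, as the
# sampling_strategy dictionary implies.
x_balanced, y_balanced = equalize_training_dataset_with_CondensedNN(x_train, y_train)
print(x_balanced.shape, y_balanced.shape)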
Example #4
def load_data(mode: str, normalize: bool = True):
    df, hidden_df = __load_data_first_time()

    # Extract x and y
    y = np.array(df['earnings'].to_numpy(), dtype=int)
    del df['earnings']

    x = np.array(df.to_numpy(), dtype=float)

    # Hidden to numpy
    hidden = hidden_df.to_numpy()

    if mode == 'vanilla':
        pass

    elif mode == 'smote':
        x, y = SMOTE().fit_sample(x, y)

    elif mode == 'adasyn':
        x, y = ADASYN().fit_sample(x, y)

    elif mode == 'bordersmote':
        x, y = BorderlineSMOTE().fit_sample(x, y)

    elif mode == 'randomover':
        x, y, idxs = RandomOverSampler(return_indices=True).fit_sample(x, y)
        hidden = hidden[idxs]

    elif mode == 'randomunder':
        x, y, idxs = RandomUnderSampler(return_indices=True).fit_sample(x, y)
        hidden = hidden[idxs]

    elif mode == 'tomek':
        x, y, idxs = TomekLinks(return_indices=True).fit_sample(x, y)
        hidden = hidden[idxs]

    elif mode == 'knn':
        x, y, idxs = CondensedNearestNeighbour(return_indices=True, n_neighbors=3).fit_sample(x, y)
        hidden = hidden[idxs]

    if normalize:
        x -= np.mean(x, axis=0)
        x /= np.std(x, axis=0)

    return x, y, hidden
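
# Note: with recent imbalanced-learn releases the return_indices branches
# above no longer work (the flag was deprecated in 0.4 and later removed);
# the kept indices are exposed through the fitted sampler's sample_indices_
# attribute instead. A sketch for the 'randomunder' mode:
sampler = RandomUnderSampler()
x, y = sampler.fit_resample(x, y)
hidden = hidden[sampler.sample_indices_]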
Example #5
def test_cnn_fit_resample():
    cnn = CondensedNearestNeighbour(random_state=RND_SEED)
    X_resampled, y_resampled = cnn.fit_resample(X, Y)

    X_gt = np.array([
        [-0.10903849, -0.12085181],
        [0.01936241, 0.17799828],
        [0.05230552, 0.09043907],
        [-1.25020462, -0.40402054],
        [0.70524765, 0.39816382],
        [0.35831463, 1.33483198],
        [-0.284881, -0.62730973],
        [0.03394306, 0.03986753],
        [-0.01252787, 0.34102657],
        [0.15198585, 0.12512646],
    ])
    y_gt = np.array([0, 0, 1, 1, 1, 2, 2, 2, 2, 2])
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
Example #6
def get_data(force_reload=False, strategy='oversampling', test_size=0.15):
    train_data_file = os.path.join(DATA_DIR, 'train_data.{}.npy'.format(strategy))
    train_labels_file = os.path.join(DATA_DIR, 'train_labels.{}.npy'.format(strategy))
    val_data_file = os.path.join(DATA_DIR, 'val_data.{}.npy'.format(strategy))
    val_labels_file = os.path.join(DATA_DIR, 'val_labels.{}.npy'.format(strategy))

    training_files_exist = os.path.exists(train_data_file) and os.path.exists(train_labels_file)
    val_files_exist = os.path.exists(val_data_file) and os.path.exists(val_labels_file)

    if not force_reload and training_files_exist and val_files_exist:
        X_train = np.load(train_data_file)
        y_train = np.load(train_labels_file)

        X_val = np.load(val_data_file)
        y_val = np.load(val_labels_file)
    else:
        train_df = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'))
        X, y = to_data_format(train_df)
        X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=test_size)

        print('Shapes before: {}, {}'.format(X_train.shape, y_train.shape))

        if strategy == 'oversampling':
            X_train, y_train = SMOTE(n_jobs=n_jobs).fit_resample(X_train, y_train)
        elif strategy == 'combine':
            smote = SMOTE(n_jobs=n_jobs)
            enn = EditedNearestNeighbours(n_jobs=n_jobs)
            X_train, y_train = SMOTEENN(smote=smote, enn=enn).fit_resample(X_train, y_train)
        elif strategy == 'undersampling':
            enn = EditedNearestNeighbours(n_jobs=n_jobs)
            X_train, y_train = enn.fit_resample(X_train, y_train)
        elif strategy == 'condensed-undersampling':
            cnn = CondensedNearestNeighbour(n_jobs=n_jobs, n_neighbors=3)
            X_train, y_train = cnn.fit_resample(X_train, y_train)

        print('Shapes after: {}, {}'.format(X_train.shape, y_train.shape))

        np.save(train_data_file, X_train)
        np.save(train_labels_file, y_train)
        np.save(val_data_file, X_val)
        np.save(val_labels_file, y_val)

    return X_train, X_val, y_train, y_val
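
# Design note: caching the resampled arrays with np.save pays off here.
# CondensedNearestNeighbour builds its condensed set through repeated 1-NN
# queries over the training data, which is expensive on large datasets, so
# recomputing it on every run would dominate this pipeline.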
Example #7
def train_stage(df_path, cb_path):

    print('Load Train Data.')
    df = pd.read_csv(df_path)
    print('\nShape of Train Data: {}'.format(df.shape))

    y_df = np.array(df['target'])
    df_ids = np.array(df.index)
    df.drop(['ID_code', 'target'], axis=1, inplace=True)

    cb_cv_result = np.zeros(df.shape[0])

    skf = StratifiedKFold(n_splits=15, shuffle=False)  # random_state dropped: it has no effect (and raises in recent sklearn) when shuffle=False
    skf.get_n_splits(df_ids, y_df)

    #sm = TomekLinks(random_state=42)
    sm = CondensedNearestNeighbour(random_state=42, n_jobs=3)

    print('\nModel Fitting...')
    for counter, ids in enumerate(skf.split(df_ids, y_df)):
        print('\nFold {}'.format(counter + 1))
        X_fit, y_fit = df.values[ids[0]], y_df[ids[0]]
        X_val, y_val = df.values[ids[1]], y_df[ids[1]]

        X_fit, y_fit = sm.fit_sample(X_fit, y_fit)

        print('CatBoost')
        cb_cv_result[ids[1]] += fit_cb(X_fit,
                                       y_fit,
                                       X_val,
                                       y_val,
                                       counter,
                                       cb_path,
                                       name='cb')

        del X_fit, X_val, y_fit, y_val
        gc.collect()

    auc_cb = round(roc_auc_score(y_df, cb_cv_result), 4)
    print('Catboost VAL AUC: {}'.format(auc_cb))

    return 0
Example #8
def resample_data(predictors, target, df_data, method):
    """
    This function resamples training datasets prior to training models.
    """
    if method=='adasyn':
        util = ADASYN()
    elif method=='random-over-sampler':
        util = RandomOverSampler()
    elif method=='smote':
        util = SMOTE(kind='borderline2')
    elif method=='smote-tomek':
        util = SMOTETomek()
    elif method=='smote-enn':
        util = SMOTEENN()
    elif method=='edited-nn':
        util = EditedNearestNeighbours()
    elif method=='repeated-edited-nn':
        util = RepeatedEditedNearestNeighbours()
    elif method=='all-knn':
        util = AllKNN()
    elif method=='one-sided-selection':
        util = OneSidedSelection()
    elif method=='cluster-centroids':
        util = ClusterCentroids()
    elif method=='random-under-sampler':
        util = RandomUnderSampler()
    elif method=='neighbourhood-cleaning-rule':
        util = NeighbourhoodCleaningRule()
    elif method=='condensed-nearest-neighbour':
        util = CondensedNearestNeighbour()
    elif method=='near-miss':
        util = NearMiss(version=1)
    elif method=='instance-hardness-threshold':
        util = InstanceHardnessThreshold()
    
    x_resampled, y_resampled = util.fit_sample(df_data[predictors], df_data[target])
    x_resampled = pd.DataFrame(x_resampled, columns=predictors)
    y_resampled = pd.DataFrame(y_resampled, columns=[target])
    return x_resampled, y_resampled
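
# Usage sketch; the DataFrame and column names below are assumptions, not part
# of the original snippet. Note that SMOTE(kind='borderline2') above is the
# pre-0.4 imbalanced-learn API; recent releases provide BorderlineSMOTE.
x_bal, y_bal = resample_data(['age', 'income'], 'defaulted', df_train, 'smote')
print(y_bal['defaulted'].value_counts())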
Example #9
def readFile(path, y_label, method, encode_features=[], skew_exempted=[], training_ratio=0.7, shuffle=True, needSkew=False, fea_eng=True):
    raw = pd.read_csv(path)
    n, d = raw.shape
   

    if (shuffle):
        raw = raw.sample(frac=1).reset_index(drop=True)  # shuffle
    
    if (needSkew):
        skewed = raw[raw.dtypes[raw.dtypes != "object"].index.drop(skew_exempted)].apply(lambda x: skew(x.dropna()))
        skewed = skewed[skewed > 0.75].index
        raw[skewed] = np.log1p(raw[skewed])  # reduce skewness
    
    raw = pd.get_dummies(raw, columns=encode_features)  # encode categorical features
    raw = raw.fillna(raw.mean())
    # if(method=='OverSample'):
    #     ind_more=np.argmax(np.bincount(raw[y_label]))
    #     more=raw[ind]
    #     less=raw[-ind]
    #     x = [randint(0, len(less)) for a in range(0, len(more)-len(less))]
    #     raw.
    X=raw.drop(y_label,axis=1)
    y=raw[y_label]
    if(method=='OverSample'):        
        ada = ADASYN(random_state=42)
        X_res, y_res = ada.fit_resample(X, y)
        X=X_res
        y=y_res
    if(method=='UnderSample'): 
        # for i in []   
        model = CondensedNearestNeighbour(random_state=42)
        X_res, y_res = model.fit_resample(X, y)
        X=X_res
        y=y_res
    # if(method=='Weights'): 
    # if(fea_eng==True):
    #     # X,y=feature_eng(X,y)
    X_train, X_test, y_train, y_test=split(X,y, training_ratio)
    return X_train, X_test, y_train, y_test
Example #10
    def __init__(self):
        from imblearn.over_sampling import SMOTE, ADASYN, SVMSMOTE, BorderlineSMOTE, RandomOverSampler
        from imblearn.under_sampling import ClusterCentroids, RandomUnderSampler, InstanceHardnessThreshold, NearMiss, \
            TomekLinks, EditedNearestNeighbours, RepeatedEditedNearestNeighbours, AllKNN, OneSidedSelection, \
            CondensedNearestNeighbour, NeighbourhoodCleaningRule
        from imblearn.ensemble import EasyEnsemble, EasyEnsembleClassifier, BalancedBaggingClassifier, \
            BalancedRandomForestClassifier, BalanceCascade, RUSBoostClassifier

        self.oversamplers = {
            'ADASYN': ADASYN(),
            'RandomOverSampler': RandomOverSampler(),
            'SMOTE': SMOTE(),
            'BorderlineSMOTE': BorderlineSMOTE(),
            'SVMSMOTE': SVMSMOTE()
        }
        self.undersamplers = {
            'ClusterCentroids': ClusterCentroids(),
            'RandomUnderSampler': RandomUnderSampler(),
            'InstanceHardnessThreshold': InstanceHardnessThreshold(),
            'NearMiss': NearMiss(),
            'TomekLinks': TomekLinks(),
            'EditedNearestNeighbours': EditedNearestNeighbours(),
            'RepeatedEditedNearestNeighbours':
            RepeatedEditedNearestNeighbours(),
            'AllKNN': AllKNN(),
            'OneSidedSelection': OneSidedSelection(),
            'CondensedNearestNeighbour': CondensedNearestNeighbour(),
            'NeighbourhoodCleaningRule': NeighbourhoodCleaningRule()
        }
        self.ensemblesamplers = {
            'EasyEnsemble': EasyEnsemble(),
            'EasyEnsembleClassifier': EasyEnsembleClassifier(),
            'BalancedBaggingClassifier': BalancedBaggingClassifier(),
            'BalanceCascade': BalanceCascade(),
            'BalancedRandomForestClassifier': BalancedRandomForestClassifier(),
            'RUSBoostClassifier': RUSBoostClassifier()
        }
Example #11
from matplotlib import pyplot
import matplotlib.pyplot as plt
from pylab import subplot, title
from matplotlib.colors import ListedColormap
from numpy import concatenate
from sklearn.datasets import make_blobs, make_gaussian_quantiles
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from imblearn.under_sampling import CondensedNearestNeighbour

X1, y1 = make_blobs(n_samples=150, centers=4, n_features=2,random_state=21)
X2, y2 = make_gaussian_quantiles(mean=(2,2),cov=3., n_samples=150, n_features=2, n_classes=3, random_state=9)
X3, y3 = make_gaussian_quantiles(mean=(5,5),cov=5., n_samples=150, n_features=2, n_classes=2, random_state=15)

X = concatenate([X1,X2,X3])
y = concatenate([y1,y2,y3])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=35)

cnn = CondensedNearestNeighbour(random_state=0)   #random_state is used to get the same result for every run
X_res1, y_res1 = cnn.fit_sample(X, y)

X_train1, X_test1, y_train1, y_test1 = train_test_split(X_res1, y_res1, test_size=0.25, random_state=35)    # for the accuracy of the CNN-resampled data

h = .02

cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF', '#8B008B'])
cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF', '#8B008B'])

clf1 = KNeighborsClassifier(n_neighbors=1, weights='uniform')
clf2 = KNeighborsClassifier(n_neighbors=1, weights='uniform')
clf1.fit(X_train, y_train)
clf2.fit(X_train1,y_train1)
pred1 = clf1.predict(X_test)
Example #12
def test_cnn_fit_resample_with_wrong_object():
    knn = 'rnd'
    cnn = CondensedNearestNeighbour(random_state=RND_SEED, n_neighbors=knn)
    with raises(ValueError, match="has to be a int or an "):
        cnn.fit_resample(X, Y)
Example #13
from feature_creation import X_train, y_train
from feature_creation import selector, idx, df_reduced_train
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN
from imblearn.under_sampling import TomekLinks, ClusterCentroids, NearMiss, CondensedNearestNeighbour, RandomUnderSampler
from imblearn.under_sampling import OneSidedSelection, InstanceHardnessThreshold
from imblearn.combine import SMOTEENN, SMOTETomek
from sklearn.linear_model import SGDClassifier

imbalances = [
    RandomUnderSampler(),
    TomekLinks(),
    ClusterCentroids(),
    NearMiss(version=1, size_ngh=5),
    NearMiss(version=2, size_ngh=7),
    NearMiss(version=3, size_ngh=3),
    CondensedNearestNeighbour(size_ngh=3, n_seeds_S=51),
    OneSidedSelection(size_ngh=5, n_seeds_S=51),
    OneSidedSelection(size_ngh=5, n_seeds_S=35),
    InstanceHardnessThreshold(),
    RandomOverSampler(ratio='auto'),
    ADASYN(ratio='auto', k=3),
    ADASYN(ratio=0.1, k=5),
    ADASYN(ratio=0.2, k=7),
    ADASYN(ratio=0.4, k=7),
    SMOTE(ratio='auto', kind='regular', k=5),
    SMOTE(ratio=0.1, kind='regular', k=5),
    SMOTE(ratio='auto', kind='regular', k=7),
    SMOTE(ratio='auto', kind='regular', k=9, out_step=0.6),
    SMOTE(ratio=0.4, kind='regular', k=5, out_step=0.5),
    SMOTE(ratio='auto', kind='borderline1'),
    SMOTE(ratio='auto', kind='borderline2'),
Example #14
def sample_data_by_cnn(X, y):
    cnn = CondensedNearestNeighbour(random_state=42)
    return cnn.fit_resample(X, y)
Example #15
def test_cnn_fit_sample_with_wrong_object():
    knn = 'rnd'
    cnn = CondensedNearestNeighbour(random_state=RND_SEED, n_neighbors=knn)
    assert_raises_regex(ValueError, "has to be a int or an ", cnn.fit_sample,
                        X, Y)
Example #16
scores = cross_validate(enn_pipe_rf,
                        X_train,
                        y_train,
                        cv=10,
                        scoring=('roc_auc', 'average_precision'))
scores['test_roc_auc'].mean(), scores['test_average_precision'].mean()
# (0.9248526844001812, 0.6883592815252976)

######### Condensed Nearest Neighbor #########

from imblearn.under_sampling import CondensedNearestNeighbour

# opposite of ENN; iteratively adds points to the data that are misclassified by KNN

cnn = CondensedNearestNeighbour()
X_train_cnn, y_train_cnn = cnn.fit_sample(X_train, y_train)
print(X_train_cnn.shape)
print(np.bincount(y_train_cnn))

### Pipeline method

cnn_pipe = make_imb_pipeline(CondensedNearestNeighbour(), LogisticRegression())

scores = cross_validate(cnn_pipe,
                        X_train,
                        y_train,
                        cv=10,
                        scoring=('roc_auc', 'average_precision'))
pd.DataFrame(scores)[['test_roc_auc', 'test_average_precision']].mean()
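
# Design note: wrapping the sampler in make_imb_pipeline matters for the
# cross_validate calls above. imblearn's Pipeline applies resampling only
# while fitting each training fold and leaves the held-out fold untouched,
# so the reported scores are free of resampling leakage.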
Example #17
def Sampling(X, y, method):
    """
    function to sample imbalanced dataset:

    Arguments:
    X -- trainset features
    y -- trainset labels
    method -- sampling method

    Return:
    X_res -- sampled trainset features
    y_res -- sampled trainset labels
    """

    #Under-sampling:
    if method == 'RandomUnderSampler':
        from imblearn.under_sampling import RandomUnderSampler
        us = RandomUnderSampler()
        X_res, y_res = us.fit_resample(X, y)

    elif method == 'TomekLinks':
        from imblearn.under_sampling import TomekLinks
        us = TomekLinks()
        X_res, y_res = us.fit_resample(X, y)

    elif method == 'OneSidedSelection':
        from imblearn.under_sampling import OneSidedSelection
        us = OneSidedSelection()
        X_res, y_res = us.fit_resample(X, y)

    elif method == 'NeighbourhoodCleaningRule':
        from imblearn.under_sampling import NeighbourhoodCleaningRule
        us = NeighbourhoodCleaningRule()
        X_res, y_res = us.fit_resample(X, y)

    elif method == 'NearMiss':
        from imblearn.under_sampling import NearMiss
        us = NearMiss()
        X_res, y_res = us.fit_resample(X, y)

    elif method == 'InstanceHardnessThreshold':
        from imblearn.under_sampling import InstanceHardnessThreshold
        us = InstanceHardnessThreshold()
        X_res, y_res = us.fit_resample(X, y)

    elif method == 'AllKNN':
        from imblearn.under_sampling import AllKNN
        us = AllKNN()
        X_res, y_res = us.fit_resample(X, y)

    elif method == 'RepeatedEditedNearestNeighbours':
        from imblearn.under_sampling import RepeatedEditedNearestNeighbours
        us = RepeatedEditedNearestNeighbours()
        X_res, y_res = us.fit_resample(X, y)

    elif method == 'EditedNearestNeighbours':
        from imblearn.under_sampling import EditedNearestNeighbours
        us = EditedNearestNeighbours()
        X_res, y_res = us.fit_resample(X, y)

    elif method == 'CondensedNearestNeighbour':
        from imblearn.under_sampling import CondensedNearestNeighbour
        us = CondensedNearestNeighbour()
        X_res, y_res = us.fit_resample(X, y)

    # Combination of over- and under-sampling:
    elif method == 'SMOTEENN':
        from imblearn.combine import SMOTEENN
        us = SMOTEENN()
        X_res, y_res = us.fit_resample(X, y)

    elif method == 'SMOTETomek':
        from imblearn.combine import SMOTETomek
        us = SMOTETomek()
        X_res, y_res = us.fit_resample(X, y)

    return X_res, y_res
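
# Usage sketch (X_train and y_train are assumed to be defined elsewhere):
X_res, y_res = Sampling(X_train, y_train, 'CondensedNearestNeighbour')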
Example #18
def balance_cnn(input):
    input_x, input_y = input
    cnn = CondensedNearestNeighbour(random_state=42)
    X_res, y_res = cnn.fit_sample(input_x, input_y)
    return X_res, y_res
Example #19
    def condensedNearestNeighbour(self, n_neighbors=1):

        under = CondensedNearestNeighbour(n_neighbors=n_neighbors)
        self.steps.append(('u', under))
        return resampling(data=self.data, target=self.target, steps=self.steps)
Example #20
def resampling_assigner(imb_technique, AA_ova_X_train, AA_ova_y_train,
                        AI_ova_X_train, AI_ova_y_train, AW_ova_X_train,
                        AW_ova_y_train, CC_ova_X_train, CC_ova_y_train,
                        QA_ova_X_train, QA_ova_y_train):
    print(imb_technique)
    if imb_technique == "ADASYN":
        AA_ada, AI_ada, AW_ada, CC_ada, QA_ada = ADASYN(), ADASYN(), ADASYN(
        ), ADASYN(), ADASYN()
        AA_X_res, AA_y_res = AA_ada.fit_resample(AA_ova_X_train,
                                                 AA_ova_y_train)
        AI_X_res, AI_y_res = AI_ada.fit_resample(AI_ova_X_train,
                                                 AI_ova_y_train)
        AW_X_res, AW_y_res = AW_ada.fit_resample(AW_ova_X_train,
                                                 AW_ova_y_train)
        CC_X_res, CC_y_res = CC_ada.fit_resample(CC_ova_X_train,
                                                 CC_ova_y_train)
        QA_X_res, QA_y_res = QA_ada.fit_resample(QA_ova_X_train,
                                                 QA_ova_y_train)
    elif imb_technique == "ALLKNN":
        AA_allknn, AI_allknn, AW_allknn, CC_allknn, QA_allknn = AllKNN(
        ), AllKNN(), AllKNN(), AllKNN(), AllKNN()
        AA_X_res, AA_y_res = AA_allknn.fit_resample(AA_ova_X_train,
                                                    AA_ova_y_train)
        AI_X_res, AI_y_res = AI_allknn.fit_resample(AI_ova_X_train,
                                                    AI_ova_y_train)
        AW_X_res, AW_y_res = AW_allknn.fit_resample(AW_ova_X_train,
                                                    AW_ova_y_train)
        CC_X_res, CC_y_res = CC_allknn.fit_resample(CC_ova_X_train,
                                                    CC_ova_y_train)
        QA_X_res, QA_y_res = QA_allknn.fit_resample(QA_ova_X_train,
                                                    QA_ova_y_train)
    elif imb_technique == "CNN":
        AA_cnn, AI_cnn, AW_cnn, CC_cnn, QA_cnn = CondensedNearestNeighbour(
        ), CondensedNearestNeighbour(), CondensedNearestNeighbour(
        ), CondensedNearestNeighbour(), CondensedNearestNeighbour()
        AA_X_res, AA_y_res = AA_cnn.fit_resample(AA_ova_X_train,
                                                 AA_ova_y_train)
        AI_X_res, AI_y_res = AI_cnn.fit_resample(AI_ova_X_train,
                                                 AI_ova_y_train)
        AW_X_res, AW_y_res = AW_cnn.fit_resample(AW_ova_X_train,
                                                 AW_ova_y_train)
        CC_X_res, CC_y_res = CC_cnn.fit_resample(CC_ova_X_train,
                                                 CC_ova_y_train)
        QA_X_res, QA_y_res = QA_cnn.fit_resample(QA_ova_X_train,
                                                 QA_ova_y_train)
    elif imb_technique == "ENN":
        AA_enn, AI_enn, AW_enn, CC_enn, QA_enn = EditedNearestNeighbours(
        ), EditedNearestNeighbours(), EditedNearestNeighbours(
        ), EditedNearestNeighbours(), EditedNearestNeighbours()
        AA_X_res, AA_y_res = AA_enn.fit_resample(AA_ova_X_train,
                                                 AA_ova_y_train)
        AI_X_res, AI_y_res = AI_enn.fit_resample(AI_ova_X_train,
                                                 AI_ova_y_train)
        AW_X_res, AW_y_res = AW_enn.fit_resample(AW_ova_X_train,
                                                 AW_ova_y_train)
        CC_X_res, CC_y_res = CC_enn.fit_resample(CC_ova_X_train,
                                                 CC_ova_y_train)
        QA_X_res, QA_y_res = QA_enn.fit_resample(QA_ova_X_train,
                                                 QA_ova_y_train)
    elif imb_technique == "IHT":
        AA_iht, AI_iht, AW_iht, CC_iht, QA_iht = InstanceHardnessThreshold(
        ), InstanceHardnessThreshold(), InstanceHardnessThreshold(
        ), InstanceHardnessThreshold(), InstanceHardnessThreshold()
        AA_X_res, AA_y_res = AA_iht.fit_resample(AA_ova_X_train,
                                                 AA_ova_y_train)
        AI_X_res, AI_y_res = AI_iht.fit_resample(AI_ova_X_train,
                                                 AI_ova_y_train)
        AW_X_res, AW_y_res = AW_iht.fit_resample(AW_ova_X_train,
                                                 AW_ova_y_train)
        CC_X_res, CC_y_res = CC_iht.fit_resample(CC_ova_X_train,
                                                 CC_ova_y_train)
        QA_X_res, QA_y_res = QA_iht.fit_resample(QA_ova_X_train,
                                                 QA_ova_y_train)
    elif imb_technique == "NCR":
        AA_ncr, AI_ncr, AW_ncr, CC_ncr, QA_ncr = NeighbourhoodCleaningRule(
        ), NeighbourhoodCleaningRule(), NeighbourhoodCleaningRule(
        ), NeighbourhoodCleaningRule(), NeighbourhoodCleaningRule()
        AA_ova_y_train = [
            0 if i == "Accepted/Assigned" else 1 for i in AA_ova_y_train
        ]
        AA_X_res, AA_y_res = AA_ncr.fit_resample(AA_ova_X_train,
                                                 AA_ova_y_train)
        AI_ova_y_train = [
            0 if i == "Accepted/In Progress" else 1 for i in AI_ova_y_train
        ]
        AI_X_res, AI_y_res = AI_ncr.fit_resample(AI_ova_X_train,
                                                 AI_ova_y_train)
        AW_ova_y_train = [
            0 if i == "Accepted/Wait" else 1 for i in AW_ova_y_train
        ]
        AW_X_res, AW_y_res = AW_ncr.fit_resample(AW_ova_X_train,
                                                 AW_ova_y_train)
        CC_ova_y_train = [
            0 if i == "Completed/Closed" else 1 for i in CC_ova_y_train
        ]
        CC_X_res, CC_y_res = CC_ncr.fit_resample(CC_ova_X_train,
                                                 CC_ova_y_train)
        QA_ova_y_train = [
            0 if i == "Queued/Awaiting Assignment" else 1
            for i in QA_ova_y_train
        ]
        QA_X_res, QA_y_res = QA_ncr.fit_resample(QA_ova_X_train,
                                                 QA_ova_y_train)
    elif imb_technique == "NM":
        AA_nm, AI_nm, AW_nm, CC_nm, QA_nm = NearMiss(), NearMiss(), NearMiss(
        ), NearMiss(), NearMiss()
        AA_X_res, AA_y_res = AA_nm.fit_resample(AA_ova_X_train, AA_ova_y_train)
        AI_X_res, AI_y_res = AI_nm.fit_resample(AI_ova_X_train, AI_ova_y_train)
        AW_X_res, AW_y_res = AW_nm.fit_resample(AW_ova_X_train, AW_ova_y_train)
        CC_X_res, CC_y_res = CC_nm.fit_resample(CC_ova_X_train, CC_ova_y_train)
        QA_X_res, QA_y_res = QA_nm.fit_resample(QA_ova_X_train, QA_ova_y_train)
    elif imb_technique == "OSS":
        AA_oss, AI_oss, AW_oss, CC_oss, QA_oss = OneSidedSelection(
        ), OneSidedSelection(), OneSidedSelection(), OneSidedSelection(
        ), OneSidedSelection()
        AA_X_res, AA_y_res = AA_oss.fit_resample(AA_ova_X_train,
                                                 AA_ova_y_train)
        AI_X_res, AI_y_res = AI_oss.fit_resample(AI_ova_X_train,
                                                 AI_ova_y_train)
        AW_X_res, AW_y_res = AW_oss.fit_resample(AW_ova_X_train,
                                                 AW_ova_y_train)
        CC_X_res, CC_y_res = CC_oss.fit_resample(CC_ova_X_train,
                                                 CC_ova_y_train)
        QA_X_res, QA_y_res = QA_oss.fit_resample(QA_ova_X_train,
                                                 QA_ova_y_train)
    elif imb_technique == "RENN":
        AA_renn, AI_renn, AW_renn, CC_renn, QA_renn = RepeatedEditedNearestNeighbours(
        ), RepeatedEditedNearestNeighbours(), RepeatedEditedNearestNeighbours(
        ), RepeatedEditedNearestNeighbours(), RepeatedEditedNearestNeighbours(
        )
        AA_X_res, AA_y_res = AA_renn.fit_resample(AA_ova_X_train,
                                                  AA_ova_y_train)
        AI_X_res, AI_y_res = AI_renn.fit_resample(AI_ova_X_train,
                                                  AI_ova_y_train)
        AW_X_res, AW_y_res = AW_renn.fit_resample(AW_ova_X_train,
                                                  AW_ova_y_train)
        CC_X_res, CC_y_res = CC_renn.fit_resample(CC_ova_X_train,
                                                  CC_ova_y_train)
        QA_X_res, QA_y_res = QA_renn.fit_resample(QA_ova_X_train,
                                                  QA_ova_y_train)
    elif imb_technique == "SMOTE":
        AA_sm, AI_sm, AW_sm, CC_sm, QA_sm = SMOTE(), SMOTE(), SMOTE(), SMOTE(
        ), SMOTE()
        AA_X_res, AA_y_res = AA_sm.fit_resample(AA_ova_X_train, AA_ova_y_train)
        AI_X_res, AI_y_res = AI_sm.fit_resample(AI_ova_X_train, AI_ova_y_train)
        AW_X_res, AW_y_res = AW_sm.fit_resample(AW_ova_X_train, AW_ova_y_train)
        CC_X_res, CC_y_res = CC_sm.fit_resample(CC_ova_X_train, CC_ova_y_train)
        QA_X_res, QA_y_res = QA_sm.fit_resample(QA_ova_X_train, QA_ova_y_train)
    elif imb_technique == "BSMOTE":
        AA_bsm, AI_bsm, AW_bsm, CC_bsm, QA_bsm = BorderlineSMOTE(
        ), BorderlineSMOTE(), BorderlineSMOTE(), BorderlineSMOTE(
        ), BorderlineSMOTE()
        AA_X_res, AA_y_res = AA_bsm.fit_resample(AA_ova_X_train,
                                                 AA_ova_y_train)
        AI_X_res, AI_y_res = AI_bsm.fit_resample(AI_ova_X_train,
                                                 AI_ova_y_train)
        AW_X_res, AW_y_res = AW_bsm.fit_resample(AW_ova_X_train,
                                                 AW_ova_y_train)
        CC_X_res, CC_y_res = CC_bsm.fit_resample(CC_ova_X_train,
                                                 CC_ova_y_train)
        QA_X_res, QA_y_res = QA_bsm.fit_resample(QA_ova_X_train,
                                                 QA_ova_y_train)
    elif imb_technique == "SMOTEENN":
        AA_smenn, AI_smenn, AW_smenn, CC_smenn, QA_smenn = SMOTEENN(
        ), SMOTEENN(), SMOTEENN(), SMOTEENN(), SMOTEENN()
        AA_X_res, AA_y_res = AA_smenn.fit_resample(AA_ova_X_train,
                                                   AA_ova_y_train)
        AI_X_res, AI_y_res = AI_smenn.fit_resample(AI_ova_X_train,
                                                   AI_ova_y_train)
        AW_X_res, AW_y_res = AW_smenn.fit_resample(AW_ova_X_train,
                                                   AW_ova_y_train)
        CC_X_res, CC_y_res = CC_smenn.fit_resample(CC_ova_X_train,
                                                   CC_ova_y_train)
        QA_X_res, QA_y_res = QA_smenn.fit_resample(QA_ova_X_train,
                                                   QA_ova_y_train)
    elif imb_technique == "SMOTETOMEK":
        AA_smtm, AI_smtm, AW_smtm, CC_smtm, QA_smtm = SMOTETomek(), SMOTETomek(
        ), SMOTETomek(), SMOTETomek(), SMOTETomek()
        AA_X_res, AA_y_res = AA_smtm.fit_resample(AA_ova_X_train,
                                                  AA_ova_y_train)
        AI_X_res, AI_y_res = AI_smtm.fit_resample(AI_ova_X_train,
                                                  AI_ova_y_train)
        AW_X_res, AW_y_res = AW_smtm.fit_resample(AW_ova_X_train,
                                                  AW_ova_y_train)
        CC_X_res, CC_y_res = CC_smtm.fit_resample(CC_ova_X_train,
                                                  CC_ova_y_train)
        QA_X_res, QA_y_res = QA_smtm.fit_resample(QA_ova_X_train,
                                                  QA_ova_y_train)
    elif imb_technique == "TOMEK":
        AA_tm, AI_tm, AW_tm, CC_tm, QA_tm = TomekLinks(), TomekLinks(
        ), TomekLinks(), TomekLinks(), TomekLinks()
        AA_X_res, AA_y_res = AA_tm.fit_resample(AA_ova_X_train, AA_ova_y_train)
        AI_X_res, AI_y_res = AI_tm.fit_resample(AI_ova_X_train, AI_ova_y_train)
        AW_X_res, AW_y_res = AW_tm.fit_resample(AW_ova_X_train, AW_ova_y_train)
        CC_X_res, CC_y_res = CC_tm.fit_resample(CC_ova_X_train, CC_ova_y_train)
        QA_X_res, QA_y_res = QA_tm.fit_resample(QA_ova_X_train, QA_ova_y_train)
    elif imb_technique == "ROS":
        AA_ros, AI_ros, AW_ros, CC_ros, QA_ros = RandomOverSampler(
        ), RandomOverSampler(), RandomOverSampler(), RandomOverSampler(
        ), RandomOverSampler()
        AA_X_res, AA_y_res = AA_ros.fit_resample(AA_ova_X_train,
                                                 AA_ova_y_train)
        AI_X_res, AI_y_res = AI_ros.fit_resample(AI_ova_X_train,
                                                 AI_ova_y_train)
        AW_X_res, AW_y_res = AW_ros.fit_resample(AW_ova_X_train,
                                                 AW_ova_y_train)
        CC_X_res, CC_y_res = CC_ros.fit_resample(CC_ova_X_train,
                                                 CC_ova_y_train)
        QA_X_res, QA_y_res = QA_ros.fit_resample(QA_ova_X_train,
                                                 QA_ova_y_train)
    elif imb_technique == "RUS":
        AA_rus, AI_rus, AW_rus, CC_rus, QA_rus = RandomUnderSampler(
        ), RandomUnderSampler(), RandomUnderSampler(), RandomUnderSampler(
        ), RandomUnderSampler()
        AA_X_res, AA_y_res = AA_rus.fit_resample(AA_ova_X_train,
                                                 AA_ova_y_train)
        AI_X_res, AI_y_res = AI_rus.fit_resample(AI_ova_X_train,
                                                 AI_ova_y_train)
        AW_X_res, AW_y_res = AW_rus.fit_resample(AW_ova_X_train,
                                                 AW_ova_y_train)
        CC_X_res, CC_y_res = CC_rus.fit_resample(CC_ova_X_train,
                                                 CC_ova_y_train)
        QA_X_res, QA_y_res = QA_rus.fit_resample(QA_ova_X_train,
                                                 QA_ova_y_train)
    return AA_X_res, AA_y_res, AI_X_res, AI_y_res, AW_X_res, AW_y_res, CC_X_res, CC_y_res, QA_X_res, QA_y_res
Example #21
x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5

#initial statistics
plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.Set1, edgecolor='k')
plt.xlabel('Sepal length')
plt.ylabel('Sepal width')

plt.xlim(x_min, x_max)
plt.ylim(y_min, y_max)
plt.xticks(())
plt.yticks(())
plt.show()
# normalizing sepal-length, sepal-width, petal-length and petal-width, and
# encoding the different species of iris
x = iris.data[:, 0:4]
y = iris.target
X_normalized = normalize(x, axis=0)
x_train, x_test, y_train, y_test = train_test_split(X_normalized,
                                                    y,
                                                    test_size=0.20)
cnn = CondensedNearestNeighbour(return_indices=True)
X_resampled, y_resampled, idx_resampled = cnn.fit_sample(X_normalized, y)
clf = KNeighborsClassifier(n_neighbors=1)
clf.fit(X_resampled, y_resampled)

y_pred = clf.predict(x_test)
print(confusion_matrix(y_test, y_pred))
target_names = ['Iris-setosa', 'Iris-versicolor', 'Iris-virginica']
print(classification_report(y_test, y_pred, target_names=target_names))
Example #22
        X = np.vstack((data_major, data_minor))
        y = np.hstack((y, np.ones(len(data_minor))))

        if i1 not in [0, 8]:
            if i1 == 1:
                res = RandomUnderSampler(random_state=seed)
            if i1 == 2:
                res = NearMiss(version=1)
            if i1 == 3:
                res = NearMiss(version=2)
            if i1 == 4:
                res = ClusterCentroids(random_state=seed)
            if i1 == 5:
                res = EditedNearestNeighbours()
            if i1 == 6:
                res = CondensedNearestNeighbour(random_state=seed)
            if i1 == 7:
                res = TomekLinks()
            X_re, y_re = res.fit_resample(X, y)
        else:
            if i1 == 0:
                X_re, y_re = X, y
            if i1 == 8:
                X_re, y_re = PSU(X, y)

        temp_color = []

        for i2 in range(len(y_re)):
            if y_re[i2] > 0:
                temp_color.append("chocolate")
            else:
Example #23
def test_cnn_init():
    cnn = CondensedNearestNeighbour(random_state=RND_SEED)

    assert_equal(cnn.n_seeds_S, 1)
    assert_equal(cnn.n_jobs, 1)
Example #24
            os.makedirs(out_dir)

        # save preprocessing transforms
        dump(self.df_imputer,
             open(os.path.join(out_dir, 'df_imputer.pkl'), 'wb'))
        dump(self.ord_enc, open(os.path.join(out_dir, 'ord_enc.pkl'), 'wb'))

        # save undersampling transforms
        for key, transform in self.undersamples.items():
            dump(transform,
                 open(os.path.join(out_dir, f'undersampler_{key}.pkl'), 'wb'))


if __name__ == '__main__':
    trans_data = transaction_table('../data/train_transaction.csv')
    print(trans_data.df.head(10))
    trans_data.add_undersampling_transform(
        'random',
        RandomUnderSampler(sampling_strategy='majority', random_state=0))
    trans_data.add_undersampling_transform(
        'CNN',
        CondensedNearestNeighbour(n_neighbors=1, random_state=0, n_jobs=-1))
    trans_data.add_undersampling_transform('Tomek', TomekLinks(n_jobs=-1))
    trans_data.add_undersampling_transform(
        'OSS',
        OneSidedSelection(n_neighbors=1,
                          n_seeds_S=200,
                          random_state=0,
                          n_jobs=-1))
    trans_data.save_trained_transforms()
Example #25
                GNG_RES = fill_table(GNG_RES, inds, j, interm, final)

            except Exception as e:
                print('an issue with {}, rate: {}, variant: {}'.format(
                    dataset, rate, variant))
                print(e)

        if CNN_FLAG:
            variant = 'CNN'
            print('>> {}, rate: {}, variant: {}'.format(
                dataset, rate, variant))
            try:
                dataset_size = X_train.shape[0]
                coreset_size = max(int(dataset_size * rate / 100), 1)
                startTime = datetime.now()
                cnn = CondensedNearestNeighbour(random_state=SEED)
                observers, total_labels = cnn.fit_sample(X_train, y_train)
                observers, total_labels = fix_rate(X_train, y_train, observers,
                                                   total_labels, coreset_size)
                interm = (datetime.now() - startTime).total_seconds()
                startTime = datetime.now()
                try:
                    neigh = KNeighborsClassifier(n_neighbors=5)
                except:
                    neigh = KNeighborsClassifier(n_neighbors=1)
                neigh.fit(observers, total_labels)
                y_pred = neigh.predict(X_test)
                final = (datetime.now() - startTime).total_seconds()
                inds = get_indices_(y_test, y_pred)
                CNN_RES = fill_table(CNN_RES, inds, j, interm, final)
Example #26
def under_sample_CondensedNearestNeighbour(train_inputs, train_targets):
    sampler = CondensedNearestNeighbour(random_state=32)
    train_inputs, train_targets = _sampler_helper(sampler, train_inputs,
                                                  train_targets)
    return train_inputs, train_targets
Example #27
# ``CondensedNearestNeighbour`` is sensitive to noise because it preserves
# noisy samples. ``OneSidedSelection`` also uses 1-NN, but applies
# ``TomekLinks`` to remove the samples considered noisy. The
# ``NeighbourhoodCleaningRule`` uses ``EditedNearestNeighbours`` to remove
# some samples; additionally, it uses a 3 nearest-neighbors rule to remove
# samples which do not agree with this rule.

fig, ((ax1, ax2), (ax3, ax4), (ax5, ax6)) = plt.subplots(3,
                                                         2,
                                                         figsize=(15, 25))
X, y = create_dataset(n_samples=500, weights=(0.2, 0.3, 0.5), class_sep=0.8)

ax_arr = ((ax1, ax2), (ax3, ax4), (ax5, ax6))
for ax, sampler in zip(
        ax_arr,
    (
        CondensedNearestNeighbour(random_state=0),
        OneSidedSelection(random_state=0),
        NeighbourhoodCleaningRule(),
    ),
):
    clf = make_pipeline(sampler, LinearSVC())
    clf.fit(X, y)
    plot_decision_function(X, y, clf, ax[0])
    ax[0].set_title(f"Decision function for {sampler.__class__.__name__}")
    plot_resampling(X, y, sampler, ax[1])
    ax[1].set_title(f"Resampling using {sampler.__class__.__name__}")
fig.tight_layout()

###############################################################################
# ``InstanceHardnessThreshold`` uses the predictions of a classifier to
# exclude samples. All samples which are classified with a low probability
# will be removed.
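
# A minimal self-contained sketch of that idea; the LogisticRegression
# estimator below is an assumption, not taken from the original example.
from sklearn.linear_model import LogisticRegression
from imblearn.under_sampling import InstanceHardnessThreshold

iht = InstanceHardnessThreshold(estimator=LogisticRegression(), random_state=0)
X_res, y_res = iht.fit_resample(X, y)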
Example #28
def main():
    print("Symptom prediction")
    global variable_file
    print("Connecting to ML4H DB..")

    conn = pymysql.connect(host='nightmare.cs.uct.ac.za', port=3306, user='******', passwd='oesaerex', db='ochomo001')

    print("Connected")

    cur = conn.cursor()
    print("Executing SQL query..")
    #Query to get the totals of all the symptoms that a patient has reported to the clinic
    #Used as features to predict the next possible that a patient will report
    #cur.execute("select person_id, count(*) AS Tot_Num_Sympt, sum(case when value_coded_name_id =110 then 1 else 0 end) AS Cough, sum(case when value_coded_name_id = 4315 then 1 else 0 end) AS Fever, sum(case when value_coded_name_id = 156 then 1 else 0 end) AS Abdominal_pain, sum(case when value_coded_name_id = 524 then 1 else 0 end) AS skin_rash,sum(case when value_coded_name_id=1576 then 1 else 0 end) AS Lactic_acisdosis, sum(case when value_coded_name_id=2325 then 1 else 0 end) AS Lipodystrophy,sum(case when value_coded_name_id=3 then 1 else 0 end) AS Anemia,sum(case when value_coded_name_id=888 then 1 else 0 end) AS Anorexia,sum(case when value_coded_name_id=11335 then 1 else 0 end) AS Cough_any_duration,sum(case when value_coded_name_id=17 then 1 else 0 end) AS Diarrhea,sum(case when value_coded_name_id=10894 then 1 else 0 end) AS Leg_pain,sum(case when value_coded_name_id=4407 then 1 else 0 end) AS Night_Sweats,sum(case when value_coded_name_id=9345 then 1 else 0 end) AS Other,sum(case when value_coded_name_id=838 then 1 else 0 end) AS Peripheral_neuropathy,sum(case when value_coded_name_id=4355 then 1 else 0 end) AS Vomiting,sum(case when value_coded_name_id=11333 then 1 else 0 end) AS Weight_loss,min(obs_datetime), max(obs_datetime) from Obs_artvisit_sympt a inner join concept_name n on a.value_coded_name_id=n.concept_name_id group by person_id")
    cur.execute("select * from ML4H_symptom_totals")
    print("Executed")

    #Loading features
    patient_ids = []
    sum_of_features = []
    for k in range( len(Patient.features) ):
        sum_of_features.append(0)

    patients = {}
    print("loading in data into program...")
    for row in cur:
        if (row[0] not in patient_ids):
            patient_ids.append(row[0])
            patients[row[0]] = (Patient(int(row[0])))
        patients[row[0]].feature_symptom_array = [ int(row[2]) + int(row[10]),int(row[3]),int(row[4]),int(row[5]),int(row[6]),
                                                   int(row[7]), int(row[8]), int(row[9]), int(row[11]),
                                                   int(row[12]), int(row[13]), int(row[14]), int(row[15]),int(row[16]),
                                                   int(row[17]), 0, 0, 0 ]
        #To check the balance of features
        for i in range( len(sum_of_features)- 3 ):
            if i == 8: #Exception for Cough for duration - joining with Cough feature
                sum_of_features[0] += int(row[i + 2])
            elif i > 8:
                sum_of_features[i-1]+= int(row[i + 2])
            else:
                sum_of_features[i] += int(row[i + 2])

    cur.close()  # closed only after the rows above have been consumed

    cur1 = conn.cursor()
    print("Executing SQL query..")
    #Gets the last reported symptom that a patient reported - RESULT CLASS (Trying to predict this)
    cur1.execute("select person_id, value_coded_name_id,name, MAX(obs_datetime) from Obs_artvisit_sympt a inner join concept_name n on a.value_coded_name_id = n.concept_name_id where value_coded_name_id IN (524,110,4315,156,3,888,11335,17,2325,4407,10894,9345,838,4355,11333) Group by person_id")
    print("Executed")

    #Loads result set - last symptom reported by patient
    for row in cur1:
        if row[2] == "None" or row[2] is None:
            continue
        if (row[0] not in patient_ids):
            patient_ids.append( row[0] )
            patients[row[0]] = (Patient( int(row[0])) )
        if row[2] == "Cough of any duration":
            patients[row[0]].last_symptom = "Cough"
        else:
            patients[row[0]].last_symptom = row[2]
        patients[row[0]].set_sympt_class() #Indexing - Binarizing classes for ROC scores later on


    print("Loaded data")
    print()
    print("Patient Ids:")
    #print(patient_ids)
    print("Patient objects")
    #print(patients)

    # Adding temporal aspect and adding more patient specific  - Sex, Age, Last Drug, num of symptoms in prev month
    # If last reported symptom is longer than a specified number of days(eg. 30) then change result to No symptom
    # Reason: Too long after to be helpful i.e predicting that someone will eventually report a skin rash is not as helpful
    # as predicting a skin rash next month.
    print("Executing temporal query...")
    cur4 = temporal_symptoms_ML.query_for_40_day_prev_symptoms_FROM_TABLE(conn)
    #cur4 = temporal_symptoms_ML.query_for_10_day_prev_symptoms(conn)
    #cur4 = temporal_symptoms_ML.query_for_70_day_prev_symptoms(conn)
    for row in cur4:
        if  (row[0] in patient_ids):
            if int(row[3]) < 11 and patients[row[0]].last_symptom != "None"  :
                patients[row[0]].last_symptom = "No symptoms"
                patients[row[0]].set_sympt_class()  # Reindexing - Binarizing classes for ROC scores later on
            patients[row[0]].feature_symptom_array[ Patient.features.index("Age") ] = int( row[5].year )
            patients[row[0]].feature_symptom_array[ Patient.features.index("Last Drug") ] = int( row[6] )
            patients[row[0]].feature_symptom_array[ Patient.features.index("Tot Prev Month Symptoms") ] = int( row[7] )
            if row[3] is None:
                patients[row[0]].time_between_symptom_reports = 1000
            elif int(row[3]) > 60:
                patients[row[0]].time_between_symptom_reports = 1000
            else:
                patients[row[0]].time_between_symptom_reports = int(row[3])

    print("Executed.")

    conn.close()

    #Load all data into array for ML application
    ml4hX = []
    ml4hY = []
    ml4hY_multiclass = []
    ml4hY_multilabel = []
    for id in patient_ids:
        if patients[id].check_if_null_features():
            continue
        if patients[id].last_symptom == "None":
            continue
        if patients[id].feature_symptom_array[ Patient.features.index("Age") ] == 0:
            continue
        ml4hX.append( patients[id].feature_symptom_array )
        ml4hY.append(patients[id].last_symptom)
        ml4hY_multiclass.append(patients[id].last_symptom_class)
        ml4hY_multilabel.append( patients[id].time_between_symptom_reports )

    print("Feature set:")
    #print(ml4hX)
    print("Result set:")
    #print(ml4hY)

    print()
    print(len(ml4hX))
    print(len(ml4hY))


    # Opening significance file for writing
    variable_file = open("symptom_variable_significance.csv",'w')
    for symptom in Patient.features:
        variable_file.write("," + symptom)
    variable_file.write("\n")


    print("Orig")
    check_symptom_result_distribution(ml4hY)
    print()
    X_train1, X_validation1, Y_train1, Y_validation1 = model_selection.train_test_split(ml4hX, ml4hY_multilabel,
                                                                                        test_size=0.2,
                                                                                        random_state=7)
    kfold2 = model_selection.KFold(n_splits=10)  # random_state dropped: it has no effect (and raises in recent sklearn) when shuffle=False
    cv = RandomForestRegressor()
    cv.fit( X_train1,Y_train1 )
    print( explained_variance_score( Y_validation1, cv.predict(X_validation1) ) )
    for h in range(15):
        print(X_validation1[h])
        print("True :" + str(Y_validation1[h]) + " Pred : " + str(cv.predict([X_validation1[h] ])) )

    #cv_results2 = model_selection.cross_val_score(RandomForestClassifier(), X_train1, Y_train1, cv=kfold2, scoring='accuracy')
    #print(cv_results2.mean())

    print("ORIGINAL")
    apply_machine_learning_techniques(ml4hX,ml4hY_multiclass, "Original")

    # Trying different samplers
    samplers = [
        ["RandomUnderSampler_0.6", RandomUnderSampler()],
        ["NearMiss_0.025", NearMiss(ratio=0.005)],
        #[ "RandomOver_0.3",  RandomOverSampler() ],
        ["CondensedNearestNeighbour0.3", CondensedNearestNeighbour(ratio=0.3)],
        #["RepeatedEditedNearestNeighbours0.2",RepeatedEditedNearestNeighbours(ratio=0.2)],
        #["ALLKNN_0.4",AllKNN(ratio=0.005)],
        ["TomekLinks_0.3", TomekLinks(ratio=0.005)]
    ]

    for sampler in samplers:
        print(sampler[0])
        X_resamp, Y_resamp = sampler[1].fit_sample(ml4hX, ml4hY_multiclass)
        # check_symptom_result_distribution(Y_resamp)
        # print()
        apply_machine_learning_techniques(X_resamp, Y_resamp, sampler[0])
        print("............")


    nearmiss = NearMiss(ratio=0.03)
    X_resampled_nm, y_resampled_nm = nearmiss.fit_sample(ml4hX, ml4hY_multiclass)
    # check_symptom_result_distribution(y_resampled_nm)
    # print()

    print("RandomOVer")
    randomOVer = RandomOverSampler()
    X_resampled_ranO, y_resampled_ranO = randomOVer.fit_sample(X_resampled_nm, y_resampled_nm)
    check_symptom_result_distribution(y_resampled_ranO)
    print()

    print("NEAR MISS ADJUSTED OVERSAMPLE")
    apply_machine_learning_techniques(X_resampled_ranO, y_resampled_ranO, "NM_ADJ_Over")

    print("feature distribution")
    for j in range(len(sum_of_features)):
        print(Patient.features[j] + " : " + str(sum_of_features[j]))

    print()
    print("result class distribution")

    check_symptom_result_distribution(ml4hY)
    variable_file.close()
Example #29


# Generate the dataset
X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9],
                           n_informative=3, n_redundant=1, flip_y=0,
                           n_features=20, n_clusters_per_class=1,
                           n_samples=5000, random_state=10)

# Instantiate a PCA object for the sake of easy visualisation
pca = PCA(n_components=2)
# Fit and transform x to visualise inside a 2D feature space
X_vis = pca.fit_transform(X)

# Apply Condensed Nearest Neighbours
cnn = CondensedNearestNeighbour()
X_resampled, y_resampled = cnn.fit_sample(X, y)
X_res_vis = pca.transform(X_resampled)

# Two subplots, unpack the axes array immediately
f, (ax1, ax2) = plt.subplots(1, 2)

ax1.scatter(X_vis[y == 0, 0], X_vis[y == 0, 1], label="Class #0", alpha=0.5,
            edgecolor=almost_black, facecolor=palette[0], linewidth=0.15)
ax1.scatter(X_vis[y == 1, 0], X_vis[y == 1, 1], label="Class #1", alpha=0.5,
            edgecolor=almost_black, facecolor=palette[2], linewidth=0.15)
ax1.set_title('Original set')

ax2.scatter(X_res_vis[y_resampled == 0, 0], X_res_vis[y_resampled == 0, 1],
            label="Class #0", alpha=.5, edgecolor=almost_black,
            facecolor=palette[0], linewidth=0.15)
Example #30
def test_cnn_init():
    cnn = CondensedNearestNeighbour(random_state=RND_SEED)

    assert cnn.n_seeds_S == 1
    assert cnn.n_jobs == 1