Example #1
def test_pipeline_sample():
    # Test whether pipeline works with a sampler at the end.
    # Also test pipeline.sampler
    X, y = make_classification(n_classes=2,
                               class_sep=2,
                               weights=[0.1, 0.9],
                               n_informative=3,
                               n_redundant=1,
                               flip_y=0,
                               n_features=20,
                               n_clusters_per_class=1,
                               n_samples=5000,
                               random_state=0)

    rus = RandomUnderSampler(random_state=0)
    pipeline = Pipeline([('rus', rus)])

    # test fit_resample through the pipeline:
    X_trans, y_trans = pipeline.fit_resample(X, y)
    X_trans2, y_trans2 = rus.fit_resample(X, y)
    assert_allclose(X_trans, X_trans2, rtol=R_TOL)
    assert_allclose(y_trans, y_trans2, rtol=R_TOL)

    pca = PCA()
    pipeline = Pipeline([('pca', PCA()), ('rus', rus)])

    X_trans, y_trans = pipeline.fit_resample(X, y)
    X_pca = pca.fit_transform(X)
    X_trans2, y_trans2 = rus.fit_resample(X_pca, y)
    # Round values that are numerically close to zero down to exactly zero;
    # PCA can leave tiny residues there that would otherwise break the comparison.
    X_trans[np.bitwise_and(X_trans < R_TOL, X_trans > -R_TOL)] = 0
    X_trans2[np.bitwise_and(X_trans2 < R_TOL, X_trans2 > -R_TOL)] = 0
    assert_allclose(X_trans, X_trans2, rtol=R_TOL)
    assert_allclose(y_trans, y_trans2, rtol=R_TOL)
def svmsampler(X, y, over_pct=0.1, under_pct=1):
    over = SVMSMOTE(random_state=42, sampling_strategy=over_pct)
    under = RandomUnderSampler(random_state=42, sampling_strategy=under_pct)
    steps = [('o', over), ('u', under)]
    pipeline = Pipeline(steps=steps)
    X, y = pipeline.fit_resample(X, y)
    return X, y
Example #3
def smote_under(x_train, y_train, smote_ss=0.25, under_ss=0.75, rs_val=42):
    """
    Creates artificial training dataset data points for "1" label,
    undersamples "0" label.

    Input:
    x_train: Training dataset features.
    y_train: Training dataset labels.
    smote_ss: Percentage of minority label in artificial dataset.
    under_ss: Percentage of majority label that will be kept in
              artificial dataset.
    rs_val: Random state value.

    Output:
    x_train: Features for artificial training dataset.
    y_train: Labels for artificial training dataset.
    """
    #   Create list of column names for x_train and y_train
    x_cols = list(x_train.columns)
    y_cols = list(y_train.columns)

    #   Create artificial SMOTE data points for minority label,
    #   undersample majority label.
    over = SMOTE(sampling_strategy=smote_ss, random_state=rs_val)
    under = RandomUnderSampler(sampling_strategy=under_ss, random_state=rs_val)
    steps = [('o', over), ('u', under)]
    pipeline = Pipeline(steps=steps)
    x_train, y_train = pipeline.fit_resample(x_train, y_train)

    #   Change new dataset into Pandas dataframes.
    x_train = pd.DataFrame(x_train, columns=x_cols)
    y_train = pd.DataFrame(y_train, columns=y_cols)
    return x_train, y_train
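
# A minimal usage sketch for smote_under, assuming toy data built with
# scikit-learn's make_classification; the column names 'f0'..'f19' and 'label'
# are hypothetical placeholders for illustration only:
from collections import Counter

import pandas as pd
from sklearn.datasets import make_classification

X_arr, y_arr = make_classification(n_samples=2000, weights=[0.95], flip_y=0,
                                   random_state=0)
x_demo = pd.DataFrame(X_arr, columns=[f'f{i}' for i in range(X_arr.shape[1])])
y_demo = pd.DataFrame(y_arr, columns=['label'])
x_bal, y_bal = smote_under(x_demo, y_demo, smote_ss=0.25, under_ss=0.75)
# SMOTE grows the "1" class to 25% of the "0" class, then the undersampler
# trims the "0" class down to a 0.75 minority/majority ratio.
print(Counter(y_bal['label']))
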
def run_smote_oversampling_and_undersampling():
    # Define dataset
    X, y = make_classification(n_samples=10000,
                               n_features=2,
                               n_redundant=0,
                               n_clusters_per_class=1,
                               weights=[0.99],
                               flip_y=0,
                               random_state=1)

    # Summarize class distribution
    counter = Counter(y)
    print(counter)

    # Define pipeline
    over = SMOTE(sampling_strategy=0.1)
    under = RandomUnderSampler(sampling_strategy=0.5)
    steps = [('o', over), ('u', under)]
    pipeline = Pipeline(steps=steps)

    # Transform the dataset
    X, y = pipeline.fit_resample(X, y)

    # Summarize the new class distribution
    counter = Counter(y)
    print(counter)

    # Scatter plot of examples by class label
    for label, _ in counter.items():
        row_ix = where(y == label)[0]
        pyplot.scatter(X[row_ix, 0], X[row_ix, 1], label=str(label))

    pyplot.legend()
    pyplot.show()
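
# Back-of-the-envelope check of the counts this pipeline should report (a hedged
# sketch; it assumes the ~99:1 split produced by weights=[0.99] with flip_y=0):
n_majority, n_minority = 9900, 100
n_minority_after_smote = int(0.1 * n_majority)              # SMOTE targets minority/majority = 0.1 -> ~990
n_majority_after_under = int(n_minority_after_smote / 0.5)  # undersampler targets a 0.5 ratio -> ~1980
print(n_minority_after_smote, n_majority_after_under)       # roughly Counter({0: 1980, 1: 990}) after resampling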
Example #5
def smote(X, y):
    over = SMOTE(sampling_strategy=0.9)
    under = RandomUnderSampler(sampling_strategy=0.9)
    steps = [('o', over), ('u', under)]
    pipeline = Pipeline(steps=steps)
    X, y = pipeline.fit_resample(X, y)
    return X, y
Example #6
    def sample_data(self, Xtrain, ytrain, over=0.4, under=0.7):

        print("Sampling data...")
        over = SMOTE(sampling_strategy=over)
        under = RandomUnderSampler(sampling_strategy=under)
        steps = [("o", over), ("u", under)]
        pipeline = Pipeline(steps=steps)
        Xtrain, ytrain = pipeline.fit_resample(Xtrain, ytrain)

        return Xtrain, ytrain
def sample_data(x, y, choice):
    seed = 42
    k = 8

    if choice == 'both':
        # over = SMOTE(sampling_strategy='auto', k_neighbors=k, random_state=seed)
        # under = RandomUnderSampler(sampling_strategy='auto')

        # strategy = {0: 1000, 1: 1000, 2: 1000}
        # over = RandomOverSampler(sampling_strategy=strategy, random_state=seed)
        # under = TomekLinks(sampling_strategy='majority')
        choice = 'Over_and_Under_Sampling'

        over = RandomOverSampler(sampling_strategy=0.5, random_state=seed)
        # TomekLinks is a cleaning method and does not accept a float ratio,
        # so use 'majority' (as in the commented-out variant above).
        under = TomekLinks(sampling_strategy='majority')

        steps = [('o', over), ('u', under)]
        pipeline = Pipeline(steps=steps)
        x, y = pipeline.fit_resample(x, y)
    elif choice == 'u':
        choice = 'Under_Sampling'
        print('Performing Random Under Sample')
        strategy = 'auto'
        under = RandomUnderSampler(sampling_strategy=strategy)
        steps = [('u', under)]
        pipeline = Pipeline(steps=steps)
        x, y = pipeline.fit_resample(x, y)
    elif choice == 'o':
        choice = 'Over_Sampling'
        print('Performing Random Over Sample')
        over = RandomOverSampler(random_state=seed)
        steps = [('o', over)]
        pipeline = Pipeline(steps=steps)
        x, y = pipeline.fit_resample(x, y)
    elif choice == 'smote':
        choice = 'SMOTE'
        strategy = 'auto'
        smote_over_sample = SMOTE(sampling_strategy=strategy,
                                  k_neighbors=k,
                                  random_state=seed)
        x, y = smote_over_sample.fit_resample(x, y)

    return x, y, choice
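
# A hedged usage sketch for sample_data; X_demo/y_demo are synthetic stand-ins
# for whatever feature matrix and label vector the caller already has:
from collections import Counter
from sklearn.datasets import make_classification

X_demo, y_demo = make_classification(n_samples=3000, n_classes=3, n_informative=5,
                                     weights=[0.8, 0.15, 0.05], random_state=42)
X_bal, y_bal, strategy_name = sample_data(X_demo, y_demo, 'smote')
print(strategy_name, Counter(y_bal))  # 'SMOTE', classes roughly balanced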
Example #8
def get_SMOTE_UnderSampler(X, Y, do_debug=False):
    pipeline = Pipeline(steps=[('o', SMOTE(sampling_strategy=0.1)),
                               ('u', RandomUnderSampler(sampling_strategy=0.5))])
    X_Sampled, Y_Sampled = pipeline.fit_resample(X, Y)
    if do_debug:
        x_range = numpy.array([X[:, 0].min(), X[:, 0].max()])
        y_range = numpy.array([X[:, 1].min(), X[:, 1].max()])
        df_sampled = pd.DataFrame(
            numpy.concatenate((Y_Sampled.reshape(-1, 1), X_Sampled), axis=1),
            columns=['Y', 'x0', 'x1'])
        df = pd.DataFrame(
            numpy.concatenate((Y.reshape(-1, 1), X), axis=1),
            columns=['Y', 'x0', 'x1'])
        customPalette = ['#808080', '#C00000']
        P.plot_2D_features_v3(df,
                              x_range=x_range,
                              y_range=y_range,
                              palette=customPalette,
                              transparency=0.5,
                              figsize=(6, 4),
                              filename_out='original.png')
        P.plot_2D_features_v3(df_sampled,
                              x_range=x_range,
                              y_range=y_range,
                              palette=customPalette,
                              transparency=0.5,
                              figsize=(6, 4),
                              filename_out='SMOTE_UnderSampler.png')
    return X_Sampled, Y_Sampled
def balanced_classes(X, y, n, digit1, digit2):
    unique, counts = np.unique(y, return_counts=True)
    print('\nUnbalanced classes', dict(zip(unique, counts)))

    under = RandomUnderSampler(sampling_strategy={digit1:int(n/2)})
    over = RandomOverSampler(sampling_strategy={digit2:int(n/2)})
    pipeline = Pipeline(steps=[('o', over), ('u', under)])
    X, y = pipeline.fit_resample(X, y)

    unique, counts = np.unique(y, return_counts=True)
    print('Balanced classes', dict(zip(unique, counts)))
    return X, y
Example #10
def simple_model(X_train, y_train):

    # define the methods
    over = SMOTE(k_neighbors=7)
    under = RandomUnderSampler()

    steps = [('o', over), ('u', under)]
    pipeline = Pipeline(steps=steps)

    # transform the dataset
    new_X_train, new_y_train = pipeline.fit_resample(X_train, y_train)

    return new_X_train, new_y_train
Example #11
def ensemble_model(X_train, y_train):

    # define the methods
    over = BorderlineSMOTE(k_neighbors=7, kind="borderline-1")
    under = EasyEnsemble(random_state=1)

    steps = [('o', over), ('u', under)]

    pipeline = Pipeline(steps=steps)

    # transform the dataset
    new_X_train, new_y_train = pipeline.fit_resample(X_train, y_train)

    return new_X_train[0], new_y_train[0]
Example #12
def apply_over_random_under_sample_smote(X, y):
    # Oversample with SMOTE and random undersample for imbalanced dataset
    from imblearn.over_sampling import SMOTE
    from imblearn.under_sampling import RandomUnderSampler
    from imblearn.pipeline import Pipeline
    over = SMOTE(sampling_strategy=0.5)
    under = RandomUnderSampler(sampling_strategy=0.5)
    # over = SMOTE(ratio=0.1)
    # under = RandomUnderSampler(ratio=0.5)
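    # (`ratio` was the older name of this parameter; newer imblearn releases
    # use `sampling_strategy` as above.)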
    steps = [('o', over), ('u', under)]
    pipeline = Pipeline(steps=steps)
    # transform the dataset
    X_smt, y_smt = pipeline.fit_resample(X, y)
    return X_smt, y_smt
Example #14
    def __init__(self, data, target, features=None, steps=[]):

        super().__init__(data, target, features, steps)
        self.OverSampling = object_over(self.data, self.target, steps=steps)
        self.UnderSampling = object_under(self.data, self.target, steps=steps)

        if self.steps:
            pipeline = Pipeline(steps=self.steps)
            features_resample, target_resample = pipeline.fit_resample(
                self.features_numpy, self.target_numpy)
            self.data = self.resample_dataframe(
                features_resample=features_resample,
                target_resample=target_resample,
                features=self.features,
                target=self.target)
            self.resample = self.data
Example #15
def smote_sampling(x_train, y_train):
  
    # summarize class distribution
    counter = Counter(y_train)
    print(counter)
    # define pipeline
    over = SMOTE(sampling_strategy=0.2)
    under = RandomUnderSampler(sampling_strategy=0.5)
    steps = [('o', over), ('u', under)]
    pipeline = Pipeline(steps=steps)
    # transform the dataset
    X_result, y_result = pipeline.fit_resample(x_train, y_train)
    # summarize the new class distribution
    counter = Counter(y_result)
    print(counter)
   

    return X_result, y_result
Example #16
def over_under_sample_func(train_x, train_y, target):
    try:
        logger.info(
            f"counter before over_under_sample is: {train_y[target].value_counts()}"
        )
        # transform the dataset
        over = SMOTE(sampling_strategy=0.1)
        under = RandomUnderSampler(sampling_strategy=0.5)
        steps = [('o', over), ('u', under)]
        pipeline = Pipeline(steps=steps)
        # transform the dataset
        train_x, train_y = pipeline.fit_resample(train_x, train_y)
        # summarize the new class distribution
        logger.info(
            f"counter after over_under_sample is: {train_y[target].value_counts()}"
        )
        return train_x, train_y
    except Exception as ex:
        logger.error(f"failed to run over_under_sample_func due to: {ex}")
Example #17
def balanceSampling(X_tr, y_train, up_ratio=1, dn_ratio=1):
    """
    Oversample the minority class (SMOTE) and undersample the majority class.

    Parameters
    ----------
    up_ratio: target minority/majority ratio after SMOTE oversampling
    dn_ratio: target minority/majority ratio after random undersampling

    """
    # A float sampling_strategy is the desired ratio of the minority class to
    # the majority class after resampling; the default here is 1.0 (balanced).
    over = SMOTE(sampling_strategy = up_ratio)
    under = RandomUnderSampler(sampling_strategy = dn_ratio)
    steps = [('over', over), ('under', under)]
    pipeline = Pipeline(steps=steps)
    X_train_sm, y_train_sm = pipeline.fit_resample(X_tr, y_train)
    
    print(X_train_sm.shape, y_train_sm.shape)
    return X_train_sm, y_train_sm
Example #18
    def oversample_smote_undersampling(self, X_train, y_train):
        print("SMOTE WITH UNDERSAMPLING")
        print(f"Shape before smote: {X_train.shape}")
        sampling_strategy = {
            'RESIDENTIAL': 1000,
            'INDUSTRIAL': 2000,
            'PUBLIC': 1000,
            'OFFICE': 1000,
            'OTHER': 1500,
            'RETAIL': 10000,
            'AGRICULTURE': 1500
        }
        over = SMOTE()
        under = RandomUnderSampler(sampling_strategy=sampling_strategy)
        steps = [('o', over), ('u', under)]
        pipeline = Pipeline(steps=steps)
        X, y = pipeline.fit_resample(X_train, y_train)
        print(f"Shape after SMOTE and undersampling: {X.shape}")
        return X, y
Example #19
def under_sample_with_SMOTE(X, y):
    '''
    Oversample the minority class with SMOTE, then randomly undersample
    the majority class.
    :param X: features
    :param y: labels
    :return: the resampled X and y
    '''
    counter = collections.Counter(y)
    print(counter)
    # define pipeline
    over = SMOTE(sampling_strategy=0.1)
    under = RandomUnderSampler(sampling_strategy=0.5)
    steps = [('o', over), ('u', under)]
    pipeline = Pipeline(steps=steps)
    # transform the dataset
    X, y = pipeline.fit_resample(X, y)
    # summarize the new class distribution
    counter = collections.Counter(y)
    print(counter)
    return X, y
def SMOTE_Analysis(k, o, u):
    try:
        model = DecisionTreeClassifier()
        over = SMOTE(sampling_strategy=o, k_neighbors=k, random_state=2)
        under = RandomUnderSampler(sampling_strategy=u)
        steps = [('over', over), ('under', under)]
        pipeline = Pipeline(steps=steps)
        Xn, yn = pipeline.fit_resample(X, y.ravel())
        cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
        scores = cross_val_score(model,
                                 Xn,
                                 yn,
                                 scoring='roc_auc',
                                 cv=cv,
                                 n_jobs=-1)
        score = np.mean(scores)
        print("k={}, over={}, under={}, Mean ROC AUC: {:.3f}".format(
            k, o, u, score))
        return [k, o, u]
    except Exception as e:
        return ""
    def resampling(self,
                   oversample_ratio=0.3,
                   minority_num=368,
                   majority_num=10000,
                   minority_label='1.0',
                   majority_label='0.0'):
        # define resampling
        under = RandomUnderSampler(sampling_strategy={
            majority_label: majority_num,
            minority_label: minority_num
        })
        over = SMOTE(sampling_strategy=oversample_ratio)

        # define pipeline
        pipeline = Pipeline(steps=[('u', under), ('o', over)])

        X_sm, y_sm = pipeline.fit_resample(self.X, self.y)

        print('Proportion in data after resample: ', Counter(y_sm))

        return X_sm, y_sm
Example #22
def syntetic_sampling(X, y, over_sampling, under_sampling):
  """
  Apply Synthetic Minority Oversampling Technique (SMOTE)
  to tn unbalanced class

  :type X: pandas DataFrame
  :param X: Training Features

  :type y: pandas Series
  :param y: Training Features

  :return: resampled data
  :rtype: tuple
  """

  over = SMOTE(sampling_strategy=over_sampling)
  under = RandomUnderSampler(sampling_strategy=under_sampling)
  steps = [('o', over), ('u', under)]
  pipeline = Pipeline(steps=steps)

  return pipeline.fit_resample(X, y)
def split_smote(drug_df, drug_name):
    X = drug_df.drop([drug_name], axis=1)
    y = drug_df[drug_name]
    counter = Counter(y)
    print('Originally, the distribution of classes is: {}'.format(counter))
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.30,
                                                        random_state=42,
                                                        stratify=y)
    over = SMOTE(sampling_strategy=0.1)
    under = RandomUnderSampler(sampling_strategy=0.5)
    steps = [('o', over), ('u', under)]
    pipeline = Pipeline(steps=steps)
    Xsm_train, ysm_train = pipeline.fit_resample(X_train, y_train)
    counter_balance = Counter(ysm_train)
    print(
        'After SMOTE sampling, the distribution of classes in Training set is: {}'
        .format(counter_balance))
    XSM_train = pd.DataFrame(Xsm_train, columns=X_train.columns)
    return XSM_train, ysm_train, X_test, y_test
Example #24
def under_over_sample(X,
                      y,
                      under_samp_rate=0.15,
                      over_samp_rate=0.75,
                      random_state=42):
    under = RandomUnderSampler(
        sampling_strategy=under_samp_rate,
        random_state=random_state,
    )
    over = RandomOverSampler(sampling_strategy=over_samp_rate,
                             random_state=random_state)
    steps = [('under', under), ('over', over)]
    pipeline = Pipeline(steps=steps)

    X_res, y_res = pipeline.fit_resample(np.array(X).reshape(-1, 1), y)

    combined = pd.DataFrame(data={
        "TEXT": X_res.squeeze(),
        "OUTPUT_LABEL": y_res
    })

    return combined.fillna("")
Example #25
def build_loaders(titles, labels, batch_size,
                  under_sample=False, over_sample=False):
    train_titles, test_titles, train_labels, test_labels = \
        train_test_split(titles, labels, test_size=0.1)
    val_titles, test_titles, val_labels, test_labels = \
        train_test_split(test_titles, test_labels, test_size=0.01)

    steps = []
    if under_sample:
        steps.append(("Under", EditedNearestNeighbours(n_neighbors=2)))
    if over_sample:
        steps.append(("Over", SMOTE(sampling_strategy=1)))
    if under_sample or over_sample:
        pipeline = Pipeline(steps=steps)
        train_titles, train_labels = pipeline.fit_resample(train_titles,
                                                           train_labels)
    print("Train:")
    calc_ratio(train_labels)
    print("Validation:")
    calc_ratio(val_labels)
    print("Test:")
    calc_ratio(test_labels)

    train = TensorDataset(torch.from_numpy(train_titles),
                          torch.from_numpy(train_labels))
    val = TensorDataset(torch.from_numpy(val_titles),
                        torch.from_numpy(val_labels))
    test = TensorDataset(torch.from_numpy(test_titles),
                         torch.from_numpy(test_labels))

    train_loader = DataLoader(train, shuffle=True, batch_size=batch_size,
                              drop_last=True)
    test_loader = DataLoader(test, shuffle=True, batch_size=batch_size,
                             drop_last=True)
    val_loader = DataLoader(val, shuffle=True, batch_size=batch_size,
                            drop_last=True)

    return train_loader, test_loader, val_loader
Example #26
        device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
        print ("Running on:",device)

        scores=np.array([])
        cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=2, random_state=2)
        for train_index, test_index in cv.split(X, y):
            #Put data in dataloaders
            print ("Augmenting Data...")
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            over = SMOTE(random_state=2)
            under = RandomUnderSampler(random_state=2)
            steps = [('o', over), ('u', under)]
            pipeline = Pipeline(steps=steps)
            X_train=X_train.reshape(X_train.shape[0],-1)
            X_train, y_train = pipeline.fit_resample(X_train, y_train)
            X_train = X_train.reshape(-1,X.shape[1], X.shape[2], X.shape[3])
            X_test = X_test.reshape(-1,X.shape[1], X.shape[2], X.shape[3])
            train_loader = make_into_dataloader(X_train, y_train,batch_size)
            test_loader = make_into_dataloader(X_test,y_test,batch_size)

            #Create model
            model = fmriNet(insize)
            model.to(device)
            optimizer = torch.optim.Adam(model.parameters(), lr = learning_rate, betas = betas, weight_decay = l2)

            #Training
            losses = train(model, train_loader, num_epochs, criterion, optimizer, losses = [])
            #plt.plot(losses)
            #plt.show()
            #Testing
Example #27
def load():

    dtype = [
        'characteristic_B', 'characteristic_C', 'characteristic_D',
        'characteristic_E', 'characteristic_G', 'characteristic_M',
        'characteristic_P', 'characteristic_Q', 'characteristic_R',
        'characteristic_S', 'characteristic_Y', 'characteristic_Z',
        'catering_C', 'catering_F', 'catering_H', 'catering_M', 'catering_R',
        'catering_T', 'mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun',
        'freight', 'bank_holiday_running', 'length', 'speed', 'delayed'
    ]

    dtype = {key: "uint8" for key in dtype}

    categories = {
        "status": "category",
        "category": "category",
        "power_type": "category",
        "timing_load": "category",
        "seating": "category",
        "reservations": "category",
        "ATOC_code": "category",
        "destination_stanox_area": "category",
        "origin_stanox_area": "category"
    }

    dtype.update(categories)

    start = time.time()

    print("Loading data...", end="")

    df = pd.read_csv("data/dscm_w.csv",
                     index_col=["uid"],
                     parse_dates=["std", "sta", "atd", "ata"],
                     dtype=dtype)

    path = os.path.join("models", "select")

    if not os.path.exists(path):
        os.mkdir(path)

    Y = df["delayed"]
    X = df.drop(["delay", "delayed", "atd", "ata", "origin", "destination"],
                axis=1)

    print(" DONE ({:.2f}s)".format(time.time() - start), end="\n\n")

    print(X.info())

    X = RailEncoder().transform(X)

    categorical_features = [
        "status", "category", "power_type", "timing_load", "seating",
        "reservations", "characteristics", "catering", "ATOC_code",
        "origin_stanox_area", "destination_stanox_area"
    ]

    for c in categorical_features:
        X[c] = X[c].cat.codes

    datetime_features = X.select_dtypes(include="datetime").columns.values
    datetime_transformer = Pipeline([("cyclical",
                                      DatetimeEncoder(cyclical=True))])

    preprocessor = ColumnTransformer([
        ("datetime", datetime_transformer, datetime_features),
    ],
                                     remainder="passthrough")

    resampler = IPipeline([
        # ('over', SMOTE(sampling_strategy=0.2, random_state=1)),  # Increase minority to 20% of majority
        # Reduce the majority class until it matches the minority class (1:1).
        ('under', RandomUnderSampler(sampling_strategy=1.0, random_state=1)),
    ])

    start = time.time()

    print("\nPreprocessing data...", end="")

    X = preprocessor.fit_transform(X, Y)

    print(" DONE ({:.2f}s)".format(time.time() - start), end="\n\n")

    print(X.shape)

    print("\nResampling data...", end="")

    X, Y = resampler.fit_resample(X, Y)

    print(" DONE ({:.2f}s)".format(time.time() - start), end="\n\n")

    print("{}, delayed: {}, not delayed: {}\n".format(X.shape, Y.sum(),
                                                      len(Y) - Y.sum()))

    return X, Y
Example #28
def lr_cv(disease,
          year_survival=5,
          period_of_analysis_days=None,
          kfold=5,
          random_state=17,
          authortype_list=None,
          added_features_list=None):
    """This is the main function to obtain the NLP experiment results.
    
    The function lr_cv (logistic regression - cross validation) performs
    n-year survival prediction (year_survival) using text notes and stage/grade,
    independently. Term-frequency inverse document-frequency (tf-idf) is
    applied to the text. 
    Also, the function uses the scikit-learn SelectFromModel meta-transformer
    for selecting features based on importance weights for each time point.

    Parameters
    ----------
     disease:
         One of the values from ('breast','prostate','lung','glioma').
     year_survival:
         Threshold to define survival.
     period_of_analysis_days:
         List of number of days after diagnosis considered to select the notes.
     kfold:
         Number of folds in the cross validation.
     random_state:
         Seed used by the random number generator.
     authortype_list:
         List of authors considered as valid.
     added_features_list:
         Features from the input dataset conserved in the output.

    Return
    ------
    Dictionary with all results.

    Dictionary keys: 
     val_f1: 
         List of tuples (mean, std) of test sets F1 metric in the grid search,
         best index for each time point.
     val_area_under_curve:
         List of tuples (mean, std) of test sets AUC metric in the grid search,
         best index for each time point.
     tfidf_param_text: 
         List of hyperparameter max_features for tfidfvectorizer in the grid
         search for each time point.
     C_param_text: 
         List of hyperparameter C for logistic regression in the grid search
         for each time point.
     f1_train:
         List of tuples (F1 score, 0) of training set for each time point.
     area_under_curve_train:
         List of tuples (AUC score, 0) of training set for each time point.
     n:
         List of train set size for each time point.
     f1: 
         List of tuples (F1 score, 0) of test set for each time point.
     area_under_curve:
         List of tuples (AUC score, 0) of test set for each time point.
     n_test:
         List of test set size for each time point.
     feature_names:
         List of important features for each time point.
     predictions:
         List of predictions for the test set for each time point.
     val_f1_s:
         List of tuples (mean, std) of test sets F1 metric in the grid search
         best index for each time point. (stage/grade approach)
     val_area_under_curve_s:
         List of tuples (mean, std) of test sets AUC metric in the grid search
         best index for each time point. (stage/grade approach)
     C_param_s: 
         List of hyperparameter C for logistic regression in the grid search
         for each time point. (stage/grade approach)
     f1_train_s:
         List of F1 score of training set for each time point.
         (stage/grade approach)
     area_under_curve_train_s:
         List of AUC score of training set for each time point.
         (stage/grade approach)
     f1_s:
         List of tuples (F1 score, 0) of test set for each time point.
         (stage/grade approach)
     area_under_curve_s:
         List of tuples (AUC score, 0) of test set for each time point.
         (stage/grade approach)
     predictions_s:
         List of predictions for the test set for each time point.
         (stage/grade approach)
     train: 
         List of the complete training sets with added columns with the
         predictions for the two approaches for each time point.
     test: 
         List of the complete test sets with added columns with the
         predictions for the two approaches for each time point.
     random_state:
         Seed used by the random number generator.
    """

    # Initializations:
    val_f1 = []
    val_area_under_curve = []
    tfidf_param_text = []
    C_param_text = []
    f1_train = []
    area_under_curve_train = []
    n = []
    f1 = []
    area_under_curve = []
    n_test = []
    feature_names = []
    predictions = []
    tain_list = []
    val_f1_s = []
    val_area_under_curve_s = []
    C_param_s = []
    f1_train_s = []
    area_under_curve_train_s = []
    f1_s = []
    area_under_curve_s = []
    predictions_s = []
    test_list = []
    id_list_flag = True
    idlist = []
    test_ids = []
    train = None
    test = None
    train_frac = 0.8
    unic_label = False  # Nested stratification is done when value is False.
    ngram = 1
    max_features = 200  # For feature importance.
    # added_features_list defaults to None, so guard against it here.
    examples_col_names = [
        'id', 'overallsurvival', 'vitalstatusbinary', 'stage_grade', 'id_count'
    ] + (added_features_list or []) + ['text_length', 'text']
    labels_col_names = ['stage_grade', 'label']
    scoring = {
        'f1': make_scorer(f1_score, average='macro'),
        'auc': make_scorer(roc_auc_score)
    }

    parameter_grid = {
        'logisticregression__C': [0.1, 1, 10, 100, 1000],
        'tfidfvectorizer__max_features': [500, 1000, None]
    }
    parameter_grid_stage = {'logisticregression__C': [0.1, 1, 10, 100, 1000]}
    if period_of_analysis_days is None:
        period_of_analysis_days = [30, 365]

    # Main loop: Solving the problem at each time point.
    for period in period_of_analysis_days:
        print(f"Period in days: {period}")

        # 1. Define train and test set.

        if id_list_flag:
            id_list_flag = False
            df = combined_notes(disease,
                                year_survival=year_survival,
                                period=period,
                                authortype_list=authortype_list,
                                added_features_list=added_features_list)
            idlist = df['id'].copy().tolist()
            if unic_label:
                examples = df[examples_col_names].copy()
                labels = df['label'].copy()
                X_tr, X_te, y_tr, y_te = train_test_split(
                    examples,
                    labels,
                    train_size=train_frac,
                    stratify=labels,
                    random_state=random_state)
                train = pd.concat([y_tr, X_tr], axis=1).reset_index(drop=True)
                test = pd.concat([y_te, X_te], axis=1).reset_index(drop=True)
            else:
                train, test = nested_train_test_split(
                    df,
                    examples_col_names,
                    labels_col_names,
                    train_frac=train_frac,
                    random_state=random_state)
            test_ids = test['id'].copy().tolist()
            print('text is in df:', df.columns)
        else:
            df = combined_notes(disease,
                                period=period,
                                year_survival=year_survival,
                                idlist=idlist,
                                authortype_list=authortype_list,
                                added_features_list=added_features_list)
            train = df[~df['id'].isin(test_ids)].copy().reset_index(drop=True)
            test = df[df['id'].isin(test_ids)].copy().reset_index(drop=True)

        train['is_test'] = False
        test['is_test'] = True

        # 2. NLP.

        # Create a temporary folder to store the transformers of the pipeline.
        cachedir = mkdtemp()
        memory = Memory(location=cachedir, verbose=10)
        pipeline = Pipeline([('tfidfvectorizer',
                              TfidfVectorizer(ngram_range=(1, ngram),
                                              tokenizer=tokenize,
                                              min_df=3,
                                              max_df=0.9,
                                              strip_accents='unicode',
                                              use_idf=1,
                                              smooth_idf=1,
                                              sublinear_tf=1)),
                             ('randomOversampler',
                              RandomOverSampler(random_state=random_state)),
                             ('logisticregression',
                              LogisticRegression(random_state=random_state))],
                            memory=memory)

        if unic_label:
            x_train = train['text'].copy()
            y_train = train['label'].values.copy()
            cross_validation = StratifiedKFold(n_splits=kfold,
                                               shuffle=True,
                                               random_state=random_state)
        else:
            # Nested stratification.
            x_train_2col = train[['stage_grade', 'text']].copy()
            x_train = train['text'].copy()
            y_train = train['label'].values.copy()
            cross_validation = CatStratifiedKFold(
                n_splits=kfold, shuffle=True,
                random_state=random_state).split(x_train_2col, y_train)

        x_test = test['text'].copy()
        y_test = test['label'].values.copy()

        grid_search = GridSearchCV(pipeline,
                                   param_grid=parameter_grid,
                                   scoring=scoring,
                                   refit='f1',
                                   cv=cross_validation)

        # Fit.
        grid_search.fit(x_train, y_train)

        # Clear the cache directory when you don't need it anymore.
        rmtree(cachedir)

        # Record cross validation metrics.
        val_f1.append(
            (grid_search.cv_results_['mean_test_f1'][grid_search.best_index_],
             grid_search.cv_results_['std_test_f1'][grid_search.best_index_]))

        val_area_under_curve.append(
            (grid_search.cv_results_['mean_test_auc'][grid_search.best_index_],
             grid_search.cv_results_['std_test_auc'][grid_search.best_index_]))

        # Record cross validation hyperparameters.
        tfidf_param_text.append(
            grid_search.best_params_['tfidfvectorizer__max_features'])
        C_param_text.append(grid_search.best_params_['logisticregression__C'])

        # Final model.
        pipeline_final = Pipeline([
            ('tfidfvectorizer',
             TfidfVectorizer(ngram_range=(1, ngram),
                             tokenizer=tokenize,
                             min_df=3,
                             max_df=0.9,
                             strip_accents='unicode',
                             use_idf=1,
                             smooth_idf=1,
                             sublinear_tf=1,
                             max_features=grid_search.
                             best_params_['tfidfvectorizer__max_features'])),
            ('randomOversampler',
             RandomOverSampler(random_state=random_state)),
            ('logisticregression',
             LogisticRegression(
                 random_state=random_state,
                 C=grid_search.best_params_['logisticregression__C']))
        ])

        final_model = pipeline_final.fit(x_train, y_train)
        preds_train = final_model.predict(x_train)

        # Add predictions in train DF.
        train[str(period) + '_tf_pred'] = preds_train
        train_f1 = f1_score(y_train, preds_train, average='macro')
        f1_train.append((train_f1, 0))  # Record train f1
        train_auc = roc_auc_score(y_train, preds_train)
        area_under_curve_train.append((train_auc, 0))  # Record train auc.
        n.append(len(train))  # Add number of examples in train.
        preds_test = final_model.predict(x_test)

        # Add predictions in test DF.
        test[str(period) + '_tf_pred'] = preds_test
        predictions.append(preds_test)  # Add predictions in list.
        test_f1 = f1_score(y_test, preds_test, average='macro')
        f1.append((test_f1, 0))  # Record test f1.
        test_auc = roc_auc_score(y_test, preds_test)
        area_under_curve.append((test_auc, 0))  # Record test auc.
        n_test.append(len(test))  # Add number of examples in test.

        # Selecting features.
        pip_tfidf_ros = Pipeline([
            ('tfidfvectorizer',
             TfidfVectorizer(ngram_range=(1, ngram),
                             tokenizer=tokenize,
                             min_df=3,
                             max_df=0.9,
                             strip_accents='unicode',
                             use_idf=1,
                             smooth_idf=1,
                             sublinear_tf=1,
                             max_features=grid_search.
                             best_params_['tfidfvectorizer__max_features'])),
            ('randomOversampler', RandomOverSampler(random_state=random_state))
        ])

        X_res, y_res = pip_tfidf_ros.fit_resample(x_train, y_train)
        clf = LogisticRegression(
            random_state=random_state,
            C=grid_search.best_params_['logisticregression__C'])
        sfm = SelectFromModel(clf,
                              threshold=-np.inf,
                              max_features=max_features)
        sfm.fit(X_res, y_res)
        embeded_lr_support = sfm.get_support()
        X_res_pandas = pd.DataFrame(X_res.todense())
        embeded_lr_feature = X_res_pandas.loc[:, embeded_lr_support].columns.tolist()
        feature_names_list = np.array(
            pip_tfidf_ros['tfidfvectorizer'].get_feature_names()
        )[embeded_lr_feature].tolist()
        feature_names.append(feature_names_list)  # Record the selected important features.

        # 3. Stage.

        x_train_s = train[['stage_grade']].copy()
        y_train_s = train['label'].values.copy()
        x_test_s = test[['stage_grade']].copy()
        y_test_s = test['label'].values.copy()

        # Create a temporary folder to store the transformers of the pipeline.
        cachedir = mkdtemp()
        memory = Memory(location=cachedir, verbose=10)
        pipeline = Pipeline(
            [('randomOversampler',
              RandomOverSampler(random_state=random_state)),
             ('onehotencoder', OneHotEncoder(handle_unknown='ignore')),
             ('logisticregression',
              LogisticRegression(random_state=random_state))],
            memory=memory)

        cross_validation = StratifiedKFold(n_splits=kfold,
                                           shuffle=True,
                                           random_state=random_state)
        grid_search = GridSearchCV(pipeline,
                                   param_grid=parameter_grid_stage,
                                   scoring=scoring,
                                   refit='f1',
                                   cv=cross_validation)

        grid_search.fit(x_train_s, y_train_s)

        # Clear the cache directory when you don't need it anymore.
        rmtree(cachedir)

        val_f1_s.append(
            (grid_search.cv_results_['mean_test_f1'][grid_search.best_index_],
             grid_search.cv_results_['std_test_f1'][grid_search.best_index_]))
        val_area_under_curve_s.append(
            (grid_search.cv_results_['mean_test_auc'][grid_search.best_index_],
             grid_search.cv_results_['std_test_auc'][grid_search.best_index_]))

        # Final model.
        pipeline_final = Pipeline([
            ('randomOversampler',
             RandomOverSampler(random_state=random_state)),
            ('onehotencoder', OneHotEncoder(handle_unknown='ignore')),
            ('logisticregression',
             LogisticRegression(
                 random_state=random_state,
                 C=grid_search.best_params_['logisticregression__C']))
        ])

        C_param_s.append(grid_search.best_params_['logisticregression__C'])
        final_model = pipeline_final.fit(x_train_s, y_train_s)
        preds_train_s = final_model.predict(x_train_s)

        # Fill the output dictionary values
        train[str(period) + '_s_pred'] = preds_train_s
        train_f1_s = f1_score(y_train_s, preds_train_s, average='macro')
        f1_train_s.append(train_f1_s)
        train_auc_s = roc_auc_score(y_train_s, preds_train_s)
        area_under_curve_train_s.append(train_auc_s)
        preds_test_s = final_model.predict(x_test_s)
        test[str(period) + '_s_pred'] = preds_test_s
        predictions_s.append(preds_test_s)
        test_f1_s = f1_score(y_test_s, preds_test_s, average='macro')
        f1_s.append((test_f1_s, 0))
        test_auc_s = roc_auc_score(y_test_s, preds_test_s)
        area_under_curve_s.append((test_auc_s, 0))
        tain_list.append(train)
        test_list.append(test)

    return dict(val_f1=val_f1,
                val_area_under_curve=val_area_under_curve,
                tfidf_param_text=tfidf_param_text,
                C_param_text=C_param_text,
                f1_train=f1_train,
                area_under_curve_train=area_under_curve_train,
                n=n,
                f1=f1,
                area_under_curve=area_under_curve,
                n_test=n_test,
                feature_names=feature_names,
                predictions=predictions,
                val_f1_s=val_f1_s,
                val_area_under_curve_s=val_area_under_curve_s,
                C_param_s=C_param_s,
                f1_train_s=f1_train_s,
                area_under_curve_train_s=area_under_curve_train_s,
                f1_s=f1_s,
                area_under_curve_s=area_under_curve_s,
                predictions_s=predictions_s,
                train=tain_list,
                test=test_list,
                random_state=random_state)


# ---------------------------------------------------------------------------

# ***************************************************************************
Example #29
def plot_ROC_wCV(ax, X, y, names, save=True, balance=True):
    sns.set_context("paper")
    nsplit = 5
    cv = StratifiedKFold(n_splits=nsplit)
    classes = np.unique(y)
    colors = plt.cm.Dark2(np.linspace(0, 1, len(classes)))
    rf = RandomForestClassifier(n_estimators=1400,
                                min_samples_split=2,
                                min_samples_leaf=1,
                                max_features='sqrt',
                                max_depth=90,
                                bootstrap=False)
    tprs = []
    allAcc = []
    aucs = []
    all_confMatrices = []
    mean_fpr = np.linspace(0, 1, 100)
    accuracy_tot = 0
    nclass = len(classes)
    wrong = []
    for j in range(nclass):
        i = 0
        for train, test in cv.split(X, y):
            names_test = names[test]
            if nclass == 3:
                sampling1 = {
                    'SN Ia': Counter(y[train])['SN Ia'],
                    'Core Collapse': Counter(y[train])['Core Collapse'],
                    'SLSN': 1000
                }
                sampling2 = {
                    'SN Ia': 1000,
                    'SLSN': 1000,
                    'Core Collapse': 1000
                }
            elif nclass == 4:
                sampling1 = {
                    'SN Ia': Counter(y[train])['SN Ia'],
                    'Core Collapse': Counter(y[train])['Core Collapse'],
                    'SN Ia Pec': 500,
                    'SLSN': 500
                }
                sampling2 = {
                    'SN Ia': 1000,
                    'Core Collapse': 1000,
                    'SN Ia Pec': 500,
                    'SLSN': 500
                }
            elif nclass == 2:
                sampling1 = {
                    'SN Ia': Counter(y[train])['SN Ia'],
                    'Core Collapse': 3500
                }
                sampling2 = {'SN Ia': 3500, 'Core Collapse': 3500}
            if balance:
                over = SMOTE(sampling_strategy=sampling1)
                under = RandomUnderSampler(sampling_strategy=sampling2)
                steps = [('o', over), ('u', under)]
                pipeline = Pipeline(steps=steps)
                Xtrain_resampled, ytrain_resampled = pipeline.fit_resample(
                    X[train], y[train])
            else:
                Xtrain_resampled = X[train]
                ytrain_resampled = y[train]
            print('Distribution after resampling: {}'.format(
                Counter(ytrain_resampled)))
            print('Distribution of test set: {}'.format(Counter(y[test])))

            probas_ = rf.fit(Xtrain_resampled,
                             ytrain_resampled).predict_proba(X[test])
            predictions = rf.predict(X[test])
            # Compute ROC curve and area the curve
            fpr, tpr, thresholds = roc_curve(y[test],
                                             probas_[:, j],
                                             pos_label=classes[j])
            tprs.append(interp(mean_fpr, fpr, tpr))
            tprs[-1][0] = 0.0
            roc_auc = auc(fpr, tpr)
            aucs.append(roc_auc)
            i += 1
            tempAccuracy = np.sum(predictions == y[test]) / len(y[test]) * 100
            wrong.append(names_test[y[test] != predictions])
            print(tempAccuracy)
            allAcc.append(tempAccuracy)
            matr = sklearn.metrics.confusion_matrix(y[test],
                                                    predictions,
                                                    normalize='true')
            all_confMatrices.append(matr)
            print(matr)
            accuracy_tot += tempAccuracy
        mean_tpr = np.mean(tprs, axis=0)
        mean_tpr[-1] = 1.0
        mean_auc = auc(mean_fpr, mean_tpr)
        std_auc = np.std(aucs)
        accuracy = accuracy_tot / (nsplit * len(classes))
        if True:
            if classes[j] == 'Core Collapse':
                ax.plot(mean_fpr,
                        mean_tpr,
                        color=colors[j],
                        label=r'CC (%0.2f $\pm$ %0.2f)' % (mean_auc, std_auc),
                        lw=2,
                        alpha=.8)
            elif classes[j] == 'SLSN':
                ax.plot(mean_fpr,
                        mean_tpr,
                        color=colors[j],
                        label=r'%s (%0.2f $\pm$ %0.2f)' %
                        (classes[j], mean_auc, std_auc),
                        lw=2,
                        alpha=.8)
            else:
                ax.plot(mean_fpr,
                        mean_tpr,
                        color=colors[j],
                        label=r'%s (%0.2f $\pm$ %0.2f)' %
                        (classes[j].strip("SN "), mean_auc, std_auc),
                        lw=2,
                        alpha=.8)
        std_tpr = np.std(tprs, axis=0)
        tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
        tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
        ax.fill_between(mean_fpr,
                        tprs_lower,
                        tprs_upper,
                        color=colors[j],
                        alpha=.05)

    ax.plot([0, 1], [0, 1], linestyle='--', lw=2, color='k', alpha=.8)
    #if ~foley:
    ax.set_xlabel("False Positive Rate", fontsize=16)
    ax.set_ylabel("True Positive Rate", fontsize=16)
    #plt.title("ROC Curve, %i Classes" % (len(classes)), fontsize=26)
    ax.legend(loc=4, fontsize=12)
    #    plt.text(0.1, 0.9, r'$N_{tot} = %i$'%len(y),fontsize=12)
    plt.text(0.1, 0.9, r'$N_{train} = 7000$')
    plt.text(0.1, 0.82, r'$N_{test} = 2226$')
    ax.set_xlim([-0.05, 1.05])
    ax.set_ylim([-0.05, 1.05])
    #plt.savefig("Combined_MeanROC_Curve_%i_Classes_dataML_noOverlapCuts.png" % len(classes))
    return accuracy, rf, all_confMatrices, allAcc, wrong
Example #30
                                                               random_state=42)
x8_scaled_train.shape, x8_test.shape, y8_train.shape, y8_test.shape
Counter(y5_train.failure), Counter(y5_test.failure)

#Dataset # 4
# Run this for checking results with NO UPSAMPLE or DOWNSAMPLE of data.
classify_hdd_failure(x8_scaled_train, x8_test, y8_train.values.ravel(),
                     y8_test.values.ravel())

# Method 1: Upsample minority class and Downsample majority class
from imblearn.pipeline import Pipeline
oversample = SMOTE(sampling_strategy=0.2, random_state=42)
undersample = RandomUnderSampler(sampling_strategy=0.3, random_state=42)
steps = [('o', oversample), ('u', undersample)]
pipeline = Pipeline(steps=steps)
x8_scaled_train_s, y8_train_s = pipeline.fit_resample(x8_scaled_train,
                                                      y8_train)
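# For reference: a float sampling_strategy is the desired minority/majority ratio
# after resampling, so SMOTE grows the minority (failure) class to 0.2x the majority
# class and RandomUnderSampler then shrinks the majority class until the ratio is 0.3.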

# The dataset has now been reduced to roughly 354K rows; this was done mainly to keep it small enough for the available computational resources.
x8_scaled_train_s.shape, x8_test.shape

# After SMOTE and undersampling, the failure percentage in the data has increased to about 33%. It would be worth checking how the analysis results change with this percentage.
Counter(y8_train_s.failure)
print(
    'The percentage of failure in the dataset is now: ',
    Counter(y8_train_s.failure)[1] /
    (Counter(y8_train_s.failure)[0] + Counter(y8_train_s.failure)[1]))
'''
# Method 2: Upsample minority class. This method upsamples the minority class to 50% of the data.
sm = SMOTE(random_state=42)
x_scaled_sm , y_sm = sm.fit_sample(x_scaled , y)
print('The percentage of failures now in data = ',Counter(y_sm.failure)[1]/(Counter(y_sm.failure)[1]+Counter(y_sm.failure)[0]))
Example #31
# These sampling_strategy values correspond to the minority-class percentages (of the resampled dataset) that were tested: 4%, 10%, 25%, 35% and 50%.
ratio_list = [0.042, 0.111, 0.333, 0.538, 1]
percentage_list = [4, 10, 25, 35, 50]
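# Sanity check (a hedged sketch): a desired minority share of p percent maps to
# sampling_strategy = p / (100 - p).
assert [round(p / (100 - p), 3) for p in percentage_list] == ratio_list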

# This loop executes the oversampling strategy (in this case ADASYN) for all the ratios that were tested.
for ratio, percentage in zip(ratio_list, percentage_list):
    #Create a train-test split where the ratio of target class is maintained
    x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=47, stratify=y)
    # Initialize an ADASYN sampler with the ratio to be tested
    over = ADASYN(sampling_strategy=ratio)
    #Initialize a pipeline (One can add extra steps here if required)
    steps = [ ('o', over)]
    pipeline = Pipeline(steps)
    #Resample data
    x_res, y_res = pipeline.fit_resample(x_train, y_train)
    print('resample finished')
    #Train an xg_boost model with resampled data
    xgb = xg_boost(x_res, y_res, x_test, y_test, f"ADASYN_{percentage}")


# The code below was used to calculate the running times.
# Since some running times were very long, we let the code time-out after 10 hours.
# It is less relevant for WWF, hence it is commented out.

#List of sub-sample sizes that were evaluated to calculate running times.
# subset_list = [30000, 50000, 75000, 100000, 200000, 300000, 400000, 500000, 600000, 700000, 800000, 900000, 1000000, 1500000, 2000000]
# times_subsetsize_list = []

# def calculate_running_times():
#     for i in subset_list: