def execute(trainfile, sampler):
    """Over-sample a training set and write the result to an HDF5 file.

    The data is loaded through ``Data.Data().load(trainfile)``, the
    ``is_attributed`` column is split off as the target, the remaining
    columns are resampled with the requested over-sampler, and the
    resampled frame (features plus ``is_attributed``, all cast to int) is
    written to ``trainfile + "." + sampler``.  The 'balanced' class weights
    are printed before and after resampling.

    Args:
        trainfile: Path handed to the data loader; also the prefix of the
            output file name.
        sampler: One of ``"RANDOM"``, ``"ADASYN"`` or ``"SMOTE"``.

    Raises:
        ValueError: If ``sampler`` is not one of the supported names.  This
            is checked before any data is loaded, so a typo fails
            immediately instead of after the load.
    """
    import numpy as np  # local import: only needed for the class array below

    print("--- Executing")
    print("Using trainfile:  ", trainfile)

    supported = ("RANDOM", "ADASYN", "SMOTE")
    if sampler not in supported:
        # Previously this case only printed a message and then crashed
        # further down on ``y_resampled = None``.
        raise ValueError(
            "Invalid sampler: {!r} (expected one of {})".format(
                sampler, ", ".join(supported)))

    print("--- Loading (transformed) data")
    data = Data.Data()
    train_df = data.load(trainfile)
    y = train_df["is_attributed"]
    X = train_df.drop(["is_attributed"], axis=1)
    columns = X.columns.values

    # Keyword arguments and an ndarray of classes are accepted by both old
    # and recent scikit-learn releases; recent ones reject the positional
    # form.
    classes = np.array([0, 1])
    before_class_weight = dict(
        zip([0, 1], compute_class_weight('balanced', classes=classes, y=y)))
    print("Original weights: ", before_class_weight)

    # Built only after validation so that an invalid name never gets here.
    sampler_classes = {
        "RANDOM": RandomOverSampler,
        "ADASYN": ADASYN,
        "SMOTE": SMOTE,
    }
    oversampler = sampler_classes[sampler](random_state=0)
    X_resampled, y_resampled = _fit_and_resample(oversampler, X, y)

    after_class_weight = dict(
        zip([0, 1],
            compute_class_weight('balanced', classes=classes, y=y_resampled)))
    print("Sampler: ", sampler, ", weights: ", after_class_weight)

    X_resampled = X_resampled.astype(int)
    y_resampled = y_resampled.astype(int)

    df = pd.DataFrame(data=X_resampled, columns=columns)
    df["is_attributed"] = y_resampled

    outfilename = trainfile + "." + sampler
    print("Output file (over-sampled): ", outfilename)
    # ``key`` is passed by keyword: newer pandas deprecates passing it
    # positionally, and the keyword form works on older versions too.
    df.to_hdf(outfilename,
              key="table",
              mode="w",
              append=True,
              complevel=9,
              complib="blosc")


def _fit_and_resample(oversampler, X, y):
    """Fit ``oversampler`` on ``(X, y)`` and return the resampled pair.

    Uses ``fit_resample`` when the sampler provides it and falls back to
    the legacy ``fit`` + ``sample`` pair otherwise, so the caller works
    with both older and newer sampler APIs.
    """
    if hasattr(oversampler, "fit_resample"):
        return oversampler.fit_resample(X, y)
    oversampler.fit(X, y)
    return oversampler.sample(X, y)
# Example no. 2
# 0
    def __init__(self, data_pool, parameters, training):
        """Store the configuration and precompute a class-balanced index list.

        Args:
            data_pool: Frame-like pool of samples; ``relative_destination``
                or ``track_class`` is read from it to obtain class labels.
            parameters: Mapping read here for ``batch_size``,
                ``input_mask``, ``observation_steps`` and
                ``ibeo_data_columns``.
            training: Boolean flag telling whether the data is for training
                or for test.
        """
        self.data_pool = data_pool
        self.parameters = parameters
        self.batch_size = parameters['batch_size']
        # During training the data is sampled from a pool.  During test it
        # is sampled sequentially and exhaustively; a vector must mark
        # padding data at the end of the dataset, and a return state must
        # signal when all test data has been given.
        self.training = training
        self.categorical = True
        self.d_thresh_range = None

        self.val_minibatch_idx = 0
        self.d_thresh = None
        self.reduced_pool = None
        self.distance_pool_cache = {}

        # One freshly tiled mask per batch slot; every entry is indexed 0.
        per_slot_masks = [
            np.tile(self.parameters['input_mask'],
                    (self.parameters['observation_steps'], 1))
            for _ in range(self.batch_size)
        ]
        self.input_mask = pd.Series(per_slot_masks,
                                    dtype=object,
                                    index=([0] * self.batch_size))

        # Pick the label column used for balancing.
        uses_relative = 'relative' in self.parameters['ibeo_data_columns'][0]
        label_source = (data_pool.relative_destination
                        if uses_relative else data_pool.track_class)
        selection_data = list(label_source.values)

        # Encode the labels as integers, then over-sample row positions so
        # that the resulting index list is class-balanced.
        encoder = preprocessing.LabelEncoder().fit(selection_data)
        indexed_classes = np.array(encoder.transform(selection_data))
        row_positions = np.expand_dims(range(len(indexed_classes)), 1)

        ros = RandomOverSampler()
        ros.fit(row_positions, indexed_classes)
        balanced_idxs, _ = ros.sample(row_positions, indexed_classes)
        self.balanced_idxs = np.squeeze(balanced_idxs)
            TerminateOnNaN(),
            ReduceLROnPlateau(verbose=1, patience=3)
        ]
        m = build_keras_embedding_classifier(
            embeddings=embeddings,
            #activation='elu',
            lr=args.learning_rate,
            depth=args.depth,
            hidden_size=args.hidden_size,
            #lr=2.5e-7, depth=5, hidden_size=20,
            decay=args.decay,
            dropout=args.dropout,
            recurrent_dropout=args.recurrent_dropout)
        print("Using random over-sample")
        rand_os = RandomOverSampler().fit(sequences, Y)
        os_sequences, os_Y = rand_os.sample(sequences, Y)
        print(sequences.shape)
        print(os_sequences.shape)

        hist = m.fit(os_sequences,
                     os_Y,
                     epochs=100,
                     batch_size=128,
                     validation_data=(test_sequences, Y_test),
                     callbacks=cb)

        pred = m.predict(test_sequences).round().astype(int)

        metrics = binary_classification_metrics(Y_test, pred)
        print("Dev perf")
        print(metrics)
# Example no. 4
# 0
# Show the iphone histogram figure built earlier (`plot` and
# `hist_iphone_3v` are defined outside this view).
plot(hist_iphone_3v)

# NOTE: plain assignment binds a second name to the same object, it does not
# copy. The column assignments below therefore also change `galaxy_corr`,
# and any later read of galaxy_corr['galaxysentiment'] sees the recoded,
# categorical values.
galaxy_cor_3v = galaxy_corr
# Recode the sentiment column through `mapper` (defined outside this view).
galaxy_cor_3v['galaxysentiment'] = galaxy_cor_3v['galaxysentiment'].map(mapper)
# Store the recoded column with the pandas "category" dtype.
galaxy_cor_3v['galaxysentiment'] = pd.Series(galaxy_cor_3v['galaxysentiment'],
                                             dtype="category")
# The next two bare expressions discard their results when run as a script;
# presumably left over from interactive inspection.
galaxy_cor_3v.dtypes
galaxy_cor_3v['galaxysentiment'].unique()
# Histogram of the recoded sentiment column (`px` is presumably
# plotly.express; confirm against the file's imports).
hist_galaxy_3v = px.histogram(galaxy_cor_3v, x="galaxysentiment")
plot(hist_galaxy_3v)

### Over sampling
# Random over sampler


def _oversample_with_target(sampler, frame, feature_count, target):
    """Over-sample the first ``feature_count`` columns of ``frame``.

    The sampler is fitted on the feature slice and the ``target`` column and
    then asked to resample the same pair.  Returns the resampled features,
    the resampled labels, and a new DataFrame holding the features with the
    labels attached under ``target``.
    """
    features = frame.iloc[:, 0:feature_count]
    labels = frame[target]
    sampler.fit(features, labels)
    resampled_features, resampled_labels = sampler.sample(features, labels)
    combined = pd.DataFrame(resampled_features)
    combined[target] = resampled_labels
    return resampled_features, resampled_labels, combined


ros = RandomOverSampler(random_state=0)

# iphone: the first 46 columns are the features.
iphone_resampled, isent_resampled, iphone_resampled_complete = (
    _oversample_with_target(ros, iphone_corr, 46, 'iphonesentiment'))
hist_iphone_resampled = px.histogram(iphone_resampled_complete,
                                     x='iphonesentiment')
plot(hist_iphone_resampled)

# galaxy: the first 45 columns are the features; the same sampler is re-fitted.
galaxy_resampled, gsent_resampled, galaxy_resampled_complete = (
    _oversample_with_target(ros, galaxy_corr, 45, 'galaxysentiment'))
hist_galaxy_resampled = px.histogram(galaxy_resampled_complete,
                                     x='galaxysentiment')
plot(hist_galaxy_resampled)