Example 1
def test_min_max_scaler_1d():
    # Test scaling of dataset along single axis
    rng = np.random.RandomState(0)
    X = rng.randn(5)
    X_orig_copy = X.copy()

    scaler = MinMaxScaler()
    X_scaled = scaler.fit(X).transform(X)
    assert_array_almost_equal(X_scaled.min(axis=0), 0.0)
    assert_array_almost_equal(X_scaled.max(axis=0), 1.0)

    # check inverse transform
    X_scaled_back = scaler.inverse_transform(X_scaled)
    assert_array_almost_equal(X_scaled_back, X_orig_copy)

    # Test with 1D list
    X = [0., 1., 2, 0.4, 1.]
    scaler = MinMaxScaler()
    X_scaled = scaler.fit(X).transform(X)
    assert_array_almost_equal(X_scaled.min(axis=0), 0.0)
    assert_array_almost_equal(X_scaled.max(axis=0), 1.0)

    # Constant feature.
    X = np.zeros(5)
    scaler = MinMaxScaler()
    X_scaled = scaler.fit(X).transform(X)
    assert_greater_equal(X_scaled.min(), 0.)
    assert_less_equal(X_scaled.max(), 1.)
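This test targets an older scikit-learn release in which MinMaxScaler still accepted 1D input; current releases require a 2D array of shape (n_samples, n_features). A minimal sketch of the same check against the current API, reshaping the vector into a single column first:

import numpy as np
from sklearn.preprocessing import MinMaxScaler

X = np.random.RandomState(0).randn(5).reshape(-1, 1)  # 5 samples, 1 feature
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)                    # min -> 0.0, max -> 1.0
X_back = scaler.inverse_transform(X_scaled)           # recovers the original values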
Example 2
def test_min_max_scaler_1d():
    """Test scaling of dataset along single axis"""
    rng = np.random.RandomState(0)
    X = rng.randn(5)
    X_orig_copy = X.copy()

    scaler = MinMaxScaler()
    X_scaled = scaler.fit(X).transform(X)
    assert_array_almost_equal(X_scaled.min(axis=0), 0.0)
    assert_array_almost_equal(X_scaled.max(axis=0), 1.0)

    # check inverse transform
    X_scaled_back = scaler.inverse_transform(X_scaled)
    assert_array_almost_equal(X_scaled_back, X_orig_copy)

    # Test with 1D list
    X = [0., 1., 2, 0.4, 1.]
    scaler = MinMaxScaler()
    X_scaled = scaler.fit(X).transform(X)
    assert_array_almost_equal(X_scaled.min(axis=0), 0.0)
    assert_array_almost_equal(X_scaled.max(axis=0), 1.0)

    # Constant feature.
    X = np.zeros(5)
    scaler = MinMaxScaler()
    X_scaled = scaler.fit(X).transform(X)
    assert_greater_equal(X_scaled.min(), 0.)
    assert_less_equal(X_scaled.max(), 1.)
Example 3
def learn(examples,
          Classifier,
          classifierArgs,
          develFolds=10,
          verbose=3,
          n_jobs=1,
          predKey="ml_comb_pred",
          limitTerms=None):
    print "Parameter grid search"
    develExamples = getSubset(examples, ["devel"])
    clf = GridSearchCV(Classifier(),
                       classifierArgs,
                       cv=develFolds,
                       verbose=verbose,
                       n_jobs=n_jobs,
                       scoring="f1_micro")
    clf.fit(develExamples["features"], develExamples["classes"])
    print "Best params", (clf.best_params_, clf.best_score_)
    print "Predicting all examples"
    minMax = MinMaxScaler((0.03, 1.0))
    allPredictions = clf.predict(examples["features"])
    if hasattr(clf, "predict_proba"):
        allProbabilities = clf.predict_proba(examples["features"])
    else:
        allProbabilities = clf.decision_function(examples["features"])
        # import pdb; pdb.set_trace()
        minMax.fit(allProbabilities)  # minmax_scale(testProbabilities, (0.03, 1.0))
        allProbabilities = minMax.transform(allProbabilities)  # minmax_scale(allProbabilities, (0.03, 1.0))
    print "Predicting the test set"
    testExamples = getSubset(examples, ["test"])
    testPredictions = clf.predict(testExamples["features"])
    if hasattr(clf, "predict_proba"):
        testProbabilities = clf.predict_proba(testExamples["features"])
    else:
        testProbabilities = clf.decision_function(testExamples["features"])
        testProbabilities = minMax.transform(testProbabilities)
    binaryToMultiLabel(testExamples, testPredictions, testProbabilities,
                       predKey)
    print "Evaluating test set ensemble predictions"
    testProteins = {x["id"]: x for x in testExamples["proteins"]}
    multiLabelTestExamples = evaluateFile.makeExamples(testProteins,
                                                       limitTerms=limitTerms,
                                                       limitToSets=["test"],
                                                       predKey=predKey)
    loading.vectorizeExamples(multiLabelTestExamples, None, sparseLabels=True)
    results = evaluation.evaluate(multiLabelTestExamples["labels"],
                                  multiLabelTestExamples["predictions"],
                                  multiLabelTestExamples,
                                  terms=None,
                                  averageOnly=True,
                                  noAUC=True)
    print "Average for test set:", evaluation.metricsToString(
        results["average"])
    binaryToMultiLabel(examples, allPredictions, allProbabilities, predKey)
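The else branches above rescale raw decision_function scores into the (0.03, 1.0) range so they can stand in for probabilities; the inline comments point to scikit-learn's minmax_scale helper as a shorthand. A hedged sketch of that alternative, assuming scores is a 2D array of decision scores fitted and transformed in one step:

import numpy as np
from sklearn.preprocessing import minmax_scale

scores = np.array([[-2.0, 0.5], [1.5, -0.3]])            # hypothetical decision_function output
probs = minmax_scale(scores, feature_range=(0.03, 1.0))  # column-wise rescaling into (0.03, 1.0)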
Example 4
class Scaler(TransformerMixin):
    def __init__(self):
        self._scaler = MinMaxScaler(feature_range=(-1, 1))

    def transform(self, df, *_):
        assert_all_finite(df)
        scaled = self._scaler.transform(df)
        df = pd.DataFrame(scaled, columns=df.columns)
        assert_all_finite(df)
        return df

    def fit(self, df, *_):
        self._scaler.fit(df)
        return self
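A hedged usage sketch for the wrapper above, assuming the imports it relies on (pandas as pd, sklearn.base.TransformerMixin, and sklearn.utils.assert_all_finite): fit and transform return a DataFrame that keeps the original column names, with every column rescaled to [-1, 1].

import pandas as pd

df = pd.DataFrame({"a": [1.0, 2.0, 3.0], "b": [10.0, 20.0, 40.0]})
scaled = Scaler().fit(df).transform(df)   # columns "a" and "b" preserved, values in [-1, 1]
print(scaled)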
Example 5
class MinMaxScalerImpl():
    def __init__(self, feature_range=(0, 1), copy=True):
        self._hyperparams = {'feature_range': feature_range, 'copy': copy}

    def fit(self, X, y=None):
        self._sklearn_model = SKLModel(**self._hyperparams)
        if (y is not None):
            self._sklearn_model.fit(X, y)
        else:
            self._sklearn_model.fit(X)
        return self

    def transform(self, X):
        return self._sklearn_model.transform(X)
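The class above only stores the hyperparameters and delegates to the underlying scikit-learn estimator; SKLModel is presumably an alias for sklearn.preprocessing.MinMaxScaler. A minimal usage sketch under that assumption, with the alias and the class defined in the same script:

import numpy as np
from sklearn.preprocessing import MinMaxScaler as SKLModel

X = np.array([[1.0], [2.0], [4.0]])
impl = MinMaxScalerImpl(feature_range=(0, 1)).fit(X)
print(impl.transform(X))   # [[0.], [0.333...], [1.]]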
Example 6
class CreateMinMaxScaler(CreateModel):
    def fit(self, data, args):
        self.model = MinMaxScaler()

        with Timer() as t:
            self.model.fit(data.X_train, data.y_train)

        return t.interval

    def test(self, data):
        assert self.model is not None

        return self.model.transform(data.X_test)

    def predict(self, data):
        with Timer() as t:
            self.predictions = self.test(data)

        data.learning_task = LearningTask.REGRESSION
        return t.interval
Example 7
def create_scaler():
    global data_source
    global data

    my_data = data[data_source]

    all_performances = []
    for method, performances in my_data.items():
        all_performances = all_performances + performances

    min_v = min(all_performances)
    max_v = max(all_performances)
    lower = floor(min_v)
    upper = ceil(max_v)

    if max_v < 1:
        upper = max_v
        lower = min_v

    scaler = MinMaxScaler(feature_range=(lower, upper))
    scaler.fit(np.array(all_performances).reshape(-1, 1))

    return scaler
Example 8
def correct_values(values, min_value, max_value):
    '''
    Ensures that values are in given range
    @param values: 1d numpy array
    '''
    # scale
    # do nothing if valid values
    lowest_val = np.min(values)
    largest_val = np.max(values)
    lowest_val_valid = lowest_val >= min_value and lowest_val < max_value
    largest_val_valid = largest_val <= max_value and largest_val > min_value
    #print("allowed: min_val: ", min_value, " max_val: ", max_value)
    #print("current: min_val: ", lowest_val, "max_val: ", largest_val)
    if lowest_val_valid and largest_val_valid:
        pass
    else:
        #print("at least one not valid")
        # +/-1 was tried to prevent AssertionErrors caused by rounding errors,
        # but it introduced a new exception: "ValueError: Minimum of desired
        # feature range must be smaller than maximum. Got (84.80001171045868,
        # 84)." -> Therefore used without +/-1 and adapted the assertions.
        min_value_for_scaler = min_value  # + 1
        max_value_for_scaler = max_value  # - 1
        # re-use max/min values in data if valid, otherwise all functions would
        # be in same range
        if lowest_val_valid:
            #print("lowest valid")
            min_value_for_scaler = lowest_val
        if largest_val_valid:
            #print("largest valid")
            max_value_for_scaler = largest_val
        scaler = MinMaxScaler(feature_range=(
            min_value_for_scaler, max_value_for_scaler))
        reshaped_values = values.reshape(-1, 1)  # otherwise DeprecationWarning
        scaler = scaler.fit(reshaped_values)
        values = scaler.transform(reshaped_values)
        values = np.reshape(values, len(values))  # original shape
    # print("afterwards: min_val: ", np.min(
    #    values), " max_val: ", np.max(values))
    min_in_scaled = np.min(values)
    max_in_scaled = np.max(values)
    # test whether min_value <= min_in_scaled
    assert min_value - min_in_scaled <= 0.0000001, "current min: " + \
        str(min_in_scaled) + " but allowed min is: " + str(min_value)
    # test whether max_in_scaled <= max_value
    assert max_in_scaled - max_value <= 0.000001, "current max: " + str(max_in_scaled) + \
        " but allowed max is: " + str(max_value)
    return values
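A short usage example for correct_values (hypothetical data): an array whose maximum exceeds the allowed range is rescaled into [min_value, max_value], while an already-valid array passes through unchanged.

import numpy as np

vals = np.array([0.5, 1.2, 2.7, 3.9])    # 3.9 exceeds the allowed maximum of 3.0
fixed = correct_values(vals, 0.0, 3.0)   # rescaled into [0.5, 3.0]
print(fixed.min(), fixed.max())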
Example 9
def split_train_validation_test(multi_time_series_df,
                                valid_start_time,
                                test_start_time,
                                features,
                                time_step_lag=1,
                                horizon=1,
                                target='target',
                                time_format='%Y-%m-%d %H:%M:%S',
                                freq='H'):

    if not isinstance(features, list) or len(features) < 1:
        raise Exception(
            "Bad input for features. It must be a list of dataframe columns to use."
        )

    train = multi_time_series_df.copy()[
        multi_time_series_df.index < valid_start_time]
    train = train[features]

    X_scaler = MinMaxScaler()

    if 'load' in features:
        y_scaler = MinMaxScaler()
        y_scaler.fit(train[['load']])
    else:
        y_scaler = MinMaxScaler()

        tg = train[target]
        y_scaler.fit(tg.values.reshape(-1, 1))

    train[features] = X_scaler.fit_transform(train)

    tensor_structure = {'X': (range(-time_step_lag + 1, 1), features)}
    train_inputs = TimeSeriesTensor(train,
                                    target=target,
                                    H=horizon,
                                    freq=freq,
                                    tensor_structure=tensor_structure)

    print(train_inputs.dataframe.head())

    look_back_dt = dt.datetime.strptime(
        valid_start_time, time_format) - dt.timedelta(hours=time_step_lag - 1)
    valid = multi_time_series_df.copy()[
        (multi_time_series_df.index >= look_back_dt)
        & (multi_time_series_df.index < test_start_time)]
    valid = valid[features]
    valid[features] = X_scaler.transform(valid)
    tensor_structure = {'X': (range(-time_step_lag + 1, 1), features)}
    valid_inputs = TimeSeriesTensor(valid,
                                    target=target,
                                    H=horizon,
                                    freq=freq,
                                    tensor_structure=tensor_structure)

    print(valid_inputs.dataframe.head())

    # test set
    # look_back_dt = dt.datetime.strptime(test_start_time, '%Y-%m-%d %H:%M:%S') - dt.timedelta(hours=time_step_lag - 1)
    test = multi_time_series_df.copy()[test_start_time:]
    test = test[features]
    test[features] = X_scaler.transform(test)
    test_inputs = TimeSeriesTensor(test,
                                   target=target,
                                   H=horizon,
                                   freq=freq,
                                   tensor_structure=tensor_structure)

    print("time lag:", time_step_lag, "original_feature:", len(features))

    return train_inputs, valid_inputs, test_inputs, y_scaler
Example 10
import numpy as np
import tensorflow as tf

# Non-normalized inputs
xy = np.array([[828.659973, 833.450012, 908100, 828.349976, 831.659973],
               [823.02002, 828.070007, 1828100, 821.655029, 828.070007],
               [819.929993, 824.400024, 1438100, 818.97998, 824.159973],
               [816, 820.958984, 1008100, 815.48999, 819.23999],
               [819.359985, 823, 1188100, 818.469971, 818.97998],
               [819, 823, 1198100, 816, 820.450012],
               [811.700012, 815.25, 1098100, 809.780029, 813.669983],
               [809.51001, 816.659973, 1398100, 804.539978, 809.559998]])

# Normalization - MinMaxScaler => 0 ~ 1
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(xy)
xy = scaler.transform(xy)

x_data = xy[:,0:-1]
y_data = xy[:,[-1]]

X = tf.placeholder(tf.float32, shape=[None,4])
Y = tf.placeholder(tf.float32, shape=[None,1])
W = tf.Variable(tf.random_normal([4,1]), name='weight')
b = tf.Variable(tf.random_normal([1]), name='bias')

hypothesis = tf.matmul(X, W) + b
cost = tf.reduce_mean(tf.square(hypothesis - Y))

# minimize
optimizer = tf.train.GradientDescentOptimizer(learning_rate=1e-5)
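The snippet stops after defining the optimizer. A hedged sketch of how the training step and session loop are typically completed in the same TensorFlow 1.x style (a generic pattern, not necessarily the original author's code):

train = optimizer.minimize(cost)

sess = tf.Session()
sess.run(tf.global_variables_initializer())
for step in range(2001):
    cost_val, _ = sess.run([cost, train], feed_dict={X: x_data, Y: y_data})
    if step % 200 == 0:
        print(step, "Cost:", cost_val)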